Transformers were introduced in the paper "Attention Is All You Need" and are the architecture behind today's AI large language models (ChatGPT, etc.).
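At their core is scaled dot-product attention. With Q, K and V obtained as linear projections of the input embeddings X, the paper defines it as:

    Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_k)) * V

where d_k is the width of the projected queries and keys. The Harbour listing below walks through exactly this computation, single-head and without the batch dimension, for a toy 3-token sequence: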
FUNCTION Main()
    LOCAL aEmbeddings, aWq, aWk, aWv, aBq, aBk, aBv
    LOCAL aQ, aK, aV
    LOCAL aAttentionScores, aOutput

    // Simulate the embeddings and weight matrices (normally these would be loaded or learned)
    aEmbeddings := GenerateRandomMatrix(3, 4)  // [seq_len, d_model]
    aWq := GenerateRandomMatrix(4, 2)  // [d_model, d_k]
    aWk := GenerateRandomMatrix(4, 2)
    aWv := GenerateRandomMatrix(4, 2)
    aBq := GenerateRandomVector(2)  // [d_k]
    aBk := GenerateRandomVector(2)
    aBv := GenerateRandomVector(2)

    ? aEmbeddings

    // Apply the linear projections
    aQ := LinearTransformation(aEmbeddings, aWq, aBq)
    aK := LinearTransformation(aEmbeddings, aWk, aBk)
    aV := LinearTransformation(aEmbeddings, aWv, aBv)

    // Compute the attention scores
    aAttentionScores := CalculateAttentionScores(aQ, aK)

    // Apply the attention scores to the values
    aOutput := ApplyAttention(aAttentionScores, aV)

    // Print the results
    ? "Query:", aQ
    ? "Key:", aK
    ? "Value:", aV
    ? "Attention Scores:", aAttentionScores
    ? "Output:", aOutput
RETURN NIL
FUNCTION LinearTransformation(aX, aW, aB)
    // Returns aX * aW + aB: a matrix product with the bias added to each output column
    LOCAL aResult, i, j, k, nSum
    LOCAL nRows := Len(aX), nCols := Len(aW[1]), nInner := Len(aW)

    aResult := Array(nRows)
    FOR i := 1 TO nRows
        aResult[i] := Array(nCols)
        FOR j := 1 TO nCols
            nSum := 0
            FOR k := 1 TO nInner
                nSum += aX[i][k] * aW[k][j]
            NEXT
            aResult[i][j] := nSum + aB[j]
        NEXT
    NEXT
RETURN aResult
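In other words, each projection computes result[i][j] = Sum_k aX[i][k] * aW[k][j] + aB[j]; with the 3x4 embeddings and 4x2 weight matrices above, aQ, aK and aV each come out as 3x2 matrices ([seq_len, d_k]).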
FUNCTION GenerateRandomMatrix(nRows, nCols)
    LOCAL aMatrix := Array(nRows, nCols), i, j
    FOR i := 1 TO nRows
        FOR j := 1 TO nCols
            aMatrix[i, j] := hb_Random(-1, 1)
        NEXT
    NEXT
RETURN aMatrix
FUNCTION GenerateRandomVector(nSize)
    LOCAL aVector := Array(nSize), i
    FOR i := 1 TO nSize
        aVector[i] := hb_Random(-1, 1)
    NEXT
RETURN aVector
FUNCTION CalculateAttentionScores(aQ, aK)
    LOCAL aScores, i, j, k, nSum, nExpSum
    LOCAL nRowsQ := Len(aQ), nColsQ := Len(aQ[1])
    LOCAL nRowsK := Len(aK), nColsK := Len(aK[1])

    // aQ and aK must have the same number of columns (d_k)
    IF nColsQ <> nColsK
        ? "Error: the dimensions of aQ and aK do not match"
        RETURN NIL
    ENDIF

    aScores := Array(nRowsQ, nRowsK)
    FOR i := 1 TO nRowsQ
        FOR j := 1 TO nRowsK
            nSum := 0
            FOR k := 1 TO nColsQ
                nSum += aQ[i][k] * aK[j][k]
            NEXT
            aScores[i][j] := nSum / Sqrt(nColsQ)  // scale the scores by sqrt(d_k)
        NEXT
    NEXT

    // Apply softmax normalization to each row
    FOR i := 1 TO nRowsQ
        nExpSum := 0
        FOR j := 1 TO nRowsK
            aScores[i][j] := Exp(aScores[i][j])
            nExpSum += aScores[i][j]
        NEXT
        FOR j := 1 TO nRowsK
            aScores[i][j] /= nExpSum
        NEXT
    NEXT
RETURN aScores
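One caveat: Exp() overflows once a score grows large, so production implementations subtract the row maximum before exponentiating, which leaves the softmax result mathematically unchanged. A minimal sketch of that variant as a standalone helper (SoftmaxRows is a hypothetical name, not part of the listing above), using the same array conventions:

FUNCTION SoftmaxRows(aScores)
    // Hypothetical numerically stable softmax, applied row by row in place
    LOCAL i, j, nMax, nExpSum
    FOR i := 1 TO Len(aScores)
        // Find the row maximum so the largest exponent becomes Exp(0) = 1
        nMax := aScores[i][1]
        FOR j := 2 TO Len(aScores[i])
            nMax := Max(nMax, aScores[i][j])
        NEXT
        // Exponentiate the shifted scores and accumulate the row sum
        nExpSum := 0
        FOR j := 1 TO Len(aScores[i])
            aScores[i][j] := Exp(aScores[i][j] - nMax)
            nExpSum += aScores[i][j]
        NEXT
        // Normalize so each row sums to 1
        FOR j := 1 TO Len(aScores[i])
            aScores[i][j] /= nExpSum
        NEXT
    NEXT
RETURN aScores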
FUNCTION ApplyAttention(aScores, aV)
    // Weighted sum of the value rows: aOutput := aScores * aV
    LOCAL aOutput, i, j, k, nSum
    LOCAL nRows := Len(aScores), nCols := Len(aV[1]), nInner := Len(aV)

    aOutput := Array(nRows, nCols)
    FOR i := 1 TO nRows
        FOR j := 1 TO nCols
            nSum := 0
            FOR k := 1 TO nInner
                nSum += aScores[i][k] * aV[k][j]
            NEXT
            aOutput[i][j] := nSum
        NEXT
    NEXT
RETURN aOutput
RETURN aOutput

Sample output (the first matrix, printed without a label, is aEmbeddings):

{{-0.20, -0.33, -0.13, 0.75}, {0.56, 0.31, 0.19, -0.09}, {-0.26, 0.48, 0.73, -0.32}}
Query: {{0.6859, -0.0584}, {1.3492, 0.9291}, {1.0082, 1.1412}}
Key: {{0.3594, 1.1780}, {1.0069, 1.3886}, {0.8579, 0.6985}}
Value: {{-0.2781, -0.6665}, {-1.0988, 0.3276}, {-0.3004, 0.3100}}
Attention Scores: {{0.27, 0.37, 0.36}, {0.23, 0.49, 0.27}, {0.26, 0.49, 0.25}}
Output: {{-0.590643, 0.049439}, {-0.690302, 0.091827}, {-0.684619, 0.064918}}
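As a quick sanity check, each row of the attention matrix sums to 1 after softmax, which the output above confirms up to rounding (0.27 + 0.37 + 0.36 = 1.00 in the first row). To reproduce the run, assuming the Harbour toolchain is installed and the listing is saved as attention.prg (the filename is illustrative), it can be built and executed with hbmk2:

    hbmk2 attention.prg
    ./attention

The values will differ from run to run because the embeddings, weights and biases are randomly initialized.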