As LLMs keep improving, our Harbour Transformer (based on "Attention Is All You Need") keeps improving too: the code is now simpler and cleaner.
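At its core the encoder block computes the scaled dot-product attention from the paper:

$\mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left( \frac{Q K^{\top}}{\sqrt{d_{head}}} \right) V$, with $Q = X W_q$, $K = X W_k$, $V = X W_v$.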
Example of use:
#include "hbclass.ch"
/*
* Class: TransformerEncoderBlock
* -------------------------------
* Implements a single Encoder block, able to run the forward pass,
* backpropagation, and update its own weights.
*/
CLASS TransformerEncoderBlock
// --- Properties ---
DATA oWq, oWk, oV // Attention weights
DATA oW1, ob1, oW2, ob2 // Feed-forward network weights
DATA oGamma1, oBeta1 // LayerNorm 1 weights
DATA oGamma2, oBeta2 // LayerNorm 2 weights
// --- Gradients ---
DATA gWq, gWk, gV
DATA gW1, gb1, gW2, gb2
DATA gGamma1, gBeta1
DATA gGamma2, gBeta2
// --- Cache for backpropagation (the residual sums are cached too) ---
DATA cInput, cSublayer1, cSublayer2, cNormalized1, cActivated
DATA cQ, cK, cV, cAttentionWeights
// --- Dimensions ---
DATA nInputDim, nHiddenDim, nHeadDim
// --- Methods ---
METHOD New( nInputDim, nHiddenDim, nHeadDim ) CONSTRUCTOR
METHOD Forward( mInput )
METHOD Backward( mDOutput )
METHOD ZeroGrads()
METHOD Update( nLr )
ENDCLASS
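A single block can already be exercised on its own. A minimal smoke test, assuming the HB_MATRIX* functions used in this file are linked in (the upstream gradient here is fake, just to drive one backward step):

PROCEDURE TestBlock()
LOCAL oBlock := TransformerEncoderBlock():New( 8, 32, 8 )
LOCAL mX := HB_MATRIXRANDOM( 5, 8 ) // 5 tokens of dimension 8
LOCAL mY := oBlock:Forward( mX ) // output has the same shape as mX
LOCAL mDX := oBlock:Backward( mY ) // fake upstream gradient, same shape
oBlock:Update( 0.01 )
oBlock:ZeroGrads()
? "TestBlock done"
RETURN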
/*
* CONSTRUCTOR
*/
METHOD New( nInputDim, nHiddenDim, nHeadDim ) CLASS TransformerEncoderBlock
::nInputDim := nInputDim
::nHiddenDim := nHiddenDim
::nHeadDim := nHeadDim
// --- Initialize weights ---
::oWq := HB_MATRIXRANDOM( nInputDim, nHeadDim )
::oWk := HB_MATRIXRANDOM( nInputDim, nHeadDim )
::oV := HB_MATRIXRANDOM( nInputDim, nHeadDim )
::oW1 := HB_MATRIXRANDOM( nInputDim, nHiddenDim )
::ob1 := HB_MATRIXZERO( 1, nHiddenDim )
::oW2 := HB_MATRIXRANDOM( nHiddenDim, nInputDim )
::ob2 := HB_MATRIXZERO( 1, nInputDim )
::oGamma1 := HB_MATRIXFILL( HB_MATRIXZERO( 1, nInputDim ), 1.0 )
::oBeta1 := HB_MATRIXZERO( 1, nInputDim )
::oGamma2 := HB_MATRIXFILL( HB_MATRIXZERO( 1, nInputDim ), 1.0 )
::oBeta2 := HB_MATRIXZERO( 1, nInputDim )
::ZeroGrads() // Create and initialize the gradient matrices
RETURN Self
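HB_MATRIXRANDOM presumably returns unscaled random values; if training turns out unstable, a common refinement is to scale each weight matrix by 1/sqrt(fan_in). A sketch using only functions already present in this file (XavierRandom is a hypothetical helper, not part of the library):

// Hypothetical helper: fan-in scaled random init
STATIC FUNCTION XavierRandom( nRows, nCols )
RETURN HB_MATRIXDIVSCALAR( HB_MATRIXRANDOM( nRows, nCols ), Sqrt( nRows ) )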
/*
* ZeroGrads()
* Resets all gradient matrices to zero before a new iteration.
*/
METHOD ZeroGrads() CLASS TransformerEncoderBlock
::gWq := HB_MATRIXZERO( ::nInputDim, ::nHeadDim )
::gWk := HB_MATRIXZERO( ::nInputDim, ::nHeadDim )
::gV := HB_MATRIXZERO( ::nInputDim, ::nHeadDim )
::gW1 := HB_MATRIXZERO( ::nInputDim, ::nHiddenDim )
::gb1 := HB_MATRIXZERO( 1, ::nHiddenDim )
::gW2 := HB_MATRIXZERO( ::nHiddenDim, ::nInputDim )
::gb2 := HB_MATRIXZERO( 1, ::nInputDim )
::gGamma1 := HB_MATRIXZERO( 1, ::nInputDim )
::gBeta1 := HB_MATRIXZERO( 1, ::nInputDim )
::gGamma2 := HB_MATRIXZERO( 1, ::nInputDim )
::gBeta2 := HB_MATRIXZERO( 1, ::nInputDim )
RETURN Nil
/*
* Forward( mInput )
* Runs the forward pass and caches intermediate values.
*/
METHOD Forward( mInput ) CLASS TransformerEncoderBlock
LOCAL mAttentionOutput, mScores, mLinear1, mWithBias1, mLinear2, mFFN_Output, mEncoderOutput
// Save the input for the backward pass
::cInput := mInput
// 1. Self-Attention
::cQ := HB_MATRIXMULTIPLY( mInput, ::oWq )
::cK := HB_MATRIXMULTIPLY( mInput, ::oWk )
::cV := HB_MATRIXMULTIPLY( mInput, ::oV )
mScores := HB_MATRIXMULTIPLY( ::cQ, HB_MATRIXTRANSPOSE( ::cK ) )
mScores := HB_MATRIXDIVSCALAR( mScores, Sqrt( ::nHeadDim ) ) // scaled dot-product
::cAttentionWeights := HB_SOFTMAX( mScores )
mAttentionOutput := HB_MATRIXMULTIPLY( ::cAttentionWeights, ::cV )
// 2. Add & Norm 1 (the residual sum is cached for the backward pass)
::cSublayer1 := HB_MATRIXADD( mInput, mAttentionOutput )
::cNormalized1 := HB_LAYERNORM( ::cSublayer1, ::oGamma1, ::oBeta1 )
// 3. Feed-Forward Network
mLinear1 := HB_MATRIXMULTIPLY( ::cNormalized1, ::oW1 )
mWithBias1 := HB_MATRIXADDBROADCAST( mLinear1, ::ob1 )
::cActivated := HB_RELU( mWithBias1 )
mLinear2 := HB_MATRIXMULTIPLY( ::cActivated, ::oW2 )
mFFN_Output := HB_MATRIXADDBROADCAST( mLinear2, ::ob2 )
// 4. Add & Norm 2 (residual sum cached as well)
::cSublayer2 := HB_MATRIXADD( ::cNormalized1, mFFN_Output )
mEncoderOutput := HB_LAYERNORM( ::cSublayer2, ::oGamma2, ::oBeta2 )
RETURN mEncoderOutput
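Step by step, Forward() computes:

$A = \mathrm{softmax}\!\left( \frac{(XW_q)(XW_k)^{\top}}{\sqrt{d_{head}}} \right) XW_v$
$Z = \mathrm{LayerNorm}(X + A)$
$Y = \mathrm{LayerNorm}(Z + \mathrm{ReLU}(ZW_1 + b_1)W_2 + b_2)$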
/*
* Backward( mDOutput )
* Backpropagates the error through the block.
*/
METHOD Backward( mDOutput ) CLASS TransformerEncoderBlock
LOCAL aGrads, mDNormalized1, mDFFN_Output, mDInput
LOCAL mDLinear1, mDLinear2, mDActivated, mDReluInput
LOCAL mDAttentionOutput, mDAttentionWeights, mDScores, mDQ, mDK, mDV
LOCAL mDInput_from_res1, mDInput_from_Q, mDInput_from_K, mDInput_from_V
// --- Backprop through Add & Norm 2 ---
// (HB_LAYERNORM_BACKWARD receives the cached pre-normalization input)
aGrads := HB_LAYERNORM_BACKWARD( mDOutput, ::cSublayer2, ::oGamma2, ::oBeta2 )
mDFFN_Output := aGrads[1]
::gGamma2 := HB_MATRIXADD( ::gGamma2, aGrads[2] )
::gBeta2 := HB_MATRIXADD( ::gBeta2, aGrads[3] )
mDNormalized1 := mDFFN_Output // gradient flowing through the residual connection
// --- Backprop through the Feed-Forward network ---
aGrads := HB_MATRIXADDBROADCAST_BACKWARD( mDFFN_Output )
mDLinear2 := aGrads[1]
::gb2 := HB_MATRIXADD( ::gb2, aGrads[2] )
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDLinear2, ::cActivated, ::oW2 )
mDActivated := aGrads[1]
::gW2 := HB_MATRIXADD( ::gW2, aGrads[2] )
mDReluInput := HB_RELU_BACKWARD( mDActivated, ::cActivated )
aGrads := HB_MATRIXADDBROADCAST_BACKWARD( mDReluInput )
mDLinear1 := aGrads[1]
::gb1 := HB_MATRIXADD( ::gb1, aGrads[2] )
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDLinear1, ::cNormalized1, ::oW1 )
mDNormalized1 := HB_MATRIXADD( mDNormalized1, aGrads[1] )
::gW1 := HB_MATRIXADD( ::gW1, aGrads[2] )
// --- Backprop through Add & Norm 1 ---
aGrads := HB_LAYERNORM_BACKWARD( mDNormalized1, ::cSublayer1, ::oGamma1, ::oBeta1 )
mDAttentionOutput := aGrads[1]
::gGamma1 := HB_MATRIXADD( ::gGamma1, aGrads[2] )
::gBeta1 := HB_MATRIXADD( ::gBeta1, aGrads[3] )
mDInput_from_res1 := mDAttentionOutput
// --- Backprop through Self-Attention ---
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDAttentionOutput, ::cAttentionWeights, ::cV )
mDAttentionWeights := aGrads[1]
mDV := aGrads[2]
mDScores := HB_SOFTMAXBACKWARD( mDAttentionWeights, ::cAttentionWeights )
// apply the chain rule through the 1/Sqrt(nHeadDim) scaling from the forward pass
mDScores := HB_MATRIXDIVSCALAR( mDScores, Sqrt( ::nHeadDim ) )
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDScores, ::cQ, HB_MATRIXTRANSPOSE( ::cK ) )
mDQ := aGrads[1]
mDK := HB_MATRIXTRANSPOSE( aGrads[2] )
// Final backprop into the Q, K, V weights
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDQ, ::cInput, ::oWq )
mDInput_from_Q := aGrads[1]
::gWq := HB_MATRIXADD( ::gWq, aGrads[2] )
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDK, ::cInput, ::oWk )
mDInput_from_K := aGrads[1]
::gWk := HB_MATRIXADD( ::gWk, aGrads[2] )
aGrads := HB_MATRIXMULTIPLY_BACKWARD( mDV, ::cInput, ::oV )
mDInput_from_V := aGrads[1]
::gV := HB_MATRIXADD( ::gV, aGrads[2] )
// Sum all the gradients arriving at the original input
mDInput := HB_MATRIXADD( mDInput_from_res1, HB_MATRIXADD( mDInput_from_Q, HB_MATRIXADD( mDInput_from_K, mDInput_from_V ) ) )
RETURN mDInput
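The final sum reflects that the input X feeds four consumers (the residual connection and the three projections), so their gradients add up:

$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial X}\bigg|_{\mathrm{res}} + \frac{\partial L}{\partial Q}W_q^{\top} + \frac{\partial L}{\partial K}W_k^{\top} + \frac{\partial L}{\partial V}W_v^{\top}$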
/*
* Update( nLr )
* Updates the model weights using Stochastic Gradient Descent (SGD),
* i.e. each weight w becomes w - nLr * g.
*/
METHOD Update( nLr ) CLASS TransformerEncoderBlock
HB_SGD_UPDATE( ::oWq, ::gWq, nLr )
HB_SGD_UPDATE( ::oWk, ::gWk, nLr )
HB_SGD_UPDATE( ::oV, ::gV, nLr )
HB_SGD_UPDATE( ::oW1, ::gW1, nLr )
HB_SGD_UPDATE( ::ob1, ::gb1, nLr )
HB_SGD_UPDATE( ::oW2, ::gW2, nLr )
HB_SGD_UPDATE( ::ob2, ::gb2, nLr )
HB_SGD_UPDATE( ::oGamma1, ::gGamma1, nLr )
HB_SGD_UPDATE( ::oBeta1, ::gBeta1, nLr )
HB_SGD_UPDATE( ::oGamma2, ::gGamma2, nLr )
HB_SGD_UPDATE( ::oBeta2, ::gBeta2, nLr )
RETURN Nil

The TransformerModel class stacks several encoder blocks and drives them as a single model:

#include "hbclass.ch"
CLASS TransformerModel
DATA aEncoderBlocks // Array of TransformerEncoderBlock objects
DATA nLayers // Number of blocks
METHOD New( nLayers, nInputDim, nHiddenDim, nHeadDim ) CONSTRUCTOR
METHOD Forward( mInput )
METHOD Backward( mDOutput )
METHOD ZeroGrads()
METHOD Update( nLr )
ENDCLASS
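Stacking just composes the blocks, $Y = f_L \circ \cdots \circ f_1 (X)$, and Backward() applies the chain rule in reverse order: each block receives the input gradient produced by the block above it.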
METHOD New( nLayers, nInputDim, nHiddenDim, nHeadDim ) CLASS TransformerModel
LOCAL i
::nLayers := nLayers
::aEncoderBlocks := {}
FOR i := 1 TO nLayers
AAdd( ::aEncoderBlocks, TransformerEncoderBlock():New( nInputDim, nHiddenDim, nHeadDim ) )
NEXT
RETURN Self
METHOD Forward( mInput ) CLASS TransformerModel
LOCAL i
FOR i := 1 TO ::nLayers
mInput := ::aEncoderBlocks[i]:Forward( mInput )
NEXT
RETURN mInput
METHOD Backward( mDOutput ) CLASS TransformerModel
LOCAL i
FOR i := ::nLayers TO 1 STEP -1
mDOutput := ::aEncoderBlocks[i]:Backward( mDOutput )
NEXT
RETURN mDOutput
METHOD ZeroGrads() CLASS TransformerModel
AEval( ::aEncoderBlocks, {|oBlock| oBlock:ZeroGrads()} )
RETURN Nil
METHOD Update( nLr ) CLASS TransformerModel
AEval( ::aEncoderBlocks, {|oBlock| oBlock:Update(nLr)} )
RETURN Nil

And a small training example:

PROCEDURE Main()
LOCAL nLayers := 4 // 4 stacked Encoder blocks
LOCAL nEmbedDim := 64 // input (embedding) dimension
LOCAL nHiddenDim := 256 // FFN hidden layer dimension
LOCAL nHeadDim := 64 // attention dimension
LOCAL nSeqLen := 20 // sequence length
LOCAL nEpochs := 100
LOCAL nLearningRate := 0.001
LOCAL oModel, mInput, mTarget, mOutput, mDLoss, i
// 1. Create the full model
oModel := TransformerModel():New( nLayers, nEmbedDim, nHiddenDim, nHeadDim )
// 2. Create sample data (input and desired target)
mInput := HB_MATRIXRANDOM( nSeqLen, nEmbedDim )
mTarget := HB_MATRIXRANDOM( nSeqLen, nEmbedDim ) // simplified target
? "Starting training..."
// 3. Training loop
FOR i := 1 TO nEpochs
// -- Forward pass --
mOutput := oModel:Forward( mInput )
// -- Compute the initial gradient from the error --
// (model output - target; HB_CROSSENTROPYLOSS is assumed here to
// return the gradient of the loss with respect to the output)
mDLoss := HB_CROSSENTROPYLOSS( mOutput, mTarget )
// -- Backward pass (backpropagation) --
oModel:Backward( mDLoss )
// -- Update the model weights --
oModel:Update( nLearningRate )
// -- Zero the gradients for the next iteration --
oModel:ZeroGrads()
IF i % 10 == 0
? "Época", i, "completada."
ENDIF
NEXT
? "Entrenamiento finalizado."
RETURN
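On the loss: the "(Model output - target)" comment matches the standard identity for mean squared error, and the same Y - T form appears for cross-entropy with respect to softmax logits:

$\frac{\partial}{\partial Y}\,\tfrac{1}{2}\lVert Y - T\rVert^2 = Y - T$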