I have written custom encoder and decoder layers implementing the architecture described in the Attention Is All You Need paper. Everything works until I try to build the model, at which point I get an error. If I run it on some sample data it builds fine, but when I call the fit method to train the model it throws a different error. I'm providing the blocks I may have implemented incorrectly; let me know if more code is needed to debug.

TF Version: 2.14.0
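
For reference, all the snippets below assume the standard TF/Keras imports (not shown in my original code):

import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (Add, Dense, Dropout, Embedding, Layer,
                                     LayerNormalization, MultiHeadAttention,
                                     TimeDistributed)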

Multi-Head Sub Layer and Positional Encoding Layer:

class MhaSubLayer(Layer):
  def __init__(self, units, **kwargs):
    super().__init__()
    self.mha = MultiHeadAttention(key_dim=units, **kwargs)
    self.inner_dense = TimeDistributed(Dense(2048, activation='relu'))
    self.outer_dense = TimeDistributed(Dense(units))
    self.layernorm_mha = LayerNormalization()
    self.layernorm_ff = LayerNormalization()
    self.add = Add()

  def call(self, x, context, **kwargs):
    ### Calculate Attention Output
    attn_out, attn_scores = self.mha(query=x, value=context, return_attention_scores=True, **kwargs)

    attn_resid_cnxt = self.add([x, attn_out])  ## Residual connection
    attn_layer_norm = self.layernorm_mha(attn_resid_cnxt) 

    attn_scores = tf.reduce_mean(attn_scores, axis=1)
    self.last_attention_weights = attn_scores

    ### Pass the Attention output to the Dense Layer
    dense_out = self.outer_dense(self.inner_dense(attn_layer_norm))
    dense_resid_cnxt = self.add([attn_layer_norm, dense_out])  ### Feed forward residual connection

    dense_layer_norm = self.layernorm_ff(dense_resid_cnxt)
    return dense_layer_norm
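
A side note on the feed-forward block: for 3-D inputs, Dense already maps the last axis, so TimeDistributed(Dense(...)) behaves the same as a plain Dense(...) here. A quick check of that equivalence (my own snippet, not part of the model):

x = tf.random.normal((2, 5, 8))
dense = Dense(4)
### Same layer instance (shared weights), so the two calls should agree exactly
print(np.allclose(dense(x).numpy(), TimeDistributed(dense)(x).numpy()))  # True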

class PositionalEncodingLayer(Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.add = Add()

  def get_positional_encodings(self, x):
    seq_len = x.shape[0]
    d = x.shape[1]
    
    P = np.zeros((seq_len, d))

    for k in range(seq_len):
        for i in np.arange(int(d/2)):
            denominator = np.power(10000, 2*i/d)
            P[k, 2*i] = np.sin(k/denominator)
            P[k, 2*i+1] = np.cos(k/denominator)
    return P
  
  def call(self, x):

    # pos_enc = []
    pos_enc = tf.map_fn(fn=self.get_positional_encodings, elems=x)

    # for n, elm in enumerate(x):
    #   p = self.get_positional_encodings(elm)
    #   pos_enc.append(p)
    
    # pos_enc = tf.convert_to_tensor(pos_enc)

    pos_embeddings = self.add([x, pos_enc])
    return pos_embeddings
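
Note that get_positional_encodings uses Python-side .shape and NumPy loops, so the tf.map_fn call only works eagerly with fully static shapes, and it recomputes the same table for every example. A vectorized sketch of the same sinusoidal formula (my own rewrite, assuming a static sequence length and an even model dimension d):

def sinusoidal_encodings(seq_len, d):
  ### P[k, 2i] = sin(k / 10000^(2i/d)), P[k, 2i+1] = cos(k / 10000^(2i/d))
  k = np.arange(seq_len)[:, None]           # (seq_len, 1)
  i = np.arange(d // 2)[None, :]            # (1, d//2)
  denominator = np.power(10000, 2 * i / d)
  P = np.zeros((seq_len, d))
  P[:, 0::2] = np.sin(k / denominator)      # even indices
  P[:, 1::2] = np.cos(k / denominator)      # odd indices
  return tf.constant(P, dtype=tf.float32)

Since the table is identical for every example, it can be added to a whole batch by broadcasting (x + sinusoidal_encodings(seq_len, d)) instead of tf.map_fn.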

Encoder-Decoder Block:

class Encoder(Layer):
  def __init__(self, units, embed_input_dim, name='encoder', **kwargs):
    super().__init__()

    ### Encoder Input Embedding and Layer
    self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='en_embed_layer')
    self.pos_embedding = PositionalEncodingLayer(name='en_positional_embed_layer')

    ### Encoder Multi-Head Self Attention Sub Layer
    self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_1')
    self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_2')
    self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_3')
    self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_4')
    self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_5')
    self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='en_mha_layer_6')

    ### Encoder MHA Dropout Layer
    self.dropout =  Dropout(rate=0.1, name='en_dropout_pos_enc')
    self.dropout1 = Dropout(rate=0.1, name='en_dropout_layer1')
    self.dropout2 = Dropout(rate=0.1, name='en_dropout_layer2')
    self.dropout3 = Dropout(rate=0.1, name='en_dropout_layer3')
    self.dropout4 = Dropout(rate=0.1, name='en_dropout_layer4')
    self.dropout5 = Dropout(rate=0.1, name='en_dropout_layer5')
    self.dropout6 = Dropout(rate=0.1, name='en_dropout_layer6')

  def call(self, x):
    embedding_output = self.embedding(x)

    positional_embedding = self.pos_embedding(embedding_output)
    positional_embedding = self.dropout(positional_embedding)

    ### First MHA Sub-Layer
    sub_layer1_out = self.mha_sub_layer1(positional_embedding, positional_embedding)
    sub_layer1_out = self.dropout1(sub_layer1_out)

    ### Second MHA Sub-Layer
    sub_layer2_out = self.mha_sub_layer2(sub_layer1_out, sub_layer1_out)
    sub_layer2_out = self.dropout2(sub_layer2_out)

    ### Third MHA Sub-Layer
    sub_layer3_out = self.mha_sub_layer3(sub_layer2_out, sub_layer2_out)
    sub_layer3_out = self.dropout3(sub_layer3_out)

    ### Fourth MHA Sub-Layer
    sub_layer4_out = self.mha_sub_layer4(sub_layer3_out, sub_layer3_out)
    sub_layer4_out = self.dropout4(sub_layer4_out)

    ### Fifth MHA Sub-Layer
    sub_layer5_out = self.mha_sub_layer5(sub_layer4_out, sub_layer4_out)
    sub_layer5_out = self.dropout5(sub_layer5_out)

    ### Sixth MHA Sub-Layer
    sub_layer6_out = self.mha_sub_layer6(sub_layer5_out, sub_layer5_out)
    sub_layer6_out = self.dropout6(sub_layer6_out)

    return sub_layer6_out

class Decoder(Layer):
  def __init__(self, units, embed_input_dim, name='decoder', **kwargs):
    super().__init__()
    ### Decoder Input Embedding Layer
    self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='de_embed_layer')
    self.pos_embedding = PositionalEncodingLayer(name='de_positional_embed_layer')

    ### Decoder Multi-Head Attention Sub Layer
    self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_1')
    self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_2')
    self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_3')
    self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_4')
    self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_5')
    self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_6')

    ### Decoder MHA Dropout Layer
    self.dropout =  Dropout(rate=0.1, name='de_dropout_pos_enc')
    self.dropout1 = Dropout(rate=0.1, name='de_dropout_layer1')
    self.dropout2 = Dropout(rate=0.1, name='de_dropout_layer2')
    self.dropout3 = Dropout(rate=0.1, name='de_dropout_layer3')
    self.dropout4 = Dropout(rate=0.1, name='de_dropout_layer4')
    self.dropout5 = Dropout(rate=0.1, name='de_dropout_layer5')
    self.dropout6 = Dropout(rate=0.1, name='de_dropout_layer6')

    ### Dense Output Layer
    self.output_dense_layer = TimeDistributed(Dense(1), name="output_layer")

  def call(self, x, en_context):
    embedding_output = self.embedding(x)
    positional_embedding = self.pos_embedding(embedding_output)
    positional_embedding = self.dropout(positional_embedding)

    ### First MHA Sub-Layer
    sub_layer1_out = self.mha_sub_layer1(positional_embedding, positional_embedding)
    sub_layer1_out = self.dropout1(sub_layer1_out)

    ### Second MHA Sub-Layer
    sub_layer2_out = self.mha_sub_layer2(sub_layer1_out, en_context)
    sub_layer2_out = self.dropout2(sub_layer2_out)

    ### Third MHA Sub-Layer
    sub_layer3_out = self.mha_sub_layer3(sub_layer2_out, en_context)
    sub_layer3_out = self.dropout3(sub_layer3_out)

    ### Fourth MHA Sub-Layer
    sub_layer4_out = self.mha_sub_layer4(sub_layer3_out, en_context)
    sub_layer4_out = self.dropout4(sub_layer4_out)

    ### Fifth MHA Sub-Layer
    sub_layer5_out = self.mha_sub_layer5(sub_layer4_out, en_context)
    sub_layer5_out = self.dropout5(sub_layer5_out)

    ### Sixth MHA Sub-Layer
    sub_layer6_out = self.mha_sub_layer6(sub_layer5_out, en_context)
    sub_layer6_out = self.dropout6(sub_layer6_out)

    ### Output Dense Layer
    output = self.output_dense_layer(sub_layer6_out)
    output = tf.round(tf.abs(output))
    return output

Sample Data:

np.random.seed(42)
trainX = np.random.randint(0, high=250, size=(5,12))
trainXt_in = np.random.randint(0, high=250, size=(5,3))
trainY = np.random.randint(0, high=250, size=(5,3,1))

Modelling Block:

Training shape: ((1616304, 12), (1616304, 3), (1616304, 3, 1))

## The Model Sub-Class

class Trxster(Model):
  def __init__(self, units, en_embed_dim, de_embed_dim, name='Trxster', **kwargs):
    super().__init__()
    self.encoder = Encoder(units, en_embed_dim)
    self.decoder = Decoder(units, de_embed_dim)

  def call(self, inputs):
    context_vec, target_in = inputs
    context = self.encoder(context_vec)
    preds = self.decoder(target_in, context)
    return preds

forecastor = Trxster(hsize, embed_dim, embed_dim)
forecastor.build(((12, 1),(3, 1)))
forecastor.summary()

Error-1:

TypeError: Error converting shape to a TensorShape: Dimension value must be integer or None or have an __index__ method, got value '(12, 1)' with type '<class 'tuple'>'.
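
I believe this error comes from how Model.build interprets its argument: Keras treats a Python tuple as a single shape and a list as a list of input shapes, and each shape should include the batch dimension. Something along these lines should build cleanly (my assumption, untested against the full pipeline):

### A *list* of shapes, not a tuple of tuples, with the batch dim as None:
forecastor.build(input_shape=[(None, 12), (None, 3)])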

If I instead run the model on the sample data like this:

hsize = 512
embed_dim = 268

forecastor = Trxster(hsize, embed_dim, embed_dim)
forecastor((trainX, trainXt_in))

Model: "trxster_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 encoder_13 (Encoder)        multiple                  63156224  
                                                                 
 decoder_13 (Decoder)        multiple                  63156737  
                                                                 
=================================================================
Total params: 126312961 (481.85 MB)
Trainable params: 126312961 (481.85 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

### Fit the Model
batch_size = 64
epochs = 100
steps = trainX.shape[0]//batch_size
warmup_steps = steps//25

class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps):
    super().__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = tf.cast(warmup_steps, tf.float32)

  def __call__(self, step):
    ### lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
    ### Pure tf ops (no .numpy()), so this also works when called inside a graph
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.minimum(arg1, arg2)
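
A quick spot check of the schedule (assuming warmup_steps > 0):

sched = MyLRSchedule(512, 4000)
print(sched(tf.constant(1.0)).numpy(), sched(tf.constant(4000.0)).numpy())
### Ramps up linearly during warmup, peaks near step == warmup_steps,
### then decays proportionally to step**-0.5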

opt = tf.keras.optimizers.Adam(learning_rate=MyLRSchedule(hsize, warmup_steps), beta_1=0.9, beta_2=0.98, epsilon=1e-8)

### Configure Trxster
checkpoint_filepath = './training_ckpt'

cb = [tf.keras.callbacks.EarlyStopping(patience=10, 
                                        monitor='val_loss',
                                        restore_best_weights=True),
      tf.keras.callbacks.ModelCheckpoint(
                                        filepath=checkpoint_filepath,
                                        save_weights_only=True,
                                        monitor='val_loss',
                                        mode='min',
                                        verbose=1,
                                        save_best_only=True)]

loss = tf.keras.losses.MeanSquaredError()
metrics = [tf.keras.metrics.Accuracy(), tf.keras.losses.MeanAbsoluteError()]

forecastor.compile(optimizer=opt,
                   loss='mean_squared_error',
                   metrics=['acc','mean_absolute_error'])

history = forecastor.fit((trainX, trainXt_in), trainY,
                          batch_size=batch_size,
                          steps_per_epoch=steps,
                          epochs=1,
                          validation_data=((valX, valXt_in), valY),
                          callbacks=cb)

Error-2 (the first few lines of the traceback):

ValueError: No gradients provided for any variable: (['trxster_11/encoder_13/en_embed_layer/embeddings:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/query/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/query/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/key/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/key/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/value/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/value/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/attention_output/kernel:0', 'trxster_11/encoder_13/mha_sub_layer_157/en_mha_layer_1/attention_output/bias:0', 'trxster_11/encoder_13/mha_sub_layer_157/time_distributed_314/kernel:0'

Every example I've looked at suggests it should work the way I've written it, but it doesn't.

Accepted Answer

I figured it out! Gradient computation fails when there are non-differentiable TensorFlow ops in the graph, and in my network I was applying tf.round and tf.abs at the Decoder's output layer. That is what broke the gradient computation. I took them out and the model trains as expected. Here is a link to the related issue: https://github.com/tensorflow/tensorflow/issues/1511.
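
A minimal repro of the failure mode (my own sketch, not from the issue thread): tf.round has no registered gradient, so no gradient can flow back through it to any model variable.

x = tf.Variable([1.7, 2.3])
with tf.GradientTape() as tape:
  y = tf.reduce_sum(tf.round(x))
print(tape.gradient(y, x))  # None -> "No gradients provided for any variable"

(tf.abs, by contrast, is differentiable away from zero; tf.round is the real culprit here.)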

Decoder:

class Decoder(Layer):
  def __init__(self, units, embed_input_dim, name='decoder', **kwargs):
    super().__init__()
    ### Decoder Input Embedding Layer
    self.embedding = Embedding(input_dim=embed_input_dim, output_dim=units, name='de_embed_layer')
    self.pos_embedding = PositionalEncodingLayer(name='de_positional_embed_layer')

    ### Decoder Multi-Head Attention Sub Layer
    self.mha_sub_layer1 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_1')
    self.mha_sub_layer2 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_2')
    self.mha_sub_layer3 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_3')
    self.mha_sub_layer4 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_4')
    self.mha_sub_layer5 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_5')
    self.mha_sub_layer6 = MhaSubLayer(units, num_heads=8, name='de_mha_layer_6')

    ### Decoder MHA Dropout Layer
    self.dropout =  Dropout(rate=0.1, name='de_dropout_pos_enc')
    self.dropout1 = Dropout(rate=0.1, name='de_dropout_layer1')
    self.dropout2 = Dropout(rate=0.1, name='de_dropout_layer2')
    self.dropout3 = Dropout(rate=0.1, name='de_dropout_layer3')
    self.dropout4 = Dropout(rate=0.1, name='de_dropout_layer4')
    self.dropout5 = Dropout(rate=0.1, name='de_dropout_layer5')
    self.dropout6 = Dropout(rate=0.1, name='de_dropout_layer6')

    ### Dense Output Layer
    self.output_dense_layer = TimeDistributed(Dense(1), name="output_layer")

  def call(self, x, en_context):
    embedding_output = self.embedding(x)
    positional_embedding = self.pos_embedding(embedding_output)
    positional_embedding = self.dropout(positional_embedding)

    ### First MHA Sub-Layer
    sub_layer1_out = self.mha_sub_layer1(positional_embedding, positional_embedding)
    sub_layer1_out = self.dropout1(sub_layer1_out)

    ### Second MHA Sub-Layer
    sub_layer2_out = self.mha_sub_layer2(sub_layer1_out, en_context)
    sub_layer2_out = self.dropout2(sub_layer2_out)

    ### Third MHA Sub-Layer
    sub_layer3_out = self.mha_sub_layer3(sub_layer2_out, en_context)
    sub_layer3_out = self.dropout3(sub_layer3_out)

    ### Fourth MHA Sub-Layer
    sub_layer4_out = self.mha_sub_layer4(sub_layer3_out, en_context)
    sub_layer4_out = self.dropout4(sub_layer4_out)

    ### Fifth MHA Sub-Layer
    sub_layer5_out = self.mha_sub_layer5(sub_layer4_out, en_context)
    sub_layer5_out = self.dropout5(sub_layer5_out)

    ### Sixth MHA Sub-Layer
    sub_layer6_out = self.mha_sub_layer6(sub_layer5_out, en_context)
    sub_layer6_out = self.dropout6(sub_layer6_out)

    ### Output Dense Layer
    output = self.output_dense_layer(sub_layer6_out)
    return output
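
If integer-valued forecasts are still needed, the rounding can be applied outside the model at inference time (a usage sketch based on the sample data above):

preds = forecastor((trainX, trainXt_in))
int_preds = tf.round(tf.abs(preds))  ### fine here: no gradients are computed at inference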
