# The only difference from the plain encoder-decoder model is that, for teacher forcing,
# the decoder takes one word less as input: it is fed the target sentence shifted by one
# word and must predict the next word at every step (NOTE : see ENCODER DECODER IN KERAS)
from keras.layers import Input, GRU, TimeDistributed, Dense, Embedding
from keras.models import Model

### WITHOUT EMBEDDING LAYER
# Encoder
en_input_tensors = Input(shape=(en_len, en_vocab))  # en_len is the number of words in a sentence, en_vocab is the number of unique words
en_gru_layer = GRU(hsize, return_state=True)
en_output_tensors, en_state = en_gru_layer(en_input_tensors)

# Decoder
de_input_tensors = Input(shape=(fr_len-1, fr_vocab))  # takes 1 word less than the target sentence (different language, so its own length and vocab)
de_gru_layer = GRU(hsize, return_sequences=True)
de_output_tensors = de_gru_layer(de_input_tensors, initial_state=en_state)  # initial_state is connected to the final state of the encoder

# Decoder prediction
de_dense_layer = Dense(fr_vocab, activation='softmax')
de_timed_dense_layer = TimeDistributed(de_dense_layer)  # applies the softmax layer at every timestep
de_prediction = de_timed_dense_layer(de_output_tensors)

model_tf = Model(inputs=[en_input_tensors, de_input_tensors], outputs=de_prediction)
model_tf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

## Helper function that turns text into padded, optionally reversed, optionally one-hot sequences
def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    # logic (look into TEXT IN KERAS)
    return preproc_text
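
# A minimal sketch of what sents2seqs could look like -- an assumption, not the original
# logic from the TEXT IN KERAS note -- as a stand-in for the stub above so the training
# loops below can run. It presumes pre-fit tokenizers en_tok / fr_tok exist as globals.
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    # pick the tokenizer, sequence length, and vocab size for the requested language
    tok, seq_len, vocab = (en_tok, en_len, en_vocab) if input_type == 'source' else (fr_tok, fr_len, fr_vocab)
    encoded = tok.texts_to_sequences(sentences)  # words -> integer IDs
    preproc_text = pad_sequences(encoded, padding=pad_type, truncating='post', maxlen=seq_len)
    if reverse:
        preproc_text = preproc_text[:, ::-1]  # reversing the source puts early words closer to the decoder
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=vocab)
    return preproc_text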

for ei in range(n_epochs):
    for i in range(0, train_size, bsize):
        encoder_x = sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)  # input data for the encoder
        decoder_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)               # preprocessed target data for the decoder
        decoder_x = decoder_xy[:, :-1, :]  # inputs:  all French words except the last (one-hot encoded)
        decoder_y = decoder_xy[:, 1:, :]   # outputs: all French words except the first, i.e. shifted one step ahead (one-hot encoded)
        model_tf.train_on_batch([encoder_x, decoder_x], decoder_y)

### WITH EMBEDDING LAYER
en_input_tensors = Input(shape=(en_len,))  # input layer now only takes a sequence of word IDs
en_embedding_layer = Embedding(en_vocab, 96, input_length=en_len)  # maps word IDs through an en_vocab x 96 embedding matrix
en_embedded_tensors = en_embedding_layer(en_input_tensors)
en_gru_layer = GRU(hsize, return_state=True)
en_output_tensors, en_state = en_gru_layer(en_embedded_tensors)

de_input_tensors = Input(shape=(fr_len-1,))  # input layer only takes a sentence with 1 word less
de_embedding_layer = Embedding(fr_vocab, 96, input_length=fr_len-1)  # maps word IDs through an fr_vocab x 96 embedding matrix
de_embedded_tensors = de_embedding_layer(de_input_tensors)
de_gru_layer = GRU(hsize, return_sequences=True, return_state=True)
de_gru_out, de_state = de_gru_layer(de_embedded_tensors, initial_state=en_state)  # connect the encoder output state to the decoder's initial state
de_dense_layer = Dense(fr_vocab, activation='softmax')  # softmax over the French vocab at every timestep, so the output matches the one-hot targets
de_prediction = TimeDistributed(de_dense_layer)(de_gru_out)

model_emb = Model(inputs=[en_input_tensors, de_input_tensors], outputs=de_prediction)
model_emb.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

for ei in range(3):  # epochs
    for i in range(0, train_size, bsize):  # batches
        encoder_x = sents2seqs('source', tr_en[i:i+bsize], onehot=False, reverse=True)  # encoder input data (word IDs, not one-hot, because of the Embedding layer)
        decoder_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=False)               # preprocessed target data for the decoder
        decoder_x = decoder_xy[:, :-1]  # inputs: all French words except the last (word IDs, not one-hot, because of the Embedding layer)
        decoder_xy_oh = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)  # one-hot encode the targets to get y for the loss
        decoder_y = decoder_xy_oh[:, 1:, :]  # outputs: all French words except the first, shifted one step ahead (one-hot encoded)
        model_emb.train_on_batch([encoder_x, decoder_x], decoder_y)
    res = model_emb.evaluate([encoder_x, decoder_x], decoder_y, batch_size=bsize, verbose=0)
    print("{} => Loss:{}, Train Acc: {}".format(ei+1, res[0], res[1]*100.0))

# NOTE : Problem with this model: at prediction time the decoder needs as input the very
# words we are trying to predict.
# Solution: build a separate inference model that generates the translation word by word,
# then copy each trained layer's weights into the corresponding layer of the inference model.
# see DECODER OF THE INFERENCE MODEL IN KERAS
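
# A minimal sketch of that inference setup, not the exact model from the note it points to;
# the names en_model, de_model, test_sentence, and the 'sos' start token are assumptions.
# The encoder runs once to produce the context state, then the decoder predicts one word
# per call, feeding its own prediction and state back in.
import numpy as np

# Encoder inference model: reuses the trained encoder layers; sentence in, final state out
en_model = Model(inputs=en_input_tensors, outputs=en_state)

# Decoder inference model: takes ONE word and the previous state, returns probabilities and the new state
de_inf_input = Input(shape=(1,))
de_inf_state_in = Input(shape=(hsize,))
de_inf_emb_layer = Embedding(fr_vocab, 96, input_length=1)
de_inf_gru_layer = GRU(hsize, return_state=True)
de_inf_dense_layer = Dense(fr_vocab, activation='softmax')
de_inf_out, de_inf_state = de_inf_gru_layer(de_inf_emb_layer(de_inf_input), initial_state=de_inf_state_in)
de_model = Model(inputs=[de_inf_input, de_inf_state_in], outputs=[de_inf_dense_layer(de_inf_out), de_inf_state])

# Copy each trained layer's weights into the matching inference layer
de_inf_emb_layer.set_weights(de_embedding_layer.get_weights())
de_inf_gru_layer.set_weights(de_gru_layer.get_weights())
de_inf_dense_layer.set_weights(de_dense_layer.get_weights())

# Word-by-word prediction for one source sentence (assumes fr_tok was fit with an 'sos' start token)
state = en_model.predict(sents2seqs('source', [test_sentence], onehot=False, reverse=True))
word_id = np.array([[fr_tok.word_index['sos']]])
translation_ids = []
for _ in range(fr_len):
    probs, state = de_model.predict([word_id, state])
    next_id = int(np.argmax(probs, axis=-1)[0])  # greedy decoding: pick the most probable next word
    translation_ids.append(next_id)
    word_id = np.array([[next_id]])  # feed the prediction back in as the next decoder input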