# The only difference from the plain encoder-decoder model is that, for teacher forcing,
# the decoder takes one word less as input: it is fed the target sentence shifted by one
# word and must predict the next word at every step (NOTE : see ENCODER DECODER IN KERAS)
from keras.layers import Input, GRU, TimeDistributed, Dense, Embedding
from keras.models import Model

### WITHOUT EMBEDDING LAYER
# Encoder
en_input_tensors = Input(shape=(en_len, en_vocab))  # en_len is the number of words in a sentence, en_vocab is the number of unique words
en_gru_layer = GRU(hsize, return_state=True)
en_output_tensors, en_state = en_gru_layer(en_input_tensors)

# Decoder
de_input_tensors = Input(shape=(fr_len-1, fr_vocab))  # takes 1 word less than the target sentence (different language, so its own length and vocab)
de_gru_layer = GRU(hsize, return_sequences=True)
de_output_tensors = de_gru_layer(de_input_tensors, initial_state=en_state)  # initial_state is connected to the final state of the encoder

# Decoder prediction
de_dense_layer = Dense(fr_vocab, activation='softmax')
de_timed_dense_layer = TimeDistributed(de_dense_layer)  # applies the softmax layer at every timestep
de_prediction = de_timed_dense_layer(de_output_tensors)

model_tf = Model(inputs=[en_input_tensors, de_input_tensors], outputs=de_prediction)
model_tf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

## Helper function that turns text into padded, optionally reversed, optionally one-hot sequences
def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    # logic (look into TEXT IN KERAS)
    return preproc_text
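
# A minimal sketch of what sents2seqs could look like -- an assumption, not the original
# logic from the TEXT IN KERAS note -- as a stand-in for the stub above so the training
# loops below can run. It presumes pre-fit tokenizers en_tok / fr_tok exist as globals.
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    # pick the tokenizer, sequence length, and vocab size for the requested language
    tok, seq_len, vocab = (en_tok, en_len, en_vocab) if input_type == 'source' else (fr_tok, fr_len, fr_vocab)
    encoded = tok.texts_to_sequences(sentences)  # words -> integer IDs
    preproc_text = pad_sequences(encoded, padding=pad_type, truncating='post', maxlen=seq_len)
    if reverse:
        preproc_text = preproc_text[:, ::-1]  # reversing the source puts early words closer to the decoder
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=vocab)
    return preproc_text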

for ei in range(n_epochs):
    for i in range(0, train_size, bsize):
        encoder_x = sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)  # input data for the encoder
        decoder_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)               # preprocessed target data for the decoder
        decoder_x = decoder_xy[:, :-1, :]  # inputs:  all French words except the last (one-hot encoded)
        decoder_y = decoder_xy[:, 1:, :]   # outputs: all French words except the first, i.e. shifted one step ahead (one-hot encoded)
        model_tf.train_on_batch([encoder_x, decoder_x], decoder_y)

### WITH EMBEDDING LAYER
en_input_tensors = Input(shape=(en_len,))  # input layer now only takes a sequence of word IDs
en_embedding_layer = Embedding(en_vocab, 96, input_length=en_len)  # maps word IDs through an en_vocab x 96 embedding matrix
en_embedded_tensors = en_embedding_layer(en_input_tensors)
en_gru_layer = GRU(hsize, return_state=True)
en_output_tensors, en_state = en_gru_layer(en_embedded_tensors)

de_input_tensors = Input(shape=(fr_len-1,))  # input layer only takes a sentence with 1 word less
de_embedding_layer = Embedding(fr_vocab, 96, input_length=fr_len-1)  # maps word IDs through an fr_vocab x 96 embedding matrix
de_embedded_tensors = de_embedding_layer(de_input_tensors)
de_gru_layer = GRU(hsize, return_sequences=True, return_state=True)
de_gru_out, de_state = de_gru_layer(de_embedded_tensors, initial_state=en_state)  # connect the encoder output state to the decoder's initial state
de_dense_layer = Dense(fr_vocab, activation='softmax')  # softmax over the French vocab at every timestep, so the output matches the one-hot targets
de_prediction = TimeDistributed(de_dense_layer)(de_gru_out)

model_emb = Model(inputs=[en_input_tensors, de_input_tensors], outputs=de_prediction)
model_emb.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

for ei in range(3):  # epochs
    for i in range(0, train_size, bsize):  # batches
        encoder_x = sents2seqs('source', tr_en[i:i+bsize], onehot=False, reverse=True)  # encoder input data (word IDs, not one-hot, because of the Embedding layer)
        decoder_xy = sents2seqs('target', tr_fr[i:i+bsize], onehot=False)               # preprocessed target data for the decoder
        decoder_x = decoder_xy[:, :-1]  # inputs: all French words except the last (word IDs, not one-hot, because of the Embedding layer)
        decoder_xy_oh = sents2seqs('target', tr_fr[i:i+bsize], onehot=True)  # one-hot encode the targets to get y for the loss
        decoder_y = decoder_xy_oh[:, 1:, :]  # outputs: all French words except the first, shifted one step ahead (one-hot encoded)
        model_emb.train_on_batch([encoder_x, decoder_x], decoder_y)
    res = model_emb.evaluate([encoder_x, decoder_x], decoder_y, batch_size=bsize, verbose=0)
    print("{} => Loss:{}, Train Acc: {}".format(ei+1, res[0], res[1]*100.0))

# NOTE : Problem with this model: at prediction time the decoder needs as input the very
# words we are trying to predict.
# Solution: build a separate inference model that generates the translation word by word,
# then copy each trained layer's weights into the corresponding layer of the inference model.
# see DECODER OF THE INFERENCE MODEL IN KERAS
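
# A minimal sketch of that inference setup, not the exact model from the note it points to;
# the names en_model, de_model, test_sentence, and the 'sos' start token are assumptions.
# The encoder runs once to produce the context state, then the decoder predicts one word
# per call, feeding its own prediction and state back in.
import numpy as np

# Encoder inference model: reuses the trained encoder layers; sentence in, final state out
en_model = Model(inputs=en_input_tensors, outputs=en_state)

# Decoder inference model: takes ONE word and the previous state, returns probabilities and the new state
de_inf_input = Input(shape=(1,))
de_inf_state_in = Input(shape=(hsize,))
de_inf_emb_layer = Embedding(fr_vocab, 96, input_length=1)
de_inf_gru_layer = GRU(hsize, return_state=True)
de_inf_dense_layer = Dense(fr_vocab, activation='softmax')
de_inf_out, de_inf_state = de_inf_gru_layer(de_inf_emb_layer(de_inf_input), initial_state=de_inf_state_in)
de_model = Model(inputs=[de_inf_input, de_inf_state_in], outputs=[de_inf_dense_layer(de_inf_out), de_inf_state])

# Copy each trained layer's weights into the matching inference layer
de_inf_emb_layer.set_weights(de_embedding_layer.get_weights())
de_inf_gru_layer.set_weights(de_gru_layer.get_weights())
de_inf_dense_layer.set_weights(de_dense_layer.get_weights())

# Word-by-word prediction for one source sentence (assumes fr_tok was fit with an 'sos' start token)
state = en_model.predict(sents2seqs('source', [test_sentence], onehot=False, reverse=True))
word_id = np.array([[fr_tok.word_index['sos']]])
translation_ids = []
for _ in range(fr_len):
    probs, state = de_model.predict([word_id, state])
    next_id = int(np.argmax(probs, axis=-1)[0])  # greedy decoding: pick the most probable next word
    translation_ids.append(next_id)
    word_id = np.array([[next_id]])  # feed the prediction back in as the next decoder input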