text = 'Hi this is a small sentence'

# We choose a sequence length
seq_len = 3

# Split text into a list of words
words = text.split()  # ['Hi', 'this', 'is', 'a', 'small', 'sentence']

# Make lines of 3 words each, like: ['Hi this is', 'this is a', 'is a small', 'a small sentence']
lines = []
for i in range(seq_len, len(words) + 1):
    line = ' '.join(words[i - seq_len:i])
    lines.append(line)

# Import Tokenizer from the Keras text-preprocessing module
from tensorflow.keras.preprocessing.text import Tokenizer

# Instantiate the Tokenizer
tokenizer = Tokenizer()

# Fit it on the lines built above
tokenizer.fit_on_texts(lines)

# Turn the lines into numeric sequences
sequences = tokenizer.texts_to_sequences(lines)
# [[5, 3, 1], [3, 1, 2], [1, 2, 4], [2, 4, 6]]

print(tokenizer.index_word)
# {1: 'is', 2: 'a', 3: 'this', 4: 'small', 5: 'hi', 6: 'sentence'}
# We can use this mapping to decode numeric sequences back into the original text.

# Import the Sequential model and the Dense, LSTM and Embedding layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

model = Sequential()

# Vocabulary size: add 1 because the encoding starts at 1, not 0;
# index 0 is reserved (Keras uses it for padding)
vocab_size = len(tokenizer.index_word) + 1

# Start with an embedding layer. This layer is needed when we deal with categorical
# data like text in NLP: it lets the network learn dense vectors that capture
# similarity between tokens.
# input_dim = number of unique tokens, input_length = length of each input sequence,
# output_dim = number of columns of the dense embedding matrix.
# Each 3-word line is meant to be split into a 2-word input and a 1-word target,
# hence input_length=2.
model.add(Embedding(input_dim=vocab_size, output_dim=8, input_length=2))

# Adding an LSTM layer
model.add(LSTM(8))

# Adding a Dense hidden layer
model.add(Dense(8, activation='relu'))

# Adding an output layer with softmax; the last Dense layer must have as many units
# as the embedding layer's input_dim (one probability per vocabulary token)
model.add(Dense(vocab_size, activation='softmax'))


#### Example 2

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

# Split text into an array of words
words = text.split()

# Make sentences of 4 words each, moving one word at a time
sentences = []
for i in range(4, len(words) + 1):
    sentences.append(' '.join(words[i - 4:i]))

# Instantiate a Tokenizer, then fit it on the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Turn sentences into sequences of numbers
sequences = tokenizer.texts_to_sequences(sentences)
print("Sentences:\n {}\n Sequences:\n {}".format(sentences[:5], sequences[:5]))

vocab_size = len(tokenizer.index_word) + 1
print(tokenizer.index_word)

# Import the Embedding, LSTM and Dense layers
from tensorflow.keras.layers import LSTM, Dense, Embedding

model = Sequential()

# Add an Embedding layer with the right parameters; the model sees 3 words at a time
model.add(Embedding(input_dim=vocab_size, input_length=3, output_dim=8))

# Add a 32-unit LSTM layer
model.add(LSTM(32))

# Add a hidden Dense layer of 32 units and an output layer of vocab_size with softmax
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()


def predict_text(test_text, model=model):
    if len(test_text.split()) != 3:
        print('Text input should be 3 words!')
        return False

    # Turn the test_text into a sequence of numbers
    test_seq = tokenizer.texts_to_sequences([test_text])
    test_seq = np.array(test_seq)

    # Use the model passed as a parameter to predict the next word
    pred = model.predict(test_seq).argmax(axis=1)[0]

    # Return the word that maps to the predicted index
    return tokenizer.index_word[pred]


# Note: this call assumes `text` is a longer corpus that actually contains these
# words; with the toy sentence above they would not be in the vocabulary.
predict_text('meet revenge with')
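# Neither example above actually compiles or trains the text model: the tokenized
# sequences still have to be split into inputs (all words but the last) and a
# target (the last word) before fitting. A minimal sketch of that step, assuming
# the `sequences`, `vocab_size`, `model` and `tokenizer` from Example 2; the
# epoch count is illustrative, not from the original:
import numpy as np
from tensorflow.keras.utils import to_categorical

seq_arr = np.array(sequences)   # shape: (num_sentences, 4) - windows are equal length
X = seq_arr[:, :-1]             # first 3 words of each sentence as input
y = to_categorical(seq_arr[:, -1], num_classes=vocab_size)  # one-hot 4th word as target

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)

# The Tokenizer can also decode sequences back into text directly:
print(tokenizer.sequences_to_texts(sequences[:2]))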
#### Example 3

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

# trainX is expected to have shape (samples, 1, look_back) and trainY shape (samples,);
# look_back, trainX and trainY are assumed to be defined elsewhere.
model = Sequential()
model.add(LSTM(4, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
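# The snippet above leaves `look_back`, `trainX` and `trainY` undefined. A hedged
# sketch of one common way to build them from a 1-D series with a sliding window;
# the names `series` and `create_dataset` are illustrative, not from the original:
import numpy as np

def create_dataset(series, look_back=1):
    # Slide a window of `look_back` values over the series; each window is an
    # input sample and the value right after it is the target.
    X, y = [], []
    for i in range(len(series) - look_back):
        X.append(series[i:i + look_back])
        y.append(series[i + look_back])
    return np.array(X), np.array(y)

look_back = 1
series = np.sin(np.linspace(0, 10, 200))  # toy series for illustration
trainX, trainY = create_dataset(series, look_back)
# Reshape to (samples, timesteps=1, features=look_back) to match input_shape above
trainX = trainX.reshape((trainX.shape[0], 1, look_back))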