您可以使用 tensorflow-text 中的滑动窗口函数 `tft.sliding_window`;不过,`TextVectorization` 层似乎只支持后填充(post-padding):
import tensorflow as tf
import tensorflow_text as tft
# Write a one-line sample corpus and read it back as a dataset of text lines.
# One path is used for both writing and reading: writing to a relative path
# but reading from an absolute one only works when the working directory
# happens to match the absolute location.
data_path = 'data.txt'
with open(data_path, 'w', encoding='utf-8') as f:
    f.write('Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam efficitur viverra lacus?\n')
train_data = tf.data.TextLineDataset([data_path])

# Integer-encode tokens using a vocabulary learned from the corpus itself.
# NOTE(review): the Keras docs describe pad_to_max_tokens as applying to the
# multi_hot/count/tf_idf output modes; it presumably has no effect with
# output_mode='int' -- confirm before relying on it.
vectorize_layer = tf.keras.layers.TextVectorization(output_mode='int', max_tokens=50, pad_to_max_tokens=True)
vectorize_layer.adapt(train_data)

# Number of tokens in each input window.
window_size = 5
def sliding_window(x):
    """Split one line of text into fixed-width token windows and their labels.

    The line is encoded with ``vectorize_layer``; every run of ``window_size``
    consecutive token ids becomes an input, and the token id that immediately
    follows the run becomes its label.

    Args:
        x: A text line taken from the dataset.

    Returns:
        A pair ``(x, y)``: the stacked input windows and the label that
        follows each of them.
    """
    token_ids = vectorize_layer(x)

    # Windows that are one token longer than the inputs: the last column of
    # each is the token that follows the corresponding input window.
    y = tft.sliding_window(token_ids, width=window_size + 1, axis=0)[:, -1]
    x = tft.sliding_window(token_ids, width=window_size, axis=0)

    # There are more input windows than labels (the trailing window has no
    # following token), so keep only as many windows as there are labels.
    x = x[:tf.shape(y)[0], :]
    return x, y
# Turn every text line into (input windows, next-token labels).
train_data = train_data.map(sliding_window)

# Token-id -> token-string lookup table; the keys are the positions of the
# tokens in the layer's vocabulary. Used below only to print readable output.
vocab = tf.constant(vectorize_layer.get_vocabulary())
keys = tf.cast(tf.range(vocab.shape[0]), tf.int64)
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, vocab),
    default_value="")

# Flatten each line's stack of windows into individual (window, label) pairs.
# sliding_window returns x and y with the same leading dimension, so both can
# be sliced together in one pass instead of mapping the dataset twice (which
# re-runs vectorization and windowing) and zipping the two results.
train_data = train_data.flat_map(
    lambda x, y: tf.data.Dataset.from_tensor_slices((x, y)))

for x, y in train_data:
    print('x -->', x, 'y -->', y)
    print('x -->', table.lookup(x), 'y -->', table.lookup(y), '\n')
x --> tf.Tensor([ 4 6 9 3 11], shape=(5,), dtype=int64) y --> tf.Tensor(10, shape=(), dtype=int64)
x --> tf.Tensor([b'lorem' b'ipsum' b'dolor' b'sit' b'amet'], shape=(5,), dtype=string) y --> tf.Tensor(b'consectetur', shape=(), dtype=string)
x --> tf.Tensor([ 6 9 3 11 10], shape=(5,), dtype=int64) y --> tf.Tensor(13, shape=(), dtype=int64)
x --> tf.Tensor([b'ipsum' b'dolor' b'sit' b'amet' b'consectetur'], shape=(5,), dtype=string) y --> tf.Tensor(b'adipiscing', shape=(), dtype=string)
x --> tf.Tensor([ 9 3 11 10 13], shape=(5,), dtype=int64) y --> tf.Tensor(7, shape=(), dtype=int64)
x --> tf.Tensor([b'dolor' b'sit' b'amet' b'consectetur' b'adipiscing'], shape=(5,), dtype=string) y --> tf.Tensor(b'elit', shape=(), dtype=string)
x --> tf.Tensor([ 3 11 10 13 7], shape=(5,), dtype=int64) y --> tf.Tensor(12, shape=(), dtype=int64)
x --> tf.Tensor([b'sit' b'amet' b'consectetur' b'adipiscing' b'elit'], shape=(5,), dtype=string) y --> tf.Tensor(b'aliquam', shape=(), dtype=string)
x --> tf.Tensor([11 10 13 7 12], shape=(5,), dtype=int64) y --> tf.Tensor(8, shape=(), dtype=int64)
x --> tf.Tensor([b'amet' b'consectetur' b'adipiscing' b'elit' b'aliquam'], shape=(5,), dtype=string) y --> tf.Tensor(b'efficitur', shape=(), dtype=string)
x --> tf.Tensor([10 13 7 12 8], shape=(5,), dtype=int64) y --> tf.Tensor(2, shape=(), dtype=int64)
x --> tf.Tensor([b'consectetur' b'adipiscing' b'elit' b'aliquam' b'efficitur'], shape=(5,), dtype=string) y --> tf.Tensor(b'viverra', shape=(), dtype=string)
x --> tf.Tensor([13 7 12 8 2], shape=(5,), dtype=int64) y --> tf.Tensor(5, shape=(), dtype=int64)
x --> tf.Tensor([b'adipiscing' b'elit' b'aliquam' b'efficitur' b'viverra'], shape=(5,), dtype=string) y --> tf.Tensor(b'lacus', shape=(), dtype=string)
注意,没有对应标签的序列会被 `x[:tf.shape(y)[0],:]` 这一行丢弃。此外,查找表仅用于演示,实现您想要的功能并不需要它。如果您想使用前填充(pre-padding),可以参考 `tft.pad_along_dimension`。