A character-level language model trained on chat records from group friends.
# !pip install pymysql
# !pip install matplotlib
import tensorflow as tf
import numpy as np
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
SENT_LENGTH = 1024  # intended maximum sentence length (defined but not used below)
# Connection parameters (host, user, password, database) were redacted here.
conn = pymysql.connect()
cs = conn.cursor()
# The SQL query was redacted here; it selects the message-text column.
cs.execute()
all_data = cs.fetchall()
# Each row is a one-element tuple; keep just the message text.
all_data = [a[0] for a in all_data]
all_data[0]
'This u speed is too slow'
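For reference, a minimal sketch of what the redacted connection and query might look like; every parameter, table, and column name below is a hypothetical placeholder, not the original.
# Hypothetical placeholders only; the real credentials and SQL were redacted above.
conn = pymysql.connect(host='localhost', user='user', password='***',
                       database='chat', charset='utf8mb4')  # utf8mb4 covers CJK text and emoji
cs = conn.cursor()
cs.execute('SELECT content FROM messages')  # placeholder table and column names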
def padding(origin, ending, max_len):
    # Truncate to max_len, or pad with `ending` until length max_len.
    if len(origin) >= max_len:
        return origin[:max_len]
    for _ in range(max_len - len(origin)):
        origin.append(ending)
    return origin
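A quick sanity check of padding; the '<pad>' marker is just an illustrative choice (the function ends up unused later in this notebook):
print(padding(['a', 'b'], '<pad>', 5))       # ['a', 'b', '<pad>', '<pad>', '<pad>']
print(padding(list('abcdefg'), '<pad>', 5))  # ['a', 'b', 'c', 'd', 'e']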
!pip install jieba
import jieba  # word-segmentation library; imported but not used below
# Vocabulary: every distinct character in the corpus
all_char = set(''.join(all_data))
# encode to id
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(all_char), mask_token=None)
# decode to char
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
# Encode the whole corpus as one long sequence of character IDs
all_data_num = ids_from_chars(list(''.join(all_data)))
all_data_num.shape, len(list(''.join(all_data)))
(TensorShape([147119]), 147119)
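A quick round trip through the two lookup layers; the exact IDs depend on the vocabulary order, and any character outside the chat vocabulary would map to [UNK]:
sample_ids = ids_from_chars(list('淦了'))   # both characters appear in the corpus above
sample_chars = chars_from_ids(sample_ids)   # integer IDs back to characters
print(tf.strings.reduce_join(sample_chars).numpy().decode('utf-8'))  # 淦了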
## Build a tf.data pipeline from the ID sequence
ids_dataset = tf.data.Dataset.from_tensor_slices(all_data_num)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))
This
u
speed
is
too
slow
了
淦
# Length of each training window; +1 because every window is later split
# into a 100-character input and a 100-character shifted target.
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
# for seq in sequences.take(1):
# for c in chars_from_ids(seq).numpy():
# print(c.decode('utf-8'))
# Generate (x, y) pairs: the target is the input shifted right by one character.
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)
def text_from_ids(ids):
    # Join the character tensor back into a single string.
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy().decode("utf-8"))
    print("Target:", text_from_ids(target_example).numpy().decode("utf-8"))
Input : This u speed is too slow 淦 Found sudo-prompt, it's okay. No, this library is too garbage, it doesn't return that process object after executing the command, what are you doing, a client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop
Target: u speed is too slow 淦 Found sudo-prompt, it's okay. No, this library is too garbage, it doesn't return that process object after executing the command, what are you doing, a client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop now
# Batch size
BATCH_SIZE = 256
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 1000
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))
dataset
<PrefetchDataset element_spec=(TensorSpec(shape=(256, 100), dtype=tf.int64, name=None), TensorSpec(shape=(256, 100), dtype=tf.int64, name=None))>
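A quick count explains the short epochs later on: 147,119 characters cut into windows of 101 leaves 1,456 sequences after drop_remainder, and 1456 // 256 = 5 full batches, hence the 5/5 steps per epoch in the training log below.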
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024
# Model
class MyModel(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, rnn_units):
super().__init__(self)
# Dictionary size
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
# GRU network
# self.gru = tf.keras.layers.GRU(rnn_units,
# return_sequences=True,
# return_state=True)
self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, return_state=True)
# Output is the size of the dictionary space
self.dense = tf.keras.layers.Dense(vocab_size)
def call(self, inputs, states=None, return_state=False, training=False):
x = inputs
x = self.embedding(x, training=training)
if states is None:
states = self.rnn.get_initial_state(x)
# states = self.gru.get_initial_state(x)
x, states = self.rnn(x, initial_state=states, training=training)
# x, states = self.gru(x, initial_state=states, training=training)
x = self.dense(x, training=training)
if return_state:
return x, states
else:
return x
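The commented-out GRU lines come from the TensorFlow text-generation tutorial this notebook follows; SimpleRNN is a drop-in swap since both layers take return_sequences, return_state, and an initial_state, though a GRU would normally capture longer-range dependencies better.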
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
(256, 100, 2707) # (batch_size, sequence_length, vocab_size)
model.summary()
Model: "my_model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) multiple 692992
simple_rnn_1 (SimpleRNN) multiple 1311744
dense_1 (Dense) multiple 2774675
=================================================================
Total params: 4,779,411
Trainable params: 4,779,411
Non-trainable params: 0
_________________________________________________________________
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices
array([ 196, 1635, 858, 1146, 2427, 2403, 1519, 1179, 1108, 2644, 220,
647, 320, 2198, 2584, 877, 2240, 465, 2452, 443, 368, 128,
617, 2263, 401, 2111, 1505, 1328, 2615, 1895, 31, 440, 315,
566, 2298, 2527, 1890, 2498, 2412, 1971, 296, 1594, 458, 2343,
948, 2544, 1103, 668, 1156, 289, 406, 2270, 1455, 1187, 2687,
873, 1899, 929, 2706, 2385, 1935, 160, 197, 258, 1187, 2703,
1585, 2018, 210, 451, 857, 97, 76, 1130, 2286, 549, 2618,
375, 735, 48, 1930, 897, 2428, 2261, 1117, 696, 300, 720,
1159, 2628, 569, 1215, 145, 537, 1668, 795, 205, 2141, 2254,
1568], dtype=int64)
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode("utf-8"))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode("utf-8"))
Input:
Some reviews that need to be changed just go in and agree with the reviewers, it's uncomfortable for me as a coder, whatever, everyone is busy, this is nothing, a small problem, hundreds of lines really haven't appeared much else, just do it according to your own habits, but some specifications I also find unsightly, I remember one I borrowed called
Next Char Predictions:
Posture axe ugly green ball estimated capable sheep strange arrival Q kneel ya - magnetic concept ⢴ straight comfort slippery [R universe meow roast contact insect ⠔ hanging lady cotton deer dragon camp paste red paving lead stain spread life garden sulfur delivery canal W kick ☕ walk stretch gamble item long beautiful entertainment true E world them pout item skin is original morning ten thousand travel guy male bath 5 seedlings rush sentence sound fill painting hear make conclusion reduce ban look accompany jade stone luxury back not carbon claw hidden cat help stall;
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss: ", example_batch_mean_loss)
Prediction shape: (256, 100, 2707) # (batch_size, sequence_length, vocab_size)
Mean loss: tf.Tensor(7.906991, shape=(), dtype=float32)
tf.exp(example_batch_mean_loss).numpy()
2716.205
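That value is no accident: an untrained model spreads probability roughly uniformly over the vocabulary, so its perplexity exp(loss) should land near vocab_size. Here exp(7.907) ≈ 2716, close to the 2707-character vocabulary, confirming the initialization is sane.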
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
import os
# Directory where the checkpoints will be saved
checkpoint_dir = './qqmsg_rnn_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 200
history = model.fit(dataset, epochs=EPOCHS,
                    callbacks=[checkpoint_callback])
Epoch 1/200
5/5 [==============================] - 1s 150ms/step - loss: 4.4653 - accuracy: 0.2164
Epoch 2/200
5/5 [==============================] - 1s 147ms/step - loss: 4.4367 - accuracy: 0.2187
...
5/5 [==============================] - 1s 145ms/step - loss: 0.5430 - accuracy: 0.9156
Epoch 200/200
5/5 [==============================] - 1s 152ms/step - loss: 0.5547 - accuracy: 0.9111
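Because the callback saves weights only, resuming later means rebuilding the architecture and then loading the newest checkpoint; a minimal sketch, assuming the same constructor arguments as above:
# Rebuild the model, then restore the latest saved weights.
restored = MyModel(vocab_size=vocab_size,
                   embedding_dim=embedding_dim,
                   rnn_units=rnn_units)
restored.load_weights(tf.train.latest_checkpoint(checkpoint_dir))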
import matplotlib.pyplot as plt
print(history.history.keys())
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["accuracy"], label="accuracy")
# plt.plot(history.history["val_loss"], label="val_loss")
# plt.plot(history.history["val_accuracy"], label="val_accuracy")
# plt.plot(history.history["sparse_categorical_accuracy"], label="sparse_categorical_accuracy")
# plt.plot(history.history["val_sparse_categorical_accuracy"], label="val_sparse_categorical_accuracy")
plt.legend()
plt.show()
dict_keys(['loss', 'accuracy'])
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
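Temperature is the only sampling knob here: since the logits are divided by it, values below 1.0 sharpen the distribution toward the model's top picks, while values above 1.0 flatten it toward random babble. For more conservative output one could build, say, OneStep(model, chars_from_ids, ids_from_chars, temperature=0.5).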
import time
start = time.time()
states = None
next_char = tf.constant(["Rich woman", "Customer"])
result = [next_char]
for n in range(50):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)
Rich woman, want to touch you, send you more than 100 generated products, what actual products have been done
Let's see the work that has come, let's see, all are native d+10 China
________________________________________________________________________________
Run time: 0.16722774505615234
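To reuse the generator without retraining, the one-step model can be exported with tf.saved_model, as the upstream tutorial does; generate_one_step is a tf.function already traced by the calls above, so it survives the export:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')
states = None
next_char = tf.constant(['Rich woman'])
result = [next_char]
for _ in range(50):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)
print(tf.strings.join(result)[0].numpy().decode('utf-8'))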