A character-level language model trained on chat records from group friends.
# !pip install pymysql
# !pip install matplotlib
import tensorflow as tf
import numpy as np
import pymysql
import pandas as pd
import matplotlib.pyplot as plt
SENT_LENGTH = 1024  # intended maximum sentence length (defined but not used below)
# Connection parameters (host, user, password, database) were redacted here.
conn = pymysql.connect()
cs = conn.cursor()
# The SQL query was redacted here; it selects the message-text column.
cs.execute()
all_data = cs.fetchall()
# Each row is a one-element tuple; keep just the message text.
all_data = [a[0] for a in all_data]
all_data[0]
'This u speed is too slow'
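For reference, a minimal sketch of what the redacted connection and query might look like; every parameter, table, and column name below is a hypothetical placeholder, not the original.
# Hypothetical placeholders only; the real credentials and SQL were redacted above.
conn = pymysql.connect(host='localhost', user='user', password='***',
                       database='chat', charset='utf8mb4')  # utf8mb4 covers CJK text and emoji
cs = conn.cursor()
cs.execute('SELECT content FROM messages')  # placeholder table and column names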
def padding(origin, ending, max_len):
    # Truncate to max_len, or pad with `ending` until length max_len.
    if len(origin) >= max_len:
        return origin[:max_len]
    for _ in range(max_len - len(origin)):
        origin.append(ending)
    return origin
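A quick sanity check of padding; the '<pad>' marker is just an illustrative choice (the function ends up unused later in this notebook):
print(padding(['a', 'b'], '<pad>', 5))       # ['a', 'b', '<pad>', '<pad>', '<pad>']
print(padding(list('abcdefg'), '<pad>', 5))  # ['a', 'b', 'c', 'd', 'e']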
!pip install jieba
import jieba  # word-segmentation library; imported but not used below
# Vocabulary: every distinct character in the corpus
all_char = set(''.join(all_data))
# encode to id
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(all_char), mask_token=None)
# decode to char
chars_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
# Encode the whole corpus as one long sequence of character IDs
all_data_num = ids_from_chars(list(''.join(all_data)))
all_data_num.shape, len(list(''.join(all_data)))
(TensorShape([147119]), 147119)
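A quick round trip through the two lookup layers; the exact IDs depend on the vocabulary order, and any character outside the chat vocabulary would map to [UNK]:
sample_ids = ids_from_chars(list('淦了'))   # both characters appear in the corpus above
sample_chars = chars_from_ids(sample_ids)   # integer IDs back to characters
print(tf.strings.reduce_join(sample_chars).numpy().decode('utf-8'))  # 淦了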
## Build a tf.data pipeline from the ID sequence
ids_dataset = tf.data.Dataset.from_tensor_slices(all_data_num)
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))
This
u
speed
is
too
slow
了
淦
# Length of each training window; +1 because every window is later split
# into a 100-character input and a 100-character shifted target.
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
# for seq in sequences.take(1):
# for c in chars_from_ids(seq).numpy():
# print(c.decode('utf-8'))
# Generate (x, y) pairs: the target is the input shifted right by one character.
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text
dataset = sequences.map(split_input_target)
def text_from_ids(ids):
    # Join the character tensor back into a single string.
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy().decode("utf-8"))
    print("Target:", text_from_ids(target_example).numpy().decode("utf-8"))
Input : This u speed is too slow 淦 Found sudo-prompt, it's okay. No, this library is too garbage, it doesn't return that process object after executing the command, what are you doing, a client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop
Target: u speed is too slow 淦 Found sudo-prompt, it's okay. No, this library is too garbage, it doesn't return that process object after executing the command, what are you doing, a client code electron
??? Let the barbecue guy write you a new library, my vscode is also carrying the barbershop now
# Batch size
BATCH_SIZE = 256
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 1000
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))
dataset
<PrefetchDataset element_spec=(TensorSpec(shape=(256, 100), dtype=tf.int64, name=None), TensorSpec(shape=(256, 100), dtype=tf.int64, name=None))>
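A quick count explains the short epochs later on: 147,119 characters cut into windows of 101 leaves 1,456 sequences after drop_remainder, and 1456 // 256 = 5 full batches, hence the 5/5 steps per epoch in the training log below.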
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024
# Model
class MyModel(tf.keras.Model):
def __init__(self, vocab_size, embedding_dim, rnn_units):
super().__init__(self)
# Dictionary size
self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
# GRU network
# self.gru = tf.keras.layers.GRU(rnn_units,
# return_sequences=True,
# return_state=True)
self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, return_state=True)
# Output is the size of the dictionary space
self.dense = tf.keras.layers.Dense(vocab_size)
def call(self, inputs, states=None, return_state=False, training=False):
x = inputs
x = self.embedding(x, training=training)
if states is None:
states = self.rnn.get_initial_state(x)
# states = self.gru.get_initial_state(x)
x, states = self.rnn(x, initial_state=states, training=training)
# x, states = self.gru(x, initial_state=states, training=training)
x = self.dense(x, training=training)
if return_state:
return x, states
else:
return x
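The commented-out GRU lines come from the TensorFlow text-generation tutorial this notebook follows; SimpleRNN is a drop-in swap since both layers take return_sequences, return_state, and an initial_state, though a GRU would normally capture longer-range dependencies better.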
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
(256, 100, 2707) # (batch_size, sequence_length, vocab_size)
model.summary()
Model: "my_model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) multiple 692992
simple_rnn_1 (SimpleRNN) multiple 1311744
dense_1 (Dense) multiple 2774675
=================================================================
Total params: 4,779,411
Trainable params: 4,779,411
Non-trainable params: 0
_________________________________________________________________
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices
array([ 196, 1635, 858, 1146, 2427, 2403, 1519, 1179, 1108, 2644, 220,
647, 320, 2198, 2584, 877, 2240, 465, 2452, 443, 368, 128,
617, 2263, 401, 2111, 1505, 1328, 2615, 1895, 31, 440, 315,
566, 2298, 2527, 1890, 2498, 2412, 1971, 296, 1594, 458, 2343,
948, 2544, 1103, 668, 1156, 289, 406, 2270, 1455, 1187, 2687,
873, 1899, 929, 2706, 2385, 1935, 160, 197, 258, 1187, 2703,
1585, 2018, 210, 451, 857, 97, 76, 1130, 2286, 549, 2618,
375, 735, 48, 1930, 897, 2428, 2261, 1117, 696, 300, 720,
1159, 2628, 569, 1215, 145, 537, 1668, 795, 205, 2141, 2254,
1568], dtype=int64)
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode("utf-8"))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode("utf-8"))
Input:
Some reviews that need to be changed just go in and agree with the reviewers, it's uncomfortable for me as a coder, whatever, everyone is busy, this is nothing, a small problem, hundreds of lines really haven't appeared much else, just do it according to your own habits, but some specifications I also find unsightly, I remember one I borrowed called
Next Char Predictions:
Posture axe ugly green ball estimated capable sheep strange arrival Q kneel ya - magnetic concept ⢴ straight comfort slippery [R universe meow roast contact insect ⠔ hanging lady cotton deer dragon camp paste red paving lead stain spread life garden sulfur delivery canal W kick ☕ walk stretch gamble item long beautiful entertainment true E world them pout item skin is original morning ten thousand travel guy male bath 5 seedlings rush sentence sound fill painting hear make conclusion reduce ban look accompany jade stone luxury back not carbon claw hidden cat help stall;
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss: ", example_batch_mean_loss)
Prediction shape: (256, 100, 2707) # (batch_size, sequence_length, vocab_size)
Mean loss: tf.Tensor(7.906991, shape=(), dtype=float32)
tf.exp(example_batch_mean_loss).numpy()
2716.205
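That value is no accident: an untrained model spreads probability roughly uniformly over the vocabulary, so its perplexity exp(loss) should land near vocab_size. Here exp(7.907) ≈ 2716, close to the 2707-character vocabulary, confirming the initialization is sane.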
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
import os
# Directory where the checkpoints will be saved
checkpoint_dir = './qqmsg_rnn_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 200
history = model.fit(dataset, epochs=EPOCHS,
                    callbacks=[checkpoint_callback])
Epoch 1/200
5/5 [==============================] - 1s 150ms/step - loss: 4.4653 - accuracy: 0.2164
Epoch 2/200
5/5 [==============================] - 1s 147ms/step - loss: 4.4367 - accuracy: 0.2187
...
5/5 [==============================] - 1s 145ms/step - loss: 0.5430 - accuracy: 0.9156
Epoch 200/200
5/5 [==============================] - 1s 152ms/step - loss: 0.5547 - accuracy: 0.9111
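Because the callback saves weights only, resuming later means rebuilding the architecture and then loading the newest checkpoint; a minimal sketch, assuming the same constructor arguments as above:
# Rebuild the model, then restore the latest saved weights.
restored = MyModel(vocab_size=vocab_size,
                   embedding_dim=embedding_dim,
                   rnn_units=rnn_units)
restored.load_weights(tf.train.latest_checkpoint(checkpoint_dir))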
import matplotlib.pyplot as plt
print(history.history.keys())
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["accuracy"], label="accuracy")
# plt.plot(history.history["val_loss"], label="val_loss")
# plt.plot(history.history["val_accuracy"], label="val_accuracy")
# plt.plot(history.history["sparse_categorical_accuracy"], label="sparse_categorical_accuracy")
# plt.plot(history.history["val_sparse_categorical_accuracy"], label="val_sparse_categorical_accuracy")
plt.legend()
plt.show()
dict_keys(['loss', 'accuracy'])
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_chars.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        # Convert strings to token IDs.
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()

        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        # Only use the last prediction.
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits/self.temperature
        # Apply the prediction mask: prevent "[UNK]" from being generated.
        predicted_logits = predicted_logits + self.prediction_mask

        # Sample the output logits to generate token IDs.
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)

        # Convert from token ids to characters
        predicted_chars = self.chars_from_ids(predicted_ids)

        # Return the characters and model state.
        return predicted_chars, states
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
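Temperature is the only sampling knob here: since the logits are divided by it, values below 1.0 sharpen the distribution toward the model's top picks, while values above 1.0 flatten it toward random babble. For more conservative output one could build, say, OneStep(model, chars_from_ids, ids_from_chars, temperature=0.5).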
import time
start = time.time()
states = None
next_char = tf.constant(["Rich woman", "Customer"])
result = [next_char]
for n in range(50):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)
Rich woman, want to touch you, send you more than 100 generated products, what actual products have been done
Let's see the work that has come, let's see, all are native d+10 China
________________________________________________________________________________
Run time: 0.16722774505615234
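To reuse the generator without retraining, the one-step model can be exported with tf.saved_model, as the upstream tutorial does; generate_one_step is a tf.function already traced by the calls above, so it survives the export:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')
states = None
next_char = tf.constant(['Rich woman'])
result = [next_char]
for _ in range(50):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)
print(tf.strings.join(result)[0].numpy().decode('utf-8'))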