From e300298d50c23ed2c5decba30263c852824c4308 Mon Sep 17 00:00:00 2001
From: Sadeq Kord <119689797+Sadeqk94@users.noreply.github.com>
Date: Sat, 25 May 2024 10:52:29 +0330
Subject: [PATCH] Add files via upload

---
 .../Generative_Pretrained_Transformer(GPT).py | 380 ++++++++++++++++++
 1 file changed, 380 insertions(+)
 create mode 100644 examples/nlp/Generative_Pretrained_Transformer(GPT).py

diff --git a/examples/nlp/Generative_Pretrained_Transformer(GPT).py b/examples/nlp/Generative_Pretrained_Transformer(GPT).py
new file mode 100644
index 0000000000..9f2b4fa4d0
--- /dev/null
+++ b/examples/nlp/Generative_Pretrained_Transformer(GPT).py
@@ -0,0 +1,380 @@
+"""
+Title: Generative Pretrained Transformer (GPT)
+Author: [Sadeq](https://github.com/Sadeqk94) [Kord](https://www.linkedin.com/in/sadeq-kord)
+Date created: 2024/05/25
+Last modified: 2024/05/25
+Description: A generative Transformer trained on the Tiny Shakespeare dataset.
+Accelerator: GPU
+"""
+"""
+# Introduction
+## Let's build GPT using TensorFlow and Keras!
+
+For those interested in learning about GPT models, there is an excellent video by
+[Andrej Karpathy](https://karpathy.ai/) titled
+["Let's build GPT: from scratch, in code, spelled out."](https://youtu.be/kCc8FmEb1nY?si=_l3tBiaZgq1NXwWW).
+In the video, Karpathy provides a detailed, step-by-step guide to building a GPT model
+using PyTorch.
+
+In this notebook, I present a TensorFlow/Keras version of Karpathy's implementation. It
+follows the same principles and steps outlined in the video, so you can gain the same
+understanding and insights using TensorFlow/Keras. You can follow along with the video
+and refer to this notebook for the equivalent TensorFlow/Keras code, making it a useful
+resource for anyone familiar with TensorFlow or looking to learn it.
+"""
+
+"""
+We begin by importing the libraries needed to build and train our model. TensorFlow is a
+widely used deep learning library, and Keras provides a high-level API that simplifies
+building neural networks. We also import NumPy for numerical computations and Matplotlib
+for visualization.
+
+Next, we define the hyperparameters used to train the language model. These include the
+batch size, which determines the number of independent sequences processed in parallel,
+and the block size, which is the maximum context length for predictions. Other
+parameters such as the number of iterations, the learning rate, the embedding size, the
+number of attention heads, the number of Transformer layers, and the dropout rate are
+also specified. Setting a random seed for NumPy and TensorFlow ensures results are
+reproducible across runs, which is important for experimentation.
+"""
+
+# Importing Libraries
+
+import tensorflow as tf
+import keras
+from keras import layers, models, optimizers
+import numpy as np
+import matplotlib.pyplot as plt
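+
+# The seed below is not in the original walkthrough code; it is added so that the
+# reproducibility mentioned above actually holds. keras.utils.set_random_seed seeds
+# Python, NumPy, and TensorFlow in one call; the value 1337 is arbitrary.
+keras.utils.set_random_seed(1337)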
+
+# hyperparameters
+batch_size = 16  # how many independent sequences will we process in parallel?
+block_size = 32  # what is the maximum context length for predictions?
+max_iters = 5000
+eval_interval = 100
+
+Epochs = max_iters // eval_interval
+
+learning_rate = 1e-3
+eval_iters = 200
+n_embd = 64
+n_head = 4
+n_layer = 4
+dropout = 0.0
+# ------------
+
+
+"""
+# Dataset preparation
+"""
+
+"""
+Next we prepare the data. We download the Tiny Shakespeare dataset, a popular choice for
+character-level language modeling, and preprocess it for training. The dataset is read
+from a text file, and the unique characters in it form the vocabulary. From these
+characters we build mappings from characters to integers and back, which lets us encode
+text into the numerical form needed to train the network and decode model outputs back
+into text. We then split the data into training and validation sets, with 90% of the
+data used for training and the remaining 10% for validation.
+
+After preparing the data, we define functions that generate batches for training and
+validation. Each batch consists of input-output pairs in which the input sequence serves
+as context for predicting the next character at every position. We wrap these batch
+generators in `tf.data.Dataset` objects so they integrate smoothly with the Keras
+training loop; each dataset yields batches of fixed-length character sequences used to
+train and validate the model efficiently.
+"""
+"""
+### Download and prepare dataset
+"""
+"""shell
+wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
+"""
+
+# Read the text file
+with open("input.txt", "r", encoding="utf-8") as f:
+    text = f.read()
+
+# Unique characters
+chars = sorted(list(set(text)))
+vocab_size = len(chars)
+
+# create a mapping from characters to integers
+stoi = {ch: i for i, ch in enumerate(chars)}
+itos = {i: ch for i, ch in enumerate(chars)}
+encode = lambda s: [
+    stoi[c] for c in s
+]  # encoder: take a string, output a list of integers
+decode = lambda l: "".join(
+    [itos[i] for i in l]
+)  # decoder: take a list of integers, output a string
+
+# Train and test splits
+data = np.array(encode(text), dtype=np.int64)
+n = int(0.9 * len(data))  # first 90% will be train, rest val
+train_data = data[:n]
+val_data = data[n:]
+
+
+# Data loading
+def get_batch(split):
+    # Generate a small batch of data of inputs x and targets y
+    data = train_data if split == "train" else val_data
+    ix = np.random.randint(0, len(data) - block_size, batch_size)
+    x = np.stack([data[i : i + block_size] for i in ix])
+    y = np.stack([data[i + 1 : i + block_size + 1] for i in ix])
+    return x, y
+
+
+# Prepare train/val dataset
+def train_data_generator():
+    while True:
+        yield get_batch("train")
+
+
+def val_data_generator():
+    while True:
+        yield get_batch("val")
+
+
+train_data_generator = tf.data.Dataset.from_generator(
+    train_data_generator,
+    output_signature=(
+        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
+        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
+    ),
+)
+
+val_data_generator = tf.data.Dataset.from_generator(
+    val_data_generator,
+    output_signature=(
+        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
+        tf.TensorSpec(shape=(batch_size, block_size), dtype=tf.int64),
+    ),
+)
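+
+"""
+As a quick optional sanity check (not part of the original walkthrough), we can verify
+that `encode` and `decode` are inverses of each other and inspect the shape of one
+training batch. The targets `y` are simply the inputs `x` shifted one character to the
+right.
+"""
+
+# Optional sanity check of the data pipeline
+print(decode(encode("hello world")))  # should print back: hello world
+xb, yb = get_batch("train")
+print(xb.shape, yb.shape)  # both are (batch_size, block_size), i.e. (16, 32)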
+
+"""
+# Language model architecture
+"""
+
+"""
+Our language model is built from three main classes.
+
+The first class, `FeedForward`, is a simple feedforward network, a fundamental building
+block of many deep learning architectures. Its `__init__` method creates a small
+sequential model made of a dense layer with a ReLU activation, a second dense layer that
+projects back to the embedding size, and a dropout layer. The `call` method runs a
+forward pass: it takes an input tensor `x`, passes it through this sequential model, and
+returns the output tensor.
+
+The `Block` class is a single Transformer block. Its `__init__` method sets up a
+multi-head self-attention layer (`sa`), a feedforward network (`ffwd`), and two layer
+normalization layers (`ln1` and `ln2`). The `call` method runs a forward pass through
+the block: it normalizes the input, applies self-attention, and adds the result back to
+the input (a residual connection); it then normalizes again, applies the feedforward
+network, and adds that result back as well before returning the output tensor.
+Note that for language modeling we use a decoder-style attention block: a triangular
+(causal) mask makes the model autoregressive by allowing each token to communicate only
+with previous tokens. You can see the explanation
+[here](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=4454s). For this reason we set
+`use_causal_mask=True` when calling the Keras `MultiHeadAttention` layer inside the
+`Block` class.
+
+The `BigramLanguageModel` class is the full language model. It consists of embedding
+layers for token and positional embeddings, a stack of Transformer blocks, a final layer
+normalization, and a dense layer that produces the output logits. The `call` method runs
+a forward pass through the model: it applies the token and positional embeddings, passes
+the result through the Transformer blocks, and produces logits for the next token; it
+also computes the loss when targets are provided. The class additionally defines
+`train_step` and `test_step` methods for the training and evaluation loops, and a
+`generate` method for generating text from an input sequence.
+"""
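+
+"""
+To make the causal masking idea concrete, here is a small optional illustration (the
+model never builds this matrix explicitly; `use_causal_mask=True` constructs it
+internally). Row `i` of the mask marks the positions that token `i` is allowed to attend
+to: itself and everything before it.
+"""
+
+# Optional illustration: a causal (lower-triangular) attention mask for 4 tokens
+T_demo = 4
+causal_mask = tf.linalg.band_part(tf.ones((T_demo, T_demo)), -1, 0)
+print(causal_mask.numpy())
+# [[1. 0. 0. 0.]
+#  [1. 1. 0. 0.]
+#  [1. 1. 1. 0.]
+#  [1. 1. 1. 1.]]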
+
+
+# Model components
+class FeedForward(layers.Layer):
+    """A simple linear layer followed by a non-linearity"""
+
+    def __init__(self, n_embd):
+        super().__init__()
+        self.net = models.Sequential(
+            [
+                layers.Dense(4 * n_embd, activation="relu"),
+                layers.Dense(n_embd),
+                layers.Dropout(dropout),
+            ]
+        )
+
+    def call(self, x):
+        return self.net(x)
+
+
+class Block(layers.Layer):
+    """Transformer block: communication followed by computation"""
+
+    def __init__(self, n_embd, n_head):
+        super().__init__()
+        # n_embd: embedding dimension, n_head: the number of heads we'd like
+        self.sa = layers.MultiHeadAttention(
+            num_heads=n_head, key_dim=n_embd // n_head, dropout=dropout
+        )
+        self.ffwd = FeedForward(n_embd)
+        self.ln1 = layers.LayerNormalization()
+        self.ln2 = layers.LayerNormalization()
+
+    def call(self, x):
+        attn_output = self.sa(
+            self.ln1(x), self.ln1(x), use_causal_mask=True
+        )  # use a causal mask to ensure each token can only see previous tokens
+        x = x + attn_output
+        x = x + self.ffwd(self.ln2(x))
+        return x
+
+
+# Bigram Language Model
+class BigramLanguageModel(keras.Model):
+    def __init__(self):
+        super().__init__()
+        # each token directly reads off the logits for the next token from a lookup table
+        self.token_embedding_table = layers.Embedding(vocab_size, n_embd)
+        self.position_embedding_table = layers.Embedding(block_size, n_embd)
+        self.blocks = [Block(n_embd, n_head) for _ in range(n_layer)]
+        self.ln_f = layers.LayerNormalization()
+        self.lm_head = layers.Dense(vocab_size)
+
+    def call(self, idx, targets=None):
+        B, T = idx.shape
+        # idx and targets are both (B,T) tensors of integers
+        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
+        pos_emb = self.position_embedding_table(
+            tf.range(T)[tf.newaxis, :]
+        )  # (T,C) with a new leading axis, giving (1,T,C)
+        x = tok_emb + pos_emb  # (B,T,C)
+        for block in self.blocks:
+            x = block(x)  # (B,T,C)
+        x = self.ln_f(x)  # (B,T,C)
+        logits = self.lm_head(x)  # (B,T,vocab_size)
+
+        if targets is None:
+            return logits, None
+
+        logits_flat = tf.reshape(logits, [-1, logits.shape[-1]])
+        targets_flat = tf.reshape(targets, [-1])
+        loss = keras.losses.sparse_categorical_crossentropy(
+            targets_flat, logits_flat, from_logits=True
+        )
+        return logits, tf.reduce_mean(loss)
+
+    def train_step(self, data):
+        x, y = data
+        with tf.GradientTape() as tape:
+            logits, loss = self(x, y)
+        grads = tape.gradient(loss, self.trainable_variables)
+        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
+        return {"loss": loss}
+
+    def test_step(self, data):
+        x, y = data
+        logits, loss = self(x, y)
+        return {"loss": loss}
+
+    def generate(self, idx, max_new_tokens):
+        # idx is a (B, T) array of indices in the current context
+        for _ in range(max_new_tokens):
+            # crop idx to the last block_size tokens
+            idx_cond = idx[:, -block_size:]
+            # get the predictions
+            logits, _ = self(idx_cond)
+            # focus only on the last time step
+            logits = logits[:, -1, :]  # becomes (B, C)
+            # sample from the distribution
+            idx_next = tf.random.categorical(logits, num_samples=1)  # (B, 1)
+            # append sampled index to the running sequence
+            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
+        return idx
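+
+"""
+Before training, a quick optional sanity check (not part of the original walkthrough):
+run one batch through a freshly initialized, throwaway model. The logits should have
+shape `(batch_size, block_size, vocab_size)`, and the initial loss should be roughly
+`ln(vocab_size)`, which is what near-uniform predictions would give.
+"""
+
+# Optional sanity check on an untrained throwaway model
+_check_model = BigramLanguageModel()
+xb, yb = get_batch("train")
+logits, loss = _check_model(xb, yb)
+print(logits.shape)  # (16, 32, vocab_size)
+print(float(loss), np.log(vocab_size))  # the two values should be in the same ballpark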
+
+
+"""
+# Model training
+"""
+
+"""
+Now it's time to initialize the language model (`BigramLanguageModel`) and check the
+number of trainable parameters with the `count_params()` method. This gives us a sense
+of the model's complexity and the memory required to train it.
+
+After initializing the model, we compile it with the Adam optimizer and the specified
+learning rate. We only pass the optimizer here, because the loss is computed inside the
+model's own `call`, `train_step`, and `test_step` methods.
+
+Next, we train the model with the `fit` method. We pass the training dataset
+(`train_data_generator`), the number of epochs (`Epochs`), the number of training steps
+per epoch (`eval_interval`), the validation dataset (`val_data_generator`), and the
+number of validation steps (`eval_iters`). During training, the model learns to minimize
+the loss on the training data while its performance on the validation data is monitored.
+
+Finally, we plot the learning curves with Matplotlib. The curves show the training and
+validation loss as a function of the epoch. This visualization helps us judge how well
+the model is learning over time and whether it is overfitting or underfitting, so we can
+make informed decisions about training and optimization.
+"""
+
+# Initialize the model
+model = BigramLanguageModel()
+# print the number of parameters in the model
+model.build((batch_size, block_size))
+print("Number of trainable parameters:", model.count_params())
+
+# Compile the model
+model.compile(optimizer=optimizers.Adam(learning_rate))
+
+# Train the model
+Hist = model.fit(
+    train_data_generator,
+    epochs=Epochs,
+    steps_per_epoch=eval_interval,
+    validation_data=val_data_generator,
+    validation_steps=eval_iters,
+)
+
+# Plot learning curves
+plt.figure()
+plt.plot(
+    np.arange(1, Epochs + 1),
+    np.vstack((Hist.history["loss"], Hist.history["val_loss"])).T,
+)
+plt.xlabel("Epochs")
+plt.ylabel("Loss")
+plt.legend(["train_loss", "val_loss"])
+plt.show()
+
+"""
+# Generating Shakespeare-like text!
+"""
+
+"""
+And finally, we generate text with the trained model. We start the generation process
+from an initial context of zeros with shape (1, 1); token index 0 corresponds to the
+newline character, so generation effectively starts from a blank line.
+
+We then call the model's `generate` method with the maximum number of new tokens
+(`max_new_tokens`) set to 2000. The model iteratively predicts the next token from the
+current context and appends it to the sequence.
+
+Once generation is complete, we decode the generated token sequence back into
+human-readable text with the `decode` function, which maps each token index back to its
+character in the vocabulary.
+
+We then print the generated text so we can inspect the output and judge its quality.
+This demonstrates the model's ability to produce coherent, Shakespeare-like text based
+on the patterns it learned during training.
+"""
+
+# Generate text
+context = np.zeros((1, 1), dtype=np.int64)
+generated = model.generate(context, max_new_tokens=2000)
+print(decode(generated[0].numpy().tolist()))
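+
+"""
+As an optional extra (not part of the original walkthrough), you can also prime
+generation with a prompt of your own instead of a single zero token. Every character in
+the prompt must exist in the Tiny Shakespeare vocabulary.
+"""
+
+# Optional: prime generation with a custom prompt
+prompt = "ROMEO: "
+prompt_context = np.array([encode(prompt)], dtype=np.int64)
+generated_from_prompt = model.generate(prompt_context, max_new_tokens=250)
+print(decode(generated_from_prompt[0].numpy().tolist()))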