diff --git a/cli/src/benchmark_gpt.ts b/cli/src/benchmark_gpt.ts index 7b0aafbbc..5a30414f4 100644 --- a/cli/src/benchmark_gpt.ts +++ b/cli/src/benchmark_gpt.ts @@ -69,18 +69,18 @@ async function main(args: Required): Promise { const config: models.GPTConfig = { modelType: modelType as models.GPTConfig['modelType'], maxIter: iterationsPerEpoch, - blockSize: contextLength, lr: 0.0001, + contextLength, } // Load the dataset after setting the Task batch size and max sequence length // to make sure the dataset is batched and tokenized correctly task.trainingInformation.batchSize = batchSize - task.trainingInformation.maxSequenceLength = contextLength + task.trainingInformation.contextLength = contextLength const dataset = loadText('../datasets/wikitext/wiki.train.tokens') .map(text => processing.tokenize(tokenizer, text)) .flatten() - .batch(config.blockSize + 1, 1) + .batch(config.contextLength + 1, 1) const preprocessedDataset = dataset .map((tokens) => [tokens.pop(), tokens.last()] as [List, number]) diff --git a/cli/src/train_gpt.ts b/cli/src/train_gpt.ts index cc1c3dea9..dc75e7327 100644 --- a/cli/src/train_gpt.ts +++ b/cli/src/train_gpt.ts @@ -13,7 +13,7 @@ async function main(): Promise { maxIter: 50, evaluateEvery:50, maxEvalBatches: 10, - blockSize: 16, + contextLength: 16, seed } @@ -22,7 +22,7 @@ async function main(): Promise { const tokenDataset = new Dataset([data]) .map((text: string) => processing.tokenize(tokenizer, text)) .flatten() - .batch(config.blockSize + 1, 1) + .batch(config.contextLength + 1, 1) .map((tokens) => [tokens.pop(), tokens.last()] as [List, number]) .repeat() .batch(8); diff --git a/discojs/src/dataset/dataset.spec.ts b/discojs/src/dataset/dataset.spec.ts index 2f84b3190..1602b4fc3 100644 --- a/discojs/src/dataset/dataset.spec.ts +++ b/discojs/src/dataset/dataset.spec.ts @@ -152,29 +152,29 @@ describe("dataset", () => { it("batch with overlap yields correct batches", async () => { const expectedTokens = Range(0, 53).toList() - const blockSize = 4 + const contextLength = 4 const parsed = new Dataset([expectedTokens]) .flatten() - .batch(blockSize + 1, 1) + .batch(contextLength + 1, 1) // -1 because the last sequence is dropped as there is no next token label - const expectedLength = Math.ceil(expectedTokens.size / blockSize) - 1 + const expectedLength = Math.ceil(expectedTokens.size / contextLength) - 1 expect(await parsed.size()).to.equal(expectedLength); // exclude the last sequence because it has been padded let sequences = List(await arrayFromAsync(parsed)) - // we expect the last sequence to have blockSize + 1 tokens via padding - expect(sequences.last()?.size).to.equal(blockSize + 1) + // we expect the last sequence to have contextLength + 1 tokens via padding + expect(sequences.last()?.size).to.equal(contextLength + 1) sequences = sequences.pop() let i = 0 for await (const tokens of sequences) { - // each sequence has length blockSize + 1 (for the label) + // each sequence has length contextLength + 1 (for the label) expect(tokens.toArray()).to.deep.equal( - expectedTokens.slice(i, i + blockSize + 1).toArray() + expectedTokens.slice(i, i + contextLength + 1).toArray() ); - // but the window should move by blockSize only - i += blockSize + // but the window should move by contextLength only + i += contextLength } }) diff --git a/discojs/src/default_tasks/wikitext.ts b/discojs/src/default_tasks/wikitext.ts index 86982f975..6b1dbadd7 100644 --- a/discojs/src/default_tasks/wikitext.ts +++ b/discojs/src/default_tasks/wikitext.ts @@ -35,7 +35,7 @@ export const wikitext: TaskProvider<'text'> = { roundDuration: 2, batchSize: 8, // If set too high firefox raises a WebGL error tokenizer: 'Xenova/gpt2', - maxSequenceLength: 64, + contextLength: 64, tensorBackend: 'gpt' } } @@ -43,7 +43,7 @@ export const wikitext: TaskProvider<'text'> = { getModel(): Promise> { return Promise.resolve(new models.GPT({ - blockSize: this.getTask().trainingInformation.maxSequenceLength, + contextLength: this.getTask().trainingInformation.contextLength, })) } } diff --git a/discojs/src/models/gpt/config.ts b/discojs/src/models/gpt/config.ts index a29b38789..05cb07563 100644 --- a/discojs/src/models/gpt/config.ts +++ b/discojs/src/models/gpt/config.ts @@ -9,7 +9,7 @@ type GPTModelType = export interface GPTConfig { lr: number - blockSize: number + contextLength: number vocabSize?: number modelType: GPTModelType name?: string, @@ -39,7 +39,7 @@ export const DefaultGPTConfig: Required = { evaluate: true, maxEvalBatches: 12, evaluateEvery: 100, - blockSize: 128, + contextLength: 128, vocabSize: 50257, debug: false, dropout: 0.2, diff --git a/discojs/src/models/gpt/gpt.spec.ts b/discojs/src/models/gpt/gpt.spec.ts index de0a491c2..d97a26351 100644 --- a/discojs/src/models/gpt/gpt.spec.ts +++ b/discojs/src/models/gpt/gpt.spec.ts @@ -25,7 +25,7 @@ describe("gpt-tfjs", function () { maxIter: 10, evaluateEvery: 50, maxEvalBatches: 10, - blockSize: 8, + contextLength: 8, seed }); for (let i = 0; i < 5; i++) diff --git a/discojs/src/models/gpt/index.ts b/discojs/src/models/gpt/index.ts index e3e94c572..2eb02d4fe 100644 --- a/discojs/src/models/gpt/index.ts +++ b/discojs/src/models/gpt/index.ts @@ -27,7 +27,7 @@ export type GPTSerialization = { export class GPT extends Model<"text"> { private readonly model: GPTModel; - readonly #blockSize: number; + readonly #contextLength: number; readonly #maxBatchCount: number; readonly #vocabSize: number; @@ -38,7 +38,7 @@ export class GPT extends Model<"text"> { model.compile(); this.model = model; - this.#blockSize = partialConfig?.blockSize ?? DefaultGPTConfig.blockSize; + this.#contextLength = partialConfig?.contextLength ?? DefaultGPTConfig.contextLength; this.#maxBatchCount = partialConfig?.maxIter ?? DefaultGPTConfig.maxIter; this.#vocabSize = partialConfig?.vocabSize ?? DefaultGPTConfig.vocabSize; } @@ -157,7 +157,7 @@ export class GPT extends Model<"text"> { * Generate the next token after the input sequence. * In other words, takes an input tensor of shape (prompt length T) and returns a tensor of shape (T+1) * - * @param token input tokens of shape (T,). T is truncated to the model's block size + * @param token input tokens of shape (T,). T is truncated to the model's context length * @param config generation config: temperature, doSample, topk * @returns the next token predicted by the model */ @@ -166,7 +166,7 @@ export class GPT extends Model<"text"> { config: GenerationConfig, ): Promise { // slice input tokens if longer than context length - tokens = tokens.slice(-this.#blockSize); + tokens = tokens.slice(-this.#contextLength); const input = tf.tidy(() => tf.tensor1d(tokens.toArray(), "int32").expandDims(0), diff --git a/discojs/src/models/gpt/layers.ts b/discojs/src/models/gpt/layers.ts index 4906920aa..49b8e72f7 100644 --- a/discojs/src/models/gpt/layers.ts +++ b/discojs/src/models/gpt/layers.ts @@ -67,7 +67,7 @@ tf.serialization.registerClass(LogLayer) type CausalSelfAttentionConfig = ConstructorParameters[0] - & Record<'blockSize' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number> + & Record<'contextLength' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number> class CausalSelfAttention extends tf.layers.Layer { static readonly className = 'CausalSelfAttention' @@ -97,7 +97,7 @@ class CausalSelfAttention extends tf.layers.Layer { // mask is a lower triangular matrix filled with 1 // calling bandPart zero out the upper triangular part of the all-ones matrix // from the doc: tf.linalg.band_part(input, -1, 0) ==> Lower triangular part - this.mask = tf.linalg.bandPart(tf.ones([config.blockSize, config.blockSize]), -1, 0) + this.mask = tf.linalg.bandPart(tf.ones([config.contextLength, config.contextLength]), -1, 0) } override build (): void { @@ -266,7 +266,7 @@ class GELU extends tf.layers.Layer { tf.serialization.registerClass(GELU) type MLPConfig = ConstructorParameters[0] & - Required & Record<'blockSize' | 'residDrop' | 'nLayer' | 'seed', number> + Required & Record<'contextLength' | 'residDrop' | 'nLayer' | 'seed', number> function MLP(config: MLPConfig): tf.LayersModel { return tf.sequential({ layers: [ @@ -274,7 +274,7 @@ function MLP(config: MLPConfig): tf.LayersModel { name: config.name + `.mlp.c_fc`, units: 4 * config.nEmbd, inputDim: config.nEmbd, - inputShape: [config.blockSize, config.nEmbd], + inputShape: [config.contextLength, config.nEmbd], kernelInitializer: tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: config.seed }), @@ -284,7 +284,7 @@ function MLP(config: MLPConfig): tf.LayersModel { name: config.name + '.mlp.c_proj', units: config.nEmbd, inputDim: 4 * config.nEmbd, - inputShape: [config.blockSize, 4 * config.nEmbd], + inputShape: [config.contextLength, 4 * config.nEmbd], kernelInitializer: tf.initializers.randomNormal({ mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed }), @@ -306,7 +306,7 @@ type BlockConfig = CausalSelfAttentionConfig & MLPConfig & { debug: boolean } */ function TransformerBlock (conf: BlockConfig): tf.LayersModel { const config = Object.assign({ name: '.h' }, conf) - const inputs = tf.input({ shape: [config.blockSize, config.nEmbd] }) + const inputs = tf.input({ shape: [config.contextLength, config.nEmbd] }) let x1, x2 // input normalization x1 = tf.layers.layerNormalization({ @@ -469,7 +469,7 @@ export function GPTArchitecture(config: Required): tf.LayersModel { const range = new Range({}).apply(inputs) let posEmb = tf.layers.embedding({ name: config.name + '.wpe', - inputDim: config.blockSize, + inputDim: config.contextLength, outputDim: config.nEmbd, embeddingsInitializer: tf.initializers.randomNormal({ mean: 0, stddev: 0.02, seed: config.seed diff --git a/discojs/src/processing/index.ts b/discojs/src/processing/index.ts index dcccf48d9..9cc0e6268 100644 --- a/discojs/src/processing/index.ts +++ b/discojs/src/processing/index.ts @@ -56,12 +56,12 @@ export async function preprocess( // cast as typescript doesn't reduce generic type const d = dataset as Dataset; const t = task as Task<"text">; - const blockSize = task.trainingInformation.maxSequenceLength + const contextLength = task.trainingInformation.contextLength const tokenizer = await models.getTaskTokenizer(t); return d.map(text => processing.tokenize(tokenizer, text)) .flatten() - .batch(blockSize + 1, 1) + .batch(contextLength + 1, 1) .map((tokens) => [tokens.pop(), tokens.last()]) as Dataset; } @@ -97,12 +97,12 @@ export async function preprocessWithoutLabel( // cast as typescript doesn't reduce generic type const d = dataset as Dataset; const t = task as Task<"text">; - const blockSize = task.trainingInformation.maxSequenceLength + const contextLength = task.trainingInformation.contextLength const tokenizer = await models.getTaskTokenizer(t); return d.map(text => processing.tokenize(tokenizer, text)) .flatten() - .batch(blockSize) + .batch(contextLength) } } } diff --git a/discojs/src/serialization/model.spec.ts b/discojs/src/serialization/model.spec.ts index e9b983202..a966d39db 100644 --- a/discojs/src/serialization/model.spec.ts +++ b/discojs/src/serialization/model.spec.ts @@ -51,7 +51,7 @@ describe('serialization', () => { maxIter: 10, evaluateEvery:10, maxEvalBatches: 10, - blockSize: 8, + contextLength: 8, } const model = new models.GPT(config) diff --git a/discojs/src/task/training_information.ts b/discojs/src/task/training_information.ts index 79b0ef458..1b60c9df9 100644 --- a/discojs/src/task/training_information.ts +++ b/discojs/src/task/training_information.ts @@ -65,9 +65,9 @@ interface DataTypeToTrainingInformation { // When the tokenizer is first called, the actual object will be initialized and loaded into this field for the subsequent tokenizations. tokenizer: string | PreTrainedTokenizer; - // maxSequenceLength: the maximum length of a input string used as input to a GPT model. It is used during preprocessing to + // contextLength: the maximum length of a input string used as input to a GPT model. It is used during preprocessing to // truncate strings to a maximum length. The default value is tokenizer.model_max_length - maxSequenceLength: number; + contextLength: number; }; } @@ -224,7 +224,7 @@ export function isTrainingInformation( } case "text": { const { - maxSequenceLength, + contextLength, tokenizer, }: Partial< Omit, @@ -234,14 +234,14 @@ export function isTrainingInformation( if ( (typeof tokenizer !== "string" && !(tokenizer instanceof PreTrainedTokenizer)) || - (typeof maxSequenceLength !== "number") + (typeof contextLength !== "number") ) return false; const _: TrainingInformation<"text"> = { ...repack, dataType, - maxSequenceLength, + contextLength, tokenizer, } satisfies Record, unknown>; diff --git a/webapp/src/components/testing/__tests__/Testing.spec.ts b/webapp/src/components/testing/__tests__/Testing.spec.ts index d37deb30e..a0cfdc368 100644 --- a/webapp/src/components/testing/__tests__/Testing.spec.ts +++ b/webapp/src/components/testing/__tests__/Testing.spec.ts @@ -28,7 +28,7 @@ const TASK: Task<"text"> = { batchSize: 1, roundDuration: 1, validationSplit: 0, - maxSequenceLength: 64, + contextLength: 64, }, };