Skip to content

Commit

Permalink
discojs*,cli*: rename blockSize and maxSequenceLength to contextLength
Browse files Browse the repository at this point in the history
  • Loading branch information
JulienVig committed Dec 3, 2024
1 parent c60eab6 commit 0781b7c
Show file tree
Hide file tree
Showing 12 changed files with 41 additions and 41 deletions.
6 changes: 3 additions & 3 deletions cli/src/benchmark_gpt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,18 @@ async function main(args: Required<CLIArguments>): Promise<void> {
const config: models.GPTConfig = {
modelType: modelType as models.GPTConfig['modelType'],
maxIter: iterationsPerEpoch,
blockSize: contextLength,
lr: 0.0001,
contextLength,
}

// Load the dataset after setting the Task batch size and context length
// to make sure the dataset is batched and tokenized correctly
task.trainingInformation.batchSize = batchSize
task.trainingInformation.maxSequenceLength = contextLength
task.trainingInformation.contextLength = contextLength
const dataset = loadText('../datasets/wikitext/wiki.train.tokens')
.map(text => processing.tokenize(tokenizer, text))
.flatten()
.batch(config.blockSize + 1, 1)
.batch(config.contextLength + 1, 1)

const preprocessedDataset = dataset
.map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
Expand Down
4 changes: 2 additions & 2 deletions cli/src/train_gpt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async function main(): Promise<void> {
maxIter: 50,
evaluateEvery:50,
maxEvalBatches: 10,
blockSize: 16,
contextLength: 16,
seed
}

Expand All @@ -22,7 +22,7 @@ async function main(): Promise<void> {
const tokenDataset = new Dataset([data])
.map((text: string) => processing.tokenize(tokenizer, text))
.flatten()
.batch(config.blockSize + 1, 1)
.batch(config.contextLength + 1, 1)
.map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
.repeat()
.batch(8);
Expand Down
18 changes: 9 additions & 9 deletions discojs/src/dataset/dataset.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,29 +152,29 @@ describe("dataset", () => {

it("batch with overlap yields correct batches", async () => {
const expectedTokens = Range(0, 53).toList()
const blockSize = 4
const contextLength = 4

const parsed = new Dataset([expectedTokens])
.flatten()
.batch(blockSize + 1, 1)
.batch(contextLength + 1, 1)

// -1 because the last sequence is dropped as there is no next token label
const expectedLength = Math.ceil(expectedTokens.size / blockSize) - 1
const expectedLength = Math.ceil(expectedTokens.size / contextLength) - 1
expect(await parsed.size()).to.equal(expectedLength);

// exclude the last sequence because it has been padded
let sequences = List(await arrayFromAsync(parsed))
// we expect the last sequence to have blockSize + 1 tokens via padding
expect(sequences.last()?.size).to.equal(blockSize + 1)
// we expect the last sequence to have contextLength + 1 tokens via padding
expect(sequences.last()?.size).to.equal(contextLength + 1)
sequences = sequences.pop()
let i = 0
for await (const tokens of sequences) {
// each sequence has length blockSize + 1 (for the label)
// each sequence has length contextLength + 1 (for the label)
expect(tokens.toArray()).to.deep.equal(
expectedTokens.slice(i, i + blockSize + 1).toArray()
expectedTokens.slice(i, i + contextLength + 1).toArray()
);
// but the window should move by blockSize only
i += blockSize
// but the window should move by contextLength only
i += contextLength
}
})

Expand Down
4 changes: 2 additions & 2 deletions discojs/src/default_tasks/wikitext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ export const wikitext: TaskProvider<'text'> = {
roundDuration: 2,
batchSize: 8, // If set too high firefox raises a WebGL error
tokenizer: 'Xenova/gpt2',
maxSequenceLength: 64,
contextLength: 64,
tensorBackend: 'gpt'
}
}
},

getModel(): Promise<Model<'text'>> {
return Promise.resolve(new models.GPT({
blockSize: this.getTask().trainingInformation.maxSequenceLength,
contextLength: this.getTask().trainingInformation.contextLength,
}))
}
}
4 changes: 2 additions & 2 deletions discojs/src/models/gpt/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ type GPTModelType =

export interface GPTConfig {
lr: number
blockSize: number
contextLength: number
vocabSize?: number
modelType: GPTModelType
name?: string,
Expand Down Expand Up @@ -39,7 +39,7 @@ export const DefaultGPTConfig: Required<GPTConfig> = {
evaluate: true,
maxEvalBatches: 12,
evaluateEvery: 100,
blockSize: 128,
contextLength: 128,
vocabSize: 50257,
debug: false,
dropout: 0.2,
Expand Down
2 changes: 1 addition & 1 deletion discojs/src/models/gpt/gpt.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ describe("gpt-tfjs", function () {
maxIter: 10,
evaluateEvery: 50,
maxEvalBatches: 10,
blockSize: 8,
contextLength: 8,
seed
});
for (let i = 0; i < 5; i++)
Expand Down
8 changes: 4 additions & 4 deletions discojs/src/models/gpt/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export type GPTSerialization = {
export class GPT extends Model<"text"> {
private readonly model: GPTModel;

readonly #blockSize: number;
readonly #contextLength: number;
readonly #maxBatchCount: number;
readonly #vocabSize: number;

Expand All @@ -38,7 +38,7 @@ export class GPT extends Model<"text"> {
model.compile();
this.model = model;

this.#blockSize = partialConfig?.blockSize ?? DefaultGPTConfig.blockSize;
this.#contextLength = partialConfig?.contextLength ?? DefaultGPTConfig.contextLength;
this.#maxBatchCount = partialConfig?.maxIter ?? DefaultGPTConfig.maxIter;
this.#vocabSize = partialConfig?.vocabSize ?? DefaultGPTConfig.vocabSize;
}
Expand Down Expand Up @@ -157,7 +157,7 @@ export class GPT extends Model<"text"> {
* Generate the next token after the input sequence.
* In other words, takes an input tensor of shape (prompt length T) and returns a tensor of shape (T+1)
*
* @param token input tokens of shape (T,). T is truncated to the model's block size
* @param tokens input tokens of shape (T,). T is truncated to the model's context length
* @param config generation config: temperature, doSample, topk
* @returns the next token predicted by the model
*/
Expand All @@ -166,7 +166,7 @@ export class GPT extends Model<"text"> {
config: GenerationConfig,
): Promise<DataFormat.ModelEncoded["text"][1]> {
// slice input tokens if longer than context length
tokens = tokens.slice(-this.#blockSize);
tokens = tokens.slice(-this.#contextLength);

const input = tf.tidy(() =>
tf.tensor1d(tokens.toArray(), "int32").expandDims<tf.Tensor2D>(0),
Expand Down
14 changes: 7 additions & 7 deletions discojs/src/models/gpt/layers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ tf.serialization.registerClass(LogLayer)

type CausalSelfAttentionConfig =
ConstructorParameters<typeof tf.layers.Layer>[0]
& Record<'blockSize' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>
& Record<'contextLength' | 'nHead' | 'nEmbd' | 'dropout' | 'nLayer' | 'seed', number>

class CausalSelfAttention extends tf.layers.Layer {
static readonly className = 'CausalSelfAttention'
Expand Down Expand Up @@ -97,7 +97,7 @@ class CausalSelfAttention extends tf.layers.Layer {
// mask is a lower triangular matrix filled with 1
// calling bandPart zeroes out the upper triangular part of the all-ones matrix
// from the doc: tf.linalg.band_part(input, -1, 0) ==> Lower triangular part
this.mask = tf.linalg.bandPart(tf.ones([config.blockSize, config.blockSize]), -1, 0)
this.mask = tf.linalg.bandPart(tf.ones([config.contextLength, config.contextLength]), -1, 0)
}

override build (): void {
Expand Down Expand Up @@ -266,15 +266,15 @@ class GELU extends tf.layers.Layer {
tf.serialization.registerClass(GELU)

type MLPConfig = ConstructorParameters<typeof tf.layers.Layer>[0] &
Required<ModelSize> & Record<'blockSize' | 'residDrop' | 'nLayer' | 'seed', number>
Required<ModelSize> & Record<'contextLength' | 'residDrop' | 'nLayer' | 'seed', number>

function MLP(config: MLPConfig): tf.LayersModel {
return tf.sequential({ layers: [
tf.layers.dense({
name: config.name + `.mlp.c_fc`,
units: 4 * config.nEmbd,
inputDim: config.nEmbd,
inputShape: [config.blockSize, config.nEmbd],
inputShape: [config.contextLength, config.nEmbd],
kernelInitializer: tf.initializers.randomNormal({
mean: 0, stddev: 0.02, seed: config.seed
}),
Expand All @@ -284,7 +284,7 @@ function MLP(config: MLPConfig): tf.LayersModel {
name: config.name + '.mlp.c_proj',
units: config.nEmbd,
inputDim: 4 * config.nEmbd,
inputShape: [config.blockSize, 4 * config.nEmbd],
inputShape: [config.contextLength, 4 * config.nEmbd],
kernelInitializer: tf.initializers.randomNormal({
mean: 0, stddev: 0.02 * Math.sqrt(2 * config.nLayer), seed: config.seed
}),
Expand All @@ -306,7 +306,7 @@ type BlockConfig = CausalSelfAttentionConfig & MLPConfig & { debug: boolean }
*/
function TransformerBlock (conf: BlockConfig): tf.LayersModel {
const config = Object.assign({ name: '.h' }, conf)
const inputs = tf.input({ shape: [config.blockSize, config.nEmbd] })
const inputs = tf.input({ shape: [config.contextLength, config.nEmbd] })
let x1, x2
// input normalization
x1 = tf.layers.layerNormalization({
Expand Down Expand Up @@ -469,7 +469,7 @@ export function GPTArchitecture(config: Required<GPTConfig>): tf.LayersModel {
const range = new Range({}).apply(inputs)
let posEmb = tf.layers.embedding({
name: config.name + '.wpe',
inputDim: config.blockSize,
inputDim: config.contextLength,
outputDim: config.nEmbd,
embeddingsInitializer: tf.initializers.randomNormal({
mean: 0, stddev: 0.02, seed: config.seed
Expand Down
8 changes: 4 additions & 4 deletions discojs/src/processing/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ export async function preprocess<D extends DataType>(
// cast as typescript doesn't reduce generic type
const d = dataset as Dataset<DataFormat.Raw["text"]>;
const t = task as Task<"text">;
const blockSize = task.trainingInformation.maxSequenceLength
const contextLength = task.trainingInformation.contextLength

const tokenizer = await models.getTaskTokenizer(t);
return d.map(text => processing.tokenize(tokenizer, text))
.flatten()
.batch(blockSize + 1, 1)
.batch(contextLength + 1, 1)
.map((tokens) => [tokens.pop(), tokens.last()]) as
Dataset<DataFormat.ModelEncoded[D]>;
}
Expand Down Expand Up @@ -97,12 +97,12 @@ export async function preprocessWithoutLabel<D extends DataType>(
// cast as typescript doesn't reduce generic type
const d = dataset as Dataset<DataFormat.Raw["text"]>;
const t = task as Task<"text">;
const blockSize = task.trainingInformation.maxSequenceLength
const contextLength = task.trainingInformation.contextLength
const tokenizer = await models.getTaskTokenizer(t);

return d.map(text => processing.tokenize(tokenizer, text))
.flatten()
.batch(blockSize)
.batch(contextLength)
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion discojs/src/serialization/model.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ describe('serialization', () => {
maxIter: 10,
evaluateEvery:10,
maxEvalBatches: 10,
blockSize: 8,
contextLength: 8,
}
const model = new models.GPT(config)

Expand Down
10 changes: 5 additions & 5 deletions discojs/src/task/training_information.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ interface DataTypeToTrainingInformation {
// When the tokenizer is first called, the actual object will be initialized and loaded into this field for the subsequent tokenizations.
tokenizer: string | PreTrainedTokenizer;

// maxSequenceLength: the maximum length of a input string used as input to a GPT model. It is used during preprocessing to
// contextLength: the maximum length of an input string used as input to a GPT model. It is used during preprocessing to
// truncate strings to a maximum length. The default value is tokenizer.model_max_length
maxSequenceLength: number;
contextLength: number;
};
}

Expand Down Expand Up @@ -224,7 +224,7 @@ export function isTrainingInformation(
}
case "text": {
const {
maxSequenceLength,
contextLength,
tokenizer,
}: Partial<
Omit<TrainingInformation<"text">,
Expand All @@ -234,14 +234,14 @@ export function isTrainingInformation(
if (
(typeof tokenizer !== "string" &&
!(tokenizer instanceof PreTrainedTokenizer)) ||
(typeof maxSequenceLength !== "number")
(typeof contextLength !== "number")
)
return false;

const _: TrainingInformation<"text"> = {
...repack,
dataType,
maxSequenceLength,
contextLength,
tokenizer,
} satisfies Record<keyof TrainingInformation<"text">, unknown>;

Expand Down
2 changes: 1 addition & 1 deletion webapp/src/components/testing/__tests__/Testing.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ const TASK: Task<"text"> = {
batchSize: 1,
roundDuration: 1,
validationSplit: 0,
maxSequenceLength: 64,
contextLength: 64,
},
};

Expand Down

0 comments on commit 0781b7c

Please sign in to comment.