Adding set values for input / output
TheQuantumFractal committed Oct 21, 2024
1 parent d4df28b commit 48341cf
Showing 2 changed files with 12 additions and 4 deletions.
config/gpt2_small_fast_supervised.yaml (2 additions, 0 deletions)
@@ -16,6 +16,8 @@ supervised_data:
   validation_urls:
     - "gs://marin-us-central2/benchmarks/mmlu/mmlu-*-dev-evaluation.jsonl.gz"
   cache_dir: "gs://marin-us-central2/benchmarks/tokenized-gpt2/mmlu/"
+  input_field: "input"
+  output_field: "output"
 model:
   type: gpt2
   hidden_dim: 768
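With these settings, each record in the referenced JSONL files is expected to carry the prompt under the "input" key and the completion under the "output" key. A minimal illustrative record (an assumed example, not taken from the MMLU files) would look like:

{"input": "Question: What is the capital of France?\nAnswer:", "output": " Paris"}

Datasets that use other key names can now point input_field / output_field at them instead of renaming the data.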
src/levanter/data/text.py (10 additions, 4 deletions)
@@ -570,13 +570,16 @@ class LMSupervisedDatasetConfig:
     """tags for the dataset. Typically the name of the dataset in the config will be added as a tag as well"""
     name: Optional[str] = None  # name for hf dataset
 
+    input_field: str = "prompt"  # name of the input field in the jsonl file
+    output_field: str = "response"  # name of the output field in the jsonl file
+
     validation_urls: List[str] = ()  # type:ignore
 
 
-def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase):
-    sources = [example["input"] for example in batch]
+def preprocess_supervised_example(batch, tokenizer: PreTrainedTokenizerBase, input_field: str, output_field: str) -> dict:
+    sources = [example[input_field] for example in batch]
 
-    targets = [f"{example['output']}" for example in batch]
+    targets = [f"{example[output_field]}" for example in batch]
     # TODO: this seems pretty wasteful since you end up tokenizing twice, but it's how alpaca does it.
     examples = [s + t for s, t in zip(sources, targets)]
     sources_tokenized = tokenizer(sources, padding=False, truncation=True)
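As a quick illustration of the new signature (a sketch, not part of the commit; the tokenizer choice and record contents are assumptions):

from transformers import AutoTokenizer

from levanter.data.text import preprocess_supervised_example

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Records keyed by "prompt"/"response", matching the new config defaults.
batch = [
    {"prompt": "Translate to French: cat ->", "response": " chat"},
    {"prompt": "Translate to French: dog ->", "response": " chien"},
]

# The field names are now explicit arguments instead of hardcoded keys.
processed = preprocess_supervised_example(batch, tokenizer, "prompt", "response")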
@@ -623,9 +626,12 @@ def mk_supervised_dataset(config: LMSupervisedDatasetConfig, tokenizer: PreTrainedTokenizerBase):
     validation_urls = [url for url_pat in config.validation_urls for url in fsspec_expand_glob(url_pat)]
     dataset = levanter.data.datasource_from_jsonl(validation_urls)
 
+    input_field = config.input_field
+    output_field = config.output_field
+
     output_exemplar = {"input_ids": np.zeros((0,), dtype=np.int32), "sources_len": np.zeros((), dtype=np.int32)}
 
-    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
+    dataset = dataset.map_batches(lambda ex: preprocess_supervised_example(ex, tokenizer, input_field, output_field), batch_size=128, num_cpus=num_cpus_used_by_tokenizer(tokenizer), output_exemplar=output_exemplar)  # type: ignore
     dataset = dataset.build_or_load_cache(config.cache_dir, await_finished=True)  # type: ignore
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
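End to end, the YAML above maps onto this config class. A minimal usage sketch, assuming the config is a standard dataclass with the fields shown in the diff (the bucket paths are hypothetical placeholders):

from transformers import AutoTokenizer

from levanter.data.text import LMSupervisedDatasetConfig, mk_supervised_dataset

config = LMSupervisedDatasetConfig(
    validation_urls=["gs://my-bucket/evals/*.jsonl.gz"],  # hypothetical bucket
    cache_dir="gs://my-bucket/tokenized/evals/",  # hypothetical cache location
    input_field="input",  # overrides the "prompt" default
    output_field="output",  # overrides the "response" default
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
dataset = mk_supervised_dataset(config, tokenizer)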
