From 8d6a8793d526bd176b09f4e2874b98e12e0ab731 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
Date: Mon, 9 Dec 2024 19:33:40 +0000
Subject: [PATCH] update doc

---
 docs/source/stepwise_reward_trainer.mdx | 135 +++++++++++++++---
 1 file changed, 83 insertions(+), 52 deletions(-)

diff --git a/docs/source/stepwise_reward_trainer.mdx b/docs/source/stepwise_reward_trainer.mdx
index ef8feef386..1e44e67036 100644
--- a/docs/source/stepwise_reward_trainer.mdx
+++ b/docs/source/stepwise_reward_trainer.mdx
@@ -10,71 +10,102 @@ The abstract from the paper is the following:
 
 This post-training method was contributed by [Gaetan Lopez](https://github.com/gaetanlop), [Lewis Tunstall](https://huggingface.co/lewtun), [Quentin Gallouédec](https://huggingface.co/qgallouedec) and [Agustín Piqueres](https://huggingface.co/plaguss).
 
-## Usage tips
-
-The [`StepwiseRewardTrainer`] is a wrapper around the [`Trainer`] class. It needs two parameters to be set via the [`StepwiseRewardConfig`], namely:
-* `max_length`: controls the maximum length of the sequences, where a sequence is composed of the prompt and the concatenation of each completion step.
-* `step_separator`: indicates the separator used to separate each step of the reasoning process. By default, it is set to `"\n"`.
+## Quick start
 
-The basic API is as follows:
+This example demonstrates how to train a model using the PRM (process reward model) method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B) as the base model and the stepwise supervision data from the [Math Shepherd dataset](https://huggingface.co/datasets/trl-lib/math_shepherd), which you can browse on its Hub page.
+
+Below is the script to train the model:
 
 ```python
-from datasets import Dataset
+# train_prm.py
+from datasets import load_dataset
+from trl import StepwiseRewardConfig, StepwiseRewardTrainer
 from transformers import AutoModelForTokenClassification, AutoTokenizer
-from trl import StepwiseRewardTrainer, StepwiseRewardConfig
-
-
-NUM_DUMMY_SAMPLES = 100
-
-model = AutoModelForTokenClassification.from_pretrained("Qwen/Qwen2-0.5B-Instruct", num_labels=2)
-
-train_dataset = Dataset.from_dict(
-    {
-        "prompt": [
-            "Which number has a larger absolute value, -13.1 or 7.0?",
-        ],
-        "completions": [
-            ["The absolute value of -13.1 is 13.1.","The absolute value of 7.0 is 7.0", "7.0 is larger than 13.1.", "Hence, in absolute value, 7.0 is larger than -13.1."]
-        ]
-        "labels": [
-            [True, True, False, False]
-        ]
-        * NUM_DUMMY_SAMPLES
-    }
-)
-eval_dataset = Dataset.from_dict(
-    {
-        "prompt": [
-            "Is 19 divisible by 6?",
-        ],
-        "completion": [
-            ["Dividing 19 by 6 gives a remainder of 1.", "A number is divisible by another number if the division results in no remainder.", "Hence, 19 is not divisible by 6."]
-        ]
-        "labels": [
-            [True, True, True]
-        ]
-        * NUM_DUMMY_SAMPLES
-    }
-)
-
-config = StepwiseRewardConfig(output_dir="stepwise-reward-model", per_device_train_batch_size=1, max_length=512, step_separator="\n")
-trainer = StepwiseRewardTrainer(
-    model=model,
-    args=training_args,
-    tokenizer=tokenizer,
-    train_dataset=dataset,
-)
+model = AutoModelForTokenClassification.from_pretrained("Qwen/Qwen2-0.5B", num_labels=2)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
+train_dataset = load_dataset("trl-lib/math_shepherd", split="train[:10%]")
+
+training_args = StepwiseRewardConfig(output_dir="Qwen2-0.5B-Reward-Math-Sheperd", logging_steps=10)
+trainer = StepwiseRewardTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
 trainer.train()
 ```
 
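+The script above keeps the default sequence handling of [`StepwiseRewardConfig`]. If your data delimits reasoning steps differently, or if you want to cap the length of the tokenized prompt plus concatenated steps, the config exposes `step_separator` (default `"\n"`) and `max_length` for this. Below is a minimal sketch; the values shown are only illustrative:
+
+```python
+from trl import StepwiseRewardConfig
+
+# Illustrative values: max_length caps the tokenized prompt + concatenated steps,
+# and step_separator must match the separator used later when scoring steps.
+training_args = StepwiseRewardConfig(
+    output_dir="Qwen2-0.5B-Reward-Math-Sheperd",
+    logging_steps=10,
+    max_length=512,
+    step_separator="\n",
+)
+```
+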
-## Expected dataset format
+Execute the script using the following command:
+
+```bash
+accelerate launch train_prm.py
+```
+
+When distributed across 8 GPUs, training takes approximately 1 hour.
+
+To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward-Math-Sheperd) performs, you can use the following script:
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("token-classification", model="trl-lib/Qwen2-0.5B-Reward-Math-Sheperd")
+example = {
+    "prompt": "Musa is the class teacher of a class of 45 students. He wants to split them into three groups by age. If a third of the class is under 11 years, and two-fifths are above 11 but under 13, how many students will be in the third group (13 years and above)?",
+    "completions": [
+        "Step 1: A third of the class is under 11 years because 11 - 1/3 = <<11-1/3=7>>7.",
+        "Step 2: Two-fifths of the class are above 11 but under 13 because 2/5 * 11 = <<2/5*11=8>>8.",
+        "Step 3: There are 45 students, so the third group will have 45 - 7 - 8 = <<45-7-8=20>>20 students. The answer is: 20",
+    ],
+    "labels": [True, False, False],
+}
+
+separator = "\n"  # It's important to use the same separator as the one used during training
+
+for idx in range(1, len(example["completions"]) + 1):
+    steps = example["completions"][0:idx]
+    text = separator.join((example["prompt"], *steps)) + separator  # Add a separator after the prompt and after each step
+    pred_entity = pipe(text)[-1]["entity"]
+    pred = {"LABEL_0": False, "LABEL_1": True}[pred_entity]
+    label = example["labels"][idx - 1]
+    print(f"Step {idx}\tPredicted: {pred} \tLabel: {label}")
+```
+
+```
+Step 1 Predicted: True  Label: True
+Step 2 Predicted: False Label: False
+Step 3 Predicted: False Label: False
+```
+
+It's a win!
 
-The dataset should be formatted as a [Stepwise Supervision](dataset_formats#stepwise-supervision) dataset, which implies that it should contain the following columns: `prompt`, `completions` and `labels`, where `completions` contains a list of reasoning steps and `labels` a list of booleans or floats indicating the correctness of each step.
+## Expected dataset type
+
+The dataset should be formatted as a [Stepwise supervision](dataset_formats#stepwise-supervision) dataset, which implies that it should contain the following columns: `prompt`, `completions` and `labels`, where `completions` contains a list of reasoning steps and `labels` a list of booleans or floats indicating the correctness of each step.
 
 The [`StepwiseRewardTrainer`] only supports [standard](dataset_formats#standard) dataset format.
 
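+For illustration, a single sample with this structure could look like the following sketch (the prompt, steps, and labels are toy values, not taken from the Math Shepherd dataset):
+
+```python
+# One stepwise-supervision sample: a prompt, a list of reasoning steps,
+# and one correctness label per step.
+sample = {
+    "prompt": "Which number has a larger absolute value, -13.1 or 7.0?",
+    "completions": [
+        "The absolute value of -13.1 is 13.1.",
+        "The absolute value of 7.0 is 7.0.",
+        "7.0 is larger than 13.1.",  # incorrect step, labeled False below
+        "Hence, in absolute value, 7.0 is larger than -13.1.",
+    ],
+    "labels": [True, True, False, False],
+}
+```
+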
-You can also use a pretokenized dataset, in which case the dataset should contain the following columns: `input_ids`, `attention_mask` and `labels`.
+## Example script
+
+We provide an example script to train a model using the stepwise reward modeling method. The script is available in [`examples/scripts/stepwise_reward_trainer.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/stepwise_reward_trainer.py).
+
+To use the stepwise reward modeling script with the [Qwen2 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B) on the [Math Shepherd dataset](https://huggingface.co/datasets/trl-lib/math_shepherd), run the following command:
+
+```bash
+accelerate launch examples/scripts/stepwise_reward_modeling.py \
+    --model_name_or_path Qwen/Qwen2-0.5B \
+    --dataset_name trl-lib/math_shepherd \
+    --num_train_epochs 1 \
+    --logging_steps 2 \
+    --output_dir Qwen2-0.5B-Reward-Math-Sheperd
+```
 
 ## StepwiseRewardTrainer