diff --git a/README.md b/README.md index b474f1a..3b0b665 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ The command line arguments means : - **`mode`**: Quantization techniques to apply. You can use `fp8`, `int8`. - **`pedantic-level`**: Represent to accuracy-latency trade-off. Higher pedantic level ensure a more accurate representaition of the model, but increase the quantization processing time. Defaults to 1. - **`device`**: Device to run the quantization process. Defaults to "cuda:0". -- **`offload`**: When enabled, this option significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to true. +- **`offload`**: When enabled, this option significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to False. ## Example: Run FP8 quantization with Meta-Llama-3-8B-Instruct ```bash diff --git a/pyproject.toml b/pyproject.toml index 4daf25e..0fca126 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "friendli-model-optimizer" -version = "0.5.0" +version = "0.5.1" authors = [ { name = "FriendliAI teams", email = "eng@friendli.ai" }, ] diff --git a/src/fmo/main.py b/src/fmo/main.py index 9ac8b04..703d04a 100644 --- a/src/fmo/main.py +++ b/src/fmo/main.py @@ -79,7 +79,7 @@ def quantize( False, "--offload", help=( - "When enabled, significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to true." + "When enabled, significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to False." ), ), seed: Optional[int] = typer.Option( @@ -138,7 +138,9 @@ def quantize( os.mkdir(output_dir) dataset = safe_load_datasets( - dataset_name_or_path=dataset_name_or_path, split_name=dataset_split_name + dataset_name_or_path=dataset_name_or_path, + split_name=dataset_split_name, + cache_dir=cache_dir, ) tokenizer = get_tokenizer( model_name_or_path=model_name_or_path, cache_dir=cache_dir diff --git a/src/fmo/utils/dataset.py b/src/fmo/utils/dataset.py index 8519c45..0542f58 100644 --- a/src/fmo/utils/dataset.py +++ b/src/fmo/utils/dataset.py @@ -17,6 +17,7 @@ def safe_load_datasets( dataset_name_or_path: str, split_name: Optional[str], + cache_dir: Optional[str] = None, ) -> datasets.Dataset: """Load dataset from calibration dataset config.""" try: @@ -28,11 +29,13 @@ def safe_load_datasets( else: data_name_parts = dataset_name_or_path.split(":") if len(data_name_parts) == 1: - dataset = datasets.load_dataset(dataset_name_or_path, split=split_name) + dataset = datasets.load_dataset( + dataset_name_or_path, split=split_name, cache_dir=cache_dir + ) elif len(data_name_parts) == 2: data_name, subset_name = data_name_parts dataset = datasets.load_dataset( - data_name, subset_name, split=split_name + data_name, subset_name, split=split_name, cache_dir=cache_dir ) else: raise QuantizationError(