diff --git a/README.md b/README.md
index fe5d5f8..b474f1a 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,15 @@ The optimizations improve generative AI serving performance without compromising
 
 FMO is designed to work with Hugging Face pretrained models, which can be loaded using ['PreTrainedModel.from_pretrained()'](https://huggingface.co/docs/transformers/en/main_classes/model#transformers.PreTrainedModel.from_pretrained).
 
+FMO offers a pedantic level setting, which controls the trade-off between accuracy and processing time. Higher pedantic levels provide a more accurate model but can increase the time required to generate quantized models, and may sometimes slow down inference. Lower pedantic levels allow for faster quantization, though they may reduce model accuracy. Each quantization mode supports a different range of pedantic levels.
+
 > [!NOTE]
 > The list of Hugging Face model architectures that can be optimized with FMO is specified in [Supported Features & Model Architecture](#supported-features--model-architecture).
 
 > [!NOTE]
 > Currently, FMO supports Python3.8 to Python3.11.
 
+
 # Table of Contents
 - [Quick Installation](#quick-installation)
 - [Supported Features & Model Architecture](#supported-features--model-architecture)
@@ -57,6 +60,8 @@ FP8 is an 8-bit floating-point format that offers a higher dynamic range than IN
 making it better suited for quantizing both weights and activations.
 This leads to increased throughput and reduced latency while maintaining high output quality with minimal degradation.
 
+FP8 supports pedantic levels 0-2. Defaults to 1.
+
 > [!IMPORTANT]
 > FP8 is only supported by NVIDIA Ada, Hopper, and Blackwell GPU architectures.
 
@@ -74,6 +79,9 @@ This leads to increased throughput and reduced latency while maintaining high ou
 - `ArcticForCausalLM`
 - `MixtralForCausalLM`
 
+> [!NOTE]
+> Currently, `Phi3ForCausalLM`, `MptForCausalLM`, `ArcticForCausalLM`, and `MixtralForCausalLM` only support pedantic level 0.
+> Please add `--pedantic-level 0` to the command line.
 
 ## INT8
 
@@ -81,6 +89,8 @@ INT8 Quantization represents weights and activations using the INT8 format with
 Friendli Engine enables dynamic activation scaling, where scales are computed on the fly during runtime.
 Thus, FMO only quantizes model weights, and Friendli Engine will load the quantized weights.
 
+INT8 supports pedantic levels 0-1. Defaults to 1.
+
 ### Supported Model Architectures for INT8 Quantization
 - `LlamaForCausalLM`
 - `MistralForcausalLM`
@@ -96,6 +106,7 @@ fmo quantize \
 --model-name-or-path $MODEL_NAME_OR_PATH \
 --output-dir $OUTPUT_DIR \
 --mode $QUANTIZATION_SCHEME \
+--pedantic-level $PEDANTIC_LEVEL \
 --device $DEVICE \
 --offload
 ```
@@ -103,6 +114,7 @@ The command line arguments means :
 - **`model-name-or-path`**: Hugging Face pretrained model name or directory path of the saved model checkpoint.
 - **`output-dir`**: Directory path to save the quantized checkpoint and related configurations.
 - **`mode`**: Quantization techniques to apply. You can use `fp8`, `int8`.
+- **`pedantic-level`**: Controls the accuracy-latency trade-off. A higher pedantic level ensures a more accurate representation of the model but increases the quantization processing time. Defaults to 1.
 - **`device`**: Device to run the quantization process. Defaults to "cuda:0".
 - **`offload`**: When enabled, this option significantly reduces GPU memory usage by offloading model layers onto CPU RAM. Defaults to true.
 
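For reference, the quantization command documented above, with the new flag filled in, might look like the following. This is a minimal sketch: the Hugging Face model id, output directory, and the choice of pedantic level 0 (required for `MixtralForCausalLM` per the note above) are illustrative placeholders, not values taken from this change.

```sh
# Hypothetical FP8 quantization run; the model id and output directory are placeholders.
# Mixtral-family checkpoints currently require pedantic level 0 (see the note above).
fmo quantize \
  --model-name-or-path mistralai/Mixtral-8x7B-Instruct-v0.1 \
  --output-dir ./mixtral-8x7b-fp8 \
  --mode fp8 \
  --pedantic-level 0 \
  --device cuda:0 \
  --offload
```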
diff --git a/pyproject.toml b/pyproject.toml
index 9991298..4daf25e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "friendli-model-optimizer"
-version = "0.3.0"
+version = "0.5.0"
 authors = [
   { name = "FriendliAI teams", email = "eng@friendli.ai" },
 ]
-requires-python=">=3.8,<=3.11"
+requires-python=">=3.8,<3.12"
 description = "Model Optimizer CLI for Friendli Engine."
 dependencies = [
     "transformers>=4.43.1",
@@ -16,9 +16,8 @@ dependencies = [
     "typer",
     "torch>=2.0.0",
     "accelerate",
-    "fmo-core==0.3",
+    "fmo-core==0.5",
 ]
-requires-python=">=3.8,<=3.11"
 keywords = [ "generative-ai", "llm", "quantization" ]
 readme = "README.txt"
 license = { file = "LICENSE.txt" }
diff --git a/src/fmo/main.py b/src/fmo/main.py
index d6c0548..9ac8b04 100644
--- a/src/fmo/main.py
+++ b/src/fmo/main.py
@@ -59,6 +59,15 @@ def quantize(
         "--mode",
         help=("Qantization techniques to apply. You can use `fp8`, and `int8`."),
     ),
+    pedantic_level: int = typer.Option(
+        1,
+        "--pedantic-level",
+        help=(
+            "A higher pedantic level ensures a more accurate representation of the model, "
+            "but increases the quantization processing time. Lower levels allow for faster "
+            "quantization, but may sacrifice some model accuracy. Defaults to 1."
+        ),
+    ),
     device: Optional[str] = typer.Option(
         "cuda:0",
         "--device",
@@ -94,7 +103,7 @@ def quantize(
         help=("Huggingface dataset column name for gathering sample activations."),
     ),
     dataset_num_samples: int = typer.Option(
-        128,
+        512,
         "--dataset-num-samples",
         help=("The number of samples for gathering sample activations."),
     ),
@@ -153,6 +162,7 @@ def quantize(
         device=device,
         offload=offload,
         calib_dataloader=calib_dataloader,
+        pedantic_level=pedantic_level,
     )
 
     msg = (
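As a quick sanity check after installing this version, the auto-generated help text should list the new option and its default. This assumes only the standard `--help` output that Typer produces for the `quantize` command; the exact formatting may differ.

```sh
# Typer generates --help automatically; the new --pedantic-level option (default 1)
# and the updated --dataset-num-samples default (512) should both appear here.
fmo quantize --help
```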