
Add itrex llm runtime graph int4 notebook (#399)
lvliang-intel authored Oct 27, 2023
1 parent 89cf760 commit daece44
Showing 24 changed files with 335 additions and 120 deletions.
78 changes: 47 additions & 31 deletions intel_extension_for_transformers/llm/quantization/optimization.py
@@ -15,44 +15,60 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union
from intel_extension_for_transformers.neural_chat.config import (
AMPConfig,
WeightOnlyQuantizationConfig,
BitsAndBytesConfig
)
import re

class Optimization:
def __init__(
self,
optimization_config: Union[AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig]
optimization_config
):
self.optimization_config = optimization_config

def optimize(self, model):
def optimize(self, model, use_llm_runtime=False):
optimized_model = model
from intel_extension_for_transformers.transformers import (
MixedPrecisionConfig,
WeightOnlyQuantConfig,
BitsAndBytesConfig
)
assert type(self.optimization_config) in [MixedPrecisionConfig, WeightOnlyQuantConfig, BitsAndBytesConfig], \
"Expect optimization_config to be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" + \
f" or BitsAndBytesConfig, got {type(self.optimization_config)}."
config = self.optimization_config
if isinstance(config, WeightOnlyQuantizationConfig):
print("Applying Weight Only Quantization.")
from neural_compressor import PostTrainingQuantConfig, quantization
op_type_dict = {
'.*':{ # re.match
"weight": {
'bits': config.bits, # 1-8 bits
'group_size': config.group_size, # -1 (per-channel)
'scheme': config.scheme, # sym/asym
'algorithm': config.algorithm, # RTN/AWQ/TEQ
},
},
}
recipes = {"rtn_args": {"enable_full_range": config.enable_full_range}}
conf = PostTrainingQuantConfig(
approach='weight_only',
op_type_dict=op_type_dict,
recipes=recipes,
)
optimized_model = quantization.fit(
model,
conf,
).model
if re.search("flan-t5", model.config._name_or_path, re.IGNORECASE):
from intel_extension_for_transformers.transformers import AutoModelForSeq2SeqLM
optimized_model = AutoModelForSeq2SeqLM.from_pretrained(
model.config._name_or_path,
quantization_config=config,
use_llm_runtime=use_llm_runtime,
trust_remote_code=True)
elif (
re.search("gpt", model.config._name_or_path, re.IGNORECASE)
or re.search("mpt", model.config._name_or_path, re.IGNORECASE)
or re.search("bloom", model.config._name_or_path, re.IGNORECASE)
or re.search("llama", model.config._name_or_path, re.IGNORECASE)
or re.search("opt", model.config._name_or_path, re.IGNORECASE)
or re.search("neural-chat-7b-v1", model.config._name_or_path, re.IGNORECASE)
or re.search("neural-chat-7b-v2", model.config._name_or_path, re.IGNORECASE)
):
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
optimized_model = AutoModelForCausalLM.from_pretrained(
model.config._name_or_path,
quantization_config=config,
use_llm_runtime=use_llm_runtime,
trust_remote_code=True)
elif re.search("starcoder", model.config._name_or_path, re.IGNORECASE):
from intel_extension_for_transformers.transformers import GPTBigCodeForCausalLM
optimized_model = GPTBigCodeForCausalLM.from_pretrained(
model.config._name_or_path,
quantization_config=config,
use_llm_runtime=use_llm_runtime,
trust_remote_code=True)
elif re.search("chatglm", model.config._name_or_path, re.IGNORECASE):
from intel_extension_for_transformers.transformers import AutoModel
optimized_model = AutoModel.from_pretrained(
model.config._name_or_path,
quantization_config=config,
use_llm_runtime=use_llm_runtime,
trust_remote_code=True)
return optimized_model
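
For reference, the reworked `optimize()` can be exercised end to end roughly as in the sketch below. This is an illustrative example, not part of the commit: it assumes an ITREX build that exposes `WeightOnlyQuantConfig` and the LLM runtime graph, and the model name is just one of the checkpoints used in the notebooks.

```python
# Hypothetical usage of the new Optimization.optimize(model, use_llm_runtime=...) path.
from transformers import AutoModelForCausalLM
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
from intel_extension_for_transformers.llm.quantization.optimization import Optimization

# Any architecture matched by the re.search branches above (gpt/mpt/bloom/llama/opt/...).
model = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v1-1")

# INT4 weights with INT8 compute, mirroring the configuration used in the new notebook.
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")

optimization = Optimization(optimization_config=woq_config)
optimized_model = optimization.optimize(model, use_llm_runtime=True)
```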
@@ -43,7 +43,7 @@ def __import_package(self, model_name):
import intel_extension_for_transformers.llm.runtime.graph.llama_cpp as cpp_model
elif model_name == "mpt":
import intel_extension_for_transformers.llm.runtime.graph.mpt_cpp as cpp_model
elif model_name == "starcoder":
elif model_name == "gpt_bigcode" or model_name == "starcoder":
import intel_extension_for_transformers.llm.runtime.graph.starcoder_cpp as cpp_model
elif model_name == "opt":
import intel_extension_for_transformers.llm.runtime.graph.opt_cpp as cpp_model
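
The `gpt_bigcode` alias is needed because that is the `model_type` Hugging Face assigns to StarCoder checkpoints. A quick, illustrative way to confirm the mapping (not part of the commit):

```python
# StarCoder checkpoints report model_type "gpt_bigcode", which now also resolves
# to the starcoder_cpp backend above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("bigcode/starcoderbase-1b")
print(cfg.model_type)  # expected: gpt_bigcode
```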
4 changes: 2 additions & 2 deletions intel_extension_for_transformers/neural_chat/README.md
@@ -129,8 +129,8 @@ neuralchat optimize --base_model "meta-llama/Llama-2-7b-chat-hf" --config pipeli

```python
# Python code
from intel_extension_for_transformers.neural_chat import build_chatbot, AMPConfig
pipeline_cfg = PipelineConfig(optimization_config=AMPConfig())
from intel_extension_for_transformers.neural_chat import build_chatbot, MixedPrecisionConfig
pipeline_cfg = PipelineConfig(optimization_config=MixedPrecisionConfig())
chatbot = build_chatbot(pipeline_cfg)
```

9 changes: 6 additions & 3 deletions intel_extension_for_transformers/neural_chat/chatbot.py
@@ -125,6 +125,7 @@ def build_chatbot(config: PipelineConfig=None):
parameters["use_cache"] = config.loading_config.use_cache
parameters["peft_path"] = config.loading_config.peft_path
parameters["use_deepspeed"] = config.loading_config.use_deepspeed
parameters["use_llm_runtime"] = config.loading_config.use_llm_runtime
parameters["optimization_config"] = config.optimization_config
parameters["hf_access_token"] = config.hf_access_token

@@ -143,12 +144,14 @@ def finetune_model(config: BaseFinetuningConfig):
finetuning = Finetuning(config)
finetuning.finetune()

def optimize_model(model, config):
def optimize_model(model, config, use_llm_runtime=False):
"""Optimize the model based on the provided configuration.
Args:
config (OptimizationConfig): Configuration for optimizing the model.
model: large language model
config (OptimizationConfig): The configuration required for optimizing the model.
use_llm_runtime (bool): A boolean indicating whether to use the LLM runtime graph optimization.
"""
optimization = Optimization(optimization_config=config)
model = optimization.optimize(model)
model = optimization.optimize(model, use_llm_runtime)
return model
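
Since `use_llm_runtime` is simply forwarded to `Optimization.optimize`, a caller outside `build_chatbot` can opt into the graph runtime directly. A minimal, illustrative sketch (the config class and model name are assumptions taken from elsewhere in this commit):

```python
# Hypothetical direct call to optimize_model() with the LLM runtime graph enabled.
from transformers import AutoModelForCausalLM
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
from intel_extension_for_transformers.neural_chat.chatbot import optimize_model

model = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v1-1")
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
optimized = optimize_model(model, woq_config, use_llm_runtime=True)
```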
26 changes: 10 additions & 16 deletions intel_extension_for_transformers/neural_chat/config.py
@@ -18,7 +18,7 @@

from dataclasses import dataclass, field
from typing import Optional, List, Dict
from transformers import TrainingArguments, BitsAndBytesConfig
from transformers import TrainingArguments
from transformers.utils.versions import require_version
from dataclasses import dataclass
from .utils.common import get_device_type
@@ -406,18 +406,7 @@ class LoadingModelConfig:
use_cache: bool = True
use_deepspeed: bool = False
ipex_int8: bool = False

@dataclass
class WeightOnlyQuantizationConfig:
algorithm: str = 'RTN'
bits: int = 8
group_size: int = -1
scheme: str = 'sym'
enable_full_range: bool = True

@dataclass
class AMPConfig:
dtype: str = 'bfloat16'
use_llm_runtime: bool = False

class PipelineConfig:
def __init__(self,
@@ -441,7 +430,12 @@ def __init__(self,
self.loading_config = loading_config if loading_config is not None else \
LoadingModelConfig(cpu_jit=True if self.device == "cpu" else False, \
use_hpu_graphs = True if self.device == "hpu" else False)
self.optimization_config = optimization_config if optimization_config is not None else AMPConfig()
assert type(self.optimization_config) in [AMPConfig, WeightOnlyQuantizationConfig, BitsAndBytesConfig], \
f"Expect optimization_config be an object of AMPConfig, WeightOnlyQuantizationConfig" + \
from intel_extension_for_transformers.transformers import (
MixedPrecisionConfig,
WeightOnlyQuantConfig,
BitsAndBytesConfig
)
self.optimization_config = optimization_config if optimization_config is not None else MixedPrecisionConfig()
assert type(self.optimization_config) in [MixedPrecisionConfig, WeightOnlyQuantConfig, BitsAndBytesConfig], \
"Expect optimization_config to be an object of MixedPrecisionConfig, WeightOnlyQuantConfig" + \
f" or BitsAndBytesConfig, got {type(self.optimization_config)}."
@@ -25,6 +25,7 @@ Welcome to use Jupyter Notebooks to explore how to build and customize chatbots
| 3.1 | Enabling Chatbot with BF16 Optimization on SPR | Learn how to optimize chatbot using mixed precision on SPR | [Notebook](./notebooks/amp_optimization_on_spr.ipynb) |
| 3.2 | Enabling Chatbot with BF16 Optimization on Habana Gaudi1/Gaudi2 | Learn how to optimize chatbot using mixed precision on Habana Gaudi1/Gaudi2 | [Notebook](./notebooks/amp_optimization_on_habana_gaudi.ipynb) |
| 3.3 | Enabling Chatbot with BitsAndBytes Optimization on Nvidia A100 | Learn how to optimize chatbot using BitsAndBytes on Nvidia A100 | [Notebook](./notebooks/weight_only_optimization_on_nv_a100.ipynb) |
| 3.4 | Enabling Chatbot with Weight Only INT4 Optimization on SPR | Learn how to optimize chatbot using ITREX LLM graph Weight Only INT4 on SPR | [Notebook](./notebooks/itrex_llm_graph_int4_optimization_on_spr.ipynb) |
| 4 | Fine-Tuning Chatbots | | |
| 4.1 | Fine-tuning on SPR (Single Node) | Learn how to fine-tune chatbot on SPR with single node | [Notebook](./notebooks/single_node_finetuning_on_spr.ipynb) |
| 4.2 | Fine-tuning on SPR (Multiple Nodes) | Learn how to fine-tune chatbot on SPR with multiple nodes | [Notebook](./notebooks/multi_node_finetuning_on_spr.ipynb) |
@@ -42,8 +42,8 @@
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat import build_chatbot\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
"config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, MixedPrecisionConfig\n",
"config = PipelineConfig(optimization_config=MixedPrecisionConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
@@ -62,8 +62,8 @@
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat import build_chatbot\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
"config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, MixedPrecisionConfig\n",
"config = PipelineConfig(optimization_config=MixedPrecisionConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
@@ -78,9 +78,9 @@
"outputs": [],
"source": [
"# BF16 Optimization\n",
"from intel_extension_for_transformers.neural_chat.config import AMPConfig\n",
"from intel_extension_for_transformers.neural_chat.config import MixedPrecisionConfig\n",
"from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
"config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"config = PipelineConfig(optimization_config=MixedPrecisionConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
@@ -207,8 +207,8 @@
"outputs": [],
"source": [
"# BF16 Optimization\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
"config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, MixedPrecisionConfig\n",
"config = PipelineConfig(optimization_config=MixedPrecisionConfig(), model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
@@ -73,8 +73,8 @@
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat import build_chatbot\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
"config = PipelineConfig(optimization_config=AMPConfig())\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, MixedPrecisionConfig\n",
"config = PipelineConfig(optimization_config=MixedPrecisionConfig())\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
@@ -0,0 +1,137 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# INT4 Optimization of Chatbot on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare Environment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install intel extension for transformers:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install intel-extension-for-transformers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install Requirements:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!git clone https://github.com/intel/intel-extension-for-transformers.git\n",
"!cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/\n",
"!pip install -r requirements.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build LLM Runtime"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Intel extension for transformers LLM Runtime is designed to provide the efficient inference of large language models (LLMs) on Intel platforms through the state-of-the-art (SOTA) model compression techniques. The work is highly inspired from [llama.cpp](https://github.com/ggerganov/llama.cpp), which organizes almost all the core code (e.g., kernels) in a single big file with a large number of pre-defined macros, thus making it not easy for developers to support a new model. Our LLM Runtime has the following features:\n",
"\n",
"- Modular design to support new models\n",
"- Highly optimized low precision kernels\n",
"- Utilize AMX, VNNI and AVX512F instruction set\n",
"- Support CPU (x86 platforms only) and initial (Intel) GPU\n",
"- Support 4bits and 8bits quantization\n",
"\n",
"We support the following models:\n",
"### Text generation models\n",
"| model name | INT8 | INT4|\n",
"|---|:---:|:---:|\n",
"|[LLaMA2-7B](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), [LLaMA2-13B](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)| ✅ | ✅ | \n",
"|[LLaMA-7B](https://huggingface.co/decapoda-research/llama-7b-hf), [LLaMA-13B](https://huggingface.co/decapoda-research/llama-13b-hf)| ✅ | ✅ | \n",
"|[GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b)| ✅ | ✅ | \n",
"|[GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b)| ✅ | ✅ | \n",
"|[Dolly-v2-3B](https://huggingface.co/databricks/dolly-v2-3b)| ✅ | ✅ | \n",
"|[MPT-7B](https://huggingface.co/mosaicml/mpt-7b), [MPT-30B](https://huggingface.co/mosaicml/mpt-30b)| ✅ | ✅ | \n",
"|[Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [Falcon-40B](https://huggingface.co/tiiuae/falcon-40b)| ✅ | ✅ | \n",
"|[BLOOM-7B](https://huggingface.co/bigscience/bloomz-7b1)| ✅ | ✅ |\n",
"|[OPT-125m](https://huggingface.co/facebook/opt-125m), [OPT-350m](https://huggingface.co/facebook/opt-350m), [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b), [OPT-13B](https://huggingface.co/facebook/opt-13b)| ✅ | ✅ | \n",
"|[ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b), [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b)| ✅ | ✅ |\n",
"\n",
"### Code generation models\n",
"| model name | INT8 | INT4|\n",
"|---|:---:|:---:|\n",
"|[Code-LLaMA-7B](https://huggingface.co/codellama/CodeLlama-7b-hf), [Code-LLaMA-13B](https://huggingface.co/codellama/CodeLlama-13b-hf)| ✅ | ✅ | \n",
"|[StarCoder-1B](https://huggingface.co/bigcode/starcoderbase-1b), [StarCoder-3B](https://huggingface.co/bigcode/starcoderbase-3b), [StarCoder-15.5B](https://huggingface.co/bigcode/starcoder)| ✅ | ✅ | \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## INT4 Optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat import build_chatbot\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantConfig\n",
"config = PipelineConfig(optimization_config=WeightOnlyQuantConfig(compute_dtype=\"int8\", weight_dtype=\"int4\"),\n",
" model_name_or_path='Intel/neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py39",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -62,8 +62,8 @@
"outputs": [],
"source": [
"from intel_extension_for_transformers.neural_chat import build_chatbot\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n",
"config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig(), model_name_or_path='neural-chat-7b-v1-1')\n",
"from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantConfig\n",
"config = PipelineConfig(optimization_config=WeightOnlyQuantConfig(), model_name_or_path='neural-chat-7b-v1-1')\n",
"chatbot = build_chatbot(config)\n",
"response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
"print(response)"