From f602a80689e1840ade32f4f1be761b30421d23ac Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 15:58:06 +0000 Subject: [PATCH 01/20] added gptqmodel to plugin --- .../src/gptqmodel/__init__.py | 3 + .../src/gptqmodel/models/__init__.py | 10 + .../src/gptqmodel/models/_const.py | 35 + .../src/gptqmodel/models/auto.py | 113 ++ .../src/gptqmodel/models/base.py | 1080 +++++++++++++++++ .../src/gptqmodel/models/dbrx.py | 22 + .../src/gptqmodel/models/dbrx_converted.py | 66 + .../src/gptqmodel/models/gemma.py | 29 + .../src/gptqmodel/models/gpt_bigcode.py | 29 + .../src/gptqmodel/models/gpt_neox.py | 31 + .../src/gptqmodel/models/llama.py | 37 + .../src/gptqmodel/models/mistral.py | 29 + .../src/gptqmodel/models/mixtral.py | 55 + .../src/gptqmodel/nn_modules/__init__.py | 0 .../gptqmodel/nn_modules/qlinear/__init__.py | 43 + .../nn_modules/qlinear/qlinear_tritonv2.py | 206 ++++ .../nn_modules/triton_utils/__init__.py | 0 .../triton_utils/custom_autotune.py | 233 ++++ .../nn_modules/triton_utils/dequant.py | 153 +++ .../nn_modules/triton_utils/kernels.py | 476 ++++++++ .../nn_modules/triton_utils/mixin.py | 19 + .../src/gptqmodel/quantization/__init__.py | 4 + .../src/gptqmodel/quantization/config.py | 309 +++++ .../src/gptqmodel/quantization/gptq.py | 209 ++++ .../src/gptqmodel/quantization/quantizer.py | 142 +++ .../src/gptqmodel/utils/__init__.py | 1 + .../src/gptqmodel/utils/backend.py | 27 + .../src/gptqmodel/utils/data.py | 275 +++++ .../src/gptqmodel/utils/importer.py | 63 + .../src/gptqmodel/utils/model.py | 661 ++++++++++ .../src/gptqmodel/utils/peft.py | 153 +++ .../accelerated-peft/tests/test_q4_triton.py | 99 ++ plugins/accelerated-peft/tests/test_triton.py | 107 ++ 33 files changed, 4719 insertions(+) create mode 100644 plugins/accelerated-peft/src/gptqmodel/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/_const.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/auto.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/base.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/dbrx.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gemma.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/llama.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/mistral.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/mixtral.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py create mode 100644 
plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/config.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/backend.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/data.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/importer.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/model.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/peft.py create mode 100644 plugins/accelerated-peft/tests/test_q4_triton.py create mode 100644 plugins/accelerated-peft/tests/test_triton.py diff --git a/plugins/accelerated-peft/src/gptqmodel/__init__.py b/plugins/accelerated-peft/src/gptqmodel/__init__.py new file mode 100644 index 00000000..2808d505 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/__init__.py @@ -0,0 +1,3 @@ +from .models import GPTQModel +from .quantization import BaseQuantizeConfig, QuantizeConfig +from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py new file mode 100644 index 00000000..5496b45e --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py @@ -0,0 +1,10 @@ +from .auto import MODEL_MAP, GPTQModel +from .base import BaseGPTQModel +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/gptqmodel/models/_const.py new file mode 100644 index 00000000..49484bee --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/_const.py @@ -0,0 +1,35 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from torch import device + +CPU = device("cpu") +CUDA_0 = device("cuda:0") + +SUPPORTED_MODELS = [ + "gpt_neox", + "gpt_bigcode", + "llama", + "mistral", + "mixtral", + "gemma", + "dbrx_converted", +] + +EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 + +EXPERT_INDEX_PLACEHOLDER = "{expert_index}" + + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/gptqmodel/models/auto.py new file mode 100644 index 00000000..d31c12e9 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/auto.py @@ -0,0 +1,113 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from typing import Dict, List, Optional, Union + +import torch + +from ..utils import Backend +from ..utils.model import check_and_get_model_type +from .base import BaseGPTQModel, QuantizeConfig +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ + +MODEL_MAP = { + "gpt_neox": GPTNeoXGPTQ, + "llama": LlamaGPTQ, + "gpt_bigcode": GPTBigCodeGPTQ, + "mistral": MistralGPTQ, + "mixtral": MixtralGPTQ, + "gemma": GemmaGPTQ, + "dbrx": DbrxGPTQ, + "dbrx_converted": DbrxConvertedGPTQ, +} + +at_least_one_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count())) + +if not at_least_one_cuda_v6: + raise EnvironmentError("GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`.") + + +class GPTQModel: + def __init__(self): + raise EnvironmentError( + "ModelGPTQ is not designed to be instantiated\n" + "use `ModelGPTQ.from_pretrained` to load pretrained model and prepare for quantization via `.quantize()`.\n" + "use `ModelGPTQ.from_quantized` to inference with post-quantized model." 
+ ) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + **model_init_kwargs, + ) -> BaseGPTQModel: + model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code) + return MODEL_MAP[model_type].from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + quantize_config=quantize_config, + max_memory=max_memory, + trust_remote_code=trust_remote_code, + **model_init_kwargs, + ) + + @classmethod + def from_quantized( + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + backend: Backend = Backend.AUTO, + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig | Dict] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + # verify weight files matches predefined hash during loading + # usage: hash_format:hash_value, example: md5:ugkdh232 + # supports all hashlib hash methods + verify_hash: Optional[Union[str, List[str]]] = None, + **kwargs, + ) -> BaseGPTQModel: + model_type = check_and_get_model_type(model_name_or_path, trust_remote_code) + quant_func = MODEL_MAP[model_type].from_quantized + + return quant_func( + model_name_or_path=model_name_or_path, + device_map=device_map, + max_memory=max_memory, + device=device, + backend=backend, + use_cuda_fp16=use_cuda_fp16, + quantize_config=quantize_config, + model_basename=model_basename, + use_safetensors=use_safetensors, + trust_remote_code=trust_remote_code, + warmup_triton=warmup_triton, + verify_hash=verify_hash, + **kwargs, + ) + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/base.py b/plugins/accelerated-peft/src/gptqmodel/models/base.py new file mode 100644 index 00000000..d1010a39 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/base.py @@ -0,0 +1,1080 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +import copy +import json +import logging +import os +import re +from os.path import isfile, join +from typing import Dict, List, Optional, Union + +import accelerate +import torch +import torch.nn as nn +import transformers +from accelerate.hooks import remove_hook_from_module +from safetensors.torch import save_file as safe_save +from tqdm import tqdm +from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel +from transformers.modeling_utils import no_init_weights, shard_checkpoint +from transformers.utils.generic import ContextManagers + +from ..quantization import GPTQ, QuantizeConfig +from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST) +from ..utils.backend import Backend +from ..utils.data import collate_data +from ..utils.importer import select_quant_linear +from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, + find_layers, get_checkpoints, get_device, get_module_by_name_prefix, + get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, + move_to, nested_move_to, pack_model, simple_dispatch_model, verify_model_hash, + verify_sharded_model_hashes) +from ._const import CPU, CUDA_0, SUPPORTED_MODELS + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.propagate = False +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class BaseGPTQModel(nn.Module): + # these modules are non-repeating and at the root level + # does not include the node which holds all the repeating layers + base_modules: List[str] = None + + # name of lm_head + lm_head: str = "lm_head" + + # repeating layers + # node holding all the repeating layers + layers_node: str = None + # repeating layer type + layer_type: str = None + # for each repeating layer there are multiple modules within each layer + layer_modules: List[List[str]] = None + + # some models require trust_remove_code = True (dbrx_converted) + require_trust_remote_code = None + + # TODO: use a better name and what if the value is not at the config root? 
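+ # (illustrative note: today the lookup is a flat getattr on model.config, e.g. `num_local_experts` for Mixtral)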
+ # allow dynamic expert n-count layer extraction + # so moe model defs do not need to write out 64 layers if expert size is 64 (Qwen2Moe) + # usage: set to property in model.config that holds this int value: total number of experts + dynamic_expert_index: Optional[str] = None + + # allow models to define optional notes that output messages to users that want to use this model + # list of supported keys: [ "notes" = print the notes value on model load ] + info: Dict[str, str] = {} + + def __init__( + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module = None, + ): + super().__init__() + + self.model = model + self.model_type = self.model.config.model_type + self._quantized = quantized + self.quantize_config = quantize_config + self.config = self.model.config + + # compat: state to assist in checkpoint_format gptq(v1) to gptq_v2 conversion + self.qlinear_kernel = qlinear_kernel + + @property + def quantized(self): + return self._quantized + + @property + def hf_device_map(self): + return getattr(self.model, "hf_device_map", None) + + def _prepare_dataset_for_quantization( + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + ): + def _convert_tensor_to_list(tensor): + if isinstance(tensor, torch.Tensor): + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(0) + tensor = tensor.long() + return tensor.cpu().numpy().tolist() + return [tensor] + + new_calibration_dataset = [] + for example in calibration_dataset: + input_ids = _convert_tensor_to_list(example["input_ids"]) + attention_mask = _convert_tensor_to_list(example["attention_mask"]) + if "labels" in example: + labels = _convert_tensor_to_list(example["labels"]) + elif "label" in example: + labels = _convert_tensor_to_list(example["label"]) + elif "label_ids" in example: + labels = _convert_tensor_to_list(example["label_ids"]) + else: + labels = copy.deepcopy(input_ids) + new_calibration_dataset.append( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } + ) + pad_token_id = self.config.pad_token_id + if not pad_token_id: + pad_token_id = self.config.eos_token_id + + if pad_token_id is None: + raise ValueError("Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`.") + + new_calibration_dataset_batched = [ + collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) + for start in range(0, len(new_calibration_dataset), batch_size) + ] + + for new_example in new_calibration_dataset_batched: + del new_example["labels"] + + return new_calibration_dataset_batched + + @torch.inference_mode() + def quantize( + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + + # TODO: remove use_cuda_fp16 arg..why? doesn't pass smell test @ZX-ModelCloud + use_cuda_fp16: bool = True, + + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, + ): + if self.quantized: + raise EnvironmentError("quantize() is called a model that is already quantized") + + if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST: + raise ValueError( + f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}" + ) + + + # TODO: lm_head quantization is yet ready but pending + if self.quantize_config.lm_head: + raise ValueError("lm_head quantization is currently inference only and not applicable for quantization. 
Please set `lm_head=False`.") + + if len(calibration_dataset) == 0: + raise ValueError("Calibration dataset must not be empty.") + + min_calibration_dataset_size = 256 + min_calibration_dataset_input_ids_avg_length = 256 + + if len(calibration_dataset) < min_calibration_dataset_size: + logger.warning(f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " + f"Current size: {len(calibration_dataset)}.") + + # Calculate the average length of the average input_ids + total_input_ids_length = 0 + for e in calibration_dataset: + input_ids_length = len(e["input_ids"]) + total_input_ids_length += input_ids_length + avg = total_input_ids_length / len(calibration_dataset) + + if avg < min_calibration_dataset_input_ids_avg_length: + logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}.") + + + device_map = self.hf_device_map + if device_map: + for name, device in device_map.items(): + if device == "cpu": + logger.info(f"truly offloading {name} to cpu with hook.") + module = get_module_by_name_suffix(self.model, name) + remove_hook_from_module(module, recurse=True) + accelerate.cpu_offload_with_hook(module, CUDA_0) + + layer_inputs = [] + attention_masks = [] + position_ids = [] + layer_input_kwargs = [] + layer_outputs = [] + + calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size) + + forward_pass_use_cache = self.model.config.use_cache + self.model.config.use_cache = False + + num_batches = len(calibration_dataset) + layers = get_module_by_name_prefix(self.model, self.layers_node) + + cur_layer_device = get_device(layers[0]) + data_device = cur_layer_device if calibration_enable_gpu_cache else CPU + + def store_input_hook(_, args, kwargs): + # Positional arguments. + layer_input = [] + for inp in args: + layer_input.append(move_to(inp, data_device)) + layer_inputs.append(layer_input) + + # Keyword arguments. 
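+ # capture attention_mask / position_ids and any remaining kwargs handed to the first
+ # decoder layer so they can be replayed batch-by-batch during per-layer quantization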
+ if kwargs["attention_mask"] is not None: + attention_masks.append(kwargs["attention_mask"].to(data_device)) + else: + attention_masks.append(None) + + pos_ids = kwargs.get("position_ids", None) + if pos_ids is not None: + position_ids.append(move_to(pos_ids, data_device)) + one_kwargs = {} + for (k, v) in kwargs.items(): # make sure other arguments also be captured + if k not in ["hidden_states", "attention_mask", "position_ids"]: + one_kwargs[k] = nested_move_to(v, data_device) + layer_input_kwargs.append(one_kwargs) + raise ValueError + + force_layer_back_to_cpu = False + if get_device(layers[0]) == CPU: + layers[0] = layers[0].to(CUDA_0) + force_layer_back_to_cpu = True + + ori_outside_layer_module_devices = {} + for module_name in self.base_modules: + module = get_module_by_name_prefix(self.model, module_name) + + if module is None: + continue + + ori_outside_layer_module_devices[module_name] = get_device(module) + if module is not None: + move_to(module, cur_layer_device) + + # TODO: make this optional, backporting https://github.com/huggingface/optimum/blob/main/optimum/gptq/quantizer.py + handle = layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True) + for example in calibration_dataset: + for k, v in example.items(): + if len(v.shape) == 1: + v = v.unsqueeze(0) + example[k] = move_to(v, cur_layer_device) + try: + self.model(**example) + except ValueError: + pass + handle.remove() + + move_to(layers[0], CPU if force_layer_back_to_cpu else cur_layer_device) + for module_name in self.base_modules: + module = get_module_by_name_prefix(self.model, module_name) + if module is not None: + move_to(module, ori_outside_layer_module_devices[module_name]) + + torch.cuda.empty_cache() + + layer_modules = self.layer_modules + + if not self.quantize_config.true_sequential: + layer_modules = [sum(layer_modules, [])] + + # dynamic expert layer index for model defs + if self.dynamic_expert_index is not None: + num_experts = getattr(self.model.config, self.dynamic_expert_index) + layer_modules = get_moe_layer_modules(layer_modules=self.layer_modules, + num_experts=num_experts) + + quantizers = {} + + # stores all per-layer quant stats such as avg loss and processing time + quant_log = [] + + layer_count = len(layers) + layer_pb = tqdm(range(layer_count)) + for i in layer_pb: + layer_pb.set_description(f"Quantizing layer {i + 1} of {layer_count}") + layer = layers[i] + force_layer_back_to_cpu = False + if get_device(layer) == CPU: + move_to(layer, CUDA_0) + force_layer_back_to_cpu = True + cur_layer_device = get_device(layer) + + full = find_layers(layer) + for names in layer_modules: + subset = {n: full[n] for n in names if n in full} + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer.configure( + self.quantize_config.bits, + perchannel=True, + sym=self.quantize_config.sym, + mse=False, + ) + + def add_batch(name): + def tmp(_, inp, out): + # gptq is mutable. 
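+ # every forward pass through this hook feeds the module's activations into the
+ # GPTQ statistics (Hessian accumulation) consumed by fasterquant() below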
+ gptq[name].add_batch(inp[0].data, out.data) # noqa: F821 + + return tmp + + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(num_batches): + layer_input = [] + for k, layer_inp in enumerate(layer_inputs[j]): + layer_input.append(move_to(layer_inp, cur_layer_device)) + + mask = attention_masks[j] + layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + + additional_layer_inputs = {"attention_mask": layer_attention_mask} + layer_position_ids = ( + None if not position_ids else move_to(position_ids[j], cur_layer_device) + ) + if layer_position_ids is not None: + additional_layer_inputs["position_ids"] = layer_position_ids + for k, v in layer_input_kwargs[j].items(): + additional_layer_inputs[k] = nested_move_to(v, cur_layer_device) + layer(*layer_input, **additional_layer_inputs) + for h in handles: + h.remove() + + for name in subset: + layer_pb.set_description(f"Quantizing {name} in layer {i + 1} of {layer_count}") + + try: + scale, zero, g_idx, duration, avg_loss = gptq[name].fasterquant( + percdamp=self.quantize_config.damp_percent, + group_size=self.quantize_config.group_size, + actorder=self.quantize_config.desc_act, + static_groups=self.quantize_config.static_groups, + ) + + stat = {"layer": i + 1, "module": name, "avg_loss": f"{avg_loss:.4f}", + "time": f"{duration:.4f}"} + + quant_log.append(stat) + logger.info(stat) + + except torch._C._LinAlgError as e: + if "not positive-definite" in str(e).lower(): + logger.warning( + "Please increase damp or nsamples for calibration data to avoid the following quant error. " + ) + raise e + + quantizers[f"{self.layers_node}.{i}.{name}"] = ( + gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(scale, CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(zero, CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device), + ) + gptq[name].free() + + for j in range(num_batches): + layer_input = [] + for k, layer_inp in enumerate(layer_inputs[j]): + layer_input.append(move_to(layer_inp, cur_layer_device)) + + mask = attention_masks[j] + layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + + additional_layer_inputs = {"attention_mask": layer_attention_mask} + layer_position_ids = None if not position_ids else move_to(position_ids[j], cur_layer_device) + if layer_position_ids is not None: + additional_layer_inputs["position_ids"] = layer_position_ids + for k, v in layer_input_kwargs[j].items(): + additional_layer_inputs[k] = nested_move_to(v, cur_layer_device) + layer_output = move_to( + layer(*layer_input, **additional_layer_inputs)[0], + cur_layer_device if calibration_enable_gpu_cache else CPU, + ) + layer_outputs.append([layer_output]) + + layers[i] = move_to(layer, CPU if force_layer_back_to_cpu else cur_layer_device) + del layer + del gptq + del layer_inputs + layer_inputs, layer_outputs = ( + layer_outputs, + [], + ) # TODO: is it really OK to cache only the first positional argument? 
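+ # the cached outputs of layer i become the inputs of layer i+1, so quantization error is
+ # propagated through the already-quantized layers instead of re-running the full model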
+ torch.cuda.empty_cache() + + logger.info(f"Quantization summary:\n{quant_log}") + for module_log in quant_log: + logger.info(module_log) + + self.qlinear_kernel = pack_model( + model=self.model, + quantizers=quantizers, + bits=self.quantize_config.bits, + group_size=self.quantize_config.group_size, + backend=Backend.AUTO, + use_cuda_fp16=use_cuda_fp16, + desc_act=self.quantize_config.desc_act, + warmup_triton=autotune_warmup_after_quantized, + force_layer_back_to_cpu=force_layer_back_to_cpu, + format=self.quantize_config.format, + ) + if device_map: + self.model = remove_hook_from_module(self.model, recurse=True) + self.model = simple_dispatch_model(self.model, device_map) + self.model.config.use_cache = forward_pass_use_cache + + self._quantized = True + + torch.cuda.empty_cache() + + return quant_log + + @property + def device(self): + if not self.hf_device_map: + return self.model.device + else: + device = [d for d in self.hf_device_map.values() if d not in {"disk"}][0] + return torch.device(device) + + def to(self, device: Union[str, torch.device]): + self.model.to(device) + return self + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def generate(self, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(**kwargs) + + def prepare_inputs_for_generation(self, *args, **kwargs): + """shortcut for model.prepare_inputs_for_generation""" + return self.model.prepare_inputs_for_generation(*args, **kwargs) + + def save_quantized( + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + use_safetensors: bool = True, + max_shard_size: Optional[str] = None, + model_base_name: Optional[str] = None + ): + """save quantized model and configs to local disk""" + os.makedirs(save_dir, exist_ok=True) + + # write autogptq tooling fingerprint to config + self.quantize_config.meta_set_versionable( + key=META_FIELD_QUANTIZER, + value=META_QUANTIZER_GPTQMODEL, + version=__version__, + ) + + # The config, quantize_config and model may be edited in place in save_quantized. + config = copy.deepcopy(self.model.config) + quantize_config = copy.deepcopy(self.quantize_config) + model = self.model + + if not self.quantized: + raise ValueError("Save aborted as model is not quantized. Please call `quantize()` first.") + + if model_base_name is None: + model_base_name = ( + self.quantize_config.model_file_base_name or + f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + ) + + if quantize_config.format == FORMAT.GPTQ_V2: + logger.warning( + f"Using 'format = {FORMAT.GPTQ_V2}': the serialized model is only supported by GPTQModel version >= {MIN_VERSION_WITH_V2}." + ) + + # internal is always gptq v2 but allow users to pass gptq (v1) via config + if quantize_config.format == FORMAT.GPTQ: + # Model qzeros may be edited in place. 
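+ # note: weights are held internally in gptq_v2 layout; convert_gptq_v2_to_v1_format below
+ # rewrites qzeros so the saved checkpoint matches the original gptq (v1) layout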
+ # TODO: avoid inplace modification of the weights + # fix ModelCloud/GPTQModel/issues/47 + # fix gptqmodel_cuda cannot be serialized + # no need to set it back, no calculation below + if quantize_config.bits != 4: + cuda_name_modules = {} + from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + for name, module in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear): + cuda_name_modules[name] = module.gptqmodel_cuda + module.gptqmodel_cuda = None + model = copy.deepcopy(self.model) + + for name, modules in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + module.gptqmodel_cuda = cuda_name_modules[name] + + del cuda_name_modules + else: + model = copy.deepcopy(self.model) + model = convert_gptq_v2_to_v1_format( + model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel + ) + + model.to(CPU) + + state_dict = model.state_dict() + + if quantize_config.model_file_base_name is None: + if use_safetensors: + model_base_name = "model" + else: + model_base_name = "pytorch_model" + else: + model_base_name = quantize_config.model_file_base_name + + if use_safetensors: + state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()} + model_save_name = model_base_name + ".safetensors" + else: + model_save_name = model_base_name + ".bin" + + if not self.qlinear_kernel.SUPPORTED_SHARDS and max_shard_size is not None: + logger.warning("Sharding is not supported for this quant. Disabling sharding.") + max_shard_size = None + + if max_shard_size is None: + if use_safetensors: + if safetensors_metadata is None: + safetensors_metadata = {} + elif not isinstance(safetensors_metadata, dict): + raise TypeError("safetensors_metadata must be a dictionary.") + else: + logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + new_safetensors_metadata = {} + converted_keys = False + for key, value in safetensors_metadata.items(): + if not isinstance(key, str) or not isinstance(value, str): + converted_keys = True + try: + new_key = str(key) + new_value = str(value) + except Exception as e: + raise TypeError( + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" + ) + if new_key in new_safetensors_metadata: + logger.warning( + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." + ) + new_safetensors_metadata[new_key] = new_value + safetensors_metadata = new_safetensors_metadata + if converted_keys: + logger.debug( + f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" + ) + + # Format is required to enable Accelerate to load the metadata + # otherwise it raises an OSError + safetensors_metadata["format"] = "pt" + safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata) + else: + logger.warning( + "We highly suggest saving quantized model using safetensors format for security reasons. 
Please set `use_safetensors=True` whenever possible.") + torch.save(model.state_dict(), join(save_dir, model_save_name)) + else: + # Shard checkpoint + shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) + + # Clean the folder from a previous save + for filename in os.listdir(save_dir): + full_filename = join(save_dir, filename) + + # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 + filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "") + reg = re.compile(r"(.*?)-\d{5}-of-\d{5}") + + if ( + filename.startswith(model_base_name) + and isfile(full_filename) + and filename not in shards.keys() + and reg.fullmatch(filename_no_suffix) is not None + ): + os.remove(full_filename) + + # Save the model + for shard_file, shard in shards.items(): + if use_safetensors: + if safetensors_metadata is None: + safetensors_metadata = {} + elif not isinstance(safetensors_metadata, dict): + raise TypeError("safetensors_metadata must be a dictionary.") + else: + logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + new_safetensors_metadata = {} + converted_keys = False + for key, value in safetensors_metadata.items(): + if not isinstance(key, str) or not isinstance(value, str): + converted_keys = True + try: + new_key = str(key) + new_value = str(value) + except Exception as e: + raise TypeError( + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") + if new_key in new_safetensors_metadata: + logger.warning( + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") + new_safetensors_metadata[new_key] = new_value + safetensors_metadata = new_safetensors_metadata + if converted_keys: + logger.debug( + f"One or more safetensors_metadata keys or values had to be converted to str(). 
Final safetensors_metadata: {safetensors_metadata}") + + # Format is required to enable Accelerate to load the metadata + # otherwise it raises an OSError + safetensors_metadata["format"] = "pt" + + safe_save(shard, join(save_dir, shard_file), safetensors_metadata) + else: + torch.save(shard, join(save_dir, shard_file)) + + if index is not None: + index_save_name = model_save_name + ".index.json" + index_save_path = join(save_dir, index_save_name) + # Save the index as well + with open(index_save_path, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + config.quantization_config = quantize_config.to_dict() + config.save_pretrained(save_dir) + + quantize_config.model_name_or_path = save_dir + quantize_config.model_file_base_name = model_base_name + quantize_config.save_pretrained(save_dir) + + def save_pretrained( + self, + save_dir: str, + **kwargs, + ): + logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + self.save_quantized(save_dir=save_dir, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, + ): + """load un-quantized pretrained model to cpu""" + + if not torch.cuda.is_available(): + raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.") + + if cls.require_trust_remote_code and not trust_remote_code: + raise ValueError( + f"{pretrained_model_name_or_path} requires trust_remote_code=True. Please set trust_remote_code=True to load this model." + ) + + # allow models to define optional notes that output messages to users that want to use this model + notes = cls.info.get("notes") + if notes: + logger.info(notes) + + def skip(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + + model_init_kwargs["trust_remote_code"] = trust_remote_code + + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + + if torch_dtype == "auto": + torch_dtype = auto_dtype_from_config(config) + elif not isinstance(torch_dtype, torch.dtype): + raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + + # enforce some values despite user specified + model_init_kwargs["torch_dtype"] = torch_dtype + + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + + if max_memory: + if "disk" in max_memory: + raise NotImplementedError("disk offload not support yet.") + with accelerate.init_empty_weights(): + model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + model.tie_weights() + + max_memory = accelerate.utils.get_balanced_memory( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + dtype=model_init_kwargs["torch_dtype"], + low_zero=False, + ) + model_init_kwargs["device_map"] = accelerate.infer_auto_device_map( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + dtype=model_init_kwargs["torch_dtype"], + ) + del model + else: + model_init_kwargs["device_map"] = None + + torch.cuda.empty_cache() + + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + + model_config = model.config.to_dict() + seq_len_keys = ["max_position_embeddings", "seq_length", 
"n_positions"] + if any(k in model_config for k in seq_len_keys): + for key in seq_len_keys: + if key in model_config: + model.seqlen = model_config[key] + break + else: + logger.warning("can't get model's sequence length from model config, will set to 4096.") + model.seqlen = 4096 + model.eval() + + return cls(model, quantized=False, quantize_config=quantize_config) + + @classmethod + def from_quantized( + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + + backend: Backend = Backend.AUTO, + + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + verify_hash: Optional[Union[str, List[str]]] = None, + **kwargs, + ): + """load quantized model from local disk""" + if cls.require_trust_remote_code and not trust_remote_code: + raise ValueError( + f"{model_name_or_path} requires trust_remote_code=True. Please set trust_remote_code=True to load this model." + ) + + # Parameters related to loading from Hugging Face Hub + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "resume_download": resume_download, + "local_files_only": local_files_only, + "use_auth_token": use_auth_token, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + + # == step1: prepare configs and file names == # + config: PretrainedConfig = AutoConfig.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + **cached_file_kwargs, + ) + + if torch_dtype == "auto": + torch_dtype = auto_dtype_from_config(config, quant_inference=True) + elif not isinstance(torch_dtype, torch.dtype): + raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + + if quantize_config is None: + quantize_config = QuantizeConfig.from_pretrained( + model_name_or_path, format=format, **cached_file_kwargs, **kwargs + ) + else: + if not isinstance(quantize_config, QuantizeConfig): + quantize_config = QuantizeConfig.from_quant_config(quantize_config, format) + + + if model_basename is None: + if quantize_config.model_file_base_name: + possible_model_basenames = [quantize_config.model_file_base_name] + else: + possible_model_basenames = [ + f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g", + "model", + ] + else: + possible_model_basenames = [model_basename] + + quantize_config.model_name_or_path = model_name_or_path + + extensions = [] + if use_safetensors: + extensions.append(".safetensors") + else: + extensions += [".bin", ".pt"] + + model_name_or_path = 
str(model_name_or_path) + + # Retrieve (and if necessary download) the quantized checkpoint(s). + is_sharded, resolved_archive_file, true_model_basename = get_checkpoints( + model_name_or_path=model_name_or_path, + extensions=extensions, + possible_model_basenames=possible_model_basenames, + **cached_file_kwargs, + ) + + # bin files have security issues: disable loading by default + if ".bin" in resolved_archive_file: + if allow_unsafe_loading: + logger.warning( + "There are security risks when loading tensors from .bin files. Make sure you are loading model only from a trusted source." + ) + else: + raise ValueError( + "Loading of unsafe .bin files are not allowed by default. Pass allow_unsafe_loading=True to bypass." + ) + + quantize_config.model_file_base_name = true_model_basename + + model_save_name = resolved_archive_file # In case a model is sharded, this would be `model.safetensors.index.json` which may later break. + if verify_hash: + if is_sharded: + verfieid = verify_sharded_model_hashes(model_save_name, verify_hash) + else: + verfieid = verify_model_hash(model_save_name, verify_hash) + if not verfieid: + raise ValueError(f"Hash verification failed for {model_save_name}") + logger.info(f"Hash verification succeeded for {model_save_name}") + # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # + def skip(*args, **kwargs): + pass + + if torch_dtype != torch.float16: + logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.") + use_cuda_fp16 = False + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + + transformers.modeling_utils._init_weights = False + + init_contexts = [no_init_weights()] + + with ContextManagers(init_contexts): + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype + ) + + if cls.dynamic_expert_index is not None: + num_experts = getattr(config, cls.dynamic_expert_index) + cls.layer_modules = get_moe_layer_modules(layer_modules=cls.layer_modules, + num_experts=num_experts) + + layers = find_layers(model) + ignore_layers = [cls.lm_head] + cls.base_modules + + for name in list(layers.keys()): + # allow loading of quantized lm_head + if quantize_config.lm_head and name == cls.lm_head: + continue + + if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all( + not name.endswith(ignore_layer) for sublist in cls.layer_modules for ignore_layer in sublist + ): + # log non-lm-head quantizerd layers only + if name is not cls.lm_head: + logger.info(f"The layer {name} is not quantized.") + del layers[name] + + preload_qlinear_kernel = make_quant( + model, + layers, + quantize_config.bits, + quantize_config.group_size, + backend=backend, + format=quantize_config.format, + use_cuda_fp16=use_cuda_fp16, + desc_act=quantize_config.desc_act, + ) + model.tie_weights() + + # == step3: load checkpoint and dispatch == # + if isinstance(device_map, str) and device_map not in [ + "auto", + "balanced", + "balanced_low_0", + "sequential", + ]: + raise ValueError( + "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or " + "'sequential'." 
+ ) + if isinstance(device_map, dict): + max_memory = None + else: + if device is None and not device_map and not max_memory: + device_map = "auto" + if device is not None: + device = torch.device(device) + if not max_memory and not device_map: + device_map = {"": device.index if device.type == "cuda" else device.type} + if not isinstance(device_map, dict) and device_map != "sequential": + max_memory = accelerate.utils.get_balanced_memory( + model=model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + low_zero=(device_map == "balanced_low_0"), + ) + if not isinstance(device_map, dict): + device_map = accelerate.infer_auto_device_map( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + ) + + load_checkpoint_in_model = False + # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format + if quantize_config.format == FORMAT.GPTQ: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, + # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, + offload_state_dict=True, + offload_buffers=True, + ) + # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase + if not quantize_config.sym and not quantize_config.is_quantized_or_packed_by_v2(): + raise ValueError( + f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" + ) + + logger.info( + f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`.") + model = convert_gptq_v1_to_v2_format( + model, + quantize_config=quantize_config, + qlinear_kernel=preload_qlinear_kernel, + ) + load_checkpoint_in_model = True + quantize_config.format = FORMAT.GPTQ_V2 + + if not load_checkpoint_in_model and backend == Backend.TRITON: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, + offload_state_dict=True, + offload_buffers=True, + ) + + # TODO: Why are we using this custom function and not dispatch_model? + model = simple_dispatch_model(model, device_map) + + qlinear_kernel = select_quant_linear( + bits=quantize_config.bits, + group_size=quantize_config.group_size, + desc_act=quantize_config.desc_act, + sym=quantize_config.sym, + backend=backend, + format=quantize_config.format, + ) + + # == step4: set seqlen == # + model_config = model.config.to_dict() + seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] + if any(k in model_config for k in seq_len_keys): + for key in seq_len_keys: + if key in model_config: + model.seqlen = model_config[key] + break + else: + logger.warning("can't get model's sequence length from model config, will set to 4096.") + model.seqlen = 4096 + + # Any post-initialization that require device information, for example buffers initialization on device. 
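+ # (e.g. kernel scratch buffers whose size and placement depend on the final device map and on desc_act)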
+ model = gptqmodel_post_init(model, use_act_order=quantize_config.desc_act) + + model.eval() + + # == step6: (optional) warmup triton == # + if backend != Backend.TRITON and warmup_triton: + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + + QuantLinear.warmup(model, seqlen=model.seqlen) + + return cls( + model, + quantized=True, + quantize_config=quantize_config, + qlinear_kernel=qlinear_kernel, + ) + + def warmup_triton(self, enabled: bool = True): + if not enabled: + return + + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + + QuantLinear.warmup(self.model, seqlen=self.model.seqlen) + + def __getattr__(self, item): + try: + return super().__getattr__(item) + except Exception: + return getattr(self.model, item) + + +__all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py new file mode 100644 index 00000000..fb758032 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py @@ -0,0 +1,22 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +# placer=holder only as dbrx original models are not supported +# supported dbrx_converted models can be found on https://hf.co/ModelCloud +class DbrxGPTQ(BaseGPTQModel): + info = {"notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`"} diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py new file mode 100644 index 00000000..35a33170 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py @@ -0,0 +1,66 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class DbrxConvertedGPTQ(BaseGPTQModel): + # dbrx_converted requires custom model code + require_trust_remote_code = True + + base_modules = ["transformer.wte", "transformer.norm_f"] + + layers_node = "transformer.blocks" + layer_type = "DbrxBlock" + layer_modules = [ + ["norm_attn_norm.attn.q_proj", "norm_attn_norm.attn.k_proj", "norm_attn_norm.attn.v_proj"], + ["norm_attn_norm.attn.out_proj"], + [ + "ffn.experts.mlp.0.w1", "ffn.experts.mlp.0.v1", + "ffn.experts.mlp.1.w1", "ffn.experts.mlp.1.v1", + "ffn.experts.mlp.2.w1", "ffn.experts.mlp.2.v1", + "ffn.experts.mlp.3.w1", "ffn.experts.mlp.3.v1", + "ffn.experts.mlp.4.w1", "ffn.experts.mlp.4.v1", + "ffn.experts.mlp.5.w1", "ffn.experts.mlp.5.v1", + "ffn.experts.mlp.6.w1", "ffn.experts.mlp.6.v1", + "ffn.experts.mlp.7.w1", "ffn.experts.mlp.7.v1", + "ffn.experts.mlp.8.w1", "ffn.experts.mlp.8.v1", + "ffn.experts.mlp.9.w1", "ffn.experts.mlp.9.v1", + "ffn.experts.mlp.10.w1", "ffn.experts.mlp.10.v1", + "ffn.experts.mlp.11.w1", "ffn.experts.mlp.11.v1", + "ffn.experts.mlp.12.w1", "ffn.experts.mlp.12.v1", + "ffn.experts.mlp.13.w1", "ffn.experts.mlp.13.v1", + "ffn.experts.mlp.14.w1", "ffn.experts.mlp.14.v1", + "ffn.experts.mlp.15.w1", "ffn.experts.mlp.15.v1", + ], + [ + "ffn.experts.mlp.0.w2", + "ffn.experts.mlp.1.w2", + "ffn.experts.mlp.2.w2", + "ffn.experts.mlp.3.w2", + "ffn.experts.mlp.4.w2", + "ffn.experts.mlp.5.w2", + "ffn.experts.mlp.6.w2", + "ffn.experts.mlp.7.w2", + "ffn.experts.mlp.8.w2", + "ffn.experts.mlp.9.w2", + "ffn.experts.mlp.10.w2", + "ffn.experts.mlp.11.w2", + "ffn.experts.mlp.12.w2", + "ffn.experts.mlp.13.w2", + "ffn.experts.mlp.14.w2", + "ffn.experts.mlp.15.w2", + ] + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/gptqmodel/models/gemma.py new file mode 100644 index 00000000..e6191904 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gemma.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class GemmaGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "GemmaDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py new file mode 100644 index 00000000..0a15062e --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class GPTBigCodeGPTQ(BaseGPTQModel): + base_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"] + + layers_node = "transformer.h" + layer_type = "GPTBigCodeBlock" + layer_modules = [ + ["attn.c_attn"], + ["attn.c_proj"], + ["mlp.c_fc"], + ["mlp.c_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py new file mode 100644 index 00000000..81e8e401 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py @@ -0,0 +1,31 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class GPTNeoXGPTQ(BaseGPTQModel): + base_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"] + lm_head = "embed_out" + + layers_node = "gpt_neox.layers" + layer_type = "GPTNeoXLayer" + layer_modules = [ + ["attention.query_key_value"], + ["attention.dense"], + ["mlp.dense_h_to_4h"], + ["mlp.dense_4h_to_h"], + ] + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/gptqmodel/models/llama.py new file mode 100644 index 00000000..19930ef6 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/llama.py @@ -0,0 +1,37 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class LlamaGPTQ(BaseGPTQModel): + # Non-repeating layers at the root level: same level as `layers_node` + # Excluding `layers_node`. + base_modules = ["model.embed_tokens", "model.norm"] + + # Below describes all the repeating layers in this transformer model + # `model.layers` is a node/module that hold all the repeating layers. The parent node for all n-layers. + layers_node = "model.layers" + # Each repeating layer in `model.layers` is of type `LlamaDecoderLayer` + layer_type = "LlamaDecoderLayer" + # Inside each `LlamaDecoderLayer` layer are many internal modules + # List them in the order executed in model forward() code + # Many models have same execution order of: attention (q_k_v) projection, attention (output) projection, mlp (n) projections + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/gptqmodel/models/mistral.py new file mode 100644 index 00000000..aa1748eb --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/mistral.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class MistralGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "MistralDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py new file mode 100644 index 00000000..acb4f640 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py @@ -0,0 +1,55 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class MixtralGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "MixtralDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + [ + "block_sparse_moe.experts.0.w1", + "block_sparse_moe.experts.1.w1", + "block_sparse_moe.experts.2.w1", + "block_sparse_moe.experts.3.w1", + "block_sparse_moe.experts.4.w1", + "block_sparse_moe.experts.5.w1", + "block_sparse_moe.experts.6.w1", + "block_sparse_moe.experts.7.w1", + "block_sparse_moe.experts.0.w3", + "block_sparse_moe.experts.1.w3", + "block_sparse_moe.experts.2.w3", + "block_sparse_moe.experts.3.w3", + "block_sparse_moe.experts.4.w3", + "block_sparse_moe.experts.5.w3", + "block_sparse_moe.experts.6.w3", + "block_sparse_moe.experts.7.w3", + ], + [ + "block_sparse_moe.experts.0.w2", + "block_sparse_moe.experts.1.w2", + "block_sparse_moe.experts.2.w2", + "block_sparse_moe.experts.3.w2", + "block_sparse_moe.experts.4.w2", + "block_sparse_moe.experts.5.w2", + "block_sparse_moe.experts.6.w2", + "block_sparse_moe.experts.7.w2", + ], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py new file mode 100644 index 00000000..ff590d3c --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py @@ -0,0 +1,43 @@ +import torch.nn as nn + + +class BaseQuantLinear(nn.Module): + # override me + QUANT_TYPE = "base" + + SUPPORTED_BITS = [] + SUPPORTED_GROUP_SIZE = [] + SUPPORTED_DESC_ACT = [True, False] + SUPPORTED_SYM = [True, False] + SUPPORTED_SHARDS: bool = True + + @classmethod + def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, raise_error: bool = True) -> bool: + validate = True + err = "" 
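+        # NOTE: an empty SUPPORTED_* list is treated as "no restriction"; only a
+        # non-empty list constrains the corresponding quantization setting below.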
+ if cls.SUPPORTED_BITS and bits not in cls.SUPPORTED_BITS: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_BITS}` bits: actual bits = `{bits}`" + elif cls.SUPPORTED_GROUP_SIZE and group_size not in cls.SUPPORTED_GROUP_SIZE: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_GROUP_SIZE}` group_size: actual group_size = `{group_size}`" + elif cls.SUPPORTED_SYM and sym not in cls.SUPPORTED_SYM: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_SYM}` bits: actual sym = `{sym}`" + elif cls.SUPPORTED_DESC_ACT and desc_act not in cls.SUPPORTED_DESC_ACT: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_DESC_ACT}` bits: actual desc_act = `{desc_act}`" + + if not validate and raise_error: + raise NotImplementedError(err) + + return validate + + # override me + def post_init(self): + pass + + +class BaseCudaQuantLinear(BaseQuantLinear): + # override me + QUANT_TYPE = "base-cuda" diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py new file mode 100644 index 00000000..a0ab76c5 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py @@ -0,0 +1,206 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import math +from logging import getLogger + +import numpy as np +import torch +import torch.nn as nn +import transformers + +from ..triton_utils.dequant import QuantLinearFunction, quant_matmul_248 +from ..triton_utils.mixin import TritonModuleMixin +from . import BaseQuantLinear + +logger = getLogger(__name__) + + +class QuantLinear(BaseQuantLinear, TritonModuleMixin): + """ + Triton v2 quantized linear layer. 
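+    Weights stay packed in the int32 `qweight`/`qzeros` buffers alongside float16
+    `scales` and an int32 `g_idx` group map; `forward` dequantizes them on the fly.
+    Illustrative usage (a sketch; sizes are arbitrary and the packed buffers are
+    assumed to be already populated, e.g. via `pack()` or a loaded checkpoint):
+
+        layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=4096, bias=False).cuda()
+        y = layer(x)  # x: (batch, 4096) float16 tensor on the same CUDA device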
+ + Calls dequant kernel (see triton_utils/dequant) to dequantize the weights then uses + torch.matmul to compute the output whereas original `triton` quantized linear layer fused + dequant and matmul into single kernel.add() + """ + + QUANT_TYPE = "tritonv2" + + def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): + super().__init__() + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + if infeatures % 32 != 0 or outfeatures % 32 != 0: + raise NotImplementedError("in_feature and out_feature must be divisible by 32.") + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + self.group_size = group_size if group_size != -1 else infeatures + self.maxq = 2**self.bits - 1 + + self.register_buffer( + "qweight", + torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32), + ) + self.register_buffer( + "qzeros", + torch.zeros( + ( + math.ceil(infeatures / self.group_size), + outfeatures // 32 * self.bits, + ), + dtype=torch.int32, + ), + ) + self.register_buffer( + "scales", + torch.zeros( + (math.ceil(infeatures / self.group_size), outfeatures), + dtype=torch.float16, + ), + ) + self.register_buffer( + "g_idx", + torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), + ) + if bias: + self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) + else: + self.bias = None + + def post_init(self): + pass + + def pack(self, linear, scales, zeros, g_idx=None): + W = linear.weight.data.clone() + if isinstance(linear, nn.Conv2d): + W = W.flatten(1) + if isinstance(linear, transformers.pytorch_utils.Conv1D): + W = W.t() + + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[ + :, None + ] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + + i = 0 + row = 0 + qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32) + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + quant_linear_fn = QuantLinearFunction + + out = quant_linear_fn.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = 
out.half().reshape(out_shape) + out = out + self.bias if self.bias is not None else out + return out + + @classmethod + def warmup(cls, model, transpose=False, seqlen=2048): + """ + Pre-tunes the quantized kernel + """ + from tqdm import tqdm + + kn_values = {} + + for _, m in model.named_modules(): + if not isinstance(m, cls): + continue + + k = m.infeatures + n = m.outfeatures + + if (k, n) not in kn_values: + kn_values[(k, n)] = ( + m.qweight, + m.scales, + m.qzeros, + m.g_idx, + m.bits, + m.maxq, + ) + + logger.info(f"Found {len(kn_values)} unique KN Linear values.") + logger.info("Warming up autotune cache ...") + with torch.no_grad(): + for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)): + m = 2**m + for (k, n), ( + qweight, + scales, + qzeros, + g_idx, + bits, + maxq, + ) in kn_values.items(): + a = torch.randn(m, k, dtype=torch.float16, device=model.device) + quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq) + del kn_values + + +__all__ = ["QuantLinear"] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py new file mode 100644 index 00000000..fed33846 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -0,0 +1,233 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import builtins +import math +import time +from typing import Dict + +import triton + +# code based https://github.com/fpgaminer/GPTQ-triton +""" +Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. 
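+Other differences: benchmark timings are cached per autotune key, keys can optionally be
+rounded to the nearest power of two (nearest_power_of_two=True) to reduce re-tuning, and
+configurations that run out of resources are scored as infinity rather than raising.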
+""" + + +class CustomizedTritonAutoTuner(triton.KernelInterface): + def __init__( + self, + fn, + arg_names, + configs, + key, + reset_to_zero, + prune_configs_by: Dict = None, + nearest_power_of_two: bool = False, + ): + if not configs: + self.configs = [triton.Config({}, num_warps=4, num_stages=2)] + else: + self.configs = configs + self.key_idx = [arg_names.index(k) for k in key] + self.nearest_power_of_two = nearest_power_of_two + self.cache = {} + # hook to reset all required tensor to zeros before relaunching a kernel + self.hook = lambda args: 0 + if reset_to_zero is not None: + self.reset_idx = [arg_names.index(k) for k in reset_to_zero] + + def _hook(args): + for i in self.reset_idx: + args[i].zero_() + + self.hook = _hook + self.arg_names = arg_names + # prune configs + if prune_configs_by: + perf_model, top_k = ( + prune_configs_by["perf_model"], + prune_configs_by["top_k"], + ) + if "early_config_prune" in prune_configs_by: + early_config_prune = prune_configs_by["early_config_prune"] + else: + perf_model, top_k, early_config_prune = None, None, None + self.perf_model, self.configs_top_k = perf_model, top_k + self.early_config_prune = early_config_prune + self.fn = fn + + def _bench(self, *args, config, **meta): + # check for conflicts, i.e. meta-parameters both provided + # as kwargs and by the autotuner + conflicts = meta.keys() & config.kwargs.keys() + if conflicts: + raise ValueError( + f"Conflicting meta-parameters: {', '.join(conflicts)}." + " Make sure that you don't re-define auto-tuned symbols." + ) + # augment meta-parameters with tunable ones + current = dict(meta, **config.kwargs) + + def kernel_call(): + if config.pre_hook: + config.pre_hook(self.nargs) + self.hook(args) + self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **current, + ) + + try: + # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses + # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default + return triton.testing.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40) + except triton.OutOfResources: + return (float("inf"), float("inf"), float("inf")) + + def run(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + if len(self.configs) > 1: + key = tuple(args[i] for i in self.key_idx) + + # This reduces the amount of autotuning by rounding the keys to the nearest power of two + # In my testing this gives decent results, and greatly reduces the amount of tuning required + if self.nearest_power_of_two: + key = tuple([2 ** int(math.log2(x) + 0.5) for x in key]) + + if key not in self.cache: + # prune configs + pruned_configs = self.prune_configs(kwargs) + bench_start = time.time() + timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} + bench_end = time.time() + self.bench_time = bench_end - bench_start + self.cache[key] = builtins.min(timings, key=timings.get) + self.hook(args) + self.configs_timings = timings + config = self.cache[key] + else: + config = self.configs[0] + self.best_config = config + if config.pre_hook is not None: + config.pre_hook(self.nargs) + return self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + + def prune_configs(self, kwargs): + pruned_configs = self.configs + if self.early_config_prune: + pruned_configs = self.early_config_prune(self.configs, self.nargs) + if self.perf_model: + top_k = self.configs_top_k + if isinstance(top_k, 
float) and top_k <= 1.0: + top_k = int(len(self.configs) * top_k) + if len(pruned_configs) > top_k: + est_timing = { + config: self.perf_model( + **self.nargs, + **kwargs, + **config.kwargs, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) + for config in pruned_configs + } + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + return pruned_configs + + def warmup(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + for config in self.prune_configs(kwargs): + self.fn.warmup( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + self.nargs = None + + +def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False): + def decorator(fn): + return CustomizedTritonAutoTuner( + fn, + fn.arg_names, + configs, + key, + reset_to_zero, + prune_configs_by, + nearest_power_of_two, + ) + + return decorator + + +def matmul248_kernel_config_pruner(configs, nargs): + """ + The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller. + """ + m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16) + n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16) + k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16) + + used = set() + for config in configs: + block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"]) + block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"]) + block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"]) + group_size_m = config.kwargs["GROUP_SIZE_M"] + + if ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) in used: + continue + + used.add( + ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) + ) + yield triton.Config( + { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + }, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) + + +__all__ = ["autotune"] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py new file mode 100644 index 00000000..05091699 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py @@ -0,0 +1,153 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +import itertools + +import torch +import triton +import triton.language as tl +from torch.cuda.amp import custom_bwd, custom_fwd + + +def make_dequant_configs(block_sizes, num_warps): + configs = [] + for bs, ws in itertools.product(block_sizes, num_warps): + configs.append(triton.Config({"X_BLOCK": bs}, num_warps=ws)) + return configs + + +DEFAULT_DEQUANT_CONFIGS = make_dequant_configs([128, 256, 512, 1024], [4, 8]) + + +@triton.autotune(DEFAULT_DEQUANT_CONFIGS, key=["numels"]) +@triton.jit +def dequant_kernel_248( + g_idx_ptr, + scales_ptr, + qweight_ptr, + qzeros_ptr, + out_ptr, + numels, + maxq: tl.constexpr, + bits: tl.constexpr, + outfeatures: tl.constexpr, + num_groups: tl.constexpr, + X_BLOCK: tl.constexpr, +): + # Block indexing + xoffset = tl.program_id(0) * X_BLOCK + x_index = xoffset + tl.arange(0, X_BLOCK) + xmask = x_index < numels + row_idx = x_index // outfeatures + col_idx = x_index % outfeatures + + elements_per_feature: tl.constexpr = 32 // bits + + # Load parameters + g_idx = tl.load(g_idx_ptr + (row_idx), None, eviction_policy="evict_last") + qweights = tl.load( + qweight_ptr + (col_idx + (outfeatures * (row_idx // elements_per_feature))), + None, + ) + + wf_weights = (row_idx % elements_per_feature) * bits + + wf_zeros = (col_idx % elements_per_feature) * bits + + tmp1 = g_idx + num_groups + tmp2 = g_idx < 0 + tl.device_assert(g_idx >= 0, "index out of bounds: 0 <= tmp0 < 0") + groups = tl.where(tmp2, tmp1, g_idx) # tmp3 are g_idx + + scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float32) + + # Unpack weights + weights = qweights >> wf_weights # bit shift qweight + + weights = weights & maxq + + # Unpack zeros + qzero_ncols: tl.constexpr = outfeatures // elements_per_feature + qzeros = tl.load( + qzeros_ptr + ((qzero_ncols * groups) + (col_idx // elements_per_feature)), + None, + eviction_policy="evict_last", + ) + zeros = qzeros >> wf_zeros + zeros = zeros & maxq + + # Dequantize + weights = weights - zeros + weights = weights.to(tl.float32) + weights = scales * weights + + tl.store(out_ptr + (x_index), weights, mask=xmask) + + +def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None): + """ + Launcher for triton dequant kernel. 
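Returns the dequantized weight as a dense (infeatures, outfeatures) float16 tensor allocated on CUDA.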
Only valid for bits = 2, 4, 8 + """ + + num_groups = scales.shape[0] + outfeatures = scales.shape[1] + infeatures = g_idx.shape[0] + + out = torch.empty((infeatures, outfeatures), device="cuda", dtype=torch.float16) + numels = out.numel() + maxq = 2**bits - 1 if maxq is None else maxq + grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),) # noqa: E731 + + dequant_kernel_248[grid]( + g_idx, + scales, + qweight, + qzeros, + out, + numels, + maxq=maxq, + bits=bits, + outfeatures=outfeatures, + num_groups=num_groups, + ) + return out + + +def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False): + W = dequant248(qweight, scales, qzeros, g_idx, bits, maxq=maxq) + if transpose: + return input @ W.t() + return input @ W + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + ctx.save_for_backward(qweight, scales, qzeros, g_idx) + ctx.bits, ctx.maxq = bits, maxq + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + qweight, scales, qzeros, g_idx = ctx.saved_tensors + bits, maxq = ctx.bits, ctx.maxq + grad_input = None + + if ctx.needs_input_grad[0]: + grad_input = quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True) + return grad_input, None, None, None, None, None, None diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py new file mode 100644 index 00000000..541c22dc --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py @@ -0,0 +1,476 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from logging import getLogger + +import torch +import triton +import triton.language as tl +from torch.cuda.amp import custom_bwd, custom_fwd + +from . 
import custom_autotune + +logger = getLogger(__name__) + + +# code based https://github.com/fpgaminer/GPTQ-triton + + +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def quant_matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. + A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs + 
g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, +) +@triton.jit +def transpose_quant_matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. 
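+    This is the transposed variant (used in the backward pass): B is transposed on the fly,
+    so the result is effectively A @ dequant(B).T and has shape (M, K).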
+ A is of shape (M, N) float16 + B is of shape (K//8, N) int32 + C is of shape (M, K) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_k + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_k = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offs_n = tl.arange(0, BLOCK_SIZE_N) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_bk + g_idx = tl.load(g_ptrs) + + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales + zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros + + shifter = (offs_bk % infearure_per_bits) * bits + zeros_shifter = (offs_n % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32) + + for k in range(0, num_pid_n): + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + b = tl.trans(b) + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_N + b_ptrs += BLOCK_SIZE_N + scales_ptrs += BLOCK_SIZE_N + zeros_ptrs += BLOCK_SIZE_N // infearure_per_bits + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def silu(x): + return x * tl.sigmoid(x) + + +def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales.to(input.dtype), + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +def transpose_quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with 
torch.cuda.device(input.device): + output_dim = (qweight.shape[0] * 32) // bits + output = torch.empty((input.shape[0], output_dim), device=input.device, dtype=input.dtype) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), + ) + transpose_quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales.to(input.dtype), + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + output_dim, + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + ctx.save_for_backward(qweight, scales, qzeros, g_idx) + ctx.bits, ctx.maxq = bits, maxq + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + qweight, scales, qzeros, g_idx = ctx.saved_tensors + bits, maxq = ctx.bits, ctx.maxq + grad_input = None + + if ctx.needs_input_grad[0]: + grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq) + return grad_input, None, None, None, None, None, None + + +def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearInferenceOnlyFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py new file mode 100644 index 00000000..457d49cd --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py @@ -0,0 +1,19 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +class TritonModuleMixin: + @classmethod + def warmup(cls, model, transpose=False, seqlen=2048): + pass diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py new file mode 100644 index 00000000..ca86e26f --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py @@ -0,0 +1,4 @@ +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .gptq import GPTQ +from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/gptqmodel/quantization/config.py new file mode 100644 index 00000000..a99020fa --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/config.py @@ -0,0 +1,309 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import json +import logging +from dataclasses import dataclass, field, fields +from os.path import isdir, join +from typing import Any, Dict, Optional, Tuple + +from packaging import version +from transformers.utils.hub import cached_file + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.propagate = False +logger.addHandler(handler) +logger.setLevel(logging.INFO) + +FORMAT_FIELD_CODE = "format" +FORMAT_FIELD_JSON = "checkpoint_format" +QUANT_METHOD_FIELD = "quant_method" +QUANT_CONFIG_FILENAME = "quantize_config.json" +QUANT_CONFIG_FILENAME_COMPAT = [QUANT_CONFIG_FILENAME, "quant_config.json", "config.json"] + +MIN_VERSION_WITH_V2 = "0.9.0" + +META_FIELD = "meta" +# quantizer is the tool that did the quantization +META_FIELD_QUANTIZER = "quantizer" +# packer is the tool that packed the weights post quantization +META_FIELD_PACKER = "packer" + +META_QUANTIZER_GPTQMODEL = "gptqmodel" + + +# saved formats +class FORMAT: + GPTQ = "gptq" + # v2 format fixed sym = False quantization + GPTQ_V2 = "gptq_v2" + TRITON = "triton" + + +# quant methods +class QUANT_METHOD: + GPTQ = "gptq" + + +QUANT_METHOD_FORMAT_MAPPING = { + QUANT_METHOD.GPTQ: { + FORMAT.GPTQ, + FORMAT.GPTQ_V2, + FORMAT.TRITON, + }, +} + +# inference only methods should go here +QUANTIZE_BLACK_LIST = {} + +# compat +QUANT_CONFIG_ARG_SYNONYMS = { + "w_bit": "bits", + "q_group_size": "group_size", + # map format field (checkpoint_format) to class/code (format) + FORMAT_FIELD_JSON: FORMAT_FIELD_CODE, +} + + +@dataclass +class QuantizeConfig(): + bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) + group_size: int = field(default=-1) + damp_percent: float = 
field(default=0.01) + desc_act: bool = field(default=True) + static_groups: bool = field(default=False) + sym: bool = field(default=True) + true_sequential: bool = field(default=True) + lm_head: bool = field(default=False) + quant_method: str = field(default=QUANT_METHOD.GPTQ) + # default to gptq v1 format for maximum compat with 3rd party inference libs with minimal loss vs v2 + # if you inference with autogptq, save to gptq_v2 format for best result + format: FORMAT = field(default=FORMAT.GPTQ) + + # TODO: remove + model_name_or_path: Optional[str] = field(default=None) + model_file_base_name: Optional[str] = field(default=None) + + # properties that do not directly contributes to quantization or quant inference should be placed in meta + # i.e. quantizer tool (producer) + version, timestamp, entity who made the quant, etc + meta: Optional[Dict] = field(default=None) + + def __post_init__(self): + fields_info = fields(self) + + # validate quant method and format is matched + valid_formats = QUANT_METHOD_FORMAT_MAPPING.get(self.quant_method, None) + if valid_formats is None: + raise ValueError(f"Unsupported quantization method: {self.quant_method}") + + if self.format not in valid_formats: + raise ValueError( + f"The checkpoint format used is {self.format}, and the quantization method is {self.quant_method}. " + ) + + if self.bits not in fields_info[0].metadata["choices"]: + raise ValueError(f"only support quantize to {fields_info[0].metadata['choices']} bits.") + + if self.group_size != -1 and self.group_size <= 0: + raise ValueError("unless equal to -1, group_size must greater then 0.") + + if not (0 < self.damp_percent < 1): + raise ValueError("damp_percent must between 0 and 1.") + + # validate meta + if self.meta is not None: + if not isinstance(self.meta, dict): + raise ValueError("meta must be a dictionary") + for key, value in self.meta.items(): + if not isinstance(key, str): + raise ValueError("Keys in the meta dictionary must be strings") + else: + self.meta = {} + + def meta_set(self, key: str, value: Any): + self.meta[key] = value + + def meta_get(self, key: str) -> Any: + return self.meta.get(key) + + # versionable is a meta.property that pairs value with version i.e "value:1.0.0" + def meta_set_versionable(self, key: str, value: str, version: str): + self.meta_set(key, f"{value}:{version}") + + # versionable is a meta.property that pairs value with version i.e "value:1.0.0" + def meta_get_versionable(self, key: str) -> Tuple[str, str]: + val = self.meta_get(key) + if val is None: + return None, None + parts = val.split(":") + return parts[0].lower(), parts[1].lower() if len(parts) >= 2 else None + + # is quantized model quantized or packed by autogptq version with v2 format code + def is_quantized_or_packed_by_v2(self) -> bool: + # check meta.quantizer + producer, _version = self.meta_get_versionable(META_FIELD_QUANTIZER) + by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and (version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2)) + + # fallback to meta.packer + if not by_v2: + producer, _version = self.meta_get_versionable(META_FIELD_PACKER) + by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse(_version) >= version.parse( + MIN_VERSION_WITH_V2 + ) + + return by_v2 + + def save_pretrained(self, save_dir: str, **kwargs): + with open(join(save_dir, QUANT_CONFIG_FILENAME), "w", encoding="utf-8") as f: + json.dump(self.to_dict(), f, indent=2) + + @classmethod + # normalize quant config for compat and also performs validation + def from_quant_config(cls, 
quantize_cfg, format: str = None): + valid_formats = {FORMAT.GPTQ, FORMAT.GPTQ_V2} + format_auto_inferred = False + # compat: format can be passed in via from_quantized() if field missing from json + if format: + if format not in valid_formats: + raise ValueError(f"Unknown quantization checkpoint format: {format}.") + if quantize_cfg.get(FORMAT_FIELD_JSON): + raise ValueError("Conflict: quantization format is passed in and also exists in model config.") + # compat: warn if checkpoint_format is missing + elif quantize_cfg.get(FORMAT_FIELD_JSON) is None: + format_auto_inferred = True + + field_names = [field.name for field in fields(cls)] + + normalized = { + QUANT_METHOD_FIELD: QUANT_METHOD.GPTQ, + # compat: default to gptq(v1) when loading models + FORMAT_FIELD_CODE: format if format else FORMAT.GPTQ, + } + for key, val in quantize_cfg.items(): + key = key.lower() + + # remap keys according to compat map + if key in QUANT_CONFIG_ARG_SYNONYMS and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names: + key = QUANT_CONFIG_ARG_SYNONYMS[key] + + if key == FORMAT_FIELD_JSON: + val = val.lower() + + if val in {FORMAT.GPTQ, FORMAT.GPTQ_V2}: + normalized[key] = val + else: + raise ValueError(f"Unknown quantization format: {val}.") + elif key == QUANT_METHOD_FIELD: + val = val.lower() + # compat: some hf models use quant_method=marlin or bitblas + if val not in {QUANT_METHOD.GPTQ}: + raise ValueError(f"Unknown quantization method: {val}.") + else: + normalized[QUANT_METHOD_FIELD] = val + elif key in field_names: + normalized[key] = val + else: + logger.info(f"Ignoring unknown parameter in the quantization configuration: {key}.") + + if format_auto_inferred: + logger.info(f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + + if "sym" not in normalized: + logger.warning( + "The quantization configuration does not contain an entry `sym` (symmetric quantization). " + "This may result in silent errors. Defaulting to `sym=True`." 
+ ) + + return cls(**normalized) + + @classmethod + def from_pretrained(cls, save_dir: str, **kwargs): + # Parameters related to loading from Hugging Face Hub + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + commit_hash = kwargs.pop("_commit_hash", None) + format = kwargs.pop("format", None) + + transformers_config = False + for quantize_config_filename in QUANT_CONFIG_FILENAME_COMPAT: + if isdir(save_dir): # Local + resolved_config_file = join(save_dir, quantize_config_filename) + else: # Remote + resolved_config_file = cached_file( + save_dir, + quantize_config_filename, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, + ) + if resolved_config_file is not None: + if quantize_config_filename == "config.json": + transformers_config = True + break + + if resolved_config_file is None: + raise ValueError( + "No quantize_config.json, quant_config.json or config.json file was found in the model repository." + ) + + with open(resolved_config_file, "r", encoding="utf-8") as f: + args_from_json = json.load(f) + + if transformers_config: + args_from_json = args_from_json["quantization_config"] + + return cls.from_quant_config(args_from_json, format) + + def to_dict(self): + return { + "bits": self.bits, + "group_size": self.group_size, + "desc_act": self.desc_act, + "static_groups": self.static_groups, + "sym": self.sym, + "lm_head": self.lm_head, + "damp_percent": self.damp_percent, + "true_sequential": self.true_sequential, + # TODO: deprecate? + "model_name_or_path": self.model_name_or_path, + "model_file_base_name": self.model_file_base_name, + QUANT_METHOD_FIELD: self.quant_method, + FORMAT_FIELD_JSON: self.format, + META_FIELD: self.meta, + } + +# deprecated: will be removed in future update +@dataclass +class BaseQuantizeConfig(QuantizeConfig): + def __init__(self, **kwargs): + super().__init__(**kwargs) + logging.warning("BaseQuantizeConfig is re-named and pending deprecation. 
Please use `QuantizeConfig` instead.") diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py new file mode 100644 index 00000000..c3f2e4b3 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py @@ -0,0 +1,209 @@ +# License: GPTQModel/licenses/LICENSE.mit +# adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) + +import math +import os +import time +from logging import getLogger + +import torch +import torch.nn as nn +import transformers + +from .quantizer import Quantizer + +logger = getLogger(__name__) + +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + + +class GPTQ: + def __init__(self, layer): + self.layer = layer + self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.pytorch_utils.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + self.quantizer = Quantizer() + + def add_batch(self, inp, out): + if os.environ.get("DEBUG"): + self.inp1 = inp + self.out1 = out + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) + + def fasterquant( + self, + blocksize=128, + percdamp=0.01, + group_size=-1, + actorder=False, + static_groups=False, + ): + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = 
torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples + + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] + else: + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data) + if os.environ.get("DEBUG"): + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss + + def free(self): + if os.environ.get("DEBUG"): + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() + + +__all__ = ["GPTQ"] diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py new file mode 100644 index 00000000..3ca205d1 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py @@ -0,0 +1,142 @@ +# License: GPTQModel/licenses/LICENSE.mit +# adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) + +from logging import getLogger + +import torch +import torch.nn as nn + +logger = getLogger(__name__) + + +def quantize(x, scale, zero, maxq): + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + +class Quantizer(nn.Module): + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer("maxq", torch.tensor(0)) + self.register_buffer("scale", torch.zeros(shape)) + self.register_buffer("zero", torch.zeros(shape)) + + def configure( + self, + bits, + perchannel=False, + sym=True, + mse=False, + norm=2.4, + grid=100, + maxshrink=0.8, + trits=False, + ): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = 
sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + if trits: + self.maxq = torch.tensor(-1) + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float("inf"), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + + +__all__ = ["Quantizer"] diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py new file mode 100644 index 00000000..0e79e1d3 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py @@ -0,0 +1 @@ +from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/gptqmodel/utils/backend.py new file mode 100644 index 00000000..43dfffea --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/backend.py @@ -0,0 +1,27 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from enum import Enum + + +class Backend(Enum): + AUTO = 0 # choose the fastest one based on quant model compatibility + TRITON = 3 + +def get_backend(backend: str): + try: + return Backend[backend] + except KeyError: + raise ValueError(f"Invalid Backend str: {backend}") diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/gptqmodel/utils/data.py new file mode 100644 index 00000000..19e61779 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/data.py @@ -0,0 +1,275 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import copy +import random +from functools import partial +from typing import Callable, Dict, List, Optional + +import torch +from datasets import DatasetDict, IterableDatasetDict, load_dataset +from torch import LongTensor +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizer + + +def make_data_block( + samples: Dict[str, List[str]], + prompt_col_name: str, + label_col_name: str, + tokenizer: PreTrainedTokenizer, + preprocess_fn: Optional[Callable] = None, + sample_max_len: int = 1024, + block_max_len: int = 2048, + add_eos_token: bool = False, + truncate_prompt: bool = True, + merge_prompt_label: bool = False, +) -> Dict[str, List[LongTensor]]: + """A simple implementation of text generation oriented smart batching to maximize VRAM usage when evaluation + + :param samples: Dict[str, List[str]], samples that used to make data blocks + :param prompt_col_name: str, name of the key in samples whose value stores prompt + :param label_col_name: str, name of the key in samples whose value stores label + :param tokenizer: transformers.PretrainedTokenizer, tokenizer that used to tokenize samples + :param preprocess_fn: Optional[Callable], optional function that used to preprocess samples such as + refactor the data structure of samples, note the output of this function must be a dict whose keys + at least contains `prompt_col_name` and `label_col_name` + :param sample_max_len: int, defaults to 1024, max tokens number of each sample (before padding) + :param block_max_len: int, defaults to 2048, max tokens number of each data block (after padding) + :param add_eos_token: bool, defaults to False, whether add eos_token or not to the label + :param truncate_prompt: bool, defaults to True, whether to 
truncate prompt if the sample's total tokens + number exceeds `sample_max_len`, if not, will truncate label and drop this sample when all tokens + in label are truncated + :param merge_prompt_label: bool, defaults to False, will merge label into prompt if set to True, usually + this only required when doing language modeling task + :return: Dict[str, List[torch.LongTensor]], a dict whose keys are `input_ids`, `attention_mask` and + `label` and values are a list of torch.LongTensor + """ + if preprocess_fn: + samples = preprocess_fn(samples) + + prompts = samples[prompt_col_name] + labels = samples[label_col_name] + + # tokenize samples + tokenized_prompts = tokenizer(prompts, truncation=False)["input_ids"] + tokenized_labels = tokenizer(labels, truncation=False)["input_ids"] + + # filter tokenized samples by length + dropped_indices = [] + for idx, (tokenized_prompt, tokenized_label) in enumerate(zip(tokenized_prompts, tokenized_labels)): + if add_eos_token: + tokenized_label += [tokenizer.eos_token_id] + len_prompt = len(tokenized_prompt) + len_label = len(tokenized_label) + exceed_len = len_prompt + len_label - sample_max_len + if exceed_len > 0: + if truncate_prompt: + tokenized_prompt = tokenized_prompt[exceed_len:] + else: + tokenized_label = tokenized_label[:-exceed_len] + tokenized_prompts[idx] = tokenized_prompt + tokenized_labels[idx] = tokenized_label + if not tokenized_label: + dropped_indices.append(idx) + + # make data blocks of samples + tokenized_samples = sorted( + [(p, l) for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) if idx not in dropped_indices], + key=lambda x: (len(x[0]) + len(x[1])) if merge_prompt_label else len(x[0]), + ) + sample_blocks = [] + sample_block = [] + blk_max_len = 0 + blk_total_len = 0 + for tokenized_sample in tokenized_samples: + prompt_ids, label_ids = tokenized_sample + ori_sample_len = len(prompt_ids) + if merge_prompt_label: + ori_sample_len += len(label_ids) + if ori_sample_len <= blk_max_len: + additional_len = blk_max_len + sample_len = blk_max_len + else: + additional_len = len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + sample_len = ori_sample_len + + if blk_total_len + additional_len > block_max_len: + sample_blocks.append((copy.copy(sample_block), blk_max_len)) + sample_block = [] + blk_max_len = 0 + blk_total_len = 0 + sample_len = ori_sample_len + additional_len = ori_sample_len + + sample_block.append(tokenized_sample) + blk_max_len = max(blk_max_len, sample_len) + blk_total_len += additional_len + + if sample_block: + sample_blocks.append((copy.copy(sample_block), blk_max_len)) + del sample_block + del blk_max_len + del blk_total_len + + new_samples = {"input_ids": [], "attention_mask": [], "labels": []} + # padding each data block internally + for block, blk_max_len in sample_blocks: + input_ids = [] + attention_mask = [] + label_ids = [] + label_max_len = max([len(sample[1]) for sample in block]) + + for sample in block: + tokenized_prompt, tokenized_label = sample + sample_len = len(tokenized_prompt) + if merge_prompt_label: + sample_len += len(tokenized_label) + pad_num = blk_max_len - sample_len + if merge_prompt_label: + input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt + tokenized_label) + label_ids.append([-100] * (pad_num + len(tokenized_prompt)) + tokenized_label) + else: + input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt) + label_ids.append([-100] * (label_max_len - len(tokenized_label)) + tokenized_label) + attention_mask.append([0] * 
pad_num + [1] * sample_len) + + new_samples["input_ids"].append(input_ids) + new_samples["attention_mask"].append(attention_mask) + new_samples["labels"].append(label_ids) + + return new_samples + + +def collate_data(blocks: List[Dict[str, List[List[int]]]], pad_token_id: int) -> Dict[str, LongTensor]: + def pad_block(block, pads): + return torch.cat((block, pads.to(block.device)), dim=-1) + + input_ids_blocks = [LongTensor(block["input_ids"]) for block in blocks] + attention_mask_blocks = [LongTensor(block["attention_mask"]) for block in blocks] + label_blocks = [LongTensor(block["labels"]) for block in blocks] + + bsz = len(blocks) + inp_max_len = max([block.size(-1) for block in input_ids_blocks]) + label_max_len = max([block.size(-1) for block in label_blocks]) + + for i in range(bsz): + block_bsz, block_inp_len = input_ids_blocks[i].shape + block_label_len = label_blocks[i].shape[-1] + pad_num = inp_max_len - block_inp_len + if pad_num > 0: + input_ids_blocks[i] = pad_block(input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id) + attention_mask_blocks[i] = pad_block(attention_mask_blocks[i], torch.zeros((block_bsz, pad_num))) + label_pad_num = label_max_len - block_label_len + if label_pad_num > 0: + label_blocks[i] = pad_block(label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100) + + return { + "input_ids": torch.cat(input_ids_blocks, dim=0).long(), + "attention_mask": torch.cat(attention_mask_blocks, dim=0).long(), + "labels": torch.cat(label_blocks, dim=0).long(), + } + + +def get_dataloader( + data_path_or_name: str, + prompt_col_name: str, + label_col_name: str, + tokenizer: PreTrainedTokenizer, + load_fn: Optional[Callable] = None, + preprocess_fn: Optional[Callable] = None, + num_samples: int = 128, + sample_max_len: int = 1024, + block_max_len: int = 2048, + add_eos_token: bool = False, + truncate_prompt: bool = True, + merge_prompt_label: bool = False, + load_fn_kwargs: Optional[dict] = None, + preprocess_fn_kwargs: Optional[dict] = None, + **kwargs, +) -> DataLoader: + """load dataset and build dataloader + + :param data_path_or_name: str, dataset name in hf-hub or local file path + :param prompt_col_name: str, see `make_data_block` + :param label_col_name: str, see `make_data_block` + :param tokenizer: str, see `make_data_block` + :param load_fn: Optional[Callable], defaults to None, function used to load dataset, if not specified, + use `datasets.load_dataset` + :param preprocess_fn: Optional[Callable], see `make_data_block` + :param num_samples: int, defaults to 128, total samples used to evaluation + :param sample_max_len: int, see `make_data_block` + :param block_max_len: int, see `make_data_block` + :param add_eos_token: bool, see `make_data_block` + :param truncate_prompt: bool, see `make_data_block` + :param merge_prompt_label: bool, see `make_data_block` + :param load_fn_kwargs: Optional[dict], defaults to None, keyword arguments used + for `load_fn` or `datasets.load_dataset` + :param preprocess_fn_kwargs: Optional[dict], defaults to None, keyword arguments used + for `preprocess_fn` + :param kwargs: additional keyword arguments will be passed to torch's `DataLoader` initialization, + note values of `batch_size`, `shuffle` and `collate_fn` will always be overridden to fixed value + :return: torch.utils.data.DataLoader + """ + + if not load_fn_kwargs: + load_fn_kwargs = {} + if not preprocess_fn_kwargs: + preprocess_fn_kwargs = {} + + if load_fn: + ds = load_fn(data_path_or_name, **load_fn_kwargs) + else: + ds = 
load_dataset(data_path_or_name, **load_fn_kwargs) + if isinstance(ds, (DatasetDict, IterableDatasetDict)): + if "evaluation" in ds: + ds = ds["evaluation"] + elif "test" in ds: + ds = ds["test"] + else: + ds = ds["train"] + + ds = ds.select( + indices=random.sample(range(len(ds)), min(len(ds), num_samples)), + keep_in_memory=True, + ) + ds = ds.map( + make_data_block, + batched=True, + batch_size=len(ds), + num_proc=1, + remove_columns=ds.column_names, + keep_in_memory=True, + load_from_cache_file=False, + fn_kwargs={ + "prompt_col_name": prompt_col_name, + "label_col_name": label_col_name, + "tokenizer": tokenizer, + "preprocess_fn": partial(preprocess_fn, **preprocess_fn_kwargs), + "sample_max_len": sample_max_len, + "block_max_len": block_max_len, + "add_eos_token": add_eos_token, + "truncate_prompt": truncate_prompt, + "merge_prompt_label": merge_prompt_label, + }, + ) + + # override some arguments' values in kwargs despite user specified + kwargs["batch_size"] = 1 + kwargs["shuffle"] = False + kwargs["collate_fn"] = partial(collate_data, pad_token_id=tokenizer.pad_token_id) + dl = DataLoader(ds, **kwargs) + + return dl diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/gptqmodel/utils/importer.py new file mode 100644 index 00000000..40503569 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/importer.py @@ -0,0 +1,63 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from collections import OrderedDict +from logging import getLogger + +from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear +from ..quantization import FORMAT +from .backend import Backend + +backend_dict = OrderedDict({ + Backend.TRITON: TritonV2QuantLinear, +}) + +format_dict = { + FORMAT.GPTQ: [Backend.TRITON], + FORMAT.GPTQ_V2: [Backend.TRITON], + FORMAT.TRITON: [Backend.TRITON], +} + +logger = getLogger(__name__) + +# auto select the correct/optimal QuantLinear class +def select_quant_linear( + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, + format: FORMAT, + pack: bool = False, +): + # Handle the case where backend is AUTO. + if backend == Backend.AUTO: + allow_backends = format_dict[format] + for k, v in backend_dict.items(): + in_allow_backends = k in allow_backends + validate = v.validate(bits, group_size, desc_act, sym, raise_error=False) + check_pack_func = hasattr(v, "pack") if pack else True + if in_allow_backends and validate and check_pack_func: + logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}") + return v + + # Handle the case where backend is not AUTO. 
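# Illustrative sketch of how the AUTO path above resolves for a plain GPTQ
# checkpoint; the 4-bit / group_size=128 settings are assumptions, not values
# taken from this patch.
from gptqmodel.quantization import FORMAT
from gptqmodel.utils.backend import Backend
from gptqmodel.utils.importer import select_quant_linear

qlinear_cls = select_quant_linear(
    bits=4, group_size=128, desc_act=False, sym=True,
    backend=Backend.AUTO, format=FORMAT.GPTQ,
)
# With Backend.TRITON being the only kernel registered in backend_dict, this
# should resolve to TritonV2QuantLinear.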
+ if backend == Backend.TRITON: + logger.info("Using tritonv2 for GPTQ") + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + else: + raise NotImplementedError("Invalid Backend") + + return QuantLinear diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/gptqmodel/utils/model.py new file mode 100644 index 00000000..23ef3821 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/model.py @@ -0,0 +1,661 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import functools +import hashlib +import json +import logging +import os +from logging import getLogger +from typing import List, Optional + +import accelerate +import threadpoolctl as tctl +import torch +import torch.nn as nn +import transformers +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig +from transformers.utils.hub import cached_file + +from ..models._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS +from ..nn_modules.qlinear import BaseQuantLinear +from ..quantization import FORMAT, QuantizeConfig +from .backend import Backend +from .importer import select_quant_linear + +logger = getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def recurse_getattr(obj, attr: str): + """ + Recursive `getattr`. + + Args: + obj: + A class instance holding the attribute. + attr (`str`): + The attribute that is to be retrieved, e.g. 'attribute1.attribute2'. + """ + + def _getattr(obj, attr): + return getattr(obj, attr) + + return functools.reduce(_getattr, [obj] + attr.split(".")) + + +def recurse_setattr(module, name, value): + """A function to recursively set attributes to a module.""" + if "." 
not in name: + setattr(module, name, value) + else: + name, rest = name.split(".", 1) + recurse_setattr(getattr(module, name), rest, value) + + +def get_device(obj: torch.Tensor | nn.Module): + if isinstance(obj, torch.Tensor): + return obj.device + return next(obj.parameters()).device + + +def move_to(obj: torch.Tensor | nn.Module, device: torch.device): + if get_device(obj) != device: + obj = obj.to(device) + return obj + + +def nested_move_to(v, device): + if isinstance(v, torch.Tensor): + return move_to(v, device) + elif isinstance(v, (list, tuple)): + return type(v)([nested_move_to(e, device) for e in v]) + else: + return v + + +def find_layers(module, layers=None, name=""): + if not layers: + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] + for layer in layers: + if isinstance(module, layer): + return {name: module} + res = {} + for name1, child in module.named_children(): + res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + return res + + +def get_module_by_name_prefix(model, module_name: str): + for name, module in model.named_modules(): + if name.startswith(module_name): + return module + + +def get_module_by_name_suffix(model, module_name: str): + for name, module in model.named_modules(): + if name.endswith(module_name): + return module + + +def make_quant( + module, + names, + bits: int, + group_size: int, + backend: Backend, + format: str, + desc_act: bool = False, + sym: bool = True, + use_cuda_fp16: bool = True, + pack: bool = False, +) -> BaseQuantLinear: + select_quant_linear_func = select_quant_linear_with_pack if pack else select_quant_linear + QuantLinear = select_quant_linear_func( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=pack, + ) + + if isinstance(module, QuantLinear): + return QuantLinear + + for name, submodule in module.named_modules(): + if name in names: + ori_layer_device = next(submodule.parameters()).device + + if isinstance(submodule, nn.Linear): + in_features = submodule.in_features + out_features = submodule.out_features + elif isinstance(submodule, nn.Conv2d): + in_features = submodule.in_channels + out_features = submodule.out_channels + elif isinstance(submodule, transformers.pytorch_utils.Conv1D): + in_features = submodule.weight.shape[0] + out_features = submodule.weight.shape[1] + else: + raise NotImplementedError(f"Unsupported module {submodule}") + + bias = submodule.bias is not None + if (not (desc_act) or group_size == -1) and backend != Backend.TRITON: + new_layer = QuantLinear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + infeatures=in_features, + outfeatures=out_features, + bias=bias, + use_cuda_fp16=use_cuda_fp16, + weight_dtype=submodule.weight.dtype, + ) + else: + new_layer = QuantLinear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + infeatures=in_features, + outfeatures=out_features, + bias=bias, + weight_dtype=submodule.weight.dtype, + ) + new_layer.device = ori_layer_device + recurse_setattr(module, name, new_layer.to(ori_layer_device)) + + return QuantLinear + +def convert_gptq_v1_to_v2_format( + model, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module, +): + # Limit thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + for _, submodule in model.named_modules(): + # v1 checkpoint format used to do `qzeros = qzeros -= 1` before serialization, thus the + # additions here do not overflow. 
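# A small worked example of the 4-bit offset applied below (pure-Python sketch;
# the packed word is an illustrative value): the constant packs a +1 into each
# of the eight 4-bit zero-point fields of one int32 word.
packed_v1 = 0x76543210                                     # eight packed zero-points 0..7
packed_v2 = packed_v1 + 0b00010001000100010001000100010001
assert packed_v2 == 0x87654321                             # every nibble incremented by 1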
+ # v1 checkpoint format with sym=False saved via convert_gptq_v2_to_v1_format() will + # overflow ~<=13% based on testing + if isinstance(submodule, qlinear_kernel): + if quantize_config.bits == 2: + submodule.qzeros.data += 0b01010101010101010101010101010101 + elif quantize_config.bits == 3: + submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] += ( + 0b00100100100100100100100100100100 + ) + submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] += ( + 0b10010010010010010010010010010010 + ) + submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] += ( + 0b01001001001001001001001001001001 + ) + elif quantize_config.bits == 4: + submodule.qzeros.data += 0b00010001000100010001000100010001 + elif quantize_config.bits == 8: + submodule.qzeros.data += 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + return model + + +def convert_gptq_v2_to_v1_format( + model, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module, +): + # Limit thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + for _, submodule in model.named_modules(): + # sym=False has underflow probability of ~<=13% during testing. No underflow possible for sym=True. + if isinstance(submodule, qlinear_kernel): + if quantize_config.bits == 2: + submodule.qzeros.data -= 0b01010101010101010101010101010101 + elif quantize_config.bits == 3: + submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] -= ( + 0b00100100100100100100100100100100 + ) + submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] -= ( + 0b10010010010010010010010010010010 + ) + submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] -= ( + 0b01001001001001001001001001001001 + ) + elif quantize_config.bits == 4: + submodule.qzeros.data -= 0b00010001000100010001000100010001 + elif quantize_config.bits == 8: + submodule.qzeros.data -= 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + return model + +def select_quant_linear_with_pack(bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, format: str, pack: bool): + QuantLinear = select_quant_linear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=pack, + ) + return QuantLinear + +def pack_model( + model, + quantizers, + bits, + group_size, + backend: Backend, + format: str, + desc_act=False, + sym: bool = True, + use_cuda_fp16=True, + warmup_triton: bool = False, + force_layer_back_to_cpu: bool = False, +): + QuantLinear = select_quant_linear_with_pack( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=True, + ) + + if force_layer_back_to_cpu: + model.to(CPU) + + logger.info("Packing model...") + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant( + model, + quantizers, + bits, + group_size, + backend=backend, + format=format, + use_cuda_fp16=use_cuda_fp16, + desc_act=desc_act, + pack=True, + ) + qlayers = find_layers(model, [QuantLinear]) + + # Limit pack() thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + pbar = tqdm(qlayers.keys(), leave=True) + for name in pbar: + pbar.set_description(f"Packing {name}") + + quantizers[name], scale, zero, g_idx = quantizers[name] + # so far can only pack layer on CPU + layer_device = 
qlayers[name].device + qlayers[name].to(CPU) + layers[name], scale, zero, g_idx = ( + layers[name].to(CPU), + scale.to(CPU), + zero.to(CPU), + g_idx.to(CPU), + ) + if QuantLinear.QUANT_TYPE == "marlin": + qlayers[name].pack(layers[name], scale) + else: + qlayers[name].pack(layers[name], scale, zero, g_idx) + qlayers[name].to(layer_device) + + logger.info("Model packed.") + + if backend != Backend.TRITON and warmup_triton: + logger.warning( + "using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model." + ) + QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen) + return QuantLinear + +def verify_model_hash(file_path: str, verify_hash: str): + if not isinstance(verify_hash, str): + raise ValueError("model verify_hash must be a string") + if ':' not in verify_hash: + raise ValueError("verify_hash must be in the format 'hash_type:hash_value'") + hash_type, hash_value = verify_hash.split(':', 1) + hash_func = getattr(hashlib, hash_type, None) + if not hash_func: + raise ValueError(f"No hash function found for type: {hash_type}") + with open(file_path, "rb") as f: + file_hash = hash_func(f.read()).hexdigest() + return file_hash == hash_value + + +def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): + if not isinstance(verify_hash, list): + raise ValueError("sharded model verify_hash must be a list") + + with open(jsonPath, 'r') as f: + index_data = json.load(f) + weight_map = index_data['weight_map'] + shard_files = set(weight_map.values()) + if len(shard_files) != len(verify_hash): + raise ValueError("Number of shards and number of hash values do not match.") + + for shard_file, expected_hash in zip(shard_files, verify_hash): + if not verify_model_hash(shard_file, expected_hash): + logger.info(f"Hash verification failed for {shard_file}") + return False + return True + +def check_and_get_model_type(model_dir, trust_remote_code=False): + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code) + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + model_type = config.model_type + return model_type + + +def simple_dispatch_model(model, device_map): + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + if "" in device_map: + d = device_map[""] + model = model.to(torch.device(d)) + model.hf_device_map = device_map + return model + + tied_params = accelerate.utils.modeling.find_tied_parameters(model) + if set(device_map.values()) == {"cpu"} or set(device_map.values()) == { + "cpu", + "disk", + }: + main_device = "cpu" + else: + main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0] + + cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"] + prev_hook = None + for idx, (n, d) in enumerate(cpu_offload_group): + m = get_module_by_name_suffix(model, n) + _, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook) + # set first cpu offload module's prev_module_hook to the last cpu offload module's hook + if len(cpu_offload_group) > 1: + get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + + for n, d in device_map.items(): + m = get_module_by_name_suffix(model, n) + if d != "cpu": + d = torch.device(d) + hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True) + add_hook_to_module(m, hook) + accelerate.utils.modeling.retie_parameters(model, tied_params) + model.hf_device_map = device_map + + 
return model + + +# TODO: refractor. very strange post_init has to re-determine qlinear type again +# when qliear type is selected, it should auto-override the model post_init method and +# not have to go about looping over modules to match qlinear type a second time as it is +# very prone to bugs +def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None): + """ + The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state. + """ + + # post init for bitblas backend. + device_to_buffers_size = {} + for _, submodule in model.named_modules(): + if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "bitblas": + submodule.post_init() + + model_uses_exllama = False + for name, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + model_uses_exllama = True + device = submodule.qweight.device + if device not in device_to_buffers_size: + device_to_buffers_size[device] = { + "max_dq_buffer_size": 1, + "max_inner_outer_dim": 1, + } + + if not use_act_order: + submodule._use_act_order = False + else: + submodule._use_act_order = True + + # Disable this heuristic for detecting act_order, but it could be used instead of the config. + """ + if submodule.g_idx is None: + submodule.act_order = False + elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))): + submodule.g_idx = None + submodule.act_order = False + else: + submodule.act_order = True + """ + + device_to_buffers_size[device]["max_dq_buffer_size"] = max( + device_to_buffers_size[device]["max_dq_buffer_size"], + submodule.qweight.numel() * 8, + ) + + if use_act_order: + device_to_buffers_size[device]["max_inner_outer_dim"] = max( + device_to_buffers_size[device]["max_inner_outer_dim"], + submodule.infeatures, + submodule.outfeatures, + ) + + if model_uses_exllama: + # To be honest this is quite ugly, not proud of this. + from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params + + device_to_buffers = {} + + if use_act_order: + if max_input_length is None: + max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH + else: + max_input_len = max_input_length + else: + if max_input_length is not None: + logger.info( + "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored." + ) + max_input_len = 1 + + for device, buffers_size in device_to_buffers_size.items(): + # The temp_state buffer is required to reorder X in the act-order case. + # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. + device_to_buffers[device] = { + "temp_state": torch.zeros( + (max_input_len, buffers_size["max_inner_outer_dim"]), + dtype=torch.float16, + device=device, + ), + "temp_dq": torch.zeros( + (1, buffers_size["max_dq_buffer_size"]), + dtype=torch.float16, + device=device, + ), + "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], + "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], + } + + # Buffers need to be persistent to avoid any bug. + model.device_to_buffers = device_to_buffers + + for device, buffers in model.device_to_buffers.items(): + prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"]) + + # Using the default from exllama repo here. 
+ matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) + + # The buffers need to have been initialized first before calling make_q4. + for name, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + submodule.post_init() + + # exllamav2 + fixed_bytes = {} + model_uses_exllamav2 = False + + for _, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + model_uses_exllamav2 = True + device = submodule.qweight.device + scratch_fixed = submodule.scratch_space_fixed() + fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0)) + + if model_uses_exllamav2: + from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors + + device_tensors = {} + for device, scratch_bytes in fixed_bytes.items(): + device_tensors[device] = ExLlamaV2DeviceTensors(device.index, scratch_bytes) + + # have persistent buffers, otherwise we will get OOM + model.device_tensors = device_tensors + + for _, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + device = submodule.qweight.device + submodule.post_init(temp_dq=model.device_tensors[device]) + torch.cuda.empty_cache() + + return model + + +def get_checkpoints( + model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs +): + """ + Retrives (and if necessary downloads from Hugging Face Hub) the model checkpoint. Sharding is supported. All the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) will be explored over all `extensions` (e.g. `[".bin", ".safetensors"]`). + """ + searched_files = [] + resolved_archive_file = None + true_model_basename = None + + if os.path.isdir(model_name_or_path): + for ext in extensions: + for possible_model_basename in possible_model_basenames: + shard_index_name = possible_model_basename + ext + ".index.json" + searched_files.append(shard_index_name) + possible_index_file = os.path.join(model_name_or_path, shard_index_name) + if os.path.isfile(possible_index_file): + # The model is sharded over several checkpoints. + possible_model_basename = possible_index_file.replace(ext + ".index.json", "") + return True, possible_index_file, possible_model_basename + else: + model_save_name = os.path.join(model_name_or_path, possible_model_basename) + searched_files.append(possible_model_basename + ext) + if os.path.isfile(model_save_name + ext): + resolved_archive_file = model_save_name + ext + return False, resolved_archive_file, possible_model_basename + else: + temp = None + for ext in extensions: + for possible_model_basename in possible_model_basenames: + shard_index_name = possible_model_basename + ext + ".index.json" + shard_index = cached_file( + model_name_or_path, + shard_index_name, + **cached_file_kwargs, + ) + searched_files.append(shard_index_name) + if shard_index is not None: + # The model is sharded over several checkpoints. + with open(str(shard_index)) as f: + index_json = json.load(f) + # Download the shards from the index.json. 
+ shards = list(set(index_json["weight_map"].values())) + for shard in shards: + resolved_archive_file = cached_file( + model_name_or_path, + shard, + **cached_file_kwargs, + ) + return True, shard_index, possible_model_basename + else: + resolved_archive_file = cached_file( + model_name_or_path, + possible_model_basename + ext, + **cached_file_kwargs, + ) + if resolved_archive_file is None: + resolved_archive_file = temp + searched_files.append(possible_model_basename + ext) + if resolved_archive_file is not None: + temp = resolved_archive_file + return False, resolved_archive_file, possible_model_basename + + if resolved_archive_file is None: + raise FileNotFoundError( + f"Could not find a model in {model_name_or_path} with a name in {', '.join(searched_files)}. Please specify the argument model_basename to use a custom file name." + ) + + return False, resolved_archive_file, true_model_basename + + +# return the most stable tensor dtype for quantization while minimizing vram +def auto_dtype_from_config(config: PretrainedConfig, quant_inference: bool = False) -> torch.dtype: + # all the gptq inference kernels are float16 only + if quant_inference: + return torch.float16 + + dtype = getattr(config, "torch_dtype") + if not dtype or not isinstance(dtype, torch.dtype): + raise ValueError("Your model config.json does not have torch_dtype set. Please check for model " "corruption.") + + if dtype == torch.float32: + return torch.bfloat16 + elif dtype == torch.float16: + return torch.float16 + else: + # up/down-cast everything else to bfloat16 if not already in bfloat16 + return torch.bfloat16 + + +# generate layer modules for moe models with experts +def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: + new_inside_layer_modules = [] + for names in layer_modules: + new_inside_layer_modules.append([]) + for n in names: + if EXPERT_INDEX_PLACEHOLDER in n: + for index in range(num_experts): + new_inside_layer_modules[-1].append(n.replace(EXPERT_INDEX_PLACEHOLDER, str(index))) + else: + new_inside_layer_modules[-1].append(n) + + return new_inside_layer_modules diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py new file mode 100644 index 00000000..04f2c878 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py @@ -0,0 +1,153 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
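# Illustrative expansion sketch for get_moe_layer_modules above; the
# Mixtral-style module name and num_experts=2 are assumptions, not taken from
# this patch.
from gptqmodel.models._const import EXPERT_INDEX_PLACEHOLDER
from gptqmodel.utils.model import get_moe_layer_modules

layer_modules = [[f"block_sparse_moe.experts.{EXPERT_INDEX_PLACEHOLDER}.w1"]]
expanded = get_moe_layer_modules(layer_modules, num_experts=2)
# -> [["block_sparse_moe.experts.0.w1", "block_sparse_moe.experts.1.w1"]]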
+############################################################################### +import warnings +from contextlib import contextmanager +from typing import List, Optional, Tuple, Union + +import torch +from peft import PeftConfig, PeftModel, PeftType, get_peft_model +from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING +from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING +from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel +from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ + +from ..models.base import BaseGPTQModel +from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as QuantLinearTriton + +class GPTQLoraConfig(LoraConfig): + injected_fused_attention: bool = False + injected_fused_mlp: bool = False + +class GPTQLoraModel(LoraModel): + def _replace_module(self, parent_module, child_name, new_module, old_module): + # replace the lora linear + setattr(parent_module, child_name, new_module) + # dispatch to correct device + # FIXME: refactor + for name, module in new_module.named_modules(): + if "lora_" in name: + device = (list(old_module.parameters()) + list(old_module.buffers()))[ + 0 + ].device + module.to(device) + + @staticmethod + def _create_new_module( + lora_config: LoraConfig, + adapter_name: str, + target: torch.nn.Module, + target_cls: torch.nn.Module = QuantLinearTriton, + **kwargs, + ): + # if the base layer module matches a supported class, dispatch the lora linear + # to be installed + new_module = None + if isinstance(target, target_cls): + new_module = LoraLinearGPTQ( + target, adapter_name, lora_config=lora_config, **kwargs + ) + + # if module cannot be found, return None which results in a raise in the call-stack + return new_module + + + def merge_adapter(self): + raise NotImplementedError("gptq model not support merge ada lora adapter") + + def unmerge_adapter(self): + raise NotImplementedError("gptq model not support unmerge ada lora adapter") + + def merge_and_unload(self): + raise NotImplementedError("gptq model not support merge and unload") + + +def find_all_linear_names( + model: BaseGPTQModel, + ignore: Optional[List[str]] = None, + ignore_lm_head: bool = True, +): + if not ignore: + ignore = [] + lm_head_name = model.lm_head_name + if ignore_lm_head and lm_head_name not in ignore: + ignore.append(lm_head_name) + results = set() + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear): + res = n.split(".")[-1] + if res not in ignore: + results.add(res) + return list(results) + + +@contextmanager +def hijack_peft_mappings(): + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + + try: + yield + except: + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + raise + finally: + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + +def get_gptq_peft_model( + model: BaseGPTQModel, + peft_config: PeftConfig = None, + model_id: str = None, + adapter_name: str = "default", + auto_find_all_linears: bool = True, + train_mode: bool = False, +): + if train_mode and not peft_config: + raise ValueError("peft_config not specified when in train mode.") + if not train_mode and not model_id: + raise ValueError("model_id(where to load adapters) not specified when in inference mode.") + + if train_mode: + peft_type = peft_config.peft_type + if not isinstance(peft_type, str): + peft_type = peft_type.value + if peft_type in 
[PeftType.LORA.value]: + if auto_find_all_linears: + peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True) + if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): + peft_config = GPTQLoraConfig(**peft_config.to_dict()) + + with hijack_peft_mappings(): + try: + if train_mode: + peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) + else: + peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) + except: + raise NotImplementedError( + f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet." + ) + + return peft_model + + +__all__ = [ + "GPTQLoraConfig", + "GPTQLoraModel", + "find_all_linear_names", + "get_gptq_peft_model", +] \ No newline at end of file diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py new file mode 100644 index 00000000..c383db75 --- /dev/null +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -0,0 +1,99 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +# -- do not touch +import os + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import unittest # noqa: E402 + +import torch # noqa: E402 +from gptqmodel import Backend, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +GENERATE_EVAL_SIZE = 100 + +class TestsQ4Triton(unittest.TestCase): + def test_generation_desc_act_false(self): + prompt = "I am in Paris and" + + reference_output = " I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n(" + new_tokens = 60 + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + + model_q = GPTQModel.from_quantized( + model_id, + device="cuda:0", + backend=Backend.TRITON, + torch_dtype=torch.float16, + ) + for _, submodule in model_q.named_modules(): + if isinstance(submodule, TritonV2QuantLinear): + break + else: + raise ValueError("Did not find a tritonv2 linear layer") + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") + + # This one uses Autocast. + res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. 
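# Rough training-mode sketch for get_gptq_peft_model defined earlier; the model
# id, LoRA hyperparameters, and target module names are illustrative
# assumptions (a CUDA device is assumed to be available).
from peft import LoraConfig
from gptqmodel import Backend, GPTQModel
from gptqmodel.utils.peft import get_gptq_peft_model

base = GPTQModel.from_quantized(
    "some-org/some-gptq-4bit-model",   # hypothetical quantized checkpoint
    device="cuda:0",
    backend=Backend.TRITON,
)
lora = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
peft_model = get_gptq_peft_model(
    base, peft_config=lora, auto_find_all_linears=False, train_mode=True
)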
+ res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + def test_generation_desc_act_true(self): + prompt = "I am in Paris and" + device = torch.device("cuda:0") + + # Reference generated with the cuda-old kernel + reference_output = " I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + revision = "desc_act_true" + + model_q = GPTQModel.from_quantized( + model_id, + device="cuda:0", + backend=Backend.TRITON, + revision=revision, + + ) + for _, submodule in model_q.named_modules(): + if isinstance(submodule, TritonV2QuantLinear): + break + else: + raise ValueError("Did not find a tritonv2 linear layer") + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) + + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py new file mode 100644 index 00000000..410aa6e1 --- /dev/null +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -0,0 +1,107 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +# -- do not touch +import os + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import os # noqa: E402 +import unittest # noqa: E402 + +import torch # noqa: E402 +import torch.utils.benchmark as benchmark # noqa: E402 +from gptqmodel import Backend, GPTQModel # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +MODEL_ID = "TheBloke/Llama-7B-GPTQ" +DATASET_ID = "timdettmers/openassistant-guanaco" +LEARNING_RATE = 3e-5 +MAX_SEQ_LEN = 10 +BATCH_SIZE = 5 +NUM_TRAIN_STEPS = 10 + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +def benchmark_forward( + fn, + *inputs, + repeats="auto", + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + if verbose: + print(desc, "- Forward pass") + + def amp_wrapper(*inputs, **kwinputs): + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + fn(*inputs, **kwinputs) + + t = benchmark.Timer( + stmt="fn_amp(*inputs, **kwinputs)", + globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + if repeats == "auto": + m = t.blocked_autorange() + else: + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def get_model_and_tokenizer( + model_id=MODEL_ID, + **model_kwargs, +): + tokenizer = AutoTokenizer.from_pretrained( + MODEL_ID, + use_fast=True, + ) + if not tokenizer.pad_token_id: + tokenizer.pad_token_id = tokenizer.eos_token_id + + model = GPTQModel.from_quantized( + model_id, + disable_exllamav2=True, + disable_exllama=True, + **model_kwargs, + ) + + model.warmup_triton() + return model, tokenizer + + +class TestTriton(unittest.TestCase): + def test_triton_qlinear(self): + ref_model, _ = get_model_and_tokenizer( + model_id=MODEL_ID, + backend=Backend.TRITON, + ) + + hidden_size = ref_model.model.model.embed_tokens.weight.shape[1] + test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda() + + qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj + + ref_out = qlinear_ref(test_data) # noqa: F841 + + _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True) From 62424cb2265c03b8f56b271548d2550300483580 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 16:07:20 +0000 Subject: [PATCH 02/20] edited peft header --- .../src/gptqmodel/utils/peft.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py index 04f2c878..db27fa09 100644 --- a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py @@ -1,17 +1,25 @@ ############################################################################### -# Adapted from https://github.com/ModelCloud/GPTQModel -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Adapted from https://github.com/AutoGPTQ/AutoGPTQ +# MIT License +# Copyright (c) 2024 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. ############################################################################### import warnings from contextlib import contextmanager From 6a2a9e08512600d92703e22614b8c8be72bbb995 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 17:41:32 +0000 Subject: [PATCH 03/20] add package build workflow --- .github/workflows/build.yml | 55 +++++++++++++++++++++++++++ plugins/accelerated-peft/tox.ini | 5 +-- plugins/framework/tox.ini | 2 +- plugins/fused-ops-and-kernels/tox.ini | 2 +- 4 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..81848e62 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,55 @@ +on: + push: + branches: ["extracted_autogptq"] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + # - setup: "3.10" + # tox: "py310" + - setup: "3.9" + tox: "py39" + plugin_name: + - "framework" + - "accelerated-peft" + - "fused-ops-and-kernels" + + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + + steps: + - uses: actions/checkout@v4 + + # should only trigger the release if the plugin has changes + - uses: dorny/paths-filter@v2 + id: changes + with: + filters: | + plugins: + - 'plugins/${{ matrix.plugin_name }}/**' + - if: steps.changes.outputs.plugins == 'true' + name: Set up Python ${{ matrix.python-version.setup }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version.setup }} + + - if: steps.changes.outputs.plugins == 'true' + name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade tox + + - if: steps.changes.outputs.plugins == 'true' + name: Build and check wheel package + run: | + cd plugins/${{ matrix.plugin_name }} + tox -e build,twinecheck + ls dist/* + # - if: steps.changes.outputs.plugins == 'true' + # name: "Upload to PyPI" + # uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # v1.9.0 + # with: + # packages-dir: plugins/${{ matrix.plugin_name }}/dist \ No newline at end of file diff --git a/plugins/accelerated-peft/tox.ini b/plugins/accelerated-peft/tox.ini index c2b33772..cdcf221d 100644 --- a/plugins/accelerated-peft/tox.ini +++ b/plugins/accelerated-peft/tox.ini @@ -1,5 +1,5 @@ 
[tox] -envlist = py, lint +envlist = py, lint, fmt, build, twinecheck [testenv] deps = @@ -37,14 +37,13 @@ commands = black {posargs:.} isort {posargs:.} - [testenv:build] description = build wheel deps = build commands = python -m build -w skip_install = True - + [testenv:twinecheck] description = check wheel deps = diff --git a/plugins/framework/tox.ini b/plugins/framework/tox.ini index acfde95d..16764631 100644 --- a/plugins/framework/tox.ini +++ b/plugins/framework/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py, lint, fmt +envlist = py, lint, fmt, build, twinecheck [testenv] deps = diff --git a/plugins/fused-ops-and-kernels/tox.ini b/plugins/fused-ops-and-kernels/tox.ini index 6a667f53..c3a38721 100644 --- a/plugins/fused-ops-and-kernels/tox.ini +++ b/plugins/fused-ops-and-kernels/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py, lint +envlist = py, lint, fmt, build, twinecheck [testenv] deps = From 022d636eb395debeea9d27a8dde9ca54dc53774e Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Tue, 2 Jul 2024 03:01:36 +0000 Subject: [PATCH 04/20] add unit tests on extracted autogptq --- .../accelerated-peft/tests/test_gptqmodel.py | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 plugins/accelerated-peft/tests/test_gptqmodel.py diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py new file mode 100644 index 00000000..4e8e349c --- /dev/null +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -0,0 +1,223 @@ +from transformers.utils.import_utils import _is_package_available +import pytest # pylint: disable=import-error +import torch +from typing import List +from types import MethodType +from functools import partial +from peft import LoraConfig, prepare_model_for_kbit_training +from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ +from peft.tuners.lora.model import LoraModel + +GPTQ = "gptq" +# r, lora_alpha +FLOAT16 = "float16" +LORA_r = 8 +LORA_alpha = 1.0 +BS = 1 +SEQLEN = 128 + +ALLCLOSE_RTOL = 1e-3 +ALLCLOSE_ATOL = 1e-4 + +VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" +QUANTIZED_MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" +TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] + +def replace_module_peft(self, parent_module, child_name, new_module, old_module): + setattr(parent_module, child_name, new_module) + for name, module in new_module.named_modules(): + if "lora_" in name: + device = (list(old_module.parameters()) + list(old_module.buffers()))[0].device + module.to(device) + +def create_new_module_peft( + lora_config: LoraConfig, + adapter_name: str, + target: torch.nn.Module, + target_cls, + **kwargs, +): + new_module = None + if isinstance(target, target_cls): + new_module = LoraLinearGPTQ( + target, adapter_name, lora_config=lora_config, **kwargs + ) + return new_module + + +def get_autogptq_peft_model(model, peft_config): + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + from auto_gptq.utils.peft_utils import GPTQLoraModel, get_gptq_peft_model + + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing=False, + gradient_checkpointing_kwargs={}, + ) + + _old_create_new_module = LoraModel._create_new_module + _old_replace_module = GPTQLoraModel._replace_module + _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) + LoraModel._create_new_module = staticmethod(_create_new_module) + GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + + # Install GPTQ adapters 
using the AutoGPTQ package (with the above patches) + model = get_gptq_peft_model( + model, + peft_config=peft_config, + auto_find_all_linears=peft_config.target_modules is None, + train_mode=True, # install adapaters for training + ) + + # undo the patching for hygine + LoraModel._create_new_module = staticmethod(_old_create_new_module) + GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + return model + +def get_autogptq_lib_quantized_model(model_name:str, target_modules:List, torch_dtype:str): + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + quantize_config = BaseQuantizeConfig.from_pretrained(model_name) + + device_map = { + "": ( + torch.cuda.current_device() + if torch.cuda.is_available() + else None + ) + } + model = AutoGPTQForCausalLM.from_quantized( + model_name, + quantize_config=quantize_config, + torch_dtype=getattr(torch, torch_dtype), + low_cpu_mem_usage=False, + use_marlin=False, + disable_exllama=True, + warmup_triton=False, + use_tritonv2=True, + trainable=True, + device_map=device_map, + ) + + peft_config = LoraConfig( + r=LORA_r, + lora_alpha=LORA_alpha, + lora_dropout=0.0, # anyway we are going to override it + target_modules=target_modules, + ) + # model = get_autogptq_peft_model(model, peft_config) + return model + +def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str): + from fms_acceleration_peft.framework_plugin_autogptq import ( + AutoGPTQAccelerationPlugin, + ) + plugins = { + GPTQ: AutoGPTQAccelerationPlugin( + { + "peft": { + "quantization": { + "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} + } + } + } + ), + } + + class TrainArgs: + gradient_checkpointing = False + gradient_checkpointing_kwargs = {} + + args = TrainArgs() + peft_config = LoraConfig( + r=LORA_r, + lora_alpha=LORA_alpha, + lora_dropout=0.0, # anyway we are going to override it + target_modules=target_modules, + ) + + _plugin = plugins[GPTQ] + model = _plugin.model_loader( + model_name, torch_dtype=getattr(torch, FLOAT16) + ) + # model, _ = _plugin.augmentation(model, args, (peft_config,)) + return model + +@pytest.fixture() +def input_ids(seed: int = 42, device: torch.device = "cuda"): + torch.manual_seed(seed) + yield torch.randint(0, 10000, (BS, SEQLEN)) + +@pytest.mark.skipif( + not _is_package_available("auto_gptq"), + reason="Only runs if auto_gptq is installed", +) +def test_already_quantized_outputs_match( + input_ids, seed: int = 42, +): + torch.manual_seed(seed) + original_model = get_autogptq_lib_quantized_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + refactored_model = load_autogptq_plugin_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + with torch.autocast(device_type='cuda', dtype=torch.float32): + original_model.eval() + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_model.eval() + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + + assert torch.allclose( + original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL + ), "Logits don't match between refactored quantized model and original library" + + +@pytest.mark.skipif( + not _is_package_available("auto_gptq"), + reason="Only runs if auto_gptq is installed", +) +def test_pretrained_to_quantized_outputs_match( + input_ids, seed: int = 42, +): + torch.manual_seed(seed) + from transformers import AutoTokenizer + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + from gptqmodel import GPTQModel, QuantizeConfig + device = 
torch.device("cuda") + + tokenizer = AutoTokenizer.from_pretrained(VANILLA_MODEL_NAME, use_fast=True) + calibration_dataset = [ + tokenizer( + "The world is a wonderful place full of beauty and love." + ) + ] + + original_quantize_config = BaseQuantizeConfig( + bits=4, + group_size=-1, + desc_act=False, + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + original_model = AutoGPTQForCausalLM.from_pretrained( + VANILLA_MODEL_NAME, + original_quantize_config + ).to(device) + # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" + original_model.quantize(calibration_dataset, use_triton=True) + + refactored_quant_config = QuantizeConfig( + bits=4, + group_size=-1, + desc_act=False, + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + refactored_model = GPTQModel.from_pretrained(VANILLA_MODEL_NAME, refactored_quant_config).to(device) + # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask" + refactored_model.quantize(calibration_dataset) + + with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.no_grad(): + original_model.eval() + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_model.eval() + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + + assert torch.allclose( + original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL + ), "Logits don't match between refactored quantized model and original library" \ No newline at end of file From 8976983f139f9694789bb1a6c0a7f30815b6b2fe Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Wed, 3 Jul 2024 07:20:03 +0000 Subject: [PATCH 05/20] modify autogptq plugin to support both external and extracted autogptq --- .github/workflows/build.yml | 55 --- plugins/accelerated-peft/pyproject.toml | 3 +- plugins/accelerated-peft/requirements.txt | 6 +- .../framework_plugin_autogptq.py | 172 ++++++---- .../accelerated-peft/tests/test_gptqmodel.py | 313 +++++++++--------- 5 files changed, 266 insertions(+), 283 deletions(-) delete mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 81848e62..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,55 +0,0 @@ -on: - push: - branches: ["extracted_autogptq"] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: - # - setup: "3.10" - # tox: "py310" - - setup: "3.9" - tox: "py39" - plugin_name: - - "framework" - - "accelerated-peft" - - "fused-ops-and-kernels" - - permissions: - id-token: write # IMPORTANT: this permission is mandatory for trusted publishing - - steps: - - uses: actions/checkout@v4 - - # should only trigger the release if the plugin has changes - - uses: dorny/paths-filter@v2 - id: changes - with: - filters: | - plugins: - - 'plugins/${{ matrix.plugin_name }}/**' - - if: steps.changes.outputs.plugins == 'true' - name: Set up Python ${{ matrix.python-version.setup }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version.setup }} - - - if: steps.changes.outputs.plugins == 'true' - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade tox - - - if: steps.changes.outputs.plugins == 'true' - name: Build and check wheel package - run: | - cd plugins/${{ matrix.plugin_name }} - tox -e 
build,twinecheck - ls dist/* - # - if: steps.changes.outputs.plugins == 'true' - # name: "Upload to PyPI" - # uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # v1.9.0 - # with: - # packages-dir: plugins/${{ matrix.plugin_name }}/dist \ No newline at end of file diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index 35789df0..a2b2f12e 100644 --- a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -26,12 +26,13 @@ classifiers=[ [project.optional-dependencies] flash-attn = ["flash-attn"] +auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git"] [tool.hatch.metadata.hooks.requirements_txt] files = ["requirements.txt"] [tool.hatch.build.targets.wheel] -only-include = ["src/fms_acceleration_peft"] +only-include = ["src/"] [tool.hatch.metadata] allow-direct-references = true diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index 77db9597..3bcb995f 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -3,11 +3,7 @@ # put this in here because there is a breaking FSDP api change that # is fixed after peft > 0.10 -accelerate < 0.29 +accelerate <= 0.29 # bitsandbytes for the BNB plugin bitsandbytes - -# Installing from repository because "auto_gptq > 0.7.1" it not yet available -# Specifying the commit id here as recent commits to the main branch have introduced additional dependencies -auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git@ea829c7bbe83561c2b1de26795b6592992373ef7 \ No newline at end of file diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 7928d9a9..63254c25 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -20,6 +20,7 @@ from functools import partial from types import MethodType from typing import Dict, Tuple +import importlib import os # Third Party @@ -31,12 +32,11 @@ import torch import torch.distributed - class AutoGPTQAccelerationPlugin(AccelerationPlugin): - require_packages = ["auto_gptq"] + require_packages = [] - def __init__(self, configurations: Dict[str, Dict]): + def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = False): super().__init__(configurations) # just do checking, nothing must to configure at this point @@ -47,17 +47,25 @@ def __init__(self, configurations: Dict[str, Dict]): self._check_config_equal( key="peft.quantization.auto_gptq.from_quantized", value=True ) + self.use_external_lib = use_external_lib and importlib.util.find_spec("autogptq") is not None def model_loader(self, model_name: str, **kwargs): # guarded imports # Third Party - from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error - AutoGPTQForCausalLM, - BaseQuantizeConfig, - ) - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error - QuantLinear, - ) + if self.use_external_lib: + from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error + AutoGPTQForCausalLM as GPTQModel, + BaseQuantizeConfig as QuantizeConfig, + ) + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error + QuantLinear, + ) + else: + from gptqmodel import 
GPTQModel, QuantizeConfig + from gptqmodel.utils import Backend + from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + QuantLinear, + ) # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel @@ -85,7 +93,7 @@ def model_loader(self, model_name: str, **kwargs): # switching to cuda/cuda_old/triton backend." # assume model_name points to a quantized checkpoint. Thus we load the quantization # config directly from the checkpoint. - quantize_config = BaseQuantizeConfig.from_pretrained(model_name) + quantize_config = QuantizeConfig.from_pretrained(model_name) # get additional parameters torch_dtype = kwargs.get("torch_dtype", torch.float32) @@ -101,23 +109,39 @@ def model_loader(self, model_name: str, **kwargs): ) AutoModelForCausalLM.from_config = _from_config # patch + if self.use_external_lib: + kwargs = { + "low_cpu_mem_usage": low_cpu_mem_usage, + "use_marlin": False, # disable, cannot be used for training (no forward+backward) + "disable_exllama": True, # disable, cannot be used for training (no backward) + "use_tritonv2": True, + "trainable": True, # only support trainable mode + } + else: + kwargs = { + "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map + "backend": Backend.TRITON, + } + + # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate if is_fsdp_enabled(): - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) - low_cpu_mem_usage = True + if self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + kwargs["low_cpu_mem_usage"] = True # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. 
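# A minimal sketch of the device-map resolution used below, assuming `torch` is
# already imported and `low_cpu_mem_usage` holds the flag computed above (the
# plugin passes it through `kwargs["low_cpu_mem_usage"]`); names here are
# illustrative only:
#
#     if not torch.cuda.is_available():
#         device_map = {"": None}                         # no accelerator available
#     elif low_cpu_mem_usage:
#         device_map = {"": "cpu"}                        # FSDP low-cpu-mem path: materialize on CPU first
#     else:
#         device_map = {"": torch.cuda.current_device()}  # load directly onto the current GPU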
# device_map is for inference only @@ -130,7 +154,7 @@ def model_loader(self, model_name: str, **kwargs): # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 device_map = { "": ( - (torch.cuda.current_device() if not low_cpu_mem_usage else "cpu") + (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") if torch.cuda.is_available() else None ) @@ -138,17 +162,13 @@ def model_loader(self, model_name: str, **kwargs): # currently only enable triton_v2, because the triton kernels are the only ones # that have backwards - model = AutoGPTQForCausalLM.from_quantized( + model = GPTQModel.from_quantized( model_name, quantize_config=quantize_config, torch_dtype=torch_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - use_marlin=False, # disable, cannot be used for training (no forward+backward) - disable_exllama=True, # disable, cannot be used for training (no backward) - warmup_triton=False, # disable for now as it will try to run the warmup while on CPU - use_tritonv2=True, - trainable=True, # only support trainable mode device_map=device_map, + warmup_triton=False, # disable for now as it will try to run the warmup while on CPU + **kwargs, ) # https://github.com/foundation-model-stack/fms-acceleration/pull/15 @@ -219,19 +239,22 @@ def augmentation( ): # guarded imports # Third Party - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error - QuantLinear, - ) - from auto_gptq.utils.peft_utils import ( # pylint: disable=import-outside-toplevel,import-error - GPTQLoraModel, - get_gptq_peft_model, - ) + if self.use_external_lib: + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error + QuantLinear, + ) + from auto_gptq.utils.peft_utils import ( # pylint: disable=import-outside-toplevel,import-error + GPTQLoraModel, + get_gptq_peft_model, + ) + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + create_new_module_peft, + replace_module_peft, + ) + else: + from gptqmodel.utils.peft import get_gptq_peft_model - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - create_new_module_peft, - replace_module_peft, - ) (peft_config,) = modifiable_args # unpack modifiable args @@ -249,31 +272,33 @@ def augmentation( gradient_checkpointing_kwargs=train_args.gradient_checkpointing_kwargs, ) - # These functions need to replaced due to some incompatibliites - # with newer PEFT packages. - # - on augmentation we call auto_gptq.utils.peft_utils.get_gptq_peft_model - # - this internally calls peft.utils.other.get_peft_model - # - however the problem is that peft API moves very fast, and there are incompatiblities - # - # During peft wrapping there are two key operations - # 1. LoraModel._create_new_module is called to create a LoraLinear layer that is - # compatible with the base layer. For quantized base layers, the LoraLinear - # may be different. - # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. - # Also move to device (which may depend on how base layer is implemented) - - # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called - # on the parent. Hence _create_new_module is patched on the parent - - # FIXME: - # 1. 
investigate using BaseGPTQForCausalLM.make_sure_compatible_with_peft - # to see if we can get around the patching - - _old_create_new_module = LoraModel._create_new_module - _old_replace_module = GPTQLoraModel._replace_module - _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) - LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + if self.use_external_lib: + # These functions need to replaced due to some incompatibliites + # with newer PEFT packages. + # - on augmentation we call auto_gptq.utils.peft_utils.get_gptq_peft_model + # - this internally calls peft.utils.other.get_peft_model + # - however the problem is that peft API moves very fast, and there are incompatiblities + # + # During peft wrapping there are two key operations + # 1. LoraModel._create_new_module is called to create a LoraLinear layer that is + # compatible with the base layer. For quantized base layers, the LoraLinear + # may be different. + # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. + # Also move to device (which may depend on how base layer is implemented) + + # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called + # on the parent. Hence _create_new_module is patched on the parent + + # FIXME: + # 1. investigate using BaseGPTQForCausalLM.make_sure_compatible_with_peft + # to see if we can get around the patching + + _old_create_new_module = LoraModel._create_new_module + _old_replace_module = GPTQLoraModel._replace_module + _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) + LoraModel._create_new_module = staticmethod(_create_new_module) + GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + # Install GPTQ adapters using the AutoGPTQ package (with the above patches) model = get_gptq_peft_model( @@ -284,9 +309,10 @@ def augmentation( ) modifiable_args = (None,) # return a None for peft_config - # undo the patching for hygine - LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + if self.use_external_lib: + # undo the patching for hygine + LoraModel._create_new_module = staticmethod(_old_create_new_module) + GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) return model, modifiable_args diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 4e8e349c..84581da1 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -1,12 +1,10 @@ -from transformers.utils.import_utils import _is_package_available import pytest # pylint: disable=import-error import torch from typing import List -from types import MethodType -from functools import partial -from peft import LoraConfig, prepare_model_for_kbit_training -from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ -from peft.tuners.lora.model import LoraModel + +from transformers.utils.import_utils import _is_package_available +from transformers import AutoTokenizer, AutoConfig, GenerationConfig, AutoModelForCausalLM +from peft import LoraConfig GPTQ = "gptq" # r, lora_alpha @@ -16,112 +14,30 @@ BS = 1 SEQLEN = 128 +LOSS_TOLERANCE = 1e-3 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" QUANTIZED_MODEL_NAME = 
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] - -def replace_module_peft(self, parent_module, child_name, new_module, old_module): - setattr(parent_module, child_name, new_module) - for name, module in new_module.named_modules(): - if "lora_" in name: - device = (list(old_module.parameters()) + list(old_module.buffers()))[0].device - module.to(device) - -def create_new_module_peft( - lora_config: LoraConfig, - adapter_name: str, - target: torch.nn.Module, - target_cls, - **kwargs, -): - new_module = None - if isinstance(target, target_cls): - new_module = LoraLinearGPTQ( - target, adapter_name, lora_config=lora_config, **kwargs - ) - return new_module - - -def get_autogptq_peft_model(model, peft_config): - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear - from auto_gptq.utils.peft_utils import GPTQLoraModel, get_gptq_peft_model - - model = prepare_model_for_kbit_training( - model, - use_gradient_checkpointing=False, - gradient_checkpointing_kwargs={}, - ) - - _old_create_new_module = LoraModel._create_new_module - _old_replace_module = GPTQLoraModel._replace_module - _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) - LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) - - # Install GPTQ adapters using the AutoGPTQ package (with the above patches) - model = get_gptq_peft_model( - model, - peft_config=peft_config, - auto_find_all_linears=peft_config.target_modules is None, - train_mode=True, # install adapaters for training - ) - - # undo the patching for hygine - LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) - return model - -def get_autogptq_lib_quantized_model(model_name:str, target_modules:List, torch_dtype:str): - from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - quantize_config = BaseQuantizeConfig.from_pretrained(model_name) - - device_map = { - "": ( - torch.cuda.current_device() - if torch.cuda.is_available() - else None - ) - } - model = AutoGPTQForCausalLM.from_quantized( - model_name, - quantize_config=quantize_config, - torch_dtype=getattr(torch, torch_dtype), - low_cpu_mem_usage=False, - use_marlin=False, - disable_exllama=True, - warmup_triton=False, - use_tritonv2=True, - trainable=True, - device_map=device_map, - ) - - peft_config = LoraConfig( - r=LORA_r, - lora_alpha=LORA_alpha, - lora_dropout=0.0, # anyway we are going to override it - target_modules=target_modules, - ) - # model = get_autogptq_peft_model(model, peft_config) - return model -def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str): +# Model loading function for quantized models +def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str, use_external_lib:bool = False): from fms_acceleration_peft.framework_plugin_autogptq import ( AutoGPTQAccelerationPlugin, ) - plugins = { - GPTQ: AutoGPTQAccelerationPlugin( + + _plugin = AutoGPTQAccelerationPlugin( { "peft": { "quantization": { "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} } } - } - ), - } + }, + use_external_lib = use_external_lib, + ) class TrainArgs: gradient_checkpointing = False @@ -135,89 +51,188 @@ class TrainArgs: target_modules=target_modules, ) - _plugin = plugins[GPTQ] model = _plugin.model_loader( - model_name, torch_dtype=getattr(torch, FLOAT16) + model_name, 
torch_dtype=getattr(torch, torch_dtype) ) - # model, _ = _plugin.augmentation(model, args, (peft_config,)) + model, _ = _plugin.augmentation(model, args, (peft_config,)) + model.eval() return model +# quantization function to manage the loading and quantizing of pretrained model +# using external or local autogptq +def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, use_external_lib=False): + if use_external_lib: + from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig + quantize_kwargs = {"use_triton": True} + else: + from gptqmodel import GPTQModel, QuantizeConfig + quantize_kwargs = {} + + quantize_config = QuantizeConfig( + **quant_config_kwargs + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + model = GPTQModel.from_pretrained( + model_name, + quantize_config = quantize_config, + config = config, + ).to(device) + # quantize model, the examples should be list of dict whose keys can only be "input_ids" + model.quantize(calibration_dataset, **quantize_kwargs) + model.eval() + return model + +def get_wikitext2(tokenizer, num_samples=128, seqlen=128): + import random + import numpy as np + import torch + from datasets import load_dataset + wikidata = load_dataset('wikitext', 'wikitext-2-v1', split='test') + wikilist = [' \n' if s == '' else s for s in wikidata['text'] ] + + text = ''.join(wikilist) + trainenc = tokenizer(text, return_tensors='pt') + + random.seed(0) + np.random.seed(0) + torch.random.manual_seed(0) + + traindataset = [] + + for _ in range(num_samples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + attention_mask = torch.ones_like(inp) + traindataset.append({'input_ids':inp,'attention_mask': attention_mask}) + return traindataset + @pytest.fixture() def input_ids(seed: int = 42, device: torch.device = "cuda"): torch.manual_seed(seed) - yield torch.randint(0, 10000, (BS, SEQLEN)) + yield torch.randint(0, 10000, (BS, SEQLEN), device=device) @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) -def test_already_quantized_outputs_match( +def test_pre_quantized_model_outputs_match( input_ids, seed: int = 42, ): + """ + Test for output equivalence when loading quantized models between + extracted gptq library against original autogptq library + """ torch.manual_seed(seed) - original_model = get_autogptq_lib_quantized_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) - refactored_model = load_autogptq_plugin_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + original_model = load_autogptq_plugin_model( + QUANTIZED_MODEL_NAME, + TARGET_MODULES, FLOAT16, + use_external_lib=True + ) + refactored_model = load_autogptq_plugin_model( + QUANTIZED_MODEL_NAME, + TARGET_MODULES, + FLOAT16 + ) with torch.autocast(device_type='cuda', dtype=torch.float32): - original_model.eval() - original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_model.eval() - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + with torch.no_grad(): + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits assert torch.allclose( original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL - ), "Logits don't match between refactored quantized model and original library" - + ), 
"Pre-quantized model logits don't match between extracted and external autogptq library" @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) -def test_pretrained_to_quantized_outputs_match( +def test_quantizing_pretrained_model_outputs_match( input_ids, seed: int = 42, ): + """ + Test for regression of quantizing pretrained models + with refactored gptq library against original autogptq library + by calculating KL loss on the output logits of both variants + """ torch.manual_seed(seed) - from transformers import AutoTokenizer - from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - from gptqmodel import GPTQModel, QuantizeConfig - device = torch.device("cuda") - + # Initialize common arguments + device = input_ids.device tokenizer = AutoTokenizer.from_pretrained(VANILLA_MODEL_NAME, use_fast=True) - calibration_dataset = [ - tokenizer( - "The world is a wonderful place full of beauty and love." - ) - ] + config = AutoConfig.from_pretrained(VANILLA_MODEL_NAME) + config.num_hidden_layers = 2 + # calibration_dataset = [ + # tokenizer( + # "The world is a wonderful place full of beauty and love." + # ) + # ] + calibration_dataset = get_wikitext2(tokenizer, num_samples=128, seqlen=128) + quant_config_kwargs = { + "bits": 4, + "group_size": -1, + "desc_act": True, + "damp_percent": 0.1, + "static_groups": False, + "sym": True, + "true_sequential": True, + } - original_quantize_config = BaseQuantizeConfig( - bits=4, - group_size=-1, - desc_act=False, + # quantize models for external autogptq lib and extracted gptq lib + original_model = quantize_model( + VANILLA_MODEL_NAME, + config, + calibration_dataset, + quant_config_kwargs, + device, + use_external_lib=True ) - # load un-quantized model, by default, the model will always be loaded into CPU memory - original_model = AutoGPTQForCausalLM.from_pretrained( + refactored_model = quantize_model( VANILLA_MODEL_NAME, - original_quantize_config - ).to(device) - # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" - original_model.quantize(calibration_dataset, use_triton=True) - - refactored_quant_config = QuantizeConfig( - bits=4, - group_size=-1, - desc_act=False, + config, + calibration_dataset, + quant_config_kwargs, + device, + use_external_lib=False ) - # load un-quantized model, by default, the model will always be loaded into CPU memory - refactored_model = GPTQModel.from_pretrained(VANILLA_MODEL_NAME, refactored_quant_config).to(device) - # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask" - refactored_model.quantize(calibration_dataset) - with torch.autocast(device_type='cuda', dtype=torch.float16): - with torch.no_grad(): - original_model.eval() - original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_model.eval() - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits - - assert torch.allclose( - original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL - ), "Logits don't match between refactored quantized model and original library" \ No newline at end of file + # compare generated tokens between + # unquantized, original library and refactored gptqmodel library + unquantized_model = AutoModelForCausalLM.from_pretrained( + VANILLA_MODEL_NAME, + config=config + ).to(device) + gen_config = GenerationConfig.from_pretrained(VANILLA_MODEL_NAME) + gen_config.max_new_tokens = 5 + 
_inputs = torch.tensor([tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda") + output1 = tokenizer.decode( + original_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + output2 = tokenizer.decode( + refactored_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + output3 = tokenizer.decode( + unquantized_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + assert output1==output2==output3, f"generated tokens ({output1}, {output2}, {output3}) \ + don't match between both libraries after quantization" + + # compare prob. distributions between original library and refactored gptqmodel library + with torch.no_grad(): + original_logits = original_model(input_ids).logits + refactored_logits = refactored_model(input_ids).logits + + # Measure the distribution error with KD Loss + loss_fn = torch.nn.KLDivLoss(reduction="mean") + # input should be a distribution in the log space + input = torch.nn.functional.log_softmax(refactored_logits, dim=1) + # target must be prob distribution + target = torch.nn.functional.softmax(original_logits, dim=1) + error = loss_fn(input, target) + assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ + after quantization" From f510537483c2787909cdbb3b27c9d1855900f060 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Wed, 3 Jul 2024 19:06:02 +0000 Subject: [PATCH 06/20] addressed additional PR changes --- plugins/accelerated-peft/pyproject.toml | 4 +- plugins/accelerated-peft/requirements.txt | 5 +-- .../framework_plugin_autogptq.py | 44 ++++++++++--------- .../gptqmodel/__init__.py | 18 ++++++++ .../gptqmodel/models/__init__.py | 25 +++++++++++ .../gptqmodel/models/_const.py | 0 .../gptqmodel/models/auto.py | 0 .../gptqmodel/models/base.py | 0 .../gptqmodel/models/dbrx.py | 0 .../gptqmodel/models/dbrx_converted.py | 0 .../gptqmodel/models/gemma.py | 0 .../gptqmodel/models/gpt_bigcode.py | 0 .../gptqmodel/models/gpt_neox.py | 0 .../gptqmodel/models/llama.py | 0 .../gptqmodel/models/mistral.py | 0 .../gptqmodel/models/mixtral.py | 0 .../gptqmodel/nn_modules/__init__.py | 15 +++++++ .../gptqmodel/nn_modules/qlinear/__init__.py | 0 .../nn_modules/qlinear/qlinear_tritonv2.py | 0 .../nn_modules/triton_utils}/__init__.py | 0 .../triton_utils/custom_autotune.py | 0 .../nn_modules/triton_utils/dequant.py | 0 .../nn_modules/triton_utils/kernels.py | 0 .../nn_modules/triton_utils/mixin.py | 0 .../gptqmodel/quantization/__init__.py | 19 ++++++++ .../gptqmodel/quantization/config.py | 0 .../gptqmodel/quantization/gptq.py | 0 .../gptqmodel/quantization/quantizer.py | 0 .../gptqmodel/utils/__init__.py | 16 +++++++ .../gptqmodel/utils/backend.py | 0 .../gptqmodel/utils/data.py | 0 .../gptqmodel/utils/importer.py | 0 .../gptqmodel/utils/model.py | 0 .../gptqmodel/utils/peft.py | 13 +++--- .../src/gptqmodel/__init__.py | 3 -- .../src/gptqmodel/models/__init__.py | 10 ----- .../nn_modules/triton_utils/__init__.py | 0 .../src/gptqmodel/quantization/__init__.py | 4 -- .../src/gptqmodel/utils/__init__.py | 1 - .../accelerated-peft/tests/test_gptqmodel.py | 28 ++++++++++-- 40 files changed, 151 insertions(+), 54 deletions(-) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py rename plugins/accelerated-peft/src/{ => 
fms_acceleration_peft}/gptqmodel/models/_const.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/auto.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/base.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/dbrx.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/dbrx_converted.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gemma.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gpt_bigcode.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gpt_neox.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/llama.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/mistral.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/mixtral.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/qlinear/__init__.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py (100%) rename plugins/accelerated-peft/src/{gptqmodel/nn_modules => fms_acceleration_peft/gptqmodel/nn_modules/triton_utils}/__init__.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/custom_autotune.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/dequant.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/kernels.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/mixin.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/config.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/gptq.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/quantizer.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/backend.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/data.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/importer.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/model.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/peft.py (93%) delete mode 100644 plugins/accelerated-peft/src/gptqmodel/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/models/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/__init__.py diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index a2b2f12e..d586a0db 100644 --- 
a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -26,13 +26,13 @@ classifiers=[ [project.optional-dependencies] flash-attn = ["flash-attn"] -auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git"] +auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git@ea829c7bbe83561c2b1de26795b6592992373ef7"] # known working commitid [tool.hatch.metadata.hooks.requirements_txt] files = ["requirements.txt"] [tool.hatch.build.targets.wheel] -only-include = ["src/"] +only-include = ["src/fms_acceleration_peft"] [tool.hatch.metadata] allow-direct-references = true diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index 3bcb995f..a00233d3 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -1,9 +1,8 @@ # decide not to have this as an requirement for now # fms_acceleration @ git+https://github.com/foundation-model-stack/fms-acceleration.git#subdirectory=plugins/framework -# put this in here because there is a breaking FSDP api change that -# is fixed after peft > 0.10 -accelerate <= 0.29 +# Needs a lower bound due to`accelerate.load_checkpoint_in_model` function used in gptqmodel +accelerate >= 0.29 # bitsandbytes for the BNB plugin bitsandbytes diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 63254c25..efa82c45 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -47,7 +47,10 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self._check_config_equal( key="peft.quantization.auto_gptq.from_quantized", value=True ) - self.use_external_lib = use_external_lib and importlib.util.find_spec("autogptq") is not None + self.use_external_lib = use_external_lib + + if self.use_external_lib: + assert importlib.util.find_spec("auto_gptq") is not None, "Unable to use external library, autogptq module not found." 
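# A brief usage sketch of the two loading paths this flag selects (the
# configuration dict mirrors the one exercised in tests/test_gptqmodel.py and
# is illustrative rather than canonical):
#
#     from fms_acceleration_peft.framework_plugin_autogptq import (
#         AutoGPTQAccelerationPlugin,
#     )
#
#     config = {
#         "peft": {
#             "quantization": {
#                 "auto_gptq": {"kernel": "triton_v2", "from_quantized": True}
#             }
#         }
#     }
#     plugin = AutoGPTQAccelerationPlugin(config)  # extracted gptqmodel path, no extra dependency
#     plugin_ext = AutoGPTQAccelerationPlugin(config, use_external_lib=True)  # requires auto_gptq installed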
def model_loader(self, model_name: str, **kwargs): # guarded imports @@ -61,9 +64,9 @@ def model_loader(self, model_name: str, **kwargs): QuantLinear, ) else: - from gptqmodel import GPTQModel, QuantizeConfig - from gptqmodel.utils import Backend - from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + from .gptqmodel import GPTQModel, QuantizeConfig + from .gptqmodel.utils import Backend + from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( QuantLinear, ) @@ -126,22 +129,21 @@ def model_loader(self, model_name: str, **kwargs): # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate - if is_fsdp_enabled(): - if self.use_external_lib: - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) - kwargs["low_cpu_mem_usage"] = True + if is_fsdp_enabled() and self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + kwargs["low_cpu_mem_usage"] = True # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. # device_map is for inference only @@ -253,7 +255,7 @@ def augmentation( replace_module_peft, ) else: - from gptqmodel.utils.peft import get_gptq_peft_model + from .gptqmodel.utils.peft import get_gptq_peft_model (peft_config,) = modifiable_args # unpack modifiable args diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py new file mode 100644 index 00000000..dd205fa8 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py @@ -0,0 +1,18 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .models import GPTQModel +from .quantization import BaseQuantizeConfig, QuantizeConfig +from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py new file mode 100644 index 00000000..ac31ca08 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -0,0 +1,25 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .auto import MODEL_MAP, GPTQModel +from .base import BaseGPTQModel +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/_const.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/auto.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/base.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/dbrx.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py similarity index 100% rename from 
plugins/accelerated-peft/src/gptqmodel/models/gemma.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/llama.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/mistral.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/mixtral.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py new file mode 100644 index 00000000..feb7a9e6 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py @@ -0,0 +1,15 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/__init__.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/__init__.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/mixin.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/mixin.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py new file mode 100644 index 00000000..377ffb63 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py @@ -0,0 +1,19 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .gptq import GPTQ +from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/config.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py new file mode 100644 index 00000000..668d0859 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py @@ -0,0 +1,16 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/backend.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/data.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/importer.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/model.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py similarity index 93% rename from plugins/accelerated-peft/src/gptqmodel/utils/peft.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py index db27fa09..e3257d80 100644 --- a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py @@ -21,7 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### -import warnings from contextlib import contextmanager from typing import List, Optional, Tuple, Union @@ -29,7 +28,7 @@ from peft import PeftConfig, PeftModel, PeftType, get_peft_model from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING -from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel +from peft.tuners.lora import LoraConfig, LoraModel from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ from ..models.base import BaseGPTQModel @@ -100,7 +99,6 @@ def find_all_linear_names( results.add(res) return list(results) - @contextmanager def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig @@ -139,16 +137,19 @@ def get_gptq_peft_model( if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): peft_config = GPTQLoraConfig(**peft_config.to_dict()) + # this hijack is needed as `get_peft_model` uses PEFTModelForCausalLM which inherits from + # PEFTModel and it in turn relies on PEFT_TYPE_TO_MODEL_MAPPING to initialize its base LoraModel with hijack_peft_mappings(): try: if train_mode: peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) else: peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) - except: + except Exception as exc: raise NotImplementedError( - f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet." 
- ) + f"{model.__class__.__name__} not support \ + {peft_config.peft_type.value} peft type yet." + ) from exc return peft_model diff --git a/plugins/accelerated-peft/src/gptqmodel/__init__.py b/plugins/accelerated-peft/src/gptqmodel/__init__.py deleted file mode 100644 index 2808d505..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .models import GPTQModel -from .quantization import BaseQuantizeConfig, QuantizeConfig -from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py deleted file mode 100644 index 5496b45e..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .auto import MODEL_MAP, GPTQModel -from .base import BaseGPTQModel -from .dbrx import DbrxGPTQ -from .dbrx_converted import DbrxConvertedGPTQ -from .gemma import GemmaGPTQ -from .gpt_bigcode import GPTBigCodeGPTQ -from .gpt_neox import GPTNeoXGPTQ -from .llama import LlamaGPTQ -from .mistral import MistralGPTQ -from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py deleted file mode 100644 index ca86e26f..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) -from .gptq import GPTQ -from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py deleted file mode 100644 index 0e79e1d3..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 84581da1..9157ccf7 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -1,3 +1,20 @@ +# Copyright The IBM Tuning Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# SPDX-License-Identifier: Apache-2.0 +# https://spdx.dev/learn/handling-license-info/ + import pytest # pylint: disable=import-error import torch from typing import List @@ -14,7 +31,7 @@ BS = 1 SEQLEN = 128 -LOSS_TOLERANCE = 1e-3 +LOSS_TOLERANCE = 0.1 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 @@ -60,12 +77,12 @@ class TrainArgs: # quantization function to manage the loading and quantizing of pretrained model # using external or local autogptq -def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, use_external_lib=False): +def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, torch_dtype, use_external_lib=False): if use_external_lib: from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig quantize_kwargs = {"use_triton": True} else: - from gptqmodel import GPTQModel, QuantizeConfig + from fms_acceleration_peft.gptqmodel import GPTQModel, QuantizeConfig quantize_kwargs = {} quantize_config = QuantizeConfig( @@ -76,6 +93,7 @@ def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, model_name, quantize_config = quantize_config, config = config, + torch_dtype = getattr(torch, torch_dtype), ).to(device) # quantize model, the examples should be list of dict whose keys can only be "input_ids" model.quantize(calibration_dataset, **quantize_kwargs) @@ -184,6 +202,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset, quant_config_kwargs, device, + FLOAT16, use_external_lib=True ) refactored_model = quantize_model( @@ -192,6 +211,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset, quant_config_kwargs, device, + FLOAT16, use_external_lib=False ) @@ -228,7 +248,7 @@ def test_quantizing_pretrained_model_outputs_match( refactored_logits = refactored_model(input_ids).logits # Measure the distribution error with KD Loss - loss_fn = torch.nn.KLDivLoss(reduction="mean") + loss_fn = torch.nn.KLDivLoss(reduction="batchmean") # input should be a distribution in the log space input = torch.nn.functional.log_softmax(refactored_logits, dim=1) # target must be prob distribution From 8e18154a404ad3c71bc9b8f9739809fad62ccd7c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 08:11:37 +0000 Subject: [PATCH 07/20] reintroduce support for low_cpu_mem_usage in extracted lib --- .../framework_plugin_autogptq.py | 65 +++++++++---------- .../gptqmodel/models/base.py | 10 ++- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index efa82c45..2a880ee5 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -100,7 +100,7 @@ def model_loader(self, model_name: str, **kwargs): # get additional parameters torch_dtype = kwargs.get("torch_dtype", torch.float32) - low_cpu_mem_usage = kwargs.get("low_cpu_mem_usage") + low_cpu_mem_usage = kwargs.get("low_cpu_mem_usage", False) attn_implementation = kwargs.get("attn_implementation") # there are some kwargs that we wont be passed to AutoModel, so we need @@ -129,38 +129,38 @@ def model_loader(self, model_name: str, **kwargs): # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate - if is_fsdp_enabled() and self.use_external_lib: - # Local - from 
.autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) + if is_fsdp_enabled(): kwargs["low_cpu_mem_usage"] = True - - # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. - # device_map is for inference only - # https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference - # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" - # to avoid gpu consumption before train - # This approach will divert consumption to cpu memory, - # a better approach would be to load the checkpoints to meta device - # QLoRA is currently implemented by the former approach and will encounter the same issue. - # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 - device_map = { - "": ( - (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") - if torch.cuda.is_available() - else None - ) - } + if self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + + # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. + # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" + # to avoid gpu consumption before train + # This approach will divert consumption to cpu memory, + # a better approach would be to load the checkpoints to meta device + # QLoRA is currently implemented by the former approach and will encounter the same issue. 
+ # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 + + kwargs["device_map"] = { + "": ( + (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") + if torch.cuda.is_available() + else None + ) + } # currently only enable triton_v2, because the triton kernels are the only ones # that have backwards @@ -168,7 +168,6 @@ def model_loader(self, model_name: str, **kwargs): model_name, quantize_config=quantize_config, torch_dtype=torch_dtype, - device_map=device_map, warmup_triton=False, # disable for now as it will try to run the warmup while on CPU **kwargs, ) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index d1010a39..e3ca3938 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -774,7 +774,7 @@ def from_quantized( device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, max_memory: Optional[dict] = None, device: Optional[Union[str, int]] = None, - + low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, torch_dtype: [str | torch.dtype] = "auto", @@ -909,6 +909,8 @@ def skip(*args, **kwargs): transformers.modeling_utils._init_weights = False init_contexts = [no_init_weights()] + if low_cpu_mem_usage: + init_contexts.append(accelerate.init_empty_weights(include_buffers=False)) with ContextManagers(init_contexts): model = AutoModelForCausalLM.from_config( @@ -982,6 +984,11 @@ def skip(*args, **kwargs): no_split_module_classes=[cls.layer_type], ) + if low_cpu_mem_usage: + # set device_map on so `dispatch_model` initializes weights on cpu until accelerator + # prepares the model on gpu in `trainer.train` to avoid unnecessary gpu usage + device_map = {"": "cpu"} + load_checkpoint_in_model = False # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format if quantize_config.format == FORMAT.GPTQ: @@ -1019,7 +1026,6 @@ def skip(*args, **kwargs): offload_state_dict=True, offload_buffers=True, ) - # TODO: Why are we using this custom function and not dispatch_model? 
model = simple_dispatch_model(model, device_map) From b41d141eeee923256a0d6a216663792752a622c6 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 08:59:55 +0000 Subject: [PATCH 08/20] Use transformers package checking instead of importlib --- .../fms_acceleration_peft/framework_plugin_autogptq.py | 4 ++-- plugins/accelerated-peft/tests/test_gptqmodel.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 2a880ee5..914e7d39 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -20,7 +20,6 @@ from functools import partial from types import MethodType from typing import Dict, Tuple -import importlib import os # Third Party @@ -28,6 +27,7 @@ from peft import LoraConfig, prepare_model_for_kbit_training from peft.tuners.lora.model import LoraModel from transformers import AutoModelForCausalLM, TrainingArguments +from transformers.utils.import_utils import _is_package_available from transformers.modeling_utils import is_fsdp_enabled import torch import torch.distributed @@ -50,7 +50,7 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self.use_external_lib = use_external_lib if self.use_external_lib: - assert importlib.util.find_spec("auto_gptq") is not None, "Unable to use external library, autogptq module not found." + assert _is_package_available("auto_gptq") is True, "Unable to use external library, autogptq module not found." def model_loader(self, model_name: str, **kwargs): # guarded imports diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 9157ccf7..dbab2821 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -31,7 +31,7 @@ BS = 1 SEQLEN = 128 -LOSS_TOLERANCE = 0.1 +LOSS_TOLERANCE = 1e-3 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 @@ -248,11 +248,15 @@ def test_quantizing_pretrained_model_outputs_match( refactored_logits = refactored_model(input_ids).logits # Measure the distribution error with KD Loss + # flatten as a single batch bs*seqlen + # since batchmean sums the loss and averages on dim=0 loss_fn = torch.nn.KLDivLoss(reduction="batchmean") # input should be a distribution in the log space - input = torch.nn.functional.log_softmax(refactored_logits, dim=1) + input = torch.nn.functional.log_softmax(refactored_logits, dim=-1) + input = torch.flatten(input, start_dim=0, end_dim=1) # target must be prob distribution - target = torch.nn.functional.softmax(original_logits, dim=1) + target = torch.nn.functional.softmax(original_logits, dim=-1) + target = torch.flatten(target, start_dim=0, end_dim=1) error = loss_fn(input, target) assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ after quantization" From b6282e1132880313951ffcbb183b9e716b5725cb Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 18:44:20 +0000 Subject: [PATCH 09/20] formatting --- .../framework_plugin_autogptq.py | 37 ++- .../gptqmodel/__init__.py | 3 +- .../gptqmodel/models/__init__.py | 1 + .../gptqmodel/models/_const.py | 3 +- .../gptqmodel/models/auto.py | 17 +- .../gptqmodel/models/base.py | 294 +++++++++++++----- .../gptqmodel/models/dbrx.py | 5 +- 
.../gptqmodel/models/dbrx_converted.py | 57 ++-- .../gptqmodel/models/gemma.py | 1 + .../gptqmodel/models/gpt_bigcode.py | 1 + .../gptqmodel/models/gpt_neox.py | 2 +- .../gptqmodel/models/llama.py | 1 + .../gptqmodel/models/mistral.py | 1 + .../gptqmodel/models/mixtral.py | 1 + .../gptqmodel/nn_modules/__init__.py | 2 +- .../gptqmodel/nn_modules/qlinear/__init__.py | 10 +- .../nn_modules/qlinear/qlinear_tritonv2.py | 43 ++- .../triton_utils/custom_autotune.py | 21 +- .../nn_modules/triton_utils/dequant.py | 16 +- .../nn_modules/triton_utils/kernels.py | 58 +++- .../gptqmodel/quantization/__init__.py | 13 +- .../gptqmodel/quantization/config.py | 50 ++- .../gptqmodel/quantization/gptq.py | 18 +- .../gptqmodel/quantization/quantizer.py | 2 + .../gptqmodel/utils/__init__.py | 1 + .../gptqmodel/utils/backend.py | 2 + .../gptqmodel/utils/data.py | 52 +++- .../gptqmodel/utils/importer.py | 30 +- .../gptqmodel/utils/model.py | 144 ++++++--- .../gptqmodel/utils/peft.py | 32 +- .../accelerated-peft/tests/test_gptqmodel.py | 182 ++++++----- .../accelerated-peft/tests/test_q4_triton.py | 31 +- plugins/accelerated-peft/tests/test_triton.py | 13 +- 33 files changed, 803 insertions(+), 341 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 914e7d39..8bcd0bb9 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -27,11 +27,12 @@ from peft import LoraConfig, prepare_model_for_kbit_training from peft.tuners.lora.model import LoraModel from transformers import AutoModelForCausalLM, TrainingArguments -from transformers.utils.import_utils import _is_package_available from transformers.modeling_utils import is_fsdp_enabled +from transformers.utils.import_utils import _is_package_available import torch import torch.distributed + class AutoGPTQAccelerationPlugin(AccelerationPlugin): require_packages = [] @@ -50,16 +51,19 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self.use_external_lib = use_external_lib if self.use_external_lib: - assert _is_package_available("auto_gptq") is True, "Unable to use external library, autogptq module not found." + assert ( + _is_package_available("auto_gptq") is True + ), "Unable to use external library, autogptq module not found." 
def model_loader(self, model_name: str, **kwargs): # guarded imports # Third Party if self.use_external_lib: - from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error - AutoGPTQForCausalLM as GPTQModel, - BaseQuantizeConfig as QuantizeConfig, + # Third Party + from auto_gptq import ( + AutoGPTQForCausalLM as GPTQModel, # pylint: disable=import-outside-toplevel,import-error ) + from auto_gptq import BaseQuantizeConfig as QuantizeConfig from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) @@ -69,7 +73,6 @@ def model_loader(self, model_name: str, **kwargs): from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( QuantLinear, ) - # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel PATCH_FOR_FSDP_TRITON_V2, @@ -122,11 +125,10 @@ def model_loader(self, model_name: str, **kwargs): } else: kwargs = { - "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map + "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map "backend": Backend.TRITON, } - # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate if is_fsdp_enabled(): @@ -156,7 +158,11 @@ def model_loader(self, model_name: str, **kwargs): kwargs["device_map"] = { "": ( - (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") + ( + torch.cuda.current_device() + if not kwargs["low_cpu_mem_usage"] + else "cpu" + ) if torch.cuda.is_available() else None ) @@ -241,6 +247,7 @@ def augmentation( # guarded imports # Third Party if self.use_external_lib: + # Third Party from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) @@ -248,15 +255,16 @@ def augmentation( GPTQLoraModel, get_gptq_peft_model, ) + # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel create_new_module_peft, replace_module_peft, ) else: + # Local from .gptqmodel.utils.peft import get_gptq_peft_model - (peft_config,) = modifiable_args # unpack modifiable args # some assertions @@ -298,8 +306,9 @@ def augmentation( _old_replace_module = GPTQLoraModel._replace_module _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) - + GPTQLoraModel._replace_module = MethodType( + replace_module_peft, GPTQLoraModel + ) # Install GPTQ adapters using the AutoGPTQ package (with the above patches) model = get_gptq_peft_model( @@ -313,7 +322,9 @@ def augmentation( if self.use_external_lib: # undo the patching for hygine LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + GPTQLoraModel._replace_module = MethodType( + _old_replace_module, GPTQLoraModel + ) return model, modifiable_args diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py index dd205fa8..d9eb61ed 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .models import GPTQModel from .quantization import BaseQuantizeConfig, QuantizeConfig -from .utils import Backend, get_backend \ No newline at end of file +from .utils import Backend, get_backend diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py index ac31ca08..967dd1a6 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .auto import MODEL_MAP, GPTQModel from .base import BaseGPTQModel from .dbrx import DbrxGPTQ diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 49484bee..ab8bf4dd 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Third Party from torch import device CPU = device("cpu") @@ -31,5 +32,3 @@ EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 EXPERT_INDEX_PLACEHOLDER = "{expert_index}" - - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index d31c12e9..178d9c89 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -13,10 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from typing import Dict, List, Optional, Union +# Third Party import torch +# Local from ..utils import Backend from ..utils.model import check_and_get_model_type from .base import BaseGPTQModel, QuantizeConfig @@ -40,10 +43,15 @@ "dbrx_converted": DbrxConvertedGPTQ, } -at_least_one_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count())) +at_least_one_cuda_v6 = any( + torch.cuda.get_device_capability(i)[0] >= 6 + for i in range(torch.cuda.device_count()) +) if not at_least_one_cuda_v6: - raise EnvironmentError("GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`.") + raise EnvironmentError( + "GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`." 
+ ) class GPTQModel: @@ -63,7 +71,9 @@ def from_pretrained( trust_remote_code: bool = False, **model_init_kwargs, ) -> BaseGPTQModel: - model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code) + model_type = check_and_get_model_type( + pretrained_model_name_or_path, trust_remote_code + ) return MODEL_MAP[model_type].from_pretrained( pretrained_model_name_or_path=pretrained_model_name_or_path, quantize_config=quantize_config, @@ -110,4 +120,3 @@ def from_quantized( verify_hash=verify_hash, **kwargs, ) - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index e3ca3938..8c319c45 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -13,36 +13,64 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard +from os.path import isfile, join +from typing import Dict, List, Optional, Union import copy import json import logging import os import re -from os.path import isfile, join -from typing import Dict, List, Optional, Union -import accelerate -import torch -import torch.nn as nn -import transformers +# Third Party from accelerate.hooks import remove_hook_from_module from safetensors.torch import save_file as safe_save from tqdm import tqdm -from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + PretrainedConfig, + PreTrainedModel, +) from transformers.modeling_utils import no_init_weights, shard_checkpoint from transformers.utils.generic import ContextManagers +import accelerate +import torch +import torch.nn as nn +import transformers +# Local from ..quantization import GPTQ, QuantizeConfig -from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, - META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST) +from ..quantization.config import ( + FORMAT, + FORMAT_FIELD_JSON, + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + MIN_VERSION_WITH_V2, + QUANTIZE_BLACK_LIST, +) from ..utils.backend import Backend from ..utils.data import collate_data from ..utils.importer import select_quant_linear -from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, - find_layers, get_checkpoints, get_device, get_module_by_name_prefix, - get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model, verify_model_hash, - verify_sharded_model_hashes) +from ..utils.model import ( + auto_dtype_from_config, + convert_gptq_v1_to_v2_format, + convert_gptq_v2_to_v1_format, + find_layers, + get_checkpoints, + get_device, + get_module_by_name_prefix, + get_module_by_name_suffix, + get_moe_layer_modules, + gptqmodel_post_init, + make_quant, + move_to, + nested_move_to, + pack_model, + simple_dispatch_model, + verify_model_hash, + verify_sharded_model_hashes, +) from ._const import CPU, CUDA_0, SUPPORTED_MODELS logger = logging.getLogger(__name__) @@ -110,9 +138,9 @@ def hf_device_map(self): return getattr(self.model, "hf_device_map", None) def _prepare_dataset_for_quantization( - self, - calibration_dataset: List[Dict[str, Union[List[int], 
torch.LongTensor]]], - batch_size: int = 1, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, ): def _convert_tensor_to_list(tensor): if isinstance(tensor, torch.Tensor): @@ -146,10 +174,14 @@ def _convert_tensor_to_list(tensor): pad_token_id = self.config.eos_token_id if pad_token_id is None: - raise ValueError("Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`.") + raise ValueError( + "Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`." + ) new_calibration_dataset_batched = [ - collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) + collate_data( + new_calibration_dataset[start : start + batch_size], pad_token_id + ) for start in range(0, len(new_calibration_dataset), batch_size) ] @@ -163,25 +195,26 @@ def quantize( self, calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], batch_size: int = 1, - # TODO: remove use_cuda_fp16 arg..why? doesn't pass smell test @ZX-ModelCloud use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, calibration_enable_gpu_cache: bool = True, ): if self.quantized: - raise EnvironmentError("quantize() is called a model that is already quantized") + raise EnvironmentError( + "quantize() is called a model that is already quantized" + ) if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST: raise ValueError( f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}" ) - # TODO: lm_head quantization is yet ready but pending if self.quantize_config.lm_head: - raise ValueError("lm_head quantization is currently inference only and not applicable for quantization. Please set `lm_head=False`.") + raise ValueError( + "lm_head quantization is currently inference only and not applicable for quantization. Please set `lm_head=False`." + ) if len(calibration_dataset) == 0: raise ValueError("Calibration dataset must not be empty.") @@ -190,8 +223,10 @@ def quantize( min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " - f"Current size: {len(calibration_dataset)}.") + logger.warning( + f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " + f"Current size: {len(calibration_dataset)}." + ) # Calculate the average length of the average input_ids total_input_ids_length = 0 @@ -201,9 +236,10 @@ def quantize( avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " - f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}.") - + logger.warning( + f"The average length of input_ids of calibration_dataset should be greater than " + f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}." 
+ ) device_map = self.hf_device_map if device_map: @@ -220,7 +256,9 @@ def quantize( layer_input_kwargs = [] layer_outputs = [] - calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size) + calibration_dataset = self._prepare_dataset_for_quantization( + calibration_dataset, batch_size + ) forward_pass_use_cache = self.model.config.use_cache self.model.config.use_cache = False @@ -248,7 +286,7 @@ def store_input_hook(_, args, kwargs): if pos_ids is not None: position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} - for (k, v) in kwargs.items(): # make sure other arguments also be captured + for k, v in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) layer_input_kwargs.append(one_kwargs) @@ -299,8 +337,9 @@ def store_input_hook(_, args, kwargs): # dynamic expert layer index for model defs if self.dynamic_expert_index is not None: num_experts = getattr(self.model.config, self.dynamic_expert_index) - layer_modules = get_moe_layer_modules(layer_modules=self.layer_modules, - num_experts=num_experts) + layer_modules = get_moe_layer_modules( + layer_modules=self.layer_modules, num_experts=num_experts + ) quantizers = {} @@ -347,11 +386,15 @@ def tmp(_, inp, out): layer_input.append(move_to(layer_inp, cur_layer_device)) mask = attention_masks[j] - layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + layer_attention_mask = ( + mask if mask is None else move_to(mask, cur_layer_device) + ) additional_layer_inputs = {"attention_mask": layer_attention_mask} layer_position_ids = ( - None if not position_ids else move_to(position_ids[j], cur_layer_device) + None + if not position_ids + else move_to(position_ids[j], cur_layer_device) ) if layer_position_ids is not None: additional_layer_inputs["position_ids"] = layer_position_ids @@ -362,7 +405,9 @@ def tmp(_, inp, out): h.remove() for name in subset: - layer_pb.set_description(f"Quantizing {name} in layer {i + 1} of {layer_count}") + layer_pb.set_description( + f"Quantizing {name} in layer {i + 1} of {layer_count}" + ) try: scale, zero, g_idx, duration, avg_loss = gptq[name].fasterquant( @@ -372,8 +417,12 @@ def tmp(_, inp, out): static_groups=self.quantize_config.static_groups, ) - stat = {"layer": i + 1, "module": name, "avg_loss": f"{avg_loss:.4f}", - "time": f"{duration:.4f}"} + stat = { + "layer": i + 1, + "module": name, + "avg_loss": f"{avg_loss:.4f}", + "time": f"{duration:.4f}", + } quant_log.append(stat) logger.info(stat) @@ -386,10 +435,18 @@ def tmp(_, inp, out): raise e quantizers[f"{self.layers_node}.{i}.{name}"] = ( - gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(scale, CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(zero, CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device), + gptq[name].quantizer.to( + CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + scale, CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + zero, CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + g_idx, CPU if force_layer_back_to_cpu else cur_layer_device + ), ) gptq[name].free() @@ -399,10 +456,16 @@ def tmp(_, inp, out): layer_input.append(move_to(layer_inp, cur_layer_device)) mask = attention_masks[j] - layer_attention_mask = mask if mask is None else move_to(mask, 
cur_layer_device) + layer_attention_mask = ( + mask if mask is None else move_to(mask, cur_layer_device) + ) additional_layer_inputs = {"attention_mask": layer_attention_mask} - layer_position_ids = None if not position_ids else move_to(position_ids[j], cur_layer_device) + layer_position_ids = ( + None + if not position_ids + else move_to(position_ids[j], cur_layer_device) + ) if layer_position_ids is not None: additional_layer_inputs["position_ids"] = layer_position_ids for k, v in layer_input_kwargs[j].items(): @@ -413,7 +476,9 @@ def tmp(_, inp, out): ) layer_outputs.append([layer_output]) - layers[i] = move_to(layer, CPU if force_layer_back_to_cpu else cur_layer_device) + layers[i] = move_to( + layer, CPU if force_layer_back_to_cpu else cur_layer_device + ) del layer del gptq del layer_inputs @@ -480,7 +545,7 @@ def save_quantized( safetensors_metadata: Optional[Dict[str, str]] = None, use_safetensors: bool = True, max_shard_size: Optional[str] = None, - model_base_name: Optional[str] = None + model_base_name: Optional[str] = None, ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -498,12 +563,14 @@ def save_quantized( model = self.model if not self.quantized: - raise ValueError("Save aborted as model is not quantized. Please call `quantize()` first.") + raise ValueError( + "Save aborted as model is not quantized. Please call `quantize()` first." + ) if model_base_name is None: model_base_name = ( - self.quantize_config.model_file_base_name or - f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + self.quantize_config.model_file_base_name + or f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" ) if quantize_config.format == FORMAT.GPTQ_V2: @@ -520,7 +587,11 @@ def save_quantized( # no need to set it back, no calculation below if quantize_config.bits != 4: cuda_name_modules = {} - from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + # Third Party + from gptqmodel.nn_modules.qlinear.qlinear_cuda import ( + BaseCudaQuantLinear, + ) + for name, module in model.named_modules(): if isinstance(module, BaseCudaQuantLinear): cuda_name_modules[name] = module.gptqmodel_cuda @@ -528,14 +599,19 @@ def save_quantized( model = copy.deepcopy(self.model) for name, modules in model.named_modules(): - if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + if ( + isinstance(module, BaseCudaQuantLinear) + and name in cuda_name_modules + ): module.gptqmodel_cuda = cuda_name_modules[name] del cuda_name_modules else: model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( - model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel + model, + quantize_config=quantize_config, + qlinear_kernel=self.qlinear_kernel, ) model.to(CPU) @@ -557,7 +633,9 @@ def save_quantized( model_save_name = model_base_name + ".bin" if not self.qlinear_kernel.SUPPORTED_SHARDS and max_shard_size is not None: - logger.warning("Sharding is not supported for this quant. Disabling sharding.") + logger.warning( + "Sharding is not supported for this quant. Disabling sharding." 
+ ) max_shard_size = None if max_shard_size is None: @@ -567,7 +645,9 @@ def save_quantized( elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + logger.debug( + f"Received safetensors_metadata: {safetensors_metadata}" + ) new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -594,28 +674,35 @@ def save_quantized( # Format is required to enable Accelerate to load the metadata # otherwise it raises an OSError safetensors_metadata["format"] = "pt" - safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata) + safe_save( + state_dict, join(save_dir, model_save_name), safetensors_metadata + ) else: logger.warning( - "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.") + "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible." + ) torch.save(model.state_dict(), join(save_dir, model_save_name)) else: # Shard checkpoint - shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) + shards, index = shard_checkpoint( + state_dict, max_shard_size=max_shard_size, weights_name=model_save_name + ) # Clean the folder from a previous save for filename in os.listdir(save_dir): full_filename = join(save_dir, filename) # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 - filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "") + filename_no_suffix = filename.replace(".bin", "").replace( + ".safetensors", "" + ) reg = re.compile(r"(.*?)-\d{5}-of-\d{5}") if ( - filename.startswith(model_base_name) - and isfile(full_filename) - and filename not in shards.keys() - and reg.fullmatch(filename_no_suffix) is not None + filename.startswith(model_base_name) + and isfile(full_filename) + and filename not in shards.keys() + and reg.fullmatch(filename_no_suffix) is not None ): os.remove(full_filename) @@ -627,7 +714,9 @@ def save_quantized( elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + logger.debug( + f"Received safetensors_metadata: {safetensors_metadata}" + ) new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -638,15 +727,18 @@ def save_quantized( new_value = str(value) except Exception as e: raise TypeError( - f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" + ) if new_key in new_safetensors_metadata: logger.warning( - f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." 
+ ) new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: logger.debug( - f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}") + f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" + ) # Format is required to enable Accelerate to load the metadata # otherwise it raises an OSError @@ -675,7 +767,9 @@ def save_pretrained( save_dir: str, **kwargs, ): - logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + logger.warning( + "You are using save_pretrained, which will re-direct to save_quantized." + ) self.save_quantized(save_dir=save_dir, **kwargs) @classmethod @@ -691,7 +785,9 @@ def from_pretrained( """load un-quantized pretrained model to cpu""" if not torch.cuda.is_available(): - raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.") + raise EnvironmentError( + "Load pretrained model to do quantization requires CUDA available." + ) if cls.require_trust_remote_code and not trust_remote_code: raise ValueError( @@ -712,12 +808,16 @@ def skip(*args, **kwargs): model_init_kwargs["trust_remote_code"] = trust_remote_code - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, **model_init_kwargs + ) if torch_dtype == "auto": torch_dtype = auto_dtype_from_config(config) elif not isinstance(torch_dtype, torch.dtype): - raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + raise ValueError( + f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance." + ) # enforce some values despite user specified model_init_kwargs["torch_dtype"] = torch_dtype @@ -751,7 +851,9 @@ def skip(*args, **kwargs): torch.cuda.empty_cache() - model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, **model_init_kwargs + ) model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] @@ -761,7 +863,9 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + logger.warning( + "can't get model's sequence length from model config, will set to 4096." + ) model.seqlen = 4096 model.eval() @@ -776,7 +880,6 @@ def from_quantized( device: Optional[Union[str, int]] = None, low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, - torch_dtype: [str | torch.dtype] = "auto", use_cuda_fp16: bool = True, quantize_config: Optional[QuantizeConfig] = None, @@ -829,7 +932,9 @@ def from_quantized( if torch_dtype == "auto": torch_dtype = auto_dtype_from_config(config, quant_inference=True) elif not isinstance(torch_dtype, torch.dtype): - raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + raise ValueError( + f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance." 
+ ) if config.model_type not in SUPPORTED_MODELS: raise TypeError(f"{config.model_type} isn't supported yet.") @@ -840,8 +945,9 @@ def from_quantized( ) else: if not isinstance(quantize_config, QuantizeConfig): - quantize_config = QuantizeConfig.from_quant_config(quantize_config, format) - + quantize_config = QuantizeConfig.from_quant_config( + quantize_config, format + ) if model_basename is None: if quantize_config.model_file_base_name: @@ -894,12 +1000,15 @@ def from_quantized( if not verfieid: raise ValueError(f"Hash verification failed for {model_save_name}") logger.info(f"Hash verification succeeded for {model_save_name}") + # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # def skip(*args, **kwargs): pass if torch_dtype != torch.float16: - logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.") + logger.warning( + "Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16." + ) use_cuda_fp16 = False torch.nn.init.kaiming_uniform_ = skip @@ -919,8 +1028,9 @@ def skip(*args, **kwargs): if cls.dynamic_expert_index is not None: num_experts = getattr(config, cls.dynamic_expert_index) - cls.layer_modules = get_moe_layer_modules(layer_modules=cls.layer_modules, - num_experts=num_experts) + cls.layer_modules = get_moe_layer_modules( + layer_modules=cls.layer_modules, num_experts=num_experts + ) layers = find_layers(model) ignore_layers = [cls.lm_head] + cls.base_modules @@ -930,8 +1040,12 @@ def skip(*args, **kwargs): if quantize_config.lm_head and name == cls.lm_head: continue - if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all( - not name.endswith(ignore_layer) for sublist in cls.layer_modules for ignore_layer in sublist + if any( + name.startswith(ignore_layer) for ignore_layer in ignore_layers + ) or all( + not name.endswith(ignore_layer) + for sublist in cls.layer_modules + for ignore_layer in sublist ): # log non-lm-head quantizerd layers only if name is not cls.lm_head: @@ -969,7 +1083,9 @@ def skip(*args, **kwargs): if device is not None: device = torch.device(device) if not max_memory and not device_map: - device_map = {"": device.index if device.type == "cuda" else device.type} + device_map = { + "": device.index if device.type == "cuda" else device.type + } if not isinstance(device_map, dict) and device_map != "sequential": max_memory = accelerate.utils.get_balanced_memory( model=model, @@ -1002,13 +1118,17 @@ def skip(*args, **kwargs): offload_buffers=True, ) # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase - if not quantize_config.sym and not quantize_config.is_quantized_or_packed_by_v2(): + if ( + not quantize_config.sym + and not quantize_config.is_quantized_or_packed_by_v2() + ): raise ValueError( f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" ) logger.info( - f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`.") + f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`." + ) model = convert_gptq_v1_to_v2_format( model, quantize_config=quantize_config, @@ -1047,7 +1167,9 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + logger.warning( + "can't get model's sequence length from model config, will set to 4096." 
+ ) model.seqlen = 4096 # Any post-initialization that require device information, for example buffers initialization on device. @@ -1057,6 +1179,7 @@ def skip(*args, **kwargs): # == step6: (optional) warmup triton == # if backend != Backend.TRITON and warmup_triton: + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear QuantLinear.warmup(model, seqlen=model.seqlen) @@ -1072,6 +1195,7 @@ def warmup_triton(self, enabled: bool = True): if not enabled: return + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear QuantLinear.warmup(self.model, seqlen=self.model.seqlen) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py index fb758032..dc899be2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py @@ -13,10 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel # placer=holder only as dbrx original models are not supported # supported dbrx_converted models can be found on https://hf.co/ModelCloud class DbrxGPTQ(BaseGPTQModel): - info = {"notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`"} + info = { + "notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`" + } diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py index 35a33170..9878d082 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .base import BaseGPTQModel @@ -25,25 +26,45 @@ class DbrxConvertedGPTQ(BaseGPTQModel): layers_node = "transformer.blocks" layer_type = "DbrxBlock" layer_modules = [ - ["norm_attn_norm.attn.q_proj", "norm_attn_norm.attn.k_proj", "norm_attn_norm.attn.v_proj"], + [ + "norm_attn_norm.attn.q_proj", + "norm_attn_norm.attn.k_proj", + "norm_attn_norm.attn.v_proj", + ], ["norm_attn_norm.attn.out_proj"], [ - "ffn.experts.mlp.0.w1", "ffn.experts.mlp.0.v1", - "ffn.experts.mlp.1.w1", "ffn.experts.mlp.1.v1", - "ffn.experts.mlp.2.w1", "ffn.experts.mlp.2.v1", - "ffn.experts.mlp.3.w1", "ffn.experts.mlp.3.v1", - "ffn.experts.mlp.4.w1", "ffn.experts.mlp.4.v1", - "ffn.experts.mlp.5.w1", "ffn.experts.mlp.5.v1", - "ffn.experts.mlp.6.w1", "ffn.experts.mlp.6.v1", - "ffn.experts.mlp.7.w1", "ffn.experts.mlp.7.v1", - "ffn.experts.mlp.8.w1", "ffn.experts.mlp.8.v1", - "ffn.experts.mlp.9.w1", "ffn.experts.mlp.9.v1", - "ffn.experts.mlp.10.w1", "ffn.experts.mlp.10.v1", - "ffn.experts.mlp.11.w1", "ffn.experts.mlp.11.v1", - "ffn.experts.mlp.12.w1", "ffn.experts.mlp.12.v1", - "ffn.experts.mlp.13.w1", "ffn.experts.mlp.13.v1", - "ffn.experts.mlp.14.w1", "ffn.experts.mlp.14.v1", - "ffn.experts.mlp.15.w1", "ffn.experts.mlp.15.v1", + "ffn.experts.mlp.0.w1", + "ffn.experts.mlp.0.v1", + "ffn.experts.mlp.1.w1", + "ffn.experts.mlp.1.v1", + "ffn.experts.mlp.2.w1", + "ffn.experts.mlp.2.v1", + "ffn.experts.mlp.3.w1", + "ffn.experts.mlp.3.v1", + "ffn.experts.mlp.4.w1", + "ffn.experts.mlp.4.v1", + "ffn.experts.mlp.5.w1", + "ffn.experts.mlp.5.v1", + "ffn.experts.mlp.6.w1", + "ffn.experts.mlp.6.v1", + "ffn.experts.mlp.7.w1", + "ffn.experts.mlp.7.v1", + "ffn.experts.mlp.8.w1", + "ffn.experts.mlp.8.v1", + "ffn.experts.mlp.9.w1", + "ffn.experts.mlp.9.v1", + "ffn.experts.mlp.10.w1", + "ffn.experts.mlp.10.v1", + "ffn.experts.mlp.11.w1", + "ffn.experts.mlp.11.v1", + "ffn.experts.mlp.12.w1", + "ffn.experts.mlp.12.v1", + "ffn.experts.mlp.13.w1", + "ffn.experts.mlp.13.v1", + "ffn.experts.mlp.14.w1", + "ffn.experts.mlp.14.v1", + "ffn.experts.mlp.15.w1", + "ffn.experts.mlp.15.v1", ], [ "ffn.experts.mlp.0.w2", @@ -62,5 +83,5 @@ class DbrxConvertedGPTQ(BaseGPTQModel): "ffn.experts.mlp.13.w2", "ffn.experts.mlp.14.w2", "ffn.experts.mlp.15.w2", - ] + ], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py index e6191904..6bd25f9e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py index 0a15062e..d9c1bf79 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py index 81e8e401..16e806fd 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel @@ -28,4 +29,3 @@ class GPTNeoXGPTQ(BaseGPTQModel): ["mlp.dense_h_to_4h"], ["mlp.dense_4h_to_h"], ] - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py index 19930ef6..69aaca23 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py index aa1748eb..e6ff5782 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py index acb4f640..12862755 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py index feb7a9e6..d5a1b04c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-############################################################################### \ No newline at end of file +############################################################################### diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py index ff590d3c..93d356fa 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py @@ -1,3 +1,4 @@ +# Third Party import torch.nn as nn @@ -12,7 +13,14 @@ class BaseQuantLinear(nn.Module): SUPPORTED_SHARDS: bool = True @classmethod - def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, raise_error: bool = True) -> bool: + def validate( + cls, + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + raise_error: bool = True, + ) -> bool: validate = True err = "" if cls.SUPPORTED_BITS and bits not in cls.SUPPORTED_BITS: diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py index a0ab76c5..0b88da46 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py @@ -13,14 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -import math +# Standard from logging import getLogger +import math +# Third Party import numpy as np import torch import torch.nn as nn import transformers +# Local from ..triton_utils.dequant import QuantLinearFunction, quant_matmul_248 from ..triton_utils.mixin import TritonModuleMixin from . import BaseQuantLinear @@ -39,12 +42,22 @@ class QuantLinear(BaseQuantLinear, TritonModuleMixin): QUANT_TYPE = "tritonv2" - def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): + def __init__( + self, + bits, + group_size, + infeatures, + outfeatures, + bias, + **kwargs, + ): super().__init__() if bits not in [2, 4, 8]: raise NotImplementedError("Only 2,4,8 bits are supported.") if infeatures % 32 != 0 or outfeatures % 32 != 0: - raise NotImplementedError("in_feature and out_feature must be divisible by 32.") + raise NotImplementedError( + "in_feature and out_feature must be divisible by 32." 
+ ) self.infeatures = infeatures self.outfeatures = outfeatures self.bits = bits @@ -74,10 +87,14 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): ) self.register_buffer( "g_idx", - torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), + torch.tensor( + [i // self.group_size for i in range(infeatures)], dtype=torch.int32 + ), ) if bias: - self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) + self.register_buffer( + "bias", torch.zeros((outfeatures), dtype=torch.float16) + ) else: self.bias = None @@ -103,9 +120,10 @@ def pack(self, linear, scales, zeros, g_idx=None): intweight = [] for idx in range(self.infeatures): intweight.append( - torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[ - :, None - ] + torch.round( + (W[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] ) intweight = torch.cat(intweight, dim=1) intweight = intweight.t().contiguous() @@ -113,7 +131,9 @@ def pack(self, linear, scales, zeros, g_idx=None): i = 0 row = 0 - qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) while row < qweight.shape[0]: if self.bits in [2, 4, 8]: for j in range(i, i + (32 // self.bits)): @@ -127,7 +147,9 @@ def pack(self, linear, scales, zeros, g_idx=None): self.qweight = torch.from_numpy(qweight) zeros = zeros.numpy().astype(np.uint32) - qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) i = 0 col = 0 while col < qzeros.shape[1]: @@ -164,6 +186,7 @@ def warmup(cls, model, transpose=False, seqlen=2048): """ Pre-tunes the quantized kernel """ + # Third Party from tqdm import tqdm kn_values = {} diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py index fed33846..d4e20e5f 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -13,11 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard +from typing import Dict import builtins import math import time -from typing import Dict +# Third Party import triton # code based https://github.com/fpgaminer/GPTQ-triton @@ -95,7 +97,9 @@ def kernel_call(): try: # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default - return triton.testing.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40) + return triton.testing.do_bench( + kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40 + ) except triton.OutOfResources: return (float("inf"), float("inf"), float("inf")) @@ -113,7 +117,10 @@ def run(self, *args, **kwargs): # prune configs pruned_configs = self.prune_configs(kwargs) bench_start = time.time() - timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} + timings = { + config: self._bench(*args, config=config, **kwargs) + for config in pruned_configs + } bench_end = time.time() self.bench_time = bench_end - bench_start self.cache[key] = builtins.min(timings, key=timings.get) @@ -152,7 +159,9 @@ def prune_configs(self, kwargs): ) for config in pruned_configs } - pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[ + :top_k + ] return pruned_configs def warmup(self, *args, **kwargs): @@ -168,7 +177,9 @@ def warmup(self, *args, **kwargs): self.nargs = None -def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False): +def autotune( + configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False +): def decorator(fn): return CustomizedTritonAutoTuner( fn, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py index 05091699..fc601bd6 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard import itertools +# Third Party +from torch.cuda.amp import custom_bwd, custom_fwd import torch import triton import triton.language as tl -from torch.cuda.amp import custom_bwd, custom_fwd def make_dequant_configs(block_sizes, num_warps): @@ -71,7 +73,9 @@ def dequant_kernel_248( tl.device_assert(g_idx >= 0, "index out of bounds: 0 <= tmp0 < 0") groups = tl.where(tmp2, tmp1, g_idx) # tmp3 are g_idx - scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float32) + scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to( + tl.float32 + ) # Unpack weights weights = qweights >> wf_weights # bit shift qweight @@ -125,7 +129,9 @@ def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None): return out -def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False): +def quant_matmul_248( + input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False +): W = dequant248(qweight, scales, qzeros, g_idx, bits, maxq=maxq) if transpose: return input @ W.t() @@ -149,5 +155,7 @@ def backward(ctx, grad_output): grad_input = None if ctx.needs_input_grad[0]: - grad_input = quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True) + grad_input = quant_matmul_248( + grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True + ) return grad_input, None, None, None, None, None, None diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py index 541c22dc..a542d3fa 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py @@ -13,13 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from logging import getLogger +# Third Party +from torch.cuda.amp import custom_bwd, custom_fwd import torch import triton import triton.language as tl -from torch.cuda.amp import custom_bwd, custom_fwd +# Local from . 
import custom_autotune logger = getLogger(__name__) @@ -150,11 +153,14 @@ def quant_matmul_248_kernel( offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) a_mask = offs_am[:, None] < M # b_ptrs is set up such that it repeats elements along the K axis 8 times b_ptrs = b_ptr + ( - (offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) g_ptrs = g_ptr + offs_k # shifter is used to extract the N bits of each element in the 32-bit word from B @@ -169,8 +175,12 @@ def quant_matmul_248_kernel( g_idx = tl.load(g_ptrs) # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop - scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) zeros = (zeros >> zeros_shifter[None, :]) & maxq @@ -308,18 +318,25 @@ def transpose_quant_matmul_248_kernel( offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offs_n = tl.arange(0, BLOCK_SIZE_N) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_N) a_mask = offs_am[:, None] < M # b_ptrs is set up such that it repeats elements along the K axis 8 times b_ptrs = b_ptr + ( - (offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn + (offs_bk[:, None] // infearure_per_bits) * stride_bk + + offs_n[None, :] * stride_bn ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) g_ptrs = g_ptr + offs_bk g_idx = tl.load(g_ptrs) # shifter is used to extract the N bits of each element in the 32-bit word from B scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales - zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros + zeros_ptrs = ( + zeros_ptr + + (offs_n[None, :] // infearure_per_bits) + + g_idx[:, None] * stride_zeros + ) shifter = (offs_bk % infearure_per_bits) * bits zeros_shifter = (offs_n % infearure_per_bits) * bits @@ -358,9 +375,12 @@ def silu(x): def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): - output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype) + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), ) quant_matmul_248_kernel[grid]( input, @@ -389,9 +409,12 @@ def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): def 
transpose_quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): output_dim = (qweight.shape[0] * 32) // bits - output = torch.empty((input.shape[0], output_dim), device=input.device, dtype=input.dtype) + output = torch.empty( + (input.shape[0], output_dim), device=input.device, dtype=input.dtype + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), ) transpose_quant_matmul_248_kernel[grid]( input, @@ -434,15 +457,20 @@ def backward(ctx, grad_output): grad_input = None if ctx.needs_input_grad[0]: - grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq) + grad_input = transpose_quant_matmul_248( + grad_output, qweight, scales, qzeros, g_idx, bits, maxq + ) return grad_input, None, None, None, None, None, None def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): - output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16) + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), ) quant_matmul_248_kernel[grid]( input, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py index 377ffb63..dd76bfc3 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py @@ -13,7 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +# Local +from .config import ( + FORMAT, + FORMAT_FIELD_CODE, + FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, + QUANT_METHOD, + QUANT_METHOD_FIELD, + BaseQuantizeConfig, + QuantizeConfig, +) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py index a99020fa..e45c371a 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### -import json -import logging +# Standard from dataclasses import dataclass, field, fields from os.path import isdir, join from typing import Any, Dict, Optional, Tuple +import json +import logging +# Third Party from packaging import version from transformers.utils.hub import cached_file @@ -34,7 +36,11 @@ FORMAT_FIELD_JSON = "checkpoint_format" QUANT_METHOD_FIELD = "quant_method" QUANT_CONFIG_FILENAME = "quantize_config.json" -QUANT_CONFIG_FILENAME_COMPAT = [QUANT_CONFIG_FILENAME, "quant_config.json", "config.json"] +QUANT_CONFIG_FILENAME_COMPAT = [ + QUANT_CONFIG_FILENAME, + "quant_config.json", + "config.json", +] MIN_VERSION_WITH_V2 = "0.9.0" @@ -81,7 +87,7 @@ class QUANT_METHOD: @dataclass -class QuantizeConfig(): +class QuantizeConfig: bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) group_size: int = field(default=-1) damp_percent: float = field(default=0.01) @@ -117,7 +123,9 @@ def __post_init__(self): ) if self.bits not in fields_info[0].metadata["choices"]: - raise ValueError(f"only support quantize to {fields_info[0].metadata['choices']} bits.") + raise ValueError( + f"only support quantize to {fields_info[0].metadata['choices']} bits." + ) if self.group_size != -1 and self.group_size <= 0: raise ValueError("unless equal to -1, group_size must greater then 0.") @@ -157,14 +165,16 @@ def meta_get_versionable(self, key: str) -> Tuple[str, str]: def is_quantized_or_packed_by_v2(self) -> bool: # check meta.quantizer producer, _version = self.meta_get_versionable(META_FIELD_QUANTIZER) - by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and (version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2)) + by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and ( + version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2) + ) # fallback to meta.packer if not by_v2: producer, _version = self.meta_get_versionable(META_FIELD_PACKER) - by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse(_version) >= version.parse( - MIN_VERSION_WITH_V2 - ) + by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse( + _version + ) >= version.parse(MIN_VERSION_WITH_V2) return by_v2 @@ -182,7 +192,9 @@ def from_quant_config(cls, quantize_cfg, format: str = None): if format not in valid_formats: raise ValueError(f"Unknown quantization checkpoint format: {format}.") if quantize_cfg.get(FORMAT_FIELD_JSON): - raise ValueError("Conflict: quantization format is passed in and also exists in model config.") + raise ValueError( + "Conflict: quantization format is passed in and also exists in model config." + ) # compat: warn if checkpoint_format is missing elif quantize_cfg.get(FORMAT_FIELD_JSON) is None: format_auto_inferred = True @@ -198,7 +210,10 @@ def from_quant_config(cls, quantize_cfg, format: str = None): key = key.lower() # remap keys according to compat map - if key in QUANT_CONFIG_ARG_SYNONYMS and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names: + if ( + key in QUANT_CONFIG_ARG_SYNONYMS + and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names + ): key = QUANT_CONFIG_ARG_SYNONYMS[key] if key == FORMAT_FIELD_JSON: @@ -218,10 +233,14 @@ def from_quant_config(cls, quantize_cfg, format: str = None): elif key in field_names: normalized[key] = val else: - logger.info(f"Ignoring unknown parameter in the quantization configuration: {key}.") + logger.info( + f"Ignoring unknown parameter in the quantization configuration: {key}." 
+ ) if format_auto_inferred: - logger.info(f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + logger.info( + f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}" + ) if "sym" not in normalized: logger.warning( @@ -301,9 +320,12 @@ def to_dict(self): META_FIELD: self.meta, } + # deprecated: will be removed in future update @dataclass class BaseQuantizeConfig(QuantizeConfig): def __init__(self, **kwargs): super().__init__(**kwargs) - logging.warning("BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") + logging.warning( + "BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead." + ) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index c3f2e4b3..470ed3fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -1,15 +1,18 @@ # License: GPTQModel/licenses/LICENSE.mit # adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) +# Standard +from logging import getLogger import math import os import time -from logging import getLogger +# Third Party import torch import torch.nn as nn import transformers +# Local from .quantizer import Quantizer logger = getLogger(__name__) @@ -40,7 +43,9 @@ def add_batch(self, inp, out): if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() @@ -93,6 +98,7 @@ def fasterquant( now_idx = 1 if static_groups: + # Standard import copy groups = [] @@ -137,7 +143,9 @@ def fasterquant( if group_size != -1: if not static_groups: if (i1 + i) % group_size == 0: - self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True) + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) if ((i1 + i) // group_size) - now_idx == -1: scale.append(self.quantizer.scale) @@ -185,7 +193,9 @@ def fasterquant( if isinstance(self.layer, transformers.Conv1D): Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data) + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) if os.environ.get("DEBUG"): logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py index 3ca205d1..f5eb9e00 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py @@ -1,8 +1,10 @@ # License: GPTQModel/licenses/LICENSE.mit # adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on 
[gptq](https://github.com/IST-DASLab/gptq) +# Standard from logging import getLogger +# Third Party import torch import torch.nn as nn diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py index 668d0859..869f6327 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py index 43dfffea..86362acc 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from enum import Enum @@ -20,6 +21,7 @@ class Backend(Enum): AUTO = 0 # choose the fastest one based on quant model compatibility TRITON = 3 + def get_backend(backend: str): try: return Backend[backend] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py index 19e61779..b594886e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py @@ -13,16 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### -import copy -import random +# Standard from functools import partial from typing import Callable, Dict, List, Optional +import copy +import random -import torch +# Third Party from datasets import DatasetDict, IterableDatasetDict, load_dataset from torch import LongTensor from torch.utils.data import DataLoader from transformers import PreTrainedTokenizer +import torch def make_data_block( @@ -69,7 +71,9 @@ def make_data_block( # filter tokenized samples by length dropped_indices = [] - for idx, (tokenized_prompt, tokenized_label) in enumerate(zip(tokenized_prompts, tokenized_labels)): + for idx, (tokenized_prompt, tokenized_label) in enumerate( + zip(tokenized_prompts, tokenized_labels) + ): if add_eos_token: tokenized_label += [tokenizer.eos_token_id] len_prompt = len(tokenized_prompt) @@ -87,7 +91,11 @@ def make_data_block( # make data blocks of samples tokenized_samples = sorted( - [(p, l) for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) if idx not in dropped_indices], + [ + (p, l) + for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) + if idx not in dropped_indices + ], key=lambda x: (len(x[0]) + len(x[1])) if merge_prompt_label else len(x[0]), ) sample_blocks = [] @@ -103,7 +111,9 @@ def make_data_block( additional_len = blk_max_len sample_len = blk_max_len else: - additional_len = len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + additional_len = ( + len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + ) sample_len = ori_sample_len if blk_total_len + additional_len > block_max_len: @@ -139,11 +149,19 @@ def make_data_block( sample_len += len(tokenized_label) pad_num = blk_max_len - sample_len if merge_prompt_label: - input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt + tokenized_label) - label_ids.append([-100] * (pad_num + len(tokenized_prompt)) + tokenized_label) + input_ids.append( + [tokenizer.pad_token_id] * pad_num + + tokenized_prompt + + tokenized_label + ) + label_ids.append( + [-100] * (pad_num + len(tokenized_prompt)) + tokenized_label + ) else: input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt) - label_ids.append([-100] * (label_max_len - len(tokenized_label)) + tokenized_label) + label_ids.append( + [-100] * (label_max_len - len(tokenized_label)) + tokenized_label + ) attention_mask.append([0] * pad_num + [1] * sample_len) new_samples["input_ids"].append(input_ids) @@ -153,7 +171,9 @@ def make_data_block( return new_samples -def collate_data(blocks: List[Dict[str, List[List[int]]]], pad_token_id: int) -> Dict[str, LongTensor]: +def collate_data( + blocks: List[Dict[str, List[List[int]]]], pad_token_id: int +) -> Dict[str, LongTensor]: def pad_block(block, pads): return torch.cat((block, pads.to(block.device)), dim=-1) @@ -170,11 +190,17 @@ def pad_block(block, pads): block_label_len = label_blocks[i].shape[-1] pad_num = inp_max_len - block_inp_len if pad_num > 0: - input_ids_blocks[i] = pad_block(input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id) - attention_mask_blocks[i] = pad_block(attention_mask_blocks[i], torch.zeros((block_bsz, pad_num))) + input_ids_blocks[i] = pad_block( + input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id + ) + attention_mask_blocks[i] = pad_block( + attention_mask_blocks[i], torch.zeros((block_bsz, pad_num)) + ) label_pad_num = label_max_len - block_label_len if label_pad_num > 0: - label_blocks[i] = 
pad_block(label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100) + label_blocks[i] = pad_block( + label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100 + ) return { "input_ids": torch.cat(input_ids_blocks, dim=0).long(), diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py index 40503569..98e4f239 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py @@ -13,16 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from collections import OrderedDict from logging import getLogger +# Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear from ..quantization import FORMAT from .backend import Backend -backend_dict = OrderedDict({ - Backend.TRITON: TritonV2QuantLinear, -}) +backend_dict = OrderedDict( + { + Backend.TRITON: TritonV2QuantLinear, + } +) format_dict = { FORMAT.GPTQ: [Backend.TRITON], @@ -32,15 +36,16 @@ logger = getLogger(__name__) + # auto select the correct/optimal QuantLinear class def select_quant_linear( - bits: int, - group_size: int, - desc_act: bool, - sym: bool, - backend: Backend, - format: FORMAT, - pack: bool = False, + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, + format: FORMAT, + pack: bool = False, ): # Handle the case where backend is AUTO. if backend == Backend.AUTO: @@ -50,12 +55,15 @@ def select_quant_linear( validate = v.validate(bits, group_size, desc_act, sym, raise_error=False) check_pack_func = hasattr(v, "pack") if pack else True if in_allow_backends and validate and check_pack_func: - logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}") + logger.info( + f"Auto choose the fastest one based on quant model compatibility: {v}" + ) return v # Handle the case where backend is not AUTO. if backend == Backend.TRITON: logger.info("Using tritonv2 for GPTQ") + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear else: raise NotImplementedError("Invalid Backend") diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 23ef3821..e98103e8 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -13,24 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard +from logging import getLogger +from typing import List, Optional import functools import hashlib import json import logging import os -from logging import getLogger -from typing import List, Optional +# Third Party +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig +from transformers.utils.hub import cached_file import accelerate import threadpoolctl as tctl import torch import torch.nn as nn import transformers -from tqdm import tqdm -from transformers import AutoConfig, PretrainedConfig -from transformers.utils.hub import cached_file -from ..models._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS +# Local +from ..models._const import ( + CPU, + CUDA_0, + EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, + SUPPORTED_MODELS, +) from ..nn_modules.qlinear import BaseQuantLinear from ..quantization import FORMAT, QuantizeConfig from .backend import Backend @@ -99,7 +108,11 @@ def find_layers(module, layers=None, name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update( + find_layers( + child, layers=layers, name=name + "." + name1 if name != "" else name1 + ) + ) return res @@ -127,7 +140,9 @@ def make_quant( use_cuda_fp16: bool = True, pack: bool = False, ) -> BaseQuantLinear: - select_quant_linear_func = select_quant_linear_with_pack if pack else select_quant_linear + select_quant_linear_func = ( + select_quant_linear_with_pack if pack else select_quant_linear + ) QuantLinear = select_quant_linear_func( bits=bits, group_size=group_size, @@ -186,6 +201,7 @@ def make_quant( return QuantLinear + def convert_gptq_v1_to_v2_format( model, quantize_config: QuantizeConfig, @@ -202,15 +218,15 @@ def convert_gptq_v1_to_v2_format( if quantize_config.bits == 2: submodule.qzeros.data += 0b01010101010101010101010101010101 elif quantize_config.bits == 3: - submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] += ( - 0b00100100100100100100100100100100 - ) - submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] += ( - 0b10010010010010010010010010010010 - ) - submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] += ( - 0b01001001001001001001001001001001 - ) + submodule.qzeros.data[ + :, range(0, submodule.qzeros.data.shape[1], 3) + ] += 0b00100100100100100100100100100100 + submodule.qzeros.data[ + :, range(1, submodule.qzeros.data.shape[1], 3) + ] += 0b10010010010010010010010010010010 + submodule.qzeros.data[ + :, range(2, submodule.qzeros.data.shape[1], 3) + ] += 0b01001001001001001001001001001001 elif quantize_config.bits == 4: submodule.qzeros.data += 0b00010001000100010001000100010001 elif quantize_config.bits == 8: @@ -234,15 +250,15 @@ def convert_gptq_v2_to_v1_format( if quantize_config.bits == 2: submodule.qzeros.data -= 0b01010101010101010101010101010101 elif quantize_config.bits == 3: - submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] -= ( - 0b00100100100100100100100100100100 - ) - submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] -= ( - 0b10010010010010010010010010010010 - ) - submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] -= ( - 0b01001001001001001001001001001001 - ) + submodule.qzeros.data[ + :, range(0, submodule.qzeros.data.shape[1], 3) + ] -= 
0b00100100100100100100100100100100 + submodule.qzeros.data[ + :, range(1, submodule.qzeros.data.shape[1], 3) + ] -= 0b10010010010010010010010010010010 + submodule.qzeros.data[ + :, range(2, submodule.qzeros.data.shape[1], 3) + ] -= 0b01001001001001001001001001001001 elif quantize_config.bits == 4: submodule.qzeros.data -= 0b00010001000100010001000100010001 elif quantize_config.bits == 8: @@ -252,11 +268,16 @@ def convert_gptq_v2_to_v1_format( return model -def select_quant_linear_with_pack(bits: int, + +def select_quant_linear_with_pack( + bits: int, group_size: int, desc_act: bool, sym: bool, - backend: Backend, format: str, pack: bool): + backend: Backend, + format: str, + pack: bool, +): QuantLinear = select_quant_linear( bits=bits, group_size=group_size, @@ -268,6 +289,7 @@ def select_quant_linear_with_pack(bits: int, ) return QuantLinear + def pack_model( model, quantizers, @@ -341,12 +363,13 @@ def pack_model( QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen) return QuantLinear + def verify_model_hash(file_path: str, verify_hash: str): if not isinstance(verify_hash, str): raise ValueError("model verify_hash must be a string") - if ':' not in verify_hash: + if ":" not in verify_hash: raise ValueError("verify_hash must be in the format 'hash_type:hash_value'") - hash_type, hash_value = verify_hash.split(':', 1) + hash_type, hash_value = verify_hash.split(":", 1) hash_func = getattr(hashlib, hash_type, None) if not hash_func: raise ValueError(f"No hash function found for type: {hash_type}") @@ -359,9 +382,9 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): if not isinstance(verify_hash, list): raise ValueError("sharded model verify_hash must be a list") - with open(jsonPath, 'r') as f: + with open(jsonPath, "r") as f: index_data = json.load(f) - weight_map = index_data['weight_map'] + weight_map = index_data["weight_map"] shard_files = set(weight_map.values()) if len(shard_files) != len(verify_hash): raise ValueError("Number of shards and number of hash values do not match.") @@ -372,6 +395,7 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): return False return True + def check_and_get_model_type(model_dir, trust_remote_code=False): config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code) if config.model_type not in SUPPORTED_MODELS: @@ -381,6 +405,7 @@ def check_and_get_model_type(model_dir, trust_remote_code=False): def simple_dispatch_model(model, device_map): + # Third Party from accelerate.hooks import AlignDevicesHook, add_hook_to_module if "" in device_map: @@ -402,10 +427,14 @@ def simple_dispatch_model(model, device_map): prev_hook = None for idx, (n, d) in enumerate(cpu_offload_group): m = get_module_by_name_suffix(model, n) - _, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook) + _, prev_hook = accelerate.cpu_offload_with_hook( + m, execution_device=main_device, prev_module_hook=prev_hook + ) # set first cpu offload module's prev_module_hook to the last cpu offload module's hook if len(cpu_offload_group) > 1: - get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + get_module_by_name_suffix( + model, cpu_offload_group[0][0] + )._hf_hook.prev_module_hook = prev_hook for n, d in device_map.items(): m = get_module_by_name_suffix(model, n) @@ -423,7 +452,9 @@ def simple_dispatch_model(model, device_map): # when qliear type is selected, it should auto-override the model post_init method and # not 
have to go about looping over modules to match qlinear type a second time as it is # very prone to bugs -def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None): +def gptqmodel_post_init( + model, use_act_order: bool, max_input_length: Optional[int] = None +): """ The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state. """ @@ -475,6 +506,7 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i if model_uses_exllama: # To be honest this is quite ugly, not proud of this. + # Third Party from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params device_to_buffers = {} @@ -523,7 +555,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i # The buffers need to have been initialized first before calling make_q4. for name, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllama" + ): submodule.post_init() # exllamav2 @@ -531,13 +566,17 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i model_uses_exllamav2 = False for _, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllamav2" + ): model_uses_exllamav2 = True device = submodule.qweight.device scratch_fixed = submodule.scratch_space_fixed() fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0)) if model_uses_exllamav2: + # Local from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors device_tensors = {} @@ -548,7 +587,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i model.device_tensors = device_tensors for _, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllamav2" + ): device = submodule.qweight.device submodule.post_init(temp_dq=model.device_tensors[device]) torch.cuda.empty_cache() @@ -557,7 +599,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i def get_checkpoints( - model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs + model_name_or_path: str, + extensions: List[str], + possible_model_basenames: List[str], + **cached_file_kwargs, ): """ Retrives (and if necessary downloads from Hugging Face Hub) the model checkpoint. Sharding is supported. All the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) will be explored over all `extensions` (e.g. `[".bin", ".safetensors"]`). @@ -574,10 +619,14 @@ def get_checkpoints( possible_index_file = os.path.join(model_name_or_path, shard_index_name) if os.path.isfile(possible_index_file): # The model is sharded over several checkpoints. 
- possible_model_basename = possible_index_file.replace(ext + ".index.json", "") + possible_model_basename = possible_index_file.replace( + ext + ".index.json", "" + ) return True, possible_index_file, possible_model_basename else: - model_save_name = os.path.join(model_name_or_path, possible_model_basename) + model_save_name = os.path.join( + model_name_or_path, possible_model_basename + ) searched_files.append(possible_model_basename + ext) if os.path.isfile(model_save_name + ext): resolved_archive_file = model_save_name + ext @@ -628,14 +677,19 @@ def get_checkpoints( # return the most stable tensor dtype for quantization while minimizing vram -def auto_dtype_from_config(config: PretrainedConfig, quant_inference: bool = False) -> torch.dtype: +def auto_dtype_from_config( + config: PretrainedConfig, quant_inference: bool = False +) -> torch.dtype: # all the gptq inference kernels are float16 only if quant_inference: return torch.float16 dtype = getattr(config, "torch_dtype") if not dtype or not isinstance(dtype, torch.dtype): - raise ValueError("Your model config.json does not have torch_dtype set. Please check for model " "corruption.") + raise ValueError( + "Your model config.json does not have torch_dtype set. Please check for model " + "corruption." + ) if dtype == torch.float32: return torch.bfloat16 @@ -654,7 +708,9 @@ def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: for n in names: if EXPERT_INDEX_PLACEHOLDER in n: for index in range(num_experts): - new_inside_layer_modules[-1].append(n.replace(EXPERT_INDEX_PLACEHOLDER, str(index))) + new_inside_layer_modules[-1].append( + n.replace(EXPERT_INDEX_PLACEHOLDER, str(index)) + ) else: new_inside_layer_modules[-1].append(n) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py index e3257d80..ac5041ff 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py @@ -13,6 +13,7 @@ # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. +# Standard # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -24,20 +25,24 @@ from contextlib import contextmanager from typing import List, Optional, Tuple, Union -import torch +# Third Party from peft import PeftConfig, PeftModel, PeftType, get_peft_model from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING from peft.tuners.lora import LoraConfig, LoraModel from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ +import torch +# Local from ..models.base import BaseGPTQModel from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as QuantLinearTriton + class GPTQLoraConfig(LoraConfig): injected_fused_attention: bool = False injected_fused_mlp: bool = False + class GPTQLoraModel(LoraModel): def _replace_module(self, parent_module, child_name, new_module, old_module): # replace the lora linear @@ -70,7 +75,6 @@ def _create_new_module( # if module cannot be found, return None which results in a raise in the call-stack return new_module - def merge_adapter(self): raise NotImplementedError("gptq model not support merge ada lora adapter") @@ -99,6 +103,7 @@ def find_all_linear_names( results.add(res) return list(results) + @contextmanager def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig @@ -114,6 +119,7 @@ def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + def get_gptq_peft_model( model: BaseGPTQModel, peft_config: PeftConfig = None, @@ -125,7 +131,9 @@ def get_gptq_peft_model( if train_mode and not peft_config: raise ValueError("peft_config not specified when in train mode.") if not train_mode and not model_id: - raise ValueError("model_id(where to load adapters) not specified when in inference mode.") + raise ValueError( + "model_id(where to load adapters) not specified when in inference mode." 
+ ) if train_mode: peft_type = peft_config.peft_type @@ -133,8 +141,12 @@ def get_gptq_peft_model( peft_type = peft_type.value if peft_type in [PeftType.LORA.value]: if auto_find_all_linears: - peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True) - if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): + peft_config.target_modules = find_all_linear_names( + model, ignore_lm_head=True + ) + if peft_type == PeftType.LORA.value and not isinstance( + peft_config, GPTQLoraConfig + ): peft_config = GPTQLoraConfig(**peft_config.to_dict()) # this hijack is needed as `get_peft_model` uses PEFTModelForCausalLM which inherits from @@ -142,9 +154,13 @@ def get_gptq_peft_model( with hijack_peft_mappings(): try: if train_mode: - peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) + peft_model = get_peft_model( + model.model, peft_config, adapter_name=adapter_name + ) else: - peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) + peft_model = PeftModel.from_pretrained( + model.model, model_id, adapter_name + ) except Exception as exc: raise NotImplementedError( f"{model.__class__.__name__} not support \ @@ -159,4 +175,4 @@ def get_gptq_peft_model( "GPTQLoraModel", "find_all_linear_names", "get_gptq_peft_model", -] \ No newline at end of file +] diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index dbab2821..99abf154 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -15,13 +15,20 @@ # SPDX-License-Identifier: Apache-2.0 # https://spdx.dev/learn/handling-license-info/ -import pytest # pylint: disable=import-error -import torch +# Standard from typing import List -from transformers.utils.import_utils import _is_package_available -from transformers import AutoTokenizer, AutoConfig, GenerationConfig, AutoModelForCausalLM +# Third Party from peft import LoraConfig +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, +) +from transformers.utils.import_utils import _is_package_available +import pytest # pylint: disable=import-error +import torch GPTQ = "gptq" # r, lora_alpha @@ -38,22 +45,29 @@ VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" QUANTIZED_MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] - + + # Model loading function for quantized models -def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str, use_external_lib:bool = False): +def load_autogptq_plugin_model( + model_name: str, + target_modules: List, + torch_dtype: str, + use_external_lib: bool = False, +): + # First Party from fms_acceleration_peft.framework_plugin_autogptq import ( AutoGPTQAccelerationPlugin, ) _plugin = AutoGPTQAccelerationPlugin( - { - "peft": { - "quantization": { - "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} - } + { + "peft": { + "quantization": { + "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} } - }, - use_external_lib = use_external_lib, + } + }, + use_external_lib=use_external_lib, ) class TrainArgs: @@ -68,48 +82,63 @@ class TrainArgs: target_modules=target_modules, ) - model = _plugin.model_loader( - model_name, torch_dtype=getattr(torch, torch_dtype) - ) + model = _plugin.model_loader(model_name, torch_dtype=getattr(torch, torch_dtype)) model, _ = _plugin.augmentation(model, args, (peft_config,)) 
model.eval() return model -# quantization function to manage the loading and quantizing of pretrained model + +# quantization function to manage the loading and quantizing of pretrained model # using external or local autogptq -def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, torch_dtype, use_external_lib=False): +def quantize_model( + model_name, + config, + calibration_dataset, + quant_config_kwargs, + device, + torch_dtype, + use_external_lib=False, +): if use_external_lib: - from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig + # Third Party + from auto_gptq import AutoGPTQForCausalLM as GPTQModel + from auto_gptq import BaseQuantizeConfig as QuantizeConfig + quantize_kwargs = {"use_triton": True} else: + # First Party from fms_acceleration_peft.gptqmodel import GPTQModel, QuantizeConfig + quantize_kwargs = {} - quantize_config = QuantizeConfig( - **quant_config_kwargs - ) + quantize_config = QuantizeConfig(**quant_config_kwargs) # load un-quantized model, by default, the model will always be loaded into CPU memory model = GPTQModel.from_pretrained( - model_name, - quantize_config = quantize_config, - config = config, - torch_dtype = getattr(torch, torch_dtype), + model_name, + quantize_config=quantize_config, + config=config, + torch_dtype=getattr(torch, torch_dtype), ).to(device) # quantize model, the examples should be list of dict whose keys can only be "input_ids" model.quantize(calibration_dataset, **quantize_kwargs) model.eval() return model + def get_wikitext2(tokenizer, num_samples=128, seqlen=128): + # Standard import random + + # Third Party + from datasets import load_dataset import numpy as np import torch - from datasets import load_dataset - wikidata = load_dataset('wikitext', 'wikitext-2-v1', split='test') - wikilist = [' \n' if s == '' else s for s in wikidata['text'] ] - text = ''.join(wikilist) - trainenc = tokenizer(text, return_tensors='pt') + wikidata = load_dataset("wikitext", "wikitext-2-v1", split="test") + wikilist = [" \n" if s == "" else s for s in wikidata["text"]] + + text = "".join(wikilist) + trainenc = tokenizer(text, return_tensors="pt") random.seed(0) np.random.seed(0) @@ -122,20 +151,23 @@ def get_wikitext2(tokenizer, num_samples=128, seqlen=128): j = i + seqlen inp = trainenc.input_ids[:, i:j] attention_mask = torch.ones_like(inp) - traindataset.append({'input_ids':inp,'attention_mask': attention_mask}) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) return traindataset + @pytest.fixture() def input_ids(seed: int = 42, device: torch.device = "cuda"): torch.manual_seed(seed) - yield torch.randint(0, 10000, (BS, SEQLEN), device=device) + yield torch.randint(0, 10000, (BS, SEQLEN), device=device) + @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) def test_pre_quantized_model_outputs_match( - input_ids, seed: int = 42, + input_ids, + seed: int = 42, ): """ Test for output equivalence when loading quantized models between @@ -143,33 +175,33 @@ def test_pre_quantized_model_outputs_match( """ torch.manual_seed(seed) original_model = load_autogptq_plugin_model( - QUANTIZED_MODEL_NAME, - TARGET_MODULES, FLOAT16, - use_external_lib=True + QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16, use_external_lib=True ) refactored_model = load_autogptq_plugin_model( - QUANTIZED_MODEL_NAME, - TARGET_MODULES, - FLOAT16 + QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16 ) - with 
torch.autocast(device_type='cuda', dtype=torch.float32): + with torch.autocast(device_type="cuda", dtype=torch.float32): with torch.no_grad(): original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + refactored_logits = refactored_model( + input_ids.to(refactored_model.device) + ).logits assert torch.allclose( original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL ), "Pre-quantized model logits don't match between extracted and external autogptq library" + @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) def test_quantizing_pretrained_model_outputs_match( - input_ids, seed: int = 42, + input_ids, + seed: int = 42, ): """ - Test for regression of quantizing pretrained models + Test for regression of quantizing pretrained models with refactored gptq library against original autogptq library by calculating KL loss on the output logits of both variants """ @@ -197,49 +229,53 @@ def test_quantizing_pretrained_model_outputs_match( # quantize models for external autogptq lib and extracted gptq lib original_model = quantize_model( - VANILLA_MODEL_NAME, + VANILLA_MODEL_NAME, config, - calibration_dataset, - quant_config_kwargs, - device, + calibration_dataset, + quant_config_kwargs, + device, FLOAT16, - use_external_lib=True + use_external_lib=True, ) refactored_model = quantize_model( - VANILLA_MODEL_NAME, + VANILLA_MODEL_NAME, config, - calibration_dataset, - quant_config_kwargs, - device, + calibration_dataset, + quant_config_kwargs, + device, FLOAT16, - use_external_lib=False + use_external_lib=False, ) - # compare generated tokens between + # compare generated tokens between # unquantized, original library and refactored gptqmodel library unquantized_model = AutoModelForCausalLM.from_pretrained( - VANILLA_MODEL_NAME, - config=config + VANILLA_MODEL_NAME, config=config ).to(device) gen_config = GenerationConfig.from_pretrained(VANILLA_MODEL_NAME) gen_config.max_new_tokens = 5 - _inputs = torch.tensor([tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda") + _inputs = torch.tensor( + [tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda" + ) output1 = tokenizer.decode( - original_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) + original_model.generate(inputs=_inputs, generation_config=gen_config).view(-1), + skip_special_tokens=True, + ) output2 = tokenizer.decode( - refactored_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) + refactored_model.generate(inputs=_inputs, generation_config=gen_config).view( + -1 + ), + skip_special_tokens=True, + ) output3 = tokenizer.decode( - unquantized_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) - assert output1==output2==output3, f"generated tokens ({output1}, {output2}, {output3}) \ + unquantized_model.generate(inputs=_inputs, generation_config=gen_config).view( + -1 + ), + skip_special_tokens=True, + ) + assert ( + output1 == output2 == output3 + ), f"generated tokens ({output1}, {output2}, {output3}) \ don't match between both libraries after quantization" # compare prob. 
distributions between original library and refactored gptqmodel library @@ -258,5 +294,7 @@ def test_quantizing_pretrained_model_outputs_match( target = torch.nn.functional.softmax(original_logits, dim=-1) target = torch.flatten(target, start_dim=0, end_dim=1) error = loss_fn(input, target) - assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ + assert error.lt( + LOSS_TOLERANCE + ), "Model logits don't match between both libraries \ after quantization" diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index c383db75..e16b8dce 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -14,20 +14,26 @@ # limitations under the License. ############################################################################### # -- do not touch +# Standard import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +# Standard import unittest # noqa: E402 -import torch # noqa: E402 +# Third Party from gptqmodel import Backend, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear # noqa: E402 +from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 + QuantLinear as TritonV2QuantLinear, +) from transformers import AutoTokenizer # noqa: E402 +import torch # noqa: E402 GENERATE_EVAL_SIZE = 100 + class TestsQ4Triton(unittest.TestCase): def test_generation_desc_act_false(self): prompt = "I am in Paris and" @@ -54,16 +60,24 @@ def test_generation_desc_act_false(self): inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") # This one uses Autocast. - res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + res = model_q.generate( + **inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens + ) predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) # This one does not. - res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + res = model_q.model.generate( + **inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens + ) predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) def test_generation_desc_act_true(self): prompt = "I am in Paris and" @@ -80,7 +94,6 @@ def test_generation_desc_act_true(self): device="cuda:0", backend=Backend.TRITON, revision=revision, - ) for _, submodule in model_q.named_modules(): if isinstance(submodule, TritonV2QuantLinear): @@ -96,4 +109,6 @@ def test_generation_desc_act_true(self): predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 410aa6e1..15facdcf 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -14,18 +14,21 @@ # limitations under the License. 
############################################################################### # -- do not touch +# Standard import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +# Standard import os # noqa: E402 import unittest # noqa: E402 -import torch # noqa: E402 -import torch.utils.benchmark as benchmark # noqa: E402 +# Third Party from gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +import torch # noqa: E402 +import torch.utils.benchmark as benchmark # noqa: E402 MODEL_ID = "TheBloke/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" @@ -102,6 +105,8 @@ def test_triton_qlinear(self): qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj - ref_out = qlinear_ref(test_data) # noqa: F841 + ref_out = qlinear_ref(test_data) # noqa: F841 - _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True) + _, measure_triton = benchmark_forward( + qlinear_ref, test_data, desc="Triton", verbose=True + ) From 12dd566195ed396cc34585b1cefe81a8ea519f64 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 7 Jul 2024 18:16:29 +0000 Subject: [PATCH 10/20] linting --- plugins/accelerated-peft/.pylintrc | 2 +- .../framework_plugin_autogptq.py | 24 ++++++++++--------- .../accelerated-peft/tests/test_q4_triton.py | 4 ++-- plugins/accelerated-peft/tests/test_triton.py | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/plugins/accelerated-peft/.pylintrc b/plugins/accelerated-peft/.pylintrc index 45da4212..b4c1a280 100644 --- a/plugins/accelerated-peft/.pylintrc +++ b/plugins/accelerated-peft/.pylintrc @@ -52,7 +52,7 @@ ignore=CVS,protobufs # ignore-list. The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, # it can't be used as an escape character. -ignore-paths= +ignore-paths=.*gptqmodel/,tests/test_q4_triton.py,tests/test_triton.py # Files or directories matching the regular expression patterns are skipped. # The regex matches against base names, not paths. 
The default value ignores diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 8bcd0bb9..2d1f6212 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -60,17 +60,17 @@ def model_loader(self, model_name: str, **kwargs): # Third Party if self.use_external_lib: # Third Party - from auto_gptq import ( - AutoGPTQForCausalLM as GPTQModel, # pylint: disable=import-outside-toplevel,import-error + from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error + AutoGPTQForCausalLM as GPTQModel, ) - from auto_gptq import BaseQuantizeConfig as QuantizeConfig + from auto_gptq import BaseQuantizeConfig as QuantizeConfig # pylint: disable=import-outside-toplevel,import-error from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) else: - from .gptqmodel import GPTQModel, QuantizeConfig - from .gptqmodel.utils import Backend - from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + from .gptqmodel import GPTQModel, QuantizeConfig # pylint: disable=import-outside-toplevel,import-error + from .gptqmodel.utils import Backend # pylint: disable=import-outside-toplevel,import-error + from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) # Local @@ -149,11 +149,12 @@ def model_loader(self, model_name: str, **kwargs): ) # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. - # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" - # to avoid gpu consumption before train + # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints + # to "cpu" to avoid gpu consumption before train # This approach will divert consumption to cpu memory, # a better approach would be to load the checkpoints to meta device - # QLoRA is currently implemented by the former approach and will encounter the same issue. + # QLoRA is currently implemented by the former approach and + # will encounter the same issue. # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 kwargs["device_map"] = { @@ -263,7 +264,7 @@ def augmentation( ) else: # Local - from .gptqmodel.utils.peft import get_gptq_peft_model + from .gptqmodel.utils.peft import get_gptq_peft_model # pylint: disable=import-outside-toplevel,import-error (peft_config,) = modifiable_args # unpack modifiable args @@ -295,7 +296,8 @@ def augmentation( # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. # Also move to device (which may depend on how base layer is implemented) - # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called + # NOTE: GPTQLoraModel inherits from LoraModel, + # and the _create_new_module method is called # on the parent. 
Hence _create_new_module is patched on the parent # FIXME: diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index e16b8dce..33927a7d 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -24,8 +24,8 @@ import unittest # noqa: E402 # Third Party -from gptqmodel import Backend, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 +from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 +from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 QuantLinear as TritonV2QuantLinear, ) from transformers import AutoTokenizer # noqa: E402 diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 15facdcf..245bb939 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -25,7 +25,7 @@ import unittest # noqa: E402 # Third Party -from gptqmodel import Backend, GPTQModel # noqa: E402 +from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 From b6ec481cad977ef4cd7881002bc6dcbf79216c87 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 7 Jul 2024 18:16:45 +0000 Subject: [PATCH 11/20] add additional entry to requirements.txt --- plugins/accelerated-peft/requirements.txt | 5 +++++ plugins/accelerated-peft/tests/test_gptqmodel.py | 8 ++++---- plugins/accelerated-peft/tests/test_q4_triton.py | 4 ++-- plugins/accelerated-peft/tox.ini | 3 ++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index a00233d3..237f9092 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -6,3 +6,8 @@ accelerate >= 0.29 # bitsandbytes for the BNB plugin bitsandbytes + +# Used to manage the thread limit in functions for converting old +# GPTQ models to new GPTQ model format that support symmetrical=False +# https://github.com/AutoGPTQ/AutoGPTQ/pull/640 +threadpoolctl \ No newline at end of file diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 99abf154..a56cbc9a 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -219,7 +219,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset = get_wikitext2(tokenizer, num_samples=128, seqlen=128) quant_config_kwargs = { "bits": 4, - "group_size": -1, + "group_size": 64, "desc_act": True, "damp_percent": 0.1, "static_groups": False, @@ -286,13 +286,13 @@ def test_quantizing_pretrained_model_outputs_match( # Measure the distribution error with KD Loss # flatten as a single batch bs*seqlen # since batchmean sums the loss and averages on dim=0 - loss_fn = torch.nn.KLDivLoss(reduction="batchmean") + loss_fn = torch.nn.KLDivLoss(reduction="sum") # input should be a distribution in the log space input = torch.nn.functional.log_softmax(refactored_logits, dim=-1) - input = torch.flatten(input, start_dim=0, end_dim=1) + input = input.view(BS*SEQLEN, -1) # target must be prob distribution target = torch.nn.functional.softmax(original_logits, dim=-1) - target = torch.flatten(target, start_dim=0, end_dim=1) 
+ target = target.view(BS*SEQLEN, -1) error = loss_fn(input, target) assert error.lt( LOSS_TOLERANCE diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index 33927a7d..ba5d0674 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -55,7 +55,7 @@ def test_generation_desc_act_false(self): else: raise ValueError("Did not find a tritonv2 linear layer") - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") @@ -101,7 +101,7 @@ def test_generation_desc_act_true(self): else: raise ValueError("Did not find a tritonv2 linear layer") - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) inp = tokenizer(prompt, return_tensors="pt").to(device) diff --git a/plugins/accelerated-peft/tox.ini b/plugins/accelerated-peft/tox.ini index cdcf221d..2e81324d 100644 --- a/plugins/accelerated-peft/tox.ini +++ b/plugins/accelerated-peft/tox.ini @@ -5,13 +5,14 @@ envlist = py, lint, fmt, build, twinecheck deps = pytest>=7 # for the tests, we need to install the deps ourselves - # as the package will install the github version + # as the package will install the github version -e {toxinidir}/../framework # set skip package installation as it will install package pyproject.toml before deps, will throw error when AutoGPTQ needs torch skip_install = true commands = # install the current package pip install --no-deps {toxinidir} + pip install threadpoolctl protobuf sentencepiece # these packages are required for some tests pytest {posargs:tests} [testenv:lint] From 9a3035927ae53fd11bb039f40b248b835636e4aa Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Mon, 8 Jul 2024 06:58:42 +0000 Subject: [PATCH 12/20] fixed union type backward compatibility with py39 --- .../gptqmodel/models/auto.py | 2 +- .../gptqmodel/models/base.py | 4 ++-- .../gptqmodel/utils/model.py | 6 ++--- .../accelerated-peft/tests/test_q4_triton.py | 22 ++++++++++++++----- plugins/accelerated-peft/tests/test_triton.py | 10 ++++++++- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 178d9c89..19d15ef2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -91,7 +91,7 @@ def from_quantized( device: Optional[Union[str, int]] = None, backend: Backend = Backend.AUTO, use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig | Dict] = None, + quantize_config: Optional[Union[QuantizeConfig, Dict]] = None, model_basename: Optional[str] = None, use_safetensors: bool = True, trust_remote_code: bool = False, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 8c319c45..9536c65e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -779,7 +779,7 @@ def from_pretrained( quantize_config: QuantizeConfig, max_memory: Optional[dict] = None, trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = 
"auto", + torch_dtype: Union[str, torch.dtype] = "auto", **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -880,7 +880,7 @@ def from_quantized( device: Optional[Union[str, int]] = None, low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, - torch_dtype: [str | torch.dtype] = "auto", + torch_dtype: Union[str, torch.dtype] = "auto", use_cuda_fp16: bool = True, quantize_config: Optional[QuantizeConfig] = None, model_basename: Optional[str] = None, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index e98103e8..d51e0e60 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional +from typing import List, Optional, Union import functools import hashlib import json @@ -79,13 +79,13 @@ def recurse_setattr(module, name, value): recurse_setattr(getattr(module, name), rest, value) -def get_device(obj: torch.Tensor | nn.Module): +def get_device(obj: Union[torch.Tensor, nn.Module]): if isinstance(obj, torch.Tensor): return obj.device return next(obj.parameters()).device -def move_to(obj: torch.Tensor | nn.Module, device: torch.device): +def move_to(obj: Union[torch.Tensor, nn.Module], device: torch.device): if get_device(obj) != device: obj = obj.to(device) return obj diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index ba5d0674..1201f342 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -24,17 +24,25 @@ import unittest # noqa: E402 # Third Party -from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 -from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 - QuantLinear as TritonV2QuantLinear, -) from transformers import AutoTokenizer # noqa: E402 import torch # noqa: E402 -GENERATE_EVAL_SIZE = 100 +CUDA_AVAILABLE = False +if torch.cuda.is_available(): + from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 + from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 + QuantLinear as TritonV2QuantLinear, + ) + CUDA_AVAILABLE = True + +GENERATE_EVAL_SIZE = 100 class TestsQ4Triton(unittest.TestCase): + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_generation_desc_act_false(self): prompt = "I am in Paris and" @@ -79,6 +87,10 @@ def test_generation_desc_act_false(self): predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] ) + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_generation_desc_act_true(self): prompt = "I am in Paris and" device = torch.device("cuda:0") diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 245bb939..aed7af12 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -25,11 +25,15 @@ import unittest # noqa: E402 # Third Party -from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer 
# noqa: E402 import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 +CUDA_AVAILABLE = False +if torch.cuda.is_available(): + from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 + CUDA_AVAILABLE = True + MODEL_ID = "TheBloke/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" LEARNING_RATE = 3e-5 @@ -94,6 +98,10 @@ def get_model_and_tokenizer( class TestTriton(unittest.TestCase): + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_triton_qlinear(self): ref_model, _ = get_model_and_tokenizer( model_id=MODEL_ID, From 68774dfe87bd75c7ecc6d41dc5cde88eba3d9e26 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 04:12:19 +0000 Subject: [PATCH 13/20] Fix FOAK dequant for compatibility with local gptq package --- .../fused_ops/unsloth_lora/gptq/fast_lora.py | 3 +++ .../fused_ops/unsloth_lora/gptq/triton/kernels.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py index 41e5355e..4000a258 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py @@ -98,6 +98,9 @@ def get_lora_parameters(proj): base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj qstate = extract_gptq_state(base_layer) + if base_layer.__module__.startswith("auto_gptq"): + setattr(qstate.qzeros, "offset", 1) + if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged: return qstate, None, None, None, None diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py index c252d26d..efc3b41e 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py @@ -110,7 +110,10 @@ def dequant_kernel_248( zeros = zeros & maxq # Dequantize - zeros = zeros + 1 + # None if using local gptqpackage, official autogptq should have an offset value + if getattr(qzeros_ptr, "offset", None) is not None: + zeros = zeros + qzeros_ptr.offset + weights = weights - zeros weights = weights.to(tl.float32) weights = scales * weights From f125683cef4c676c2a22bb3fe338034b6beb50d1 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 06:37:08 +0000 Subject: [PATCH 14/20] add benchmark comparison script --- scripts/benchmarks/compare_with_reference.py | 101 +++++++++++++++++++ scripts/run_benchmarks.sh | 2 + 2 files changed, 103 insertions(+) create mode 100644 scripts/benchmarks/compare_with_reference.py diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py new file mode 100644 index 00000000..0bcc9dbb --- /dev/null +++ b/scripts/benchmarks/compare_with_reference.py @@ -0,0 +1,101 @@ +import argparse +import pandas as pd +from copy import copy +import matplotlib.pyplot as plt +import os +from numpy import linalg + +PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +INDICES 
= ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" +BENCHMARK_FILENAME = "benchmarks.csv" +FIGURE_FILENAME = "comparison.jpg" + +def plot_chart(ax, x, y, title, xlabel, ylabel): + ax.scatter(x, y, s=10) + ax.plot() + ax.set_title(title, fontsize=8) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.axline((0, 0), slope=1) + +def plot_table(ax, cell_inputs, title, col_widths, col_labels): + table = ax.table(cellText=cell_inputs, loc="center", colWidths=col_widths, colLabels=col_labels) + table.scale(1, 2) + table.auto_set_font_size(False) + table.set_fontsize(8) + table.auto_set_column_width(0) + ax.set_title(title, fontsize=10) + +def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): + num_plots = len(plot_columns) + rows = num_plots + fig, axs = plt.subplots(rows, 2, figsize=(20, 28)) + fig.tight_layout(pad=5.0) + + # filter ref to only those rows seen in df + ref = ref[ref.index.isin(df.index.tolist())] + for idx in range(num_plots): + column = plot_columns[idx] + assert (column in ref.columns) and (column in df.columns), f"Column Name `{column}` not in Dataframe" + ax1 = axs[idx][0] + ax2 = axs[idx][1] + ax2.axis('off') + + ref_series = ref[column].fillna(0) + df_series = df[column].fillna(0) + # Calculate difference of l1 norms as a percentage on reference + ref_norm = linalg.norm(ref_series, ord=1) + df_norm = linalg.norm(df_series, ord=1) + norm_difference = abs(df_norm - ref_norm)/(ref_norm+1e-9) + # Extract outliers from reference based on % threshold on referance + ds = abs(df_series-ref_series)/(ref_series+1e-9) + outliers = ds.index[ds>threshold_ratio].to_list() + + plot_chart( + ax1, + ref_series, + df_series, + title=f"Metric: {column}", + xlabel="Reference", + ylabel="New", + ) + + cell_inputs = [[outlier, ref_series[outlier], df_series[outlier]] for outlier in outliers] if len(outliers)>0 else [["","",""]] + + plot_table( + ax2, + cell_inputs = cell_inputs, + title=f"Metric: {column} outliers\n\nNorm Difference={norm_difference:.3f}", + col_widths=[0.9, 0.2, 0.2], + col_labels=["Experiment", "Reference", "New"] + ) + return fig + +def read_df(file_path, indices, plot_columns): + df = pd.read_csv(file_path) + df.set_index(indices, inplace=True) + df = df[plot_columns] + return df + +def main(result_dir): + ref = read_df(REFERENCE_FILEPATH, INDICES, PLOT_COLUMNS) + df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), INDICES, PLOT_COLUMNS) + fig = compare_results(df, ref, PLOT_COLUMNS, threshold_ratio=.1) + plt.savefig(os.path.join(result_dir, FIGURE_FILENAME)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Acceleration Benchmarking Debug Tool", + description="This script analyses benchmark outputs against the current reference", + ) + parser.add_argument( + "--result_dir", + default="benchmark_outputs", + help="benchmark result directory", + ) + + args = parser.parse_args() + main( + result_dir=args.result_dir, + ) \ No newline at end of file diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh index 8f8a1f9b..6e63f530 100644 --- a/scripts/run_benchmarks.sh +++ b/scripts/run_benchmarks.sh @@ -137,3 +137,5 @@ PYTHONPATH=. \ 'error_messages' \ 'acceleration_framework_config_file' +PYTHONPATH=. 
\ + python $WORKING_DIR/compare_with_reference.py --result_dir $RESULT_DIR From 2a705ca4a2f7bc210ac515022f91fc42c36d1320 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 09:01:28 +0000 Subject: [PATCH 15/20] modified comparison script --- scripts/benchmarks/compare_with_reference.py | 101 ++++++++++--------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 0bcc9dbb..054315ea 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -5,72 +5,56 @@ import os from numpy import linalg -PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] -INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] -REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" +# default columns to compare +DEFAULT_PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +# Used as combined identifier of experiment +DEFAULT_INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] +DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" -FIGURE_FILENAME = "comparison.jpg" def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) - ax.plot() ax.set_title(title, fontsize=8) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.axline((0, 0), slope=1) - -def plot_table(ax, cell_inputs, title, col_widths, col_labels): - table = ax.table(cellText=cell_inputs, loc="center", colWidths=col_widths, colLabels=col_labels) - table.scale(1, 2) - table.auto_set_font_size(False) - table.set_fontsize(8) - table.auto_set_column_width(0) - ax.set_title(title, fontsize=10) - + def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): num_plots = len(plot_columns) - rows = num_plots - fig, axs = plt.subplots(rows, 2, figsize=(20, 28)) - fig.tight_layout(pad=5.0) - + + charts = [] + total_outliers = [] # filter ref to only those rows seen in df ref = ref[ref.index.isin(df.index.tolist())] for idx in range(num_plots): + _, ax = plt.subplots(figsize=(8, 8)) column = plot_columns[idx] - assert (column in ref.columns) and (column in df.columns), f"Column Name `{column}` not in Dataframe" - ax1 = axs[idx][0] - ax2 = axs[idx][1] - ax2.axis('off') - + assert ( + (column in ref.columns) + and + (column in df.columns) + ), f"Column Name `{column}` not in Dataframe" + ref_series = ref[column].fillna(0) df_series = df[column].fillna(0) - # Calculate difference of l1 norms as a percentage on reference - ref_norm = linalg.norm(ref_series, ord=1) - df_norm = linalg.norm(df_series, ord=1) - norm_difference = abs(df_norm - ref_norm)/(ref_norm+1e-9) - # Extract outliers from reference based on % threshold on referance + # Extract outliers base on some threshold % difference on referance ds = abs(df_series-ref_series)/(ref_series+1e-9) - outliers = ds.index[ds>threshold_ratio].to_list() - + outliers = ds.index[ds>threshold_ratio].to_list() plot_chart( - ax1, + ax, ref_series, df_series, title=f"Metric: {column}", xlabel="Reference", ylabel="New", - ) - - cell_inputs = [[outlier, ref_series[outlier], df_series[outlier]] for outlier in outliers] if 
len(outliers)>0 else [["","",""]] - - plot_table( - ax2, - cell_inputs = cell_inputs, - title=f"Metric: {column} outliers\n\nNorm Difference={norm_difference:.3f}", - col_widths=[0.9, 0.2, 0.2], - col_labels=["Experiment", "Reference", "New"] - ) - return fig + ) + charts.append((ax, f"compare-{column}.jpg")) + total_outliers += [ + [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] + for outlier in outliers + ] + return total_outliers, charts def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) @@ -78,11 +62,14 @@ def read_df(file_path, indices, plot_columns): df = df[plot_columns] return df -def main(result_dir): - ref = read_df(REFERENCE_FILEPATH, INDICES, PLOT_COLUMNS) - df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), INDICES, PLOT_COLUMNS) - fig = compare_results(df, ref, PLOT_COLUMNS, threshold_ratio=.1) - plt.savefig(os.path.join(result_dir, FIGURE_FILENAME)) +def main(result_dir, reference_benchmark_filepath, plot_columns): + ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) + df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns) + total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=.1) + outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) + outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) + for chart, filename in charts: + chart.figure.savefig(os.path.join(result_dir, filename)) if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -92,10 +79,24 @@ def main(result_dir): parser.add_argument( "--result_dir", default="benchmark_outputs", - help="benchmark result directory", + help="benchmark result directory to use for comparison", + ) + + parser.add_argument( + "--reference_benchmark_filepath", + default="scripts/benchmarks/refs/a100_80gb.csv", + help="file path of the csv to compare on", + ) + + parser.add_argument( + "--plot_columns", + default=DEFAULT_PLOT_COLUMNS, + nargs='+' ) args = parser.parse_args() main( result_dir=args.result_dir, - ) \ No newline at end of file + reference_benchmark_filepath=args.reference_benchmark_filepath, + plot_columns=args.plot_columns, + ) From eefdd888595bd9de63fbeb972f4417e021b6795d Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 09:28:26 +0000 Subject: [PATCH 16/20] formatted scripts/ --- scripts/benchmarks/compare_with_reference.py | 77 ++++++++++++-------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 054315ea..4bd54d43 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -1,28 +1,44 @@ -import argparse -import pandas as pd +# Standard from copy import copy -import matplotlib.pyplot as plt +import argparse import os + +# Third Party from numpy import linalg +import matplotlib.pyplot as plt +import pandas as pd # default columns to compare -DEFAULT_PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +DEFAULT_PLOT_COLUMNS = [ + "mem_torch_mem_alloc_in_bytes", + "mem_peak_torch_mem_alloc_in_bytes", + "train_loss", + "train_tokens_per_second", +] # Used as combined identifier of experiment -DEFAULT_INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +DEFAULT_INDICES = [ + "framework_config", + "peft_method", + 
"model_name_or_path", + "num_gpus", + "per_device_train_batch_size", +] DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" + def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) ax.set_title(title, fontsize=8) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.axline((0, 0), slope=1) - -def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): - num_plots = len(plot_columns) - + + +def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): + num_plots = len(plot_columns) + charts = [] total_outliers = [] # filter ref to only those rows seen in df @@ -30,25 +46,23 @@ def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1) for idx in range(num_plots): _, ax = plt.subplots(figsize=(8, 8)) column = plot_columns[idx] - assert ( - (column in ref.columns) - and - (column in df.columns) - ), f"Column Name `{column}` not in Dataframe" + assert (column in ref.columns) and ( + column in df.columns + ), f"Column Name `{column}` not in Dataframe" ref_series = ref[column].fillna(0) df_series = df[column].fillna(0) # Extract outliers base on some threshold % difference on referance - ds = abs(df_series-ref_series)/(ref_series+1e-9) - outliers = ds.index[ds>threshold_ratio].to_list() + ds = abs(df_series - ref_series) / (ref_series + 1e-9) + outliers = ds.index[ds > threshold_ratio].to_list() plot_chart( - ax, - ref_series, - df_series, - title=f"Metric: {column}", - xlabel="Reference", + ax, + ref_series, + df_series, + title=f"Metric: {column}", + xlabel="Reference", ylabel="New", - ) + ) charts.append((ax, f"compare-{column}.jpg")) total_outliers += [ [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] @@ -56,25 +70,30 @@ def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1) ] return total_outliers, charts + def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) df.set_index(indices, inplace=True) df = df[plot_columns] return df + def main(result_dir, reference_benchmark_filepath, plot_columns): ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) - df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns) - total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=.1) + df = read_df( + os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns + ) + total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=0.1) outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) for chart, filename in charts: chart.figure.savefig(os.path.join(result_dir, filename)) + if __name__ == "__main__": parser = argparse.ArgumentParser( - prog="Acceleration Benchmarking Debug Tool", - description="This script analyses benchmark outputs against the current reference", + prog="Acceleration Benchmarking Comparison Tool", + description="This script analyses benchmark outputs against a given reference", ) parser.add_argument( "--result_dir", @@ -88,11 +107,7 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): help="file path of the csv to compare on", ) - parser.add_argument( - "--plot_columns", - default=DEFAULT_PLOT_COLUMNS, - nargs='+' - ) + parser.add_argument("--plot_columns", 
default=DEFAULT_PLOT_COLUMNS, nargs="+") args = parser.parse_args() main( From aa57cf2314cd71a3139e02d66f6a61003d220a9c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 19:38:59 +0000 Subject: [PATCH 17/20] edited comparison script to detect difference in command args --- scripts/benchmarks/compare_with_reference.py | 67 +++++++++++++++----- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 4bd54d43..38375a9d 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -23,7 +23,15 @@ "num_gpus", "per_device_train_batch_size", ] -DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] + +DEFAULT_IGNORED_COLUMNS = [ + "epoch", + "train_runtime", + "train_steps_per_second", + "train_samples_per_second", + "mem_nvidia_mem_reserved", +] + DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" @@ -36,7 +44,7 @@ def plot_chart(ax, x, y, title, xlabel, ylabel): ax.axline((0, 0), slope=1) -def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): +def compare_results(df, ref, plot_columns, threshold_ratio=0.1): num_plots = len(plot_columns) charts = [] @@ -68,24 +76,45 @@ def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] for outlier in outliers ] - return total_outliers, charts + outliers_df = pd.DataFrame( + total_outliers, columns=["scenario", *df.index.names, "reference", "new"] + ) + return outliers_df, outliers, charts def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) df.set_index(indices, inplace=True) - df = df[plot_columns] - return df - - -def main(result_dir, reference_benchmark_filepath, plot_columns): - ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) - df = read_df( - os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns + # all other columns not for plotting or explicitly ignored are hyperparameters + argument_columns = [ + col + for col in df.columns + if col not in (DEFAULT_IGNORED_COLUMNS + DEFAULT_PLOT_COLUMNS) + ] + return df[plot_columns], df[argument_columns] + + +def main( + result_dir, reference_benchmark_filepath, plot_columns, threshold_ratio, indices +): + ref, args_ref = read_df(reference_benchmark_filepath, indices, plot_columns) + df, args_df = read_df( + os.path.join(result_dir, BENCHMARK_FILENAME), indices, plot_columns + ) + # Analyse between both sets of results and retrieve outliers + outliers_df, outliers, charts = compare_results( + df, ref, plot_columns, threshold_ratio=threshold_ratio ) - total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=0.1) - outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) - outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) + # Find arguments that are different between ref and new + # to highlight as possible cause of anomaly + diff = args_df.compare(args_ref, align_axis=1).rename( + columns={"self": "new", "other": "ref"}, level=-1 + ) + diff = diff[diff.index.isin([outlier for outlier in outliers])] + outliers_df = outliers_df.set_index(indices).merge( + diff, left_index=True, right_index=True + ) + outliers_df.to_csv(os.path.join(result_dir, "outliers.csv")) for chart, filename in charts: 
chart.figure.savefig(os.path.join(result_dir, filename)) @@ -100,12 +129,18 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): default="benchmark_outputs", help="benchmark result directory to use for comparison", ) - parser.add_argument( "--reference_benchmark_filepath", default="scripts/benchmarks/refs/a100_80gb.csv", help="file path of the csv to compare on", ) + parser.add_argument( + "--threshold_ratio", + default=0.1, + help="the acceptable threshold percentage difference from the reference value.", + ) + + parser.add_argument("--indices", default=DEFAULT_INDICES, nargs="+") parser.add_argument("--plot_columns", default=DEFAULT_PLOT_COLUMNS, nargs="+") @@ -114,4 +149,6 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): result_dir=args.result_dir, reference_benchmark_filepath=args.reference_benchmark_filepath, plot_columns=args.plot_columns, + threshold_ratio=args.threshold_ratio, + indices=args.indices, ) From d98e56ce6c76e5b91e92bf062c5d271743d5031c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Fri, 12 Jul 2024 02:13:13 +0000 Subject: [PATCH 18/20] addresed PR edits --- scripts/benchmarks/compare_with_reference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 38375a9d..a580b8de 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -34,7 +34,7 @@ DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" - +OUTLIERS_FILENAME = "outliers.csv" def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) @@ -114,7 +114,7 @@ def main( outliers_df = outliers_df.set_index(indices).merge( diff, left_index=True, right_index=True ) - outliers_df.to_csv(os.path.join(result_dir, "outliers.csv")) + outliers_df.to_csv(os.path.join(result_dir, OUTLIERS_FILENAME)) for chart, filename in charts: chart.figure.savefig(os.path.join(result_dir, filename)) @@ -137,7 +137,7 @@ def main( parser.add_argument( "--threshold_ratio", default=0.1, - help="the acceptable threshold percentage difference from the reference value.", + help="the acceptable relative difference from the reference value.", ) parser.add_argument("--indices", default=DEFAULT_INDICES, nargs="+") From b92c99df69c6fce29e169390f57e08cba7c3e0c3 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Fri, 12 Jul 2024 08:50:08 +0000 Subject: [PATCH 19/20] updated benchmarks --- scripts/benchmarks/refs/a100_80gb.csv | 168 +++++++++++------------ scripts/benchmarks/refs/requirements.txt | 78 +++++++++++ 2 files changed, 162 insertions(+), 84 deletions(-) create mode 100644 scripts/benchmarks/refs/requirements.txt diff --git a/scripts/benchmarks/refs/a100_80gb.csv b/scripts/benchmarks/refs/a100_80gb.csv index 45cdf125..6bb7714a 100644 --- a/scripts/benchmarks/refs/a100_80gb.csv +++ b/scripts/benchmarks/refs/a100_80gb.csv @@ -1,85 +1,85 @@ epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -0.15,,none,2e-5,,,76679.0,72971724288,44004763136,mistralai/Mistral-7B-v0.1,1,,4,,,float16,0.9112484455108643,565.9213,0.707,0.177,2895.102 
-0.15,,none,2e-5,,,43702.0,36762859520,29521119232,mistralai/Mistral-7B-v0.1,2,,2,,,float16,0.8622726058959961,307.6782,1.3,0.325,2662.522 -0.29,,none,2e-5,,,70669.0,72972117504,44005156352,mistralai/Mistral-7B-v0.1,1,,8,,,float16,1.017976951599121,1094.9632,0.731,0.091,2992.612 -0.29,,none,2e-5,,,52882.0,36763056128,29521315840,mistralai/Mistral-7B-v0.1,2,,4,,,float16,0.8944576263427735,576.1931,1.388,0.174,2843.491 -,,none,2e-5,,,80969.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,float16,,,,, -,,none,2e-5,,,79169.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,float16,,,,, -,,none,2e-5,,,80969.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,float16,,,,, -,,none,2e-5,,,80083.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,float16,,,,, -,,none,2e-5,,,80987.0,0,0,NousResearch/Llama-2-70b-hf,1,,4,,,float16,,,,, -,,none,2e-5,,,80923.0,0,0,NousResearch/Llama-2-70b-hf,2,,2,,,float16,,,,, -,,none,2e-5,,,80987.0,0,0,NousResearch/Llama-2-70b-hf,1,,8,,,float16,,,,, -,,none,2e-5,,,81006.0,0,0,NousResearch/Llama-2-70b-hf,2,,4,,,float16,,,,, -0.15,,none,2e-4,16,0.0,28703.0,26108963328,15119590912,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8818108749389648,458.2667,0.873,0.218,3575.21 -0.15,,none,2e-4,16,0.0,17669.0,15123161088,7850391552,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8540384006500245,270.1999,1.48,0.37,3031.829 -0.29,,none,2e-4,16,0.0,42167.0,37098695168,15119984128,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,1.0028394603729247,912.5081,0.877,0.11,3590.982 -0.29,,none,2e-4,16,0.0,25207.0,21433753600,7850588160,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8833828353881836,482.6901,1.657,0.207,3394.311 -,,none,2e-4,16,0.0,80990.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.15,,none,2e-4,16,0.0,61532.0,57546370048,47311452160,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8696129798889161,561.2483,0.713,0.178,1459.604 -,,none,2e-4,16,0.0,80207.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.29,,none,2e-4,16,0.0,69171.0,64398757376,47311648768,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.885084867477417,938.9714,0.852,0.106,1744.888 -,,none,2e-4,16,0.0,80617.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80907.0,0,0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80617.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80783.0,0,0,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.15,True,baseline-peft-bnb,2e-4,16,0.0,25995.0,22825932800,5368221184,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8698946189880371,586.9178,0.682,0.17,2791.532 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,12476.0,9974622720,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8552890300750733,284.376,1.407,0.352,2880.693 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,46117.0,40278956032,5368614400,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8654958820343017,1148.1408,0.697,0.087,2854.005 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,20405.0,16587205120,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj 
v_proj o_proj,float16,0.8869294357299805,503.0597,1.59,0.199,3256.87 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,47189.0,46475660288,25726225920,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8893787956237793,1185.2488,0.337,0.084,1382.326 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,24751.0,21932720128,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8617707204818725,568.5808,0.704,0.176,1440.78 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,68683.0,67165218816,25726619136,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8893123245239258,2124.0668,0.377,0.047,1542.701 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,32064.0,29353074176,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8585504531860352,962.8971,0.831,0.104,1701.532 -,True,baseline-peft-bnb,2e-4,16,0.0,80121.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.14,True,baseline-peft-bnb,2e-4,16,0.0,51701.0,46524471808,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9204118633270264,1981.2518,0.202,0.05,413.476 -,True,baseline-peft-bnb,2e-4,16,0.0,79555.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,baseline-peft-bnb,2e-4,16,0.0,80394.0,72398346752,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9444941711425782,3760.1788,0.213,0.027,435.724 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,18903.0,15860019712,4843384320,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8704616069793701,479.6819,0.834,0.208,3415.597 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,12533.0,9974622720,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8528211212158203,282.8845,1.414,0.354,2895.882 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,33327.0,26849751552,4843777536,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8675907611846924,945.5376,0.846,0.106,3465.542 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,20423.0,16587205120,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.854712610244751,502.3584,1.592,0.199,3261.417 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,19257.0,13636909056,4843384320,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8722561931610108,420.8819,0.95,0.238,3892.778 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,12118.0,9796856320,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8581914234161377,232.51,1.72,0.43,3523.289 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,32209.0,22430791680,4843777536,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8683128643035889,821.991,0.973,0.122,3986.418 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,19463.0,16207063552,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.852388572692871,427.1268,1.873,0.234,3835.864 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,37417.0,36218024448,25201389056,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8887558174133301,913.0381,0.438,0.11,1794.449 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,24952.0,21921468928,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj 
o_proj,float16,0.8612120914459228,572.3054,0.699,0.175,1431.404 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,49893.0,47207756288,25201782272,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8909227275848388,1711.7453,0.467,0.058,1914.303 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,32207.0,29359173632,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8591176319122314,959.9538,0.833,0.104,1706.749 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,37547.0,35651058176,25201389056,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8895366668701172,854.9879,0.468,0.117,1916.284 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,24572.0,21746056192,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8630767631530761,514.5553,0.777,0.194,1592.054 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,49861.0,46058696192,25201782272,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8951810073852539,1601.6113,0.499,0.062,2045.94 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,31701.0,29043888640,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8600863265991211,880.114,0.909,0.114,1861.577 -0.14,True,accelerated-peft-bnb,2e-4,16,0.0,71801.0,68159977472,37346815488,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9996430969238281,3700.3604,0.108,0.027,442.768 -0.14,True,accelerated-peft-bnb,2e-4,16,0.0,51579.0,46524471808,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9264963436126709,1955.4907,0.205,0.051,418.923 -,True,accelerated-peft-bnb,2e-4,16,0.0,79375.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-bnb,2e-4,16,0.0,80815.0,72398346752,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9262647342681884,3714.7153,0.215,0.027,441.057 -0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.0,71995.0,67350935552,37346815488,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9998687934875489,3351.04,0.119,0.03,488.923 -0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.0,51141.0,46250760704,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9389877033233642,1747.6289,0.229,0.057,468.749 -,True,accelerated-peft-bnb-foak,2e-4,16,0.0,80303.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-bnb-foak,2e-4,16,0.0,79861.0,71720933888,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9403298473358155,3375.4111,0.237,0.03,485.393 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,19425.0,15890329088,4873693696,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.009563512802124,491.6352,0.814,0.203,3332.552 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,12230.0,9690031616,2743565312,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9266629409790039,294.4237,1.359,0.34,2782.385 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,33219.0,26880060928,4874086912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9904310989379883,953.3973,0.839,0.105,3436.972 
-0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,19477.0,16000624128,2743761920,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8998308277130127,506.1818,1.58,0.198,3236.781 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,19065.0,13631990784,4873693696,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.003525791168213,414.297,0.965,0.241,3954.651 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,11879.0,9512265216,2743565312,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9293491744995117,224.6767,1.78,0.445,3646.128 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,32721.0,22390647808,4874086912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.992929859161377,810.9726,0.986,0.123,4040.581 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,19063.0,15620482560,2743761920,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9045120429992676,418.8226,1.91,0.239,3911.919 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,36389.0,35528093184,24511457792,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.89991379737854,897.8879,0.445,0.111,1824.727 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,22882.0,20691720192,12526730240,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8638970375061035,557.2929,0.718,0.179,1469.963 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,48959.0,46517825024,24511851008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.893577823638916,1673.2594,0.478,0.06,1958.334 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,29704.0,27482931712,12526926848,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.864154224395752,938.3626,0.853,0.107,1746.02 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,36607.0,33649802752,24511457792,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8993340969085694,811.6061,0.493,0.123,2018.713 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,22801.0,20438869504,12526730240,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8660580062866211,478.0288,0.837,0.209,1713.704 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,49669.0,42707730944,24511851008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8937735366821289,1533.2657,0.522,0.065,2137.138 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,29370.0,26951336960,12526926848,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8651807403564453,838.8338,0.954,0.119,1953.188 -0.14,True,accelerated-peft-autogptq,2e-4,16,0.0,71177.0,65895347200,36290144768,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9811842250823974,3639.6437,0.11,0.027,450.154 -0.14,True,accelerated-peft-autogptq,2e-4,16,0.0,49475.0,44873390592,18125597696,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9557892894744873,1923.445,0.208,0.052,425.902 -,True,accelerated-peft-autogptq,2e-4,16,0.0,79265.0,0,0,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-autogptq,2e-4,16,0.0,79187.0,70143285760,18125794304,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj 
o_proj,float16,0.9580207633972168,3685.3642,0.217,0.027,444.569 -0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,71223.0,65086305280,36290144768,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.981500825881958,3273.1958,0.122,0.031,500.551 -0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,49187.0,44599679488,18125597696,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9558010864257812,1682.0158,0.238,0.059,487.035 -,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,80945.0,0,0,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,78208.0,69465872896,18125794304,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9556115436553955,3298.135,0.243,0.03,496.766 +0.15,,none,2e-5,,,76671.0,72972297728.0,44005107200.0,mistralai/Mistral-7B-v0.1,1,,4,,,float16,0.9160769081115723,549.026,0.729,0.182,2984.194 +0.15,,none,2e-5,,,43744.0,36763146240.0,29521348608.0,mistralai/Mistral-7B-v0.1,2,,2,,,float16,0.8728336906433105,298.0786,1.342,0.335,2748.269 +0.29,,none,2e-5,,,79365.0,72972690944.0,44005500416.0,mistralai/Mistral-7B-v0.1,1,,8,,,float16,1.001595754623413,1066.0306,0.75,0.094,3073.833 +0.29,,none,2e-5,,,52883.0,36763342848.0,29521545216.0,mistralai/Mistral-7B-v0.1,2,,4,,,float16,0.9138528442382813,552.1771,1.449,0.181,2967.164 +,,none,2e-5,,,80969.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,float16,,,,, +,,none,2e-5,,,80925.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,float16,,,,, +,,none,2e-5,,,80969.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,float16,,,,, +,,none,2e-5,,,81003.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,float16,,,,, +,,none,2e-5,,,80987.0,,,NousResearch/Llama-2-70b-hf,1,,4,,,float16,,,,, +,,none,2e-5,,,80922.0,,,NousResearch/Llama-2-70b-hf,2,,2,,,float16,,,,, +,,none,2e-5,,,80987.0,,,NousResearch/Llama-2-70b-hf,1,,8,,,float16,,,,, +,,none,2e-5,,,80863.0,,,NousResearch/Llama-2-70b-hf,2,,4,,,float16,,,,, +0.15,,none,2e-4,16,0.1,28707.0,26109561344.0,15119705600.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8970945072174072,458.7158,0.872,0.218,3571.71 +0.15,,none,2e-4,16,0.1,17897.0,15458877440.0,7850448896.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8571704006195069,270.088,1.481,0.37,3033.086 +0.29,,none,2e-4,16,0.1,42171.0,37100825088.0,15120098816.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9901649284362793,913.5703,0.876,0.109,3586.807 +0.29,,none,2e-4,16,0.1,25659.0,22105014272.0,7850645504.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9025015163421631,482.2349,1.659,0.207,3397.514 +,,none,2e-4,16,0.1,80991.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,,none,2e-4,16,0.1,61532.0,57898183168.0,47311509504.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8681951332092285,551.3062,0.726,0.181,1485.926 +,,none,2e-4,16,0.1,80991.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.29,,none,2e-4,16,0.1,69436.0,65039245312.0,47311706112.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8880744457244873,924.9663,0.865,0.108,1771.308 +,,none,2e-4,16,0.1,80617.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, 
+,,none,2e-4,16,0.1,80756.0,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,,,,, +,,none,2e-4,16,0.1,80617.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,,none,2e-4,16,0.1,80851.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,True,baseline-peft-bnb,2e-4,16,0.1,25999.0,23228815360.0,5368450560.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8647766017913818,593.395,0.674,0.169,2761.062 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,12818.0,10431547904.0,2781601792.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8678814029693603,284.7643,1.405,0.351,2876.765 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,46121.0,41084491776.0,5368843776.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.868037691116333,1158.2474,0.691,0.086,2829.102 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,20421.0,17446783488.0,2781798400.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8695751667022705,502.2826,1.593,0.199,3261.909 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,47567.0,46825980416.0,25726455296.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8924774932861328,1171.0504,0.342,0.085,1399.086 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,25163.0,22356893696.0,13273817088.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8943204975128174,568.962,0.703,0.176,1439.815 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,69237.0,67906358784.0,25726848512.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8907253837585449,2126.1835,0.376,0.047,1541.165 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,32960.0,30165152256.0,13274013696.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.893255443572998,957.9628,0.835,0.104,1710.296 +,True,baseline-peft-bnb,2e-4,16,0.1,80123.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.14,True,baseline-peft-bnb,2e-4,16,0.1,52469.0,47591447040.0,19434999808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.0088242053985597,1955.5844,0.205,0.051,418.903 +,True,baseline-peft-bnb,2e-4,16,0.1,80581.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,True,baseline-peft-bnb,2e-4,16,0.1,80585.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,18907.0,15860617728.0,4843499008.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8644689750671387,482.8812,0.828,0.207,3392.967 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,12783.0,10431547904.0,2781601792.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8698421669006348,284.1914,1.408,0.352,2882.564 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,33331.0,26851881472.0,4843892224.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8686403369903565,948.8322,0.843,0.105,3453.508 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,20523.0,17446783488.0,2781798400.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8683876323699952,504.0477,1.587,0.198,3250.486 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,17449.0,14173894656.0,4843499008.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj 
o_proj,float16,0.8669318771362304,419.9549,0.952,0.238,3901.371 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,12699.0,10065463808.0,2727075840.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8524643421173096,225.0245,1.778,0.444,3640.493 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,28593.0,23504648192.0,4843892224.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8657933044433593,819.2575,0.976,0.122,3999.719 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,19860.0,16744106496.0,2727272448.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8490522384643555,420.7803,1.901,0.238,3893.719 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,37399.0,36828377600.0,25201503744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8931312561035156,925.5545,0.432,0.108,1770.182 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,25216.0,22359233024.0,13273817088.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.892439432144165,570.8031,0.701,0.175,1435.171 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,49913.0,48447599616.0,25201896960.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8924949169158936,1720.4669,0.465,0.058,1904.599 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,33214.0,30167236096.0,13274013696.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8910456848144531,961.4325,0.832,0.104,1704.124 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,36039.0,36153218048.0,25201503744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8932661628723144,855.0375,0.468,0.117,1916.173 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,25513.0,22008699904.0,13219291136.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8599378490447998,511.3077,0.782,0.196,1602.166 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,46959.0,47096648192.0,25201896960.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8929532051086426,1595.4842,0.501,0.063,2053.797 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,33064.0,29497270272.0,13219487744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8600027751922608,878.2625,0.911,0.114,1865.502 +0.14,True,accelerated-peft-bnb,2e-4,16,0.1,72701.0,69770819584.0,37347044864.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.0098001098632812,3656.7382,0.109,0.027,448.05 +0.14,True,accelerated-peft-bnb,2e-4,16,0.1,52469.0,47591447040.0,19434999808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.0093148803710938,1952.1407,0.205,0.051,419.642 +,True,accelerated-peft-bnb,2e-4,16,0.1,79377.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,True,accelerated-peft-bnb,2e-4,16,0.1,80837.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.1,71019.0,68424906752.0,37347044864.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.0100258159637452,3358.344,0.119,0.03,487.859 +0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.1,51461.0,46787975680.0,19172855808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9600833988189698,1747.5665,0.229,0.057,468.766 
+,True,accelerated-peft-bnb-foak,2e-4,16,0.1,80945.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-bnb-foak,2e-4,16,0.1,80967.0,72795019776.0,19173052416.0,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9324470138549805,3384.1355,0.236,0.03,484.141 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,19429.0,15890927104.0,4873808384.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9843631744384765,481.226,0.831,0.208,3404.637 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,12860.0,10079847936.0,2798148608.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9855545139312745,282.415,1.416,0.354,2900.695 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,33223.0,26882190848.0,4874201600.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9518059539794922,944.0475,0.847,0.106,3471.012 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,20472.0,16725984768.0,2798345216.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9537856483459473,497.4081,1.608,0.201,3293.875 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,17193.0,13632576512.0,4873808384.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9757871055603027,413.121,0.968,0.242,3965.908 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,12703.0,9780872704.0,2743622656.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9560029792785645,221.2793,1.808,0.452,3702.109 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,28977.0,22392753152.0,4874201600.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9514095497131347,805.8956,0.993,0.124,4066.035 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,19800.0,16157525504.0,2743819264.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9230777645111083,415.3379,1.926,0.241,3944.74 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,36387.0,35528691200.0,24511572480.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8990980052947998,885.7851,0.452,0.113,1849.659 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,23548.0,21067523584.0,12581313536.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8998581314086914,536.746,0.745,0.186,1526.234 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,48905.0,46519954944.0,24511965696.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8961446380615234,1669.0298,0.479,0.06,1963.296 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,30516.0,28187328512.0,12581510144.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8947424793243408,921.8778,0.868,0.108,1777.242 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,34731.0,34183981056.0,24511572480.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8992811870574952,814.994,0.491,0.123,2010.322 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,24177.0,20715718656.0,12526787584.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8655492782592773,475.158,0.842,0.21,1724.058 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,45901.0,43758690304.0,24511965696.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj 
v_proj o_proj,float16,0.895248155593872,1528.7913,0.523,0.065,2143.393 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,31452.0,27526991360.0,12526984192.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8628469562530517,835.9993,0.957,0.12,1959.81 +0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,71181.0,67237753856.0,36290374144.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9894431018829346,3599.2898,0.111,0.028,455.201 +0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,51115.0,45806148096.0,18387856384.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9900471115112305,1900.1037,0.211,0.053,431.134 +,True,accelerated-peft-autogptq,2e-4,16,0.1,79265.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-autogptq,2e-4,16,0.1,80813.0,71747131904.0,18388052992.0,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9895571708679199,3672.2631,0.218,0.027,446.155 +0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,69479.0,66160276480.0,36290374144.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9900266265869141,3283.8655,0.122,0.03,498.924 +0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,50518.0,45136894464.0,18125712384.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9589622497558594,1684.1824,0.238,0.059,486.408 +,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,80301.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,79950.0,70539958784.0,18125908992.0,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9595681858062745,3305.9445,0.242,0.03,495.592 diff --git a/scripts/benchmarks/refs/requirements.txt b/scripts/benchmarks/refs/requirements.txt new file mode 100644 index 00000000..7bfa45a4 --- /dev/null +++ b/scripts/benchmarks/refs/requirements.txt @@ -0,0 +1,78 @@ +accelerate==0.32.1 +aiohttp==3.9.5 +aiosignal==1.3.1 +async-timeout==4.0.3 +attrs==23.2.0 +bitsandbytes==0.43.1 +certifi==2024.7.4 +charset-normalizer==3.3.2 +datasets==2.20.0 +dill==0.3.8 +docstring_parser==0.16 +einops==0.8.0 +filelock==3.15.4 +fire==0.6.0 +flash-attn==2.5.9.post1 +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration&subdirectory=plugins/framework +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft +fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@85f32cb15019217ccc22156233f15d280d3f4690 +frozenlist==1.4.1 +fsspec==2024.5.0 +huggingface-hub==0.23.4 +idna==3.7 +Jinja2==3.1.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.5 +multiprocess==0.70.16 +networkx==3.3 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.19.3 +nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvtx-cu12==12.1.105 
+packaging==24.1
+pandas==2.2.2
+peft==0.11.1
+protobuf==5.27.2
+psutil==6.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+safetensors==0.4.3
+sentencepiece==0.2.0
+shtab==1.7.1
+simpleeval==0.9.13
+six==1.16.0
+sympy==1.13.0
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.15.2
+torch==2.2.2
+tqdm==4.66.4
+transformers==4.39.3
+triton==2.2.0
+trl==0.9.6
+typing_extensions==4.12.2
+tyro==0.8.5
+tzdata==2024.1
+urllib3==2.2.2
+xxhash==3.4.1
+yarl==1.9.4

From 61fe08c8df87ee81799eb097fb7e494ad91bdaf2 Mon Sep 17 00:00:00 2001
From: 1000850000 user
Date: Mon, 15 Jul 2024 03:17:22 +0000
Subject: [PATCH 20/20] Add comment for foak kernel

---
 .../fused_ops/unsloth_lora/gptq/triton/kernels.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
index efc3b41e..aa6a2c70 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
@@ -111,6 +111,7 @@ def dequant_kernel_248(
     # Dequantize
     # None if using local gptqpackage, official autogptq should have an offset value
+    # The Triton compiler throws a `NameError` for the `hasattr` builtin, so `getattr` is used instead
     if getattr(qzeros_ptr, "offset", None) is not None:
         zeros = zeros + qzeros_ptr.offset
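
A note on the pattern touched by this last hunk: the added comment records that the Triton compiler raises a `NameError` when `hasattr` is used, which is why the kernel probes the pointer object with `getattr` and a `None` default instead. Below is a minimal plain-Python sketch of that probe, outside of any Triton kernel; `QzerosPtr`, `adjust_zeros`, and the example `offset` value are hypothetical stand-ins for illustration and are not part of the patched code.

# Sketch only: mirrors the `getattr` probe from dequant_kernel_248 in plain Python.
# `QzerosPtr` is a made-up stand-in; the real object is supplied by Triton's launch machinery.
class QzerosPtr:
    def __init__(self, offset=None):
        if offset is not None:
            # only set the attribute when an offset actually exists
            self.offset = offset

def adjust_zeros(zeros, qzeros_ptr):
    # Same intent as `hasattr(qzeros_ptr, "offset")` (assuming `offset` is never
    # stored as None), but written with `getattr` plus a default, the form the
    # kernel uses because `hasattr` is rejected by the Triton compiler.
    if getattr(qzeros_ptr, "offset", None) is not None:
        return zeros + qzeros_ptr.offset
    return zeros

print(adjust_zeros(10, QzerosPtr()))          # -> 10, no offset attribute present
print(adjust_zeros(10, QzerosPtr(offset=3)))  # -> 13, offset applied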