From f602a80689e1840ade32f4f1be761b30421d23ac Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 15:58:06 +0000 Subject: [PATCH 01/20] added gptqmodel to plugin --- .../src/gptqmodel/__init__.py | 3 + .../src/gptqmodel/models/__init__.py | 10 + .../src/gptqmodel/models/_const.py | 35 + .../src/gptqmodel/models/auto.py | 113 ++ .../src/gptqmodel/models/base.py | 1080 +++++++++++++++++ .../src/gptqmodel/models/dbrx.py | 22 + .../src/gptqmodel/models/dbrx_converted.py | 66 + .../src/gptqmodel/models/gemma.py | 29 + .../src/gptqmodel/models/gpt_bigcode.py | 29 + .../src/gptqmodel/models/gpt_neox.py | 31 + .../src/gptqmodel/models/llama.py | 37 + .../src/gptqmodel/models/mistral.py | 29 + .../src/gptqmodel/models/mixtral.py | 55 + .../src/gptqmodel/nn_modules/__init__.py | 0 .../gptqmodel/nn_modules/qlinear/__init__.py | 43 + .../nn_modules/qlinear/qlinear_tritonv2.py | 206 ++++ .../nn_modules/triton_utils/__init__.py | 0 .../triton_utils/custom_autotune.py | 233 ++++ .../nn_modules/triton_utils/dequant.py | 153 +++ .../nn_modules/triton_utils/kernels.py | 476 ++++++++ .../nn_modules/triton_utils/mixin.py | 19 + .../src/gptqmodel/quantization/__init__.py | 4 + .../src/gptqmodel/quantization/config.py | 309 +++++ .../src/gptqmodel/quantization/gptq.py | 209 ++++ .../src/gptqmodel/quantization/quantizer.py | 142 +++ .../src/gptqmodel/utils/__init__.py | 1 + .../src/gptqmodel/utils/backend.py | 27 + .../src/gptqmodel/utils/data.py | 275 +++++ .../src/gptqmodel/utils/importer.py | 63 + .../src/gptqmodel/utils/model.py | 661 ++++++++++ .../src/gptqmodel/utils/peft.py | 153 +++ .../accelerated-peft/tests/test_q4_triton.py | 99 ++ plugins/accelerated-peft/tests/test_triton.py | 107 ++ 33 files changed, 4719 insertions(+) create mode 100644 plugins/accelerated-peft/src/gptqmodel/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/_const.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/auto.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/base.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/dbrx.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gemma.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/llama.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/mistral.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/models/mixtral.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py create mode 100644 
plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/config.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/__init__.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/backend.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/data.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/importer.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/model.py create mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/peft.py create mode 100644 plugins/accelerated-peft/tests/test_q4_triton.py create mode 100644 plugins/accelerated-peft/tests/test_triton.py diff --git a/plugins/accelerated-peft/src/gptqmodel/__init__.py b/plugins/accelerated-peft/src/gptqmodel/__init__.py new file mode 100644 index 00000000..2808d505 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/__init__.py @@ -0,0 +1,3 @@ +from .models import GPTQModel +from .quantization import BaseQuantizeConfig, QuantizeConfig +from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py new file mode 100644 index 00000000..5496b45e --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py @@ -0,0 +1,10 @@ +from .auto import MODEL_MAP, GPTQModel +from .base import BaseGPTQModel +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/gptqmodel/models/_const.py new file mode 100644 index 00000000..49484bee --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/_const.py @@ -0,0 +1,35 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from torch import device + +CPU = device("cpu") +CUDA_0 = device("cuda:0") + +SUPPORTED_MODELS = [ + "gpt_neox", + "gpt_bigcode", + "llama", + "mistral", + "mixtral", + "gemma", + "dbrx_converted", +] + +EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 + +EXPERT_INDEX_PLACEHOLDER = "{expert_index}" + + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/gptqmodel/models/auto.py new file mode 100644 index 00000000..d31c12e9 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/auto.py @@ -0,0 +1,113 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from typing import Dict, List, Optional, Union + +import torch + +from ..utils import Backend +from ..utils.model import check_and_get_model_type +from .base import BaseGPTQModel, QuantizeConfig +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ + +MODEL_MAP = { + "gpt_neox": GPTNeoXGPTQ, + "llama": LlamaGPTQ, + "gpt_bigcode": GPTBigCodeGPTQ, + "mistral": MistralGPTQ, + "mixtral": MixtralGPTQ, + "gemma": GemmaGPTQ, + "dbrx": DbrxGPTQ, + "dbrx_converted": DbrxConvertedGPTQ, +} + +at_least_one_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count())) + +if not at_least_one_cuda_v6: + raise EnvironmentError("GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`.") + + +class GPTQModel: + def __init__(self): + raise EnvironmentError( + "ModelGPTQ is not designed to be instantiated\n" + "use `ModelGPTQ.from_pretrained` to load pretrained model and prepare for quantization via `.quantize()`.\n" + "use `ModelGPTQ.from_quantized` to inference with post-quantized model." 
+ ) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + **model_init_kwargs, + ) -> BaseGPTQModel: + model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code) + return MODEL_MAP[model_type].from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + quantize_config=quantize_config, + max_memory=max_memory, + trust_remote_code=trust_remote_code, + **model_init_kwargs, + ) + + @classmethod + def from_quantized( + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + backend: Backend = Backend.AUTO, + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig | Dict] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + # verify weight files matches predefined hash during loading + # usage: hash_format:hash_value, example: md5:ugkdh232 + # supports all hashlib hash methods + verify_hash: Optional[Union[str, List[str]]] = None, + **kwargs, + ) -> BaseGPTQModel: + model_type = check_and_get_model_type(model_name_or_path, trust_remote_code) + quant_func = MODEL_MAP[model_type].from_quantized + + return quant_func( + model_name_or_path=model_name_or_path, + device_map=device_map, + max_memory=max_memory, + device=device, + backend=backend, + use_cuda_fp16=use_cuda_fp16, + quantize_config=quantize_config, + model_basename=model_basename, + use_safetensors=use_safetensors, + trust_remote_code=trust_remote_code, + warmup_triton=warmup_triton, + verify_hash=verify_hash, + **kwargs, + ) + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/base.py b/plugins/accelerated-peft/src/gptqmodel/models/base.py new file mode 100644 index 00000000..d1010a39 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/base.py @@ -0,0 +1,1080 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +import copy +import json +import logging +import os +import re +from os.path import isfile, join +from typing import Dict, List, Optional, Union + +import accelerate +import torch +import torch.nn as nn +import transformers +from accelerate.hooks import remove_hook_from_module +from safetensors.torch import save_file as safe_save +from tqdm import tqdm +from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel +from transformers.modeling_utils import no_init_weights, shard_checkpoint +from transformers.utils.generic import ContextManagers + +from ..quantization import GPTQ, QuantizeConfig +from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST) +from ..utils.backend import Backend +from ..utils.data import collate_data +from ..utils.importer import select_quant_linear +from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, + find_layers, get_checkpoints, get_device, get_module_by_name_prefix, + get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, + move_to, nested_move_to, pack_model, simple_dispatch_model, verify_model_hash, + verify_sharded_model_hashes) +from ._const import CPU, CUDA_0, SUPPORTED_MODELS + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.propagate = False +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class BaseGPTQModel(nn.Module): + # these modules are non-repeating and at the root level + # does not include the node which holds all the repeating layers + base_modules: List[str] = None + + # name of lm_head + lm_head: str = "lm_head" + + # repeating layers + # node holding all the repeating layers + layers_node: str = None + # repeating layer type + layer_type: str = None + # for each repeating layer there are multiple modules within each layer + layer_modules: List[List[str]] = None + + # some models require trust_remove_code = True (dbrx_converted) + require_trust_remote_code = None + + # TODO: use a better name and what if the value is not at the config root? 
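+ # (illustrative note: today the lookup is a flat getattr on model.config, e.g. `num_local_experts` for Mixtral)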
+ # allow dynamic expert n-count layer extraction + # so moe model defs do not need to write out 64 layers if expert size is 64 (Qwen2Moe) + # usage: set to property in model.config that holds this int value: total number of experts + dynamic_expert_index: Optional[str] = None + + # allow models to define optional notes that output messages to users that want to use this model + # list of supported keys: [ "notes" = print the notes value on model load ] + info: Dict[str, str] = {} + + def __init__( + self, + model: PreTrainedModel, + quantized: bool, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module = None, + ): + super().__init__() + + self.model = model + self.model_type = self.model.config.model_type + self._quantized = quantized + self.quantize_config = quantize_config + self.config = self.model.config + + # compat: state to assist in checkpoint_format gptq(v1) to gptq_v2 conversion + self.qlinear_kernel = qlinear_kernel + + @property + def quantized(self): + return self._quantized + + @property + def hf_device_map(self): + return getattr(self.model, "hf_device_map", None) + + def _prepare_dataset_for_quantization( + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + ): + def _convert_tensor_to_list(tensor): + if isinstance(tensor, torch.Tensor): + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(0) + tensor = tensor.long() + return tensor.cpu().numpy().tolist() + return [tensor] + + new_calibration_dataset = [] + for example in calibration_dataset: + input_ids = _convert_tensor_to_list(example["input_ids"]) + attention_mask = _convert_tensor_to_list(example["attention_mask"]) + if "labels" in example: + labels = _convert_tensor_to_list(example["labels"]) + elif "label" in example: + labels = _convert_tensor_to_list(example["label"]) + elif "label_ids" in example: + labels = _convert_tensor_to_list(example["label_ids"]) + else: + labels = copy.deepcopy(input_ids) + new_calibration_dataset.append( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } + ) + pad_token_id = self.config.pad_token_id + if not pad_token_id: + pad_token_id = self.config.eos_token_id + + if pad_token_id is None: + raise ValueError("Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`.") + + new_calibration_dataset_batched = [ + collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) + for start in range(0, len(new_calibration_dataset), batch_size) + ] + + for new_example in new_calibration_dataset_batched: + del new_example["labels"] + + return new_calibration_dataset_batched + + @torch.inference_mode() + def quantize( + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, + + # TODO: remove use_cuda_fp16 arg..why? doesn't pass smell test @ZX-ModelCloud + use_cuda_fp16: bool = True, + + autotune_warmup_after_quantized: bool = False, + calibration_enable_gpu_cache: bool = True, + ): + if self.quantized: + raise EnvironmentError("quantize() is called a model that is already quantized") + + if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST: + raise ValueError( + f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}" + ) + + + # TODO: lm_head quantization is yet ready but pending + if self.quantize_config.lm_head: + raise ValueError("lm_head quantization is currently inference only and not applicable for quantization. 
Please set `lm_head=False`.") + + if len(calibration_dataset) == 0: + raise ValueError("Calibration dataset must not be empty.") + + min_calibration_dataset_size = 256 + min_calibration_dataset_input_ids_avg_length = 256 + + if len(calibration_dataset) < min_calibration_dataset_size: + logger.warning(f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " + f"Current size: {len(calibration_dataset)}.") + + # Calculate the average length of the average input_ids + total_input_ids_length = 0 + for e in calibration_dataset: + input_ids_length = len(e["input_ids"]) + total_input_ids_length += input_ids_length + avg = total_input_ids_length / len(calibration_dataset) + + if avg < min_calibration_dataset_input_ids_avg_length: + logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " + f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}.") + + + device_map = self.hf_device_map + if device_map: + for name, device in device_map.items(): + if device == "cpu": + logger.info(f"truly offloading {name} to cpu with hook.") + module = get_module_by_name_suffix(self.model, name) + remove_hook_from_module(module, recurse=True) + accelerate.cpu_offload_with_hook(module, CUDA_0) + + layer_inputs = [] + attention_masks = [] + position_ids = [] + layer_input_kwargs = [] + layer_outputs = [] + + calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size) + + forward_pass_use_cache = self.model.config.use_cache + self.model.config.use_cache = False + + num_batches = len(calibration_dataset) + layers = get_module_by_name_prefix(self.model, self.layers_node) + + cur_layer_device = get_device(layers[0]) + data_device = cur_layer_device if calibration_enable_gpu_cache else CPU + + def store_input_hook(_, args, kwargs): + # Positional arguments. + layer_input = [] + for inp in args: + layer_input.append(move_to(inp, data_device)) + layer_inputs.append(layer_input) + + # Keyword arguments. 
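+ # capture attention_mask / position_ids and any remaining kwargs handed to the first
+ # decoder layer so they can be replayed batch-by-batch during per-layer quantization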
+ if kwargs["attention_mask"] is not None: + attention_masks.append(kwargs["attention_mask"].to(data_device)) + else: + attention_masks.append(None) + + pos_ids = kwargs.get("position_ids", None) + if pos_ids is not None: + position_ids.append(move_to(pos_ids, data_device)) + one_kwargs = {} + for (k, v) in kwargs.items(): # make sure other arguments also be captured + if k not in ["hidden_states", "attention_mask", "position_ids"]: + one_kwargs[k] = nested_move_to(v, data_device) + layer_input_kwargs.append(one_kwargs) + raise ValueError + + force_layer_back_to_cpu = False + if get_device(layers[0]) == CPU: + layers[0] = layers[0].to(CUDA_0) + force_layer_back_to_cpu = True + + ori_outside_layer_module_devices = {} + for module_name in self.base_modules: + module = get_module_by_name_prefix(self.model, module_name) + + if module is None: + continue + + ori_outside_layer_module_devices[module_name] = get_device(module) + if module is not None: + move_to(module, cur_layer_device) + + # TODO: make this optional, backporting https://github.com/huggingface/optimum/blob/main/optimum/gptq/quantizer.py + handle = layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True) + for example in calibration_dataset: + for k, v in example.items(): + if len(v.shape) == 1: + v = v.unsqueeze(0) + example[k] = move_to(v, cur_layer_device) + try: + self.model(**example) + except ValueError: + pass + handle.remove() + + move_to(layers[0], CPU if force_layer_back_to_cpu else cur_layer_device) + for module_name in self.base_modules: + module = get_module_by_name_prefix(self.model, module_name) + if module is not None: + move_to(module, ori_outside_layer_module_devices[module_name]) + + torch.cuda.empty_cache() + + layer_modules = self.layer_modules + + if not self.quantize_config.true_sequential: + layer_modules = [sum(layer_modules, [])] + + # dynamic expert layer index for model defs + if self.dynamic_expert_index is not None: + num_experts = getattr(self.model.config, self.dynamic_expert_index) + layer_modules = get_moe_layer_modules(layer_modules=self.layer_modules, + num_experts=num_experts) + + quantizers = {} + + # stores all per-layer quant stats such as avg loss and processing time + quant_log = [] + + layer_count = len(layers) + layer_pb = tqdm(range(layer_count)) + for i in layer_pb: + layer_pb.set_description(f"Quantizing layer {i + 1} of {layer_count}") + layer = layers[i] + force_layer_back_to_cpu = False + if get_device(layer) == CPU: + move_to(layer, CUDA_0) + force_layer_back_to_cpu = True + cur_layer_device = get_device(layer) + + full = find_layers(layer) + for names in layer_modules: + subset = {n: full[n] for n in names if n in full} + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer.configure( + self.quantize_config.bits, + perchannel=True, + sym=self.quantize_config.sym, + mse=False, + ) + + def add_batch(name): + def tmp(_, inp, out): + # gptq is mutable. 
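+ # every forward pass through this hook feeds the module's activations into the
+ # GPTQ statistics (Hessian accumulation) consumed by fasterquant() below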
+ gptq[name].add_batch(inp[0].data, out.data) # noqa: F821 + + return tmp + + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(num_batches): + layer_input = [] + for k, layer_inp in enumerate(layer_inputs[j]): + layer_input.append(move_to(layer_inp, cur_layer_device)) + + mask = attention_masks[j] + layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + + additional_layer_inputs = {"attention_mask": layer_attention_mask} + layer_position_ids = ( + None if not position_ids else move_to(position_ids[j], cur_layer_device) + ) + if layer_position_ids is not None: + additional_layer_inputs["position_ids"] = layer_position_ids + for k, v in layer_input_kwargs[j].items(): + additional_layer_inputs[k] = nested_move_to(v, cur_layer_device) + layer(*layer_input, **additional_layer_inputs) + for h in handles: + h.remove() + + for name in subset: + layer_pb.set_description(f"Quantizing {name} in layer {i + 1} of {layer_count}") + + try: + scale, zero, g_idx, duration, avg_loss = gptq[name].fasterquant( + percdamp=self.quantize_config.damp_percent, + group_size=self.quantize_config.group_size, + actorder=self.quantize_config.desc_act, + static_groups=self.quantize_config.static_groups, + ) + + stat = {"layer": i + 1, "module": name, "avg_loss": f"{avg_loss:.4f}", + "time": f"{duration:.4f}"} + + quant_log.append(stat) + logger.info(stat) + + except torch._C._LinAlgError as e: + if "not positive-definite" in str(e).lower(): + logger.warning( + "Please increase damp or nsamples for calibration data to avoid the following quant error. " + ) + raise e + + quantizers[f"{self.layers_node}.{i}.{name}"] = ( + gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(scale, CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(zero, CPU if force_layer_back_to_cpu else cur_layer_device), + move_to(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device), + ) + gptq[name].free() + + for j in range(num_batches): + layer_input = [] + for k, layer_inp in enumerate(layer_inputs[j]): + layer_input.append(move_to(layer_inp, cur_layer_device)) + + mask = attention_masks[j] + layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + + additional_layer_inputs = {"attention_mask": layer_attention_mask} + layer_position_ids = None if not position_ids else move_to(position_ids[j], cur_layer_device) + if layer_position_ids is not None: + additional_layer_inputs["position_ids"] = layer_position_ids + for k, v in layer_input_kwargs[j].items(): + additional_layer_inputs[k] = nested_move_to(v, cur_layer_device) + layer_output = move_to( + layer(*layer_input, **additional_layer_inputs)[0], + cur_layer_device if calibration_enable_gpu_cache else CPU, + ) + layer_outputs.append([layer_output]) + + layers[i] = move_to(layer, CPU if force_layer_back_to_cpu else cur_layer_device) + del layer + del gptq + del layer_inputs + layer_inputs, layer_outputs = ( + layer_outputs, + [], + ) # TODO: is it really OK to cache only the first positional argument? 
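+ # the cached outputs of layer i become the inputs of layer i+1, so quantization error is
+ # propagated through the already-quantized layers instead of re-running the full model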
+ torch.cuda.empty_cache() + + logger.info(f"Quantization summary:\n{quant_log}") + for module_log in quant_log: + logger.info(module_log) + + self.qlinear_kernel = pack_model( + model=self.model, + quantizers=quantizers, + bits=self.quantize_config.bits, + group_size=self.quantize_config.group_size, + backend=Backend.AUTO, + use_cuda_fp16=use_cuda_fp16, + desc_act=self.quantize_config.desc_act, + warmup_triton=autotune_warmup_after_quantized, + force_layer_back_to_cpu=force_layer_back_to_cpu, + format=self.quantize_config.format, + ) + if device_map: + self.model = remove_hook_from_module(self.model, recurse=True) + self.model = simple_dispatch_model(self.model, device_map) + self.model.config.use_cache = forward_pass_use_cache + + self._quantized = True + + torch.cuda.empty_cache() + + return quant_log + + @property + def device(self): + if not self.hf_device_map: + return self.model.device + else: + device = [d for d in self.hf_device_map.values() if d not in {"disk"}][0] + return torch.device(device) + + def to(self, device: Union[str, torch.device]): + self.model.to(device) + return self + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def generate(self, **kwargs): + """shortcut for model.generate""" + with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type): + return self.model.generate(**kwargs) + + def prepare_inputs_for_generation(self, *args, **kwargs): + """shortcut for model.prepare_inputs_for_generation""" + return self.model.prepare_inputs_for_generation(*args, **kwargs) + + def save_quantized( + self, + save_dir: str, + safetensors_metadata: Optional[Dict[str, str]] = None, + use_safetensors: bool = True, + max_shard_size: Optional[str] = None, + model_base_name: Optional[str] = None + ): + """save quantized model and configs to local disk""" + os.makedirs(save_dir, exist_ok=True) + + # write autogptq tooling fingerprint to config + self.quantize_config.meta_set_versionable( + key=META_FIELD_QUANTIZER, + value=META_QUANTIZER_GPTQMODEL, + version=__version__, + ) + + # The config, quantize_config and model may be edited in place in save_quantized. + config = copy.deepcopy(self.model.config) + quantize_config = copy.deepcopy(self.quantize_config) + model = self.model + + if not self.quantized: + raise ValueError("Save aborted as model is not quantized. Please call `quantize()` first.") + + if model_base_name is None: + model_base_name = ( + self.quantize_config.model_file_base_name or + f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + ) + + if quantize_config.format == FORMAT.GPTQ_V2: + logger.warning( + f"Using 'format = {FORMAT.GPTQ_V2}': the serialized model is only supported by GPTQModel version >= {MIN_VERSION_WITH_V2}." + ) + + # internal is always gptq v2 but allow users to pass gptq (v1) via config + if quantize_config.format == FORMAT.GPTQ: + # Model qzeros may be edited in place. 
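+ # note: weights are held internally in gptq_v2 layout; convert_gptq_v2_to_v1_format below
+ # rewrites qzeros so the saved checkpoint matches the original gptq (v1) layout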
+ # TODO: avoid inplace modification of the weights + # fix ModelCloud/GPTQModel/issues/47 + # fix gptqmodel_cuda cannot be serialized + # no need to set it back, no calculation below + if quantize_config.bits != 4: + cuda_name_modules = {} + from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + for name, module in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear): + cuda_name_modules[name] = module.gptqmodel_cuda + module.gptqmodel_cuda = None + model = copy.deepcopy(self.model) + + for name, modules in model.named_modules(): + if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + module.gptqmodel_cuda = cuda_name_modules[name] + + del cuda_name_modules + else: + model = copy.deepcopy(self.model) + model = convert_gptq_v2_to_v1_format( + model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel + ) + + model.to(CPU) + + state_dict = model.state_dict() + + if quantize_config.model_file_base_name is None: + if use_safetensors: + model_base_name = "model" + else: + model_base_name = "pytorch_model" + else: + model_base_name = quantize_config.model_file_base_name + + if use_safetensors: + state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()} + model_save_name = model_base_name + ".safetensors" + else: + model_save_name = model_base_name + ".bin" + + if not self.qlinear_kernel.SUPPORTED_SHARDS and max_shard_size is not None: + logger.warning("Sharding is not supported for this quant. Disabling sharding.") + max_shard_size = None + + if max_shard_size is None: + if use_safetensors: + if safetensors_metadata is None: + safetensors_metadata = {} + elif not isinstance(safetensors_metadata, dict): + raise TypeError("safetensors_metadata must be a dictionary.") + else: + logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + new_safetensors_metadata = {} + converted_keys = False + for key, value in safetensors_metadata.items(): + if not isinstance(key, str) or not isinstance(value, str): + converted_keys = True + try: + new_key = str(key) + new_value = str(value) + except Exception as e: + raise TypeError( + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" + ) + if new_key in new_safetensors_metadata: + logger.warning( + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." + ) + new_safetensors_metadata[new_key] = new_value + safetensors_metadata = new_safetensors_metadata + if converted_keys: + logger.debug( + f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" + ) + + # Format is required to enable Accelerate to load the metadata + # otherwise it raises an OSError + safetensors_metadata["format"] = "pt" + safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata) + else: + logger.warning( + "We highly suggest saving quantized model using safetensors format for security reasons. 
Please set `use_safetensors=True` whenever possible.") + torch.save(model.state_dict(), join(save_dir, model_save_name)) + else: + # Shard checkpoint + shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) + + # Clean the folder from a previous save + for filename in os.listdir(save_dir): + full_filename = join(save_dir, filename) + + # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 + filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "") + reg = re.compile(r"(.*?)-\d{5}-of-\d{5}") + + if ( + filename.startswith(model_base_name) + and isfile(full_filename) + and filename not in shards.keys() + and reg.fullmatch(filename_no_suffix) is not None + ): + os.remove(full_filename) + + # Save the model + for shard_file, shard in shards.items(): + if use_safetensors: + if safetensors_metadata is None: + safetensors_metadata = {} + elif not isinstance(safetensors_metadata, dict): + raise TypeError("safetensors_metadata must be a dictionary.") + else: + logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + new_safetensors_metadata = {} + converted_keys = False + for key, value in safetensors_metadata.items(): + if not isinstance(key, str) or not isinstance(value, str): + converted_keys = True + try: + new_key = str(key) + new_value = str(value) + except Exception as e: + raise TypeError( + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") + if new_key in new_safetensors_metadata: + logger.warning( + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") + new_safetensors_metadata[new_key] = new_value + safetensors_metadata = new_safetensors_metadata + if converted_keys: + logger.debug( + f"One or more safetensors_metadata keys or values had to be converted to str(). 
Final safetensors_metadata: {safetensors_metadata}") + + # Format is required to enable Accelerate to load the metadata + # otherwise it raises an OSError + safetensors_metadata["format"] = "pt" + + safe_save(shard, join(save_dir, shard_file), safetensors_metadata) + else: + torch.save(shard, join(save_dir, shard_file)) + + if index is not None: + index_save_name = model_save_name + ".index.json" + index_save_path = join(save_dir, index_save_name) + # Save the index as well + with open(index_save_path, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + config.quantization_config = quantize_config.to_dict() + config.save_pretrained(save_dir) + + quantize_config.model_name_or_path = save_dir + quantize_config.model_file_base_name = model_base_name + quantize_config.save_pretrained(save_dir) + + def save_pretrained( + self, + save_dir: str, + **kwargs, + ): + logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + self.save_quantized(save_dir=save_dir, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + quantize_config: QuantizeConfig, + max_memory: Optional[dict] = None, + trust_remote_code: bool = False, + torch_dtype: [str | torch.dtype] = "auto", + **model_init_kwargs, + ): + """load un-quantized pretrained model to cpu""" + + if not torch.cuda.is_available(): + raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.") + + if cls.require_trust_remote_code and not trust_remote_code: + raise ValueError( + f"{pretrained_model_name_or_path} requires trust_remote_code=True. Please set trust_remote_code=True to load this model." + ) + + # allow models to define optional notes that output messages to users that want to use this model + notes = cls.info.get("notes") + if notes: + logger.info(notes) + + def skip(*args, **kwargs): + pass + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + + model_init_kwargs["trust_remote_code"] = trust_remote_code + + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + + if torch_dtype == "auto": + torch_dtype = auto_dtype_from_config(config) + elif not isinstance(torch_dtype, torch.dtype): + raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + + # enforce some values despite user specified + model_init_kwargs["torch_dtype"] = torch_dtype + + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + + if max_memory: + if "disk" in max_memory: + raise NotImplementedError("disk offload not support yet.") + with accelerate.init_empty_weights(): + model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + model.tie_weights() + + max_memory = accelerate.utils.get_balanced_memory( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + dtype=model_init_kwargs["torch_dtype"], + low_zero=False, + ) + model_init_kwargs["device_map"] = accelerate.infer_auto_device_map( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + dtype=model_init_kwargs["torch_dtype"], + ) + del model + else: + model_init_kwargs["device_map"] = None + + torch.cuda.empty_cache() + + model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + + model_config = model.config.to_dict() + seq_len_keys = ["max_position_embeddings", "seq_length", 
"n_positions"] + if any(k in model_config for k in seq_len_keys): + for key in seq_len_keys: + if key in model_config: + model.seqlen = model_config[key] + break + else: + logger.warning("can't get model's sequence length from model config, will set to 4096.") + model.seqlen = 4096 + model.eval() + + return cls(model, quantized=False, quantize_config=quantize_config) + + @classmethod + def from_quantized( + cls, + model_name_or_path: Optional[str], + device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, + max_memory: Optional[dict] = None, + device: Optional[Union[str, int]] = None, + + backend: Backend = Backend.AUTO, + + torch_dtype: [str | torch.dtype] = "auto", + use_cuda_fp16: bool = True, + quantize_config: Optional[QuantizeConfig] = None, + model_basename: Optional[str] = None, + use_safetensors: bool = True, + trust_remote_code: bool = False, + warmup_triton: bool = False, + format: Optional[FORMAT] = None, + allow_unsafe_loading: bool = False, + verify_hash: Optional[Union[str, List[str]]] = None, + **kwargs, + ): + """load quantized model from local disk""" + if cls.require_trust_remote_code and not trust_remote_code: + raise ValueError( + f"{model_name_or_path} requires trust_remote_code=True. Please set trust_remote_code=True to load this model." + ) + + # Parameters related to loading from Hugging Face Hub + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "resume_download": resume_download, + "local_files_only": local_files_only, + "use_auth_token": use_auth_token, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + + # == step1: prepare configs and file names == # + config: PretrainedConfig = AutoConfig.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + **cached_file_kwargs, + ) + + if torch_dtype == "auto": + torch_dtype = auto_dtype_from_config(config, quant_inference=True) + elif not isinstance(torch_dtype, torch.dtype): + raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + + if quantize_config is None: + quantize_config = QuantizeConfig.from_pretrained( + model_name_or_path, format=format, **cached_file_kwargs, **kwargs + ) + else: + if not isinstance(quantize_config, QuantizeConfig): + quantize_config = QuantizeConfig.from_quant_config(quantize_config, format) + + + if model_basename is None: + if quantize_config.model_file_base_name: + possible_model_basenames = [quantize_config.model_file_base_name] + else: + possible_model_basenames = [ + f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g", + "model", + ] + else: + possible_model_basenames = [model_basename] + + quantize_config.model_name_or_path = model_name_or_path + + extensions = [] + if use_safetensors: + extensions.append(".safetensors") + else: + extensions += [".bin", ".pt"] + + model_name_or_path = 
str(model_name_or_path) + + # Retrieve (and if necessary download) the quantized checkpoint(s). + is_sharded, resolved_archive_file, true_model_basename = get_checkpoints( + model_name_or_path=model_name_or_path, + extensions=extensions, + possible_model_basenames=possible_model_basenames, + **cached_file_kwargs, + ) + + # bin files have security issues: disable loading by default + if ".bin" in resolved_archive_file: + if allow_unsafe_loading: + logger.warning( + "There are security risks when loading tensors from .bin files. Make sure you are loading model only from a trusted source." + ) + else: + raise ValueError( + "Loading of unsafe .bin files are not allowed by default. Pass allow_unsafe_loading=True to bypass." + ) + + quantize_config.model_file_base_name = true_model_basename + + model_save_name = resolved_archive_file # In case a model is sharded, this would be `model.safetensors.index.json` which may later break. + if verify_hash: + if is_sharded: + verfieid = verify_sharded_model_hashes(model_save_name, verify_hash) + else: + verfieid = verify_model_hash(model_save_name, verify_hash) + if not verfieid: + raise ValueError(f"Hash verification failed for {model_save_name}") + logger.info(f"Hash verification succeeded for {model_save_name}") + # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # + def skip(*args, **kwargs): + pass + + if torch_dtype != torch.float16: + logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.") + use_cuda_fp16 = False + + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + + transformers.modeling_utils._init_weights = False + + init_contexts = [no_init_weights()] + + with ContextManagers(init_contexts): + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype + ) + + if cls.dynamic_expert_index is not None: + num_experts = getattr(config, cls.dynamic_expert_index) + cls.layer_modules = get_moe_layer_modules(layer_modules=cls.layer_modules, + num_experts=num_experts) + + layers = find_layers(model) + ignore_layers = [cls.lm_head] + cls.base_modules + + for name in list(layers.keys()): + # allow loading of quantized lm_head + if quantize_config.lm_head and name == cls.lm_head: + continue + + if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all( + not name.endswith(ignore_layer) for sublist in cls.layer_modules for ignore_layer in sublist + ): + # log non-lm-head quantizerd layers only + if name is not cls.lm_head: + logger.info(f"The layer {name} is not quantized.") + del layers[name] + + preload_qlinear_kernel = make_quant( + model, + layers, + quantize_config.bits, + quantize_config.group_size, + backend=backend, + format=quantize_config.format, + use_cuda_fp16=use_cuda_fp16, + desc_act=quantize_config.desc_act, + ) + model.tie_weights() + + # == step3: load checkpoint and dispatch == # + if isinstance(device_map, str) and device_map not in [ + "auto", + "balanced", + "balanced_low_0", + "sequential", + ]: + raise ValueError( + "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or " + "'sequential'." 
+ ) + if isinstance(device_map, dict): + max_memory = None + else: + if device is None and not device_map and not max_memory: + device_map = "auto" + if device is not None: + device = torch.device(device) + if not max_memory and not device_map: + device_map = {"": device.index if device.type == "cuda" else device.type} + if not isinstance(device_map, dict) and device_map != "sequential": + max_memory = accelerate.utils.get_balanced_memory( + model=model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + low_zero=(device_map == "balanced_low_0"), + ) + if not isinstance(device_map, dict): + device_map = accelerate.infer_auto_device_map( + model, + max_memory=max_memory, + no_split_module_classes=[cls.layer_type], + ) + + load_checkpoint_in_model = False + # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format + if quantize_config.format == FORMAT.GPTQ: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, + # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, + offload_state_dict=True, + offload_buffers=True, + ) + # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase + if not quantize_config.sym and not quantize_config.is_quantized_or_packed_by_v2(): + raise ValueError( + f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" + ) + + logger.info( + f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`.") + model = convert_gptq_v1_to_v2_format( + model, + quantize_config=quantize_config, + qlinear_kernel=preload_qlinear_kernel, + ) + load_checkpoint_in_model = True + quantize_config.format = FORMAT.GPTQ_V2 + + if not load_checkpoint_in_model and backend == Backend.TRITON: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, + offload_state_dict=True, + offload_buffers=True, + ) + + # TODO: Why are we using this custom function and not dispatch_model? + model = simple_dispatch_model(model, device_map) + + qlinear_kernel = select_quant_linear( + bits=quantize_config.bits, + group_size=quantize_config.group_size, + desc_act=quantize_config.desc_act, + sym=quantize_config.sym, + backend=backend, + format=quantize_config.format, + ) + + # == step4: set seqlen == # + model_config = model.config.to_dict() + seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] + if any(k in model_config for k in seq_len_keys): + for key in seq_len_keys: + if key in model_config: + model.seqlen = model_config[key] + break + else: + logger.warning("can't get model's sequence length from model config, will set to 4096.") + model.seqlen = 4096 + + # Any post-initialization that require device information, for example buffers initialization on device. 
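+ # (e.g. kernel scratch buffers whose size and placement depend on the final device map and on desc_act)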
+ model = gptqmodel_post_init(model, use_act_order=quantize_config.desc_act) + + model.eval() + + # == step6: (optional) warmup triton == # + if backend != Backend.TRITON and warmup_triton: + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + + QuantLinear.warmup(model, seqlen=model.seqlen) + + return cls( + model, + quantized=True, + quantize_config=quantize_config, + qlinear_kernel=qlinear_kernel, + ) + + def warmup_triton(self, enabled: bool = True): + if not enabled: + return + + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + + QuantLinear.warmup(self.model, seqlen=self.model.seqlen) + + def __getattr__(self, item): + try: + return super().__getattr__(item) + except Exception: + return getattr(self.model, item) + + +__all__ = ["BaseGPTQModel"] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py new file mode 100644 index 00000000..fb758032 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py @@ -0,0 +1,22 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +# placer=holder only as dbrx original models are not supported +# supported dbrx_converted models can be found on https://hf.co/ModelCloud +class DbrxGPTQ(BaseGPTQModel): + info = {"notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`"} diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py new file mode 100644 index 00000000..35a33170 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py @@ -0,0 +1,66 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class DbrxConvertedGPTQ(BaseGPTQModel): + # dbrx_converted requires custom model code + require_trust_remote_code = True + + base_modules = ["transformer.wte", "transformer.norm_f"] + + layers_node = "transformer.blocks" + layer_type = "DbrxBlock" + layer_modules = [ + ["norm_attn_norm.attn.q_proj", "norm_attn_norm.attn.k_proj", "norm_attn_norm.attn.v_proj"], + ["norm_attn_norm.attn.out_proj"], + [ + "ffn.experts.mlp.0.w1", "ffn.experts.mlp.0.v1", + "ffn.experts.mlp.1.w1", "ffn.experts.mlp.1.v1", + "ffn.experts.mlp.2.w1", "ffn.experts.mlp.2.v1", + "ffn.experts.mlp.3.w1", "ffn.experts.mlp.3.v1", + "ffn.experts.mlp.4.w1", "ffn.experts.mlp.4.v1", + "ffn.experts.mlp.5.w1", "ffn.experts.mlp.5.v1", + "ffn.experts.mlp.6.w1", "ffn.experts.mlp.6.v1", + "ffn.experts.mlp.7.w1", "ffn.experts.mlp.7.v1", + "ffn.experts.mlp.8.w1", "ffn.experts.mlp.8.v1", + "ffn.experts.mlp.9.w1", "ffn.experts.mlp.9.v1", + "ffn.experts.mlp.10.w1", "ffn.experts.mlp.10.v1", + "ffn.experts.mlp.11.w1", "ffn.experts.mlp.11.v1", + "ffn.experts.mlp.12.w1", "ffn.experts.mlp.12.v1", + "ffn.experts.mlp.13.w1", "ffn.experts.mlp.13.v1", + "ffn.experts.mlp.14.w1", "ffn.experts.mlp.14.v1", + "ffn.experts.mlp.15.w1", "ffn.experts.mlp.15.v1", + ], + [ + "ffn.experts.mlp.0.w2", + "ffn.experts.mlp.1.w2", + "ffn.experts.mlp.2.w2", + "ffn.experts.mlp.3.w2", + "ffn.experts.mlp.4.w2", + "ffn.experts.mlp.5.w2", + "ffn.experts.mlp.6.w2", + "ffn.experts.mlp.7.w2", + "ffn.experts.mlp.8.w2", + "ffn.experts.mlp.9.w2", + "ffn.experts.mlp.10.w2", + "ffn.experts.mlp.11.w2", + "ffn.experts.mlp.12.w2", + "ffn.experts.mlp.13.w2", + "ffn.experts.mlp.14.w2", + "ffn.experts.mlp.15.w2", + ] + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/gptqmodel/models/gemma.py new file mode 100644 index 00000000..e6191904 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gemma.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class GemmaGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "GemmaDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py new file mode 100644 index 00000000..0a15062e --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class GPTBigCodeGPTQ(BaseGPTQModel): + base_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"] + + layers_node = "transformer.h" + layer_type = "GPTBigCodeBlock" + layer_modules = [ + ["attn.c_attn"], + ["attn.c_proj"], + ["mlp.c_fc"], + ["mlp.c_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py new file mode 100644 index 00000000..81e8e401 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py @@ -0,0 +1,31 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class GPTNeoXGPTQ(BaseGPTQModel): + base_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"] + lm_head = "embed_out" + + layers_node = "gpt_neox.layers" + layer_type = "GPTNeoXLayer" + layer_modules = [ + ["attention.query_key_value"], + ["attention.dense"], + ["mlp.dense_h_to_4h"], + ["mlp.dense_4h_to_h"], + ] + diff --git a/plugins/accelerated-peft/src/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/gptqmodel/models/llama.py new file mode 100644 index 00000000..19930ef6 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/llama.py @@ -0,0 +1,37 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class LlamaGPTQ(BaseGPTQModel): + # Non-repeating layers at the root level: same level as `layers_node` + # Excluding `layers_node`. + base_modules = ["model.embed_tokens", "model.norm"] + + # Below describes all the repeating layers in this transformer model + # `model.layers` is a node/module that hold all the repeating layers. The parent node for all n-layers. + layers_node = "model.layers" + # Each repeating layer in `model.layers` is of type `LlamaDecoderLayer` + layer_type = "LlamaDecoderLayer" + # Inside each `LlamaDecoderLayer` layer are many internal modules + # List them in the order executed in model forward() code + # Many models have same execution order of: attention (q_k_v) projection, attention (output) projection, mlp (n) projections + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/gptqmodel/models/mistral.py new file mode 100644 index 00000000..aa1748eb --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/mistral.py @@ -0,0 +1,29 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .base import BaseGPTQModel + + +class MistralGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "MistralDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + ["mlp.up_proj", "mlp.gate_proj"], + ["mlp.down_proj"], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py new file mode 100644 index 00000000..acb4f640 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py @@ -0,0 +1,55 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .base import BaseGPTQModel + + +class MixtralGPTQ(BaseGPTQModel): + base_modules = ["model.embed_tokens", "model.norm"] + + layers_node = "model.layers" + layer_type = "MixtralDecoderLayer" + layer_modules = [ + ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"], + ["self_attn.o_proj"], + [ + "block_sparse_moe.experts.0.w1", + "block_sparse_moe.experts.1.w1", + "block_sparse_moe.experts.2.w1", + "block_sparse_moe.experts.3.w1", + "block_sparse_moe.experts.4.w1", + "block_sparse_moe.experts.5.w1", + "block_sparse_moe.experts.6.w1", + "block_sparse_moe.experts.7.w1", + "block_sparse_moe.experts.0.w3", + "block_sparse_moe.experts.1.w3", + "block_sparse_moe.experts.2.w3", + "block_sparse_moe.experts.3.w3", + "block_sparse_moe.experts.4.w3", + "block_sparse_moe.experts.5.w3", + "block_sparse_moe.experts.6.w3", + "block_sparse_moe.experts.7.w3", + ], + [ + "block_sparse_moe.experts.0.w2", + "block_sparse_moe.experts.1.w2", + "block_sparse_moe.experts.2.w2", + "block_sparse_moe.experts.3.w2", + "block_sparse_moe.experts.4.w2", + "block_sparse_moe.experts.5.w2", + "block_sparse_moe.experts.6.w2", + "block_sparse_moe.experts.7.w2", + ], + ] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py new file mode 100644 index 00000000..ff590d3c --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py @@ -0,0 +1,43 @@ +import torch.nn as nn + + +class BaseQuantLinear(nn.Module): + # override me + QUANT_TYPE = "base" + + SUPPORTED_BITS = [] + SUPPORTED_GROUP_SIZE = [] + SUPPORTED_DESC_ACT = [True, False] + SUPPORTED_SYM = [True, False] + SUPPORTED_SHARDS: bool = True + + @classmethod + def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, raise_error: bool = True) -> bool: + validate = True + err = "" 
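+        # NOTE: an empty SUPPORTED_* list is treated as "no restriction"; only a
+        # non-empty list constrains the corresponding quantization setting below.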
+ if cls.SUPPORTED_BITS and bits not in cls.SUPPORTED_BITS: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_BITS}` bits: actual bits = `{bits}`" + elif cls.SUPPORTED_GROUP_SIZE and group_size not in cls.SUPPORTED_GROUP_SIZE: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_GROUP_SIZE}` group_size: actual group_size = `{group_size}`" + elif cls.SUPPORTED_SYM and sym not in cls.SUPPORTED_SYM: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_SYM}` bits: actual sym = `{sym}`" + elif cls.SUPPORTED_DESC_ACT and desc_act not in cls.SUPPORTED_DESC_ACT: + validate = False + err = f"{cls.QUANT_TYPE} only supports `{cls.SUPPORTED_DESC_ACT}` bits: actual desc_act = `{desc_act}`" + + if not validate and raise_error: + raise NotImplementedError(err) + + return validate + + # override me + def post_init(self): + pass + + +class BaseCudaQuantLinear(BaseQuantLinear): + # override me + QUANT_TYPE = "base-cuda" diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py new file mode 100644 index 00000000..a0ab76c5 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py @@ -0,0 +1,206 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import math +from logging import getLogger + +import numpy as np +import torch +import torch.nn as nn +import transformers + +from ..triton_utils.dequant import QuantLinearFunction, quant_matmul_248 +from ..triton_utils.mixin import TritonModuleMixin +from . import BaseQuantLinear + +logger = getLogger(__name__) + + +class QuantLinear(BaseQuantLinear, TritonModuleMixin): + """ + Triton v2 quantized linear layer. 
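+    Weights stay packed in the int32 `qweight`/`qzeros` buffers alongside float16
+    `scales` and an int32 `g_idx` group map; `forward` dequantizes them on the fly.
+    Illustrative usage (a sketch; sizes are arbitrary and the packed buffers are
+    assumed to be already populated, e.g. via `pack()` or a loaded checkpoint):
+
+        layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=4096, bias=False).cuda()
+        y = layer(x)  # x: (batch, 4096) float16 tensor on the same CUDA device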
+ + Calls dequant kernel (see triton_utils/dequant) to dequantize the weights then uses + torch.matmul to compute the output whereas original `triton` quantized linear layer fused + dequant and matmul into single kernel.add() + """ + + QUANT_TYPE = "tritonv2" + + def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): + super().__init__() + if bits not in [2, 4, 8]: + raise NotImplementedError("Only 2,4,8 bits are supported.") + if infeatures % 32 != 0 or outfeatures % 32 != 0: + raise NotImplementedError("in_feature and out_feature must be divisible by 32.") + self.infeatures = infeatures + self.outfeatures = outfeatures + self.bits = bits + self.group_size = group_size if group_size != -1 else infeatures + self.maxq = 2**self.bits - 1 + + self.register_buffer( + "qweight", + torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32), + ) + self.register_buffer( + "qzeros", + torch.zeros( + ( + math.ceil(infeatures / self.group_size), + outfeatures // 32 * self.bits, + ), + dtype=torch.int32, + ), + ) + self.register_buffer( + "scales", + torch.zeros( + (math.ceil(infeatures / self.group_size), outfeatures), + dtype=torch.float16, + ), + ) + self.register_buffer( + "g_idx", + torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), + ) + if bias: + self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) + else: + self.bias = None + + def post_init(self): + pass + + def pack(self, linear, scales, zeros, g_idx=None): + W = linear.weight.data.clone() + if isinstance(linear, nn.Conv2d): + W = W.flatten(1) + if isinstance(linear, transformers.pytorch_utils.Conv1D): + W = W.t() + + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[ + :, None + ] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + + i = 0 + row = 0 + qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32) + while row < qweight.shape[0]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2, 4, 8]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + quant_linear_fn = QuantLinearFunction + + out = quant_linear_fn.apply( + x.reshape(-1, x.shape[-1]), + self.qweight, + self.scales, + self.qzeros, + self.g_idx, + self.bits, + self.maxq, + ) + out = 
out.half().reshape(out_shape) + out = out + self.bias if self.bias is not None else out + return out + + @classmethod + def warmup(cls, model, transpose=False, seqlen=2048): + """ + Pre-tunes the quantized kernel + """ + from tqdm import tqdm + + kn_values = {} + + for _, m in model.named_modules(): + if not isinstance(m, cls): + continue + + k = m.infeatures + n = m.outfeatures + + if (k, n) not in kn_values: + kn_values[(k, n)] = ( + m.qweight, + m.scales, + m.qzeros, + m.g_idx, + m.bits, + m.maxq, + ) + + logger.info(f"Found {len(kn_values)} unique KN Linear values.") + logger.info("Warming up autotune cache ...") + with torch.no_grad(): + for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)): + m = 2**m + for (k, n), ( + qweight, + scales, + qzeros, + g_idx, + bits, + maxq, + ) in kn_values.items(): + a = torch.randn(m, k, dtype=torch.float16, device=model.device) + quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq) + del kn_values + + +__all__ = ["QuantLinear"] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py new file mode 100644 index 00000000..fed33846 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -0,0 +1,233 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import builtins +import math +import time +from typing import Dict + +import triton + +# code based https://github.com/fpgaminer/GPTQ-triton +""" +Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. 
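+Other differences: benchmark timings are cached per autotune key, keys can optionally be
+rounded to the nearest power of two (nearest_power_of_two=True) to reduce re-tuning, and
+configurations that run out of resources are scored as infinity rather than raising.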
+""" + + +class CustomizedTritonAutoTuner(triton.KernelInterface): + def __init__( + self, + fn, + arg_names, + configs, + key, + reset_to_zero, + prune_configs_by: Dict = None, + nearest_power_of_two: bool = False, + ): + if not configs: + self.configs = [triton.Config({}, num_warps=4, num_stages=2)] + else: + self.configs = configs + self.key_idx = [arg_names.index(k) for k in key] + self.nearest_power_of_two = nearest_power_of_two + self.cache = {} + # hook to reset all required tensor to zeros before relaunching a kernel + self.hook = lambda args: 0 + if reset_to_zero is not None: + self.reset_idx = [arg_names.index(k) for k in reset_to_zero] + + def _hook(args): + for i in self.reset_idx: + args[i].zero_() + + self.hook = _hook + self.arg_names = arg_names + # prune configs + if prune_configs_by: + perf_model, top_k = ( + prune_configs_by["perf_model"], + prune_configs_by["top_k"], + ) + if "early_config_prune" in prune_configs_by: + early_config_prune = prune_configs_by["early_config_prune"] + else: + perf_model, top_k, early_config_prune = None, None, None + self.perf_model, self.configs_top_k = perf_model, top_k + self.early_config_prune = early_config_prune + self.fn = fn + + def _bench(self, *args, config, **meta): + # check for conflicts, i.e. meta-parameters both provided + # as kwargs and by the autotuner + conflicts = meta.keys() & config.kwargs.keys() + if conflicts: + raise ValueError( + f"Conflicting meta-parameters: {', '.join(conflicts)}." + " Make sure that you don't re-define auto-tuned symbols." + ) + # augment meta-parameters with tunable ones + current = dict(meta, **config.kwargs) + + def kernel_call(): + if config.pre_hook: + config.pre_hook(self.nargs) + self.hook(args) + self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **current, + ) + + try: + # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses + # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default + return triton.testing.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40) + except triton.OutOfResources: + return (float("inf"), float("inf"), float("inf")) + + def run(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + if len(self.configs) > 1: + key = tuple(args[i] for i in self.key_idx) + + # This reduces the amount of autotuning by rounding the keys to the nearest power of two + # In my testing this gives decent results, and greatly reduces the amount of tuning required + if self.nearest_power_of_two: + key = tuple([2 ** int(math.log2(x) + 0.5) for x in key]) + + if key not in self.cache: + # prune configs + pruned_configs = self.prune_configs(kwargs) + bench_start = time.time() + timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} + bench_end = time.time() + self.bench_time = bench_end - bench_start + self.cache[key] = builtins.min(timings, key=timings.get) + self.hook(args) + self.configs_timings = timings + config = self.cache[key] + else: + config = self.configs[0] + self.best_config = config + if config.pre_hook is not None: + config.pre_hook(self.nargs) + return self.fn.run( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + + def prune_configs(self, kwargs): + pruned_configs = self.configs + if self.early_config_prune: + pruned_configs = self.early_config_prune(self.configs, self.nargs) + if self.perf_model: + top_k = self.configs_top_k + if isinstance(top_k, 
float) and top_k <= 1.0: + top_k = int(len(self.configs) * top_k) + if len(pruned_configs) > top_k: + est_timing = { + config: self.perf_model( + **self.nargs, + **kwargs, + **config.kwargs, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) + for config in pruned_configs + } + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + return pruned_configs + + def warmup(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + for config in self.prune_configs(kwargs): + self.fn.warmup( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + self.nargs = None + + +def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False): + def decorator(fn): + return CustomizedTritonAutoTuner( + fn, + fn.arg_names, + configs, + key, + reset_to_zero, + prune_configs_by, + nearest_power_of_two, + ) + + return decorator + + +def matmul248_kernel_config_pruner(configs, nargs): + """ + The main purpose of this function is to shrink BLOCK_SIZE_* when the corresponding dimension is smaller. + """ + m = max(2 ** int(math.ceil(math.log2(nargs["M"]))), 16) + n = max(2 ** int(math.ceil(math.log2(nargs["N"]))), 16) + k = max(2 ** int(math.ceil(math.log2(nargs["K"]))), 16) + + used = set() + for config in configs: + block_size_m = min(m, config.kwargs["BLOCK_SIZE_M"]) + block_size_n = min(n, config.kwargs["BLOCK_SIZE_N"]) + block_size_k = min(k, config.kwargs["BLOCK_SIZE_K"]) + group_size_m = config.kwargs["GROUP_SIZE_M"] + + if ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) in used: + continue + + used.add( + ( + block_size_m, + block_size_n, + block_size_k, + group_size_m, + config.num_stages, + config.num_warps, + ) + ) + yield triton.Config( + { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + }, + num_stages=config.num_stages, + num_warps=config.num_warps, + ) + + +__all__ = ["autotune"] diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py new file mode 100644 index 00000000..05091699 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py @@ -0,0 +1,153 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +import itertools + +import torch +import triton +import triton.language as tl +from torch.cuda.amp import custom_bwd, custom_fwd + + +def make_dequant_configs(block_sizes, num_warps): + configs = [] + for bs, ws in itertools.product(block_sizes, num_warps): + configs.append(triton.Config({"X_BLOCK": bs}, num_warps=ws)) + return configs + + +DEFAULT_DEQUANT_CONFIGS = make_dequant_configs([128, 256, 512, 1024], [4, 8]) + + +@triton.autotune(DEFAULT_DEQUANT_CONFIGS, key=["numels"]) +@triton.jit +def dequant_kernel_248( + g_idx_ptr, + scales_ptr, + qweight_ptr, + qzeros_ptr, + out_ptr, + numels, + maxq: tl.constexpr, + bits: tl.constexpr, + outfeatures: tl.constexpr, + num_groups: tl.constexpr, + X_BLOCK: tl.constexpr, +): + # Block indexing + xoffset = tl.program_id(0) * X_BLOCK + x_index = xoffset + tl.arange(0, X_BLOCK) + xmask = x_index < numels + row_idx = x_index // outfeatures + col_idx = x_index % outfeatures + + elements_per_feature: tl.constexpr = 32 // bits + + # Load parameters + g_idx = tl.load(g_idx_ptr + (row_idx), None, eviction_policy="evict_last") + qweights = tl.load( + qweight_ptr + (col_idx + (outfeatures * (row_idx // elements_per_feature))), + None, + ) + + wf_weights = (row_idx % elements_per_feature) * bits + + wf_zeros = (col_idx % elements_per_feature) * bits + + tmp1 = g_idx + num_groups + tmp2 = g_idx < 0 + tl.device_assert(g_idx >= 0, "index out of bounds: 0 <= tmp0 < 0") + groups = tl.where(tmp2, tmp1, g_idx) # tmp3 are g_idx + + scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float32) + + # Unpack weights + weights = qweights >> wf_weights # bit shift qweight + + weights = weights & maxq + + # Unpack zeros + qzero_ncols: tl.constexpr = outfeatures // elements_per_feature + qzeros = tl.load( + qzeros_ptr + ((qzero_ncols * groups) + (col_idx // elements_per_feature)), + None, + eviction_policy="evict_last", + ) + zeros = qzeros >> wf_zeros + zeros = zeros & maxq + + # Dequantize + weights = weights - zeros + weights = weights.to(tl.float32) + weights = scales * weights + + tl.store(out_ptr + (x_index), weights, mask=xmask) + + +def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None): + """ + Launcher for triton dequant kernel. 
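Returns the dequantized weight as a dense (infeatures, outfeatures) float16 tensor allocated on CUDA.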
Only valid for bits = 2, 4, 8 + """ + + num_groups = scales.shape[0] + outfeatures = scales.shape[1] + infeatures = g_idx.shape[0] + + out = torch.empty((infeatures, outfeatures), device="cuda", dtype=torch.float16) + numels = out.numel() + maxq = 2**bits - 1 if maxq is None else maxq + grid = lambda meta: (triton.cdiv(numels, meta["X_BLOCK"]),) # noqa: E731 + + dequant_kernel_248[grid]( + g_idx, + scales, + qweight, + qzeros, + out, + numels, + maxq=maxq, + bits=bits, + outfeatures=outfeatures, + num_groups=num_groups, + ) + return out + + +def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False): + W = dequant248(qweight, scales, qzeros, g_idx, bits, maxq=maxq) + if transpose: + return input @ W.t() + return input @ W + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + ctx.save_for_backward(qweight, scales, qzeros, g_idx) + ctx.bits, ctx.maxq = bits, maxq + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + qweight, scales, qzeros, g_idx = ctx.saved_tensors + bits, maxq = ctx.bits, ctx.maxq + grad_input = None + + if ctx.needs_input_grad[0]: + grad_input = quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True) + return grad_input, None, None, None, None, None, None diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py new file mode 100644 index 00000000..541c22dc --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py @@ -0,0 +1,476 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from logging import getLogger + +import torch +import triton +import triton.language as tl +from torch.cuda.amp import custom_bwd, custom_fwd + +from . 
import custom_autotune + +logger = getLogger(__name__) + + +# code based https://github.com/fpgaminer/GPTQ-triton + + +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, + prune_configs_by={ + "early_config_prune": custom_autotune.matmul248_kernel_config_pruner, + "perf_model": None, + "top_k": None, + }, +) +@triton.jit +def quant_matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. + A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs + 
g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@custom_autotune.autotune( + configs=[ + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=2, + num_warps=8, + ), + ], + key=["M", "N", "K"], + nearest_power_of_two=True, +) +@triton.jit +def transpose_quant_matmul_248_kernel( + a_ptr, + b_ptr, + c_ptr, + scales_ptr, + zeros_ptr, + g_ptr, + M, + N, + K, + bits, + maxq, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_scales, + stride_zeros, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """ + Compute the matrix multiplication C = A x B. 
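+    This is the transposed variant (used in the backward pass): B is transposed on the fly,
+    so the result is effectively A @ dequant(B).T and has shape (M, K).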
+ A is of shape (M, N) float16 + B is of shape (K//8, N) int32 + C is of shape (M, K) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_k + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_k = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offs_n = tl.arange(0, BLOCK_SIZE_N) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + a_mask = offs_am[:, None] < M + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ( + (offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_bk + g_idx = tl.load(g_ptrs) + + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales + zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros + + shifter = (offs_bk % infearure_per_bits) * bits + zeros_shifter = (offs_n % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32) + + for k in range(0, num_pid_n): + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + + a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + b = tl.trans(b) + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_N + b_ptrs += BLOCK_SIZE_N + scales_ptrs += BLOCK_SIZE_N + zeros_ptrs += BLOCK_SIZE_N // infearure_per_bits + + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def silu(x): + return x * tl.sigmoid(x) + + +def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales.to(input.dtype), + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +def transpose_quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with 
torch.cuda.device(input.device): + output_dim = (qweight.shape[0] * 32) // bits + output = torch.empty((input.shape[0], output_dim), device=input.device, dtype=input.dtype) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), + ) + transpose_quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales.to(input.dtype), + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + output_dim, + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + ctx.save_for_backward(qweight, scales, qzeros, g_idx) + ctx.bits, ctx.maxq = bits, maxq + return output + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + qweight, scales, qzeros, g_idx = ctx.saved_tensors + bits, maxq = ctx.bits, ctx.maxq + grad_input = None + + if ctx.needs_input_grad[0]: + grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq) + return grad_input, None, None, None, None, None, None + + +def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq): + with torch.cuda.device(input.device): + output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16) + grid = lambda META: ( # noqa: E731 + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + ) + quant_matmul_248_kernel[grid]( + input, + qweight, + output, + scales, + qzeros, + g_idx, + input.shape[0], + qweight.shape[1], + input.shape[1], + bits, + maxq, + input.stride(0), + input.stride(1), + qweight.stride(0), + qweight.stride(1), + output.stride(0), + output.stride(1), + scales.stride(0), + qzeros.stride(0), + ) + return output + + +class QuantLinearInferenceOnlyFunction(torch.autograd.Function): + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq): + output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq) + return output diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py new file mode 100644 index 00000000..457d49cd --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py @@ -0,0 +1,19 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +class TritonModuleMixin: + @classmethod + def warmup(cls, model, transpose=False, seqlen=2048): + pass diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py new file mode 100644 index 00000000..ca86e26f --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py @@ -0,0 +1,4 @@ +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .gptq import GPTQ +from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/gptqmodel/quantization/config.py new file mode 100644 index 00000000..a99020fa --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/config.py @@ -0,0 +1,309 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import json +import logging +from dataclasses import dataclass, field, fields +from os.path import isdir, join +from typing import Any, Dict, Optional, Tuple + +from packaging import version +from transformers.utils.hub import cached_file + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.propagate = False +logger.addHandler(handler) +logger.setLevel(logging.INFO) + +FORMAT_FIELD_CODE = "format" +FORMAT_FIELD_JSON = "checkpoint_format" +QUANT_METHOD_FIELD = "quant_method" +QUANT_CONFIG_FILENAME = "quantize_config.json" +QUANT_CONFIG_FILENAME_COMPAT = [QUANT_CONFIG_FILENAME, "quant_config.json", "config.json"] + +MIN_VERSION_WITH_V2 = "0.9.0" + +META_FIELD = "meta" +# quantizer is the tool that did the quantization +META_FIELD_QUANTIZER = "quantizer" +# packer is the tool that packed the weights post quantization +META_FIELD_PACKER = "packer" + +META_QUANTIZER_GPTQMODEL = "gptqmodel" + + +# saved formats +class FORMAT: + GPTQ = "gptq" + # v2 format fixed sym = False quantization + GPTQ_V2 = "gptq_v2" + TRITON = "triton" + + +# quant methods +class QUANT_METHOD: + GPTQ = "gptq" + + +QUANT_METHOD_FORMAT_MAPPING = { + QUANT_METHOD.GPTQ: { + FORMAT.GPTQ, + FORMAT.GPTQ_V2, + FORMAT.TRITON, + }, +} + +# inference only methods should go here +QUANTIZE_BLACK_LIST = {} + +# compat +QUANT_CONFIG_ARG_SYNONYMS = { + "w_bit": "bits", + "q_group_size": "group_size", + # map format field (checkpoint_format) to class/code (format) + FORMAT_FIELD_JSON: FORMAT_FIELD_CODE, +} + + +@dataclass +class QuantizeConfig(): + bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) + group_size: int = field(default=-1) + damp_percent: float = 
field(default=0.01) + desc_act: bool = field(default=True) + static_groups: bool = field(default=False) + sym: bool = field(default=True) + true_sequential: bool = field(default=True) + lm_head: bool = field(default=False) + quant_method: str = field(default=QUANT_METHOD.GPTQ) + # default to gptq v1 format for maximum compat with 3rd party inference libs with minimal loss vs v2 + # if you inference with autogptq, save to gptq_v2 format for best result + format: FORMAT = field(default=FORMAT.GPTQ) + + # TODO: remove + model_name_or_path: Optional[str] = field(default=None) + model_file_base_name: Optional[str] = field(default=None) + + # properties that do not directly contributes to quantization or quant inference should be placed in meta + # i.e. quantizer tool (producer) + version, timestamp, entity who made the quant, etc + meta: Optional[Dict] = field(default=None) + + def __post_init__(self): + fields_info = fields(self) + + # validate quant method and format is matched + valid_formats = QUANT_METHOD_FORMAT_MAPPING.get(self.quant_method, None) + if valid_formats is None: + raise ValueError(f"Unsupported quantization method: {self.quant_method}") + + if self.format not in valid_formats: + raise ValueError( + f"The checkpoint format used is {self.format}, and the quantization method is {self.quant_method}. " + ) + + if self.bits not in fields_info[0].metadata["choices"]: + raise ValueError(f"only support quantize to {fields_info[0].metadata['choices']} bits.") + + if self.group_size != -1 and self.group_size <= 0: + raise ValueError("unless equal to -1, group_size must greater then 0.") + + if not (0 < self.damp_percent < 1): + raise ValueError("damp_percent must between 0 and 1.") + + # validate meta + if self.meta is not None: + if not isinstance(self.meta, dict): + raise ValueError("meta must be a dictionary") + for key, value in self.meta.items(): + if not isinstance(key, str): + raise ValueError("Keys in the meta dictionary must be strings") + else: + self.meta = {} + + def meta_set(self, key: str, value: Any): + self.meta[key] = value + + def meta_get(self, key: str) -> Any: + return self.meta.get(key) + + # versionable is a meta.property that pairs value with version i.e "value:1.0.0" + def meta_set_versionable(self, key: str, value: str, version: str): + self.meta_set(key, f"{value}:{version}") + + # versionable is a meta.property that pairs value with version i.e "value:1.0.0" + def meta_get_versionable(self, key: str) -> Tuple[str, str]: + val = self.meta_get(key) + if val is None: + return None, None + parts = val.split(":") + return parts[0].lower(), parts[1].lower() if len(parts) >= 2 else None + + # is quantized model quantized or packed by autogptq version with v2 format code + def is_quantized_or_packed_by_v2(self) -> bool: + # check meta.quantizer + producer, _version = self.meta_get_versionable(META_FIELD_QUANTIZER) + by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and (version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2)) + + # fallback to meta.packer + if not by_v2: + producer, _version = self.meta_get_versionable(META_FIELD_PACKER) + by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse(_version) >= version.parse( + MIN_VERSION_WITH_V2 + ) + + return by_v2 + + def save_pretrained(self, save_dir: str, **kwargs): + with open(join(save_dir, QUANT_CONFIG_FILENAME), "w", encoding="utf-8") as f: + json.dump(self.to_dict(), f, indent=2) + + @classmethod + # normalize quant config for compat and also performs validation + def from_quant_config(cls, 
quantize_cfg, format: str = None): + valid_formats = {FORMAT.GPTQ, FORMAT.GPTQ_V2} + format_auto_inferred = False + # compat: format can be passed in via from_quantized() if field missing from json + if format: + if format not in valid_formats: + raise ValueError(f"Unknown quantization checkpoint format: {format}.") + if quantize_cfg.get(FORMAT_FIELD_JSON): + raise ValueError("Conflict: quantization format is passed in and also exists in model config.") + # compat: warn if checkpoint_format is missing + elif quantize_cfg.get(FORMAT_FIELD_JSON) is None: + format_auto_inferred = True + + field_names = [field.name for field in fields(cls)] + + normalized = { + QUANT_METHOD_FIELD: QUANT_METHOD.GPTQ, + # compat: default to gptq(v1) when loading models + FORMAT_FIELD_CODE: format if format else FORMAT.GPTQ, + } + for key, val in quantize_cfg.items(): + key = key.lower() + + # remap keys according to compat map + if key in QUANT_CONFIG_ARG_SYNONYMS and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names: + key = QUANT_CONFIG_ARG_SYNONYMS[key] + + if key == FORMAT_FIELD_JSON: + val = val.lower() + + if val in {FORMAT.GPTQ, FORMAT.GPTQ_V2}: + normalized[key] = val + else: + raise ValueError(f"Unknown quantization format: {val}.") + elif key == QUANT_METHOD_FIELD: + val = val.lower() + # compat: some hf models use quant_method=marlin or bitblas + if val not in {QUANT_METHOD.GPTQ}: + raise ValueError(f"Unknown quantization method: {val}.") + else: + normalized[QUANT_METHOD_FIELD] = val + elif key in field_names: + normalized[key] = val + else: + logger.info(f"Ignoring unknown parameter in the quantization configuration: {key}.") + + if format_auto_inferred: + logger.info(f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + + if "sym" not in normalized: + logger.warning( + "The quantization configuration does not contain an entry `sym` (symmetric quantization). " + "This may result in silent errors. Defaulting to `sym=True`." 
+ ) + + return cls(**normalized) + + @classmethod + def from_pretrained(cls, save_dir: str, **kwargs): + # Parameters related to loading from Hugging Face Hub + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + commit_hash = kwargs.pop("_commit_hash", None) + format = kwargs.pop("format", None) + + transformers_config = False + for quantize_config_filename in QUANT_CONFIG_FILENAME_COMPAT: + if isdir(save_dir): # Local + resolved_config_file = join(save_dir, quantize_config_filename) + else: # Remote + resolved_config_file = cached_file( + save_dir, + quantize_config_filename, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + use_auth_token=use_auth_token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, + ) + if resolved_config_file is not None: + if quantize_config_filename == "config.json": + transformers_config = True + break + + if resolved_config_file is None: + raise ValueError( + "No quantize_config.json, quant_config.json or config.json file was found in the model repository." + ) + + with open(resolved_config_file, "r", encoding="utf-8") as f: + args_from_json = json.load(f) + + if transformers_config: + args_from_json = args_from_json["quantization_config"] + + return cls.from_quant_config(args_from_json, format) + + def to_dict(self): + return { + "bits": self.bits, + "group_size": self.group_size, + "desc_act": self.desc_act, + "static_groups": self.static_groups, + "sym": self.sym, + "lm_head": self.lm_head, + "damp_percent": self.damp_percent, + "true_sequential": self.true_sequential, + # TODO: deprecate? + "model_name_or_path": self.model_name_or_path, + "model_file_base_name": self.model_file_base_name, + QUANT_METHOD_FIELD: self.quant_method, + FORMAT_FIELD_JSON: self.format, + META_FIELD: self.meta, + } + +# deprecated: will be removed in future update +@dataclass +class BaseQuantizeConfig(QuantizeConfig): + def __init__(self, **kwargs): + super().__init__(**kwargs) + logging.warning("BaseQuantizeConfig is re-named and pending deprecation. 
Please use `QuantizeConfig` instead.") diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py new file mode 100644 index 00000000..c3f2e4b3 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py @@ -0,0 +1,209 @@ +# License: GPTQModel/licenses/LICENSE.mit +# adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) + +import math +import os +import time +from logging import getLogger + +import torch +import torch.nn as nn +import transformers + +from .quantizer import Quantizer + +logger = getLogger(__name__) + +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + + +class GPTQ: + def __init__(self, layer): + self.layer = layer + self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.pytorch_utils.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + self.quantizer = Quantizer() + + def add_batch(self, inp, out): + if os.environ.get("DEBUG"): + self.inp1 = inp + self.out1 = out + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride, + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) + + def fasterquant( + self, + blocksize=128, + percdamp=0.01, + group_size=-1, + actorder=False, + static_groups=False, + ): + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + if static_groups: + import copy + + groups = [] + for i in range(0, self.columns, group_size): + quantizer = copy.deepcopy(self.quantizer) + quantizer.find_params(W[:, i : (i + group_size)], weight=True) + scale.append(quantizer.scale) + zero.append(quantizer.zero) + groups.append(quantizer) + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + invperm = torch.argsort(perm) + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = 
torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w = W1[:, i] + d = Hinv1[i, i] + + if group_size != -1: + if not static_groups: + if (i1 + i) % group_size == 0: + self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True) + + if ((i1 + i) // group_size) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + else: + idx = i1 + i + if actorder: + idx = perm[idx] + self.quantizer = groups[idx // group_size] + + q = self.quantizer.quantize(w.unsqueeze(1)).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d**2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if os.environ.get("DEBUG"): + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + logger.debug(torch.sum(Losses)) + + torch.cuda.synchronize() + + duration = time.time() - tick + avg_loss = torch.sum(Losses).item() / self.nsamples + + group_size = group_size if group_size != -1 else self.columns + if static_groups and actorder: + g_idx = [perm[i] // group_size for i in range(self.columns)] + else: + g_idx = [i // group_size for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data) + if os.environ.get("DEBUG"): + logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale, dim=1) + zero = torch.cat(zero, dim=1) + return scale, zero, g_idx, duration, avg_loss + + def free(self): + if os.environ.get("DEBUG"): + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() + + +__all__ = ["GPTQ"] diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py new file mode 100644 index 00000000..3ca205d1 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py @@ -0,0 +1,142 @@ +# License: GPTQModel/licenses/LICENSE.mit +# adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) + +from logging import getLogger + +import torch +import torch.nn as nn + +logger = getLogger(__name__) + + +def quantize(x, scale, zero, maxq): + if maxq < 0: + return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero + q = torch.clamp(torch.round(x / scale) + zero, 0, maxq) + return scale * (q - zero) + + +class Quantizer(nn.Module): + def __init__(self, shape=1): + super(Quantizer, self).__init__() + self.register_buffer("maxq", torch.tensor(0)) + self.register_buffer("scale", torch.zeros(shape)) + self.register_buffer("zero", torch.zeros(shape)) + + def configure( + self, + bits, + perchannel=False, + sym=True, + mse=False, + norm=2.4, + grid=100, + maxshrink=0.8, + trits=False, + ): + self.maxq = torch.tensor(2**bits - 1) + self.perchannel = perchannel + self.sym = 
sym + self.mse = mse + self.norm = norm + self.grid = grid + self.maxshrink = maxshrink + if trits: + self.maxq = torch.tensor(-1) + + def find_params(self, x, weight=False): + dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float("inf"), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + + +__all__ = ["Quantizer"] diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py new file mode 100644 index 00000000..0e79e1d3 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py @@ -0,0 +1 @@ +from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/gptqmodel/utils/backend.py new file mode 100644 index 00000000..43dfffea --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/backend.py @@ -0,0 +1,27 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from enum import Enum + + +class Backend(Enum): + AUTO = 0 # choose the fastest one based on quant model compatibility + TRITON = 3 + +def get_backend(backend: str): + try: + return Backend[backend] + except KeyError: + raise ValueError(f"Invalid Backend str: {backend}") diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/gptqmodel/utils/data.py new file mode 100644 index 00000000..19e61779 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/data.py @@ -0,0 +1,275 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import copy +import random +from functools import partial +from typing import Callable, Dict, List, Optional + +import torch +from datasets import DatasetDict, IterableDatasetDict, load_dataset +from torch import LongTensor +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizer + + +def make_data_block( + samples: Dict[str, List[str]], + prompt_col_name: str, + label_col_name: str, + tokenizer: PreTrainedTokenizer, + preprocess_fn: Optional[Callable] = None, + sample_max_len: int = 1024, + block_max_len: int = 2048, + add_eos_token: bool = False, + truncate_prompt: bool = True, + merge_prompt_label: bool = False, +) -> Dict[str, List[LongTensor]]: + """A simple implementation of text generation oriented smart batching to maximize VRAM usage when evaluation + + :param samples: Dict[str, List[str]], samples that used to make data blocks + :param prompt_col_name: str, name of the key in samples whose value stores prompt + :param label_col_name: str, name of the key in samples whose value stores label + :param tokenizer: transformers.PretrainedTokenizer, tokenizer that used to tokenize samples + :param preprocess_fn: Optional[Callable], optional function that used to preprocess samples such as + refactor the data structure of samples, note the output of this function must be a dict whose keys + at least contains `prompt_col_name` and `label_col_name` + :param sample_max_len: int, defaults to 1024, max tokens number of each sample (before padding) + :param block_max_len: int, defaults to 2048, max tokens number of each data block (after padding) + :param add_eos_token: bool, defaults to False, whether add eos_token or not to the label + :param truncate_prompt: bool, defaults to True, whether to 
truncate prompt if the sample's total tokens + number exceeds `sample_max_len`, if not, will truncate label and drop this sample when all tokens + in label are truncated + :param merge_prompt_label: bool, defaults to False, will merge label into prompt if set to True, usually + this only required when doing language modeling task + :return: Dict[str, List[torch.LongTensor]], a dict whose keys are `input_ids`, `attention_mask` and + `label` and values are a list of torch.LongTensor + """ + if preprocess_fn: + samples = preprocess_fn(samples) + + prompts = samples[prompt_col_name] + labels = samples[label_col_name] + + # tokenize samples + tokenized_prompts = tokenizer(prompts, truncation=False)["input_ids"] + tokenized_labels = tokenizer(labels, truncation=False)["input_ids"] + + # filter tokenized samples by length + dropped_indices = [] + for idx, (tokenized_prompt, tokenized_label) in enumerate(zip(tokenized_prompts, tokenized_labels)): + if add_eos_token: + tokenized_label += [tokenizer.eos_token_id] + len_prompt = len(tokenized_prompt) + len_label = len(tokenized_label) + exceed_len = len_prompt + len_label - sample_max_len + if exceed_len > 0: + if truncate_prompt: + tokenized_prompt = tokenized_prompt[exceed_len:] + else: + tokenized_label = tokenized_label[:-exceed_len] + tokenized_prompts[idx] = tokenized_prompt + tokenized_labels[idx] = tokenized_label + if not tokenized_label: + dropped_indices.append(idx) + + # make data blocks of samples + tokenized_samples = sorted( + [(p, l) for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) if idx not in dropped_indices], + key=lambda x: (len(x[0]) + len(x[1])) if merge_prompt_label else len(x[0]), + ) + sample_blocks = [] + sample_block = [] + blk_max_len = 0 + blk_total_len = 0 + for tokenized_sample in tokenized_samples: + prompt_ids, label_ids = tokenized_sample + ori_sample_len = len(prompt_ids) + if merge_prompt_label: + ori_sample_len += len(label_ids) + if ori_sample_len <= blk_max_len: + additional_len = blk_max_len + sample_len = blk_max_len + else: + additional_len = len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + sample_len = ori_sample_len + + if blk_total_len + additional_len > block_max_len: + sample_blocks.append((copy.copy(sample_block), blk_max_len)) + sample_block = [] + blk_max_len = 0 + blk_total_len = 0 + sample_len = ori_sample_len + additional_len = ori_sample_len + + sample_block.append(tokenized_sample) + blk_max_len = max(blk_max_len, sample_len) + blk_total_len += additional_len + + if sample_block: + sample_blocks.append((copy.copy(sample_block), blk_max_len)) + del sample_block + del blk_max_len + del blk_total_len + + new_samples = {"input_ids": [], "attention_mask": [], "labels": []} + # padding each data block internally + for block, blk_max_len in sample_blocks: + input_ids = [] + attention_mask = [] + label_ids = [] + label_max_len = max([len(sample[1]) for sample in block]) + + for sample in block: + tokenized_prompt, tokenized_label = sample + sample_len = len(tokenized_prompt) + if merge_prompt_label: + sample_len += len(tokenized_label) + pad_num = blk_max_len - sample_len + if merge_prompt_label: + input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt + tokenized_label) + label_ids.append([-100] * (pad_num + len(tokenized_prompt)) + tokenized_label) + else: + input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt) + label_ids.append([-100] * (label_max_len - len(tokenized_label)) + tokenized_label) + attention_mask.append([0] * 
pad_num + [1] * sample_len) + + new_samples["input_ids"].append(input_ids) + new_samples["attention_mask"].append(attention_mask) + new_samples["labels"].append(label_ids) + + return new_samples + + +def collate_data(blocks: List[Dict[str, List[List[int]]]], pad_token_id: int) -> Dict[str, LongTensor]: + def pad_block(block, pads): + return torch.cat((block, pads.to(block.device)), dim=-1) + + input_ids_blocks = [LongTensor(block["input_ids"]) for block in blocks] + attention_mask_blocks = [LongTensor(block["attention_mask"]) for block in blocks] + label_blocks = [LongTensor(block["labels"]) for block in blocks] + + bsz = len(blocks) + inp_max_len = max([block.size(-1) for block in input_ids_blocks]) + label_max_len = max([block.size(-1) for block in label_blocks]) + + for i in range(bsz): + block_bsz, block_inp_len = input_ids_blocks[i].shape + block_label_len = label_blocks[i].shape[-1] + pad_num = inp_max_len - block_inp_len + if pad_num > 0: + input_ids_blocks[i] = pad_block(input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id) + attention_mask_blocks[i] = pad_block(attention_mask_blocks[i], torch.zeros((block_bsz, pad_num))) + label_pad_num = label_max_len - block_label_len + if label_pad_num > 0: + label_blocks[i] = pad_block(label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100) + + return { + "input_ids": torch.cat(input_ids_blocks, dim=0).long(), + "attention_mask": torch.cat(attention_mask_blocks, dim=0).long(), + "labels": torch.cat(label_blocks, dim=0).long(), + } + + +def get_dataloader( + data_path_or_name: str, + prompt_col_name: str, + label_col_name: str, + tokenizer: PreTrainedTokenizer, + load_fn: Optional[Callable] = None, + preprocess_fn: Optional[Callable] = None, + num_samples: int = 128, + sample_max_len: int = 1024, + block_max_len: int = 2048, + add_eos_token: bool = False, + truncate_prompt: bool = True, + merge_prompt_label: bool = False, + load_fn_kwargs: Optional[dict] = None, + preprocess_fn_kwargs: Optional[dict] = None, + **kwargs, +) -> DataLoader: + """load dataset and build dataloader + + :param data_path_or_name: str, dataset name in hf-hub or local file path + :param prompt_col_name: str, see `make_data_block` + :param label_col_name: str, see `make_data_block` + :param tokenizer: str, see `make_data_block` + :param load_fn: Optional[Callable], defaults to None, function used to load dataset, if not specified, + use `datasets.load_dataset` + :param preprocess_fn: Optional[Callable], see `make_data_block` + :param num_samples: int, defaults to 128, total samples used to evaluation + :param sample_max_len: int, see `make_data_block` + :param block_max_len: int, see `make_data_block` + :param add_eos_token: bool, see `make_data_block` + :param truncate_prompt: bool, see `make_data_block` + :param merge_prompt_label: bool, see `make_data_block` + :param load_fn_kwargs: Optional[dict], defaults to None, keyword arguments used + for `load_fn` or `datasets.load_dataset` + :param preprocess_fn_kwargs: Optional[dict], defaults to None, keyword arguments used + for `preprocess_fn` + :param kwargs: additional keyword arguments will be passed to torch's `DataLoader` initialization, + note values of `batch_size`, `shuffle` and `collate_fn` will always be overridden to fixed value + :return: torch.utils.data.DataLoader + """ + + if not load_fn_kwargs: + load_fn_kwargs = {} + if not preprocess_fn_kwargs: + preprocess_fn_kwargs = {} + + if load_fn: + ds = load_fn(data_path_or_name, **load_fn_kwargs) + else: + ds = 
load_dataset(data_path_or_name, **load_fn_kwargs) + if isinstance(ds, (DatasetDict, IterableDatasetDict)): + if "evaluation" in ds: + ds = ds["evaluation"] + elif "test" in ds: + ds = ds["test"] + else: + ds = ds["train"] + + ds = ds.select( + indices=random.sample(range(len(ds)), min(len(ds), num_samples)), + keep_in_memory=True, + ) + ds = ds.map( + make_data_block, + batched=True, + batch_size=len(ds), + num_proc=1, + remove_columns=ds.column_names, + keep_in_memory=True, + load_from_cache_file=False, + fn_kwargs={ + "prompt_col_name": prompt_col_name, + "label_col_name": label_col_name, + "tokenizer": tokenizer, + "preprocess_fn": partial(preprocess_fn, **preprocess_fn_kwargs), + "sample_max_len": sample_max_len, + "block_max_len": block_max_len, + "add_eos_token": add_eos_token, + "truncate_prompt": truncate_prompt, + "merge_prompt_label": merge_prompt_label, + }, + ) + + # override some arguments' values in kwargs despite user specified + kwargs["batch_size"] = 1 + kwargs["shuffle"] = False + kwargs["collate_fn"] = partial(collate_data, pad_token_id=tokenizer.pad_token_id) + dl = DataLoader(ds, **kwargs) + + return dl diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/gptqmodel/utils/importer.py new file mode 100644 index 00000000..40503569 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/importer.py @@ -0,0 +1,63 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from collections import OrderedDict +from logging import getLogger + +from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear +from ..quantization import FORMAT +from .backend import Backend + +backend_dict = OrderedDict({ + Backend.TRITON: TritonV2QuantLinear, +}) + +format_dict = { + FORMAT.GPTQ: [Backend.TRITON], + FORMAT.GPTQ_V2: [Backend.TRITON], + FORMAT.TRITON: [Backend.TRITON], +} + +logger = getLogger(__name__) + +# auto select the correct/optimal QuantLinear class +def select_quant_linear( + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, + format: FORMAT, + pack: bool = False, +): + # Handle the case where backend is AUTO. + if backend == Backend.AUTO: + allow_backends = format_dict[format] + for k, v in backend_dict.items(): + in_allow_backends = k in allow_backends + validate = v.validate(bits, group_size, desc_act, sym, raise_error=False) + check_pack_func = hasattr(v, "pack") if pack else True + if in_allow_backends and validate and check_pack_func: + logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}") + return v + + # Handle the case where backend is not AUTO. 
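# Illustrative sketch of how the AUTO path above resolves for a plain GPTQ
# checkpoint; the 4-bit / group_size=128 settings are assumptions, not values
# taken from this patch.
from gptqmodel.quantization import FORMAT
from gptqmodel.utils.backend import Backend
from gptqmodel.utils.importer import select_quant_linear

qlinear_cls = select_quant_linear(
    bits=4, group_size=128, desc_act=False, sym=True,
    backend=Backend.AUTO, format=FORMAT.GPTQ,
)
# With Backend.TRITON being the only kernel registered in backend_dict, this
# should resolve to TritonV2QuantLinear.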
+ if backend == Backend.TRITON: + logger.info("Using tritonv2 for GPTQ") + from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + else: + raise NotImplementedError("Invalid Backend") + + return QuantLinear diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/gptqmodel/utils/model.py new file mode 100644 index 00000000..23ef3821 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/model.py @@ -0,0 +1,661 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +import functools +import hashlib +import json +import logging +import os +from logging import getLogger +from typing import List, Optional + +import accelerate +import threadpoolctl as tctl +import torch +import torch.nn as nn +import transformers +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig +from transformers.utils.hub import cached_file + +from ..models._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS +from ..nn_modules.qlinear import BaseQuantLinear +from ..quantization import FORMAT, QuantizeConfig +from .backend import Backend +from .importer import select_quant_linear + +logger = getLogger(__name__) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(levelname)s - %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def recurse_getattr(obj, attr: str): + """ + Recursive `getattr`. + + Args: + obj: + A class instance holding the attribute. + attr (`str`): + The attribute that is to be retrieved, e.g. 'attribute1.attribute2'. + """ + + def _getattr(obj, attr): + return getattr(obj, attr) + + return functools.reduce(_getattr, [obj] + attr.split(".")) + + +def recurse_setattr(module, name, value): + """A function to recursively set attributes to a module.""" + if "." 
not in name: + setattr(module, name, value) + else: + name, rest = name.split(".", 1) + recurse_setattr(getattr(module, name), rest, value) + + +def get_device(obj: torch.Tensor | nn.Module): + if isinstance(obj, torch.Tensor): + return obj.device + return next(obj.parameters()).device + + +def move_to(obj: torch.Tensor | nn.Module, device: torch.device): + if get_device(obj) != device: + obj = obj.to(device) + return obj + + +def nested_move_to(v, device): + if isinstance(v, torch.Tensor): + return move_to(v, device) + elif isinstance(v, (list, tuple)): + return type(v)([nested_move_to(e, device) for e in v]) + else: + return v + + +def find_layers(module, layers=None, name=""): + if not layers: + layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear] + for layer in layers: + if isinstance(module, layer): + return {name: module} + res = {} + for name1, child in module.named_children(): + res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + return res + + +def get_module_by_name_prefix(model, module_name: str): + for name, module in model.named_modules(): + if name.startswith(module_name): + return module + + +def get_module_by_name_suffix(model, module_name: str): + for name, module in model.named_modules(): + if name.endswith(module_name): + return module + + +def make_quant( + module, + names, + bits: int, + group_size: int, + backend: Backend, + format: str, + desc_act: bool = False, + sym: bool = True, + use_cuda_fp16: bool = True, + pack: bool = False, +) -> BaseQuantLinear: + select_quant_linear_func = select_quant_linear_with_pack if pack else select_quant_linear + QuantLinear = select_quant_linear_func( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=pack, + ) + + if isinstance(module, QuantLinear): + return QuantLinear + + for name, submodule in module.named_modules(): + if name in names: + ori_layer_device = next(submodule.parameters()).device + + if isinstance(submodule, nn.Linear): + in_features = submodule.in_features + out_features = submodule.out_features + elif isinstance(submodule, nn.Conv2d): + in_features = submodule.in_channels + out_features = submodule.out_channels + elif isinstance(submodule, transformers.pytorch_utils.Conv1D): + in_features = submodule.weight.shape[0] + out_features = submodule.weight.shape[1] + else: + raise NotImplementedError(f"Unsupported module {submodule}") + + bias = submodule.bias is not None + if (not (desc_act) or group_size == -1) and backend != Backend.TRITON: + new_layer = QuantLinear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + infeatures=in_features, + outfeatures=out_features, + bias=bias, + use_cuda_fp16=use_cuda_fp16, + weight_dtype=submodule.weight.dtype, + ) + else: + new_layer = QuantLinear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + infeatures=in_features, + outfeatures=out_features, + bias=bias, + weight_dtype=submodule.weight.dtype, + ) + new_layer.device = ori_layer_device + recurse_setattr(module, name, new_layer.to(ori_layer_device)) + + return QuantLinear + +def convert_gptq_v1_to_v2_format( + model, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module, +): + # Limit thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + for _, submodule in model.named_modules(): + # v1 checkpoint format used to do `qzeros = qzeros -= 1` before serialization, thus the + # additions here do not overflow. 
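# A small worked example of the 4-bit offset applied below (pure-Python sketch;
# the packed word is an illustrative value): the constant packs a +1 into each
# of the eight 4-bit zero-point fields of one int32 word.
packed_v1 = 0x76543210                                     # eight packed zero-points 0..7
packed_v2 = packed_v1 + 0b00010001000100010001000100010001
assert packed_v2 == 0x87654321                             # every nibble incremented by 1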
+ # v1 checkpoint format with sym=False saved via convert_gptq_v2_to_v1_format() will + # overflow ~<=13% based on testing + if isinstance(submodule, qlinear_kernel): + if quantize_config.bits == 2: + submodule.qzeros.data += 0b01010101010101010101010101010101 + elif quantize_config.bits == 3: + submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] += ( + 0b00100100100100100100100100100100 + ) + submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] += ( + 0b10010010010010010010010010010010 + ) + submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] += ( + 0b01001001001001001001001001001001 + ) + elif quantize_config.bits == 4: + submodule.qzeros.data += 0b00010001000100010001000100010001 + elif quantize_config.bits == 8: + submodule.qzeros.data += 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + return model + + +def convert_gptq_v2_to_v1_format( + model, + quantize_config: QuantizeConfig, + qlinear_kernel: nn.Module, +): + # Limit thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + for _, submodule in model.named_modules(): + # sym=False has underflow probability of ~<=13% during testing. No underflow possible for sym=True. + if isinstance(submodule, qlinear_kernel): + if quantize_config.bits == 2: + submodule.qzeros.data -= 0b01010101010101010101010101010101 + elif quantize_config.bits == 3: + submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] -= ( + 0b00100100100100100100100100100100 + ) + submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] -= ( + 0b10010010010010010010010010010010 + ) + submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] -= ( + 0b01001001001001001001001001001001 + ) + elif quantize_config.bits == 4: + submodule.qzeros.data -= 0b00010001000100010001000100010001 + elif quantize_config.bits == 8: + submodule.qzeros.data -= 0b00000001000000010000000100000001 + else: + raise NotImplementedError("Only 2,3,4,8 bits are supported.") + + return model + +def select_quant_linear_with_pack(bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, format: str, pack: bool): + QuantLinear = select_quant_linear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=pack, + ) + return QuantLinear + +def pack_model( + model, + quantizers, + bits, + group_size, + backend: Backend, + format: str, + desc_act=False, + sym: bool = True, + use_cuda_fp16=True, + warmup_triton: bool = False, + force_layer_back_to_cpu: bool = False, +): + QuantLinear = select_quant_linear_with_pack( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + backend=backend, + format=format, + pack=True, + ) + + if force_layer_back_to_cpu: + model.to(CPU) + + logger.info("Packing model...") + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant( + model, + quantizers, + bits, + group_size, + backend=backend, + format=format, + use_cuda_fp16=use_cuda_fp16, + desc_act=desc_act, + pack=True, + ) + qlayers = find_layers(model, [QuantLinear]) + + # Limit pack() thread usage to avoid auto-parallizataion regression + with tctl.threadpool_limits(limits=1): + pbar = tqdm(qlayers.keys(), leave=True) + for name in pbar: + pbar.set_description(f"Packing {name}") + + quantizers[name], scale, zero, g_idx = quantizers[name] + # so far can only pack layer on CPU + layer_device = 
qlayers[name].device + qlayers[name].to(CPU) + layers[name], scale, zero, g_idx = ( + layers[name].to(CPU), + scale.to(CPU), + zero.to(CPU), + g_idx.to(CPU), + ) + if QuantLinear.QUANT_TYPE == "marlin": + qlayers[name].pack(layers[name], scale) + else: + qlayers[name].pack(layers[name], scale, zero, g_idx) + qlayers[name].to(layer_device) + + logger.info("Model packed.") + + if backend != Backend.TRITON and warmup_triton: + logger.warning( + "using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model." + ) + QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen) + return QuantLinear + +def verify_model_hash(file_path: str, verify_hash: str): + if not isinstance(verify_hash, str): + raise ValueError("model verify_hash must be a string") + if ':' not in verify_hash: + raise ValueError("verify_hash must be in the format 'hash_type:hash_value'") + hash_type, hash_value = verify_hash.split(':', 1) + hash_func = getattr(hashlib, hash_type, None) + if not hash_func: + raise ValueError(f"No hash function found for type: {hash_type}") + with open(file_path, "rb") as f: + file_hash = hash_func(f.read()).hexdigest() + return file_hash == hash_value + + +def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): + if not isinstance(verify_hash, list): + raise ValueError("sharded model verify_hash must be a list") + + with open(jsonPath, 'r') as f: + index_data = json.load(f) + weight_map = index_data['weight_map'] + shard_files = set(weight_map.values()) + if len(shard_files) != len(verify_hash): + raise ValueError("Number of shards and number of hash values do not match.") + + for shard_file, expected_hash in zip(shard_files, verify_hash): + if not verify_model_hash(shard_file, expected_hash): + logger.info(f"Hash verification failed for {shard_file}") + return False + return True + +def check_and_get_model_type(model_dir, trust_remote_code=False): + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code) + if config.model_type not in SUPPORTED_MODELS: + raise TypeError(f"{config.model_type} isn't supported yet.") + model_type = config.model_type + return model_type + + +def simple_dispatch_model(model, device_map): + from accelerate.hooks import AlignDevicesHook, add_hook_to_module + + if "" in device_map: + d = device_map[""] + model = model.to(torch.device(d)) + model.hf_device_map = device_map + return model + + tied_params = accelerate.utils.modeling.find_tied_parameters(model) + if set(device_map.values()) == {"cpu"} or set(device_map.values()) == { + "cpu", + "disk", + }: + main_device = "cpu" + else: + main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0] + + cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"] + prev_hook = None + for idx, (n, d) in enumerate(cpu_offload_group): + m = get_module_by_name_suffix(model, n) + _, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook) + # set first cpu offload module's prev_module_hook to the last cpu offload module's hook + if len(cpu_offload_group) > 1: + get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + + for n, d in device_map.items(): + m = get_module_by_name_suffix(model, n) + if d != "cpu": + d = torch.device(d) + hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True) + add_hook_to_module(m, hook) + accelerate.utils.modeling.retie_parameters(model, tied_params) + model.hf_device_map = device_map + + 
return model + + +# TODO: refractor. very strange post_init has to re-determine qlinear type again +# when qliear type is selected, it should auto-override the model post_init method and +# not have to go about looping over modules to match qlinear type a second time as it is +# very prone to bugs +def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None): + """ + The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state. + """ + + # post init for bitblas backend. + device_to_buffers_size = {} + for _, submodule in model.named_modules(): + if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "bitblas": + submodule.post_init() + + model_uses_exllama = False + for name, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + model_uses_exllama = True + device = submodule.qweight.device + if device not in device_to_buffers_size: + device_to_buffers_size[device] = { + "max_dq_buffer_size": 1, + "max_inner_outer_dim": 1, + } + + if not use_act_order: + submodule._use_act_order = False + else: + submodule._use_act_order = True + + # Disable this heuristic for detecting act_order, but it could be used instead of the config. + """ + if submodule.g_idx is None: + submodule.act_order = False + elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))): + submodule.g_idx = None + submodule.act_order = False + else: + submodule.act_order = True + """ + + device_to_buffers_size[device]["max_dq_buffer_size"] = max( + device_to_buffers_size[device]["max_dq_buffer_size"], + submodule.qweight.numel() * 8, + ) + + if use_act_order: + device_to_buffers_size[device]["max_inner_outer_dim"] = max( + device_to_buffers_size[device]["max_inner_outer_dim"], + submodule.infeatures, + submodule.outfeatures, + ) + + if model_uses_exllama: + # To be honest this is quite ugly, not proud of this. + from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params + + device_to_buffers = {} + + if use_act_order: + if max_input_length is None: + max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH + else: + max_input_len = max_input_length + else: + if max_input_length is not None: + logger.info( + "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored." + ) + max_input_len = 1 + + for device, buffers_size in device_to_buffers_size.items(): + # The temp_state buffer is required to reorder X in the act-order case. + # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. + device_to_buffers[device] = { + "temp_state": torch.zeros( + (max_input_len, buffers_size["max_inner_outer_dim"]), + dtype=torch.float16, + device=device, + ), + "temp_dq": torch.zeros( + (1, buffers_size["max_dq_buffer_size"]), + dtype=torch.float16, + device=device, + ), + "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], + "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], + } + + # Buffers need to be persistent to avoid any bug. + model.device_to_buffers = device_to_buffers + + for device, buffers in model.device_to_buffers.items(): + prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"]) + + # Using the default from exllama repo here. 
+ matmul_recons_thd = 8 + matmul_fused_remap = False + matmul_no_half2 = False + set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) + + # The buffers need to have been initialized first before calling make_q4. + for name, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + submodule.post_init() + + # exllamav2 + fixed_bytes = {} + model_uses_exllamav2 = False + + for _, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + model_uses_exllamav2 = True + device = submodule.qweight.device + scratch_fixed = submodule.scratch_space_fixed() + fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0)) + + if model_uses_exllamav2: + from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors + + device_tensors = {} + for device, scratch_bytes in fixed_bytes.items(): + device_tensors[device] = ExLlamaV2DeviceTensors(device.index, scratch_bytes) + + # have persistent buffers, otherwise we will get OOM + model.device_tensors = device_tensors + + for _, submodule in model.named_modules(): + if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + device = submodule.qweight.device + submodule.post_init(temp_dq=model.device_tensors[device]) + torch.cuda.empty_cache() + + return model + + +def get_checkpoints( + model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs +): + """ + Retrives (and if necessary downloads from Hugging Face Hub) the model checkpoint. Sharding is supported. All the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) will be explored over all `extensions` (e.g. `[".bin", ".safetensors"]`). + """ + searched_files = [] + resolved_archive_file = None + true_model_basename = None + + if os.path.isdir(model_name_or_path): + for ext in extensions: + for possible_model_basename in possible_model_basenames: + shard_index_name = possible_model_basename + ext + ".index.json" + searched_files.append(shard_index_name) + possible_index_file = os.path.join(model_name_or_path, shard_index_name) + if os.path.isfile(possible_index_file): + # The model is sharded over several checkpoints. + possible_model_basename = possible_index_file.replace(ext + ".index.json", "") + return True, possible_index_file, possible_model_basename + else: + model_save_name = os.path.join(model_name_or_path, possible_model_basename) + searched_files.append(possible_model_basename + ext) + if os.path.isfile(model_save_name + ext): + resolved_archive_file = model_save_name + ext + return False, resolved_archive_file, possible_model_basename + else: + temp = None + for ext in extensions: + for possible_model_basename in possible_model_basenames: + shard_index_name = possible_model_basename + ext + ".index.json" + shard_index = cached_file( + model_name_or_path, + shard_index_name, + **cached_file_kwargs, + ) + searched_files.append(shard_index_name) + if shard_index is not None: + # The model is sharded over several checkpoints. + with open(str(shard_index)) as f: + index_json = json.load(f) + # Download the shards from the index.json. 
+ shards = list(set(index_json["weight_map"].values())) + for shard in shards: + resolved_archive_file = cached_file( + model_name_or_path, + shard, + **cached_file_kwargs, + ) + return True, shard_index, possible_model_basename + else: + resolved_archive_file = cached_file( + model_name_or_path, + possible_model_basename + ext, + **cached_file_kwargs, + ) + if resolved_archive_file is None: + resolved_archive_file = temp + searched_files.append(possible_model_basename + ext) + if resolved_archive_file is not None: + temp = resolved_archive_file + return False, resolved_archive_file, possible_model_basename + + if resolved_archive_file is None: + raise FileNotFoundError( + f"Could not find a model in {model_name_or_path} with a name in {', '.join(searched_files)}. Please specify the argument model_basename to use a custom file name." + ) + + return False, resolved_archive_file, true_model_basename + + +# return the most stable tensor dtype for quantization while minimizing vram +def auto_dtype_from_config(config: PretrainedConfig, quant_inference: bool = False) -> torch.dtype: + # all the gptq inference kernels are float16 only + if quant_inference: + return torch.float16 + + dtype = getattr(config, "torch_dtype") + if not dtype or not isinstance(dtype, torch.dtype): + raise ValueError("Your model config.json does not have torch_dtype set. Please check for model " "corruption.") + + if dtype == torch.float32: + return torch.bfloat16 + elif dtype == torch.float16: + return torch.float16 + else: + # up/down-cast everything else to bfloat16 if not already in bfloat16 + return torch.bfloat16 + + +# generate layer modules for moe models with experts +def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: + new_inside_layer_modules = [] + for names in layer_modules: + new_inside_layer_modules.append([]) + for n in names: + if EXPERT_INDEX_PLACEHOLDER in n: + for index in range(num_experts): + new_inside_layer_modules[-1].append(n.replace(EXPERT_INDEX_PLACEHOLDER, str(index))) + else: + new_inside_layer_modules[-1].append(n) + + return new_inside_layer_modules diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py new file mode 100644 index 00000000..04f2c878 --- /dev/null +++ b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py @@ -0,0 +1,153 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
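# Illustrative expansion sketch for get_moe_layer_modules above; the
# Mixtral-style module name and num_experts=2 are assumptions, not taken from
# this patch.
from gptqmodel.models._const import EXPERT_INDEX_PLACEHOLDER
from gptqmodel.utils.model import get_moe_layer_modules

layer_modules = [[f"block_sparse_moe.experts.{EXPERT_INDEX_PLACEHOLDER}.w1"]]
expanded = get_moe_layer_modules(layer_modules, num_experts=2)
# -> [["block_sparse_moe.experts.0.w1", "block_sparse_moe.experts.1.w1"]]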
+############################################################################### +import warnings +from contextlib import contextmanager +from typing import List, Optional, Tuple, Union + +import torch +from peft import PeftConfig, PeftModel, PeftType, get_peft_model +from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING +from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING +from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel +from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ + +from ..models.base import BaseGPTQModel +from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as QuantLinearTriton + +class GPTQLoraConfig(LoraConfig): + injected_fused_attention: bool = False + injected_fused_mlp: bool = False + +class GPTQLoraModel(LoraModel): + def _replace_module(self, parent_module, child_name, new_module, old_module): + # replace the lora linear + setattr(parent_module, child_name, new_module) + # dispatch to correct device + # FIXME: refactor + for name, module in new_module.named_modules(): + if "lora_" in name: + device = (list(old_module.parameters()) + list(old_module.buffers()))[ + 0 + ].device + module.to(device) + + @staticmethod + def _create_new_module( + lora_config: LoraConfig, + adapter_name: str, + target: torch.nn.Module, + target_cls: torch.nn.Module = QuantLinearTriton, + **kwargs, + ): + # if the base layer module matches a supported class, dispatch the lora linear + # to be installed + new_module = None + if isinstance(target, target_cls): + new_module = LoraLinearGPTQ( + target, adapter_name, lora_config=lora_config, **kwargs + ) + + # if module cannot be found, return None which results in a raise in the call-stack + return new_module + + + def merge_adapter(self): + raise NotImplementedError("gptq model not support merge ada lora adapter") + + def unmerge_adapter(self): + raise NotImplementedError("gptq model not support unmerge ada lora adapter") + + def merge_and_unload(self): + raise NotImplementedError("gptq model not support merge and unload") + + +def find_all_linear_names( + model: BaseGPTQModel, + ignore: Optional[List[str]] = None, + ignore_lm_head: bool = True, +): + if not ignore: + ignore = [] + lm_head_name = model.lm_head_name + if ignore_lm_head and lm_head_name not in ignore: + ignore.append(lm_head_name) + results = set() + for n, m in model.named_modules(): + if isinstance(m, torch.nn.Linear): + res = n.split(".")[-1] + if res not in ignore: + results.add(res) + return list(results) + + +@contextmanager +def hijack_peft_mappings(): + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + + try: + yield + except: + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + raise + finally: + PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig + PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + +def get_gptq_peft_model( + model: BaseGPTQModel, + peft_config: PeftConfig = None, + model_id: str = None, + adapter_name: str = "default", + auto_find_all_linears: bool = True, + train_mode: bool = False, +): + if train_mode and not peft_config: + raise ValueError("peft_config not specified when in train mode.") + if not train_mode and not model_id: + raise ValueError("model_id(where to load adapters) not specified when in inference mode.") + + if train_mode: + peft_type = peft_config.peft_type + if not isinstance(peft_type, str): + peft_type = peft_type.value + if peft_type in 
[PeftType.LORA.value]: + if auto_find_all_linears: + peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True) + if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): + peft_config = GPTQLoraConfig(**peft_config.to_dict()) + + with hijack_peft_mappings(): + try: + if train_mode: + peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) + else: + peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) + except: + raise NotImplementedError( + f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet." + ) + + return peft_model + + +__all__ = [ + "GPTQLoraConfig", + "GPTQLoraModel", + "find_all_linear_names", + "get_gptq_peft_model", +] \ No newline at end of file diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py new file mode 100644 index 00000000..c383db75 --- /dev/null +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -0,0 +1,99 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +# -- do not touch +import os + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import unittest # noqa: E402 + +import torch # noqa: E402 +from gptqmodel import Backend, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +GENERATE_EVAL_SIZE = 100 + +class TestsQ4Triton(unittest.TestCase): + def test_generation_desc_act_false(self): + prompt = "I am in Paris and" + + reference_output = " I am in Paris and I am in love with you.\n\nScene 2:\n\n(The stage is now dark, but the audience can see the characters walking around the stage.)\n\n(The stage is now lit up, but the audience can only see the characters' silhouettes.)\n\n(" + new_tokens = 60 + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + + model_q = GPTQModel.from_quantized( + model_id, + device="cuda:0", + backend=Backend.TRITON, + torch_dtype=torch.float16, + ) + for _, submodule in model_q.named_modules(): + if isinstance(submodule, TritonV2QuantLinear): + break + else: + raise ValueError("Did not find a tritonv2 linear layer") + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") + + # This one uses Autocast. + res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. 
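# Rough training-mode sketch for get_gptq_peft_model defined earlier; the model
# id, LoRA hyperparameters, and target module names are illustrative
# assumptions (a CUDA device is assumed to be available).
from peft import LoraConfig
from gptqmodel import Backend, GPTQModel
from gptqmodel.utils.peft import get_gptq_peft_model

base = GPTQModel.from_quantized(
    "some-org/some-gptq-4bit-model",   # hypothetical quantized checkpoint
    device="cuda:0",
    backend=Backend.TRITON,
)
lora = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
peft_model = get_gptq_peft_model(
    base, peft_config=lora, auto_find_all_linears=False, train_mode=True
)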
+ res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + def test_generation_desc_act_true(self): + prompt = "I am in Paris and" + device = torch.device("cuda:0") + + # Reference generated with the cuda-old kernel + reference_output = " I am in Paris and I am in love with you.\n\nScene 2:\n\nThe stage is now set in a Parisian café. The café is filled with people, including a group of friends, a couple, and a group of tourists. The friends are discussing their plans for the" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + revision = "desc_act_true" + + model_q = GPTQModel.from_quantized( + model_id, + device="cuda:0", + backend=Backend.TRITON, + revision=revision, + + ) + for _, submodule in model_q.named_modules(): + if isinstance(submodule, TritonV2QuantLinear): + break + else: + raise ValueError("Did not find a tritonv2 linear layer") + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) + + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py new file mode 100644 index 00000000..410aa6e1 --- /dev/null +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -0,0 +1,107 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +# -- do not touch +import os + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import os # noqa: E402 +import unittest # noqa: E402 + +import torch # noqa: E402 +import torch.utils.benchmark as benchmark # noqa: E402 +from gptqmodel import Backend, GPTQModel # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +MODEL_ID = "TheBloke/Llama-7B-GPTQ" +DATASET_ID = "timdettmers/openassistant-guanaco" +LEARNING_RATE = 3e-5 +MAX_SEQ_LEN = 10 +BATCH_SIZE = 5 +NUM_TRAIN_STEPS = 10 + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +def benchmark_forward( + fn, + *inputs, + repeats="auto", + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + if verbose: + print(desc, "- Forward pass") + + def amp_wrapper(*inputs, **kwinputs): + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + fn(*inputs, **kwinputs) + + t = benchmark.Timer( + stmt="fn_amp(*inputs, **kwinputs)", + globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + if repeats == "auto": + m = t.blocked_autorange() + else: + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def get_model_and_tokenizer( + model_id=MODEL_ID, + **model_kwargs, +): + tokenizer = AutoTokenizer.from_pretrained( + MODEL_ID, + use_fast=True, + ) + if not tokenizer.pad_token_id: + tokenizer.pad_token_id = tokenizer.eos_token_id + + model = GPTQModel.from_quantized( + model_id, + disable_exllamav2=True, + disable_exllama=True, + **model_kwargs, + ) + + model.warmup_triton() + return model, tokenizer + + +class TestTriton(unittest.TestCase): + def test_triton_qlinear(self): + ref_model, _ = get_model_and_tokenizer( + model_id=MODEL_ID, + backend=Backend.TRITON, + ) + + hidden_size = ref_model.model.model.embed_tokens.weight.shape[1] + test_data = torch.randn((1, 2048, hidden_size), dtype=torch.float16).cuda() + + qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj + + ref_out = qlinear_ref(test_data) # noqa: F841 + + _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True) From 62424cb2265c03b8f56b271548d2550300483580 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 16:07:20 +0000 Subject: [PATCH 02/20] edited peft header --- .../src/gptqmodel/utils/peft.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py index 04f2c878..db27fa09 100644 --- a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/gptqmodel/utils/peft.py @@ -1,17 +1,25 @@ ############################################################################### -# Adapted from https://github.com/ModelCloud/GPTQModel -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Adapted from https://github.com/AutoGPTQ/AutoGPTQ +# MIT License +# Copyright (c) 2024 + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. ############################################################################### import warnings from contextlib import contextmanager From 6a2a9e08512600d92703e22614b8c8be72bbb995 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 30 Jun 2024 17:41:32 +0000 Subject: [PATCH 03/20] add package build workflow --- .github/workflows/build.yml | 55 +++++++++++++++++++++++++++ plugins/accelerated-peft/tox.ini | 5 +-- plugins/framework/tox.ini | 2 +- plugins/fused-ops-and-kernels/tox.ini | 2 +- 4 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..81848e62 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,55 @@ +on: + push: + branches: ["extracted_autogptq"] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + # - setup: "3.10" + # tox: "py310" + - setup: "3.9" + tox: "py39" + plugin_name: + - "framework" + - "accelerated-peft" + - "fused-ops-and-kernels" + + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + + steps: + - uses: actions/checkout@v4 + + # should only trigger the release if the plugin has changes + - uses: dorny/paths-filter@v2 + id: changes + with: + filters: | + plugins: + - 'plugins/${{ matrix.plugin_name }}/**' + - if: steps.changes.outputs.plugins == 'true' + name: Set up Python ${{ matrix.python-version.setup }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version.setup }} + + - if: steps.changes.outputs.plugins == 'true' + name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade tox + + - if: steps.changes.outputs.plugins == 'true' + name: Build and check wheel package + run: | + cd plugins/${{ matrix.plugin_name }} + tox -e build,twinecheck + ls dist/* + # - if: steps.changes.outputs.plugins == 'true' + # name: "Upload to PyPI" + # uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # v1.9.0 + # with: + # packages-dir: plugins/${{ matrix.plugin_name }}/dist \ No newline at end of file diff --git a/plugins/accelerated-peft/tox.ini b/plugins/accelerated-peft/tox.ini index c2b33772..cdcf221d 100644 --- a/plugins/accelerated-peft/tox.ini +++ b/plugins/accelerated-peft/tox.ini @@ -1,5 +1,5 @@ 
[tox] -envlist = py, lint +envlist = py, lint, fmt, build, twinecheck [testenv] deps = @@ -37,14 +37,13 @@ commands = black {posargs:.} isort {posargs:.} - [testenv:build] description = build wheel deps = build commands = python -m build -w skip_install = True - + [testenv:twinecheck] description = check wheel deps = diff --git a/plugins/framework/tox.ini b/plugins/framework/tox.ini index acfde95d..16764631 100644 --- a/plugins/framework/tox.ini +++ b/plugins/framework/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py, lint, fmt +envlist = py, lint, fmt, build, twinecheck [testenv] deps = diff --git a/plugins/fused-ops-and-kernels/tox.ini b/plugins/fused-ops-and-kernels/tox.ini index 6a667f53..c3a38721 100644 --- a/plugins/fused-ops-and-kernels/tox.ini +++ b/plugins/fused-ops-and-kernels/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py, lint +envlist = py, lint, fmt, build, twinecheck [testenv] deps = From 022d636eb395debeea9d27a8dde9ca54dc53774e Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Tue, 2 Jul 2024 03:01:36 +0000 Subject: [PATCH 04/20] add unit tests on extracted autogptq --- .../accelerated-peft/tests/test_gptqmodel.py | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 plugins/accelerated-peft/tests/test_gptqmodel.py diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py new file mode 100644 index 00000000..4e8e349c --- /dev/null +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -0,0 +1,223 @@ +from transformers.utils.import_utils import _is_package_available +import pytest # pylint: disable=import-error +import torch +from typing import List +from types import MethodType +from functools import partial +from peft import LoraConfig, prepare_model_for_kbit_training +from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ +from peft.tuners.lora.model import LoraModel + +GPTQ = "gptq" +# r, lora_alpha +FLOAT16 = "float16" +LORA_r = 8 +LORA_alpha = 1.0 +BS = 1 +SEQLEN = 128 + +ALLCLOSE_RTOL = 1e-3 +ALLCLOSE_ATOL = 1e-4 + +VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" +QUANTIZED_MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" +TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] + +def replace_module_peft(self, parent_module, child_name, new_module, old_module): + setattr(parent_module, child_name, new_module) + for name, module in new_module.named_modules(): + if "lora_" in name: + device = (list(old_module.parameters()) + list(old_module.buffers()))[0].device + module.to(device) + +def create_new_module_peft( + lora_config: LoraConfig, + adapter_name: str, + target: torch.nn.Module, + target_cls, + **kwargs, +): + new_module = None + if isinstance(target, target_cls): + new_module = LoraLinearGPTQ( + target, adapter_name, lora_config=lora_config, **kwargs + ) + return new_module + + +def get_autogptq_peft_model(model, peft_config): + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear + from auto_gptq.utils.peft_utils import GPTQLoraModel, get_gptq_peft_model + + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing=False, + gradient_checkpointing_kwargs={}, + ) + + _old_create_new_module = LoraModel._create_new_module + _old_replace_module = GPTQLoraModel._replace_module + _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) + LoraModel._create_new_module = staticmethod(_create_new_module) + GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + + # Install GPTQ adapters 
using the AutoGPTQ package (with the above patches) + model = get_gptq_peft_model( + model, + peft_config=peft_config, + auto_find_all_linears=peft_config.target_modules is None, + train_mode=True, # install adapaters for training + ) + + # undo the patching for hygine + LoraModel._create_new_module = staticmethod(_old_create_new_module) + GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + return model + +def get_autogptq_lib_quantized_model(model_name:str, target_modules:List, torch_dtype:str): + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + quantize_config = BaseQuantizeConfig.from_pretrained(model_name) + + device_map = { + "": ( + torch.cuda.current_device() + if torch.cuda.is_available() + else None + ) + } + model = AutoGPTQForCausalLM.from_quantized( + model_name, + quantize_config=quantize_config, + torch_dtype=getattr(torch, torch_dtype), + low_cpu_mem_usage=False, + use_marlin=False, + disable_exllama=True, + warmup_triton=False, + use_tritonv2=True, + trainable=True, + device_map=device_map, + ) + + peft_config = LoraConfig( + r=LORA_r, + lora_alpha=LORA_alpha, + lora_dropout=0.0, # anyway we are going to override it + target_modules=target_modules, + ) + # model = get_autogptq_peft_model(model, peft_config) + return model + +def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str): + from fms_acceleration_peft.framework_plugin_autogptq import ( + AutoGPTQAccelerationPlugin, + ) + plugins = { + GPTQ: AutoGPTQAccelerationPlugin( + { + "peft": { + "quantization": { + "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} + } + } + } + ), + } + + class TrainArgs: + gradient_checkpointing = False + gradient_checkpointing_kwargs = {} + + args = TrainArgs() + peft_config = LoraConfig( + r=LORA_r, + lora_alpha=LORA_alpha, + lora_dropout=0.0, # anyway we are going to override it + target_modules=target_modules, + ) + + _plugin = plugins[GPTQ] + model = _plugin.model_loader( + model_name, torch_dtype=getattr(torch, FLOAT16) + ) + # model, _ = _plugin.augmentation(model, args, (peft_config,)) + return model + +@pytest.fixture() +def input_ids(seed: int = 42, device: torch.device = "cuda"): + torch.manual_seed(seed) + yield torch.randint(0, 10000, (BS, SEQLEN)) + +@pytest.mark.skipif( + not _is_package_available("auto_gptq"), + reason="Only runs if auto_gptq is installed", +) +def test_already_quantized_outputs_match( + input_ids, seed: int = 42, +): + torch.manual_seed(seed) + original_model = get_autogptq_lib_quantized_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + refactored_model = load_autogptq_plugin_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + with torch.autocast(device_type='cuda', dtype=torch.float32): + original_model.eval() + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_model.eval() + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + + assert torch.allclose( + original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL + ), "Logits don't match between refactored quantized model and original library" + + +@pytest.mark.skipif( + not _is_package_available("auto_gptq"), + reason="Only runs if auto_gptq is installed", +) +def test_pretrained_to_quantized_outputs_match( + input_ids, seed: int = 42, +): + torch.manual_seed(seed) + from transformers import AutoTokenizer + from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig + from gptqmodel import GPTQModel, QuantizeConfig + device = 
torch.device("cuda") + + tokenizer = AutoTokenizer.from_pretrained(VANILLA_MODEL_NAME, use_fast=True) + calibration_dataset = [ + tokenizer( + "The world is a wonderful place full of beauty and love." + ) + ] + + original_quantize_config = BaseQuantizeConfig( + bits=4, + group_size=-1, + desc_act=False, + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + original_model = AutoGPTQForCausalLM.from_pretrained( + VANILLA_MODEL_NAME, + original_quantize_config + ).to(device) + # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" + original_model.quantize(calibration_dataset, use_triton=True) + + refactored_quant_config = QuantizeConfig( + bits=4, + group_size=-1, + desc_act=False, + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + refactored_model = GPTQModel.from_pretrained(VANILLA_MODEL_NAME, refactored_quant_config).to(device) + # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask" + refactored_model.quantize(calibration_dataset) + + with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.no_grad(): + original_model.eval() + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_model.eval() + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + + assert torch.allclose( + original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL + ), "Logits don't match between refactored quantized model and original library" \ No newline at end of file From 8976983f139f9694789bb1a6c0a7f30815b6b2fe Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Wed, 3 Jul 2024 07:20:03 +0000 Subject: [PATCH 05/20] modify autogptq plugin to support both external and extracted autogptq --- .github/workflows/build.yml | 55 --- plugins/accelerated-peft/pyproject.toml | 3 +- plugins/accelerated-peft/requirements.txt | 6 +- .../framework_plugin_autogptq.py | 172 ++++++---- .../accelerated-peft/tests/test_gptqmodel.py | 313 +++++++++--------- 5 files changed, 266 insertions(+), 283 deletions(-) delete mode 100644 .github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 81848e62..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,55 +0,0 @@ -on: - push: - branches: ["extracted_autogptq"] - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: - # - setup: "3.10" - # tox: "py310" - - setup: "3.9" - tox: "py39" - plugin_name: - - "framework" - - "accelerated-peft" - - "fused-ops-and-kernels" - - permissions: - id-token: write # IMPORTANT: this permission is mandatory for trusted publishing - - steps: - - uses: actions/checkout@v4 - - # should only trigger the release if the plugin has changes - - uses: dorny/paths-filter@v2 - id: changes - with: - filters: | - plugins: - - 'plugins/${{ matrix.plugin_name }}/**' - - if: steps.changes.outputs.plugins == 'true' - name: Set up Python ${{ matrix.python-version.setup }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version.setup }} - - - if: steps.changes.outputs.plugins == 'true' - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install --upgrade tox - - - if: steps.changes.outputs.plugins == 'true' - name: Build and check wheel package - run: | - cd plugins/${{ matrix.plugin_name }} - tox -e 
build,twinecheck - ls dist/* - # - if: steps.changes.outputs.plugins == 'true' - # name: "Upload to PyPI" - # uses: pypa/gh-action-pypi-publish@ec4db0b4ddc65acdf4bff5fa45ac92d78b56bdf0 # v1.9.0 - # with: - # packages-dir: plugins/${{ matrix.plugin_name }}/dist \ No newline at end of file diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index 35789df0..a2b2f12e 100644 --- a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -26,12 +26,13 @@ classifiers=[ [project.optional-dependencies] flash-attn = ["flash-attn"] +auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git"] [tool.hatch.metadata.hooks.requirements_txt] files = ["requirements.txt"] [tool.hatch.build.targets.wheel] -only-include = ["src/fms_acceleration_peft"] +only-include = ["src/"] [tool.hatch.metadata] allow-direct-references = true diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index 77db9597..3bcb995f 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -3,11 +3,7 @@ # put this in here because there is a breaking FSDP api change that # is fixed after peft > 0.10 -accelerate < 0.29 +accelerate <= 0.29 # bitsandbytes for the BNB plugin bitsandbytes - -# Installing from repository because "auto_gptq > 0.7.1" it not yet available -# Specifying the commit id here as recent commits to the main branch have introduced additional dependencies -auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git@ea829c7bbe83561c2b1de26795b6592992373ef7 \ No newline at end of file diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 7928d9a9..63254c25 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -20,6 +20,7 @@ from functools import partial from types import MethodType from typing import Dict, Tuple +import importlib import os # Third Party @@ -31,12 +32,11 @@ import torch import torch.distributed - class AutoGPTQAccelerationPlugin(AccelerationPlugin): - require_packages = ["auto_gptq"] + require_packages = [] - def __init__(self, configurations: Dict[str, Dict]): + def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = False): super().__init__(configurations) # just do checking, nothing must to configure at this point @@ -47,17 +47,25 @@ def __init__(self, configurations: Dict[str, Dict]): self._check_config_equal( key="peft.quantization.auto_gptq.from_quantized", value=True ) + self.use_external_lib = use_external_lib and importlib.util.find_spec("autogptq") is not None def model_loader(self, model_name: str, **kwargs): # guarded imports # Third Party - from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error - AutoGPTQForCausalLM, - BaseQuantizeConfig, - ) - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error - QuantLinear, - ) + if self.use_external_lib: + from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error + AutoGPTQForCausalLM as GPTQModel, + BaseQuantizeConfig as QuantizeConfig, + ) + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error + QuantLinear, + ) + else: + from gptqmodel import 
GPTQModel, QuantizeConfig + from gptqmodel.utils import Backend + from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + QuantLinear, + ) # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel @@ -85,7 +93,7 @@ def model_loader(self, model_name: str, **kwargs): # switching to cuda/cuda_old/triton backend." # assume model_name points to a quantized checkpoint. Thus we load the quantization # config directly from the checkpoint. - quantize_config = BaseQuantizeConfig.from_pretrained(model_name) + quantize_config = QuantizeConfig.from_pretrained(model_name) # get additional parameters torch_dtype = kwargs.get("torch_dtype", torch.float32) @@ -101,23 +109,39 @@ def model_loader(self, model_name: str, **kwargs): ) AutoModelForCausalLM.from_config = _from_config # patch + if self.use_external_lib: + kwargs = { + "low_cpu_mem_usage": low_cpu_mem_usage, + "use_marlin": False, # disable, cannot be used for training (no forward+backward) + "disable_exllama": True, # disable, cannot be used for training (no backward) + "use_tritonv2": True, + "trainable": True, # only support trainable mode + } + else: + kwargs = { + "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map + "backend": Backend.TRITON, + } + + # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate if is_fsdp_enabled(): - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) - low_cpu_mem_usage = True + if self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + kwargs["low_cpu_mem_usage"] = True # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. 
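# A minimal sketch of the device-map resolution used below, assuming `torch` is
# already imported and `low_cpu_mem_usage` holds the flag computed above (the
# plugin passes it through `kwargs["low_cpu_mem_usage"]`); names here are
# illustrative only:
#
#     if not torch.cuda.is_available():
#         device_map = {"": None}                         # no accelerator available
#     elif low_cpu_mem_usage:
#         device_map = {"": "cpu"}                        # FSDP low-cpu-mem path: materialize on CPU first
#     else:
#         device_map = {"": torch.cuda.current_device()}  # load directly onto the current GPU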
# device_map is for inference only @@ -130,7 +154,7 @@ def model_loader(self, model_name: str, **kwargs): # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 device_map = { "": ( - (torch.cuda.current_device() if not low_cpu_mem_usage else "cpu") + (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") if torch.cuda.is_available() else None ) @@ -138,17 +162,13 @@ def model_loader(self, model_name: str, **kwargs): # currently only enable triton_v2, because the triton kernels are the only ones # that have backwards - model = AutoGPTQForCausalLM.from_quantized( + model = GPTQModel.from_quantized( model_name, quantize_config=quantize_config, torch_dtype=torch_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - use_marlin=False, # disable, cannot be used for training (no forward+backward) - disable_exllama=True, # disable, cannot be used for training (no backward) - warmup_triton=False, # disable for now as it will try to run the warmup while on CPU - use_tritonv2=True, - trainable=True, # only support trainable mode device_map=device_map, + warmup_triton=False, # disable for now as it will try to run the warmup while on CPU + **kwargs, ) # https://github.com/foundation-model-stack/fms-acceleration/pull/15 @@ -219,19 +239,22 @@ def augmentation( ): # guarded imports # Third Party - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error - QuantLinear, - ) - from auto_gptq.utils.peft_utils import ( # pylint: disable=import-outside-toplevel,import-error - GPTQLoraModel, - get_gptq_peft_model, - ) + if self.use_external_lib: + from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error + QuantLinear, + ) + from auto_gptq.utils.peft_utils import ( # pylint: disable=import-outside-toplevel,import-error + GPTQLoraModel, + get_gptq_peft_model, + ) + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + create_new_module_peft, + replace_module_peft, + ) + else: + from gptqmodel.utils.peft import get_gptq_peft_model - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - create_new_module_peft, - replace_module_peft, - ) (peft_config,) = modifiable_args # unpack modifiable args @@ -249,31 +272,33 @@ def augmentation( gradient_checkpointing_kwargs=train_args.gradient_checkpointing_kwargs, ) - # These functions need to replaced due to some incompatibliites - # with newer PEFT packages. - # - on augmentation we call auto_gptq.utils.peft_utils.get_gptq_peft_model - # - this internally calls peft.utils.other.get_peft_model - # - however the problem is that peft API moves very fast, and there are incompatiblities - # - # During peft wrapping there are two key operations - # 1. LoraModel._create_new_module is called to create a LoraLinear layer that is - # compatible with the base layer. For quantized base layers, the LoraLinear - # may be different. - # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. - # Also move to device (which may depend on how base layer is implemented) - - # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called - # on the parent. Hence _create_new_module is patched on the parent - - # FIXME: - # 1. 
investigate using BaseGPTQForCausalLM.make_sure_compatible_with_peft - # to see if we can get around the patching - - _old_create_new_module = LoraModel._create_new_module - _old_replace_module = GPTQLoraModel._replace_module - _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) - LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + if self.use_external_lib: + # These functions need to replaced due to some incompatibliites + # with newer PEFT packages. + # - on augmentation we call auto_gptq.utils.peft_utils.get_gptq_peft_model + # - this internally calls peft.utils.other.get_peft_model + # - however the problem is that peft API moves very fast, and there are incompatiblities + # + # During peft wrapping there are two key operations + # 1. LoraModel._create_new_module is called to create a LoraLinear layer that is + # compatible with the base layer. For quantized base layers, the LoraLinear + # may be different. + # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. + # Also move to device (which may depend on how base layer is implemented) + + # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called + # on the parent. Hence _create_new_module is patched on the parent + + # FIXME: + # 1. investigate using BaseGPTQForCausalLM.make_sure_compatible_with_peft + # to see if we can get around the patching + + _old_create_new_module = LoraModel._create_new_module + _old_replace_module = GPTQLoraModel._replace_module + _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) + LoraModel._create_new_module = staticmethod(_create_new_module) + GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) + # Install GPTQ adapters using the AutoGPTQ package (with the above patches) model = get_gptq_peft_model( @@ -284,9 +309,10 @@ def augmentation( ) modifiable_args = (None,) # return a None for peft_config - # undo the patching for hygine - LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + if self.use_external_lib: + # undo the patching for hygine + LoraModel._create_new_module = staticmethod(_old_create_new_module) + GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) return model, modifiable_args diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 4e8e349c..84581da1 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -1,12 +1,10 @@ -from transformers.utils.import_utils import _is_package_available import pytest # pylint: disable=import-error import torch from typing import List -from types import MethodType -from functools import partial -from peft import LoraConfig, prepare_model_for_kbit_training -from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ -from peft.tuners.lora.model import LoraModel + +from transformers.utils.import_utils import _is_package_available +from transformers import AutoTokenizer, AutoConfig, GenerationConfig, AutoModelForCausalLM +from peft import LoraConfig GPTQ = "gptq" # r, lora_alpha @@ -16,112 +14,30 @@ BS = 1 SEQLEN = 128 +LOSS_TOLERANCE = 1e-3 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" QUANTIZED_MODEL_NAME = 
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] - -def replace_module_peft(self, parent_module, child_name, new_module, old_module): - setattr(parent_module, child_name, new_module) - for name, module in new_module.named_modules(): - if "lora_" in name: - device = (list(old_module.parameters()) + list(old_module.buffers()))[0].device - module.to(device) - -def create_new_module_peft( - lora_config: LoraConfig, - adapter_name: str, - target: torch.nn.Module, - target_cls, - **kwargs, -): - new_module = None - if isinstance(target, target_cls): - new_module = LoraLinearGPTQ( - target, adapter_name, lora_config=lora_config, **kwargs - ) - return new_module - - -def get_autogptq_peft_model(model, peft_config): - from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear - from auto_gptq.utils.peft_utils import GPTQLoraModel, get_gptq_peft_model - - model = prepare_model_for_kbit_training( - model, - use_gradient_checkpointing=False, - gradient_checkpointing_kwargs={}, - ) - - _old_create_new_module = LoraModel._create_new_module - _old_replace_module = GPTQLoraModel._replace_module - _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) - LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) - - # Install GPTQ adapters using the AutoGPTQ package (with the above patches) - model = get_gptq_peft_model( - model, - peft_config=peft_config, - auto_find_all_linears=peft_config.target_modules is None, - train_mode=True, # install adapaters for training - ) - - # undo the patching for hygine - LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) - return model - -def get_autogptq_lib_quantized_model(model_name:str, target_modules:List, torch_dtype:str): - from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - quantize_config = BaseQuantizeConfig.from_pretrained(model_name) - - device_map = { - "": ( - torch.cuda.current_device() - if torch.cuda.is_available() - else None - ) - } - model = AutoGPTQForCausalLM.from_quantized( - model_name, - quantize_config=quantize_config, - torch_dtype=getattr(torch, torch_dtype), - low_cpu_mem_usage=False, - use_marlin=False, - disable_exllama=True, - warmup_triton=False, - use_tritonv2=True, - trainable=True, - device_map=device_map, - ) - - peft_config = LoraConfig( - r=LORA_r, - lora_alpha=LORA_alpha, - lora_dropout=0.0, # anyway we are going to override it - target_modules=target_modules, - ) - # model = get_autogptq_peft_model(model, peft_config) - return model -def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str): +# Model loading function for quantized models +def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str, use_external_lib:bool = False): from fms_acceleration_peft.framework_plugin_autogptq import ( AutoGPTQAccelerationPlugin, ) - plugins = { - GPTQ: AutoGPTQAccelerationPlugin( + + _plugin = AutoGPTQAccelerationPlugin( { "peft": { "quantization": { "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} } } - } - ), - } + }, + use_external_lib = use_external_lib, + ) class TrainArgs: gradient_checkpointing = False @@ -135,89 +51,188 @@ class TrainArgs: target_modules=target_modules, ) - _plugin = plugins[GPTQ] model = _plugin.model_loader( - model_name, torch_dtype=getattr(torch, FLOAT16) + model_name, 
torch_dtype=getattr(torch, torch_dtype) ) - # model, _ = _plugin.augmentation(model, args, (peft_config,)) + model, _ = _plugin.augmentation(model, args, (peft_config,)) + model.eval() return model +# quantization function to manage the loading and quantizing of pretrained model +# using external or local autogptq +def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, use_external_lib=False): + if use_external_lib: + from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig + quantize_kwargs = {"use_triton": True} + else: + from gptqmodel import GPTQModel, QuantizeConfig + quantize_kwargs = {} + + quantize_config = QuantizeConfig( + **quant_config_kwargs + ) + # load un-quantized model, by default, the model will always be loaded into CPU memory + model = GPTQModel.from_pretrained( + model_name, + quantize_config = quantize_config, + config = config, + ).to(device) + # quantize model, the examples should be list of dict whose keys can only be "input_ids" + model.quantize(calibration_dataset, **quantize_kwargs) + model.eval() + return model + +def get_wikitext2(tokenizer, num_samples=128, seqlen=128): + import random + import numpy as np + import torch + from datasets import load_dataset + wikidata = load_dataset('wikitext', 'wikitext-2-v1', split='test') + wikilist = [' \n' if s == '' else s for s in wikidata['text'] ] + + text = ''.join(wikilist) + trainenc = tokenizer(text, return_tensors='pt') + + random.seed(0) + np.random.seed(0) + torch.random.manual_seed(0) + + traindataset = [] + + for _ in range(num_samples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + attention_mask = torch.ones_like(inp) + traindataset.append({'input_ids':inp,'attention_mask': attention_mask}) + return traindataset + @pytest.fixture() def input_ids(seed: int = 42, device: torch.device = "cuda"): torch.manual_seed(seed) - yield torch.randint(0, 10000, (BS, SEQLEN)) + yield torch.randint(0, 10000, (BS, SEQLEN), device=device) @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) -def test_already_quantized_outputs_match( +def test_pre_quantized_model_outputs_match( input_ids, seed: int = 42, ): + """ + Test for output equivalence when loading quantized models between + extracted gptq library against original autogptq library + """ torch.manual_seed(seed) - original_model = get_autogptq_lib_quantized_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) - refactored_model = load_autogptq_plugin_model(QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16) + original_model = load_autogptq_plugin_model( + QUANTIZED_MODEL_NAME, + TARGET_MODULES, FLOAT16, + use_external_lib=True + ) + refactored_model = load_autogptq_plugin_model( + QUANTIZED_MODEL_NAME, + TARGET_MODULES, + FLOAT16 + ) with torch.autocast(device_type='cuda', dtype=torch.float32): - original_model.eval() - original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_model.eval() - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + with torch.no_grad(): + original_logits = original_model(input_ids.to(original_model.device)).logits + refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits assert torch.allclose( original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL - ), "Logits don't match between refactored quantized model and original library" - + ), 
"Pre-quantized model logits don't match between extracted and external autogptq library" @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) -def test_pretrained_to_quantized_outputs_match( +def test_quantizing_pretrained_model_outputs_match( input_ids, seed: int = 42, ): + """ + Test for regression of quantizing pretrained models + with refactored gptq library against original autogptq library + by calculating KL loss on the output logits of both variants + """ torch.manual_seed(seed) - from transformers import AutoTokenizer - from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig - from gptqmodel import GPTQModel, QuantizeConfig - device = torch.device("cuda") - + # Initialize common arguments + device = input_ids.device tokenizer = AutoTokenizer.from_pretrained(VANILLA_MODEL_NAME, use_fast=True) - calibration_dataset = [ - tokenizer( - "The world is a wonderful place full of beauty and love." - ) - ] + config = AutoConfig.from_pretrained(VANILLA_MODEL_NAME) + config.num_hidden_layers = 2 + # calibration_dataset = [ + # tokenizer( + # "The world is a wonderful place full of beauty and love." + # ) + # ] + calibration_dataset = get_wikitext2(tokenizer, num_samples=128, seqlen=128) + quant_config_kwargs = { + "bits": 4, + "group_size": -1, + "desc_act": True, + "damp_percent": 0.1, + "static_groups": False, + "sym": True, + "true_sequential": True, + } - original_quantize_config = BaseQuantizeConfig( - bits=4, - group_size=-1, - desc_act=False, + # quantize models for external autogptq lib and extracted gptq lib + original_model = quantize_model( + VANILLA_MODEL_NAME, + config, + calibration_dataset, + quant_config_kwargs, + device, + use_external_lib=True ) - # load un-quantized model, by default, the model will always be loaded into CPU memory - original_model = AutoGPTQForCausalLM.from_pretrained( + refactored_model = quantize_model( VANILLA_MODEL_NAME, - original_quantize_config - ).to(device) - # quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask" - original_model.quantize(calibration_dataset, use_triton=True) - - refactored_quant_config = QuantizeConfig( - bits=4, - group_size=-1, - desc_act=False, + config, + calibration_dataset, + quant_config_kwargs, + device, + use_external_lib=False ) - # load un-quantized model, by default, the model will always be loaded into CPU memory - refactored_model = GPTQModel.from_pretrained(VANILLA_MODEL_NAME, refactored_quant_config).to(device) - # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask" - refactored_model.quantize(calibration_dataset) - with torch.autocast(device_type='cuda', dtype=torch.float16): - with torch.no_grad(): - original_model.eval() - original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_model.eval() - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits - - assert torch.allclose( - original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL - ), "Logits don't match between refactored quantized model and original library" \ No newline at end of file + # compare generated tokens between + # unquantized, original library and refactored gptqmodel library + unquantized_model = AutoModelForCausalLM.from_pretrained( + VANILLA_MODEL_NAME, + config=config + ).to(device) + gen_config = GenerationConfig.from_pretrained(VANILLA_MODEL_NAME) + gen_config.max_new_tokens = 5 + 
_inputs = torch.tensor([tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda") + output1 = tokenizer.decode( + original_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + output2 = tokenizer.decode( + refactored_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + output3 = tokenizer.decode( + unquantized_model.generate( + inputs=_inputs, generation_config=gen_config + ).view(-1), skip_special_tokens=True + ) + assert output1==output2==output3, f"generated tokens ({output1}, {output2}, {output3}) \ + don't match between both libraries after quantization" + + # compare prob. distributions between original library and refactored gptqmodel library + with torch.no_grad(): + original_logits = original_model(input_ids).logits + refactored_logits = refactored_model(input_ids).logits + + # Measure the distribution error with KD Loss + loss_fn = torch.nn.KLDivLoss(reduction="mean") + # input should be a distribution in the log space + input = torch.nn.functional.log_softmax(refactored_logits, dim=1) + # target must be prob distribution + target = torch.nn.functional.softmax(original_logits, dim=1) + error = loss_fn(input, target) + assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ + after quantization" From f510537483c2787909cdbb3b27c9d1855900f060 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Wed, 3 Jul 2024 19:06:02 +0000 Subject: [PATCH 06/20] addressed additional PR changes --- plugins/accelerated-peft/pyproject.toml | 4 +- plugins/accelerated-peft/requirements.txt | 5 +-- .../framework_plugin_autogptq.py | 44 ++++++++++--------- .../gptqmodel/__init__.py | 18 ++++++++ .../gptqmodel/models/__init__.py | 25 +++++++++++ .../gptqmodel/models/_const.py | 0 .../gptqmodel/models/auto.py | 0 .../gptqmodel/models/base.py | 0 .../gptqmodel/models/dbrx.py | 0 .../gptqmodel/models/dbrx_converted.py | 0 .../gptqmodel/models/gemma.py | 0 .../gptqmodel/models/gpt_bigcode.py | 0 .../gptqmodel/models/gpt_neox.py | 0 .../gptqmodel/models/llama.py | 0 .../gptqmodel/models/mistral.py | 0 .../gptqmodel/models/mixtral.py | 0 .../gptqmodel/nn_modules/__init__.py | 15 +++++++ .../gptqmodel/nn_modules/qlinear/__init__.py | 0 .../nn_modules/qlinear/qlinear_tritonv2.py | 0 .../nn_modules/triton_utils}/__init__.py | 0 .../triton_utils/custom_autotune.py | 0 .../nn_modules/triton_utils/dequant.py | 0 .../nn_modules/triton_utils/kernels.py | 0 .../nn_modules/triton_utils/mixin.py | 0 .../gptqmodel/quantization/__init__.py | 19 ++++++++ .../gptqmodel/quantization/config.py | 0 .../gptqmodel/quantization/gptq.py | 0 .../gptqmodel/quantization/quantizer.py | 0 .../gptqmodel/utils/__init__.py | 16 +++++++ .../gptqmodel/utils/backend.py | 0 .../gptqmodel/utils/data.py | 0 .../gptqmodel/utils/importer.py | 0 .../gptqmodel/utils/model.py | 0 .../gptqmodel/utils/peft.py | 13 +++--- .../src/gptqmodel/__init__.py | 3 -- .../src/gptqmodel/models/__init__.py | 10 ----- .../nn_modules/triton_utils/__init__.py | 0 .../src/gptqmodel/quantization/__init__.py | 4 -- .../src/gptqmodel/utils/__init__.py | 1 - .../accelerated-peft/tests/test_gptqmodel.py | 28 ++++++++++-- 40 files changed, 151 insertions(+), 54 deletions(-) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py rename plugins/accelerated-peft/src/{ => 
fms_acceleration_peft}/gptqmodel/models/_const.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/auto.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/base.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/dbrx.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/dbrx_converted.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gemma.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gpt_bigcode.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/gpt_neox.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/llama.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/mistral.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/models/mixtral.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/qlinear/__init__.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py (100%) rename plugins/accelerated-peft/src/{gptqmodel/nn_modules => fms_acceleration_peft/gptqmodel/nn_modules/triton_utils}/__init__.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/custom_autotune.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/dequant.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/kernels.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/nn_modules/triton_utils/mixin.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/config.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/gptq.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/quantization/quantizer.py (100%) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/backend.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/data.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/importer.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/model.py (100%) rename plugins/accelerated-peft/src/{ => fms_acceleration_peft}/gptqmodel/utils/peft.py (93%) delete mode 100644 plugins/accelerated-peft/src/gptqmodel/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/models/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py delete mode 100644 plugins/accelerated-peft/src/gptqmodel/utils/__init__.py diff --git a/plugins/accelerated-peft/pyproject.toml b/plugins/accelerated-peft/pyproject.toml index a2b2f12e..d586a0db 100644 --- 
a/plugins/accelerated-peft/pyproject.toml +++ b/plugins/accelerated-peft/pyproject.toml @@ -26,13 +26,13 @@ classifiers=[ [project.optional-dependencies] flash-attn = ["flash-attn"] -auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git"] +auto_gptq = ["auto_gptq @ git+https://github.com/AutoGPTQ/AutoGPTQ.git@ea829c7bbe83561c2b1de26795b6592992373ef7"] # known working commitid [tool.hatch.metadata.hooks.requirements_txt] files = ["requirements.txt"] [tool.hatch.build.targets.wheel] -only-include = ["src/"] +only-include = ["src/fms_acceleration_peft"] [tool.hatch.metadata] allow-direct-references = true diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index 3bcb995f..a00233d3 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -1,9 +1,8 @@ # decide not to have this as an requirement for now # fms_acceleration @ git+https://github.com/foundation-model-stack/fms-acceleration.git#subdirectory=plugins/framework -# put this in here because there is a breaking FSDP api change that -# is fixed after peft > 0.10 -accelerate <= 0.29 +# Needs a lower bound due to`accelerate.load_checkpoint_in_model` function used in gptqmodel +accelerate >= 0.29 # bitsandbytes for the BNB plugin bitsandbytes diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 63254c25..efa82c45 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -47,7 +47,10 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self._check_config_equal( key="peft.quantization.auto_gptq.from_quantized", value=True ) - self.use_external_lib = use_external_lib and importlib.util.find_spec("autogptq") is not None + self.use_external_lib = use_external_lib + + if self.use_external_lib: + assert importlib.util.find_spec("auto_gptq") is not None, "Unable to use external library, autogptq module not found." 
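# A brief usage sketch of the two loading paths this flag selects (the
# configuration dict mirrors the one exercised in tests/test_gptqmodel.py and
# is illustrative rather than canonical):
#
#     from fms_acceleration_peft.framework_plugin_autogptq import (
#         AutoGPTQAccelerationPlugin,
#     )
#
#     config = {
#         "peft": {
#             "quantization": {
#                 "auto_gptq": {"kernel": "triton_v2", "from_quantized": True}
#             }
#         }
#     }
#     plugin = AutoGPTQAccelerationPlugin(config)  # extracted gptqmodel path, no extra dependency
#     plugin_ext = AutoGPTQAccelerationPlugin(config, use_external_lib=True)  # requires auto_gptq installed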
def model_loader(self, model_name: str, **kwargs): # guarded imports @@ -61,9 +64,9 @@ def model_loader(self, model_name: str, **kwargs): QuantLinear, ) else: - from gptqmodel import GPTQModel, QuantizeConfig - from gptqmodel.utils import Backend - from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + from .gptqmodel import GPTQModel, QuantizeConfig + from .gptqmodel.utils import Backend + from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( QuantLinear, ) @@ -126,22 +129,21 @@ def model_loader(self, model_name: str, **kwargs): # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate - if is_fsdp_enabled(): - if self.use_external_lib: - # Local - from .autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) - kwargs["low_cpu_mem_usage"] = True + if is_fsdp_enabled() and self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + kwargs["low_cpu_mem_usage"] = True # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. # device_map is for inference only @@ -253,7 +255,7 @@ def augmentation( replace_module_peft, ) else: - from gptqmodel.utils.peft import get_gptq_peft_model + from .gptqmodel.utils.peft import get_gptq_peft_model (peft_config,) = modifiable_args # unpack modifiable args diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py new file mode 100644 index 00000000..dd205fa8 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py @@ -0,0 +1,18 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .models import GPTQModel +from .quantization import BaseQuantizeConfig, QuantizeConfig +from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py new file mode 100644 index 00000000..ac31ca08 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -0,0 +1,25 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .auto import MODEL_MAP, GPTQModel +from .base import BaseGPTQModel +from .dbrx import DbrxGPTQ +from .dbrx_converted import DbrxConvertedGPTQ +from .gemma import GemmaGPTQ +from .gpt_bigcode import GPTBigCodeGPTQ +from .gpt_neox import GPTNeoXGPTQ +from .llama import LlamaGPTQ +from .mistral import MistralGPTQ +from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/_const.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/auto.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/base.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/dbrx.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/dbrx_converted.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py similarity index 100% rename from 
plugins/accelerated-peft/src/gptqmodel/models/gemma.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/gpt_bigcode.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/gpt_neox.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/llama.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/mistral.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py diff --git a/plugins/accelerated-peft/src/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/models/mixtral.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py new file mode 100644 index 00000000..feb7a9e6 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py @@ -0,0 +1,15 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/__init__.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/__init__.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/__init__.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/__init__.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/custom_autotune.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/dequant.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/kernels.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/mixin.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/mixin.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/mixin.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py new file mode 100644 index 00000000..377ffb63 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py @@ -0,0 +1,19 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################### +from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .gptq import GPTQ +from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/config.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/gptq.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/quantization/quantizer.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py new file mode 100644 index 00000000..668d0859 --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py @@ -0,0 +1,16 @@ +############################################################################### +# Adapted from https://github.com/ModelCloud/GPTQModel +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################### +from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/backend.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/data.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/importer.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py similarity index 100% rename from plugins/accelerated-peft/src/gptqmodel/utils/model.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py similarity index 93% rename from plugins/accelerated-peft/src/gptqmodel/utils/peft.py rename to plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py index db27fa09..e3257d80 100644 --- a/plugins/accelerated-peft/src/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py @@ -21,7 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. ############################################################################### -import warnings from contextlib import contextmanager from typing import List, Optional, Tuple, Union @@ -29,7 +28,7 @@ from peft import PeftConfig, PeftModel, PeftType, get_peft_model from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING -from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel +from peft.tuners.lora import LoraConfig, LoraModel from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ from ..models.base import BaseGPTQModel @@ -100,7 +99,6 @@ def find_all_linear_names( results.add(res) return list(results) - @contextmanager def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig @@ -139,16 +137,19 @@ def get_gptq_peft_model( if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): peft_config = GPTQLoraConfig(**peft_config.to_dict()) + # this hijack is needed as `get_peft_model` uses PEFTModelForCausalLM which inherits from + # PEFTModel and it in turn relies on PEFT_TYPE_TO_MODEL_MAPPING to initialize its base LoraModel with hijack_peft_mappings(): try: if train_mode: peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) else: peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) - except: + except Exception as exc: raise NotImplementedError( - f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet." 
- ) + f"{model.__class__.__name__} not support \ + {peft_config.peft_type.value} peft type yet." + ) from exc return peft_model diff --git a/plugins/accelerated-peft/src/gptqmodel/__init__.py b/plugins/accelerated-peft/src/gptqmodel/__init__.py deleted file mode 100644 index 2808d505..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .models import GPTQModel -from .quantization import BaseQuantizeConfig, QuantizeConfig -from .utils import Backend, get_backend \ No newline at end of file diff --git a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/gptqmodel/models/__init__.py deleted file mode 100644 index 5496b45e..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/models/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .auto import MODEL_MAP, GPTQModel -from .base import BaseGPTQModel -from .dbrx import DbrxGPTQ -from .dbrx_converted import DbrxConvertedGPTQ -from .gemma import GemmaGPTQ -from .gpt_bigcode import GPTBigCodeGPTQ -from .gpt_neox import GPTNeoXGPTQ -from .llama import LlamaGPTQ -from .mistral import MistralGPTQ -from .mixtral import MixtralGPTQ diff --git a/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/nn_modules/triton_utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py deleted file mode 100644 index ca86e26f..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/quantization/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) -from .gptq import GPTQ -from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py deleted file mode 100644 index 0e79e1d3..00000000 --- a/plugins/accelerated-peft/src/gptqmodel/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 84581da1..9157ccf7 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -1,3 +1,20 @@ +# Copyright The IBM Tuning Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# SPDX-License-Identifier: Apache-2.0 +# https://spdx.dev/learn/handling-license-info/ + import pytest # pylint: disable=import-error import torch from typing import List @@ -14,7 +31,7 @@ BS = 1 SEQLEN = 128 -LOSS_TOLERANCE = 1e-3 +LOSS_TOLERANCE = 0.1 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 @@ -60,12 +77,12 @@ class TrainArgs: # quantization function to manage the loading and quantizing of pretrained model # using external or local autogptq -def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, use_external_lib=False): +def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, torch_dtype, use_external_lib=False): if use_external_lib: from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig quantize_kwargs = {"use_triton": True} else: - from gptqmodel import GPTQModel, QuantizeConfig + from fms_acceleration_peft.gptqmodel import GPTQModel, QuantizeConfig quantize_kwargs = {} quantize_config = QuantizeConfig( @@ -76,6 +93,7 @@ def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, model_name, quantize_config = quantize_config, config = config, + torch_dtype = getattr(torch, torch_dtype), ).to(device) # quantize model, the examples should be list of dict whose keys can only be "input_ids" model.quantize(calibration_dataset, **quantize_kwargs) @@ -184,6 +202,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset, quant_config_kwargs, device, + FLOAT16, use_external_lib=True ) refactored_model = quantize_model( @@ -192,6 +211,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset, quant_config_kwargs, device, + FLOAT16, use_external_lib=False ) @@ -228,7 +248,7 @@ def test_quantizing_pretrained_model_outputs_match( refactored_logits = refactored_model(input_ids).logits # Measure the distribution error with KD Loss - loss_fn = torch.nn.KLDivLoss(reduction="mean") + loss_fn = torch.nn.KLDivLoss(reduction="batchmean") # input should be a distribution in the log space input = torch.nn.functional.log_softmax(refactored_logits, dim=1) # target must be prob distribution From 8e18154a404ad3c71bc9b8f9739809fad62ccd7c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 08:11:37 +0000 Subject: [PATCH 07/20] reintroduce support for low_cpu_mem_usage in extracted lib --- .../framework_plugin_autogptq.py | 65 +++++++++---------- .../gptqmodel/models/base.py | 10 ++- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index efa82c45..2a880ee5 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -100,7 +100,7 @@ def model_loader(self, model_name: str, **kwargs): # get additional parameters torch_dtype = kwargs.get("torch_dtype", torch.float32) - low_cpu_mem_usage = kwargs.get("low_cpu_mem_usage") + low_cpu_mem_usage = kwargs.get("low_cpu_mem_usage", False) attn_implementation = kwargs.get("attn_implementation") # there are some kwargs that we wont be passed to AutoModel, so we need @@ -129,38 +129,38 @@ def model_loader(self, model_name: str, **kwargs): # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate - if is_fsdp_enabled() and self.use_external_lib: - # Local - from 
.autogptq_utils import ( # pylint: disable=import-outside-toplevel - _patch_target_module, - make_sure_no_tensor_in_meta_device, - ) - - # We patch `make_sure_no_tensor_in_meta_device` - # from autogptq to avoid errors on models without bias - _patch_target_module( - to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", - replace_with=make_sure_no_tensor_in_meta_device, - target_module="auto_gptq.modeling._base", - ) + if is_fsdp_enabled(): kwargs["low_cpu_mem_usage"] = True - - # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. - # device_map is for inference only - # https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference - # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" - # to avoid gpu consumption before train - # This approach will divert consumption to cpu memory, - # a better approach would be to load the checkpoints to meta device - # QLoRA is currently implemented by the former approach and will encounter the same issue. - # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 - device_map = { - "": ( - (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") - if torch.cuda.is_available() - else None - ) - } + if self.use_external_lib: + # Local + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + _patch_target_module, + make_sure_no_tensor_in_meta_device, + ) + + # We patch `make_sure_no_tensor_in_meta_device` + # from autogptq to avoid errors on models without bias + _patch_target_module( + to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device", + replace_with=make_sure_no_tensor_in_meta_device, + target_module="auto_gptq.modeling._base", + ) + + # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. + # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" + # to avoid gpu consumption before train + # This approach will divert consumption to cpu memory, + # a better approach would be to load the checkpoints to meta device + # QLoRA is currently implemented by the former approach and will encounter the same issue. 
+ # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 + + kwargs["device_map"] = { + "": ( + (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") + if torch.cuda.is_available() + else None + ) + } # currently only enable triton_v2, because the triton kernels are the only ones # that have backwards @@ -168,7 +168,6 @@ def model_loader(self, model_name: str, **kwargs): model_name, quantize_config=quantize_config, torch_dtype=torch_dtype, - device_map=device_map, warmup_triton=False, # disable for now as it will try to run the warmup while on CPU **kwargs, ) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index d1010a39..e3ca3938 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -774,7 +774,7 @@ def from_quantized( device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, max_memory: Optional[dict] = None, device: Optional[Union[str, int]] = None, - + low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, torch_dtype: [str | torch.dtype] = "auto", @@ -909,6 +909,8 @@ def skip(*args, **kwargs): transformers.modeling_utils._init_weights = False init_contexts = [no_init_weights()] + if low_cpu_mem_usage: + init_contexts.append(accelerate.init_empty_weights(include_buffers=False)) with ContextManagers(init_contexts): model = AutoModelForCausalLM.from_config( @@ -982,6 +984,11 @@ def skip(*args, **kwargs): no_split_module_classes=[cls.layer_type], ) + if low_cpu_mem_usage: + # set device_map on so `dispatch_model` initializes weights on cpu until accelerator + # prepares the model on gpu in `trainer.train` to avoid unnecessary gpu usage + device_map = {"": "cpu"} + load_checkpoint_in_model = False # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format if quantize_config.format == FORMAT.GPTQ: @@ -1019,7 +1026,6 @@ def skip(*args, **kwargs): offload_state_dict=True, offload_buffers=True, ) - # TODO: Why are we using this custom function and not dispatch_model? 
model = simple_dispatch_model(model, device_map) From b41d141eeee923256a0d6a216663792752a622c6 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 08:59:55 +0000 Subject: [PATCH 08/20] Use transformers package checking instead of importlib --- .../fms_acceleration_peft/framework_plugin_autogptq.py | 4 ++-- plugins/accelerated-peft/tests/test_gptqmodel.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 2a880ee5..914e7d39 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -20,7 +20,6 @@ from functools import partial from types import MethodType from typing import Dict, Tuple -import importlib import os # Third Party @@ -28,6 +27,7 @@ from peft import LoraConfig, prepare_model_for_kbit_training from peft.tuners.lora.model import LoraModel from transformers import AutoModelForCausalLM, TrainingArguments +from transformers.utils.import_utils import _is_package_available from transformers.modeling_utils import is_fsdp_enabled import torch import torch.distributed @@ -50,7 +50,7 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self.use_external_lib = use_external_lib if self.use_external_lib: - assert importlib.util.find_spec("auto_gptq") is not None, "Unable to use external library, autogptq module not found." + assert _is_package_available("auto_gptq") is True, "Unable to use external library, autogptq module not found." def model_loader(self, model_name: str, **kwargs): # guarded imports diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 9157ccf7..dbab2821 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -31,7 +31,7 @@ BS = 1 SEQLEN = 128 -LOSS_TOLERANCE = 0.1 +LOSS_TOLERANCE = 1e-3 ALLCLOSE_RTOL = 1e-3 ALLCLOSE_ATOL = 1e-4 @@ -248,11 +248,15 @@ def test_quantizing_pretrained_model_outputs_match( refactored_logits = refactored_model(input_ids).logits # Measure the distribution error with KD Loss + # flatten as a single batch bs*seqlen + # since batchmean sums the loss and averages on dim=0 loss_fn = torch.nn.KLDivLoss(reduction="batchmean") # input should be a distribution in the log space - input = torch.nn.functional.log_softmax(refactored_logits, dim=1) + input = torch.nn.functional.log_softmax(refactored_logits, dim=-1) + input = torch.flatten(input, start_dim=0, end_dim=1) # target must be prob distribution - target = torch.nn.functional.softmax(original_logits, dim=1) + target = torch.nn.functional.softmax(original_logits, dim=-1) + target = torch.flatten(target, start_dim=0, end_dim=1) error = loss_fn(input, target) assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ after quantization" From b6282e1132880313951ffcbb183b9e716b5725cb Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 4 Jul 2024 18:44:20 +0000 Subject: [PATCH 09/20] formatting --- .../framework_plugin_autogptq.py | 37 ++- .../gptqmodel/__init__.py | 3 +- .../gptqmodel/models/__init__.py | 1 + .../gptqmodel/models/_const.py | 3 +- .../gptqmodel/models/auto.py | 17 +- .../gptqmodel/models/base.py | 294 +++++++++++++----- .../gptqmodel/models/dbrx.py | 5 +- 
.../gptqmodel/models/dbrx_converted.py | 57 ++-- .../gptqmodel/models/gemma.py | 1 + .../gptqmodel/models/gpt_bigcode.py | 1 + .../gptqmodel/models/gpt_neox.py | 2 +- .../gptqmodel/models/llama.py | 1 + .../gptqmodel/models/mistral.py | 1 + .../gptqmodel/models/mixtral.py | 1 + .../gptqmodel/nn_modules/__init__.py | 2 +- .../gptqmodel/nn_modules/qlinear/__init__.py | 10 +- .../nn_modules/qlinear/qlinear_tritonv2.py | 43 ++- .../triton_utils/custom_autotune.py | 21 +- .../nn_modules/triton_utils/dequant.py | 16 +- .../nn_modules/triton_utils/kernels.py | 58 +++- .../gptqmodel/quantization/__init__.py | 13 +- .../gptqmodel/quantization/config.py | 50 ++- .../gptqmodel/quantization/gptq.py | 18 +- .../gptqmodel/quantization/quantizer.py | 2 + .../gptqmodel/utils/__init__.py | 1 + .../gptqmodel/utils/backend.py | 2 + .../gptqmodel/utils/data.py | 52 +++- .../gptqmodel/utils/importer.py | 30 +- .../gptqmodel/utils/model.py | 144 ++++++--- .../gptqmodel/utils/peft.py | 32 +- .../accelerated-peft/tests/test_gptqmodel.py | 182 ++++++----- .../accelerated-peft/tests/test_q4_triton.py | 31 +- plugins/accelerated-peft/tests/test_triton.py | 13 +- 33 files changed, 803 insertions(+), 341 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 914e7d39..8bcd0bb9 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -27,11 +27,12 @@ from peft import LoraConfig, prepare_model_for_kbit_training from peft.tuners.lora.model import LoraModel from transformers import AutoModelForCausalLM, TrainingArguments -from transformers.utils.import_utils import _is_package_available from transformers.modeling_utils import is_fsdp_enabled +from transformers.utils.import_utils import _is_package_available import torch import torch.distributed + class AutoGPTQAccelerationPlugin(AccelerationPlugin): require_packages = [] @@ -50,16 +51,19 @@ def __init__(self, configurations: Dict[str, Dict], use_external_lib: bool = Fal self.use_external_lib = use_external_lib if self.use_external_lib: - assert _is_package_available("auto_gptq") is True, "Unable to use external library, autogptq module not found." + assert ( + _is_package_available("auto_gptq") is True + ), "Unable to use external library, autogptq module not found." 
def model_loader(self, model_name: str, **kwargs): # guarded imports # Third Party if self.use_external_lib: - from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error - AutoGPTQForCausalLM as GPTQModel, - BaseQuantizeConfig as QuantizeConfig, + # Third Party + from auto_gptq import ( + AutoGPTQForCausalLM as GPTQModel, # pylint: disable=import-outside-toplevel,import-error ) + from auto_gptq import BaseQuantizeConfig as QuantizeConfig from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) @@ -69,7 +73,6 @@ def model_loader(self, model_name: str, **kwargs): from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( QuantLinear, ) - # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel PATCH_FOR_FSDP_TRITON_V2, @@ -122,11 +125,10 @@ def model_loader(self, model_name: str, **kwargs): } else: kwargs = { - "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map + "low_cpu_mem_usage": low_cpu_mem_usage, # this is only used for device map "backend": Backend.TRITON, } - # this is a HF method that checks if the low_cpu_mem mode is enabled # via HF accelerate if is_fsdp_enabled(): @@ -156,7 +158,11 @@ def model_loader(self, model_name: str, **kwargs): kwargs["device_map"] = { "": ( - (torch.cuda.current_device() if not kwargs["low_cpu_mem_usage"] else "cpu") + ( + torch.cuda.current_device() + if not kwargs["low_cpu_mem_usage"] + else "cpu" + ) if torch.cuda.is_available() else None ) @@ -241,6 +247,7 @@ def augmentation( # guarded imports # Third Party if self.use_external_lib: + # Third Party from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) @@ -248,15 +255,16 @@ def augmentation( GPTQLoraModel, get_gptq_peft_model, ) + # Local from .autogptq_utils import ( # pylint: disable=import-outside-toplevel create_new_module_peft, replace_module_peft, ) else: + # Local from .gptqmodel.utils.peft import get_gptq_peft_model - (peft_config,) = modifiable_args # unpack modifiable args # some assertions @@ -298,8 +306,9 @@ def augmentation( _old_replace_module = GPTQLoraModel._replace_module _create_new_module = partial(create_new_module_peft, target_cls=QuantLinear) LoraModel._create_new_module = staticmethod(_create_new_module) - GPTQLoraModel._replace_module = MethodType(replace_module_peft, GPTQLoraModel) - + GPTQLoraModel._replace_module = MethodType( + replace_module_peft, GPTQLoraModel + ) # Install GPTQ adapters using the AutoGPTQ package (with the above patches) model = get_gptq_peft_model( @@ -313,7 +322,9 @@ def augmentation( if self.use_external_lib: # undo the patching for hygine LoraModel._create_new_module = staticmethod(_old_create_new_module) - GPTQLoraModel._replace_module = MethodType(_old_replace_module, GPTQLoraModel) + GPTQLoraModel._replace_module = MethodType( + _old_replace_module, GPTQLoraModel + ) return model, modifiable_args diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py index dd205fa8..d9eb61ed 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .models import GPTQModel from .quantization import BaseQuantizeConfig, QuantizeConfig -from .utils import Backend, get_backend \ No newline at end of file +from .utils import Backend, get_backend diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py index ac31ca08..967dd1a6 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .auto import MODEL_MAP, GPTQModel from .base import BaseGPTQModel from .dbrx import DbrxGPTQ diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py index 49484bee..ab8bf4dd 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/_const.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Third Party from torch import device CPU = device("cpu") @@ -31,5 +32,3 @@ EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 EXPERT_INDEX_PLACEHOLDER = "{expert_index}" - - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index d31c12e9..178d9c89 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -13,10 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from typing import Dict, List, Optional, Union +# Third Party import torch +# Local from ..utils import Backend from ..utils.model import check_and_get_model_type from .base import BaseGPTQModel, QuantizeConfig @@ -40,10 +43,15 @@ "dbrx_converted": DbrxConvertedGPTQ, } -at_least_one_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count())) +at_least_one_cuda_v6 = any( + torch.cuda.get_device_capability(i)[0] >= 6 + for i in range(torch.cuda.device_count()) +) if not at_least_one_cuda_v6: - raise EnvironmentError("GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`.") + raise EnvironmentError( + "GPTQModel requires at least one GPU device with CUDA compute capability >= `6.0`." 
+ ) class GPTQModel: @@ -63,7 +71,9 @@ def from_pretrained( trust_remote_code: bool = False, **model_init_kwargs, ) -> BaseGPTQModel: - model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code) + model_type = check_and_get_model_type( + pretrained_model_name_or_path, trust_remote_code + ) return MODEL_MAP[model_type].from_pretrained( pretrained_model_name_or_path=pretrained_model_name_or_path, quantize_config=quantize_config, @@ -110,4 +120,3 @@ def from_quantized( verify_hash=verify_hash, **kwargs, ) - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index e3ca3938..8c319c45 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -13,36 +13,64 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard +from os.path import isfile, join +from typing import Dict, List, Optional, Union import copy import json import logging import os import re -from os.path import isfile, join -from typing import Dict, List, Optional, Union -import accelerate -import torch -import torch.nn as nn -import transformers +# Third Party from accelerate.hooks import remove_hook_from_module from safetensors.torch import save_file as safe_save from tqdm import tqdm -from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + PretrainedConfig, + PreTrainedModel, +) from transformers.modeling_utils import no_init_weights, shard_checkpoint from transformers.utils.generic import ContextManagers +import accelerate +import torch +import torch.nn as nn +import transformers +# Local from ..quantization import GPTQ, QuantizeConfig -from ..quantization.config import (FORMAT, FORMAT_FIELD_JSON, META_FIELD_QUANTIZER, - META_QUANTIZER_GPTQMODEL, MIN_VERSION_WITH_V2, QUANTIZE_BLACK_LIST) +from ..quantization.config import ( + FORMAT, + FORMAT_FIELD_JSON, + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + MIN_VERSION_WITH_V2, + QUANTIZE_BLACK_LIST, +) from ..utils.backend import Backend from ..utils.data import collate_data from ..utils.importer import select_quant_linear -from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, convert_gptq_v2_to_v1_format, - find_layers, get_checkpoints, get_device, get_module_by_name_prefix, - get_module_by_name_suffix, get_moe_layer_modules, gptqmodel_post_init, make_quant, - move_to, nested_move_to, pack_model, simple_dispatch_model, verify_model_hash, - verify_sharded_model_hashes) +from ..utils.model import ( + auto_dtype_from_config, + convert_gptq_v1_to_v2_format, + convert_gptq_v2_to_v1_format, + find_layers, + get_checkpoints, + get_device, + get_module_by_name_prefix, + get_module_by_name_suffix, + get_moe_layer_modules, + gptqmodel_post_init, + make_quant, + move_to, + nested_move_to, + pack_model, + simple_dispatch_model, + verify_model_hash, + verify_sharded_model_hashes, +) from ._const import CPU, CUDA_0, SUPPORTED_MODELS logger = logging.getLogger(__name__) @@ -110,9 +138,9 @@ def hf_device_map(self): return getattr(self.model, "hf_device_map", None) def _prepare_dataset_for_quantization( - self, - calibration_dataset: List[Dict[str, Union[List[int], 
torch.LongTensor]]], - batch_size: int = 1, + self, + calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], + batch_size: int = 1, ): def _convert_tensor_to_list(tensor): if isinstance(tensor, torch.Tensor): @@ -146,10 +174,14 @@ def _convert_tensor_to_list(tensor): pad_token_id = self.config.eos_token_id if pad_token_id is None: - raise ValueError("Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`.") + raise ValueError( + "Calibration data requires model's `pad_token_id` or `eos_token_id` to be set: actual = `None`." + ) new_calibration_dataset_batched = [ - collate_data(new_calibration_dataset[start: start + batch_size], pad_token_id) + collate_data( + new_calibration_dataset[start : start + batch_size], pad_token_id + ) for start in range(0, len(new_calibration_dataset), batch_size) ] @@ -163,25 +195,26 @@ def quantize( self, calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], batch_size: int = 1, - # TODO: remove use_cuda_fp16 arg..why? doesn't pass smell test @ZX-ModelCloud use_cuda_fp16: bool = True, - autotune_warmup_after_quantized: bool = False, calibration_enable_gpu_cache: bool = True, ): if self.quantized: - raise EnvironmentError("quantize() is called a model that is already quantized") + raise EnvironmentError( + "quantize() is called a model that is already quantized" + ) if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST: raise ValueError( f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}" ) - # TODO: lm_head quantization is yet ready but pending if self.quantize_config.lm_head: - raise ValueError("lm_head quantization is currently inference only and not applicable for quantization. Please set `lm_head=False`.") + raise ValueError( + "lm_head quantization is currently inference only and not applicable for quantization. Please set `lm_head=False`." + ) if len(calibration_dataset) == 0: raise ValueError("Calibration dataset must not be empty.") @@ -190,8 +223,10 @@ def quantize( min_calibration_dataset_input_ids_avg_length = 256 if len(calibration_dataset) < min_calibration_dataset_size: - logger.warning(f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " - f"Current size: {len(calibration_dataset)}.") + logger.warning( + f"Calibration dataset size should be greater than {min_calibration_dataset_size}. " + f"Current size: {len(calibration_dataset)}." + ) # Calculate the average length of the average input_ids total_input_ids_length = 0 @@ -201,9 +236,10 @@ def quantize( avg = total_input_ids_length / len(calibration_dataset) if avg < min_calibration_dataset_input_ids_avg_length: - logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " - f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}.") - + logger.warning( + f"The average length of input_ids of calibration_dataset should be greater than " + f"{min_calibration_dataset_input_ids_avg_length}! Current AVG is {avg}." 
+ ) device_map = self.hf_device_map if device_map: @@ -220,7 +256,9 @@ def quantize( layer_input_kwargs = [] layer_outputs = [] - calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size) + calibration_dataset = self._prepare_dataset_for_quantization( + calibration_dataset, batch_size + ) forward_pass_use_cache = self.model.config.use_cache self.model.config.use_cache = False @@ -248,7 +286,7 @@ def store_input_hook(_, args, kwargs): if pos_ids is not None: position_ids.append(move_to(pos_ids, data_device)) one_kwargs = {} - for (k, v) in kwargs.items(): # make sure other arguments also be captured + for k, v in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states", "attention_mask", "position_ids"]: one_kwargs[k] = nested_move_to(v, data_device) layer_input_kwargs.append(one_kwargs) @@ -299,8 +337,9 @@ def store_input_hook(_, args, kwargs): # dynamic expert layer index for model defs if self.dynamic_expert_index is not None: num_experts = getattr(self.model.config, self.dynamic_expert_index) - layer_modules = get_moe_layer_modules(layer_modules=self.layer_modules, - num_experts=num_experts) + layer_modules = get_moe_layer_modules( + layer_modules=self.layer_modules, num_experts=num_experts + ) quantizers = {} @@ -347,11 +386,15 @@ def tmp(_, inp, out): layer_input.append(move_to(layer_inp, cur_layer_device)) mask = attention_masks[j] - layer_attention_mask = mask if mask is None else move_to(mask, cur_layer_device) + layer_attention_mask = ( + mask if mask is None else move_to(mask, cur_layer_device) + ) additional_layer_inputs = {"attention_mask": layer_attention_mask} layer_position_ids = ( - None if not position_ids else move_to(position_ids[j], cur_layer_device) + None + if not position_ids + else move_to(position_ids[j], cur_layer_device) ) if layer_position_ids is not None: additional_layer_inputs["position_ids"] = layer_position_ids @@ -362,7 +405,9 @@ def tmp(_, inp, out): h.remove() for name in subset: - layer_pb.set_description(f"Quantizing {name} in layer {i + 1} of {layer_count}") + layer_pb.set_description( + f"Quantizing {name} in layer {i + 1} of {layer_count}" + ) try: scale, zero, g_idx, duration, avg_loss = gptq[name].fasterquant( @@ -372,8 +417,12 @@ def tmp(_, inp, out): static_groups=self.quantize_config.static_groups, ) - stat = {"layer": i + 1, "module": name, "avg_loss": f"{avg_loss:.4f}", - "time": f"{duration:.4f}"} + stat = { + "layer": i + 1, + "module": name, + "avg_loss": f"{avg_loss:.4f}", + "time": f"{duration:.4f}", + } quant_log.append(stat) logger.info(stat) @@ -386,10 +435,18 @@ def tmp(_, inp, out): raise e quantizers[f"{self.layers_node}.{i}.{name}"] = ( - gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(scale, CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(zero, CPU if force_layer_back_to_cpu else cur_layer_device), - move_to(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device), + gptq[name].quantizer.to( + CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + scale, CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + zero, CPU if force_layer_back_to_cpu else cur_layer_device + ), + move_to( + g_idx, CPU if force_layer_back_to_cpu else cur_layer_device + ), ) gptq[name].free() @@ -399,10 +456,16 @@ def tmp(_, inp, out): layer_input.append(move_to(layer_inp, cur_layer_device)) mask = attention_masks[j] - layer_attention_mask = mask if mask is None else move_to(mask, 
cur_layer_device) + layer_attention_mask = ( + mask if mask is None else move_to(mask, cur_layer_device) + ) additional_layer_inputs = {"attention_mask": layer_attention_mask} - layer_position_ids = None if not position_ids else move_to(position_ids[j], cur_layer_device) + layer_position_ids = ( + None + if not position_ids + else move_to(position_ids[j], cur_layer_device) + ) if layer_position_ids is not None: additional_layer_inputs["position_ids"] = layer_position_ids for k, v in layer_input_kwargs[j].items(): @@ -413,7 +476,9 @@ def tmp(_, inp, out): ) layer_outputs.append([layer_output]) - layers[i] = move_to(layer, CPU if force_layer_back_to_cpu else cur_layer_device) + layers[i] = move_to( + layer, CPU if force_layer_back_to_cpu else cur_layer_device + ) del layer del gptq del layer_inputs @@ -480,7 +545,7 @@ def save_quantized( safetensors_metadata: Optional[Dict[str, str]] = None, use_safetensors: bool = True, max_shard_size: Optional[str] = None, - model_base_name: Optional[str] = None + model_base_name: Optional[str] = None, ): """save quantized model and configs to local disk""" os.makedirs(save_dir, exist_ok=True) @@ -498,12 +563,14 @@ def save_quantized( model = self.model if not self.quantized: - raise ValueError("Save aborted as model is not quantized. Please call `quantize()` first.") + raise ValueError( + "Save aborted as model is not quantized. Please call `quantize()` first." + ) if model_base_name is None: model_base_name = ( - self.quantize_config.model_file_base_name or - f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" + self.quantize_config.model_file_base_name + or f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g" ) if quantize_config.format == FORMAT.GPTQ_V2: @@ -520,7 +587,11 @@ def save_quantized( # no need to set it back, no calculation below if quantize_config.bits != 4: cuda_name_modules = {} - from gptqmodel.nn_modules.qlinear.qlinear_cuda import BaseCudaQuantLinear + # Third Party + from gptqmodel.nn_modules.qlinear.qlinear_cuda import ( + BaseCudaQuantLinear, + ) + for name, module in model.named_modules(): if isinstance(module, BaseCudaQuantLinear): cuda_name_modules[name] = module.gptqmodel_cuda @@ -528,14 +599,19 @@ def save_quantized( model = copy.deepcopy(self.model) for name, modules in model.named_modules(): - if isinstance(module, BaseCudaQuantLinear) and name in cuda_name_modules: + if ( + isinstance(module, BaseCudaQuantLinear) + and name in cuda_name_modules + ): module.gptqmodel_cuda = cuda_name_modules[name] del cuda_name_modules else: model = copy.deepcopy(self.model) model = convert_gptq_v2_to_v1_format( - model, quantize_config=quantize_config, qlinear_kernel=self.qlinear_kernel + model, + quantize_config=quantize_config, + qlinear_kernel=self.qlinear_kernel, ) model.to(CPU) @@ -557,7 +633,9 @@ def save_quantized( model_save_name = model_base_name + ".bin" if not self.qlinear_kernel.SUPPORTED_SHARDS and max_shard_size is not None: - logger.warning("Sharding is not supported for this quant. Disabling sharding.") + logger.warning( + "Sharding is not supported for this quant. Disabling sharding." 
+ ) max_shard_size = None if max_shard_size is None: @@ -567,7 +645,9 @@ def save_quantized( elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + logger.debug( + f"Received safetensors_metadata: {safetensors_metadata}" + ) new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -594,28 +674,35 @@ def save_quantized( # Format is required to enable Accelerate to load the metadata # otherwise it raises an OSError safetensors_metadata["format"] = "pt" - safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata) + safe_save( + state_dict, join(save_dir, model_save_name), safetensors_metadata + ) else: logger.warning( - "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible.") + "We highly suggest saving quantized model using safetensors format for security reasons. Please set `use_safetensors=True` whenever possible." + ) torch.save(model.state_dict(), join(save_dir, model_save_name)) else: # Shard checkpoint - shards, index = shard_checkpoint(state_dict, max_shard_size=max_shard_size, weights_name=model_save_name) + shards, index = shard_checkpoint( + state_dict, max_shard_size=max_shard_size, weights_name=model_save_name + ) # Clean the folder from a previous save for filename in os.listdir(save_dir): full_filename = join(save_dir, filename) # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 - filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "") + filename_no_suffix = filename.replace(".bin", "").replace( + ".safetensors", "" + ) reg = re.compile(r"(.*?)-\d{5}-of-\d{5}") if ( - filename.startswith(model_base_name) - and isfile(full_filename) - and filename not in shards.keys() - and reg.fullmatch(filename_no_suffix) is not None + filename.startswith(model_base_name) + and isfile(full_filename) + and filename not in shards.keys() + and reg.fullmatch(filename_no_suffix) is not None ): os.remove(full_filename) @@ -627,7 +714,9 @@ def save_quantized( elif not isinstance(safetensors_metadata, dict): raise TypeError("safetensors_metadata must be a dictionary.") else: - logger.debug(f"Received safetensors_metadata: {safetensors_metadata}") + logger.debug( + f"Received safetensors_metadata: {safetensors_metadata}" + ) new_safetensors_metadata = {} converted_keys = False for key, value in safetensors_metadata.items(): @@ -638,15 +727,18 @@ def save_quantized( new_value = str(value) except Exception as e: raise TypeError( - f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}") + f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}" + ) if new_key in new_safetensors_metadata: logger.warning( - f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.") + f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting." 
+ ) new_safetensors_metadata[new_key] = new_value safetensors_metadata = new_safetensors_metadata if converted_keys: logger.debug( - f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}") + f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}" + ) # Format is required to enable Accelerate to load the metadata # otherwise it raises an OSError @@ -675,7 +767,9 @@ def save_pretrained( save_dir: str, **kwargs, ): - logger.warning("You are using save_pretrained, which will re-direct to save_quantized.") + logger.warning( + "You are using save_pretrained, which will re-direct to save_quantized." + ) self.save_quantized(save_dir=save_dir, **kwargs) @classmethod @@ -691,7 +785,9 @@ def from_pretrained( """load un-quantized pretrained model to cpu""" if not torch.cuda.is_available(): - raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.") + raise EnvironmentError( + "Load pretrained model to do quantization requires CUDA available." + ) if cls.require_trust_remote_code and not trust_remote_code: raise ValueError( @@ -712,12 +808,16 @@ def skip(*args, **kwargs): model_init_kwargs["trust_remote_code"] = trust_remote_code - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, **model_init_kwargs + ) if torch_dtype == "auto": torch_dtype = auto_dtype_from_config(config) elif not isinstance(torch_dtype, torch.dtype): - raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + raise ValueError( + f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance." + ) # enforce some values despite user specified model_init_kwargs["torch_dtype"] = torch_dtype @@ -751,7 +851,9 @@ def skip(*args, **kwargs): torch.cuda.empty_cache() - model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs) + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, **model_init_kwargs + ) model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"] @@ -761,7 +863,9 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + logger.warning( + "can't get model's sequence length from model config, will set to 4096." + ) model.seqlen = 4096 model.eval() @@ -776,7 +880,6 @@ def from_quantized( device: Optional[Union[str, int]] = None, low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, - torch_dtype: [str | torch.dtype] = "auto", use_cuda_fp16: bool = True, quantize_config: Optional[QuantizeConfig] = None, @@ -829,7 +932,9 @@ def from_quantized( if torch_dtype == "auto": torch_dtype = auto_dtype_from_config(config, quant_inference=True) elif not isinstance(torch_dtype, torch.dtype): - raise ValueError(f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance.") + raise ValueError( + f"torch_dtype value of `{torch_dtype}` is not a torch.dtype instance." 
+ ) if config.model_type not in SUPPORTED_MODELS: raise TypeError(f"{config.model_type} isn't supported yet.") @@ -840,8 +945,9 @@ def from_quantized( ) else: if not isinstance(quantize_config, QuantizeConfig): - quantize_config = QuantizeConfig.from_quant_config(quantize_config, format) - + quantize_config = QuantizeConfig.from_quant_config( + quantize_config, format + ) if model_basename is None: if quantize_config.model_file_base_name: @@ -894,12 +1000,15 @@ def from_quantized( if not verfieid: raise ValueError(f"Hash verification failed for {model_save_name}") logger.info(f"Hash verification succeeded for {model_save_name}") + # == step2: convert model to gptq-model (replace Linear with QuantLinear) == # def skip(*args, **kwargs): pass if torch_dtype != torch.float16: - logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.") + logger.warning( + "Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16." + ) use_cuda_fp16 = False torch.nn.init.kaiming_uniform_ = skip @@ -919,8 +1028,9 @@ def skip(*args, **kwargs): if cls.dynamic_expert_index is not None: num_experts = getattr(config, cls.dynamic_expert_index) - cls.layer_modules = get_moe_layer_modules(layer_modules=cls.layer_modules, - num_experts=num_experts) + cls.layer_modules = get_moe_layer_modules( + layer_modules=cls.layer_modules, num_experts=num_experts + ) layers = find_layers(model) ignore_layers = [cls.lm_head] + cls.base_modules @@ -930,8 +1040,12 @@ def skip(*args, **kwargs): if quantize_config.lm_head and name == cls.lm_head: continue - if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all( - not name.endswith(ignore_layer) for sublist in cls.layer_modules for ignore_layer in sublist + if any( + name.startswith(ignore_layer) for ignore_layer in ignore_layers + ) or all( + not name.endswith(ignore_layer) + for sublist in cls.layer_modules + for ignore_layer in sublist ): # log non-lm-head quantizerd layers only if name is not cls.lm_head: @@ -969,7 +1083,9 @@ def skip(*args, **kwargs): if device is not None: device = torch.device(device) if not max_memory and not device_map: - device_map = {"": device.index if device.type == "cuda" else device.type} + device_map = { + "": device.index if device.type == "cuda" else device.type + } if not isinstance(device_map, dict) and device_map != "sequential": max_memory = accelerate.utils.get_balanced_memory( model=model, @@ -1002,13 +1118,17 @@ def skip(*args, **kwargs): offload_buffers=True, ) # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase - if not quantize_config.sym and not quantize_config.is_quantized_or_packed_by_v2(): + if ( + not quantize_config.sym + and not quantize_config.is_quantized_or_packed_by_v2() + ): raise ValueError( f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" ) logger.info( - f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`.") + f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`." + ) model = convert_gptq_v1_to_v2_format( model, quantize_config=quantize_config, @@ -1047,7 +1167,9 @@ def skip(*args, **kwargs): model.seqlen = model_config[key] break else: - logger.warning("can't get model's sequence length from model config, will set to 4096.") + logger.warning( + "can't get model's sequence length from model config, will set to 4096." 
+ ) model.seqlen = 4096 # Any post-initialization that require device information, for example buffers initialization on device. @@ -1057,6 +1179,7 @@ def skip(*args, **kwargs): # == step6: (optional) warmup triton == # if backend != Backend.TRITON and warmup_triton: + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear QuantLinear.warmup(model, seqlen=model.seqlen) @@ -1072,6 +1195,7 @@ def warmup_triton(self, enabled: bool = True): if not enabled: return + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear QuantLinear.warmup(self.model, seqlen=self.model.seqlen) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py index fb758032..dc899be2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx.py @@ -13,10 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel # placer=holder only as dbrx original models are not supported # supported dbrx_converted models can be found on https://hf.co/ModelCloud class DbrxGPTQ(BaseGPTQModel): - info = {"notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`"} + info = { + "notes": "Dbrx is only supported using defused/converted models on https://hf.co/ModelCloud with `trust_remote_code=True`" + } diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py index 35a33170..9878d082 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/dbrx_converted.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .base import BaseGPTQModel @@ -25,25 +26,45 @@ class DbrxConvertedGPTQ(BaseGPTQModel): layers_node = "transformer.blocks" layer_type = "DbrxBlock" layer_modules = [ - ["norm_attn_norm.attn.q_proj", "norm_attn_norm.attn.k_proj", "norm_attn_norm.attn.v_proj"], + [ + "norm_attn_norm.attn.q_proj", + "norm_attn_norm.attn.k_proj", + "norm_attn_norm.attn.v_proj", + ], ["norm_attn_norm.attn.out_proj"], [ - "ffn.experts.mlp.0.w1", "ffn.experts.mlp.0.v1", - "ffn.experts.mlp.1.w1", "ffn.experts.mlp.1.v1", - "ffn.experts.mlp.2.w1", "ffn.experts.mlp.2.v1", - "ffn.experts.mlp.3.w1", "ffn.experts.mlp.3.v1", - "ffn.experts.mlp.4.w1", "ffn.experts.mlp.4.v1", - "ffn.experts.mlp.5.w1", "ffn.experts.mlp.5.v1", - "ffn.experts.mlp.6.w1", "ffn.experts.mlp.6.v1", - "ffn.experts.mlp.7.w1", "ffn.experts.mlp.7.v1", - "ffn.experts.mlp.8.w1", "ffn.experts.mlp.8.v1", - "ffn.experts.mlp.9.w1", "ffn.experts.mlp.9.v1", - "ffn.experts.mlp.10.w1", "ffn.experts.mlp.10.v1", - "ffn.experts.mlp.11.w1", "ffn.experts.mlp.11.v1", - "ffn.experts.mlp.12.w1", "ffn.experts.mlp.12.v1", - "ffn.experts.mlp.13.w1", "ffn.experts.mlp.13.v1", - "ffn.experts.mlp.14.w1", "ffn.experts.mlp.14.v1", - "ffn.experts.mlp.15.w1", "ffn.experts.mlp.15.v1", + "ffn.experts.mlp.0.w1", + "ffn.experts.mlp.0.v1", + "ffn.experts.mlp.1.w1", + "ffn.experts.mlp.1.v1", + "ffn.experts.mlp.2.w1", + "ffn.experts.mlp.2.v1", + "ffn.experts.mlp.3.w1", + "ffn.experts.mlp.3.v1", + "ffn.experts.mlp.4.w1", + "ffn.experts.mlp.4.v1", + "ffn.experts.mlp.5.w1", + "ffn.experts.mlp.5.v1", + "ffn.experts.mlp.6.w1", + "ffn.experts.mlp.6.v1", + "ffn.experts.mlp.7.w1", + "ffn.experts.mlp.7.v1", + "ffn.experts.mlp.8.w1", + "ffn.experts.mlp.8.v1", + "ffn.experts.mlp.9.w1", + "ffn.experts.mlp.9.v1", + "ffn.experts.mlp.10.w1", + "ffn.experts.mlp.10.v1", + "ffn.experts.mlp.11.w1", + "ffn.experts.mlp.11.v1", + "ffn.experts.mlp.12.w1", + "ffn.experts.mlp.12.v1", + "ffn.experts.mlp.13.w1", + "ffn.experts.mlp.13.v1", + "ffn.experts.mlp.14.w1", + "ffn.experts.mlp.14.v1", + "ffn.experts.mlp.15.w1", + "ffn.experts.mlp.15.v1", ], [ "ffn.experts.mlp.0.w2", @@ -62,5 +83,5 @@ class DbrxConvertedGPTQ(BaseGPTQModel): "ffn.experts.mlp.13.w2", "ffn.experts.mlp.14.w2", "ffn.experts.mlp.15.w2", - ] + ], ] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py index e6191904..6bd25f9e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gemma.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py index 0a15062e..d9c1bf79 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_bigcode.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py index 81e8e401..16e806fd 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/gpt_neox.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel @@ -28,4 +29,3 @@ class GPTNeoXGPTQ(BaseGPTQModel): ["mlp.dense_h_to_4h"], ["mlp.dense_4h_to_h"], ] - diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py index 19930ef6..69aaca23 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/llama.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py index aa1748eb..e6ff5782 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mistral.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py index acb4f640..12862755 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/mixtral.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .base import BaseGPTQModel diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py index feb7a9e6..d5a1b04c 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-############################################################################### \ No newline at end of file +############################################################################### diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py index ff590d3c..93d356fa 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/__init__.py @@ -1,3 +1,4 @@ +# Third Party import torch.nn as nn @@ -12,7 +13,14 @@ class BaseQuantLinear(nn.Module): SUPPORTED_SHARDS: bool = True @classmethod - def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, raise_error: bool = True) -> bool: + def validate( + cls, + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + raise_error: bool = True, + ) -> bool: validate = True err = "" if cls.SUPPORTED_BITS and bits not in cls.SUPPORTED_BITS: diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py index a0ab76c5..0b88da46 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py @@ -13,14 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -import math +# Standard from logging import getLogger +import math +# Third Party import numpy as np import torch import torch.nn as nn import transformers +# Local from ..triton_utils.dequant import QuantLinearFunction, quant_matmul_248 from ..triton_utils.mixin import TritonModuleMixin from . import BaseQuantLinear @@ -39,12 +42,22 @@ class QuantLinear(BaseQuantLinear, TritonModuleMixin): QUANT_TYPE = "tritonv2" - def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): + def __init__( + self, + bits, + group_size, + infeatures, + outfeatures, + bias, + **kwargs, + ): super().__init__() if bits not in [2, 4, 8]: raise NotImplementedError("Only 2,4,8 bits are supported.") if infeatures % 32 != 0 or outfeatures % 32 != 0: - raise NotImplementedError("in_feature and out_feature must be divisible by 32.") + raise NotImplementedError( + "in_feature and out_feature must be divisible by 32." 
+ ) self.infeatures = infeatures self.outfeatures = outfeatures self.bits = bits @@ -74,10 +87,14 @@ def __init__(self, bits, group_size, infeatures, outfeatures, bias, **kwargs,): ) self.register_buffer( "g_idx", - torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32), + torch.tensor( + [i // self.group_size for i in range(infeatures)], dtype=torch.int32 + ), ) if bias: - self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) + self.register_buffer( + "bias", torch.zeros((outfeatures), dtype=torch.float16) + ) else: self.bias = None @@ -103,9 +120,10 @@ def pack(self, linear, scales, zeros, g_idx=None): intweight = [] for idx in range(self.infeatures): intweight.append( - torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[ - :, None - ] + torch.round( + (W[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] ) intweight = torch.cat(intweight, dim=1) intweight = intweight.t().contiguous() @@ -113,7 +131,9 @@ def pack(self, linear, scales, zeros, g_idx=None): i = 0 row = 0 - qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) while row < qweight.shape[0]: if self.bits in [2, 4, 8]: for j in range(i, i + (32 // self.bits)): @@ -127,7 +147,9 @@ def pack(self, linear, scales, zeros, g_idx=None): self.qweight = torch.from_numpy(qweight) zeros = zeros.numpy().astype(np.uint32) - qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) i = 0 col = 0 while col < qzeros.shape[1]: @@ -164,6 +186,7 @@ def warmup(cls, model, transpose=False, seqlen=2048): """ Pre-tunes the quantized kernel """ + # Third Party from tqdm import tqdm kn_values = {} diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py index fed33846..d4e20e5f 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -13,11 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard +from typing import Dict import builtins import math import time -from typing import Dict +# Third Party import triton # code based https://github.com/fpgaminer/GPTQ-triton @@ -95,7 +97,9 @@ def kernel_call(): try: # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default - return triton.testing.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40) + return triton.testing.do_bench( + kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40 + ) except triton.OutOfResources: return (float("inf"), float("inf"), float("inf")) @@ -113,7 +117,10 @@ def run(self, *args, **kwargs): # prune configs pruned_configs = self.prune_configs(kwargs) bench_start = time.time() - timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs} + timings = { + config: self._bench(*args, config=config, **kwargs) + for config in pruned_configs + } bench_end = time.time() self.bench_time = bench_end - bench_start self.cache[key] = builtins.min(timings, key=timings.get) @@ -152,7 +159,9 @@ def prune_configs(self, kwargs): ) for config in pruned_configs } - pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[ + :top_k + ] return pruned_configs def warmup(self, *args, **kwargs): @@ -168,7 +177,9 @@ def warmup(self, *args, **kwargs): self.nargs = None -def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False): +def autotune( + configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False +): def decorator(fn): return CustomizedTritonAutoTuner( fn, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py index 05091699..fc601bd6 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/dequant.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard import itertools +# Third Party +from torch.cuda.amp import custom_bwd, custom_fwd import torch import triton import triton.language as tl -from torch.cuda.amp import custom_bwd, custom_fwd def make_dequant_configs(block_sizes, num_warps): @@ -71,7 +73,9 @@ def dequant_kernel_248( tl.device_assert(g_idx >= 0, "index out of bounds: 0 <= tmp0 < 0") groups = tl.where(tmp2, tmp1, g_idx) # tmp3 are g_idx - scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float32) + scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to( + tl.float32 + ) # Unpack weights weights = qweights >> wf_weights # bit shift qweight @@ -125,7 +129,9 @@ def dequant248(qweight, scales, qzeros, g_idx, bits, maxq=None): return out -def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False): +def quant_matmul_248( + input, qweight, scales, qzeros, g_idx, bits, maxq=None, transpose=False +): W = dequant248(qweight, scales, qzeros, g_idx, bits, maxq=maxq) if transpose: return input @ W.t() @@ -149,5 +155,7 @@ def backward(ctx, grad_output): grad_input = None if ctx.needs_input_grad[0]: - grad_input = quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True) + grad_input = quant_matmul_248( + grad_output, qweight, scales, qzeros, g_idx, bits, maxq, transpose=True + ) return grad_input, None, None, None, None, None, None diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py index 541c22dc..a542d3fa 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/nn_modules/triton_utils/kernels.py @@ -13,13 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from logging import getLogger +# Third Party +from torch.cuda.amp import custom_bwd, custom_fwd import torch import triton import triton.language as tl -from torch.cuda.amp import custom_bwd, custom_fwd +# Local from . 
import custom_autotune logger = getLogger(__name__) @@ -150,11 +153,14 @@ def quant_matmul_248_kernel( offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_K) a_mask = offs_am[:, None] < M # b_ptrs is set up such that it repeats elements along the K axis 8 times b_ptrs = b_ptr + ( - (offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn + (offs_k[:, None] // infearure_per_bits) * stride_bk + + offs_bn[None, :] * stride_bn ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) g_ptrs = g_ptr + offs_k # shifter is used to extract the N bits of each element in the 32-bit word from B @@ -169,8 +175,12 @@ def quant_matmul_248_kernel( g_idx = tl.load(g_ptrs) # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop - scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) - zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + scales = tl.load( + scales_ptrs + g_idx[:, None] * stride_scales + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load( + zeros_ptrs + g_idx[:, None] * stride_zeros + ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) zeros = (zeros >> zeros_shifter[None, :]) & maxq @@ -308,18 +318,25 @@ def transpose_quant_matmul_248_kernel( offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offs_n = tl.arange(0, BLOCK_SIZE_N) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + a_ptrs = a_ptr + ( + offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak + ) # (BLOCK_SIZE_M, BLOCK_SIZE_N) a_mask = offs_am[:, None] < M # b_ptrs is set up such that it repeats elements along the K axis 8 times b_ptrs = b_ptr + ( - (offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn + (offs_bk[:, None] // infearure_per_bits) * stride_bk + + offs_n[None, :] * stride_bn ) # (BLOCK_SIZE_K, BLOCK_SIZE_N) g_ptrs = g_ptr + offs_bk g_idx = tl.load(g_ptrs) # shifter is used to extract the N bits of each element in the 32-bit word from B scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales - zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros + zeros_ptrs = ( + zeros_ptr + + (offs_n[None, :] // infearure_per_bits) + + g_idx[:, None] * stride_zeros + ) shifter = (offs_bk % infearure_per_bits) * bits zeros_shifter = (offs_n % infearure_per_bits) * bits @@ -358,9 +375,12 @@ def silu(x): def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): - output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype) + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), ) quant_matmul_248_kernel[grid]( input, @@ -389,9 +409,12 @@ def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): def 
transpose_quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): output_dim = (qweight.shape[0] * 32) // bits - output = torch.empty((input.shape[0], output_dim), device=input.device, dtype=input.dtype) + output = torch.empty( + (input.shape[0], output_dim), device=input.device, dtype=input.dtype + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(output_dim, META["BLOCK_SIZE_K"]), ) transpose_quant_matmul_248_kernel[grid]( input, @@ -434,15 +457,20 @@ def backward(ctx, grad_output): grad_input = None if ctx.needs_input_grad[0]: - grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq) + grad_input = transpose_quant_matmul_248( + grad_output, qweight, scales, qzeros, g_idx, bits, maxq + ) return grad_input, None, None, None, None, None, None def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq): with torch.cuda.device(input.device): - output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16) + output = torch.empty( + (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16 + ) grid = lambda META: ( # noqa: E731 - triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), + triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"]) + * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]), ) quant_matmul_248_kernel[grid]( input, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py index 377ffb63..dd76bfc3 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/__init__.py @@ -13,7 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +# Local +from .config import ( + FORMAT, + FORMAT_FIELD_CODE, + FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, + QUANT_METHOD, + QUANT_METHOD_FIELD, + BaseQuantizeConfig, + QuantizeConfig, +) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py index a99020fa..e45c371a 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/config.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### -import json -import logging +# Standard from dataclasses import dataclass, field, fields from os.path import isdir, join from typing import Any, Dict, Optional, Tuple +import json +import logging +# Third Party from packaging import version from transformers.utils.hub import cached_file @@ -34,7 +36,11 @@ FORMAT_FIELD_JSON = "checkpoint_format" QUANT_METHOD_FIELD = "quant_method" QUANT_CONFIG_FILENAME = "quantize_config.json" -QUANT_CONFIG_FILENAME_COMPAT = [QUANT_CONFIG_FILENAME, "quant_config.json", "config.json"] +QUANT_CONFIG_FILENAME_COMPAT = [ + QUANT_CONFIG_FILENAME, + "quant_config.json", + "config.json", +] MIN_VERSION_WITH_V2 = "0.9.0" @@ -81,7 +87,7 @@ class QUANT_METHOD: @dataclass -class QuantizeConfig(): +class QuantizeConfig: bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) group_size: int = field(default=-1) damp_percent: float = field(default=0.01) @@ -117,7 +123,9 @@ def __post_init__(self): ) if self.bits not in fields_info[0].metadata["choices"]: - raise ValueError(f"only support quantize to {fields_info[0].metadata['choices']} bits.") + raise ValueError( + f"only support quantize to {fields_info[0].metadata['choices']} bits." + ) if self.group_size != -1 and self.group_size <= 0: raise ValueError("unless equal to -1, group_size must greater then 0.") @@ -157,14 +165,16 @@ def meta_get_versionable(self, key: str) -> Tuple[str, str]: def is_quantized_or_packed_by_v2(self) -> bool: # check meta.quantizer producer, _version = self.meta_get_versionable(META_FIELD_QUANTIZER) - by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and (version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2)) + by_v2 = (producer == META_QUANTIZER_GPTQMODEL) and ( + version.parse(_version) >= version.parse(MIN_VERSION_WITH_V2) + ) # fallback to meta.packer if not by_v2: producer, _version = self.meta_get_versionable(META_FIELD_PACKER) - by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse(_version) >= version.parse( - MIN_VERSION_WITH_V2 - ) + by_v2 = producer == META_QUANTIZER_GPTQMODEL and version.parse( + _version + ) >= version.parse(MIN_VERSION_WITH_V2) return by_v2 @@ -182,7 +192,9 @@ def from_quant_config(cls, quantize_cfg, format: str = None): if format not in valid_formats: raise ValueError(f"Unknown quantization checkpoint format: {format}.") if quantize_cfg.get(FORMAT_FIELD_JSON): - raise ValueError("Conflict: quantization format is passed in and also exists in model config.") + raise ValueError( + "Conflict: quantization format is passed in and also exists in model config." + ) # compat: warn if checkpoint_format is missing elif quantize_cfg.get(FORMAT_FIELD_JSON) is None: format_auto_inferred = True @@ -198,7 +210,10 @@ def from_quant_config(cls, quantize_cfg, format: str = None): key = key.lower() # remap keys according to compat map - if key in QUANT_CONFIG_ARG_SYNONYMS and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names: + if ( + key in QUANT_CONFIG_ARG_SYNONYMS + and QUANT_CONFIG_ARG_SYNONYMS[key] in field_names + ): key = QUANT_CONFIG_ARG_SYNONYMS[key] if key == FORMAT_FIELD_JSON: @@ -218,10 +233,14 @@ def from_quant_config(cls, quantize_cfg, format: str = None): elif key in field_names: normalized[key] = val else: - logger.info(f"Ignoring unknown parameter in the quantization configuration: {key}.") + logger.info( + f"Ignoring unknown parameter in the quantization configuration: {key}." 
+ ) if format_auto_inferred: - logger.info(f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}") + logger.info( + f"`{FORMAT_FIELD_JSON}` is missing from the quantization configuration and is automatically inferred to {normalized[FORMAT_FIELD_CODE]}" + ) if "sym" not in normalized: logger.warning( @@ -301,9 +320,12 @@ def to_dict(self): META_FIELD: self.meta, } + # deprecated: will be removed in future update @dataclass class BaseQuantizeConfig(QuantizeConfig): def __init__(self, **kwargs): super().__init__(**kwargs) - logging.warning("BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead.") + logging.warning( + "BaseQuantizeConfig is re-named and pending deprecation. Please use `QuantizeConfig` instead." + ) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py index c3f2e4b3..470ed3fb 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/gptq.py @@ -1,15 +1,18 @@ # License: GPTQModel/licenses/LICENSE.mit # adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on [gptq](https://github.com/IST-DASLab/gptq) +# Standard +from logging import getLogger import math import os import time -from logging import getLogger +# Third Party import torch import torch.nn as nn import transformers +# Local from .quantizer import Quantizer logger = getLogger(__name__) @@ -40,7 +43,9 @@ def add_batch(self, inp, out): if len(inp.shape) == 2: inp = inp.unsqueeze(0) tmp = inp.shape[0] - if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if isinstance(self.layer, nn.Linear) or isinstance( + self.layer, transformers.Conv1D + ): if len(inp.shape) == 3: inp = inp.reshape((-1, inp.shape[-1])) inp = inp.t() @@ -93,6 +98,7 @@ def fasterquant( now_idx = 1 if static_groups: + # Standard import copy groups = [] @@ -137,7 +143,9 @@ def fasterquant( if group_size != -1: if not static_groups: if (i1 + i) % group_size == 0: - self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True) + self.quantizer.find_params( + W[:, (i1 + i) : (i1 + i + group_size)], weight=True + ) if ((i1 + i) // group_size) - now_idx == -1: scale.append(self.quantizer.scale) @@ -185,7 +193,9 @@ def fasterquant( if isinstance(self.layer, transformers.Conv1D): Q = Q.t() - self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data) + self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as( + self.layer.weight.data + ) if os.environ.get("DEBUG"): logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py index 3ca205d1..f5eb9e00 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/quantization/quantizer.py @@ -1,8 +1,10 @@ # License: GPTQModel/licenses/LICENSE.mit # adapted from @qwopqwop200 's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which itself is based on 
[gptq](https://github.com/IST-DASLab/gptq) +# Standard from logging import getLogger +# Third Party import torch import torch.nn as nn diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py index 668d0859..869f6327 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/__init__.py @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Local from .backend import Backend, get_backend diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py index 43dfffea..86362acc 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/backend.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from enum import Enum @@ -20,6 +21,7 @@ class Backend(Enum): AUTO = 0 # choose the fastest one based on quant model compatibility TRITON = 3 + def get_backend(backend: str): try: return Backend[backend] diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py index 19e61779..b594886e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/data.py @@ -13,16 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### -import copy -import random +# Standard from functools import partial from typing import Callable, Dict, List, Optional +import copy +import random -import torch +# Third Party from datasets import DatasetDict, IterableDatasetDict, load_dataset from torch import LongTensor from torch.utils.data import DataLoader from transformers import PreTrainedTokenizer +import torch def make_data_block( @@ -69,7 +71,9 @@ def make_data_block( # filter tokenized samples by length dropped_indices = [] - for idx, (tokenized_prompt, tokenized_label) in enumerate(zip(tokenized_prompts, tokenized_labels)): + for idx, (tokenized_prompt, tokenized_label) in enumerate( + zip(tokenized_prompts, tokenized_labels) + ): if add_eos_token: tokenized_label += [tokenizer.eos_token_id] len_prompt = len(tokenized_prompt) @@ -87,7 +91,11 @@ def make_data_block( # make data blocks of samples tokenized_samples = sorted( - [(p, l) for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) if idx not in dropped_indices], + [ + (p, l) + for idx, (p, l) in enumerate(zip(tokenized_prompts, tokenized_labels)) + if idx not in dropped_indices + ], key=lambda x: (len(x[0]) + len(x[1])) if merge_prompt_label else len(x[0]), ) sample_blocks = [] @@ -103,7 +111,9 @@ def make_data_block( additional_len = blk_max_len sample_len = blk_max_len else: - additional_len = len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + additional_len = ( + len(sample_block) * (ori_sample_len - blk_max_len) + ori_sample_len + ) sample_len = ori_sample_len if blk_total_len + additional_len > block_max_len: @@ -139,11 +149,19 @@ def make_data_block( sample_len += len(tokenized_label) pad_num = blk_max_len - sample_len if merge_prompt_label: - input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt + tokenized_label) - label_ids.append([-100] * (pad_num + len(tokenized_prompt)) + tokenized_label) + input_ids.append( + [tokenizer.pad_token_id] * pad_num + + tokenized_prompt + + tokenized_label + ) + label_ids.append( + [-100] * (pad_num + len(tokenized_prompt)) + tokenized_label + ) else: input_ids.append([tokenizer.pad_token_id] * pad_num + tokenized_prompt) - label_ids.append([-100] * (label_max_len - len(tokenized_label)) + tokenized_label) + label_ids.append( + [-100] * (label_max_len - len(tokenized_label)) + tokenized_label + ) attention_mask.append([0] * pad_num + [1] * sample_len) new_samples["input_ids"].append(input_ids) @@ -153,7 +171,9 @@ def make_data_block( return new_samples -def collate_data(blocks: List[Dict[str, List[List[int]]]], pad_token_id: int) -> Dict[str, LongTensor]: +def collate_data( + blocks: List[Dict[str, List[List[int]]]], pad_token_id: int +) -> Dict[str, LongTensor]: def pad_block(block, pads): return torch.cat((block, pads.to(block.device)), dim=-1) @@ -170,11 +190,17 @@ def pad_block(block, pads): block_label_len = label_blocks[i].shape[-1] pad_num = inp_max_len - block_inp_len if pad_num > 0: - input_ids_blocks[i] = pad_block(input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id) - attention_mask_blocks[i] = pad_block(attention_mask_blocks[i], torch.zeros((block_bsz, pad_num))) + input_ids_blocks[i] = pad_block( + input_ids_blocks[i], torch.ones((block_bsz, pad_num)) * pad_token_id + ) + attention_mask_blocks[i] = pad_block( + attention_mask_blocks[i], torch.zeros((block_bsz, pad_num)) + ) label_pad_num = label_max_len - block_label_len if label_pad_num > 0: - label_blocks[i] = 
pad_block(label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100) + label_blocks[i] = pad_block( + label_blocks[i], torch.ones((block_bsz, label_pad_num)) * -100 + ) return { "input_ids": torch.cat(input_ids_blocks, dim=0).long(), diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py index 40503569..98e4f239 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/importer.py @@ -13,16 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### +# Standard from collections import OrderedDict from logging import getLogger +# Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear from ..quantization import FORMAT from .backend import Backend -backend_dict = OrderedDict({ - Backend.TRITON: TritonV2QuantLinear, -}) +backend_dict = OrderedDict( + { + Backend.TRITON: TritonV2QuantLinear, + } +) format_dict = { FORMAT.GPTQ: [Backend.TRITON], @@ -32,15 +36,16 @@ logger = getLogger(__name__) + # auto select the correct/optimal QuantLinear class def select_quant_linear( - bits: int, - group_size: int, - desc_act: bool, - sym: bool, - backend: Backend, - format: FORMAT, - pack: bool = False, + bits: int, + group_size: int, + desc_act: bool, + sym: bool, + backend: Backend, + format: FORMAT, + pack: bool = False, ): # Handle the case where backend is AUTO. if backend == Backend.AUTO: @@ -50,12 +55,15 @@ def select_quant_linear( validate = v.validate(bits, group_size, desc_act, sym, raise_error=False) check_pack_func = hasattr(v, "pack") if pack else True if in_allow_backends and validate and check_pack_func: - logger.info(f"Auto choose the fastest one based on quant model compatibility: {v}") + logger.info( + f"Auto choose the fastest one based on quant model compatibility: {v}" + ) return v # Handle the case where backend is not AUTO. if backend == Backend.TRITON: logger.info("Using tritonv2 for GPTQ") + # Local from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear else: raise NotImplementedError("Invalid Backend") diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index 23ef3821..e98103e8 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -13,24 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################### +# Standard +from logging import getLogger +from typing import List, Optional import functools import hashlib import json import logging import os -from logging import getLogger -from typing import List, Optional +# Third Party +from tqdm import tqdm +from transformers import AutoConfig, PretrainedConfig +from transformers.utils.hub import cached_file import accelerate import threadpoolctl as tctl import torch import torch.nn as nn import transformers -from tqdm import tqdm -from transformers import AutoConfig, PretrainedConfig -from transformers.utils.hub import cached_file -from ..models._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS +# Local +from ..models._const import ( + CPU, + CUDA_0, + EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, + EXPERT_INDEX_PLACEHOLDER, + SUPPORTED_MODELS, +) from ..nn_modules.qlinear import BaseQuantLinear from ..quantization import FORMAT, QuantizeConfig from .backend import Backend @@ -99,7 +108,11 @@ def find_layers(module, layers=None, name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update( + find_layers( + child, layers=layers, name=name + "." + name1 if name != "" else name1 + ) + ) return res @@ -127,7 +140,9 @@ def make_quant( use_cuda_fp16: bool = True, pack: bool = False, ) -> BaseQuantLinear: - select_quant_linear_func = select_quant_linear_with_pack if pack else select_quant_linear + select_quant_linear_func = ( + select_quant_linear_with_pack if pack else select_quant_linear + ) QuantLinear = select_quant_linear_func( bits=bits, group_size=group_size, @@ -186,6 +201,7 @@ def make_quant( return QuantLinear + def convert_gptq_v1_to_v2_format( model, quantize_config: QuantizeConfig, @@ -202,15 +218,15 @@ def convert_gptq_v1_to_v2_format( if quantize_config.bits == 2: submodule.qzeros.data += 0b01010101010101010101010101010101 elif quantize_config.bits == 3: - submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] += ( - 0b00100100100100100100100100100100 - ) - submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] += ( - 0b10010010010010010010010010010010 - ) - submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] += ( - 0b01001001001001001001001001001001 - ) + submodule.qzeros.data[ + :, range(0, submodule.qzeros.data.shape[1], 3) + ] += 0b00100100100100100100100100100100 + submodule.qzeros.data[ + :, range(1, submodule.qzeros.data.shape[1], 3) + ] += 0b10010010010010010010010010010010 + submodule.qzeros.data[ + :, range(2, submodule.qzeros.data.shape[1], 3) + ] += 0b01001001001001001001001001001001 elif quantize_config.bits == 4: submodule.qzeros.data += 0b00010001000100010001000100010001 elif quantize_config.bits == 8: @@ -234,15 +250,15 @@ def convert_gptq_v2_to_v1_format( if quantize_config.bits == 2: submodule.qzeros.data -= 0b01010101010101010101010101010101 elif quantize_config.bits == 3: - submodule.qzeros.data[:, range(0, submodule.qzeros.data.shape[1], 3)] -= ( - 0b00100100100100100100100100100100 - ) - submodule.qzeros.data[:, range(1, submodule.qzeros.data.shape[1], 3)] -= ( - 0b10010010010010010010010010010010 - ) - submodule.qzeros.data[:, range(2, submodule.qzeros.data.shape[1], 3)] -= ( - 0b01001001001001001001001001001001 - ) + submodule.qzeros.data[ + :, range(0, submodule.qzeros.data.shape[1], 3) + ] -= 
0b00100100100100100100100100100100 + submodule.qzeros.data[ + :, range(1, submodule.qzeros.data.shape[1], 3) + ] -= 0b10010010010010010010010010010010 + submodule.qzeros.data[ + :, range(2, submodule.qzeros.data.shape[1], 3) + ] -= 0b01001001001001001001001001001001 elif quantize_config.bits == 4: submodule.qzeros.data -= 0b00010001000100010001000100010001 elif quantize_config.bits == 8: @@ -252,11 +268,16 @@ def convert_gptq_v2_to_v1_format( return model -def select_quant_linear_with_pack(bits: int, + +def select_quant_linear_with_pack( + bits: int, group_size: int, desc_act: bool, sym: bool, - backend: Backend, format: str, pack: bool): + backend: Backend, + format: str, + pack: bool, +): QuantLinear = select_quant_linear( bits=bits, group_size=group_size, @@ -268,6 +289,7 @@ def select_quant_linear_with_pack(bits: int, ) return QuantLinear + def pack_model( model, quantizers, @@ -341,12 +363,13 @@ def pack_model( QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen) return QuantLinear + def verify_model_hash(file_path: str, verify_hash: str): if not isinstance(verify_hash, str): raise ValueError("model verify_hash must be a string") - if ':' not in verify_hash: + if ":" not in verify_hash: raise ValueError("verify_hash must be in the format 'hash_type:hash_value'") - hash_type, hash_value = verify_hash.split(':', 1) + hash_type, hash_value = verify_hash.split(":", 1) hash_func = getattr(hashlib, hash_type, None) if not hash_func: raise ValueError(f"No hash function found for type: {hash_type}") @@ -359,9 +382,9 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): if not isinstance(verify_hash, list): raise ValueError("sharded model verify_hash must be a list") - with open(jsonPath, 'r') as f: + with open(jsonPath, "r") as f: index_data = json.load(f) - weight_map = index_data['weight_map'] + weight_map = index_data["weight_map"] shard_files = set(weight_map.values()) if len(shard_files) != len(verify_hash): raise ValueError("Number of shards and number of hash values do not match.") @@ -372,6 +395,7 @@ def verify_sharded_model_hashes(jsonPath: str, verify_hash: List[str]): return False return True + def check_and_get_model_type(model_dir, trust_remote_code=False): config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code) if config.model_type not in SUPPORTED_MODELS: @@ -381,6 +405,7 @@ def check_and_get_model_type(model_dir, trust_remote_code=False): def simple_dispatch_model(model, device_map): + # Third Party from accelerate.hooks import AlignDevicesHook, add_hook_to_module if "" in device_map: @@ -402,10 +427,14 @@ def simple_dispatch_model(model, device_map): prev_hook = None for idx, (n, d) in enumerate(cpu_offload_group): m = get_module_by_name_suffix(model, n) - _, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook) + _, prev_hook = accelerate.cpu_offload_with_hook( + m, execution_device=main_device, prev_module_hook=prev_hook + ) # set first cpu offload module's prev_module_hook to the last cpu offload module's hook if len(cpu_offload_group) > 1: - get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook + get_module_by_name_suffix( + model, cpu_offload_group[0][0] + )._hf_hook.prev_module_hook = prev_hook for n, d in device_map.items(): m = get_module_by_name_suffix(model, n) @@ -423,7 +452,9 @@ def simple_dispatch_model(model, device_map): # when qliear type is selected, it should auto-override the model post_init method and # not 
have to go about looping over modules to match qlinear type a second time as it is # very prone to bugs -def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None): +def gptqmodel_post_init( + model, use_act_order: bool, max_input_length: Optional[int] = None +): """ The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state. """ @@ -475,6 +506,7 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i if model_uses_exllama: # To be honest this is quite ugly, not proud of this. + # Third Party from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params device_to_buffers = {} @@ -523,7 +555,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i # The buffers need to have been initialized first before calling make_q4. for name, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllama": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllama" + ): submodule.post_init() # exllamav2 @@ -531,13 +566,17 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i model_uses_exllamav2 = False for _, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllamav2" + ): model_uses_exllamav2 = True device = submodule.qweight.device scratch_fixed = submodule.scratch_space_fixed() fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0)) if model_uses_exllamav2: + # Local from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors device_tensors = {} @@ -548,7 +587,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i model.device_tensors = device_tensors for _, submodule in model.named_modules(): - if isinstance(submodule, BaseQuantLinear) and submodule.QUANT_TYPE == "exllamav2": + if ( + isinstance(submodule, BaseQuantLinear) + and submodule.QUANT_TYPE == "exllamav2" + ): device = submodule.qweight.device submodule.post_init(temp_dq=model.device_tensors[device]) torch.cuda.empty_cache() @@ -557,7 +599,10 @@ def gptqmodel_post_init(model, use_act_order: bool, max_input_length: Optional[i def get_checkpoints( - model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs + model_name_or_path: str, + extensions: List[str], + possible_model_basenames: List[str], + **cached_file_kwargs, ): """ Retrives (and if necessary downloads from Hugging Face Hub) the model checkpoint. Sharding is supported. All the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) will be explored over all `extensions` (e.g. `[".bin", ".safetensors"]`). @@ -574,10 +619,14 @@ def get_checkpoints( possible_index_file = os.path.join(model_name_or_path, shard_index_name) if os.path.isfile(possible_index_file): # The model is sharded over several checkpoints. 
- possible_model_basename = possible_index_file.replace(ext + ".index.json", "") + possible_model_basename = possible_index_file.replace( + ext + ".index.json", "" + ) return True, possible_index_file, possible_model_basename else: - model_save_name = os.path.join(model_name_or_path, possible_model_basename) + model_save_name = os.path.join( + model_name_or_path, possible_model_basename + ) searched_files.append(possible_model_basename + ext) if os.path.isfile(model_save_name + ext): resolved_archive_file = model_save_name + ext @@ -628,14 +677,19 @@ def get_checkpoints( # return the most stable tensor dtype for quantization while minimizing vram -def auto_dtype_from_config(config: PretrainedConfig, quant_inference: bool = False) -> torch.dtype: +def auto_dtype_from_config( + config: PretrainedConfig, quant_inference: bool = False +) -> torch.dtype: # all the gptq inference kernels are float16 only if quant_inference: return torch.float16 dtype = getattr(config, "torch_dtype") if not dtype or not isinstance(dtype, torch.dtype): - raise ValueError("Your model config.json does not have torch_dtype set. Please check for model " "corruption.") + raise ValueError( + "Your model config.json does not have torch_dtype set. Please check for model " + "corruption." + ) if dtype == torch.float32: return torch.bfloat16 @@ -654,7 +708,9 @@ def get_moe_layer_modules(layer_modules: List, num_experts: int) -> List: for n in names: if EXPERT_INDEX_PLACEHOLDER in n: for index in range(num_experts): - new_inside_layer_modules[-1].append(n.replace(EXPERT_INDEX_PLACEHOLDER, str(index))) + new_inside_layer_modules[-1].append( + n.replace(EXPERT_INDEX_PLACEHOLDER, str(index)) + ) else: new_inside_layer_modules[-1].append(n) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py index e3257d80..ac5041ff 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/peft.py @@ -13,6 +13,7 @@ # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. +# Standard # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -24,20 +25,24 @@ from contextlib import contextmanager from typing import List, Optional, Tuple, Union -import torch +# Third Party from peft import PeftConfig, PeftModel, PeftType, get_peft_model from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING from peft.tuners.lora import LoraConfig, LoraModel from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ +import torch +# Local from ..models.base import BaseGPTQModel from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as QuantLinearTriton + class GPTQLoraConfig(LoraConfig): injected_fused_attention: bool = False injected_fused_mlp: bool = False + class GPTQLoraModel(LoraModel): def _replace_module(self, parent_module, child_name, new_module, old_module): # replace the lora linear @@ -70,7 +75,6 @@ def _create_new_module( # if module cannot be found, return None which results in a raise in the call-stack return new_module - def merge_adapter(self): raise NotImplementedError("gptq model not support merge ada lora adapter") @@ -99,6 +103,7 @@ def find_all_linear_names( results.add(res) return list(results) + @contextmanager def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig @@ -114,6 +119,7 @@ def hijack_peft_mappings(): PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel + def get_gptq_peft_model( model: BaseGPTQModel, peft_config: PeftConfig = None, @@ -125,7 +131,9 @@ def get_gptq_peft_model( if train_mode and not peft_config: raise ValueError("peft_config not specified when in train mode.") if not train_mode and not model_id: - raise ValueError("model_id(where to load adapters) not specified when in inference mode.") + raise ValueError( + "model_id(where to load adapters) not specified when in inference mode." 
+ ) if train_mode: peft_type = peft_config.peft_type @@ -133,8 +141,12 @@ def get_gptq_peft_model( peft_type = peft_type.value if peft_type in [PeftType.LORA.value]: if auto_find_all_linears: - peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True) - if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig): + peft_config.target_modules = find_all_linear_names( + model, ignore_lm_head=True + ) + if peft_type == PeftType.LORA.value and not isinstance( + peft_config, GPTQLoraConfig + ): peft_config = GPTQLoraConfig(**peft_config.to_dict()) # this hijack is needed as `get_peft_model` uses PEFTModelForCausalLM which inherits from @@ -142,9 +154,13 @@ def get_gptq_peft_model( with hijack_peft_mappings(): try: if train_mode: - peft_model = get_peft_model(model.model, peft_config, adapter_name=adapter_name) + peft_model = get_peft_model( + model.model, peft_config, adapter_name=adapter_name + ) else: - peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name) + peft_model = PeftModel.from_pretrained( + model.model, model_id, adapter_name + ) except Exception as exc: raise NotImplementedError( f"{model.__class__.__name__} not support \ @@ -159,4 +175,4 @@ def get_gptq_peft_model( "GPTQLoraModel", "find_all_linear_names", "get_gptq_peft_model", -] \ No newline at end of file +] diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index dbab2821..99abf154 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -15,13 +15,20 @@ # SPDX-License-Identifier: Apache-2.0 # https://spdx.dev/learn/handling-license-info/ -import pytest # pylint: disable=import-error -import torch +# Standard from typing import List -from transformers.utils.import_utils import _is_package_available -from transformers import AutoTokenizer, AutoConfig, GenerationConfig, AutoModelForCausalLM +# Third Party from peft import LoraConfig +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, +) +from transformers.utils.import_utils import _is_package_available +import pytest # pylint: disable=import-error +import torch GPTQ = "gptq" # r, lora_alpha @@ -38,22 +45,29 @@ VANILLA_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v0.3" QUANTIZED_MODEL_NAME = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"] - + + # Model loading function for quantized models -def load_autogptq_plugin_model(model_name:str, target_modules:List, torch_dtype:str, use_external_lib:bool = False): +def load_autogptq_plugin_model( + model_name: str, + target_modules: List, + torch_dtype: str, + use_external_lib: bool = False, +): + # First Party from fms_acceleration_peft.framework_plugin_autogptq import ( AutoGPTQAccelerationPlugin, ) _plugin = AutoGPTQAccelerationPlugin( - { - "peft": { - "quantization": { - "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} - } + { + "peft": { + "quantization": { + "auto_gptq": {"kernel": "triton_v2", "from_quantized": True} } - }, - use_external_lib = use_external_lib, + } + }, + use_external_lib=use_external_lib, ) class TrainArgs: @@ -68,48 +82,63 @@ class TrainArgs: target_modules=target_modules, ) - model = _plugin.model_loader( - model_name, torch_dtype=getattr(torch, torch_dtype) - ) + model = _plugin.model_loader(model_name, torch_dtype=getattr(torch, torch_dtype)) model, _ = _plugin.augmentation(model, args, (peft_config,)) 
model.eval() return model -# quantization function to manage the loading and quantizing of pretrained model + +# quantization function to manage the loading and quantizing of pretrained model # using external or local autogptq -def quantize_model(model_name, config, calibration_dataset, quant_config_kwargs, device, torch_dtype, use_external_lib=False): +def quantize_model( + model_name, + config, + calibration_dataset, + quant_config_kwargs, + device, + torch_dtype, + use_external_lib=False, +): if use_external_lib: - from auto_gptq import AutoGPTQForCausalLM as GPTQModel, BaseQuantizeConfig as QuantizeConfig + # Third Party + from auto_gptq import AutoGPTQForCausalLM as GPTQModel + from auto_gptq import BaseQuantizeConfig as QuantizeConfig + quantize_kwargs = {"use_triton": True} else: + # First Party from fms_acceleration_peft.gptqmodel import GPTQModel, QuantizeConfig + quantize_kwargs = {} - quantize_config = QuantizeConfig( - **quant_config_kwargs - ) + quantize_config = QuantizeConfig(**quant_config_kwargs) # load un-quantized model, by default, the model will always be loaded into CPU memory model = GPTQModel.from_pretrained( - model_name, - quantize_config = quantize_config, - config = config, - torch_dtype = getattr(torch, torch_dtype), + model_name, + quantize_config=quantize_config, + config=config, + torch_dtype=getattr(torch, torch_dtype), ).to(device) # quantize model, the examples should be list of dict whose keys can only be "input_ids" model.quantize(calibration_dataset, **quantize_kwargs) model.eval() return model + def get_wikitext2(tokenizer, num_samples=128, seqlen=128): + # Standard import random + + # Third Party + from datasets import load_dataset import numpy as np import torch - from datasets import load_dataset - wikidata = load_dataset('wikitext', 'wikitext-2-v1', split='test') - wikilist = [' \n' if s == '' else s for s in wikidata['text'] ] - text = ''.join(wikilist) - trainenc = tokenizer(text, return_tensors='pt') + wikidata = load_dataset("wikitext", "wikitext-2-v1", split="test") + wikilist = [" \n" if s == "" else s for s in wikidata["text"]] + + text = "".join(wikilist) + trainenc = tokenizer(text, return_tensors="pt") random.seed(0) np.random.seed(0) @@ -122,20 +151,23 @@ def get_wikitext2(tokenizer, num_samples=128, seqlen=128): j = i + seqlen inp = trainenc.input_ids[:, i:j] attention_mask = torch.ones_like(inp) - traindataset.append({'input_ids':inp,'attention_mask': attention_mask}) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) return traindataset + @pytest.fixture() def input_ids(seed: int = 42, device: torch.device = "cuda"): torch.manual_seed(seed) - yield torch.randint(0, 10000, (BS, SEQLEN), device=device) + yield torch.randint(0, 10000, (BS, SEQLEN), device=device) + @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) def test_pre_quantized_model_outputs_match( - input_ids, seed: int = 42, + input_ids, + seed: int = 42, ): """ Test for output equivalence when loading quantized models between @@ -143,33 +175,33 @@ def test_pre_quantized_model_outputs_match( """ torch.manual_seed(seed) original_model = load_autogptq_plugin_model( - QUANTIZED_MODEL_NAME, - TARGET_MODULES, FLOAT16, - use_external_lib=True + QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16, use_external_lib=True ) refactored_model = load_autogptq_plugin_model( - QUANTIZED_MODEL_NAME, - TARGET_MODULES, - FLOAT16 + QUANTIZED_MODEL_NAME, TARGET_MODULES, FLOAT16 ) - with 
torch.autocast(device_type='cuda', dtype=torch.float32): + with torch.autocast(device_type="cuda", dtype=torch.float32): with torch.no_grad(): original_logits = original_model(input_ids.to(original_model.device)).logits - refactored_logits = refactored_model(input_ids.to(refactored_model.device)).logits + refactored_logits = refactored_model( + input_ids.to(refactored_model.device) + ).logits assert torch.allclose( original_logits, refactored_logits, atol=ALLCLOSE_ATOL, rtol=ALLCLOSE_RTOL ), "Pre-quantized model logits don't match between extracted and external autogptq library" + @pytest.mark.skipif( not _is_package_available("auto_gptq"), reason="Only runs if auto_gptq is installed", ) def test_quantizing_pretrained_model_outputs_match( - input_ids, seed: int = 42, + input_ids, + seed: int = 42, ): """ - Test for regression of quantizing pretrained models + Test for regression of quantizing pretrained models with refactored gptq library against original autogptq library by calculating KL loss on the output logits of both variants """ @@ -197,49 +229,53 @@ def test_quantizing_pretrained_model_outputs_match( # quantize models for external autogptq lib and extracted gptq lib original_model = quantize_model( - VANILLA_MODEL_NAME, + VANILLA_MODEL_NAME, config, - calibration_dataset, - quant_config_kwargs, - device, + calibration_dataset, + quant_config_kwargs, + device, FLOAT16, - use_external_lib=True + use_external_lib=True, ) refactored_model = quantize_model( - VANILLA_MODEL_NAME, + VANILLA_MODEL_NAME, config, - calibration_dataset, - quant_config_kwargs, - device, + calibration_dataset, + quant_config_kwargs, + device, FLOAT16, - use_external_lib=False + use_external_lib=False, ) - # compare generated tokens between + # compare generated tokens between # unquantized, original library and refactored gptqmodel library unquantized_model = AutoModelForCausalLM.from_pretrained( - VANILLA_MODEL_NAME, - config=config + VANILLA_MODEL_NAME, config=config ).to(device) gen_config = GenerationConfig.from_pretrained(VANILLA_MODEL_NAME) gen_config.max_new_tokens = 5 - _inputs = torch.tensor([tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda") + _inputs = torch.tensor( + [tokenizer("auto-gptq is an easy to use")["input_ids"]], device="cuda" + ) output1 = tokenizer.decode( - original_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) + original_model.generate(inputs=_inputs, generation_config=gen_config).view(-1), + skip_special_tokens=True, + ) output2 = tokenizer.decode( - refactored_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) + refactored_model.generate(inputs=_inputs, generation_config=gen_config).view( + -1 + ), + skip_special_tokens=True, + ) output3 = tokenizer.decode( - unquantized_model.generate( - inputs=_inputs, generation_config=gen_config - ).view(-1), skip_special_tokens=True - ) - assert output1==output2==output3, f"generated tokens ({output1}, {output2}, {output3}) \ + unquantized_model.generate(inputs=_inputs, generation_config=gen_config).view( + -1 + ), + skip_special_tokens=True, + ) + assert ( + output1 == output2 == output3 + ), f"generated tokens ({output1}, {output2}, {output3}) \ don't match between both libraries after quantization" # compare prob. 
distributions between original library and refactored gptqmodel library @@ -258,5 +294,7 @@ def test_quantizing_pretrained_model_outputs_match( target = torch.nn.functional.softmax(original_logits, dim=-1) target = torch.flatten(target, start_dim=0, end_dim=1) error = loss_fn(input, target) - assert error.lt(LOSS_TOLERANCE), "Model logits don't match between both libraries \ + assert error.lt( + LOSS_TOLERANCE + ), "Model logits don't match between both libraries \ after quantization" diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index c383db75..e16b8dce 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -14,20 +14,26 @@ # limitations under the License. ############################################################################### # -- do not touch +# Standard import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +# Standard import unittest # noqa: E402 -import torch # noqa: E402 +# Third Party from gptqmodel import Backend, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import QuantLinear as TritonV2QuantLinear # noqa: E402 +from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 + QuantLinear as TritonV2QuantLinear, +) from transformers import AutoTokenizer # noqa: E402 +import torch # noqa: E402 GENERATE_EVAL_SIZE = 100 + class TestsQ4Triton(unittest.TestCase): def test_generation_desc_act_false(self): prompt = "I am in Paris and" @@ -54,16 +60,24 @@ def test_generation_desc_act_false(self): inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") # This one uses Autocast. - res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + res = model_q.generate( + **inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens + ) predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) # This one does not. - res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + res = model_q.model.generate( + **inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens + ) predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) def test_generation_desc_act_true(self): prompt = "I am in Paris and" @@ -80,7 +94,6 @@ def test_generation_desc_act_true(self): device="cuda:0", backend=Backend.TRITON, revision=revision, - ) for _, submodule in model_q.named_modules(): if isinstance(submodule, TritonV2QuantLinear): @@ -96,4 +109,6 @@ def test_generation_desc_act_true(self): predicted_text = tokenizer.decode(res[0]) - self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + self.assertEqual( + predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] + ) diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 410aa6e1..15facdcf 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -14,18 +14,21 @@ # limitations under the License. 
############################################################################### # -- do not touch +# Standard import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch +# Standard import os # noqa: E402 import unittest # noqa: E402 -import torch # noqa: E402 -import torch.utils.benchmark as benchmark # noqa: E402 +# Third Party from gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +import torch # noqa: E402 +import torch.utils.benchmark as benchmark # noqa: E402 MODEL_ID = "TheBloke/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" @@ -102,6 +105,8 @@ def test_triton_qlinear(self): qlinear_ref = ref_model.model.model.layers[0].self_attn.q_proj - ref_out = qlinear_ref(test_data) # noqa: F841 + ref_out = qlinear_ref(test_data) # noqa: F841 - _, measure_triton = benchmark_forward(qlinear_ref, test_data, desc="Triton", verbose=True) + _, measure_triton = benchmark_forward( + qlinear_ref, test_data, desc="Triton", verbose=True + ) From 12dd566195ed396cc34585b1cefe81a8ea519f64 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 7 Jul 2024 18:16:29 +0000 Subject: [PATCH 10/20] linting --- plugins/accelerated-peft/.pylintrc | 2 +- .../framework_plugin_autogptq.py | 24 ++++++++++--------- .../accelerated-peft/tests/test_q4_triton.py | 4 ++-- plugins/accelerated-peft/tests/test_triton.py | 2 +- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/plugins/accelerated-peft/.pylintrc b/plugins/accelerated-peft/.pylintrc index 45da4212..b4c1a280 100644 --- a/plugins/accelerated-peft/.pylintrc +++ b/plugins/accelerated-peft/.pylintrc @@ -52,7 +52,7 @@ ignore=CVS,protobufs # ignore-list. The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, # it can't be used as an escape character. -ignore-paths= +ignore-paths=.*gptqmodel/,tests/test_q4_triton.py,tests/test_triton.py # Files or directories matching the regular expression patterns are skipped. # The regex matches against base names, not paths. 
The default value ignores diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 8bcd0bb9..2d1f6212 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -60,17 +60,17 @@ def model_loader(self, model_name: str, **kwargs): # Third Party if self.use_external_lib: # Third Party - from auto_gptq import ( - AutoGPTQForCausalLM as GPTQModel, # pylint: disable=import-outside-toplevel,import-error + from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error + AutoGPTQForCausalLM as GPTQModel, ) - from auto_gptq import BaseQuantizeConfig as QuantizeConfig + from auto_gptq import BaseQuantizeConfig as QuantizeConfig # pylint: disable=import-outside-toplevel,import-error from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) else: - from .gptqmodel import GPTQModel, QuantizeConfig - from .gptqmodel.utils import Backend - from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( + from .gptqmodel import GPTQModel, QuantizeConfig # pylint: disable=import-outside-toplevel,import-error + from .gptqmodel.utils import Backend # pylint: disable=import-outside-toplevel,import-error + from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) # Local @@ -149,11 +149,12 @@ def model_loader(self, model_name: str, **kwargs): ) # NOTE: need to set the device map as below as we want to use AutoGPTQ for training. - # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints to "cpu" - # to avoid gpu consumption before train + # For low_cpu_mem_usage = True, we have to set the device map to load checkpoints + # to "cpu" to avoid gpu consumption before train # This approach will divert consumption to cpu memory, # a better approach would be to load the checkpoints to meta device - # QLoRA is currently implemented by the former approach and will encounter the same issue. + # QLoRA is currently implemented by the former approach and + # will encounter the same issue. # see https://github.com/huggingface/transformers/pull/25107#issuecomment-2134833262 kwargs["device_map"] = { @@ -263,7 +264,7 @@ def augmentation( ) else: # Local - from .gptqmodel.utils.peft import get_gptq_peft_model + from .gptqmodel.utils.peft import get_gptq_peft_model # pylint: disable=import-outside-toplevel,import-error (peft_config,) = modifiable_args # unpack modifiable args @@ -295,7 +296,8 @@ def augmentation( # 2. GPTQLoraModel._replace_module to replace the existing Linear with the LoraLinear. # Also move to device (which may depend on how base layer is implemented) - # NOTE: GPTQLoraModel inherits from LoraModel, and the _create_new_module method is called + # NOTE: GPTQLoraModel inherits from LoraModel, + # and the _create_new_module method is called # on the parent. 
Hence _create_new_module is patched on the parent # FIXME: diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index e16b8dce..33927a7d 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -24,8 +24,8 @@ import unittest # noqa: E402 # Third Party -from gptqmodel import Backend, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 +from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 +from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 QuantLinear as TritonV2QuantLinear, ) from transformers import AutoTokenizer # noqa: E402 diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 15facdcf..245bb939 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -25,7 +25,7 @@ import unittest # noqa: E402 # Third Party -from gptqmodel import Backend, GPTQModel # noqa: E402 +from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 From b6ec481cad977ef4cd7881002bc6dcbf79216c87 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Sun, 7 Jul 2024 18:16:45 +0000 Subject: [PATCH 11/20] add additional entry to requirements.txt --- plugins/accelerated-peft/requirements.txt | 5 +++++ plugins/accelerated-peft/tests/test_gptqmodel.py | 8 ++++---- plugins/accelerated-peft/tests/test_q4_triton.py | 4 ++-- plugins/accelerated-peft/tox.ini | 3 ++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/plugins/accelerated-peft/requirements.txt b/plugins/accelerated-peft/requirements.txt index a00233d3..237f9092 100644 --- a/plugins/accelerated-peft/requirements.txt +++ b/plugins/accelerated-peft/requirements.txt @@ -6,3 +6,8 @@ accelerate >= 0.29 # bitsandbytes for the BNB plugin bitsandbytes + +# Used to manage the thread limit in functions for converting old +# GPTQ models to new GPTQ model format that support symmetrical=False +# https://github.com/AutoGPTQ/AutoGPTQ/pull/640 +threadpoolctl \ No newline at end of file diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 99abf154..a56cbc9a 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -219,7 +219,7 @@ def test_quantizing_pretrained_model_outputs_match( calibration_dataset = get_wikitext2(tokenizer, num_samples=128, seqlen=128) quant_config_kwargs = { "bits": 4, - "group_size": -1, + "group_size": 64, "desc_act": True, "damp_percent": 0.1, "static_groups": False, @@ -286,13 +286,13 @@ def test_quantizing_pretrained_model_outputs_match( # Measure the distribution error with KD Loss # flatten as a single batch bs*seqlen # since batchmean sums the loss and averages on dim=0 - loss_fn = torch.nn.KLDivLoss(reduction="batchmean") + loss_fn = torch.nn.KLDivLoss(reduction="sum") # input should be a distribution in the log space input = torch.nn.functional.log_softmax(refactored_logits, dim=-1) - input = torch.flatten(input, start_dim=0, end_dim=1) + input = input.view(BS*SEQLEN, -1) # target must be prob distribution target = torch.nn.functional.softmax(original_logits, dim=-1) - target = torch.flatten(target, start_dim=0, end_dim=1) 
+ target = target.view(BS*SEQLEN, -1) error = loss_fn(input, target) assert error.lt( LOSS_TOLERANCE diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index 33927a7d..ba5d0674 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -55,7 +55,7 @@ def test_generation_desc_act_false(self): else: raise ValueError("Did not find a tritonv2 linear layer") - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") @@ -101,7 +101,7 @@ def test_generation_desc_act_true(self): else: raise ValueError("Did not find a tritonv2 linear layer") - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True) inp = tokenizer(prompt, return_tensors="pt").to(device) diff --git a/plugins/accelerated-peft/tox.ini b/plugins/accelerated-peft/tox.ini index cdcf221d..2e81324d 100644 --- a/plugins/accelerated-peft/tox.ini +++ b/plugins/accelerated-peft/tox.ini @@ -5,13 +5,14 @@ envlist = py, lint, fmt, build, twinecheck deps = pytest>=7 # for the tests, we need to install the deps ourselves - # as the package will install the github version + # as the package will install the github version -e {toxinidir}/../framework # set skip package installation as it will install package pyproject.toml before deps, will throw error when AutoGPTQ needs torch skip_install = true commands = # install the current package pip install --no-deps {toxinidir} + pip install threadpoolctl protobuf sentencepiece # these packages are required for some tests pytest {posargs:tests} [testenv:lint] From 9a3035927ae53fd11bb039f40b248b835636e4aa Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Mon, 8 Jul 2024 06:58:42 +0000 Subject: [PATCH 12/20] fixed union type backward compatibility with py39 --- .../gptqmodel/models/auto.py | 2 +- .../gptqmodel/models/base.py | 4 ++-- .../gptqmodel/utils/model.py | 6 ++--- .../accelerated-peft/tests/test_q4_triton.py | 22 ++++++++++++++----- plugins/accelerated-peft/tests/test_triton.py | 10 ++++++++- 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py index 178d9c89..19d15ef2 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/auto.py @@ -91,7 +91,7 @@ def from_quantized( device: Optional[Union[str, int]] = None, backend: Backend = Backend.AUTO, use_cuda_fp16: bool = True, - quantize_config: Optional[QuantizeConfig | Dict] = None, + quantize_config: Optional[Union[QuantizeConfig, Dict]] = None, model_basename: Optional[str] = None, use_safetensors: bool = True, trust_remote_code: bool = False, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 8c319c45..9536c65e 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -779,7 +779,7 @@ def from_pretrained( quantize_config: QuantizeConfig, max_memory: Optional[dict] = None, trust_remote_code: bool = False, - torch_dtype: [str | torch.dtype] = 
"auto", + torch_dtype: Union[str, torch.dtype] = "auto", **model_init_kwargs, ): """load un-quantized pretrained model to cpu""" @@ -880,7 +880,7 @@ def from_quantized( device: Optional[Union[str, int]] = None, low_cpu_mem_usage: bool = False, backend: Backend = Backend.AUTO, - torch_dtype: [str | torch.dtype] = "auto", + torch_dtype: Union[str, torch.dtype] = "auto", use_cuda_fp16: bool = True, quantize_config: Optional[QuantizeConfig] = None, model_basename: Optional[str] = None, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py index e98103e8..d51e0e60 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/utils/model.py @@ -15,7 +15,7 @@ ############################################################################### # Standard from logging import getLogger -from typing import List, Optional +from typing import List, Optional, Union import functools import hashlib import json @@ -79,13 +79,13 @@ def recurse_setattr(module, name, value): recurse_setattr(getattr(module, name), rest, value) -def get_device(obj: torch.Tensor | nn.Module): +def get_device(obj: Union[torch.Tensor, nn.Module]): if isinstance(obj, torch.Tensor): return obj.device return next(obj.parameters()).device -def move_to(obj: torch.Tensor | nn.Module, device: torch.device): +def move_to(obj: Union[torch.Tensor, nn.Module], device: torch.device): if get_device(obj) != device: obj = obj.to(device) return obj diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index ba5d0674..1201f342 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -24,17 +24,25 @@ import unittest # noqa: E402 # Third Party -from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 -from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 - QuantLinear as TritonV2QuantLinear, -) from transformers import AutoTokenizer # noqa: E402 import torch # noqa: E402 -GENERATE_EVAL_SIZE = 100 +CUDA_AVAILABLE = False +if torch.cuda.is_available(): + from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 + from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 + QuantLinear as TritonV2QuantLinear, + ) + CUDA_AVAILABLE = True + +GENERATE_EVAL_SIZE = 100 class TestsQ4Triton(unittest.TestCase): + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_generation_desc_act_false(self): prompt = "I am in Paris and" @@ -79,6 +87,10 @@ def test_generation_desc_act_false(self): predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE] ) + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_generation_desc_act_true(self): prompt = "I am in Paris and" device = torch.device("cuda:0") diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index 245bb939..aed7af12 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -25,11 +25,15 @@ import unittest # noqa: E402 # Third Party -from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 from transformers import AutoTokenizer 
# noqa: E402 import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 +CUDA_AVAILABLE = False +if torch.cuda.is_available(): + from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 + CUDA_AVAILABLE = True + MODEL_ID = "TheBloke/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" LEARNING_RATE = 3e-5 @@ -94,6 +98,10 @@ def get_model_and_tokenizer( class TestTriton(unittest.TestCase): + @unittest.skipIf( + CUDA_AVAILABLE is False, + "Only runs if there is a cuda device available", + ) def test_triton_qlinear(self): ref_model, _ = get_model_and_tokenizer( model_id=MODEL_ID, From 68774dfe87bd75c7ecc6d41dc5cde88eba3d9e26 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 04:12:19 +0000 Subject: [PATCH 13/20] Fix FOAK dequant for compatibility with local gptq package --- .../fused_ops/unsloth_lora/gptq/fast_lora.py | 3 +++ .../fused_ops/unsloth_lora/gptq/triton/kernels.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py index 41e5355e..4000a258 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/fast_lora.py @@ -98,6 +98,9 @@ def get_lora_parameters(proj): base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj qstate = extract_gptq_state(base_layer) + if base_layer.__module__.startswith("auto_gptq"): + setattr(qstate.qzeros, "offset", 1) + if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged: return qstate, None, None, None, None diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py index c252d26d..efc3b41e 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py @@ -110,7 +110,10 @@ def dequant_kernel_248( zeros = zeros & maxq # Dequantize - zeros = zeros + 1 + # None if using local gptqpackage, official autogptq should have an offset value + if getattr(qzeros_ptr, "offset", None) is not None: + zeros = zeros + qzeros_ptr.offset + weights = weights - zeros weights = weights.to(tl.float32) weights = scales * weights From f125683cef4c676c2a22bb3fe338034b6beb50d1 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 06:37:08 +0000 Subject: [PATCH 14/20] add benchmark comparison script --- scripts/benchmarks/compare_with_reference.py | 101 +++++++++++++++++++ scripts/run_benchmarks.sh | 2 + 2 files changed, 103 insertions(+) create mode 100644 scripts/benchmarks/compare_with_reference.py diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py new file mode 100644 index 00000000..0bcc9dbb --- /dev/null +++ b/scripts/benchmarks/compare_with_reference.py @@ -0,0 +1,101 @@ +import argparse +import pandas as pd +from copy import copy +import matplotlib.pyplot as plt +import os +from numpy import linalg + +PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +INDICES 
= ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" +BENCHMARK_FILENAME = "benchmarks.csv" +FIGURE_FILENAME = "comparison.jpg" + +def plot_chart(ax, x, y, title, xlabel, ylabel): + ax.scatter(x, y, s=10) + ax.plot() + ax.set_title(title, fontsize=8) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + ax.axline((0, 0), slope=1) + +def plot_table(ax, cell_inputs, title, col_widths, col_labels): + table = ax.table(cellText=cell_inputs, loc="center", colWidths=col_widths, colLabels=col_labels) + table.scale(1, 2) + table.auto_set_font_size(False) + table.set_fontsize(8) + table.auto_set_column_width(0) + ax.set_title(title, fontsize=10) + +def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): + num_plots = len(plot_columns) + rows = num_plots + fig, axs = plt.subplots(rows, 2, figsize=(20, 28)) + fig.tight_layout(pad=5.0) + + # filter ref to only those rows seen in df + ref = ref[ref.index.isin(df.index.tolist())] + for idx in range(num_plots): + column = plot_columns[idx] + assert (column in ref.columns) and (column in df.columns), f"Column Name `{column}` not in Dataframe" + ax1 = axs[idx][0] + ax2 = axs[idx][1] + ax2.axis('off') + + ref_series = ref[column].fillna(0) + df_series = df[column].fillna(0) + # Calculate difference of l1 norms as a percentage on reference + ref_norm = linalg.norm(ref_series, ord=1) + df_norm = linalg.norm(df_series, ord=1) + norm_difference = abs(df_norm - ref_norm)/(ref_norm+1e-9) + # Extract outliers from reference based on % threshold on referance + ds = abs(df_series-ref_series)/(ref_series+1e-9) + outliers = ds.index[ds>threshold_ratio].to_list() + + plot_chart( + ax1, + ref_series, + df_series, + title=f"Metric: {column}", + xlabel="Reference", + ylabel="New", + ) + + cell_inputs = [[outlier, ref_series[outlier], df_series[outlier]] for outlier in outliers] if len(outliers)>0 else [["","",""]] + + plot_table( + ax2, + cell_inputs = cell_inputs, + title=f"Metric: {column} outliers\n\nNorm Difference={norm_difference:.3f}", + col_widths=[0.9, 0.2, 0.2], + col_labels=["Experiment", "Reference", "New"] + ) + return fig + +def read_df(file_path, indices, plot_columns): + df = pd.read_csv(file_path) + df.set_index(indices, inplace=True) + df = df[plot_columns] + return df + +def main(result_dir): + ref = read_df(REFERENCE_FILEPATH, INDICES, PLOT_COLUMNS) + df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), INDICES, PLOT_COLUMNS) + fig = compare_results(df, ref, PLOT_COLUMNS, threshold_ratio=.1) + plt.savefig(os.path.join(result_dir, FIGURE_FILENAME)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="Acceleration Benchmarking Debug Tool", + description="This script analyses benchmark outputs against the current reference", + ) + parser.add_argument( + "--result_dir", + default="benchmark_outputs", + help="benchmark result directory", + ) + + args = parser.parse_args() + main( + result_dir=args.result_dir, + ) \ No newline at end of file diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh index 8f8a1f9b..6e63f530 100644 --- a/scripts/run_benchmarks.sh +++ b/scripts/run_benchmarks.sh @@ -137,3 +137,5 @@ PYTHONPATH=. \ 'error_messages' \ 'acceleration_framework_config_file' +PYTHONPATH=. 
\ + python $WORKING_DIR/compare_with_reference.py --result_dir $RESULT_DIR From 2a705ca4a2f7bc210ac515022f91fc42c36d1320 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 09:01:28 +0000 Subject: [PATCH 15/20] modified comparison script --- scripts/benchmarks/compare_with_reference.py | 101 ++++++++++--------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 0bcc9dbb..054315ea 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -5,72 +5,56 @@ import os from numpy import linalg -PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] -INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] -REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" +# default columns to compare +DEFAULT_PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +# Used as combined identifier of experiment +DEFAULT_INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] +DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" -FIGURE_FILENAME = "comparison.jpg" def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) - ax.plot() ax.set_title(title, fontsize=8) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.axline((0, 0), slope=1) - -def plot_table(ax, cell_inputs, title, col_widths, col_labels): - table = ax.table(cellText=cell_inputs, loc="center", colWidths=col_widths, colLabels=col_labels) - table.scale(1, 2) - table.auto_set_font_size(False) - table.set_fontsize(8) - table.auto_set_column_width(0) - ax.set_title(title, fontsize=10) - + def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): num_plots = len(plot_columns) - rows = num_plots - fig, axs = plt.subplots(rows, 2, figsize=(20, 28)) - fig.tight_layout(pad=5.0) - + + charts = [] + total_outliers = [] # filter ref to only those rows seen in df ref = ref[ref.index.isin(df.index.tolist())] for idx in range(num_plots): + _, ax = plt.subplots(figsize=(8, 8)) column = plot_columns[idx] - assert (column in ref.columns) and (column in df.columns), f"Column Name `{column}` not in Dataframe" - ax1 = axs[idx][0] - ax2 = axs[idx][1] - ax2.axis('off') - + assert ( + (column in ref.columns) + and + (column in df.columns) + ), f"Column Name `{column}` not in Dataframe" + ref_series = ref[column].fillna(0) df_series = df[column].fillna(0) - # Calculate difference of l1 norms as a percentage on reference - ref_norm = linalg.norm(ref_series, ord=1) - df_norm = linalg.norm(df_series, ord=1) - norm_difference = abs(df_norm - ref_norm)/(ref_norm+1e-9) - # Extract outliers from reference based on % threshold on referance + # Extract outliers base on some threshold % difference on referance ds = abs(df_series-ref_series)/(ref_series+1e-9) - outliers = ds.index[ds>threshold_ratio].to_list() - + outliers = ds.index[ds>threshold_ratio].to_list() plot_chart( - ax1, + ax, ref_series, df_series, title=f"Metric: {column}", xlabel="Reference", ylabel="New", - ) - - cell_inputs = [[outlier, ref_series[outlier], df_series[outlier]] for outlier in outliers] if 
len(outliers)>0 else [["","",""]] - - plot_table( - ax2, - cell_inputs = cell_inputs, - title=f"Metric: {column} outliers\n\nNorm Difference={norm_difference:.3f}", - col_widths=[0.9, 0.2, 0.2], - col_labels=["Experiment", "Reference", "New"] - ) - return fig + ) + charts.append((ax, f"compare-{column}.jpg")) + total_outliers += [ + [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] + for outlier in outliers + ] + return total_outliers, charts def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) @@ -78,11 +62,14 @@ def read_df(file_path, indices, plot_columns): df = df[plot_columns] return df -def main(result_dir): - ref = read_df(REFERENCE_FILEPATH, INDICES, PLOT_COLUMNS) - df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), INDICES, PLOT_COLUMNS) - fig = compare_results(df, ref, PLOT_COLUMNS, threshold_ratio=.1) - plt.savefig(os.path.join(result_dir, FIGURE_FILENAME)) +def main(result_dir, reference_benchmark_filepath, plot_columns): + ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) + df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns) + total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=.1) + outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) + outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) + for chart, filename in charts: + chart.figure.savefig(os.path.join(result_dir, filename)) if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -92,10 +79,24 @@ def main(result_dir): parser.add_argument( "--result_dir", default="benchmark_outputs", - help="benchmark result directory", + help="benchmark result directory to use for comparison", + ) + + parser.add_argument( + "--reference_benchmark_filepath", + default="scripts/benchmarks/refs/a100_80gb.csv", + help="file path of the csv to compare on", + ) + + parser.add_argument( + "--plot_columns", + default=DEFAULT_PLOT_COLUMNS, + nargs='+' ) args = parser.parse_args() main( result_dir=args.result_dir, - ) \ No newline at end of file + reference_benchmark_filepath=args.reference_benchmark_filepath, + plot_columns=args.plot_columns, + ) From eefdd888595bd9de63fbeb972f4417e021b6795d Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 09:28:26 +0000 Subject: [PATCH 16/20] formatted scripts/ --- scripts/benchmarks/compare_with_reference.py | 77 ++++++++++++-------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 054315ea..4bd54d43 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -1,28 +1,44 @@ -import argparse -import pandas as pd +# Standard from copy import copy -import matplotlib.pyplot as plt +import argparse import os + +# Third Party from numpy import linalg +import matplotlib.pyplot as plt +import pandas as pd # default columns to compare -DEFAULT_PLOT_COLUMNS = ["mem_torch_mem_alloc_in_bytes", "mem_peak_torch_mem_alloc_in_bytes", "train_loss", "train_tokens_per_second"] +DEFAULT_PLOT_COLUMNS = [ + "mem_torch_mem_alloc_in_bytes", + "mem_peak_torch_mem_alloc_in_bytes", + "train_loss", + "train_tokens_per_second", +] # Used as combined identifier of experiment -DEFAULT_INDICES = ["framework_config", "peft_method", "model_name_or_path", "num_gpus", "per_device_train_batch_size"] +DEFAULT_INDICES = [ + "framework_config", + "peft_method", + 
"model_name_or_path", + "num_gpus", + "per_device_train_batch_size", +] DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" + def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) ax.set_title(title, fontsize=8) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.axline((0, 0), slope=1) - -def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1): - num_plots = len(plot_columns) - + + +def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): + num_plots = len(plot_columns) + charts = [] total_outliers = [] # filter ref to only those rows seen in df @@ -30,25 +46,23 @@ def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1) for idx in range(num_plots): _, ax = plt.subplots(figsize=(8, 8)) column = plot_columns[idx] - assert ( - (column in ref.columns) - and - (column in df.columns) - ), f"Column Name `{column}` not in Dataframe" + assert (column in ref.columns) and ( + column in df.columns + ), f"Column Name `{column}` not in Dataframe" ref_series = ref[column].fillna(0) df_series = df[column].fillna(0) # Extract outliers base on some threshold % difference on referance - ds = abs(df_series-ref_series)/(ref_series+1e-9) - outliers = ds.index[ds>threshold_ratio].to_list() + ds = abs(df_series - ref_series) / (ref_series + 1e-9) + outliers = ds.index[ds > threshold_ratio].to_list() plot_chart( - ax, - ref_series, - df_series, - title=f"Metric: {column}", - xlabel="Reference", + ax, + ref_series, + df_series, + title=f"Metric: {column}", + xlabel="Reference", ylabel="New", - ) + ) charts.append((ax, f"compare-{column}.jpg")) total_outliers += [ [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] @@ -56,25 +70,30 @@ def compare_results(df, ref, plot_columns, num_columns = 2, threshold_ratio=0.1) ] return total_outliers, charts + def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) df.set_index(indices, inplace=True) df = df[plot_columns] return df + def main(result_dir, reference_benchmark_filepath, plot_columns): ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) - df = read_df(os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns) - total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=.1) + df = read_df( + os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns + ) + total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=0.1) outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) for chart, filename in charts: chart.figure.savefig(os.path.join(result_dir, filename)) + if __name__ == "__main__": parser = argparse.ArgumentParser( - prog="Acceleration Benchmarking Debug Tool", - description="This script analyses benchmark outputs against the current reference", + prog="Acceleration Benchmarking Comparison Tool", + description="This script analyses benchmark outputs against a given reference", ) parser.add_argument( "--result_dir", @@ -88,11 +107,7 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): help="file path of the csv to compare on", ) - parser.add_argument( - "--plot_columns", - default=DEFAULT_PLOT_COLUMNS, - nargs='+' - ) + parser.add_argument("--plot_columns", 
default=DEFAULT_PLOT_COLUMNS, nargs="+") args = parser.parse_args() main( From aa57cf2314cd71a3139e02d66f6a61003d220a9c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Thu, 11 Jul 2024 19:38:59 +0000 Subject: [PATCH 17/20] edited comparison script to detect difference in command args --- scripts/benchmarks/compare_with_reference.py | 67 +++++++++++++++----- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 4bd54d43..38375a9d 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -23,7 +23,15 @@ "num_gpus", "per_device_train_batch_size", ] -DEFAULT_OUTLIERS_DF_COLUMN_NAMES = ["scenario", *DEFAULT_INDICES, "reference", "new"] + +DEFAULT_IGNORED_COLUMNS = [ + "epoch", + "train_runtime", + "train_steps_per_second", + "train_samples_per_second", + "mem_nvidia_mem_reserved", +] + DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" @@ -36,7 +44,7 @@ def plot_chart(ax, x, y, title, xlabel, ylabel): ax.axline((0, 0), slope=1) -def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): +def compare_results(df, ref, plot_columns, threshold_ratio=0.1): num_plots = len(plot_columns) charts = [] @@ -68,24 +76,45 @@ def compare_results(df, ref, plot_columns, num_columns=2, threshold_ratio=0.1): [column, *outlier, ref_series[outlier].item(), df_series[outlier].item()] for outlier in outliers ] - return total_outliers, charts + outliers_df = pd.DataFrame( + total_outliers, columns=["scenario", *df.index.names, "reference", "new"] + ) + return outliers_df, outliers, charts def read_df(file_path, indices, plot_columns): df = pd.read_csv(file_path) df.set_index(indices, inplace=True) - df = df[plot_columns] - return df - - -def main(result_dir, reference_benchmark_filepath, plot_columns): - ref = read_df(reference_benchmark_filepath, DEFAULT_INDICES, plot_columns) - df = read_df( - os.path.join(result_dir, BENCHMARK_FILENAME), DEFAULT_INDICES, plot_columns + # all other columns not for plotting or explicitly ignored are hyperparameters + argument_columns = [ + col + for col in df.columns + if col not in (DEFAULT_IGNORED_COLUMNS + DEFAULT_PLOT_COLUMNS) + ] + return df[plot_columns], df[argument_columns] + + +def main( + result_dir, reference_benchmark_filepath, plot_columns, threshold_ratio, indices +): + ref, args_ref = read_df(reference_benchmark_filepath, indices, plot_columns) + df, args_df = read_df( + os.path.join(result_dir, BENCHMARK_FILENAME), indices, plot_columns + ) + # Analyse between both sets of results and retrieve outliers + outliers_df, outliers, charts = compare_results( + df, ref, plot_columns, threshold_ratio=threshold_ratio ) - total_outliers, charts = compare_results(df, ref, plot_columns, threshold_ratio=0.1) - outliers_df = pd.DataFrame(total_outliers, columns=DEFAULT_OUTLIERS_DF_COLUMN_NAMES) - outliers_df.to_csv(os.path.join(result_dir, "outliers.csv"), index=None) + # Find arguments that are different between ref and new + # to highlight as possible cause of anomaly + diff = args_df.compare(args_ref, align_axis=1).rename( + columns={"self": "new", "other": "ref"}, level=-1 + ) + diff = diff[diff.index.isin([outlier for outlier in outliers])] + outliers_df = outliers_df.set_index(indices).merge( + diff, left_index=True, right_index=True + ) + outliers_df.to_csv(os.path.join(result_dir, "outliers.csv")) for chart, filename in charts: 
chart.figure.savefig(os.path.join(result_dir, filename)) @@ -100,12 +129,18 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): default="benchmark_outputs", help="benchmark result directory to use for comparison", ) - parser.add_argument( "--reference_benchmark_filepath", default="scripts/benchmarks/refs/a100_80gb.csv", help="file path of the csv to compare on", ) + parser.add_argument( + "--threshold_ratio", + default=0.1, + help="the acceptable threshold percentage difference from the reference value.", + ) + + parser.add_argument("--indices", default=DEFAULT_INDICES, nargs="+") parser.add_argument("--plot_columns", default=DEFAULT_PLOT_COLUMNS, nargs="+") @@ -114,4 +149,6 @@ def main(result_dir, reference_benchmark_filepath, plot_columns): result_dir=args.result_dir, reference_benchmark_filepath=args.reference_benchmark_filepath, plot_columns=args.plot_columns, + threshold_ratio=args.threshold_ratio, + indices=args.indices, ) From d98e56ce6c76e5b91e92bf062c5d271743d5031c Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Fri, 12 Jul 2024 02:13:13 +0000 Subject: [PATCH 18/20] addresed PR edits --- scripts/benchmarks/compare_with_reference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/benchmarks/compare_with_reference.py b/scripts/benchmarks/compare_with_reference.py index 38375a9d..a580b8de 100644 --- a/scripts/benchmarks/compare_with_reference.py +++ b/scripts/benchmarks/compare_with_reference.py @@ -34,7 +34,7 @@ DEFAULT_REFERENCE_FILEPATH = "scripts/benchmarks/refs/a100_80gb.csv" BENCHMARK_FILENAME = "benchmarks.csv" - +OUTLIERS_FILENAME = "outliers.csv" def plot_chart(ax, x, y, title, xlabel, ylabel): ax.scatter(x, y, s=10) @@ -114,7 +114,7 @@ def main( outliers_df = outliers_df.set_index(indices).merge( diff, left_index=True, right_index=True ) - outliers_df.to_csv(os.path.join(result_dir, "outliers.csv")) + outliers_df.to_csv(os.path.join(result_dir, OUTLIERS_FILENAME)) for chart, filename in charts: chart.figure.savefig(os.path.join(result_dir, filename)) @@ -137,7 +137,7 @@ def main( parser.add_argument( "--threshold_ratio", default=0.1, - help="the acceptable threshold percentage difference from the reference value.", + help="the acceptable relative difference from the reference value.", ) parser.add_argument("--indices", default=DEFAULT_INDICES, nargs="+") From b92c99df69c6fce29e169390f57e08cba7c3e0c3 Mon Sep 17 00:00:00 2001 From: 1000850000 user Date: Fri, 12 Jul 2024 08:50:08 +0000 Subject: [PATCH 19/20] updated benchmarks --- scripts/benchmarks/refs/a100_80gb.csv | 168 +++++++++++------------ scripts/benchmarks/refs/requirements.txt | 78 +++++++++++ 2 files changed, 162 insertions(+), 84 deletions(-) create mode 100644 scripts/benchmarks/refs/requirements.txt diff --git a/scripts/benchmarks/refs/a100_80gb.csv b/scripts/benchmarks/refs/a100_80gb.csv index 45cdf125..6bb7714a 100644 --- a/scripts/benchmarks/refs/a100_80gb.csv +++ b/scripts/benchmarks/refs/a100_80gb.csv @@ -1,85 +1,85 @@ epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -0.15,,none,2e-5,,,76679.0,72971724288,44004763136,mistralai/Mistral-7B-v0.1,1,,4,,,float16,0.9112484455108643,565.9213,0.707,0.177,2895.102 
-0.15,,none,2e-5,,,43702.0,36762859520,29521119232,mistralai/Mistral-7B-v0.1,2,,2,,,float16,0.8622726058959961,307.6782,1.3,0.325,2662.522 -0.29,,none,2e-5,,,70669.0,72972117504,44005156352,mistralai/Mistral-7B-v0.1,1,,8,,,float16,1.017976951599121,1094.9632,0.731,0.091,2992.612 -0.29,,none,2e-5,,,52882.0,36763056128,29521315840,mistralai/Mistral-7B-v0.1,2,,4,,,float16,0.8944576263427735,576.1931,1.388,0.174,2843.491 -,,none,2e-5,,,80969.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,float16,,,,, -,,none,2e-5,,,79169.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,float16,,,,, -,,none,2e-5,,,80969.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,float16,,,,, -,,none,2e-5,,,80083.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,float16,,,,, -,,none,2e-5,,,80987.0,0,0,NousResearch/Llama-2-70b-hf,1,,4,,,float16,,,,, -,,none,2e-5,,,80923.0,0,0,NousResearch/Llama-2-70b-hf,2,,2,,,float16,,,,, -,,none,2e-5,,,80987.0,0,0,NousResearch/Llama-2-70b-hf,1,,8,,,float16,,,,, -,,none,2e-5,,,81006.0,0,0,NousResearch/Llama-2-70b-hf,2,,4,,,float16,,,,, -0.15,,none,2e-4,16,0.0,28703.0,26108963328,15119590912,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8818108749389648,458.2667,0.873,0.218,3575.21 -0.15,,none,2e-4,16,0.0,17669.0,15123161088,7850391552,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8540384006500245,270.1999,1.48,0.37,3031.829 -0.29,,none,2e-4,16,0.0,42167.0,37098695168,15119984128,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,1.0028394603729247,912.5081,0.877,0.11,3590.982 -0.29,,none,2e-4,16,0.0,25207.0,21433753600,7850588160,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8833828353881836,482.6901,1.657,0.207,3394.311 -,,none,2e-4,16,0.0,80990.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.15,,none,2e-4,16,0.0,61532.0,57546370048,47311452160,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8696129798889161,561.2483,0.713,0.178,1459.604 -,,none,2e-4,16,0.0,80207.0,0,0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.29,,none,2e-4,16,0.0,69171.0,64398757376,47311648768,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.885084867477417,938.9714,0.852,0.106,1744.888 -,,none,2e-4,16,0.0,80617.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80907.0,0,0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80617.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -,,none,2e-4,16,0.0,80783.0,0,0,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.15,True,baseline-peft-bnb,2e-4,16,0.0,25995.0,22825932800,5368221184,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8698946189880371,586.9178,0.682,0.17,2791.532 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,12476.0,9974622720,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8552890300750733,284.376,1.407,0.352,2880.693 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,46117.0,40278956032,5368614400,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8654958820343017,1148.1408,0.697,0.087,2854.005 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,20405.0,16587205120,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj 
v_proj o_proj,float16,0.8869294357299805,503.0597,1.59,0.199,3256.87 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,47189.0,46475660288,25726225920,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8893787956237793,1185.2488,0.337,0.084,1382.326 -0.15,True,baseline-peft-bnb,2e-4,16,0.0,24751.0,21932720128,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8617707204818725,568.5808,0.704,0.176,1440.78 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,68683.0,67165218816,25726619136,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8893123245239258,2124.0668,0.377,0.047,1542.701 -0.29,True,baseline-peft-bnb,2e-4,16,0.0,32064.0,29353074176,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8585504531860352,962.8971,0.831,0.104,1701.532 -,True,baseline-peft-bnb,2e-4,16,0.0,80121.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.14,True,baseline-peft-bnb,2e-4,16,0.0,51701.0,46524471808,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9204118633270264,1981.2518,0.202,0.05,413.476 -,True,baseline-peft-bnb,2e-4,16,0.0,79555.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,baseline-peft-bnb,2e-4,16,0.0,80394.0,72398346752,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9444941711425782,3760.1788,0.213,0.027,435.724 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,18903.0,15860019712,4843384320,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8704616069793701,479.6819,0.834,0.208,3415.597 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,12533.0,9974622720,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8528211212158203,282.8845,1.414,0.354,2895.882 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,33327.0,26849751552,4843777536,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8675907611846924,945.5376,0.846,0.106,3465.542 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,20423.0,16587205120,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.854712610244751,502.3584,1.592,0.199,3261.417 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,19257.0,13636909056,4843384320,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8722561931610108,420.8819,0.95,0.238,3892.778 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,12118.0,9796856320,2727018496,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8581914234161377,232.51,1.72,0.43,3523.289 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,32209.0,22430791680,4843777536,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8683128643035889,821.991,0.973,0.122,3986.418 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,19463.0,16207063552,2727215104,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.852388572692871,427.1268,1.873,0.234,3835.864 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,37417.0,36218024448,25201389056,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8887558174133301,913.0381,0.438,0.11,1794.449 -0.15,True,accelerated-peft-bnb,2e-4,16,0.0,24952.0,21921468928,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj 
o_proj,float16,0.8612120914459228,572.3054,0.699,0.175,1431.404 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,49893.0,47207756288,25201782272,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8909227275848388,1711.7453,0.467,0.058,1914.303 -0.29,True,accelerated-peft-bnb,2e-4,16,0.0,32207.0,29359173632,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8591176319122314,959.9538,0.833,0.104,1706.749 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,37547.0,35651058176,25201389056,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8895366668701172,854.9879,0.468,0.117,1916.284 -0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.0,24572.0,21746056192,13219233792,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8630767631530761,514.5553,0.777,0.194,1592.054 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,49861.0,46058696192,25201782272,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8951810073852539,1601.6113,0.499,0.062,2045.94 -0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.0,31701.0,29043888640,13219430400,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8600863265991211,880.114,0.909,0.114,1861.577 -0.14,True,accelerated-peft-bnb,2e-4,16,0.0,71801.0,68159977472,37346815488,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9996430969238281,3700.3604,0.108,0.027,442.768 -0.14,True,accelerated-peft-bnb,2e-4,16,0.0,51579.0,46524471808,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9264963436126709,1955.4907,0.205,0.051,418.923 -,True,accelerated-peft-bnb,2e-4,16,0.0,79375.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-bnb,2e-4,16,0.0,80815.0,72398346752,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9262647342681884,3714.7153,0.215,0.027,441.057 -0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.0,71995.0,67350935552,37346815488,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9998687934875489,3351.04,0.119,0.03,488.923 -0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.0,51141.0,46250760704,19172741120,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9389877033233642,1747.6289,0.229,0.057,468.749 -,True,accelerated-peft-bnb-foak,2e-4,16,0.0,80303.0,0,0,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-bnb-foak,2e-4,16,0.0,79861.0,71720933888,19172937728,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9403298473358155,3375.4111,0.237,0.03,485.393 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,19425.0,15890329088,4873693696,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.009563512802124,491.6352,0.814,0.203,3332.552 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,12230.0,9690031616,2743565312,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9266629409790039,294.4237,1.359,0.34,2782.385 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,33219.0,26880060928,4874086912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9904310989379883,953.3973,0.839,0.105,3436.972 
-0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,19477.0,16000624128,2743761920,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8998308277130127,506.1818,1.58,0.198,3236.781 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,19065.0,13631990784,4873693696,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.003525791168213,414.297,0.965,0.241,3954.651 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,11879.0,9512265216,2743565312,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9293491744995117,224.6767,1.78,0.445,3646.128 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,32721.0,22390647808,4874086912,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.992929859161377,810.9726,0.986,0.123,4040.581 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,19063.0,15620482560,2743761920,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9045120429992676,418.8226,1.91,0.239,3911.919 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,36389.0,35528093184,24511457792,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.89991379737854,897.8879,0.445,0.111,1824.727 -0.15,True,accelerated-peft-autogptq,2e-4,16,0.0,22882.0,20691720192,12526730240,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8638970375061035,557.2929,0.718,0.179,1469.963 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,48959.0,46517825024,24511851008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.893577823638916,1673.2594,0.478,0.06,1958.334 -0.29,True,accelerated-peft-autogptq,2e-4,16,0.0,29704.0,27482931712,12526926848,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.864154224395752,938.3626,0.853,0.107,1746.02 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,36607.0,33649802752,24511457792,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8993340969085694,811.6061,0.493,0.123,2018.713 -0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,22801.0,20438869504,12526730240,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8660580062866211,478.0288,0.837,0.209,1713.704 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,49669.0,42707730944,24511851008,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8937735366821289,1533.2657,0.522,0.065,2137.138 -0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,29370.0,26951336960,12526926848,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8651807403564453,838.8338,0.954,0.119,1953.188 -0.14,True,accelerated-peft-autogptq,2e-4,16,0.0,71177.0,65895347200,36290144768,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9811842250823974,3639.6437,0.11,0.027,450.154 -0.14,True,accelerated-peft-autogptq,2e-4,16,0.0,49475.0,44873390592,18125597696,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9557892894744873,1923.445,0.208,0.052,425.902 -,True,accelerated-peft-autogptq,2e-4,16,0.0,79265.0,0,0,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-autogptq,2e-4,16,0.0,79187.0,70143285760,18125794304,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj 
o_proj,float16,0.9580207633972168,3685.3642,0.217,0.027,444.569 -0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,71223.0,65086305280,36290144768,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.981500825881958,3273.1958,0.122,0.031,500.551 -0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,49187.0,44599679488,18125597696,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9558010864257812,1682.0158,0.238,0.059,487.035 -,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,80945.0,0,0,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, -0.28,True,accelerated-peft-autogptq-foak,2e-4,16,0.0,78208.0,69465872896,18125794304,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9556115436553955,3298.135,0.243,0.03,496.766 +0.15,,none,2e-5,,,76671.0,72972297728.0,44005107200.0,mistralai/Mistral-7B-v0.1,1,,4,,,float16,0.9160769081115723,549.026,0.729,0.182,2984.194 +0.15,,none,2e-5,,,43744.0,36763146240.0,29521348608.0,mistralai/Mistral-7B-v0.1,2,,2,,,float16,0.8728336906433105,298.0786,1.342,0.335,2748.269 +0.29,,none,2e-5,,,79365.0,72972690944.0,44005500416.0,mistralai/Mistral-7B-v0.1,1,,8,,,float16,1.001595754623413,1066.0306,0.75,0.094,3073.833 +0.29,,none,2e-5,,,52883.0,36763342848.0,29521545216.0,mistralai/Mistral-7B-v0.1,2,,4,,,float16,0.9138528442382813,552.1771,1.449,0.181,2967.164 +,,none,2e-5,,,80969.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,4,,,float16,,,,, +,,none,2e-5,,,80925.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,2,,,float16,,,,, +,,none,2e-5,,,80969.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,,8,,,float16,,,,, +,,none,2e-5,,,81003.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,2,,4,,,float16,,,,, +,,none,2e-5,,,80987.0,,,NousResearch/Llama-2-70b-hf,1,,4,,,float16,,,,, +,,none,2e-5,,,80922.0,,,NousResearch/Llama-2-70b-hf,2,,2,,,float16,,,,, +,,none,2e-5,,,80987.0,,,NousResearch/Llama-2-70b-hf,1,,8,,,float16,,,,, +,,none,2e-5,,,80863.0,,,NousResearch/Llama-2-70b-hf,2,,4,,,float16,,,,, +0.15,,none,2e-4,16,0.1,28707.0,26109561344.0,15119705600.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8970945072174072,458.7158,0.872,0.218,3571.71 +0.15,,none,2e-4,16,0.1,17897.0,15458877440.0,7850448896.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8571704006195069,270.088,1.481,0.37,3033.086 +0.29,,none,2e-4,16,0.1,42171.0,37100825088.0,15120098816.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9901649284362793,913.5703,0.876,0.109,3586.807 +0.29,,none,2e-4,16,0.1,25659.0,22105014272.0,7850645504.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9025015163421631,482.2349,1.659,0.207,3397.514 +,,none,2e-4,16,0.1,80991.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,,none,2e-4,16,0.1,61532.0,57898183168.0,47311509504.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8681951332092285,551.3062,0.726,0.181,1485.926 +,,none,2e-4,16,0.1,80991.0,,,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.29,,none,2e-4,16,0.1,69436.0,65039245312.0,47311706112.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8880744457244873,924.9663,0.865,0.108,1771.308 +,,none,2e-4,16,0.1,80617.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, 
+,,none,2e-4,16,0.1,80756.0,,,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,,,,, +,,none,2e-4,16,0.1,80617.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,,none,2e-4,16,0.1,80851.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,True,baseline-peft-bnb,2e-4,16,0.1,25999.0,23228815360.0,5368450560.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8647766017913818,593.395,0.674,0.169,2761.062 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,12818.0,10431547904.0,2781601792.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8678814029693603,284.7643,1.405,0.351,2876.765 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,46121.0,41084491776.0,5368843776.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.868037691116333,1158.2474,0.691,0.086,2829.102 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,20421.0,17446783488.0,2781798400.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8695751667022705,502.2826,1.593,0.199,3261.909 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,47567.0,46825980416.0,25726455296.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8924774932861328,1171.0504,0.342,0.085,1399.086 +0.15,True,baseline-peft-bnb,2e-4,16,0.1,25163.0,22356893696.0,13273817088.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8943204975128174,568.962,0.703,0.176,1439.815 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,69237.0,67906358784.0,25726848512.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8907253837585449,2126.1835,0.376,0.047,1541.165 +0.29,True,baseline-peft-bnb,2e-4,16,0.1,32960.0,30165152256.0,13274013696.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.893255443572998,957.9628,0.835,0.104,1710.296 +,True,baseline-peft-bnb,2e-4,16,0.1,80123.0,,,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.14,True,baseline-peft-bnb,2e-4,16,0.1,52469.0,47591447040.0,19434999808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.0088242053985597,1955.5844,0.205,0.051,418.903 +,True,baseline-peft-bnb,2e-4,16,0.1,80581.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,True,baseline-peft-bnb,2e-4,16,0.1,80585.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,18907.0,15860617728.0,4843499008.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8644689750671387,482.8812,0.828,0.207,3392.967 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,12783.0,10431547904.0,2781601792.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8698421669006348,284.1914,1.408,0.352,2882.564 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,33331.0,26851881472.0,4843892224.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8686403369903565,948.8322,0.843,0.105,3453.508 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,20523.0,17446783488.0,2781798400.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8683876323699952,504.0477,1.587,0.198,3250.486 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,17449.0,14173894656.0,4843499008.0,mistralai/Mistral-7B-v0.1,1,lora,4,16,q_proj k_proj v_proj 
o_proj,float16,0.8669318771362304,419.9549,0.952,0.238,3901.371 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,12699.0,10065463808.0,2727075840.0,mistralai/Mistral-7B-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8524643421173096,225.0245,1.778,0.444,3640.493 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,28593.0,23504648192.0,4843892224.0,mistralai/Mistral-7B-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8657933044433593,819.2575,0.976,0.122,3999.719 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,19860.0,16744106496.0,2727272448.0,mistralai/Mistral-7B-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8490522384643555,420.7803,1.901,0.238,3893.719 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,37399.0,36828377600.0,25201503744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8931312561035156,925.5545,0.432,0.108,1770.182 +0.15,True,accelerated-peft-bnb,2e-4,16,0.1,25216.0,22359233024.0,13273817088.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.892439432144165,570.8031,0.701,0.175,1435.171 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,49913.0,48447599616.0,25201896960.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8924949169158936,1720.4669,0.465,0.058,1904.599 +0.29,True,accelerated-peft-bnb,2e-4,16,0.1,33214.0,30167236096.0,13274013696.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8910456848144531,961.4325,0.832,0.104,1704.124 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,36039.0,36153218048.0,25201503744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8932661628723144,855.0375,0.468,0.117,1916.173 +0.15,True,accelerated-peft-bnb-foak,2e-4,16,0.1,25513.0,22008699904.0,13219291136.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8599378490447998,511.3077,0.782,0.196,1602.166 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,46959.0,47096648192.0,25201896960.0,mistralai/Mixtral-8x7B-Instruct-v0.1,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8929532051086426,1595.4842,0.501,0.063,2053.797 +0.29,True,accelerated-peft-bnb-foak,2e-4,16,0.1,33064.0,29497270272.0,13219487744.0,mistralai/Mixtral-8x7B-Instruct-v0.1,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8600027751922608,878.2625,0.911,0.114,1865.502 +0.14,True,accelerated-peft-bnb,2e-4,16,0.1,72701.0,69770819584.0,37347044864.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.0098001098632812,3656.7382,0.109,0.027,448.05 +0.14,True,accelerated-peft-bnb,2e-4,16,0.1,52469.0,47591447040.0,19434999808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,1.0093148803710938,1952.1407,0.205,0.051,419.642 +,True,accelerated-peft-bnb,2e-4,16,0.1,79377.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +,True,accelerated-peft-bnb,2e-4,16,0.1,80837.0,,,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.1,71019.0,68424906752.0,37347044864.0,NousResearch/Llama-2-70b-hf,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,1.0100258159637452,3358.344,0.119,0.03,487.859 +0.14,True,accelerated-peft-bnb-foak,2e-4,16,0.1,51461.0,46787975680.0,19172855808.0,NousResearch/Llama-2-70b-hf,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9600833988189698,1747.5665,0.229,0.057,468.766 
+,True,accelerated-peft-bnb-foak,2e-4,16,0.1,80945.0,,,NousResearch/Llama-2-70b-hf,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-bnb-foak,2e-4,16,0.1,80967.0,72795019776.0,19173052416.0,NousResearch/Llama-2-70b-hf,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9324470138549805,3384.1355,0.236,0.03,484.141 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,19429.0,15890927104.0,4873808384.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9843631744384765,481.226,0.831,0.208,3404.637 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,12860.0,10079847936.0,2798148608.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9855545139312745,282.415,1.416,0.354,2900.695 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,33223.0,26882190848.0,4874201600.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9518059539794922,944.0475,0.847,0.106,3471.012 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,20472.0,16725984768.0,2798345216.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9537856483459473,497.4081,1.608,0.201,3293.875 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,17193.0,13632576512.0,4873808384.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9757871055603027,413.121,0.968,0.242,3965.908 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,12703.0,9780872704.0,2743622656.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9560029792785645,221.2793,1.808,0.452,3702.109 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,28977.0,22392753152.0,4874201600.0,TheBloke/Mistral-7B-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.9514095497131347,805.8956,0.993,0.124,4066.035 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,19800.0,16157525504.0,2743819264.0,TheBloke/Mistral-7B-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9230777645111083,415.3379,1.926,0.241,3944.74 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,36387.0,35528691200.0,24511572480.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8990980052947998,885.7851,0.452,0.113,1849.659 +0.15,True,accelerated-peft-autogptq,2e-4,16,0.1,23548.0,21067523584.0,12581313536.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8998581314086914,536.746,0.745,0.186,1526.234 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,48905.0,46519954944.0,24511965696.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,0.8961446380615234,1669.0298,0.479,0.06,1963.296 +0.29,True,accelerated-peft-autogptq,2e-4,16,0.1,30516.0,28187328512.0,12581510144.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8947424793243408,921.8778,0.868,0.108,1777.242 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,34731.0,34183981056.0,24511572480.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8992811870574952,814.994,0.491,0.123,2010.322 +0.15,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,24177.0,20715718656.0,12526787584.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.8655492782592773,475.158,0.842,0.21,1724.058 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,45901.0,43758690304.0,24511965696.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,1,lora,8,16,q_proj k_proj 
v_proj o_proj,float16,0.895248155593872,1528.7913,0.523,0.065,2143.393 +0.29,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,31452.0,27526991360.0,12526984192.0,TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.8628469562530517,835.9993,0.957,0.12,1959.81 +0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,71181.0,67237753856.0,36290374144.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9894431018829346,3599.2898,0.111,0.028,455.201 +0.14,True,accelerated-peft-autogptq,2e-4,16,0.1,51115.0,45806148096.0,18387856384.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9900471115112305,1900.1037,0.211,0.053,431.134 +,True,accelerated-peft-autogptq,2e-4,16,0.1,79265.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-autogptq,2e-4,16,0.1,80813.0,71747131904.0,18388052992.0,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9895571708679199,3672.2631,0.218,0.027,446.155 +0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,69479.0,66160276480.0,36290374144.0,TheBloke/Llama-2-70B-GPTQ,1,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9900266265869141,3283.8655,0.122,0.03,498.924 +0.14,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,50518.0,45136894464.0,18125712384.0,TheBloke/Llama-2-70B-GPTQ,2,lora,2,16,q_proj k_proj v_proj o_proj,float16,0.9589622497558594,1684.1824,0.238,0.059,486.408 +,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,80301.0,,,TheBloke/Llama-2-70B-GPTQ,1,lora,8,16,q_proj k_proj v_proj o_proj,float16,,,,, +0.28,True,accelerated-peft-autogptq-foak,2e-4,16,0.1,79950.0,70539958784.0,18125908992.0,TheBloke/Llama-2-70B-GPTQ,2,lora,4,16,q_proj k_proj v_proj o_proj,float16,0.9595681858062745,3305.9445,0.242,0.03,495.592 diff --git a/scripts/benchmarks/refs/requirements.txt b/scripts/benchmarks/refs/requirements.txt new file mode 100644 index 00000000..7bfa45a4 --- /dev/null +++ b/scripts/benchmarks/refs/requirements.txt @@ -0,0 +1,78 @@ +accelerate==0.32.1 +aiohttp==3.9.5 +aiosignal==1.3.1 +async-timeout==4.0.3 +attrs==23.2.0 +bitsandbytes==0.43.1 +certifi==2024.7.4 +charset-normalizer==3.3.2 +datasets==2.20.0 +dill==0.3.8 +docstring_parser==0.16 +einops==0.8.0 +filelock==3.15.4 +fire==0.6.0 +flash-attn==2.5.9.post1 +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration&subdirectory=plugins/framework +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels +-e git+https://github.com/achew010/fms-acceleration.git@33bf943ed4e19db7941ca8f852666a51160fb2de#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft +fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@85f32cb15019217ccc22156233f15d280d3f4690 +frozenlist==1.4.1 +fsspec==2024.5.0 +huggingface-hub==0.23.4 +idna==3.7 +Jinja2==3.1.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.5 +multiprocess==0.70.16 +networkx==3.3 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.19.3 +nvidia-nvjitlink-cu12==12.5.82 +nvidia-nvtx-cu12==12.1.105 
+packaging==24.1
+pandas==2.2.2
+peft==0.11.1
+protobuf==5.27.2
+psutil==6.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+safetensors==0.4.3
+sentencepiece==0.2.0
+shtab==1.7.1
+simpleeval==0.9.13
+six==1.16.0
+sympy==1.13.0
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.15.2
+torch==2.2.2
+tqdm==4.66.4
+transformers==4.39.3
+triton==2.2.0
+trl==0.9.6
+typing_extensions==4.12.2
+tyro==0.8.5
+tzdata==2024.1
+urllib3==2.2.2
+xxhash==3.4.1
+yarl==1.9.4

From 61fe08c8df87ee81799eb097fb7e494ad91bdaf2 Mon Sep 17 00:00:00 2001
From: 1000850000 user
Date: Mon, 15 Jul 2024 03:17:22 +0000
Subject: [PATCH 20/20] Add comment for foak kernel

---
 .../fused_ops/unsloth_lora/gptq/triton/kernels.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
index efc3b41e..aa6a2c70 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/fused_ops/unsloth_lora/gptq/triton/kernels.py
@@ -111,6 +111,7 @@ def dequant_kernel_248(
     # Dequantize
     # None if using local gptqpackage, official autogptq should have an offset value
+    # The Triton compiler throws a `NameError` for the `hasattr` builtin, so `getattr` is used instead
     if getattr(qzeros_ptr, "offset", None) is not None:
         zeros = zeros + qzeros_ptr.offset
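
A note on the pattern touched by this last hunk: the added comment records that the Triton compiler raises a `NameError` when `hasattr` is used, which is why the kernel probes the pointer object with `getattr` and a `None` default instead. Below is a minimal plain-Python sketch of that probe, outside of any Triton kernel; `QzerosPtr`, `adjust_zeros`, and the example `offset` value are hypothetical stand-ins for illustration and are not part of the patched code.

# Sketch only: mirrors the `getattr` probe from dequant_kernel_248 in plain Python.
# `QzerosPtr` is a made-up stand-in; the real object is supplied by Triton's launch machinery.
class QzerosPtr:
    def __init__(self, offset=None):
        if offset is not None:
            # only set the attribute when an offset actually exists
            self.offset = offset

def adjust_zeros(zeros, qzeros_ptr):
    # Same intent as `hasattr(qzeros_ptr, "offset")` (assuming `offset` is never
    # stored as None), but written with `getattr` plus a default, the form the
    # kernel uses because `hasattr` is rejected by the Triton compiler.
    if getattr(qzeros_ptr, "offset", None) is not None:
        return zeros + qzeros_ptr.offset
    return zeros

print(adjust_zeros(10, QzerosPtr()))          # -> 10, no offset attribute present
print(adjust_zeros(10, QzerosPtr(offset=3)))  # -> 13, offset applied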