diff --git a/README.md b/README.md
index 013b1e18..9e2ca640 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ## transformer is all you need.
-- deep training framework based on lightning and transformers
+- deep training framework based on transformers
 
 ## install and download
 
@@ -17,6 +17,10 @@ pip install -U git+https://github.com/ssbuild/deep_training.git --no-deps --forc
 
   
 ## update
+- <strong>2023-12-12</strong>
+  - 0.2.10 update qwen models (1.8b, 7b, 14b, 72b)
+
+
 - <strong>2023-11-13</strong>
   - 0.2.9 release
   - 0.2.9.post0 support chatglm3-6b-32k
diff --git a/setup.py b/setup.py
index 69378c0a..b706151c 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
 ]
 setup(
     name='deep_training',
-    version='0.2.9.post0',
+    version='0.2.10',
     description='an easy training architecture',
     long_description='torch_training: https://github.com/ssbuild/deep_training.git',
     license='Apache License 2.0',
diff --git a/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_256.cpp b/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_256.cpp
new file mode 100644
index 00000000..8458a9b5
--- /dev/null
+++ b/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_256.cpp
@@ -0,0 +1,198 @@
+#include <torch/all.h>
+#include <torch/python.h>
+#include <c10/cuda/CUDAGuard.h>
+
+// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp
+void vecquant8matmul_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros,
+  torch::Tensor g_idx
+);
+
+// Bound to Python as "vecquant8matmul": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant8matmul(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros, torch::Tensor g_idx
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
+}
+
+void vecquant8matmul_batched_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant8matmul_batched(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant8matmul_batched_column_compression_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_column_compression": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant8matmul_batched_column_compression(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant4matmul_batched_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant4matmul_batched": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant4matmul_batched(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant4matmul_batched_column_compression_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant4matmul_batched_column_compression": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant4matmul_batched_column_compression(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant8matmul_batched_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_old": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant8matmul_batched_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+
+void vecquant4matmul_batched_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant4matmul_batched_old": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant4matmul_batched_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant8matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_column_compression_old": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant8matmul_batched_column_compression_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant4matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant4matmul_batched_column_compression_old": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant4matmul_batched_column_compression_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+
+
+void vecquant8matmul_batched_faster_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_faster": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant8matmul_batched_faster(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros);
+}
+
+
+void vecquant8matmul_batched_faster_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_faster_old": selects the CUDA device that holds `vec`, then calls the launcher.
+void vecquant8matmul_batched_faster_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+void vecquant8matmul_batched_column_compression_faster_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_column_compression_faster": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant8matmul_batched_column_compression_faster(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales, zeros);
+}
+
+
+void vecquant8matmul_batched_column_compression_faster_old_cuda(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
+  torch::Tensor scales, torch::Tensor zeros
+);
+
+// Bound to Python as "vecquant8matmul_batched_column_compression_faster_old": selects the CUDA device of `vec`, then calls the launcher.
+void vecquant8matmul_batched_column_compression_faster_old(
+  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, torch::Tensor scales, torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard vec_device_scope(device_of(vec));  // scoped guard around the launch
+  vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul, scales, zeros);
+}
+
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Registers every host wrapper above under its own name; the third argument is the Python docstring.
+  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit faster Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit faster old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector faster 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector faster old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+}
diff --git a/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_kernel_256.cu b/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_kernel_256.cu
new file mode 100644
index 00000000..b7932cd7
--- /dev/null
+++ b/src/deep_training/nlp/models/qwen/cache_autogptq_cuda_kernel_256.cu
@@ -0,0 +1,1708 @@
+#define _CRT_SECURE_NO_WARNINGS
+#include <torch/all.h>
+#include <torch/python.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <stdint.h>
+
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
+// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_kernel_256.cu
+__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
+    // Emulated half-precision atomic add: compare-and-swap on the aligned 32-bit word containing *address.
+    unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do {
+        assumed = old;
+        c10::Half hsum;  // holds the current 16-bit lane as a half so the addition is done in floating point
+        hsum.x = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
+        hsum = hsum + val;  // previously `val` was added to the raw bit pattern held in an unsigned short
+        old = reinterpret_cast<size_t>(address) & 2
+                 ? (old & 0xffff) | (hsum.x << 16)
+                 : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+}
+__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
+    // Emulated 16-bit atomic add: compare-and-swap on the aligned 32-bit word that holds *address.
+    const bool in_upper = (reinterpret_cast<size_t>(address) & 2) != 0;  // which 16-bit lane of that word is ours
+    unsigned int* word_ptr = reinterpret_cast<unsigned int*>(reinterpret_cast<char*>(address) - (in_upper ? 2 : 0));
+    unsigned int current = *word_ptr;
+    unsigned int expected;
+    do {
+        expected = current;
+        __half_raw lane;
+        lane.x = in_upper ? (current >> 16) : (current & 0xffff);
+        lane = __half_raw(__hadd(lane, val));
+        current = in_upper ? (current & 0xffff) | (lane.x << 16) : (current & 0xffff0000) | lane.x;
+        current = atomicCAS(word_ptr, expected, current);
+    } while (expected != current);  // retry until no other thread changed the word in between
+}
+#endif
+
+template <typename scalar_t>
+__global__ void VecQuant8MatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    const       int* __restrict__ g_idx,
+    int batch,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+__global__ void VecQuant8BatchMatMulKernel_faster(  // NOTE(review): this file also declares an overload of this name without `zero_width`; confirm which one is defined and launched
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+
+
+__global__ void VecQuant8BatchMatMulKernel_faster_old(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+);
+
+
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster_old(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+
+__global__ void VecQuant8BatchMatMulKernel_faster(  // NOTE(review): an earlier prototype of this name additionally takes `int zero_width`; the two are distinct overloads — confirm which is defined
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+);
+
+
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster(  // NOTE(review): repeats an identical prototype declared earlier in this file; redundant but not conflicting
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+
+const int BLOCKWIDTH  = 128;  // threads per block used by the launchers below (dim3 threads(BLOCKWIDTH)); also the per-thread weight tile length
+const int BLOCKHEIGHT8 =  32;  // packed int32 rows per block for 8-bit weights: 32 rows x 4 values = BLOCKWIDTH
+const int BLOCKHEIGHT4 =  16;  // packed int32 rows per block for 4-bit weights: 16 rows x 8 values = BLOCKWIDTH
+const int BLOCKHEIGHT_OLD4 =  128;  // NOTE(review): presumably for the *_old 4-bit kernels; its use is outside the part of the file reviewed here
+//const int BLOCKHEIGHT_OLD8 =  128;
+
+__device__ inline unsigned int as_unsigned(int i) {
+  return static_cast<unsigned int>(i);  // same 32 bits viewed as unsigned, so later right shifts are logical
+}
+
+__device__ inline int as_int(int i) {
+  return i;  // identity: the argument is already an int
+}
+
+// Host launcher for VecQuant8BatchMatMulColumnCompressionKernel; `vec` is read as (batch, heads, vec_row, height) and the kernel accumulates into `mul`.
+void vecquant8matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3) * 4;  // logical columns: each int32 of `mat` carries four 8-bit values
+  // Grid: one block per BLOCKWIDTH rows of height x BLOCKWIDTH columns of width, BLOCKWIDTH threads per block.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>(); the dispatch name now matches this function.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_column_compression_cuda", ([&] {
+      VecQuant8BatchMatMulColumnCompressionKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+// Batched 8-bit dequantize-and-multiply: `mat` packs four 8-bit weights per int32 along width; partial sums are atomically added into `mul`.
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width / 4;  // number of packed int32 words addressed in mat
+  int input_total = batch * heads * vec_row * height;  // bound used for reads of vec
+  int out_total = batch * heads * vec_row * width;  // bound used for writes to mul
+  int tid = threadIdx.x;
+  // h is index of height with step being BLOCKWIDTH
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is index of width with step being 1
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // NOTE(review): threads that return here skip the __syncthreads() calls below — confirm this is safe for the shapes used
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current input row shared by the whole block
+  int k;
+  scalar_t w_tmp;
+
+  float weight[BLOCKWIDTH];  // this thread's dequantized column segment; stored as float even when scalar_t is double
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+        int i_w = (w / 4);  // packed word holding column w
+        int w_bit = (w % 4) * 8;  // bit offset of column w inside that word
+
+        int w_index = (batch_shift * height + h + k) * width / 4 + i_w;
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the sum
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];  // one scale per (batch, head, row)
+          scalar_t zero = zeros[batch_shift * height + h + k];  // one integer zero point per (batch, head, row)
+          w_tmp = ((as_unsigned(mat[w_index]) >> w_bit) & 0xFF);
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+          // res is the dot product over this block's slice of the height axis (at most BLOCKWIDTH elements)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the final result; mul is indexed as (batch, heads, vec_row, width)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // blocks with different blockIdx.x add into the same output element
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+
+// Host launcher for VecQuant8BatchMatMulKernel; `vec` is read as (batch, heads, vec_row, vec_height) and the kernel accumulates into `mul`.
+void vecquant8matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // Grid: one block per BLOCKHEIGHT8 packed rows of `mat` x BLOCKWIDTH columns, BLOCKWIDTH threads per block.
+  dim3 blocks(
+    (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_cuda", ([&] {
+      VecQuant8BatchMatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+// Batched 8-bit dequantize-and-multiply: `mat` packs four 8-bit weights per int32 along height; partial sums are atomically added into `mul`.
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int weight_total = batch * heads * height * width;  // number of packed int32 words addressed in mat
+  int input_total = batch * heads * vec_row * vec_height;  // bound used for reads of vec
+  int out_total = batch * heads * vec_row * width;  // bound used for writes to mul
+  int tid = threadIdx.x;
+  // h is index of height with step being BLOCKHEIGHT8
+  int h = BLOCKHEIGHT8 * blockIdx.x;
+  // w is index of width with step being 1
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // NOTE(review): threads that return here skip the __syncthreads() calls below — confirm this is safe for the shapes used
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current input row shared by the whole block
+  // i is index of mat of block first row
+  int i = width * h + w;
+  // if (i >= width * height) {
+  //   return;
+  // }
+  int k;
+  scalar_t w_tmp;
+
+  int z_w = w / 4;  // packed word of zeros holding column w (used when zeros are packed)
+  int z_mod = (w % 4) * 8;  // bit offset of column w inside that word
+
+  float weight[BLOCKWIDTH];  // this thread's dequantized column segment; stored as float even when scalar_t is double
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h * 4 + k < vec_height; ++k){
+        int k_w = (k / 4);  // packed row (relative to h) holding unpacked row k
+        int k_bit = (k % 4) * 8;  // bit offset of row k inside that word
+
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the sum
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];  // one scale per (batch, head, column)
+          scalar_t zero;
+          if (zero_width == width) {
+            zero = zeros[batch_shift * width + w];  // unpacked zeros: one int per column
+          } else {
+            zero = scalar_t(((as_unsigned(zeros[batch_shift * zero_width + z_w]) >> z_mod) & 0xFF) + 1);  // packed zeros: extract 8 bits, then add 1
+          }
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xFF);
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h * 4 + k < vec_height; ++k){
+          // res is the dot product over this block's slice of vec_height (at most BLOCKWIDTH elements)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the final result; mul is indexed as (batch, heads, vec_row, width)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // blocks with different blockIdx.x add into the same output element
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+
+
+// Host launcher for VecQuant8MatMulKernel; `vec` is read as (batch, vec_height), `mat` as (height, width), and the kernel accumulates into `mul`.
+void vecquant8matmul_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros,
+  torch::Tensor g_idx
+) {
+  int batch = vec.size(0);
+  int vec_height = vec.size(1);
+  int height = mat.size(0);
+  int width = mat.size(1);
+  int zero_width = zeros.size(1);
+  dim3 blocks(
+    (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_cuda", ([&] {
+      VecQuant8MatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(), g_idx.data_ptr<int>(),
+        batch, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+// 8-bit dequantize-and-multiply with a per-row group lookup (g_idx); partial sums are atomically added into `mul`.
+template <typename scalar_t>
+__global__ void VecQuant8MatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    const       int* __restrict__ g_idx,
+    int batch,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int h = BLOCKHEIGHT8 * blockIdx.x;  // first packed row of mat handled by this block
+  int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;  // output column handled by this thread
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current input row shared by the whole block
+  int i = width * h + w;  // index into mat of this thread's column in the block's first packed row
+  int g_h = h * 4;  // first unpacked row of the block (four 8-bit values per packed row)
+  int k;
+  unsigned int g;
+  scalar_t w_tmp;
+
+  int z_w = w / 4;  // packed word of zeros holding column w
+  int z_mod = (w % 4) * 8;  // bit offset of column w inside that word
+
+  float weight[BLOCKWIDTH];  // this thread's dequantized column segment; stored as float even when scalar_t is double
+
+  for (k = 0; k <  BLOCKWIDTH; ++k){  // NOTE(review): no bounds checks in this kernel — presumably the caller guarantees sizes that tile exactly; confirm
+    int k_w = (k / 4);
+    int k_bit = (k % 4) * 8;
+
+      g = as_int(g_idx[g_h + k]);  // group index of unpacked row g_h + k
+      scalar_t scale = scales[g * width + w];
+      scalar_t zero = scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xFF) + 1);  // extract 8 bits, then add 1
+
+      w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0xFF);
+
+    weight[k] = scale * (w_tmp - zero);
+  }
+
+
+  scalar_t res;
+  for (int b = 0; b < batch; ++b){
+      res = 0;
+    blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
+    __syncthreads();  // blockvec must be fully written before any thread reads it
+    for (k = 0; k <  BLOCKWIDTH; ++k){
+      res += weight[k] * blockvec[k];
+    }
+    atomicAdd(&mul[b * width + w], res);  // blocks with different blockIdx.x add into the same output element
+    __syncthreads();  // keep blockvec intact until every thread has finished reading it
+  }
+}
+
+
+
+// Host launcher for VecQuant4BatchMatMulKernel; `vec` is read as (batch, heads, vec_row, vec_height) and the kernel accumulates into `mul`.
+void vecquant4matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // Grid: one block per BLOCKHEIGHT4 packed rows of `mat` x BLOCKWIDTH columns, BLOCKWIDTH threads per block.
+  dim3 blocks(
+    (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_cuda", ([&] {
+      VecQuant4BatchMatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+// Batched 4-bit dequantize-and-multiply: `mat` packs eight 4-bit weights per int32 along height; partial sums are atomically added into `mul`.
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int weight_total = batch * heads * height * width;  // number of packed int32 words addressed in mat
+  int input_total = batch * heads * vec_row * vec_height;  // bound used for reads of vec
+  int out_total = batch * heads * vec_row * width;  // bound used for writes to mul
+  int tid = threadIdx.x;
+  // h is index of height with step being BLOCKHEIGHT4
+  int h = BLOCKHEIGHT4 * blockIdx.x;
+  // w is index of width with step being 1
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // NOTE(review): threads that return here skip the __syncthreads() calls below — confirm this is safe for the shapes used
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current input row shared by the whole block
+  // i is index of mat of block first row
+  int i = width * h + w;
+  int k;
+  scalar_t w_tmp;
+
+  int z_w = w / 8;  // packed word of zeros holding column w (used when zeros are packed)
+  int z_mod = (w % 8) * 4;  // bit offset of column w inside that word
+
+  float weight[BLOCKWIDTH];  // this thread's dequantized column segment; stored as float even when scalar_t is double
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h * 8 + k < vec_height; ++k){
+        int k_w = (k / 8);  // packed row (relative to h) holding unpacked row k
+        int k_bit = (k % 8) * 4;  // bit offset of row k inside that word
+
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the sum
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];  // one scale per (batch, head, column)
+          scalar_t zero;
+          if (zero_width == width) {
+            zero = zeros[batch_shift * width + w];  // unpacked zeros: one int per column
+          } else {
+            zero = scalar_t(((as_unsigned(zeros[batch_shift * zero_width + z_w]) >> z_mod) & 0xF));  // packed zeros: extract 4 bits
+          }
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h * 8 + k < vec_height; ++k){
+          // res is the dot product over this block's slice of vec_height (at most BLOCKWIDTH elements)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the final result; mul is indexed as (batch, heads, vec_row, width)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // blocks with different blockIdx.x add into the same output element
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+
+
+
+// Host launcher for VecQuant4BatchMatMulColumnCompressionKernel; `vec` is read as (batch, heads, vec_row, height).
+void vecquant4matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3) * 8;  // logical columns: each int32 of `mat` carries eight 4-bit values
+  // Grid: one block per BLOCKWIDTH rows of height x BLOCKWIDTH columns of width, BLOCKWIDTH threads per block.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>(); the dispatch name now matches this function.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_column_compression_cuda", ([&] {
+      VecQuant4BatchMatMulColumnCompressionKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+
+template <typename scalar_t>  // scalar_t: floating element type of vec, mul and scales
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel(  // accumulates vec x dequantized 4-bit mat into mul
+    const  scalar_t* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][height]
+    const       int* __restrict__ mat,  // packed weights, indexed as [batch*heads][height][width / 8]
+           scalar_t* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // one scale per weight row, indexed as [batch*heads][height]
+    const       int* __restrict__ zeros,  // one zero point per weight row, indexed as [batch*heads][height]
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width / 8;  // upper bound applied to indices into mat
+  int input_total = batch * heads * vec_row * height;  // upper bound applied to indices into vec
+  int out_total = batch * heads * vec_row * width;  // upper bound applied to indices into mul
+  int tid = threadIdx.x;
+  // h is the first height index covered by this block (blocks step by BLOCKWIDTH along x)
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is the width (output column) index owned by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // leaves only when both hold: no output column and tid >= height
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of one vec row, shared by the whole block
+  int k;
+  scalar_t w_tmp;
+
+  float weight[BLOCKWIDTH];  // dequantized weights of column w for rows h + k
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+        int i_w = (w / 8);  // which packed int of the row holds column w
+        int w_bit = (w % 8) * 4;  // bit offset of the 4-bit value inside that int
+
+        int w_index = (batch_shift * height + h + k) * width / 8 + i_w;
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the dot product
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = ((as_unsigned(mat[w_index]) >> w_bit) & 0xF);  // extract the 4-bit quantized value
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before it is read below
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+          // res is this block's partial dot product over rows h + k
+            res += weight[k] * blockvec[k];
+        }
+        // accumulate the partial result into mul, indexed as [batch*heads][vec_row][width]
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // out_index does not depend on blockIdx.x, so several blocks add here
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+void vecquant8matmul_batched_old_cuda(  // Host launcher for VecQuant8BatchMatMulKernel_old.
+  torch::Tensor vec,     // input; sizes 0..3 are read as batch, heads, vec_row, vec_height
+  torch::Tensor mat,     // 8-bit weights, accessed as uint8_t; sizes 2..3 are read as height, width
+  torch::Tensor mul,     // output buffer; accessed with the same floating dtype as vec
+  torch::Tensor scales,  // scales; accessed with the same floating dtype as vec
+  torch::Tensor zeros    // zero points; accessed with the same floating dtype as vec
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,  // blockIdx.x steps along height
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH    // blockIdx.y steps along width
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_old_cuda", ([&] {
+      VecQuant8BatchMatMulKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  // Each thread owns one output column of a BLOCKWIDTH-row tile of mat: it
+  // dequantizes its 8-bit weights with a per-column scale/zero and adds the
+  // partial dot product with every row of vec into mul.
+  const int mat_limit = batch * heads * height * width;
+  const int vec_limit = batch * heads * vec_row * vec_height;
+  const int out_limit = batch * heads * vec_row * width;
+  const int lane = threadIdx.x;
+  const int row_base = BLOCKWIDTH * blockIdx.x;
+  const int col = BLOCKWIDTH * blockIdx.y + lane;
+  if (!(col < width || lane < vec_height)) {
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];
+  // Offset of element (row_base, col) inside a single height x width matrix.
+  const int tile_offset = width * row_base + col;
+  float weight[BLOCKWIDTH];
+
+  for (int b = 0; b < batch; ++b) {
+    for (int head = 0; head < heads; ++head) {
+      const int slab = b * heads + head;
+
+      // Dequantize this thread's column for the rows covered by the tile.
+      for (int r = 0; r < BLOCKWIDTH && row_base + r < vec_height; ++r) {
+        const int mat_index = slab * height * width + tile_offset + (r * width);
+        if (mat_index < mat_limit && col < width) {
+          const scalar_t scale = scales[slab * width + col];
+          const scalar_t zero = zeros[slab * width + col];
+          const scalar_t quant = as_unsigned(mat[mat_index]);
+          weight[r] = scale * (quant - zero);
+        } else {
+          weight[r] = 0;
+        }
+      }
+
+      for (int vr = 0; vr < vec_row; ++vr) {
+        // Stage one BLOCKWIDTH-wide slice of the current vec row in shared memory.
+        const int vec_index = (slab * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + lane;
+        if (vec_index < vec_limit) {
+          blockvec[lane] = vec[vec_index];
+        } else {
+          blockvec[lane] = 0;
+        }
+        __syncthreads();
+
+        // Partial dot product over the tile rows.
+        scalar_t acc = 0;
+        for (int r = 0; r < BLOCKWIDTH && row_base + r < vec_height; ++r) {
+          acc += weight[r] * blockvec[r];
+        }
+        const int out_index = (slab * vec_row + vr) * width + col;
+        if (out_index < out_limit) {
+          atomicAdd(&mul[out_index], acc);
+        }
+        // Keep blockvec stable until every thread has finished reading it.
+        __syncthreads();
+      }
+    }
+  }
+}
+
+
+
+void vecquant8matmul_batched_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const int batch = vec.size(0);       // extents come from vec dims 0..3,
+  const int heads = vec.size(1);
+  const int vec_row = vec.size(2);
+  const int vec_height = vec.size(3);
+  const int height = mat.size(2);      // mat dims 2..3,
+  const int width = mat.size(3);
+  const int zero_width = zeros.size(2);  // and zeros dim 2
+
+  const dim3 threads(BLOCKWIDTH);
+  const dim3 blocks((height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+                    (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
+
+  half* const vec_ptr = static_cast<half*>(vec.data_ptr());  // buffers are reinterpreted without a dtype check
+  uint8_t* const mat_ptr = static_cast<uint8_t*>(mat.data_ptr());
+  half* const mul_ptr = static_cast<half*>(mul.data_ptr());
+  half* const scales_ptr = static_cast<half*>(scales.data_ptr());
+  half* const zeros_ptr = static_cast<half*>(zeros.data_ptr());
+
+  VecQuant8BatchMatMulKernel_faster<<<blocks, threads>>>(
+    vec_ptr, mat_ptr, mul_ptr, scales_ptr, zeros_ptr,
+    batch, heads, vec_row, vec_height, height, width, zero_width
+  );
+}
+
+
+
+__global__ void VecQuant8BatchMatMulKernel_faster(  // accumulates vec x dequantized 8-bit mat into mul, computing in float
+    const  half* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][vec_height]
+    const  uint8_t* __restrict__ mat,  // 8-bit weights, indexed as [batch*heads][height][width]
+           half* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  half* __restrict__ scales,  // one scale per column, indexed as [batch*heads][width]
+    const  half* __restrict__ zeros,  // one zero point per column, indexed as [batch*heads][width]
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width  // not referenced in this kernel
+) {
+  int weight_total = batch * heads * height * width;  // upper bound applied to indices into mat
+  int input_total = batch * heads * vec_row * vec_height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x;  // first row of mat covered by this block
+  int w = BLOCKWIDTH * blockIdx.y + tid;  // output column owned by this thread
+  if (w >= width && tid >= height) {  // threads with tid < height stay: they still stage vec into blockvec
+    return;
+  }
+
+  __shared__ float blockvec[BLOCKWIDTH];  // slice of one vec row, converted to float
+  int i = width * h + w;  // offset of element (h, w) inside one height x width matrix
+  int k;
+  float w_tmp;
+
+  float weight[BLOCKWIDTH];  // dequantized weights of column w for rows h + k
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      for (k = 0; k <  BLOCKWIDTH && h + k < vec_height; ++k){
+        int w_index = batch_shift * height * width + i + (k * width);
+        bool in_range = (w_index < weight_total) && (w < width);  // threads past the last column must not read or contribute
+        float scale = in_range ? __half2float(scales[batch_shift * width + w]) : 0.0f;
+        float zero = in_range ? __half2float(zeros[batch_shift * width + w]) : 0.0f;
+        w_tmp = in_range ? static_cast<float>(as_unsigned(mat[w_index])) : 0.0f;
+        weight[k] = scale *(w_tmp-zero);  // exactly 0 when out of range
+      }
+
+      float res;
+      for (int vr = 0; vr < vec_row; ++vr){
+        res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = __half2float(vec[vec_index]);
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before it is read below
+          for (k = 0; k <  BLOCKWIDTH && h + k < vec_height; ++k){
+            float temp_res = weight[k]*blockvec[k];
+            res += temp_res;
+        }
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (w < width && out_index < out_total) {  // for w >= width, out_index would alias another output element
+            atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+
+
+void vecquant8matmul_batched_column_compression_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const int batch = vec.size(0);    // extents come from vec dims 0..3
+  const int heads = vec.size(1);
+  const int vec_row = vec.size(2);
+  const int height = vec.size(3);
+  const int width = mat.size(3);    // and from mat dim 3
+
+  const dim3 threads(BLOCKWIDTH);
+  const dim3 blocks((height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+                    (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
+
+  // Buffers are reinterpreted as half (uint8_t for mat) without a dtype check.
+  half* const vec_ptr = static_cast<half*>(vec.data_ptr());
+  uint8_t* const mat_ptr = static_cast<uint8_t*>(mat.data_ptr());
+  half* const mul_ptr = static_cast<half*>(mul.data_ptr());
+  half* const scales_ptr = static_cast<half*>(scales.data_ptr());
+  half* const zeros_ptr = static_cast<half*>(zeros.data_ptr());
+
+  VecQuant8BatchMatMulColumnCompressionKernel_faster<<<blocks, threads>>>(
+    vec_ptr, mat_ptr, mul_ptr, scales_ptr, zeros_ptr,
+    batch, heads, vec_row, height, width
+  );
+}
+
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster(  // vec x dequantized 8-bit mat with per-row scale/zero
+    const  half* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][height]
+    const  uint8_t* __restrict__ mat,  // 8-bit weights, indexed as [batch*heads][height][width]
+           half* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  half* __restrict__ scales,  // one scale per weight row, indexed as [batch*heads][height]
+    const  half* __restrict__ zeros,  // one zero point per weight row, indexed as [batch*heads][height]
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  // Rows are bounded by height and columns by width below, so mat/scales/zeros indices stay inside the extents passed in.
+  int input_total = batch * heads * vec_row * height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x;  // first row of mat covered by this block
+  int w = BLOCKWIDTH * blockIdx.y + tid;  // output column owned by this thread
+  if (w >= width && tid >= height) {  // threads with tid < height stay: they still stage vec into blockvec
+    return;
+  }
+
+  __shared__ float blockvec[BLOCKWIDTH];  // slice of one vec row, converted to float
+  int k;
+  float w_tmp;
+  float weight[BLOCKWIDTH];  // dequantized weights of column w for rows h + k
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){  // stop at the last real row of this matrix
+        int w_index = (batch_shift * height + h + k) * width  + w;
+        float scale = __half2float(scales[batch_shift * height + h + k]);
+        float zero = __half2float(zeros[batch_shift * height + h + k]);
+        w_tmp = (w < width) ? mat[w_index] : 0;  // threads past the last column must not read mat
+        weight[k] = (w < width) ? scale * (w_tmp-zero) : 0.0f;
+      }
+
+      float res;
+      for (int vr = 0; vr < vec_row; ++vr){
+        res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = __half2float(vec[vec_index]);
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before it is read below
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){  // same row bound as the weight loop above
+            res += weight[k]*blockvec[k];
+        }
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (w < width && out_index < out_total) {  // for w >= width, out_index would alias another output element
+            atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+
+void vecquant8matmul_batched_column_compression_old_cuda(  // Host launcher for VecQuant8BatchMatMulColumnCompressionKernel_old.
+  torch::Tensor vec,     // input; sizes 0..3 are read as batch, heads, vec_row, height
+  torch::Tensor mat,     // 8-bit weights, accessed as uint8_t; size(3) is read as width
+  torch::Tensor mul,     // output buffer; accessed with the same floating dtype as vec
+  torch::Tensor scales,  // scales; accessed with the same floating dtype as vec
+  torch::Tensor zeros    // zero points; accessed with the same floating dtype as vec
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3);
+
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,  // blockIdx.x steps along height
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH    // blockIdx.y steps along width
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_column_compression_old_cuda", ([&] {
+      VecQuant8BatchMatMulColumnCompressionKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+
+}
+
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  // Each thread owns one output column of a BLOCKWIDTH-row tile of mat: it
+  // dequantizes that column with a per-row scale/zero and adds the partial dot
+  // product with every row of vec into mul.
+  const int mat_limit = batch * heads * height * width;
+  const int vec_limit = batch * heads * vec_row * height;
+  const int out_limit = batch * heads * vec_row * width;
+  const int lane = threadIdx.x;
+  const int row_base = BLOCKWIDTH * blockIdx.x;
+  const int col = BLOCKWIDTH * blockIdx.y + lane;
+  if (!(col < width || lane < height)) {
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];
+  float weight[BLOCKWIDTH];
+
+  for (int b = 0; b < batch; ++b) {
+    for (int head = 0; head < heads; ++head) {
+      const int slab = b * heads + head;
+      // Dequantize column `col` for the rows of this tile that exist.
+      for (int r = 0; r < BLOCKWIDTH && row_base + r < height; ++r) {
+        const int row = slab * height + row_base + r;
+        const int mat_index = row * width + col;
+        if (mat_index < mat_limit && col < width) {
+          const scalar_t scale = scales[row];
+          const scalar_t zero = zeros[row];
+          const scalar_t quant = mat[mat_index];
+          weight[r] = scale * (quant - zero);
+        } else {
+          weight[r] = 0;
+        }
+      }
+
+      for (int vr = 0; vr < vec_row; ++vr) {
+        // Stage a BLOCKWIDTH-wide slice of the current vec row in shared memory.
+        const int vec_index = (slab * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + lane;
+        if (vec_index < vec_limit) {
+          blockvec[lane] = vec[vec_index];
+        } else {
+          blockvec[lane] = 0;
+        }
+        __syncthreads();
+
+        // Partial dot product over the rows of this tile.
+        scalar_t acc = 0;
+        for (int r = 0; r < BLOCKWIDTH && row_base + r < height; ++r) {
+          acc += weight[r] * blockvec[r];
+        }
+        const int out_index = (slab * vec_row + vr) * width + col;
+        if (out_index < out_limit) {
+          atomicAdd(&mul[out_index], acc);
+        }
+        // Keep blockvec intact until every thread has finished reading it.
+        __syncthreads();
+      }
+    }
+  }
+}
+
+
+void vecquant4matmul_batched_old_cuda(  // Host launcher for VecQuant4BatchMatMulKernel_old.
+  torch::Tensor vec,     // input; sizes 0..3 are read as batch, heads, vec_row, vec_height
+  torch::Tensor mat,     // packed weights, accessed as uint8_t; sizes 2..3 are read as height, width
+  torch::Tensor mul,     // output buffer; accessed with the same floating dtype as vec
+  torch::Tensor scales,  // scales; accessed with the same floating dtype as vec
+  torch::Tensor zeros    // zero points; accessed with the same floating dtype as vec
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+
+  dim3 blocks(
+    (height + BLOCKHEIGHT_OLD4 - 1) / BLOCKHEIGHT_OLD4,  // blockIdx.x steps along the packed height
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH                // blockIdx.y steps along width
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_old_cuda", ([&] {
+      VecQuant4BatchMatMulKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+
+}
+
+template <typename scalar_t>  // scalar_t: floating element type of vec, mul, scales and zeros
+__global__ void VecQuant4BatchMatMulKernel_old(  // accumulates vec x dequantized 4-bit mat into mul
+    const  scalar_t* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][vec_height]
+    const  uint8_t* __restrict__ mat,  // packed weights, indexed as [batch*heads][height][width]; two 4-bit values per byte
+           scalar_t* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // one scale per column, indexed as [batch*heads][width]
+    const  scalar_t* __restrict__ zeros,  // one zero point per column, indexed as [batch*heads][width]
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width  // not referenced in this kernel
+) {
+  int weight_total = batch * heads * height * width;  // upper bound applied to indices into mat
+  int input_total = batch * heads * vec_row * vec_height;  // upper bound applied to indices into vec
+  int out_total = batch * heads * vec_row * width;  // upper bound applied to indices into mul
+  int tid = threadIdx.x;
+  // h is the first packed row of mat covered by this block (blocks step by BLOCKHEIGHT_OLD4 along x)
+  int h = BLOCKHEIGHT_OLD4 * blockIdx.x;
+  // w is the width (output column) index owned by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // leaves only when both hold: no output column and tid >= vec_height
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of one vec row, shared by the whole block
+  // i is the offset of element (h, w) inside one height x width matrix
+  int i = width * h + w;
+  int k;
+  scalar_t w_tmp;
+
+  float weight[BLOCKWIDTH];  // dequantized weights of column w, one per unpacked row h*2 + k
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h*2 + k < vec_height; ++k){
+        int k_w = (k / 2);  // packed row that holds unpacked row k
+        int k_bit = (k % 2) * 4;  // even k reads the low nibble, odd k the high nibble
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the dot product
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];
+          scalar_t zero = zeros[batch_shift * width + w];
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);  // extract the 4-bit quantized value
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;  // NOTE(review): matches the h*2 + k bound only if BLOCKWIDTH == 2 * BLOCKHEIGHT_OLD4 - confirm
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before it is read below
+          for (k = 0; k <  BLOCKWIDTH && h*2 + k < vec_height; ++k){
+          // res is this block's partial dot product over unpacked rows h*2 + k
+            res += weight[k] * blockvec[k];
+        }
+        // accumulate the partial result into mul, indexed as [batch*heads][vec_row][width]
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // out_index does not depend on blockIdx.x, so several blocks add here
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+
+
+
+void vecquant4matmul_batched_column_compression_old_cuda(  // Host launcher for VecQuant4BatchMatMulColumnCompressionKernel_old.
+  torch::Tensor vec,     // input; sizes 0..3 are read as batch, heads, vec_row, height
+  torch::Tensor mat,     // packed weights, accessed as uint8_t; size(3) is read as width
+  torch::Tensor mul,     // output buffer; accessed with the same floating dtype as vec
+  torch::Tensor scales,  // scales; accessed with the same floating dtype as vec
+  torch::Tensor zeros    // zero points; accessed with the same floating dtype as vec
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3);
+
+  dim3 blocks(
+    (height + BLOCKHEIGHT_OLD4 - 1) / BLOCKHEIGHT_OLD4,  // blockIdx.x steps along height in units of BLOCKHEIGHT_OLD4
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH                // blockIdx.y steps along width
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>() accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_column_compression_old_cuda", ([&] {
+      VecQuant4BatchMatMulColumnCompressionKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+
+}
+
+template <typename scalar_t>  // scalar_t: floating element type of vec, mul, scales and zeros
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel_old(  // 4-bit variant with scale/zero indexed by row
+    const  scalar_t* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][height]
+    const  uint8_t* __restrict__ mat,  // packed weights; two 4-bit values are extracted from each byte read
+           scalar_t* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // indexed as [batch*heads][height]
+    const  scalar_t* __restrict__ zeros,  // indexed as [batch*heads][height]
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width;  // upper bound applied to indices into mat
+  int input_total = batch * heads * vec_row * height;  // upper bound applied to indices into vec
+  int out_total = batch * heads * vec_row * width;  // upper bound applied to indices into mul
+  int tid = threadIdx.x;
+  // h is the row offset of this block (blocks step by BLOCKHEIGHT_OLD4 along x)
+  int h = BLOCKHEIGHT_OLD4 * blockIdx.x;
+  // w is the width (output column) index owned by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // leaves only when both hold: no output column and tid >= height
+    return;
+  }
+
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of one vec row, shared by the whole block
+  int k;
+  scalar_t w_tmp;
+
+  float weight[BLOCKWIDTH];  // dequantized weights used by this thread, one per k
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h*2 + k < height; ++k){
+        int k_w = (k / 2);
+        int k_bit = (k % 2) * 4;  // even k reads the low nibble, odd k the high nibble
+        int w_index = (batch_shift * height + h + k) * width  + k_w;  // NOTE(review): does not depend on w, so all threads of a block read the same bytes - confirm intended layout
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing to the dot product
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];  // NOTE(review): row offset is h + k here but the loop bound uses h*2 + k - confirm
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);  // extract the 4-bit quantized value
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+
+        __syncthreads();  // blockvec must be fully written before it is read below
+          for (k = 0; k <  BLOCKWIDTH && h*2 + k < height; ++k){
+          // res is this block's partial dot product over the k range above
+            res += weight[k] * blockvec[k];
+        }
+        // accumulate the partial result into mul, indexed as [batch*heads][vec_row][width]
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);  // out_index does not depend on blockIdx.x, so several blocks add here
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+
+
+
+void vecquant8matmul_batched_faster_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const int batch = vec.size(0);       // extents come from vec dims 0..3
+  const int heads = vec.size(1);
+  const int vec_row = vec.size(2);
+  const int vec_height = vec.size(3);
+  const int height = mat.size(2);      // and from mat dims 2..3
+  const int width = mat.size(3);
+
+  const dim3 threads(BLOCKWIDTH);
+  const dim3 blocks((height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+                    (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
+
+  half* const vec_ptr = static_cast<half*>(vec.data_ptr());  // buffers are reinterpreted without a dtype check
+  uint8_t* const mat_ptr = static_cast<uint8_t*>(mat.data_ptr());
+  half* const mul_ptr = static_cast<half*>(mul.data_ptr());
+  half* const scales_ptr = static_cast<half*>(scales.data_ptr());
+  half* const zeros_ptr = static_cast<half*>(zeros.data_ptr());
+
+  VecQuant8BatchMatMulKernel_faster_old<<<blocks, threads>>>(
+    vec_ptr, mat_ptr, mul_ptr, scales_ptr, zeros_ptr,
+    batch, heads, vec_row, vec_height, height, width
+  );
+}
+
+
+__global__ void VecQuant8BatchMatMulKernel_faster_old(  // vec x dequantized 8-bit mat, weights held as half2 row pairs
+    const  half* __restrict__ vec,  // input, indexed as [batch*heads][vec_row][vec_height]
+    const  uint8_t* __restrict__ mat,  // 8-bit weights, indexed as [batch*heads][height][width]
+           half* __restrict__ mul,  // output, indexed as [batch*heads][vec_row][width]; updated with atomicAdd
+    const  half* __restrict__ scales,  // one scale per column, indexed as [batch*heads][width]
+    const  half* __restrict__ zeros,  // one zero point per column, indexed as [batch*heads][width]
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width;
+  int input_total = batch * heads * vec_row * vec_height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  const int BLOCKWIDTH_half = BLOCKWIDTH/2;  // number of half2 pairs per tile
+
+  int h = BLOCKWIDTH * blockIdx.x; // first row of mat covered by this block
+  int w = BLOCKWIDTH * blockIdx.y + tid; // output column owned by this thread
+  /*
+  if (w >= width && tid >= vec_height) {
+    return;
+  }
+  */
+  __shared__ half blockvec[BLOCKWIDTH]; // slice of one vec row, shared by the whole block
+  int i = width * h + w;  // offset of element (h, w) inside one height x width matrix
+  int k;
+
+  half w_tmp1 = __float2half(0);
+  half w_tmp2 = __float2half(0);
+
+  half2 weight[BLOCKWIDTH_half];  // pair k holds the dequantized weights of rows h + 2k and h + 2k + 1
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      // scale and zero are looked up per column, so one index serves both rows of a pair
+      for (k = 0; k <  BLOCKWIDTH_half; ++k){
+        int w_index1 = batch_shift * height * width + i + (2 * k * width); // element (h + 2k, w)
+        int w_index2 = batch_shift * height * width + i + ((2 * k + 1) * width); // element (h + 2k + 1, w)
+        int zero_index = batch_shift * width + w; // [batch*heads][w]
+        if (w_index1 >= weight_total || w >= width || (2 * k + h) >= height) {
+          weight[k] = __float2half2_rn(0);
+        } else {
+            float zero_f=__half2float(zeros[zero_index]);
+            float scale_f= __half2float(scales[zero_index]);
+            if (w_index2 >= weight_total || (2 * k + 1 + h) >= height){
+              w_tmp1 = __float2half((as_unsigned(mat[w_index1]) -zero_f)*scale_f);
+              w_tmp2 = __float2half(0);
+              weight[k] = __halves2half2(w_tmp1,w_tmp2);
+              // the second row of the pair lies past this matrix, so it must contribute nothing
+            }else{
+              w_tmp1 = __int2half_rn(as_unsigned(mat[w_index1]));
+              w_tmp2 = __int2half_rn(as_unsigned(mat[w_index2]));
+
+              // (q - zero) * scale for both rows, written as q * scale + (-(scale * zero)) in one fused half2 op
+              weight[k] = __hfma2(__halves2half2(w_tmp1,w_tmp2), __float2half2_rn(scale_f), __float2half2_rn(-(scale_f * zero_f)));
+              // both halves share scale_f and zero_f because they belong to the same column
+            }
+        }
+      }
+
+
+      for (int vr = 0; vr < vec_row; ++vr){
+        float res=0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (vec_index < input_total) {
+            // the staged value stays in half; conversion to float happens after the half2 multiply
+            blockvec[tid] = vec[vec_index];
+            // row stride is vec_height, matching input_total
+        } else {
+            blockvec[tid] = __float2half(0);
+        }
+        __syncthreads();  // blockvec must be fully written before it is read below
+        if (w < width && out_index < out_total) {  // for w >= width, out_index would alias another output element
+          for (k = 0; k <  BLOCKWIDTH_half; ++k){
+            half2 res2 = __hmul2(weight[k],__halves2half2(blockvec[2*k],blockvec[2*k+1]));
+            res += __low2float(res2) + __high2float(res2);
+          }
+          atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
+
+
+void vecquant8matmul_batched_column_compression_faster_old_cuda(
+  torch::Tensor vec,  // [batch,heads, seq_q, seq_v]
+  torch::Tensor mat, // [batch,heads, seq_v, head_dim]
+  torch::Tensor mul,  // [batch,heads, seq_q,head_dim]
+  torch::Tensor scales, // the kernel indexes this as [batch,heads, seq_v]
+  torch::Tensor zeros
+) {
+  const int batch = vec.size(0);
+  const int heads = vec.size(1);
+  const int vec_row = vec.size(2);  // ql
+  const int height = mat.size(2);   // vl
+  const int width = mat.size(3);    // head_dim
+
+  const dim3 threads(BLOCKWIDTH);
+  const dim3 blocks((height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+                    (width + BLOCKWIDTH - 1) / BLOCKWIDTH);
+
+  // Buffers are reinterpreted as half (uint8_t for mat) without a dtype check.
+  half* const vec_ptr = static_cast<half*>(vec.data_ptr());
+  uint8_t* const mat_ptr = static_cast<uint8_t*>(mat.data_ptr());
+  half* const mul_ptr = static_cast<half*>(mul.data_ptr());
+  half* const scales_ptr = static_cast<half*>(scales.data_ptr());
+  half* const zeros_ptr = static_cast<half*>(zeros.data_ptr());
+
+  VecQuant8BatchMatMulColumnCompressionKernel_faster_old<<<blocks, threads>>>(
+    vec_ptr, mat_ptr, mul_ptr, scales_ptr, zeros_ptr,
+    batch, heads, vec_row, height, width
+  );
+}
+
+
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster_old(  // vec x dequantized 8-bit mat, per-row scale/zero, half2 row pairs
+    const  half* __restrict__ vec,  // [batch,heads, seq_q, seq_v]
+    const  uint8_t* __restrict__ mat, // [batch,heads, seq_v, head_dim]
+           half* __restrict__ mul, // [batch,heads, seq_q,head_dim]; updated with atomicAdd
+    const  half* __restrict__ scales, // [batch,heads, seq_v]
+    const  half* __restrict__ zeros,  // indexed like scales
+    int batch,
+    int heads,
+    int vec_row, //seq_q
+    int height, //seq_v
+    int width //head_dim
+) {
+  int weight_total = batch * heads * height * width;
+  int input_total = batch * heads * vec_row * height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x; // first row of mat covered by this block
+  int w = BLOCKWIDTH * blockIdx.y + tid; // output column owned by this thread
+  if (w >= width && tid >= height) {  // threads with tid < height stay: they still stage vec into blockvec
+    return;
+  }
+  __shared__ half blockvec[BLOCKWIDTH];  // slice of one vec row, shared by the whole block
+  int k;
+  half w_tmp1 = __float2half(0);
+  half w_tmp2 = __float2half(0);
+  int i = width * h + w;  // offset of element (h, w) inside one height x width matrix
+  const int BLOCKWIDTH_half = BLOCKWIDTH/2;  // number of half2 pairs per tile
+  half2 weight[BLOCKWIDTH_half];  // pair k holds the dequantized weights of rows h + 2k and h + 2k + 1
+
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      // scale and zero are looked up per row, so each half of a pair has its own index
+      for (k = 0; k <  BLOCKWIDTH_half; ++k){
+        int w_index1 = batch_shift * height * width + i + (2 * k) * width; // element (h + 2k, w)
+        int w_index2 = batch_shift * height * width + i + ((2 * k + 1) * width); // element (h + 2k + 1, w)
+        int zero_index1 = batch_shift * height + h + 2*k; // [batch*heads][h + 2k]
+        int zero_index2 = batch_shift * height + h + 2*k+1; // [batch*heads][h + 2k + 1]
+
+        if (w_index1 >= weight_total || w >= width || (2 * k + h)>=height) {
+          weight[k]=__float2half2_rn(0);
+        } else{
+            // w >= width is rejected above: such a thread has no column of its own,
+            // and its out_index would alias another element of mul.
+            // The second row of a pair is additionally checked against height below.
+            if (w_index2>=weight_total || (2 * k + 1 + h) >= height){
+              w_tmp1 = __float2half((as_unsigned(mat[w_index1]) - __half2float(zeros[zero_index1]))* __half2float(scales[zero_index1]));
+              w_tmp2 = __float2half(0);
+              weight[k] = __halves2half2(w_tmp1,w_tmp2);
+              // the second row of the pair lies past this matrix, so it must contribute nothing
+            }else{
+              w_tmp1 = __int2half_rn(as_unsigned(mat[w_index1]));
+              w_tmp2 = __int2half_rn(as_unsigned(mat[w_index2]));
+              half zero1=zeros[zero_index1];
+              half zero2=zeros[zero_index2];
+              half scale1=scales[zero_index1];
+              half scale2=scales[zero_index2];
+              weight[k] = __hmul2(__hsub2(__halves2half2(w_tmp1,w_tmp2), __halves2half2(zero1,zero2)),__halves2half2(scale1,scale2));
+              // (q - zero) * scale for both rows in one half2 subtract and multiply
+              // each half uses the scale/zero of its own row
+            }
+          }
+       }
+
+
+      for (int vr = 0; vr < vec_row; ++vr){
+        float res=0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+
+        if (vec_index < input_total) {
+            // the staged value stays in half; conversion to float happens after the half2 multiply
+            blockvec[tid] = vec[vec_index];
+            // entries past height pair with zero weights in the loop below
+        } else {
+            blockvec[tid] = __float2half(0);
+            // zero-fill so the dot product below ignores this slot
+        }
+        __syncthreads();  // blockvec must be fully written before it is read below
+        if (w < width && out_index < out_total) {  // for w >= width, out_index would alias another output element
+            for (k = 0; k <  BLOCKWIDTH_half; ++k){
+                half2 res2 = __hmul2(weight[k],__halves2half2(blockvec[2*k],blockvec[2*k+1]));
+                res += __low2float(res2) + __high2float(res2);
+            }
+            atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads of blockvec finish before the next vr overwrites it
+      }
+    }
+  }
+}
diff --git a/src/deep_training/nlp/models/qwen/configuration_qwen.py b/src/deep_training/nlp/models/qwen/configuration_qwen.py
index 0aa04b65..f8fe2cb4 100644
--- a/src/deep_training/nlp/models/qwen/configuration_qwen.py
+++ b/src/deep_training/nlp/models/qwen/configuration_qwen.py
@@ -10,18 +10,9 @@ class QWenConfig(PretrainedConfig):
     model_type = "qwen"
     keys_to_ignore_at_inference = ["past_key_values"]
 
-
-    attribute_map = {
-        "n_embd": "hidden_size",
-        "n_head": "num_attention_heads",
-        "n_positions": "max_position_embeddings",
-        "n_layer": "num_hidden_layers",
-        "padded_vocab_size": "vocab_size",
-    }
-
     def __init__(
         self,
-        vocab_size=151851,
+        vocab_size=151936,
         hidden_size=4096,
         num_hidden_layers=32,
         num_attention_heads=32,
@@ -35,8 +26,6 @@ def __init__(
         bf16=False,
         fp16=False,
         fp32=False,
-        eos_token_id=151643,
-
         kv_channels=128,
         rotary_pct=1.0,
         rotary_emb_base=10000,
@@ -46,19 +35,14 @@ def __init__(
         intermediate_size=22016,
         no_bias=True,
         tie_word_embeddings=False,
-        quantization_bit = 0,
-        initializer_weight=False,
-        apply_residual_connection_post_layernorm=False,
-        pos_emb= "rotary",
+        use_cache_quantization=False,
+        use_cache_kernel=False,
+        softmax_in_fp32=False,
         **kwargs,
     ):
-        self.eos_token_id = eos_token_id
-        super().__init__(
-            eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
-        )
-
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.emb_dropout_prob = emb_dropout_prob
@@ -66,10 +50,8 @@ def __init__(
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
         self.scale_attn_weights = scale_attn_weights
-        self.max_position_embeddings = max_position_embeddings
-
         self.use_cache = use_cache
-
+        self.max_position_embeddings = max_position_embeddings
         self.bf16 = bf16
         self.fp16 = fp16
         self.fp32 = fp32
@@ -79,13 +61,11 @@ def __init__(
         self.use_dynamic_ntk = use_dynamic_ntk
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
-        self.intermediate_size = intermediate_size
         self.no_bias = no_bias
-        self.tie_word_embeddings = tie_word_embeddings
-
-        self.pos_emb = pos_emb
-        self.apply_residual_connection_post_layernorm = (
-            apply_residual_connection_post_layernorm
+        self.use_cache_quantization = use_cache_quantization
+        self.use_cache_kernel = use_cache_kernel
+        self.softmax_in_fp32 = softmax_in_fp32
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs
         )
-        self.quantization_bit = quantization_bit
-        self.initializer_weight = initializer_weight
diff --git a/src/deep_training/nlp/models/qwen/cpp_kernels.py b/src/deep_training/nlp/models/qwen/cpp_kernels.py
new file mode 100644
index 00000000..d9cee703
--- /dev/null
+++ b/src/deep_training/nlp/models/qwen/cpp_kernels.py
@@ -0,0 +1,55 @@
+from torch.utils import cpp_extension
+import pathlib
+import os
+import subprocess
+
+def _get_cuda_bare_metal_version(cuda_dir):
+    """Run ``<cuda_dir>/bin/nvcc -V``; return (raw text, major, first char of minor), all as strings."""
+    nvcc_text = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
+                                        universal_newlines=True)
+    tokens = nvcc_text.split()
+    # The whitespace-separated token right after the word "release" holds the version.
+    version_parts = tokens[tokens.index("release") + 1].split(".")
+    major, minor = version_parts[0], version_parts[1][0]
+
+    return nvcc_text, major, minor
+
+def _create_build_dir(buildpath):  # best-effort creation of the build directory; OSError is never re-raised
+    try:
+        os.mkdir(buildpath)
+    except OSError:
+        if not os.path.isdir(buildpath):  # an already-existing directory is not reported as a failure
+            print(f"Creation of the build directory {buildpath} failed")
+
+# Target sm_80 with CUDA >= 11.0 and sm_90 with CUDA >= 11.8 (first nvcc release accepting compute_90).
+cc_flag = []
+_, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+_cuda_version = (int(bare_metal_major), int(bare_metal_minor))
+if _cuda_version >= (11, 0):
+    cc_flag += ['-gencode', 'arch=compute_80,code=sm_80']
+# Compare (major, minor) as a pair: testing the minor alone dropped sm_90 on CUDA 12.0-12.6.
+if _cuda_version >= (11, 8):
+    cc_flag += ['-gencode', 'arch=compute_90,code=sm_90']
+
+# Build path
+srcpath = pathlib.Path(__file__).parent.absolute()
+buildpath = srcpath / 'build'
+_create_build_dir(buildpath)
+
+def _cpp_extention_load_helper(name, sources, extra_cuda_flags):  # build and load one extension through cpp_extension.load
+    return cpp_extension.load(
+        name=name,
+        sources=sources,
+        build_directory=buildpath,  # the module-level 'build' directory created above
+        extra_cflags=['-O3', ],
+        extra_cuda_cflags=['-O3',
+                           '-gencode', 'arch=compute_70,code=sm_70',  # sm_70 is always targeted
+                           '--use_fast_math'] + extra_cuda_flags + cc_flag,  # cc_flag: arch flags derived from the nvcc version
+        verbose=1
+    )
+
+extra_flags = []
+# Anchor the sources to this module's directory; a "./name" path is resolved against the caller's cwd.
+cache_autogptq_cuda_256_sources = [str(srcpath / "cache_autogptq_cuda_256.cpp"),
+           str(srcpath / "cache_autogptq_cuda_kernel_256.cu")]
+cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)
diff --git a/src/deep_training/nlp/models/qwen/modeling_qwen.py b/src/deep_training/nlp/models/qwen/modeling_qwen.py
index afe32574..f1f3bf66 100644
--- a/src/deep_training/nlp/models/qwen/modeling_qwen.py
+++ b/src/deep_training/nlp/models/qwen/modeling_qwen.py
@@ -3,19 +3,21 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
+import copy
 import importlib
 import math
+import pathlib
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
+
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
-from torch.cuda.amp import autocast
+import warnings
 
 from torch.nn import CrossEntropyLoss
 from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
 from transformers.generation.logits_process import LogitsProcessorList
 
-from ..transformer_base import TransformerBase
 from ...utils.torch_utils import skip_init
 
 if TYPE_CHECKING:
@@ -27,12 +29,29 @@
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from einops import rearrange
-from torch import nn
+
 try:
-    from kernels.cpp_kernels import cache_autogptq_cuda_256
+    from einops import rearrange
 except ImportError:
-    cache_autogptq_cuda_256 = None
+    rearrange = None
+from torch import nn
+
+SUPPORT_CUDA = torch.cuda.is_available()
+SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
+SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 6
+SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2
+
+def default_init(cls, *args, **kwargs):  # plain construction, used when skip-init is switched off
+    return cls(*args, **kwargs)
+skip_init_function = skip_init  # module-wide constructor hook; skip_init is the default
+def setup_model_profile(skip_init_flag=True):  # select which constructor hook later model builds use
+    global skip_init_function
+    if skip_init_flag:
+        skip_init_function = skip_init
+    else:
+        skip_init_function = default_init
+
+
 from .configuration_qwen import QWenConfig
 from .qwen_generation_utils import (
     HistoryType,
@@ -57,30 +76,29 @@
 如果您在直接使用我们从Huggingface提供的模型，请确保您在调用model.chat()时，使用的是"Qwen/Qwen-7B-Chat"模型（而非"Qwen/Qwen-7B"预训练模型）。
 """
 
+_SENTINEL = object()
+_ERROR_STREAM_IN_CHAT = """\
+Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True).
+向model.chat()传入参数stream的用法可能存在Bug，该用法已被废弃，将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。
+"""
+
 _ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED = """\
 We detect you have activated flash attention support, but running model computation on CPU. Please make sure that your input data has been placed on GPU. If you actually want to run CPU computation, please following the readme and set device_map="cpu" to disable flash attention when loading the model (calling AutoModelForCausalLM.from_pretrained).
 检测到您的模型已激活了flash attention支持，但正在执行CPU运算任务。如使用flash attention，请您确认模型输入已经传到GPU上。如果您确认要执行CPU运算，请您在载入模型（调用AutoModelForCausalLM.from_pretrained）时，按照readme说法，指定device_map="cpu"以禁用flash attention。
 """
 
-SUPPORT_CUDA,SUPPORT_BF16,SUPPORT_FP16 = True,True,False
-def _AutoDetect():
-    global SUPPORT_CUDA,SUPPORT_BF16,SUPPORT_FP16
-    SUPPORT_CUDA = torch.cuda.is_available()
-    SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
-    SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 6
-
-_AutoDetect()
-
-apply_rotary_emb_func,rms_norm,flash_attn_unpadded_func = None,None,None
-
+apply_rotary_emb_func = None
+rms_norm = None
+flash_attn_unpadded_func = None
+flash_attn_func = None
 
 def _import_flash_attn():
-    global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func
+    global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func, flash_attn_func
     try:
         from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func
         apply_rotary_emb_func = __apply_rotary_emb_func
     except ImportError:
-        logger.warning(
+        logger.warn(
             "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency "
             "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary"
         )
@@ -89,40 +107,31 @@ def _import_flash_attn():
         from flash_attn.ops.rms_norm import rms_norm as __rms_norm
         rms_norm = __rms_norm
     except ImportError:
-        logger.warning(
+        logger.warn(
             "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency "
             "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm"
         )
 
     try:
         import flash_attn
+        _flash_attn_func = None
         if not hasattr(flash_attn, '__version__'):
             from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
         else:
             if int(flash_attn.__version__.split(".")[0]) >= 2:
+                if int(flash_attn.__version__.split(".")[1]) >= 1:
+                    from flash_attn.flash_attn_interface import flash_attn_func as _flash_attn_func
                 from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func
             else:
                 from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func
         flash_attn_unpadded_func = __flash_attn_unpadded_func
+        flash_attn_func = _flash_attn_func
     except ImportError:
         logger.warn(
             "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency "
             "https://github.com/Dao-AILab/flash-attention"
         )
 
-
-def default_init(cls, *args, **kwargs):
-    return cls(*args, **kwargs)
-skip_init_function = skip_init
-def setup_model_profile(skip_init_flag=True):
-    global skip_init_function
-    if skip_init_flag:
-        skip_init_function = skip_init
-    else:
-        skip_init_function = default_init
-        
-        
-        
 def quantize_cache_v(fdata, bits, qmax, qmin):
     # b, s, head, h-dim->b, head, s, h-dim
     qtype = torch.uint8
@@ -155,6 +164,7 @@ def __init__(
         causal=False,
         softmax_scale=None,
         attention_dropout=0.0,
+            **kwargs
     ):
         super().__init__()
         assert flash_attn_unpadded_func is not None, (
@@ -187,6 +197,12 @@ def forward(self, q, k, v, attention_mask=None):
         assert all((i.is_cuda for i in (q, k, v)))
         batch_size, seqlen_q = q.shape[0], q.shape[1]
         seqlen_k = k.shape[1]
+        seqlen_out = seqlen_q
+
+        if flash_attn_func is not None and batch_size == 1:
+            dropout_p = self.dropout_p if self.training else 0
+            output = flash_attn_func(q, k, v, dropout_p, softmax_scale=self.softmax_scale, causal=self.causal)
+            return output
 
         q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
         cu_seqlens_q = torch.arange(
@@ -197,12 +213,13 @@ def forward(self, q, k, v, attention_mask=None):
             device=q.device,
         )
 
-        if attention_mask is not None:
+        if batch_size > 1 and attention_mask is not None:
             k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
-            v = v[indices_k]
-            if seqlen_q == seqlen_k:
+            if q.size(0) == v.size(0):
                 q = q[indices_k]
                 cu_seqlens_q = cu_seqlens_k
+                seqlen_q = seqlen_k
+            v = v[indices_k]
         else:
             cu_seqlens_k = torch.arange(
                 0,
@@ -232,8 +249,8 @@ def forward(self, q, k, v, attention_mask=None):
             softmax_scale=self.softmax_scale,
             causal=is_causal,
         )
-        if attention_mask is not None and seqlen_q == seqlen_k:
-            output = self.pad_input(output, indices_k, batch_size, seqlen_q)
+        if batch_size > 1 and attention_mask is not None and seqlen_q == seqlen_k:
+            output = self.pad_input(output, indices_k, batch_size, seqlen_out)
         else:
             new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
             output = output.view(new_shape)
@@ -262,12 +279,9 @@ def __init__(self, config,**kwargs):
             self.projection_size // config.num_attention_heads
         )
 
-        global skip_init_function
-        init_method = skip_init_function
-
-        self.c_attn = init_method(nn.Linear,config.hidden_size, 3 * self.projection_size,**kwargs)
+        self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size,**kwargs)
 
-        self.c_proj = init_method(nn.Linear,
+        self.c_proj = nn.Linear(
             config.hidden_size, self.projection_size, bias=not config.no_bias,**kwargs
         )
 
@@ -278,7 +292,7 @@ def __init__(self, config,**kwargs):
             and not self.is_fp32
         ):
             self.core_attention_flash = FlashSelfAttention(
-                causal=True, attention_dropout=config.attn_dropout_prob
+                causal=True, attention_dropout=config.attn_dropout_prob,**kwargs
             )
         self.bf16 = config.bf16
 
@@ -291,7 +305,9 @@ def __init__(self, config,**kwargs):
         ]
         logn_tensor = torch.tensor(logn_list)[None, :, None, None]
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)
+
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
+        self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
         self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
         self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
         cache_dtype = torch.float
@@ -302,14 +318,29 @@ def __init__(self, config,**kwargs):
         self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
         self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
 
-    def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
+        # Default to the pure-PyTorch path; the attribute always exists, whatever the flags are.
+        self.cache_kernels = None
+        if self.use_cache_quantization and self.use_cache_kernel:  # hasattr-guarded copies set above
+            # pre check if the support files existing
+            module_root = pathlib.Path(__file__).parent
+            src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu")
+            if any(not (module_root/src).is_file() for src in src_files):
+                warnings.warn("KV cache kernel source files (.cpp and .cu) not found.")
+            else:
+                try:
+                    from .cpp_kernels import cache_autogptq_cuda_256
+                    self.cache_kernels = cache_autogptq_cuda_256
+                except Exception:  # importing cpp_kernels runs nvcc and a JIT build; failures there are not ImportError
+                    warnings.warn("Failed to import KV cache kernels.")
+
+    def _attn(self, query, key, value, causal_mask=None, attention_mask=None, head_mask=None):
         device = query.device
         if self.use_cache_quantization:
             qk, qk_scale, qk_zero = key
-            if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
+            if self.use_cache_kernel and self.cache_kernels is not None:
                 shape = query.shape[:-1] + (qk.shape[-2],)
                 attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
-                cache_autogptq_cuda_256.vecquant8matmul_batched_faster_old(
+                self.cache_kernels.vecquant8matmul_batched_faster_old(
                     query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
                     qk.transpose(-1, -2).contiguous(),
                     attn_weights,
@@ -327,31 +358,21 @@ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None,
                 size_temp = value[0].size(-1)
             else:
                 size_temp = value.size(-1)
-            attn_weights = attn_weights / torch.full(
-                [],
-                size_temp ** 0.5,
-                dtype=attn_weights.dtype,
-                device=attn_weights.device,
-            )
-        if self.use_cache_quantization:
-            query_length, key_length = query.size(-2), key[0].size(-2)
-        else:
-            query_length, key_length = query.size(-2), key.size(-2)
-        causal_mask = registered_causal_mask[
-            :, :, key_length - query_length : key_length, :key_length
-        ]
+            attn_weights = attn_weights / (size_temp ** 0.5)
+
         mask_value = torch.finfo(attn_weights.dtype).min
-        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(
-            attn_weights.device
-        )
-        attn_weights = torch.where(
-            causal_mask, attn_weights.to(attn_weights.dtype), mask_value
-        )
+        if causal_mask is not None:
+            attn_weights = torch.where(
+                causal_mask, attn_weights.to(attn_weights.dtype), mask_value
+            )
 
         if attention_mask is not None:
             attn_weights = attn_weights + attention_mask
 
-        attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
+        if self.softmax_in_fp32:
+            attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
+        else:
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
         attn_weights = attn_weights.type(query.dtype)
         attn_weights = self.attn_dropout(attn_weights)
@@ -361,10 +382,10 @@ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None,
 
         if self.use_cache_quantization:
             qv, qv_scale, qv_zero = value
-            if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
+            if self.use_cache_kernel and self.cache_kernels is not None:
                 shape = attn_weights.shape[:-1] + (query.shape[-1],)
                 attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
-                cache_autogptq_cuda_256.vecquant8matmul_batched_column_compression_faster_old(
+                self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old(
                     attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
                     qv.contiguous(),  # dtype: int32
                     attn_output,
@@ -383,62 +404,6 @@ def _attn(self, query, key, value, registered_causal_mask, attention_mask=None,
 
         return attn_output, attn_weights
 
-    def _upcast_and_reordered_attn(
-        self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
-    ):
-        bsz, num_heads, q_seq_len, dk = query.size()
-        _, _, k_seq_len, _ = key.size()
-
-        attn_weights = torch.empty(
-            bsz * num_heads,
-            q_seq_len,
-            k_seq_len,
-            dtype=torch.float32,
-            device=query.device,
-        )
-
-        scale_factor = 1.0
-        if self.scale_attn_weights:
-            scale_factor /= float(value.size(-1)) ** 0.5
-
-        with autocast(enabled=False):
-            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
-                -1, dk, k_seq_len
-            )
-            attn_weights = torch.baddbmm(
-                attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
-            )
-            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
-
-        query_length, key_length = query.size(-2), key.size(-2)
-        causal_mask = registered_causal_mask[
-            :, :, key_length - query_length : key_length, :key_length
-        ]
-        mask_value = torch.finfo(attn_weights.dtype).min
-        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
-            attn_weights.device
-        )
-        attn_weights = torch.where(causal_mask, attn_weights, mask_value)
-
-        if attention_mask is not None:
-            attn_weights = attn_weights + attention_mask
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-
-        if attn_weights.dtype != torch.float32:
-            raise RuntimeError(
-                "Error with upcasting, attn_weights does not have dtype torch.float32"
-            )
-        attn_weights = attn_weights.type(value.dtype)
-        attn_weights = self.attn_dropout(attn_weights)
-
-        if head_mask is not None:
-            attn_weights = attn_weights * head_mask
-
-        attn_output = torch.matmul(attn_weights, value)
-
-        return attn_output, attn_weights
-
     def _split_heads(self, tensor, num_heads, attn_head_size):
         new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
         tensor = tensor.view(new_shape)
@@ -452,8 +417,7 @@ def _merge_heads(self, tensor, num_heads, attn_head_size):
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
-        rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
-        registered_causal_mask: Optional[torch.Tensor] = None,
+        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -527,14 +491,15 @@ def forward(
         else:
             present = None
 
-        if self.use_logn_attn and not self.training:
+        key_size = key[0].size(2) if self.use_cache_quantization else key.size(1)
+        if key_size > self.seq_length and self.use_logn_attn and not self.training:
             if self.use_cache_quantization:
                 seq_start = key[0].size(2) - query.size(1)
                 seq_end = key[0].size(2)
             else:
                 seq_start = key.size(1) - query.size(1)
                 seq_end = key.size(1)
-            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
+            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
             query = query * logn_tensor.expand_as(query)
 
         if (
@@ -544,30 +509,49 @@ def forward(
             and query.is_cuda
         ):
             q, k, v = query, key, value
-            context_layer = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
-
-            # b s h d -> b s (h d)
-            context_layer = context_layer.flatten(2,3).contiguous()
-
+            attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
         else:
+            key_size = key[0].size(2) if self.use_cache_quantization else key.size(1)
+            if query.size(1) == key_size:
+                causal_mask = torch.tril(
+                    torch.ones((key_size, key_size), dtype=torch.bool, device=query.device)
+                ).view(1, 1, key_size, key_size)
+            else:
+                causal_mask = None
             query = query.permute(0, 2, 1, 3)
             if not self.use_cache_quantization:
                 key = key.permute(0, 2, 1, 3)
                 value = value.permute(0, 2, 1, 3)
             if (
-                registered_causal_mask is None
+                causal_mask is None
                 and self.use_flash_attn
                 and flash_attn_unpadded_func is not None
                 and not self.is_fp32
                 and not query.is_cuda
             ):
                 raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
-            attn_output, attn_weight = self._attn(
-                query, key, value, registered_causal_mask, attention_mask, head_mask
-            )
-            context_layer = self._merge_heads(
-                attn_output, self.num_heads, self.head_dim
-            )
+
+            if not self.use_cache_quantization and SUPPORT_TORCH2:
+                if attention_mask is not None:
+                    # causal_mask is None whenever query and key lengths differ, so take the
+                    # query length from query itself (permuted above: dim 2 is the sequence axis).
+                    attention_mask = attention_mask.expand(-1, -1, query.size(2), -1)
+                    if causal_mask is not None:
+                        # masked_fill is out-of-place: keep its result, otherwise the causal mask is dropped.
+                        attention_mask = attention_mask.masked_fill(~causal_mask, torch.finfo(attention_mask.dtype).min)
+                else:
+                    attention_mask = causal_mask
+                attn_output = F.scaled_dot_product_attention(
+                    query, key, value, attn_mask=attention_mask
+                ).transpose(1, 2)
+                attn_weight = None
+            else:
+                attn_output, attn_weight = self._attn(
+                    query, key, value, causal_mask, attention_mask, head_mask
+                )
+        context_layer = self._merge_heads(
+            attn_output, self.num_heads, self.head_dim
+        )
 
         attn_output = self.c_proj(context_layer)
 
@@ -579,6 +563,8 @@ def forward(
                 and not self.is_fp32
             ):
                 raise ValueError("Cannot output attentions while using flash-attn")
+            elif not self.use_cache_quantization and SUPPORT_TORCH2:
+                raise ValueError("Cannot output attentions while using scaled_dot_product_attention")
             else:
                 outputs += (attn_weight,)
 
@@ -606,32 +592,27 @@ def forward(self, hidden_states):
 
 
 class QWenBlock(nn.Module):
-    def __init__(self, config,  **kwargs):
+    def __init__(self, config,**kwargs):
         super().__init__()
-
         hidden_size = config.hidden_size
         self.bf16 = config.bf16
 
-        global skip_init_function
-        init_method = skip_init_function
-
         self.ln_1 = RMSNorm(
             hidden_size,
-            eps=config.layer_norm_epsilon,**kwargs
+            eps=config.layer_norm_epsilon,
         )
-        self.attn = QWenAttention(config, **kwargs)
+        self.attn = QWenAttention(config,**kwargs)
         self.ln_2 = RMSNorm(
             hidden_size,
-            eps=config.layer_norm_epsilon,**kwargs
+            eps=config.layer_norm_epsilon,
         )
 
-        self.mlp = init_method(QWenMLP,config,**kwargs)
+        self.mlp = QWenMLP(config,**kwargs)
 
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
-        rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
-        registered_causal_mask: Optional[torch.Tensor] = None,
+        rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -645,7 +626,6 @@ def forward(
         attn_outputs = self.attn(
             layernorm_output,
             rotary_pos_emb_list,
-            registered_causal_mask=registered_causal_mask,
             layer_past=layer_past,
             attention_mask=attention_mask,
             head_mask=head_mask,
@@ -662,7 +642,6 @@ def forward(
         layernorm_output = self.ln_2(layernorm_input)
 
         residual = layernorm_input
-
         mlp_output = self.mlp(layernorm_output)
         hidden_states = residual + mlp_output
 
@@ -680,6 +659,7 @@ class QWenPreTrainedModel(PreTrainedModel):
     is_parallelizable = False
     supports_gradient_checkpointing = True
     _no_split_modules = ["QWenBlock"]
+    _skip_keys_device_placement = "past_key_values"
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
@@ -707,12 +687,11 @@ def _init_weights(self, module):
                 p.data.normal_(
                     mean=0.0,
                     std=(
-                            self.config.initializer_range
-                            / math.sqrt(2 * self.config.num_hidden_layers)
+                        self.config.initializer_range
+                        / math.sqrt(2 * self.config.num_hidden_layers)
                     ),
                 )
 
-
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, QWenModel):
             module.gradient_checkpointing = value
@@ -721,23 +700,20 @@ def _set_gradient_checkpointing(self, module, value=False):
 class QWenModel(QWenPreTrainedModel):
     _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
 
-    def __init__(self, config: QWenConfig,**kwargs):
+    def __init__(self, config,**kwargs):
         super().__init__(config)
-        self.use_cache_quantization = config.use_cache_quantization if hasattr(config,
-                                                                               'use_cache_quantization') else False
-
         self.vocab_size = config.vocab_size
         self.num_hidden_layers = config.num_hidden_layers
         self.embed_dim = config.hidden_size
+        self.use_cache_quantization = self.config.use_cache_quantization if hasattr(self.config, 'use_cache_quantization') else False
 
         self.gradient_checkpointing = False
         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.seq_length = config.seq_length
 
         global skip_init_function
-        init_method = skip_init_function
 
-        self.wte = init_method(nn.Embedding,self.vocab_size, self.embed_dim,**kwargs)
+        self.wte = skip_init_function(nn.Embedding,self.vocab_size, self.embed_dim,**kwargs)
 
         self.drop = nn.Dropout(config.emb_dropout_prob)
 
@@ -757,27 +733,11 @@ def __init__(self, config: QWenConfig,**kwargs):
 
         self.use_flash_attn = config.use_flash_attn
         self.is_fp32 = not (config.bf16 or config.fp16)
-        if (
-                self.use_flash_attn
-                and flash_attn_unpadded_func is not None
-                and not self.is_fp32
-        ):
-            self.registered_causal_mask = None
-        else:
-            max_positions = config.max_position_embeddings
-            self.register_buffer(
-                "registered_causal_mask",
-                torch.tril(
-                    torch.ones((max_positions, max_positions), dtype=torch.bool)
-                ).view(1, 1, max_positions, max_positions),
-                persistent=False,
-            )
 
         self.h = nn.ModuleList(
             [
-                QWenBlock(
-                    config,
-                    **kwargs
+                skip_init_function(QWenBlock,
+                    config,**kwargs
                 )
                 for i in range(config.num_hidden_layers)
             ]
@@ -785,7 +745,6 @@ def __init__(self, config: QWenConfig,**kwargs):
         self.ln_f = RMSNorm(
             self.embed_dim,
             eps=config.layer_norm_epsilon,
-            **kwargs
         )
 
         self.post_init()
@@ -801,7 +760,7 @@ def get_ntk_alpha(self, true_seq_len):
         ntk_alpha = 2 ** math.ceil(context_value) - 1
         ntk_alpha = max(ntk_alpha, 1)
         return ntk_alpha
-        
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -862,7 +821,6 @@ def forward(
                 past_length = past_key_values[0][0][0].size(2)
             else:
                 past_length = past_key_values[0][0].size(-2)
-
         if position_ids is None:
             position_ids = torch.arange(
                 past_length,
@@ -911,11 +869,9 @@ def forward(
                 ntk_alpha = self.get_ntk_alpha(kv_seq_len)
                 ntk_alpha_list.append(ntk_alpha)
         self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
-
-        rotary_pos_emb_list = []
-        for ntk_alpha in ntk_alpha_list:
-            rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
-            rotary_pos_emb_list.append(rotary_pos_emb)
+        rotary_pos_emb_list = [
+            self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list
+        ]
 
         hidden_states = self.drop(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
@@ -948,7 +904,6 @@ def custom_forward(*inputs):
                     create_custom_forward(block),
                     hidden_states,
                     rotary_pos_emb_list,
-                    self.registered_causal_mask,
                     None,
                     attention_mask,
                     head_mask[i],
@@ -960,7 +915,6 @@ def custom_forward(*inputs):
                     hidden_states,
                     layer_past=layer_past,
                     rotary_pos_emb_list=rotary_pos_emb_list,
-                    registered_causal_mask=self.registered_causal_mask,
                     attention_mask=attention_mask,
                     head_mask=head_mask[i],
                     encoder_hidden_states=encoder_hidden_states,
@@ -1001,23 +955,21 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
     def __init__(self, config,**kwargs):
         super().__init__(config)
-        global skip_init_function
-        init_method = skip_init_function
-
-        global SUPPORT_CUDA, SUPPORT_BF16, SUPPORT_FP16
+        assert (
+            config.bf16 + config.fp16 + config.fp32 <= 1
+        ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
 
-        config = self.config
         autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0
 
         if autoset_precision:
             if SUPPORT_BF16:
-                logger.warning(
+                logger.warn(
                     "The model is automatically converting to bf16 for faster inference. "
                     "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
                 )
                 config.bf16 = True
             elif SUPPORT_FP16:
-                logger.warning(
+                logger.warn(
                     "The model is automatically converting to fp16 for faster inference. "
                     "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"."
                 )
@@ -1026,45 +978,31 @@ def __init__(self, config,**kwargs):
                 config.fp32 = True
 
         if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16:
-            logger.warning(
-                "Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
+            logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".")
         if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16:
-            logger.warning(
-                "Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
+            logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster")
         if config.fp32:
             if SUPPORT_BF16:
-                logger.warning(
-                    "Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
+                logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".")
             elif SUPPORT_FP16:
-                logger.warning(
-                    "Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
-
-        assert (
-                config.bf16 + config.fp16 + config.fp32 <= 1
-        ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true"
+                logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".")
 
         if config.use_flash_attn == "auto":
             if config.bf16 or config.fp16:
-                logger.warning("Try importing flash-attention for faster inference...")
+                logger.warn("Try importing flash-attention for faster inference...")
                 config.use_flash_attn = True
             else:
                 config.use_flash_attn = False
         if config.use_flash_attn and config.fp32:
-            logger.warning("Flash attention will be disabled because it does NOT support fp32.")
+            logger.warn("Flash attention will be disabled because it does NOT support fp32.")
 
         if config.use_flash_attn:
             _import_flash_attn()
 
-        if hasattr(config, 'use_cache_quantization') and config.use_cache_quantization:
-            config.use_flash_attn = False
-            if hasattr(config, 'use_cache_kernel') and config.use_cache_kernel:
-                try:
-                    from kernels.cpp_kernels import cache_autogptq_cuda_256
-                except ImportError:
-                    cache_autogptq_cuda_256 = None
-
+        global skip_init_function
         self.transformer = QWenModel(config,**kwargs)
-        self.lm_head = init_method(nn.Linear,config.hidden_size, config.vocab_size, bias=False,**kwargs)                                                                                   
+        self.lm_head = skip_init_function(nn.Linear,config.hidden_size, config.vocab_size, bias=False,**kwargs)
+
         if config.bf16:
             self.transformer.bfloat16()
             self.lm_head.bfloat16()
@@ -1074,7 +1012,7 @@ def __init__(self, config,**kwargs):
         self.post_init()
 
         self.quantized = False
-        if self.config.quantization_bit in [4,8]:
+        if getattr(self.config,"quantization_bit",0) in [4, 8]:
             self.quantize(self.config.quantization_bit, empty_init=True)
 
     def get_output_embeddings(self):
@@ -1086,22 +1024,13 @@ def set_output_embeddings(self, new_embeddings):
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
     ):
-        token_type_ids = kwargs.get("token_type_ids", None)
         if past_key_values:
             input_ids = input_ids[:, -1].unsqueeze(-1)
-            if token_type_ids is not None:
-                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
 
-        attention_mask = kwargs.get("attention_mask", None)
-        position_ids = kwargs.get("position_ids", None)
-
-        if attention_mask is not None and position_ids is None:
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -1].unsqueeze(-1)
+        if input_ids.size(0) == 1:
+            attention_mask = None
         else:
-            position_ids = None
+            attention_mask = kwargs.get("attention_mask", None)
 
         if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
@@ -1112,9 +1041,7 @@ def prepare_inputs_for_generation(
             {
                 "past_key_values": past_key_values,
                 "use_cache": kwargs.get("use_cache"),
-                "position_ids": position_ids,
                 "attention_mask": attention_mask,
-                "token_type_ids": token_type_ids,
             }
         )
         return model_inputs
@@ -1202,21 +1129,25 @@ def chat(
         query: str,
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
-        append_history: bool = True,
+        stream: Optional[bool] = _SENTINEL,
         stop_words_ids: Optional[List[List[int]]] = None,
         generation_config: Optional[GenerationConfig] = None,
-        **kwargs
+        **kwargs,
     ) -> Tuple[str, HistoryType]:
         generation_config = generation_config if generation_config is not None else self.generation_config
-        assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
 
+        assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
+        assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
+        else:
+            # make a copy of the user's input so that it is left untouched
+            history = copy.deepcopy(history)
 
         if stop_words_ids is None:
             stop_words_ids = []
 
-        max_window_size = kwargs.pop('max_window_size', None)
+        max_window_size = kwargs.get('max_window_size', None)
         if max_window_size is None:
             max_window_size = generation_config.max_window_size
         raw_text, context_tokens = make_context(
@@ -1232,13 +1163,13 @@ def chat(
             generation_config.chat_format, tokenizer
         ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
-
         outputs = self.generate(
-            input_ids,
-            stop_words_ids=stop_words_ids,
-            return_dict_in_generate=False,
-            **kwargs,
-        )
+                    input_ids,
+                    stop_words_ids=stop_words_ids,
+                    return_dict_in_generate=False,
+                    generation_config=generation_config,
+                    **kwargs,
+                )
 
         response = decode_tokens(
             outputs[0],
@@ -1247,11 +1178,14 @@ def chat(
             context_length=len(context_tokens),
             chat_format=generation_config.chat_format,
             verbose=False,
-            errors='replace',                
+            errors='replace'
         )
 
-        if append_history:
-            history.append((query, response))
+        # as history is a copy of the user inputs,
+        # we can always return the new turn to the user.
+        # separating input history and output history also enables the user
+        # to implement more complex history management
+        history.append((query, response))
 
         return response, history
 
@@ -1262,9 +1196,10 @@ def chat_stream(
             query: str,
             history: Optional[HistoryType],
             system: str = "You are a helpful assistant.",
+            stop_words_ids: Optional[List[List[int]]] = None,
+            logits_processor: Optional[LogitsProcessorList] = None,
             generation_config: Optional[GenerationConfig] = None,
-            stop_words_ids=None,
-            **kwargs
+            **kwargs,
     ) -> Generator[str, Any, None]:
         generation_config = generation_config if generation_config is not None else self.generation_config
         assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
@@ -1272,11 +1207,10 @@ def chat_stream(
             history = []
         if stop_words_ids is None:
             stop_words_ids = []
+
         max_window_size = kwargs.get('max_window_size', None)
         if max_window_size is None:
             max_window_size = generation_config.max_window_size
-        logits_processor = kwargs.pop('logits_processor',None)
-        
         raw_text, context_tokens = make_context(
             tokenizer,
             query,
@@ -1312,9 +1246,11 @@ def stream_generator():
                     return_dict_in_generate=False,
                     generation_config=stream_config,
                     logits_processor=logits_processor,
+                    seed=-1,
                     **kwargs):
                 outputs.append(token.item())
                 yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore')
+
         return stream_generator()
 
     @torch.no_grad()
@@ -1326,17 +1262,18 @@ def generate(
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         generation_config = generation_config if generation_config is not None else self.generation_config
+
         # Process stop_words_ids.
         stop_words_ids = kwargs.pop("stop_words_ids", None)
         if stop_words_ids is None and generation_config is not None:
             stop_words_ids = getattr(generation_config, "stop_words_ids", None)
         if stop_words_ids is None:
-            stop_words_ids = getattr(self.generation_config, "stop_words_ids", None)
+            stop_words_ids = getattr(generation_config, "stop_words_ids", None)
 
         if stop_words_ids is not None:
             stop_words_logits_processor = StopWordsLogitsProcessor(
                 stop_words_ids=stop_words_ids,
-                eos_token_id=self.generation_config.eos_token_id,
+                eos_token_id=generation_config.eos_token_id,
             )
             if logits_processor is None:
                 logits_processor = LogitsProcessorList([stop_words_logits_processor])
@@ -1349,6 +1286,7 @@ def generate(
             logits_processor=logits_processor,
             **kwargs,
         )
+
     def quantize(self, bits: int, empty_init=False, device=None, **kwarg):
         if bits == 0:
             return
@@ -1361,13 +1299,12 @@ def quantize(self, bits: int, empty_init=False, device=None, **kwarg):
         self.quantized = True
         return self
 
-
 class RotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, base=10000,**kwargs):
+    def __init__(self, dim, base=10000):
         super().__init__()
         self.dim = dim
         self.base = base
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2,**kwargs).float() / dim))
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         if importlib.util.find_spec("einops") is None:
             raise RuntimeError("einops is required for Rotary Embedding")
@@ -1376,9 +1313,8 @@ def __init__(self, dim, base=10000,**kwargs):
         self._seq_len_cached = 0
         self._ntk_alpha_cached = 1.0
         self._ntk_alpha_cached_list = [1.0]
-        
-    def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
-        seqlen = max_seq_len + offset
+
+    def update_rotary_pos_emb_cache(self, seqlen, ntk_alpha=1.0):
         if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
             base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
             self.inv_freq = 1.0 / (
@@ -1401,13 +1337,14 @@ def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0):
             cos, sin = emb.cos(), emb.sin()
             self._rotary_pos_emb_cache = [cos, sin]
 
-    def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
-        self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
+    def forward(self, max_seq_len, ntk_alpha=1.0):
+        self.update_rotary_pos_emb_cache(max_seq_len, ntk_alpha)
         cos, sin = self._rotary_pos_emb_cache
-        return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
+        return [cos[:, :max_seq_len], sin[:, :max_seq_len]]
+
 
 def _rotate_half(x):
-    # from einops import rearrange
+    from einops import rearrange
 
     x = rearrange(x, "... (j d) -> ... j d", j=2)
     x1, x2 = x.unbind(dim=-2)
@@ -1415,28 +1352,35 @@ def _rotate_half(x):
 
 
 def apply_rotary_pos_emb(t, freqs):
+    """ Apply rotary embedding to the first rotary_dim of the input
+
+    Arguments:
+      t (tensor(batch_size, seq_len, n_head, head_dim)):
+        the input embedding/hidden states
+      freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]):
+        the cached cos/sin position embeddings 
+    """
+    rot_dim = freqs[0].shape[-1]
     cos, sin = freqs
+    t_float = t.float()
     if apply_rotary_emb_func is not None and t.is_cuda:
-        t_ = t.float()
-        cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
-        sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
-        output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
-        return output
+        # apply_rotary_emb in flash_attn requires cos/sin to be of
+        # shape (seqlen, rotary_dim / 2) and applies the rotary embedding
+        # to the first rotary_dim of the input
+        cos = cos.squeeze(0).squeeze(1)[:, : rot_dim // 2]
+        sin = sin.squeeze(0).squeeze(1)[:, : rot_dim // 2]
+        return apply_rotary_emb_func(t_float, cos, sin).type_as(t)
     else:
-        rot_dim = freqs[0].shape[-1]
-        cos, sin = freqs
-        t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
-        t_ = t_.float()
-        t_pass_ = t_pass_.float()
-        t_ = (t_ * cos) + (_rotate_half(t_) * sin)
-        return torch.cat((t_, t_pass_), dim=-1).type_as(t)
+        t_rot, t_pass = t_float[..., :rot_dim], t_float[..., rot_dim:]
+        t_rot = (t_rot * cos) + (_rotate_half(t_rot) * sin)
+        return torch.cat((t_rot, t_pass), dim=-1).type_as(t)
 
 
 class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6,**kwargs):
+    def __init__(self, dim: int, eps: float = 1e-6):
         super().__init__()
         self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim,**kwargs))
+        self.weight = nn.Parameter(torch.ones(dim))
 
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
@@ -1447,11 +1391,3 @@ def forward(self, x):
         else:
             output = self._norm(x.float()).type_as(x)
             return output * self.weight
-
-
-
-
-class TransformerQWenLMHeadModel(TransformerBase):
-    def __init__(self, *args,**kwargs):
-        super(TransformerQWenLMHeadModel, self).__init__(*args,**kwargs)
-        self.set_model(self.from_pretrained(QWenLMHeadModel, *args, **kwargs))
diff --git a/src/deep_training/nlp/models/qwen/qwen_generation_utils.py b/src/deep_training/nlp/models/qwen/qwen_generation_utils.py
index 3da82ea1..4e8e1d8c 100644
--- a/src/deep_training/nlp/models/qwen/qwen_generation_utils.py
+++ b/src/deep_training/nlp/models/qwen/qwen_generation_utils.py
@@ -301,6 +301,7 @@ def decode_tokens(
 class StopWordsLogitsProcessor(LogitsProcessor):
     """
     :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration.
+
     Args:
         stop_words_ids (:obj:`List[List[int]]`):
             List of list of token ids of stop ids. In order to get the tokens of the words
@@ -412,4 +413,4 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
 
 def switch(val1, val2, boolean):
     boolean = boolean.type_as(val1)
-    return (1 - boolean) * val1 + boolean * val2
\ No newline at end of file
+    return (1 - boolean) * val1 + boolean * val2
diff --git a/src/deep_training/nlp/models/qwen/tokenization_qwen.py b/src/deep_training/nlp/models/qwen/tokenization_qwen.py
index 0c3b3c88..2a526d66 100644
--- a/src/deep_training/nlp/models/qwen/tokenization_qwen.py
+++ b/src/deep_training/nlp/models/qwen/tokenization_qwen.py
@@ -27,11 +27,22 @@
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-SPECIAL_TOKENS = (
-    ENDOFTEXT,
-    IMSTART,
-    IMEND,
-) + EXTRAS
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+SPECIAL_START_ID = 151643
+SPECIAL_TOKENS = tuple(
+    enumerate(
+        (
+            (
+                ENDOFTEXT,
+                IMSTART,
+                IMEND,
+            )
+            + EXTRAS
+        ),
+        start=SPECIAL_START_ID,
+    )
+)
+SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -42,6 +53,7 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 
+
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -51,20 +63,35 @@ def __init__(
         self,
         vocab_file,
         errors="replace",
+        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        self.errors = errors  # how to handle errors in decoding
+        # how to handle errors when decoding UTF-8 byte sequences;
+        # use "ignore" when running streaming inference
+        self.errors = errors  
 
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in enumerate(
-                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
-            )
+            for index, token in SPECIAL_TOKENS
         }
 
+        # try load extra vocab from file
+        if extra_vocab_file is not None:
+            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
+            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
+            for token, index in extra_mergeable_ranks.items():
+                if token in self.mergeable_ranks:
+                    logger.info(f"extra token {token} exists, skipping")
+                    continue
+                if index in used_ids:
+                    logger.info(f'the index {index} for extra token {token} exists, skipping')
+                    continue
+                self.mergeable_ranks[token] = index
+            # the indices may be sparse after this, but that is fine: tiktoken.Encoding will handle it
+
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -89,7 +116,7 @@ def __init__(
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state['tokenizer']
+        del state["tokenizer"]
         return state
 
     def __setstate__(self, state):
@@ -103,7 +130,6 @@ def __setstate__(self, state):
         )
         self.tokenizer = enc
 
-
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
 
@@ -126,18 +152,23 @@ def convert_tokens_to_ids(
                 ids.append(self.mergeable_ranks.get(token))
         return ids
 
-    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError('Adding regular tokens is not supported')
+            raise ValueError("Adding regular tokens is not supported")
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS:
-                raise ValueError('Adding unknown special tokens is not supported')
+            if surface_form not in SPECIAL_TOKENS_SET:
+                raise ValueError("Adding unknown special tokens is not supported")
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
         """
         Save only the vocabulary of the tokenizer (vocabulary).
+
         Returns:
             `Tuple(str)`: Paths to the files saved.
         """
@@ -157,6 +188,7 @@ def tokenize(
     ) -> List[Union[bytes, str]]:
         """
         Converts a string in a sequence of tokens.
+
         Args:
             text (`str`):
                 The sequence to be encoded.
@@ -166,8 +198,10 @@ def tokenize(
             disallowed_special (`Literal["all"]` or `Collection`):
                 The surface forms of the tokens that should not be in regular texts and trigger errors.
                 Default to an empty tuple.
+
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific encode method.
+
         Returns:
             `List[bytes|str]`: The list of tokens.
         """
@@ -223,6 +257,7 @@ def _tokenize(self, text: str, **kwargs):
         """
         Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
         Do NOT take care of added tokens.
         """
         raise NotImplementedError
@@ -238,4 +273,4 @@ def _decode(
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
\ No newline at end of file
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)