Skip to content

Commit

Permalink
Merge pull request #95 from ssbuild/dev
Browse files Browse the repository at this point in the history
update qwen model
  • Loading branch information
ssbuild authored Dec 2, 2023
2 parents 36c9db2 + 524f307 commit 62a7b92
Show file tree
Hide file tree
Showing 9 changed files with 2,256 additions and 339 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## transformer is all you need.
- deep training framework based on lightning and transformers
- deep training framework based on transformers

## install and download

Expand All @@ -17,6 +17,10 @@ pip install -U git+https://github.com/ssbuild/deep_training.git --no-deps --forc


## update
- <strong>2023-12-12</strong>
- 0.2.10 update qwen model for 1.8b 7b 14b 72b


- <strong>2023-11-13</strong>
- 0.2.9 release
- 0.2.9.post0 support chatglm3-6b-32k
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
]
setup(
name='deep_training',
version='0.2.9.post0',
version='0.2.10',
description='an easy training architecture',
long_description='torch_training: https://github.com/ssbuild/deep_training.git',
license='Apache License 2.0',
Expand Down
198 changes: 198 additions & 0 deletions src/deep_training/nlp/models/qwen/cache_autogptq_cuda_256.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>

// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp
// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros,
    torch::Tensor g_idx
);

// Entry point exported to Python as "vecquant8matmul".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all six tensors unchanged to
// vecquant8matmul_cuda. Returns nothing.
void vecquant8matmul(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros,
    torch::Tensor g_idx
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_cuda. Returns nothing.
void vecquant8matmul_batched(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_column_compression_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_column_compression".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_column_compression_cuda. Returns nothing.
void vecquant8matmul_batched_column_compression(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant4matmul_batched_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant4matmul_batched".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant4matmul_batched_cuda. Returns nothing.
void vecquant4matmul_batched(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant4matmul_batched_column_compression_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant4matmul_batched_column_compression".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant4matmul_batched_column_compression_cuda. Returns nothing.
void vecquant4matmul_batched_column_compression(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_old_cuda. Returns nothing.
void vecquant8matmul_batched_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
}


// Implementation hook: only declared here, no definition is visible in this file.
void vecquant4matmul_batched_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant4matmul_batched_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant4matmul_batched_old_cuda. Returns nothing.
void vecquant4matmul_batched_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_column_compression_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_column_compression_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_column_compression_old_cuda. Returns nothing.
void vecquant8matmul_batched_column_compression_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant4matmul_batched_column_compression_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant4matmul_batched_column_compression_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant4matmul_batched_column_compression_old_cuda. Returns nothing.
void vecquant4matmul_batched_column_compression_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
}



// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_faster_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_faster".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_faster_cuda. Returns nothing.
void vecquant8matmul_batched_faster(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros);
}


// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_faster_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_faster_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_faster_old_cuda. Returns nothing.
void vecquant8matmul_batched_faster_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros);
}

// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_column_compression_faster_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_column_compression_faster".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_column_compression_faster_cuda. Returns nothing.
void vecquant8matmul_batched_column_compression_faster(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales, zeros);
}


// Implementation hook: only declared here, no definition is visible in this file.
void vecquant8matmul_batched_column_compression_faster_old_cuda(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
);

// Entry point exported to Python as "vecquant8matmul_batched_column_compression_faster_old".
// Builds an OptionalCUDAGuard from device_of(vec), keeps it alive until the
// function returns, and forwards all five tensors unchanged to
// vecquant8matmul_batched_column_compression_faster_old_cuda. Returns nothing.
void vecquant8matmul_batched_column_compression_faster_old(
    torch::Tensor vec,
    torch::Tensor mat,
    torch::Tensor mul,
    torch::Tensor scales,
    torch::Tensor zeros
) {
    // The guard is keyed on `vec` only; the other tensors are not inspected here.
    const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
    vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul, scales, zeros);
}



// Python module definition: registers every host wrapper in this file under
// its own C++ name. The third argument of each m.def() is the docstring that
// Python shows for the bound function.
//
// The four "*_faster" / "*_faster_old" docstrings used to be verbatim copies
// of the corresponding "*_old" docstrings, which made the variants
// indistinguishable from Python's help(). They now name the variant they
// actually bind; binding names and bound functions are unchanged.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    // 8-bit, non-batched.
    m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");

    // 8-bit batched variants.
    m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
    m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
    m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit faster Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
    m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit faster old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");

    // 4-bit batched, old variant.
    m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");

    // 8-bit batched variants with column-compressed weights.
    m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
    m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
    m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector faster 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
    m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector faster old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");

    // 4-bit variants.
    m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
    m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
    m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
}
Loading

0 comments on commit 62a7b92

Please sign in to comment.