Use mul_mat_transpose at axpy op for large batch #168

Merged · 4 commits · Mar 27, 2024
2 changes: 1 addition & 1 deletion common/common.h
@@ -49,7 +49,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
-    int32_t n_batch = 32;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
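Presumably the default is raised alongside the CUDA change below: ggml_cuda_axpy only switches to the new dense-GEMM path when src1 has more than 100 columns, and with the old default of 32 a prompt-processing batch would never reach that threshold. A tiny illustrative check of that relationship (an assumption drawn from the diff, not code from this PR):

    // n_batch caps how many prompt tokens are evaluated per call, i.e. how many
    // columns src1 can have during prompt processing. Values below are the new
    // default from common.h and the threshold used in ggml_cuda_axpy further down.
    #include <cstdint>
    int main() {
        constexpr int32_t n_batch        = 512; // new default from common.h
        constexpr int32_t gemm_threshold = 100; // src1->ne[1] cut-off in ggml_cuda_axpy
        static_assert(n_batch > gemm_threshold, "prompt batches take the GEMM path");
        return 0;
    }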
70 changes: 45 additions & 25 deletions ggml-cuda.cu
@@ -6873,7 +6873,7 @@ inline void * ggml_cuda_get_tensor_data(const ggml_tensor * tensor) {
}
}

-inline void ggml_cuda_op_mul_mat_batch_sparse_cublas(
+inline void ggml_cuda_op_mul_mat_batch_sparse(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
@@ -7111,7 +7111,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
(void) src1_padded_row_size;
}

-inline void ggml_cuda_op_mul_mat_vec_sparse_cublas(
+inline void ggml_cuda_op_mul_mat_vec_sparse(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
@@ -7338,6 +7338,14 @@ inline void ggml_cuda_op_mul_mat_transpose_select_gemm(
(void) src1_padded_row_size;
}

+__global__ void matrix_row_select_cont(const float * src, float * dst, const int * lst, const int src1_ncols, const int stride_src, const int stride_dst) {
+    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int col_to_read = lst[tid];
+    for (int i = 0; i < src1_ncols; i++) {
+        dst[stride_dst * i + tid] = src[stride_src * i + col_to_read];
+    }
+}

inline void ggml_cuda_op_mul_mat_transpose_gemm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -7375,27 +7383,36 @@ inline void ggml_cuda_op_mul_mat_transpose_gemm(
// ldc == nrows of the matrix that cuBLAS writes into
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
ldc = ne0;
-    // size_t src0_as_t = 0;
-    // float *transpose = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as_t); // NOLINT
-    // int blockSize = 32;
-    // int numBlocks = ne00;
-    // transpose_cont<<< numBlocks, blockSize, 0, stream>>>((float *)src0_ddf_i, transpose, ne00, ne01, 1, ne00, ne01,NULL);

CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-    // CUBLAS_CHECK(
-    //     cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-    //             ne00, src1_ncols, ne10,
-    //             &alpha, transpose, ne01,
-    //                     src1_ddf_i, ne10,
-    //             &beta, dst_dd_i, ldc));

-    CUBLAS_CHECK(
-        cublasSgemm(g_cublas_handles[id], CUBLAS_OP_N, CUBLAS_OP_N,
-                ne00, src1_ncols, ne10,
-                &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i, ne10,
-                &beta, dst_dd_i, ldc));


+    // dst->src[3]->data is gpu_bucket, ne01 is length
+    if (dst->src[3] != NULL) {
+        // compress src1
+        GGML_ASSERT(ne01 % 32 == 0);
+        const int block_nums = ne01 / 32;
+        size_t actual_size;
+        float * src1_cont = (float *)ggml_cuda_pool_malloc(ne01 * src1_ncols * sizeof(float), &actual_size);
+        int * row_lookup = static_cast<int *>(ggml_cuda_get_tensor_data(dst->src[3]));
+        matrix_row_select_cont<<<block_nums, 32, 0, stream>>>(src1_ddf_i, src1_cont, row_lookup, src1_ncols, ne10, ne01);
+
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_N, CUBLAS_OP_N,
+                    ne00, src1_ncols, ne01,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_cont,  ne01,
+                    &beta, dst_dd_i, ldc));
+
+        ggml_cuda_pool_free(src1_cont, actual_size);
+    } else {
+        // full_gpu
+        CUBLAS_CHECK(
+            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_N, CUBLAS_OP_N,
+                    ne00, src1_ncols, ne10,
+                    &alpha, src0_ddf_i, ne00,
+                            src1_ddf_i, ne10,
+                    &beta, dst_dd_i, ldc));
+    }

if (src0_as > 0) {
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
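For clarity on the hunk above: matrix_row_select_cont gathers only the rows of src1 listed in the gpu_bucket lookup (dst->src[3]) into a contiguous ne01 x src1_ncols buffer, so the following cublasSgemm can run a plain dense GEMM over just the selected rows; that is also why the inner GEMM dimension changes from ne10 to ne01 in the sparse branch. A minimal host-side reference of the same gather, with illustrative names rather than the PR's device code:

    // Reference (CPU) version of the row gather done by matrix_row_select_cont:
    // for every column of src1, copy only the rows named in lst into a dense buffer.
    static void row_select_reference(const float * src, float * dst, const int * lst,
                                     int n_selected,   // ne01: number of rows kept
                                     int n_cols,       // src1_ncols
                                     int stride_src,   // ne10: full row count of src1
                                     int stride_dst)   // ne01: row count of the packed buffer
    {
        for (int col = 0; col < n_cols; ++col) {
            for (int r = 0; r < n_selected; ++r) {
                dst[col * stride_dst + r] = src[col * stride_src + lst[r]];
            }
        }
    }

With the packed buffer, the GEMM computes dst (ne00 x src1_ncols) = src0 (ne00 x ne01) * src1_cont (ne01 x src1_ncols) in cuBLAS's column-major view.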
@@ -8604,18 +8621,21 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
static void ggml_cuda_mul_mat_sparse(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(dst->src[2] != NULL && "dst->src[2] must be present for sparse matrix multiplication");
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse_cublas, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_sparse, false);
} else {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_batch_sparse_cublas, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_batch_sparse, false);
}
}

void ggml_cuda_axpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(dst->src[2] != NULL && "dst->src[2] must be present for axpy");
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
-    ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_axpy, false);
-    // ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_transpose_gemm, false); // fallback
+    if (src1->ne[1] > 100) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_transpose_gemm, false);
+    } else {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_axpy, false);
+    }
}

static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
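On the ggml_cuda_axpy change above: batches wider than 100 columns now go through the dense transpose-GEMM path, while smaller batches (including single-token decoding, where src1->ne[1] == 1) keep the dequantize-axpy kernel. The reason a GEMM can stand in for the batched axpy updates is that summing a * x over the selected rows is exactly the matrix product issued in the sparse branch. A sketch of that equivalence, assuming column-major dense buffers and a zero-initialized dst (a reading of the diff, not code from the PR):

    // For every output column, the batched axpy accumulates
    //   dst[:, col] += src1[lst[r], col] * src0[:, r]   for every selected row r,
    // which is the same as dst = src0 * src1_packed, the sparse-branch GEMM above.
    static void axpy_reference(const float * src0,   // ne00 x ne01, column-major weight slice
                               const float * src1,   // ne10 x n_cols, column-major activations
                               const int   * lst,    // ne01 selected row indices into src1
                               float       * dst,    // ne00 x n_cols, column-major, zeroed
                               int ne00, int ne01, int ne10, int n_cols)
    {
        for (int col = 0; col < n_cols; ++col) {
            for (int r = 0; r < ne01; ++r) {
                const float a = src1[col * ne10 + lst[r]];         // scalar from the activation
                for (int k = 0; k < ne00; ++k) {
                    dst[col * ne00 + k] += a * src0[r * ne00 + k]; // axpy: dst += a * x
                }
            }
        }
    }

For a single token this degenerates to one axpy per selected row, which is why the small-batch case keeps the dequantize-axpy kernel and avoids the gather and GEMM setup cost.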