
fix: use get_rows op to slice ffn
hodlen committed Apr 2, 2024
1 parent 9a5dd7f commit 0658ae9
Showing 3 changed files with 100 additions and 18 deletions.
68 changes: 68 additions & 0 deletions ggml.c
@@ -4881,6 +4881,30 @@ struct ggml_tensor * ggml_get_rows(
    return result;
}

struct ggml_tensor * ggml_select_rows(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
    GGML_ASSERT(a->ne[2] == b->ne[1]);
    GGML_ASSERT(b->ne[3] == 1);
    GGML_ASSERT(b->type == GGML_TYPE_I32);

    bool is_node = false;

    if (a->grad || b->grad) {
        is_node = true;
    }

    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

    result->op = GGML_OP_GET_ROWS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;

    return result;
}
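For context, a minimal CPU-only usage sketch of the new op (not part of the commit; the matrix size and index values below are hypothetical): selecting two rows of an F16 matrix keeps the result in F16, whereas ggml_get_rows would produce an F32 result.

// Minimal sketch, not from the commit: slice rows out of an F16 matrix
// with ggml_select_rows; sizes and indices are made up for illustration.
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // a: 8 columns x 4 rows; any non-F32 type is kept as-is in the result
    struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8, 4);
    struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    ((int32_t *) idx->data)[0] = 0;   // keep row 0
    ((int32_t *) idx->data)[1] = 3;   // keep row 3

    struct ggml_tensor * sliced = ggml_select_rows(ctx, a, idx);

    // same graph-compute pattern the commit uses in slice_gpu_mat
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 3, false);
    ggml_build_forward_expand(gf, sliced);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    // unlike ggml_get_rows, the result type matches a (F16 here), shape 8 x 2
    printf("type = %d, ne = [%lld, %lld]\n",
           sliced->type, (long long) sliced->ne[0], (long long) sliced->ne[1]);

    ggml_free(ctx);
    return 0;
}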

// ggml_get_rows_back

struct ggml_tensor * ggml_get_rows_back(
@@ -10609,11 +10633,55 @@ static void ggml_compute_forward_get_rows_f32(
    }
}

static void ggml_compute_forward_cpy_rows(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    assert(params->ith == 0);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    GGML_TENSOR_BINARY_OP_LOCALS

    const int64_t nc = ne00;
    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);

    assert(ne0 == nc);
    assert(ne02 == ne11);
    assert(nb00 == ggml_type_size(src0->type));
    assert(ggml_nrows(dst) == nr);
    assert(src0->type == dst->type && nb1 == nb01 && "src0 and dst must be of same type and row size");

    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        for (int64_t i11 = 0; i11 < ne11; ++i11) {
            for (int64_t i10 = 0; i10 < ne10; ++i10) {
                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
                memcpy(
                    (char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3,
                    (char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03,
                    nb1
                );
            }
        }
    }
}
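Because the copy works on raw row bytes, quantized rows are moved without dequantization. A rough standalone illustration (the row width is assumed, not taken from the commit) of how many bytes one such memcpy moves for a Q4_0 row:

// Illustration only: bytes copied per row by the fast path for a
// hypothetical row of 4096 Q4_0 weights (32 weights per 18-byte block).
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne00 = 4096;  // assumed row width (hidden size)
    const size_t row_bytes = (size_t) (ne00 / ggml_blck_size(GGML_TYPE_Q4_0))
                                     * ggml_type_size(GGML_TYPE_Q4_0);
    // 4096 / 32 blocks * 18 bytes = 2304 bytes, and the row stays quantized
    printf("Q4_0 row of %lld weights = %zu bytes\n", (long long) ne00, row_bytes);
    return 0;
}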

static void ggml_compute_forward_get_rows(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    if (src0->type == dst->type && dst->type != GGML_TYPE_F32) {
        // Fast pass with memcpy
        // TODO: only implemented on CPU for now
        printf("src0->type = %d, dst->type = %d\n", src0->type, dst->type);
        ggml_compute_forward_cpy_rows(params, src0, src1, dst);
        return;
    }

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
6 changes: 6 additions & 0 deletions ggml.h
@@ -1344,6 +1344,12 @@ extern "C" {
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    // Same as ggml_get_rows, but the result keeps the original data type of a
    GGML_API struct ggml_tensor * ggml_select_rows(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    GGML_API struct ggml_tensor * ggml_get_rows_back(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
44 changes: 26 additions & 18 deletions llama.cpp
Expand Up @@ -3219,13 +3219,6 @@ struct llama_gpu_split_loader {
                // no hybrid inference for this layer, unset gpu_bucket
                model_layer.gpu_bucket = NULL;
                // TODO: maybe can also unset gpu_idx
            } else {
#if defined(GGML_USE_CUBLAS)
                ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
                ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
#else
                GGML_ASSERT(false && "cublas is not enabled");
#endif
            }
        }

@@ -3321,30 +3314,45 @@ struct llama_augmentation_model_loader {
        printf("gpu_bucket: %p[%d]; offload_ratio: %.2f\n", gpu_bucket, gpu_bucket ? gpu_bucket->ne[0] : -1, layer.gpu_offload_ratio);

        if (layer.ffn_gate) {
            layer.ffn_gate_gpu = create_striped_mat_to_gpu(layer.ffn_gate, gpu_bucket);
            layer.ffn_gate_gpu = slice_gpu_mat(layer.ffn_gate, gpu_bucket);
            offloaded_bytes += ggml_nbytes(layer.ffn_gate_gpu);
        }

        layer.ffn_up_gpu = create_striped_mat_to_gpu(layer.ffn_up, gpu_bucket);
        offloaded_bytes += ggml_nbytes(layer.ffn_up_gpu);

        layer.ffn_down_gpu = create_striped_mat_to_gpu(layer.ffn_down_t, gpu_bucket);
        layer.ffn_up_gpu = slice_gpu_mat(layer.ffn_up, gpu_bucket);
        layer.ffn_down_gpu = slice_gpu_mat(layer.ffn_down_t, gpu_bucket);
        offloaded_bytes += ggml_nbytes(layer.ffn_down_gpu);

#if defined(GGML_USE_CUBLAS)
        ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
        ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
#else
        GGML_ASSERT(false && "cublas is not enabled");
#endif

        return offloaded_bytes;
    }
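To get a feel for the byte accounting above, here is a standalone sketch (the layer shape and the 50% offload ratio are assumptions for illustration, not values read from the commit) that sizes a full Q4_0 ffn_up matrix against its half-row GPU slice using only tensor metadata:

// Hypothetical sizes for illustration; real shapes come from the model hparams.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    // no_alloc: only tensor metadata is needed to compute byte counts
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // assumed: n_embd = 4096, n_ff = 11008, 50% of the FFN rows offloaded
    struct ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 11008);
    struct ggml_tensor * sliced = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 11008/2);

    // roughly 24.2 MiB for the full matrix vs 12.1 MiB for the GPU slice
    printf("full: %.1f MiB, gpu slice: %.1f MiB\n",
           ggml_nbytes(ffn_up) / (1024.0*1024.0),
           ggml_nbytes(sliced) / (1024.0*1024.0));

    ggml_free(ctx);
    return 0;
}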

    ggml_tensor * slice_gpu_mat(ggml_tensor * src, ggml_tensor * gpu_bucket) {
        ggml_tensor * sliced_mat = ggml_select_rows(aux_ctx, src, gpu_bucket);
        ggml_cgraph * gf = ggml_new_graph_custom(aux_ctx, 3, false);
        ggml_build_forward_expand(gf, sliced_mat);
        ggml_graph_compute_with_ctx(aux_ctx, gf, 1);

        // Turn the computed tensor into a GPU weight tensor
        ggml_tensor * gpu_mat = ggml_dup_tensor(aux_ctx, sliced_mat);
        std::string name = std::string(ggml_get_name(src)) + ".gpu";
        ggml_set_name(gpu_mat, name.c_str());
        ggml_set_backend(gpu_mat, GGML_BACKEND_GPU);
        ggml_cuda_transform_tensor(gpu_mat->data, gpu_mat);

        return gpu_mat;
    }

    size_t offload_ffn_split(llama_model * model) {
        LLAMA_LOG_INFO("%s: applying augmentation to model - please wait ...\n", __func__);
        const int64_t t_start_aug_us = ggml_time_us();
        std::vector<uint8_t> work_buffer;

        // Set sparsity threshold via global variables
        sparse_pred_threshold = model->hparams.sparse_pred_threshold;
#if defined (GGML_USE_CUBLAS)
        ggml_cuda_set_device_constants(model->hparams.sparse_pred_threshold);
#endif

        // load gpu_idx and slice mat to gpu
        size_t offloaded_bytes = 0;
        for (int il = 0; il < model->layers.size(); il++) {
