From 0658ae9f65fa67e5f240e247589f48e0d1fbb0f6 Mon Sep 17 00:00:00 2001
From: Holden
Date: Wed, 3 Apr 2024 00:25:03 +0800
Subject: [PATCH] fix: use get_rows op to slice ffn

---
 ggml.c    | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml.h    |  6 +++++
 llama.cpp | 44 ++++++++++++++++++++---------------
 3 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/ggml.c b/ggml.c
index cdcc6a2..44af2f7 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4881,6 +4881,30 @@ struct ggml_tensor * ggml_get_rows(
     return result;
 }
 
+struct ggml_tensor * ggml_select_rows(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b) {
+    GGML_ASSERT(a->ne[2] == b->ne[1]);
+    GGML_ASSERT(b->ne[3] == 1);
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+
+    bool is_node = false;
+
+    if (a->grad || b->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
+
+    result->op   = GGML_OP_GET_ROWS;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_get_rows_back
 
 struct ggml_tensor * ggml_get_rows_back(
@@ -10609,11 +10633,55 @@ static void ggml_compute_forward_get_rows_f32(
     }
 }
 
+static void ggml_compute_forward_cpy_rows(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t nc = ne00;
+    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
+
+    assert(ne0 == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == ggml_type_size(src0->type));
+    assert(ggml_nrows(dst) == nr);
+    assert(src0->type == dst->type && nb1 == nb01 && "src0 and dst must be of same type and row size");
+
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+                memcpy(
+                    (char *) dst->data  + i10*nb1  + i11*nb2  + i12*nb3,
+                    (char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03,
+                    nb1
+                );
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_get_rows(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+    if (src0->type == dst->type && dst->type != GGML_TYPE_F32) {
+        // Fast path with memcpy
+        // TODO: only implemented on CPU for now
+        printf("src0->type = %d, dst->type = %d\n", src0->type, dst->type);
+        ggml_compute_forward_cpy_rows(params, src0, src1, dst);
+        return;
+    }
+
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
diff --git a/ggml.h b/ggml.h
index 27c5afd..2aee6c4 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1344,6 +1344,12 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // Same as ggml_get_rows, but keeps the original data type of a
+    GGML_API struct ggml_tensor * ggml_select_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     GGML_API struct ggml_tensor * ggml_get_rows_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
diff --git a/llama.cpp b/llama.cpp
index 3d6ae30..98580a2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3219,13 +3219,6 @@ struct llama_gpu_split_loader {
             // no hybrid inference for this layer, unset gpu_bucket
             model_layer.gpu_bucket = NULL;
             // TODO: maybe can also unset gpu_idx
-        } else {
-#if defined(GGML_USE_CUBLAS)
-            ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
-            ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
-#else
-            GGML_ASSERT(false && "cublas is not enabled");
-#endif
         }
     }
 
@@ -3321,30 +3314,45 @@ struct llama_augmentation_model_loader {
         printf("gpu_bucket: %p[%d]; offload_ratio: %.2f\n", gpu_bucket, gpu_bucket ? gpu_bucket->ne[0] : -1, layer.gpu_offload_ratio);
 
         if (layer.ffn_gate) {
-            layer.ffn_gate_gpu = create_striped_mat_to_gpu(layer.ffn_gate, gpu_bucket);
+            layer.ffn_gate_gpu = slice_gpu_mat(layer.ffn_gate, gpu_bucket);
             offloaded_bytes += ggml_nbytes(layer.ffn_gate_gpu);
         }
 
-        layer.ffn_up_gpu = create_striped_mat_to_gpu(layer.ffn_up, gpu_bucket);
-        offloaded_bytes += ggml_nbytes(layer.ffn_up_gpu);
-
-        layer.ffn_down_gpu = create_striped_mat_to_gpu(layer.ffn_down_t, gpu_bucket);
+        layer.ffn_up_gpu = slice_gpu_mat(layer.ffn_up, gpu_bucket);
+        layer.ffn_down_gpu = slice_gpu_mat(layer.ffn_down_t, gpu_bucket);
         offloaded_bytes += ggml_nbytes(layer.ffn_down_gpu);
 
+#if defined(GGML_USE_CUBLAS)
+        ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
+        ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
+#else
+        GGML_ASSERT(false && "cublas is not enabled");
+#endif
+
         return offloaded_bytes;
     }
 
+    ggml_tensor * slice_gpu_mat(ggml_tensor * src, ggml_tensor * gpu_bucket) {
+        ggml_tensor * sliced_mat = ggml_select_rows(aux_ctx, src, gpu_bucket);
+        ggml_cgraph * gf = ggml_new_graph_custom(aux_ctx, 3, false);
+        ggml_build_forward_expand(gf, sliced_mat);
+        ggml_graph_compute_with_ctx(aux_ctx, gf, 1);
+
+        // Turn the computed tensor into a GPU weight tensor
+        ggml_tensor * gpu_mat = ggml_dup_tensor(aux_ctx, sliced_mat);
+        std::string name = std::string(ggml_get_name(src)) + ".gpu";
+        ggml_set_name(gpu_mat, name.c_str());
+        ggml_set_backend(gpu_mat, GGML_BACKEND_GPU);
+        ggml_cuda_transform_tensor(gpu_mat->data, gpu_mat);
+
+        return gpu_mat;
+    }
+
     size_t offload_ffn_split(llama_model * model) {
         LLAMA_LOG_INFO("%s: applying augmentation to model - please wait ...\n", __func__);
         const int64_t t_start_aug_us = ggml_time_us();
         std::vector<uint8_t> work_buffer;
 
-        // Set sparsity threshold via global virables
-        sparse_pred_threshold = model->hparams.sparse_pred_threshold;
-#if defined (GGML_USE_CUBLAS)
-        ggml_cuda_set_device_constants(model->hparams.sparse_pred_threshold);
-#endif
-
         // load gpu_idx and slice mat to gpu
        size_t offloaded_bytes = 0;
         for (int il = 0; il < model->layers.size(); il++) {
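
Note: for readers unfamiliar with the get_rows semantics this patch relies on, the sketch below contrasts ggml_get_rows (whose result is dequantized to F32) with the ggml_select_rows op added here (whose result keeps the source tensor's type, so the CPU forward pass can copy rows with plain memcpy). This is an illustrative standalone example, not part of the patch; the context size, tensor shapes, and index values are assumptions chosen for the sketch.

    #include "ggml.h"

    int main(void) {
        // Hypothetical sizes: a 4096 x 11008 Q4_0 FFN matrix, offloading 2048 rows.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 256*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 11008);
        struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2048);
        for (int i = 0; i < 2048; ++i) {
            ((int32_t *) idx->data)[i] = i; // row indices to keep (the gpu_bucket in the patch)
        }

        // Existing op: the selected rows are dequantized, result type is GGML_TYPE_F32.
        struct ggml_tensor * rows_f32 = ggml_get_rows(ctx, w, idx);
        (void) rows_f32; // shown only to contrast the result type

        // New op from this patch: the result stays GGML_TYPE_Q4_0, so the CPU
        // forward pass takes the memcpy path in ggml_compute_forward_cpy_rows.
        struct ggml_tensor * rows_q = ggml_select_rows(ctx, w, idx);

        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8, false);
        ggml_build_forward_expand(gf, rows_q);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        ggml_free(ctx);
        return 0;
    }

Computing the rows_q graph on the patched CPU backend exercises the new ggml_compute_forward_cpy_rows path (and prints the debug line the patch adds in ggml_compute_forward_get_rows), which is the same mechanism slice_gpu_mat uses before uploading the sliced weights to the GPU.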