
fix: use get_rows op to slice ffn
hodlen committed Apr 2, 2024
1 parent 9a5dd7f commit 0658ae9
Showing 3 changed files with 100 additions and 18 deletions.
68 changes: 68 additions & 0 deletions ggml.c
@@ -4881,6 +4881,30 @@ struct ggml_tensor * ggml_get_rows(
    return result;
}

struct ggml_tensor * ggml_select_rows(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
    GGML_ASSERT(a->ne[2] == b->ne[1]);
    GGML_ASSERT(b->ne[3] == 1);
    GGML_ASSERT(b->type == GGML_TYPE_I32);

    bool is_node = false;

    if (a->grad || b->grad) {
        is_node = true;
    }

    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);

    result->op = GGML_OP_GET_ROWS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = b;

    return result;
}
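For context, a minimal CPU-only usage sketch of the new op (not part of the commit; the matrix size and index values below are hypothetical): selecting two rows of an F16 matrix keeps the result in F16, whereas ggml_get_rows would produce an F32 result.

// Minimal sketch, not from the commit: slice rows out of an F16 matrix
// with ggml_select_rows; sizes and indices are made up for illustration.
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // a: 8 columns x 4 rows; any non-F32 type is kept as-is in the result
    struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8, 4);
    struct ggml_tensor * idx = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
    ((int32_t *) idx->data)[0] = 0;   // keep row 0
    ((int32_t *) idx->data)[1] = 3;   // keep row 3

    struct ggml_tensor * sliced = ggml_select_rows(ctx, a, idx);

    // same graph-compute pattern the commit uses in slice_gpu_mat
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 3, false);
    ggml_build_forward_expand(gf, sliced);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    // unlike ggml_get_rows, the result type matches a (F16 here), shape 8 x 2
    printf("type = %d, ne = [%lld, %lld]\n",
           sliced->type, (long long) sliced->ne[0], (long long) sliced->ne[1]);

    ggml_free(ctx);
    return 0;
}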

// ggml_get_rows_back

struct ggml_tensor * ggml_get_rows_back(
@@ -10609,11 +10633,55 @@ static void ggml_compute_forward_get_rows_f32(
    }
}

static void ggml_compute_forward_cpy_rows(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    assert(params->ith == 0);

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }

    GGML_TENSOR_BINARY_OP_LOCALS

    const int64_t nc = ne00;
    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);

    assert(ne0 == nc);
    assert(ne02 == ne11);
    assert(nb00 == ggml_type_size(src0->type));
    assert(ggml_nrows(dst) == nr);
    assert(src0->type == dst->type && nb1 == nb01 && "src0 and dst must be of same type and row size");

    for (int64_t i12 = 0; i12 < ne12; ++i12) {
        for (int64_t i11 = 0; i11 < ne11; ++i11) {
            for (int64_t i10 = 0; i10 < ne10; ++i10) {
                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
                memcpy(
                    (char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3,
                    (char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03,
                    nb1
                );
            }
        }
    }
}
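Because the copy works on raw row bytes, quantized rows are moved without dequantization. A rough standalone illustration (the row width is assumed, not taken from the commit) of how many bytes one such memcpy moves for a Q4_0 row:

// Illustration only: bytes copied per row by the fast path for a
// hypothetical row of 4096 Q4_0 weights (32 weights per 18-byte block).
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne00 = 4096;  // assumed row width (hidden size)
    const size_t row_bytes = (size_t) (ne00 / ggml_blck_size(GGML_TYPE_Q4_0))
                                     * ggml_type_size(GGML_TYPE_Q4_0);
    // 4096 / 32 blocks * 18 bytes = 2304 bytes, and the row stays quantized
    printf("Q4_0 row of %lld weights = %zu bytes\n", (long long) ne00, row_bytes);
    return 0;
}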

static void ggml_compute_forward_get_rows(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    if (src0->type == dst->type && dst->type != GGML_TYPE_F32) {
        // Fast pass with memcpy
        // TODO: only implemented on CPU for now
        printf("src0->type = %d, dst->type = %d\n", src0->type, dst->type);
        ggml_compute_forward_cpy_rows(params, src0, src1, dst);
        return;
    }

    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
6 changes: 6 additions & 0 deletions ggml.h
@@ -1344,6 +1344,12 @@ extern "C" {
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    // Same as ggml_get_rows, but the result keeps the original data type of a
    GGML_API struct ggml_tensor * ggml_select_rows(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    GGML_API struct ggml_tensor * ggml_get_rows_back(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
44 changes: 26 additions & 18 deletions llama.cpp
Expand Up @@ -3219,13 +3219,6 @@ struct llama_gpu_split_loader {
                // no hybrid inference for this layer, unset gpu_bucket
                model_layer.gpu_bucket = NULL;
                // TODO: maybe can also unset gpu_idx
            } else {
#if defined(GGML_USE_CUBLAS)
                ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
                ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
#else
                GGML_ASSERT(false && "cublas is not enabled");
#endif
            }
        }

@@ -3321,30 +3314,45 @@ struct llama_augmentation_model_loader {
        printf("gpu_bucket: %p[%d]; offload_ratio: %.2f\n", gpu_bucket, gpu_bucket ? gpu_bucket->ne[0] : -1, layer.gpu_offload_ratio);

        if (layer.ffn_gate) {
            layer.ffn_gate_gpu = create_striped_mat_to_gpu(layer.ffn_gate, gpu_bucket);
            layer.ffn_gate_gpu = slice_gpu_mat(layer.ffn_gate, gpu_bucket);
            offloaded_bytes += ggml_nbytes(layer.ffn_gate_gpu);
        }

        layer.ffn_up_gpu = create_striped_mat_to_gpu(layer.ffn_up, gpu_bucket);
        offloaded_bytes += ggml_nbytes(layer.ffn_up_gpu);

        layer.ffn_down_gpu = create_striped_mat_to_gpu(layer.ffn_down_t, gpu_bucket);
        layer.ffn_up_gpu = slice_gpu_mat(layer.ffn_up, gpu_bucket);
        layer.ffn_down_gpu = slice_gpu_mat(layer.ffn_down_t, gpu_bucket);
        offloaded_bytes += ggml_nbytes(layer.ffn_down_gpu);

#if defined(GGML_USE_CUBLAS)
        ggml_set_backend(gpu_bucket, GGML_BACKEND_GPU);
        ggml_cuda_transform_tensor(gpu_bucket->data, gpu_bucket);
#else
        GGML_ASSERT(false && "cublas is not enabled");
#endif

        return offloaded_bytes;
    }
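To get a feel for the byte accounting above, here is a standalone sketch (the layer shape and the 50% offload ratio are assumptions for illustration, not values read from the commit) that sizes a full Q4_0 ffn_up matrix against its half-row GPU slice using only tensor metadata:

// Hypothetical sizes for illustration; real shapes come from the model hparams.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    // no_alloc: only tensor metadata is needed to compute byte counts
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // assumed: n_embd = 4096, n_ff = 11008, 50% of the FFN rows offloaded
    struct ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 11008);
    struct ggml_tensor * sliced = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 11008/2);

    // roughly 24.2 MiB for the full matrix vs 12.1 MiB for the GPU slice
    printf("full: %.1f MiB, gpu slice: %.1f MiB\n",
           ggml_nbytes(ffn_up) / (1024.0*1024.0),
           ggml_nbytes(sliced) / (1024.0*1024.0));

    ggml_free(ctx);
    return 0;
}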

    ggml_tensor * slice_gpu_mat(ggml_tensor * src, ggml_tensor * gpu_bucket) {
        ggml_tensor * sliced_mat = ggml_select_rows(aux_ctx, src, gpu_bucket);
        ggml_cgraph * gf = ggml_new_graph_custom(aux_ctx, 3, false);
        ggml_build_forward_expand(gf, sliced_mat);
        ggml_graph_compute_with_ctx(aux_ctx, gf, 1);

        // Turn the computed tensor into a GPU weight tensor
        ggml_tensor * gpu_mat = ggml_dup_tensor(aux_ctx, sliced_mat);
        std::string name = std::string(ggml_get_name(src)) + ".gpu";
        ggml_set_name(gpu_mat, name.c_str());
        ggml_set_backend(gpu_mat, GGML_BACKEND_GPU);
        ggml_cuda_transform_tensor(gpu_mat->data, gpu_mat);

        return gpu_mat;
    }

    size_t offload_ffn_split(llama_model * model) {
        LLAMA_LOG_INFO("%s: applying augmentation to model - please wait ...\n", __func__);
        const int64_t t_start_aug_us = ggml_time_us();
        std::vector<uint8_t> work_buffer;

        // Set sparsity threshold via global variables
        sparse_pred_threshold = model->hparams.sparse_pred_threshold;
#if defined (GGML_USE_CUBLAS)
        ggml_cuda_set_device_constants(model->hparams.sparse_pred_threshold);
#endif

        // load gpu_idx and slice mat to gpu
        size_t offloaded_bytes = 0;
        for (int il = 0; il < model->layers.size(); il++) {
