
Commit

minor: code cleanup
hodlen committed Apr 2, 2024
1 parent db09779 commit eb45354
Showing 3 changed files with 0 additions and 72 deletions.
22 changes: 0 additions & 22 deletions ggml-cuda.cu
@@ -7127,24 +7127,6 @@ static cudaError_t ggml_cuda_cpy_tensor_1d(
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb0/blck, kind, stream);
}

void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src) {
ggml_cuda_set_device(g_main_device);
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

// TODO: only supports CPU -> GPU as of now
GGML_ASSERT(src->backend == GGML_BACKEND_CPU && dst->backend == GGML_BACKEND_GPU);
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

CUDA_CHECK(ggml_cuda_cpy_tensor_1d(dst_extra->data_device[0], src, 0, src->ne[0], main_stream));
}

void ** ggml_cuda_get_data_pp(struct ggml_tensor * tensor) {
// only supports one device for now
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
return &extra->data_device[0];
}

template<class op>
inline void ggml_cuda_op_bin_bcast(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
@@ -9682,10 +9664,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
return ggml_cuda_transform_tensor_impl(data, tensor, false);
}

void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor) {
return ggml_cuda_transform_tensor_impl(nullptr, tensor, true);
}

void ggml_cuda_free_data(struct ggml_tensor * tensor) {
if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
return;
4 changes: 0 additions & 4 deletions ggml-cuda.h
@@ -29,11 +29,7 @@ GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src);
GGML_API bool debug_equal(short *a, short *b);
GGML_API void **ggml_cuda_get_data_pp(struct ggml_tensor * tensor);

GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
46 changes: 0 additions & 46 deletions llama.cpp
@@ -3258,52 +3258,6 @@ struct llama_augmentation_model_loader {
aux_ctx = ggml_init(params);
}

// allocate and copy selected weights to gpu
ggml_tensor * create_striped_mat_to_gpu(struct ggml_tensor *src, struct ggml_tensor * gpu_bucket) {
#ifdef GGML_USE_CUBLAS
if (gpu_bucket == NULL) {
// offload the whole tensor to gpu
ggml_set_backend(src, GGML_BACKEND_GPU);
ggml_cuda_transform_tensor(src->data, src);
return src;
}

int64_t row_len = src->ne[0];
int64_t gpu_rows = gpu_bucket->ne[0];
GGML_ASSERT(0 < gpu_rows && gpu_rows <= src->ne[1]);

ggml_set_no_alloc(aux_ctx, true);
ggml_tensor * gpu_dst = ggml_new_tensor_2d(aux_ctx, src->type, row_len, gpu_rows);
ggml_set_backend(gpu_dst, GGML_BACKEND_GPU);
ggml_cuda_alloc_tensor(gpu_dst);

// init two 1d views on host and device
ggml_tensor * host_mat_row = ggml_new_tensor_1d(aux_ctx, src->type, row_len);
static ggml_tensor * device_mat_row = ggml_dup_tensor(aux_ctx, host_mat_row);
ggml_set_backend(device_mat_row, GGML_BACKEND_GPU);
ggml_cuda_alloc_tensor(device_mat_row);
*ggml_cuda_get_data_pp(device_mat_row) = *ggml_cuda_get_data_pp(gpu_dst);

// read raw data and copy to device depending on gpu_idx
const enum ggml_type type = src->type;
const int ne0 = src->ne[0];
const size_t row_data_size = ne0*ggml_type_size(type)/ggml_blck_size(type);
for (int i = 0; i < gpu_rows; i++) {
int32_t host_i = ((int32_t *)gpu_bucket->data)[i];
host_mat_row -> data = (char *)(src -> data) + host_i * row_data_size;
char ** gpu_data_pp = reinterpret_cast<char **>(ggml_cuda_get_data_pp(device_mat_row));
// printf("gpu_data_p: %p\n", *gpu_data_pp);
ggml_cuda_cpy_1d(device_mat_row, host_mat_row);
*gpu_data_pp = *gpu_data_pp + row_data_size;
}
ggml_set_no_alloc(aux_ctx, false);

return gpu_dst;
#else
return NULL;
#endif
}

size_t slice_ffn_mat_to_gpu(llama_layer & layer) {
std::vector<uint8_t> work_buffer;
ggml_tensor * gpu_idx = layer.gpu_idx;
