diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 8ded43b..6d20e89 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7127,24 +7127,6 @@ static cudaError_t ggml_cuda_cpy_tensor_1d(
     return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb0/blck, kind, stream);
 }
 
-void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src) {
-    ggml_cuda_set_device(g_main_device);
-    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    // TODO: only supports CPU -> GPU as of now
-    GGML_ASSERT(src->backend == GGML_BACKEND_CPU && dst->backend == GGML_BACKEND_GPU);
-    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
-    CUDA_CHECK(ggml_cuda_cpy_tensor_1d(dst_extra->data_device[0], src, 0, src->ne[0], main_stream));
-}
-
-void ** ggml_cuda_get_data_pp(struct ggml_tensor * tensor) {
-    // only supports one device for now
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-    struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    return &extra->data_device[0];
-}
-
 template<class op>
 inline void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
@@ -9682,10 +9664,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     return ggml_cuda_transform_tensor_impl(data, tensor, false);
 }
 
-void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor) {
-    return ggml_cuda_transform_tensor_impl(nullptr, tensor, true);
-}
-
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 7257867..90a7d34 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -29,11 +29,7 @@ GGML_API void ggml_cuda_host_free(void * ptr);
 GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
 GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-GGML_API void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor);
 GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
-GGML_API void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src);
-GGML_API bool debug_equal(short *a, short *b);
-GGML_API void **ggml_cuda_get_data_pp(struct ggml_tensor * tensor);
 GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 
diff --git a/llama.cpp b/llama.cpp
index 613c3e5..d624055 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3258,52 +3258,6 @@ struct llama_augmentation_model_loader {
         aux_ctx = ggml_init(params);
     }
 
-    // allocate and copy selected weights to gpu
-    ggml_tensor * create_striped_mat_to_gpu(struct ggml_tensor *src, struct ggml_tensor * gpu_bucket) {
-#ifdef GGML_USE_CUBLAS
-        if (gpu_bucket == NULL) {
-            // offload the whole tensor to gpu
-            ggml_set_backend(src, GGML_BACKEND_GPU);
-            ggml_cuda_transform_tensor(src->data, src);
-            return src;
-        }
-
-        int64_t row_len = src->ne[0];
-        int64_t gpu_rows = gpu_bucket->ne[0];
-        GGML_ASSERT(0 < gpu_rows && gpu_rows <= src->ne[1]);
-
-        ggml_set_no_alloc(aux_ctx, true);
-        ggml_tensor * gpu_dst = ggml_new_tensor_2d(aux_ctx, src->type, row_len, gpu_rows);
-        ggml_set_backend(gpu_dst, GGML_BACKEND_GPU);
-        ggml_cuda_alloc_tensor(gpu_dst);
-
-        // init two 1d views on host and device
-        ggml_tensor * host_mat_row = ggml_new_tensor_1d(aux_ctx, src->type, row_len);
-        static ggml_tensor * device_mat_row = ggml_dup_tensor(aux_ctx, host_mat_row);
-        ggml_set_backend(device_mat_row, GGML_BACKEND_GPU);
-        ggml_cuda_alloc_tensor(device_mat_row);
-        *ggml_cuda_get_data_pp(device_mat_row) = *ggml_cuda_get_data_pp(gpu_dst);
-
-        // read raw data and copy to device depending on gpu_idx
-        const enum ggml_type type = src->type;
-        const int ne0 = src->ne[0];
-        const size_t row_data_size = ne0*ggml_type_size(type)/ggml_blck_size(type);
-        for (int i = 0; i < gpu_rows; i++) {
-            int32_t host_i = ((int32_t *)gpu_bucket->data)[i];
-            host_mat_row -> data = (char *)(src -> data) + host_i * row_data_size;
-            char ** gpu_data_pp = reinterpret_cast<char **>(ggml_cuda_get_data_pp(device_mat_row));
-            // printf("gpu_data_p: %p\n", *gpu_data_pp);
-            ggml_cuda_cpy_1d(device_mat_row, host_mat_row);
-            *gpu_data_pp = *gpu_data_pp + row_data_size;
-        }
-        ggml_set_no_alloc(aux_ctx, false);
-
-        return gpu_dst;
-#else
-        return NULL;
-#endif
-    }
-
     size_t slice_ffn_mat_to_gpu(llama_layer & layer) {
         std::vector<uint8_t> work_buffer;
         ggml_tensor * gpu_idx = layer.gpu_idx;
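Note (not part of the patch): the removed create_striped_mat_to_gpu gathered the rows of a host matrix selected by gpu_bucket into one contiguous GPU buffer, one row at a time, via the now-deleted ggml_cuda_cpy_1d/ggml_cuda_get_data_pp helpers. A minimal sketch of that gather pattern using only plain CUDA runtime calls is shown below; the function and variable names are hypothetical, float rows are assumed, and this is an illustration of the idea, not the ggml/llama.cpp API.

// gather_rows_to_device: copy the rows listed in row_idx from host_mat
// (n_rows x row_len floats, row-major) into a newly allocated, contiguous
// device buffer, mirroring the per-row copy loop of the removed helper.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

static float * gather_rows_to_device(const float * host_mat, size_t row_len,
                                     const std::vector<int> & row_idx,
                                     cudaStream_t stream) {
    const size_t row_bytes = row_len * sizeof(float);
    float * d_buf = nullptr;
    cudaMalloc(&d_buf, row_idx.size() * row_bytes);

    // One async host-to-device copy per selected row; the destination pointer
    // advances by one row each iteration, like the pointer bump in the
    // removed loop over gpu_bucket.
    for (size_t i = 0; i < row_idx.size(); ++i) {
        const float * src_row = host_mat + (size_t) row_idx[i] * row_len;
        cudaMemcpyAsync(d_buf + i * row_len, src_row, row_bytes,
                        cudaMemcpyHostToDevice, stream);
    }
    cudaStreamSynchronize(stream);
    return d_buf;
}

int main() {
    const size_t row_len = 4, n_rows = 8;
    std::vector<float> host_mat(n_rows * row_len);
    for (size_t i = 0; i < host_mat.size(); ++i) host_mat[i] = (float) i;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Gather rows 1, 3 and 6 into a contiguous 3 x row_len device matrix.
    float * d_rows = gather_rows_to_device(host_mat.data(), row_len, {1, 3, 6}, stream);

    std::vector<float> check(3 * row_len);
    cudaMemcpy(check.data(), d_rows, check.size() * sizeof(float), cudaMemcpyDeviceToHost);
    printf("first gathered row starts at %.0f\n", check[0]); // expected: 4

    cudaFree(d_rows);
    cudaStreamDestroy(stream);
    return 0;
}

The per-row cudaMemcpyAsync on a single stream is the same strided-gather strategy the deleted code used; batching the copies (or packing rows on the host first) would reduce launch overhead, which is presumably part of why the helper path was dropped.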