diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 408f3dfb7cf2d4..5ada89073046e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -290,3 +290,4 @@ REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); REGISTER_FACTORY(internal, RoPE); REGISTER_FACTORY(internal, DynamicQuantize); +REGISTER_FACTORY(internal, PagedAttentionExtension); diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index 6d6c7df5ff869b..5cede62fd17e69 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -178,7 +178,6 @@ class ProgramBuilder final { void CreateSingleLayerPrimitive(const std::shared_ptr& op); }; -void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr& op); void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& node, CustomLayerPtr customLayer); void CreateUnaryEltwiseOp(ProgramBuilder& p, const std::shared_ptr& node, cldnn::activation_func func, cldnn::activation_additional_params params); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index 792b819ca2dba3..e7a64185969a2d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -240,25 +240,25 @@ struct paged_attention_impl : multi_stage_primitive { static kv_cache_update_kernel_params_t get_kv_cache_update_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic = false) { auto params = get_default_params(impl_param, is_dynamic); - auto key = impl_param.get_input_layout(1); - auto value = impl_param.get_input_layout(2); - auto key_cache = impl_param.get_input_layout(3); - auto value_cache = impl_param.get_input_layout(4); - auto past_lens = impl_param.get_input_layout(5); - auto block_indices = impl_param.get_input_layout(7); - auto block_indices_begins = impl_param.get_input_layout(8); + const auto& key_layout = impl_param.get_input_layout(1); + const auto& value_layout = impl_param.get_input_layout(2); + const auto& key_cache_layout = impl_param.get_input_layout(3); + const auto& value_cache_layout = impl_param.get_input_layout(4); + const auto& past_lens_layout = impl_param.get_input_layout(5); + const auto& block_indices_layout = impl_param.get_input_layout(7); + const auto& block_indices_begins_layout = impl_param.get_input_layout(8); const auto inputs_number = 5; const auto outputs_number = 2; params.inputs.resize(inputs_number); params.outputs.resize(outputs_number); - params.inputs[0] = convert_data_tensor(key); - params.inputs[1] = convert_data_tensor(value); - params.inputs[2] = convert_data_tensor(past_lens); - params.inputs[3] = convert_data_tensor(block_indices); - params.inputs[4] = convert_data_tensor(block_indices_begins); - params.outputs[0] = convert_data_tensor(key_cache); - params.outputs[1] = convert_data_tensor(value_cache); + params.inputs[0] = convert_data_tensor(key_layout); + params.inputs[1] = convert_data_tensor(value_layout); + params.inputs[2] = convert_data_tensor(past_lens_layout); + params.inputs[3] = convert_data_tensor(block_indices_layout); + params.inputs[4] = convert_data_tensor(block_indices_begins_layout); + params.outputs[0] = convert_data_tensor(key_cache_layout); + params.outputs[1] = convert_data_tensor(value_cache_layout); params.conf = get_sdpa_configuration(impl_param); @@ -283,11 +283,11 @@ struct paged_attention_impl : multi_stage_primitive { static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic = false) { auto params = get_default_params(impl_param, is_dynamic); - const auto query_layout = impl_param.get_input_layout(0); - const auto key_layout = impl_param.get_input_layout(1); - const auto value_layout = impl_param.get_input_layout(2); - const auto subsequence_begins_layout = impl_param.get_input_layout(6); - const auto alibi_layout = impl_param.get_input_layout(11); + const auto& query_layout = impl_param.get_input_layout(0); + const auto& key_layout = impl_param.get_input_layout(1); + const auto& value_layout = impl_param.get_input_layout(2); + const auto& subsequence_begins_layout = impl_param.get_input_layout(6); + const auto& alibi_layout = impl_param.get_input_layout(11); const auto has_alibi = alibi_layout.count() > 0; auto inputs_number = 4; @@ -371,15 +371,15 @@ struct paged_attention_impl : multi_stage_primitive { static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, bool is_dynamic = false) { auto params = get_default_params(impl_param, is_dynamic); - const auto query_layout = impl_param.get_input_layout(0); - const auto key_cache_layout = impl_param.get_input_layout(3); - const auto value_cache_layout = impl_param.get_input_layout(4); - const auto past_lens_layout = impl_param.get_input_layout(5); - const auto block_indices_layout = impl_param.get_input_layout(7); - const auto block_indices_begins_layout = impl_param.get_input_layout(8); - const auto alibi_layout = impl_param.get_input_layout(11); - + const auto& query_layout = impl_param.get_input_layout(0); + const auto& key_cache_layout = impl_param.get_input_layout(3); + const auto& value_cache_layout = impl_param.get_input_layout(4); + const auto& past_lens_layout = impl_param.get_input_layout(5); + const auto& block_indices_layout = impl_param.get_input_layout(7); + const auto& block_indices_begins_layout = impl_param.get_input_layout(8); + const auto& alibi_layout = impl_param.get_input_layout(11); const auto has_alibi = alibi_layout.count() > 0; + auto inputs_number = 6; if (has_alibi) inputs_number++; diff --git a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h index e71a52ea8ee32d..5fec71ba9421d0 100644 --- a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h @@ -55,9 +55,6 @@ class typed_primitive_inst : public typed_primitive_inst_base

prefill_network; protected: diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index c4d7638e4ae5d1..35dd0c77a70e28 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -704,7 +704,7 @@ void program::transfer_memory_to_device() { auto mem_layout = mem.get_layout(); auto alloc_type = mem.get_allocation_type(); - if (ov::shape_size(mem_layout.get_shape()) == 0) + if (mem_layout.count() == 0) continue; if (!mem_layout.compatible(data_node_layout)) { @@ -712,7 +712,6 @@ void program::transfer_memory_to_device() { throw std::invalid_argument(err_str); } - if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) { GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl; // Allocate and transfer memory diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl index e7331228bc8585..d0c3ed5b13d859 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl @@ -172,68 +172,6 @@ KERNEL(pa_kv_cache_update)( for (uint i = 0; i < tokens_num; i++) { uint head_idx_index = 0; -#ifdef ENABLE_THIS - #define READ_BLOCK_SIZE 8 - for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) { - #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); - #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; - key_cache_data[key_offset] = input_data[i]; - } - - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; - value_cache_data[value_offset] = input_data[i]; - } - } - - #define READ_BLOCK_SIZE 4 - for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) { - #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); - #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; - key_cache_data[key_offset] = input_data[i]; - } - - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; - value_cache_data[value_offset] = input_data[i]; - } - } - - #define READ_BLOCK_SIZE 2 - for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) { - #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); - #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; - key_cache_data[key_offset] = input_data[i]; - } - - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); - - unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { - uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; - value_cache_data[value_offset] = input_data[i]; - } - } -#endif - #define READ_BLOCK_SIZE 1 for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) { #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); @@ -259,6 +197,5 @@ KERNEL(pa_kv_cache_update)( value_out_offset += HEAD_SIZE; } } - } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp index 08e82aa9503dcc..312b340480dbe7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_kv_cache_update_kernel_ref.cpp @@ -16,7 +16,7 @@ constexpr size_t paged_attention_block_size = 16; static size_t get_generate_stage_block_size(size_t head_size) { auto preferred_block_size = { 4, 2, 1 }; for (const auto& block_size : preferred_block_size) { - if (head_size % block_size == 0) { + if (head_size % (block_size * subgroup_size) == 0) { return block_size; } } diff --git a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp index 8f42a8709bbbc5..c00b2c01a19de1 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/custom.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/custom.cpp @@ -4,14 +4,12 @@ #include "openvino/core/attribute_visitor.hpp" #include "openvino/core/node.hpp" -#include "openvino/op/constant.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/simple_math.hpp" #include "intel_gpu/primitives/custom_gpu_primitive.hpp" #include "intel_gpu/primitives/reorder.hpp" -#include "intel_gpu/primitives/paged_attention.hpp" namespace ov { namespace intel_gpu { @@ -102,42 +100,6 @@ class CustomLayerAttributeVisitor : public ov::AttributeVisitor { std::map m_values; }; -void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {13}); - auto inputs = p.GetInputInfo(op); - auto prim = cldnn::paged_attention(layer_type_name_ID(op), inputs); - - auto key_cache_ps = op->get_input_partial_shape(3); - auto query_ps = op->get_input_partial_shape(0); - auto head_size = key_cache_ps[2].get_length(); - auto kv_heads_num = key_cache_ps[1].get_length(); - auto heads_num = query_ps[1].get_length() / head_size; - - prim.head_size = head_size; - prim.kv_heads_num = kv_heads_num; - prim.heads_num = heads_num; - - const size_t scale_idx = 9; - const size_t alibi_idx = 11; - - std::shared_ptr scale_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(scale_idx)); - OPENVINO_ASSERT(scale_const != nullptr); - OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1); - prim.scale_val = scale_const->cast_vector()[0]; - - std::shared_ptr alibi_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(alibi_idx)); - OPENVINO_ASSERT(alibi_const != nullptr); - prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0; - - if (op->get_output_size() > 1) { - const auto scores_output_idx = 1; - const auto& users = op->get_output_target_inputs(scores_output_idx); - OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet"); - } - - p.add_primitive(*op, prim); -} - void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr& op, CustomLayerPtr customLayer) { auto inputs = p.GetInputInfo(op); std::string layerName = layer_type_name_ID(op); diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp new file mode 100644 index 00000000000000..5d07488c676847 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/plugin/common_utils.hpp" + +#include "openvino/op/constant.hpp" +#include "openvino/op/paged_attention.hpp" + +#include "intel_gpu/primitives/paged_attention.hpp" + +namespace ov { +namespace op { +namespace internal { +using PagedAttentionExtension = ov::op::PagedAttentionExtension; +} // namespace internal +} // namespace op +} // namespace ov + +namespace ov { +namespace intel_gpu { + +static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {13}); + auto inputs = p.GetInputInfo(op); + auto prim = cldnn::paged_attention(layer_type_name_ID(op), inputs); + + auto key_cache_ps = op->get_input_partial_shape(3); + auto query_ps = op->get_input_partial_shape(0); + auto head_size = key_cache_ps[2].get_length(); + auto kv_heads_num = key_cache_ps[1].get_length(); + auto heads_num = query_ps[1].get_length() / head_size; + + prim.head_size = head_size; + prim.kv_heads_num = kv_heads_num; + prim.heads_num = heads_num; + + const size_t scale_idx = 9; + const size_t alibi_idx = 11; + + std::shared_ptr scale_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(scale_idx)); + OPENVINO_ASSERT(scale_const != nullptr); + OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1); + prim.scale_val = scale_const->cast_vector()[0]; + + std::shared_ptr alibi_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(alibi_idx)); + OPENVINO_ASSERT(alibi_const != nullptr); + prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0; + + if (op->get_output_size() > 1) { + const auto scores_output_idx = 1; + const auto& users = op->get_output_target_inputs(scores_output_idx); + OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet"); + } + + p.add_primitive(*op, prim); +} + +REGISTER_FACTORY_IMPL(internal, PagedAttentionExtension) + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 5467e07f2ac0f5..aae9b163b4f6bf 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -239,12 +239,7 @@ void ProgramBuilder::CreateSingleLayerPrimitive(const std::shared_ptr& is_created = true; break; } - - const std::string paged_attention_type = "PagedAttentionExtension"; - if (op->get_type_name() == paged_attention_type) { - CreatePagedAttention(*this, op); - return; - } + op_type_info = op_type_info->parent; } if (!is_created) { diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 11c58fbc9af169..69746d59974ea1 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -841,7 +841,7 @@ std::vector SyncInferRequest::prepare_input(const std::string } cldnn::event::ptr ret_event = nullptr; - if (!is_remote && !convert_needed) { + if (!is_remote_tensor_impl && !convert_needed) { auto src_ptr = static_cast(user_tensor->data()); if (!same_host_mem(memory, src_ptr)) { // WA: Set need_lockable_mem as a blocking argument