Apply review comments
sshlyapn committed Aug 26, 2024
1 parent 34d731c commit d28d40b
Showing 11 changed files with 95 additions and 142 deletions.
@@ -290,3 +290,4 @@ REGISTER_FACTORY(internal, SDPA);
REGISTER_FACTORY(internal, IndirectSDPA);
REGISTER_FACTORY(internal, RoPE);
REGISTER_FACTORY(internal, DynamicQuantize);
REGISTER_FACTORY(internal, PagedAttentionExtension);
@@ -178,7 +178,6 @@ class ProgramBuilder final {
void CreateSingleLayerPrimitive(const std::shared_ptr<ov::Node>& op);
};

void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op);
void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr<ov::Node>& node, CustomLayerPtr customLayer);
void CreateUnaryEltwiseOp(ProgramBuilder& p, const std::shared_ptr<ov::Node>& node,
cldnn::activation_func func, cldnn::activation_additional_params params);
54 changes: 27 additions & 27 deletions src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp
@@ -240,25 +240,25 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
static kv_cache_update_kernel_params_t get_kv_cache_update_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic = false) {
auto params = get_default_params<kv_cache_update_kernel_params_t>(impl_param, is_dynamic);

auto key = impl_param.get_input_layout(1);
auto value = impl_param.get_input_layout(2);
auto key_cache = impl_param.get_input_layout(3);
auto value_cache = impl_param.get_input_layout(4);
auto past_lens = impl_param.get_input_layout(5);
auto block_indices = impl_param.get_input_layout(7);
auto block_indices_begins = impl_param.get_input_layout(8);
const auto& key_layout = impl_param.get_input_layout(1);
const auto& value_layout = impl_param.get_input_layout(2);
const auto& key_cache_layout = impl_param.get_input_layout(3);
const auto& value_cache_layout = impl_param.get_input_layout(4);
const auto& past_lens_layout = impl_param.get_input_layout(5);
const auto& block_indices_layout = impl_param.get_input_layout(7);
const auto& block_indices_begins_layout = impl_param.get_input_layout(8);

const auto inputs_number = 5;
const auto outputs_number = 2;
params.inputs.resize(inputs_number);
params.outputs.resize(outputs_number);
params.inputs[0] = convert_data_tensor(key);
params.inputs[1] = convert_data_tensor(value);
params.inputs[2] = convert_data_tensor(past_lens);
params.inputs[3] = convert_data_tensor(block_indices);
params.inputs[4] = convert_data_tensor(block_indices_begins);
params.outputs[0] = convert_data_tensor(key_cache);
params.outputs[1] = convert_data_tensor(value_cache);
params.inputs[0] = convert_data_tensor(key_layout);
params.inputs[1] = convert_data_tensor(value_layout);
params.inputs[2] = convert_data_tensor(past_lens_layout);
params.inputs[3] = convert_data_tensor(block_indices_layout);
params.inputs[4] = convert_data_tensor(block_indices_begins_layout);
params.outputs[0] = convert_data_tensor(key_cache_layout);
params.outputs[1] = convert_data_tensor(value_cache_layout);

params.conf = get_sdpa_configuration(impl_param);

@@ -283,11 +283,11 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
static sdpa_kernel_params_t get_sdpa_kernel_params(const kernel_impl_params& impl_param, bool is_dynamic = false) {
auto params = get_default_params<sdpa_kernel_params_t>(impl_param, is_dynamic);

const auto query_layout = impl_param.get_input_layout(0);
const auto key_layout = impl_param.get_input_layout(1);
const auto value_layout = impl_param.get_input_layout(2);
const auto subsequence_begins_layout = impl_param.get_input_layout(6);
const auto alibi_layout = impl_param.get_input_layout(11);
const auto& query_layout = impl_param.get_input_layout(0);
const auto& key_layout = impl_param.get_input_layout(1);
const auto& value_layout = impl_param.get_input_layout(2);
const auto& subsequence_begins_layout = impl_param.get_input_layout(6);
const auto& alibi_layout = impl_param.get_input_layout(11);
const auto has_alibi = alibi_layout.count() > 0;

auto inputs_number = 4;
@@ -371,15 +371,15 @@ struct paged_attention_impl : multi_stage_primitive<paged_attention> {
static pa_sdpa_kernel_params_t get_pa_sdpa_params(const kernel_impl_params& impl_param, bool is_dynamic = false) {
auto params = get_default_params<pa_sdpa_kernel_params_t>(impl_param, is_dynamic);

const auto query_layout = impl_param.get_input_layout(0);
const auto key_cache_layout = impl_param.get_input_layout(3);
const auto value_cache_layout = impl_param.get_input_layout(4);
const auto past_lens_layout = impl_param.get_input_layout(5);
const auto block_indices_layout = impl_param.get_input_layout(7);
const auto block_indices_begins_layout = impl_param.get_input_layout(8);
const auto alibi_layout = impl_param.get_input_layout(11);

const auto& query_layout = impl_param.get_input_layout(0);
const auto& key_cache_layout = impl_param.get_input_layout(3);
const auto& value_cache_layout = impl_param.get_input_layout(4);
const auto& past_lens_layout = impl_param.get_input_layout(5);
const auto& block_indices_layout = impl_param.get_input_layout(7);
const auto& block_indices_begins_layout = impl_param.get_input_layout(8);
const auto& alibi_layout = impl_param.get_input_layout(11);
const auto has_alibi = alibi_layout.count() > 0;

auto inputs_number = 6;
if (has_alibi)
inputs_number++;
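A note for context, not part of this commit: the change above from "auto key = impl_param.get_input_layout(1)" to "const auto& key_layout = ..." avoids copying each layout object while the kernel parameters are being built, assuming get_input_layout() returns a reference to the stored layout. A minimal sketch with hypothetical stand-in types:

#include <cstddef>
#include <vector>

// Hypothetical stand-ins; not the plugin's real cldnn::layout / kernel_impl_params types.
struct Layout {
    std::vector<std::size_t> dims;   // placeholder for shape/format/data-type fields
};

struct ImplParams {
    std::vector<Layout> input_layouts;
    const Layout& get_input_layout(std::size_t idx) const { return input_layouts.at(idx); }
};

void build_params(const ImplParams& impl_param) {
    auto copied = impl_param.get_input_layout(1);             // deduces Layout by value: makes a copy
    const auto& referenced = impl_param.get_input_layout(1);  // binds to the stored Layout: no copy
    (void)copied;
    (void)referenced;
}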
@@ -55,9 +55,6 @@ class typed_primitive_inst<paged_attention> : public typed_primitive_inst_base<paged_attention> {
memory::ptr block_indices_begins_memory_ptr() const { return input_memory_ptr(8); }
memory::ptr alibi_memory_ptr() const { return input_memory_ptr(11); }

mutable cldnn::memory::ptr blocks_mem = nullptr;
mutable cldnn::memory::ptr context_lens_mem = nullptr;

std::shared_ptr<network> prefill_network;

protected:
3 changes: 1 addition & 2 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -704,15 +704,14 @@ void program::transfer_memory_to_device() {
auto mem_layout = mem.get_layout();
auto alloc_type = mem.get_allocation_type();

if (ov::shape_size(mem_layout.get_shape()) == 0)
if (mem_layout.count() == 0)
continue;

if (!mem_layout.compatible(data_node_layout)) {
std::string err_str("Node and memory layouts are incompatible, error occurred for " + node->id() + " node");
throw std::invalid_argument(err_str);
}


if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
// Allocate and transfer memory
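A side note, not part of this commit: the simplified check in transfer_memory_to_device() skips constants with zero elements. ov::shape_size() computes the product of all dimensions, and cldnn layouts expose the total element count via count() (the same accessor used elsewhere in this diff), so the rewritten condition gives the same result without materializing an ov::Shape first. A small sketch with a mock layout type:

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using Shape = std::vector<std::size_t>;

// Stand-in for ov::shape_size(): product of all dimensions (1 for an empty/rank-0 shape).
std::size_t shape_size(const Shape& shape) {
    return std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<std::size_t>{});
}

// Hypothetical mock of cldnn::layout, for illustration only.
struct MockLayout {
    Shape dims;
    Shape get_shape() const { return dims; }
    std::size_t count() const { return shape_size(dims); }  // total number of elements
};

bool is_empty_constant(const MockLayout& mem_layout) {
    // Old check: ov::shape_size(mem_layout.get_shape()) == 0
    // New check: mem_layout.count() == 0 -- same result, one fewer temporary Shape
    return mem_layout.count() == 0;
}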
@@ -172,68 +172,6 @@ KERNEL(pa_kv_cache_update)(
for (uint i = 0; i < tokens_num; i++) {
uint head_idx_index = 0;

#ifdef ENABLE_THIS
#define READ_BLOCK_SIZE 8
for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) {
#define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset);
#define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE)

DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE;
key_cache_data[key_offset] = input_data[i];
}

input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i;
value_cache_data[value_offset] = input_data[i];
}
}

#define READ_BLOCK_SIZE 4
for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) {
#define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset);
#define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE)

DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE;
key_cache_data[key_offset] = input_data[i];
}

input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i;
value_cache_data[value_offset] = input_data[i];
}
}

#define READ_BLOCK_SIZE 2
for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) {
#define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset);
#define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE)

DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE;
key_cache_data[key_offset] = input_data[i];
}

input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index);

unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) {
uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i;
value_cache_data[value_offset] = input_data[i];
}
}
#endif

#define READ_BLOCK_SIZE 1
for (; head_idx_index + (READ_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * READ_BLOCK_SIZE) {
#define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset);
@@ -259,6 +197,5 @@ KERNEL(pa_kv_cache_update)(
value_out_offset += HEAD_SIZE;
}
}

}
}
@@ -16,7 +16,7 @@ constexpr size_t paged_attention_block_size = 16;
static size_t get_generate_stage_block_size(size_t head_size) {
auto preferred_block_size = { 4, 2, 1 };
for (const auto& block_size : preferred_block_size) {
if (head_size % block_size == 0) {
if (head_size % (block_size * subgroup_size) == 0) {
return block_size;
}
}
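For context, not part of the commit: the fixed condition above makes the divisibility test account for the subgroup size when picking the per-iteration block size for the generate (second-token) stage. Each kernel step is assumed to process block_size * subgroup_size head elements, so head_size must be divisible by that product rather than by block_size alone (the old check would, for example, accept block size 4 for head_size 72). A sketch of the corrected selection, with subgroup_size assumed to be 16:

#include <cstddef>
#include <initializer_list>

constexpr std::size_t subgroup_size = 16;  // assumption: matches the kernel's SIMD width

std::size_t get_generate_stage_block_size(std::size_t head_size) {
    auto preferred_block_size = {std::size_t{4}, std::size_t{2}, std::size_t{1}};
    for (const auto& block_size : preferred_block_size) {
        if (head_size % (block_size * subgroup_size) == 0) {
            return block_size;
        }
    }
    return 1;  // fallback for head sizes not divisible by the subgroup size
}

// Example: head_size = 128 -> 4 * 16 = 64 divides 128, so block size 4 is chosen;
//          head_size = 96  -> 64 does not divide 96, but 2 * 16 = 32 does, so 2 is chosen.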
38 changes: 0 additions & 38 deletions src/plugins/intel_gpu/src/plugin/ops/custom.cpp
@@ -4,14 +4,12 @@

#include "openvino/core/attribute_visitor.hpp"
#include "openvino/core/node.hpp"
#include "openvino/op/constant.hpp"

#include "intel_gpu/plugin/program_builder.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "intel_gpu/plugin/simple_math.hpp"
#include "intel_gpu/primitives/custom_gpu_primitive.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/paged_attention.hpp"

namespace ov {
namespace intel_gpu {
@@ -102,42 +100,6 @@ class CustomLayerAttributeVisitor : public ov::AttributeVisitor {
std::map<std::string, std::string> m_values;
};

void CreatePagedAttention(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op) {
validate_inputs_count(op, {13});
auto inputs = p.GetInputInfo(op);
auto prim = cldnn::paged_attention(layer_type_name_ID(op), inputs);

auto key_cache_ps = op->get_input_partial_shape(3);
auto query_ps = op->get_input_partial_shape(0);
auto head_size = key_cache_ps[2].get_length();
auto kv_heads_num = key_cache_ps[1].get_length();
auto heads_num = query_ps[1].get_length() / head_size;

prim.head_size = head_size;
prim.kv_heads_num = kv_heads_num;
prim.heads_num = heads_num;

const size_t scale_idx = 9;
const size_t alibi_idx = 11;

std::shared_ptr<ov::op::v0::Constant> scale_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(scale_idx));
OPENVINO_ASSERT(scale_const != nullptr);
OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1);
prim.scale_val = scale_const->cast_vector<float>()[0];

std::shared_ptr<ov::op::v0::Constant> alibi_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(alibi_idx));
OPENVINO_ASSERT(alibi_const != nullptr);
prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0;

if (op->get_output_size() > 1) {
const auto scores_output_idx = 1;
const auto& users = op->get_output_target_inputs(scores_output_idx);
OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet");
}

p.add_primitive(*op, prim);
}

void CreateCustomOp(ProgramBuilder& p, const std::shared_ptr<ov::Node>& op, CustomLayerPtr customLayer) {
auto inputs = p.GetInputInfo(op);
std::string layerName = layer_type_name_ID(op);
63 changes: 63 additions & 0 deletions src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp
@@ -0,0 +1,63 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "intel_gpu/plugin/program_builder.hpp"
#include "intel_gpu/plugin/common_utils.hpp"

#include "openvino/op/constant.hpp"
#include "openvino/op/paged_attention.hpp"

#include "intel_gpu/primitives/paged_attention.hpp"

namespace ov {
namespace op {
namespace internal {
using PagedAttentionExtension = ov::op::PagedAttentionExtension;
} // namespace internal
} // namespace op
} // namespace ov

namespace ov {
namespace intel_gpu {

static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared_ptr<ov::op::PagedAttentionExtension>& op) {
validate_inputs_count(op, {13});
auto inputs = p.GetInputInfo(op);
auto prim = cldnn::paged_attention(layer_type_name_ID(op), inputs);

auto key_cache_ps = op->get_input_partial_shape(3);
auto query_ps = op->get_input_partial_shape(0);
auto head_size = key_cache_ps[2].get_length();
auto kv_heads_num = key_cache_ps[1].get_length();
auto heads_num = query_ps[1].get_length() / head_size;

prim.head_size = head_size;
prim.kv_heads_num = kv_heads_num;
prim.heads_num = heads_num;

const size_t scale_idx = 9;
const size_t alibi_idx = 11;

std::shared_ptr<ov::op::v0::Constant> scale_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(scale_idx));
OPENVINO_ASSERT(scale_const != nullptr);
OPENVINO_ASSERT(ov::shape_size(scale_const->get_output_shape(0)) == 1);
prim.scale_val = scale_const->cast_vector<float>()[0];

std::shared_ptr<ov::op::v0::Constant> alibi_const = std::dynamic_pointer_cast<ov::op::v0::Constant>(op->get_input_node_shared_ptr(alibi_idx));
OPENVINO_ASSERT(alibi_const != nullptr);
prim.has_alibi = ov::shape_size(alibi_const->get_output_shape(0)) > 0;

if (op->get_output_size() > 1) {
const auto scores_output_idx = 1;
const auto& users = op->get_output_target_inputs(scores_output_idx);
OPENVINO_ASSERT(users.size() == 0, "[GPU] PagedAttention implementation doesn't support scores output yet");
}

p.add_primitive(*op, prim);
}

REGISTER_FACTORY_IMPL(internal, PagedAttentionExtension)

} // namespace intel_gpu
} // namespace ov
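For readers unfamiliar with the shapes involved (an illustration, not part of the commit): CreatePagedAttentionExtensionOp derives the attention geometry from the operation's inputs, with the key cache assumed to be laid out as [num_blocks, kv_heads_num, head_size, block_size] and the query flattened to [tokens, heads_num * head_size]. A minimal sketch of that derivation:

#include <cstdint>
#include <vector>

struct AttentionGeometry {
    int64_t head_size;
    int64_t kv_heads_num;
    int64_t heads_num;
};

AttentionGeometry derive_geometry(const std::vector<int64_t>& key_cache_shape,
                                  const std::vector<int64_t>& query_shape) {
    AttentionGeometry geo{};
    geo.head_size = key_cache_shape[2];              // key_cache_ps[2]
    geo.kv_heads_num = key_cache_shape[1];           // key_cache_ps[1]
    geo.heads_num = query_shape[1] / geo.head_size;  // query_ps[1] / head_size
    return geo;
}

// Example: key_cache [num_blocks, 8, 64, 16] and query [tokens, 2048]
// -> head_size = 64, kv_heads_num = 8, heads_num = 2048 / 64 = 32 (grouped-query attention).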
7 changes: 1 addition & 6 deletions src/plugins/intel_gpu/src/plugin/program_builder.cpp
@@ -239,12 +239,7 @@ void ProgramBuilder::CreateSingleLayerPrimitive(const std::shared_ptr<ov::Node>& op) {
is_created = true;
break;
}

const std::string paged_attention_type = "PagedAttentionExtension";
if (op->get_type_name() == paged_attention_type) {
CreatePagedAttention(*this, op);
return;
}
op_type_info = op_type_info->parent;
}

if (!is_created) {
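For context, not part of the commit: with REGISTER_FACTORY_IMPL(internal, PagedAttentionExtension) registered in the new translation unit, the generic factory lookup in CreateSingleLayerPrimitive (which falls back to parent op types on a miss) finds the PagedAttention creator, so the deleted get_type_name() special case is no longer needed. A hypothetical sketch of that dispatch pattern, with invented names and a plain string-keyed map standing in for the real type-info registry:

#include <functional>
#include <map>
#include <memory>
#include <string>

// Stand-in for ov::Node and its type-info chain; names are invented for illustration.
struct Op {
    struct TypeInfo {
        std::string name;
        const TypeInfo* parent = nullptr;
    };
    const TypeInfo* type_info = nullptr;
};

using Factory = std::function<void(const std::shared_ptr<Op>&)>;
using FactoryMap = std::map<std::string, Factory>;

bool create_single_layer_primitive(const FactoryMap& factories, const std::shared_ptr<Op>& op) {
    bool is_created = false;
    const auto* op_type_info = op->type_info;
    while (op_type_info != nullptr) {
        auto it = factories.find(op_type_info->name);
        if (it != factories.end()) {
            it->second(op);                   // registered creator builds the primitive
            is_created = true;
            break;
        }
        op_type_info = op_type_info->parent;  // fall back to the parent op type
    }
    return is_created;
}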
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -841,7 +841,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
}

cldnn::event::ptr ret_event = nullptr;
if (!is_remote && !convert_needed) {
if (!is_remote_tensor_impl && !is_generic_remote && !convert_needed) {
auto src_ptr = static_cast<uint8_t*>(user_tensor->data());
if (!same_host_mem(memory, src_ptr)) {
// WA: Set need_lockable_mem as a blocking argument
