WIP: Move scales to variable
sshlyapn committed Oct 14, 2024
1 parent a134521 commit 6d8c913
Showing 11 changed files with 215 additions and 86 deletions.
src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -876,21 +876,47 @@ void prepare_buffer_fusing::run(program& p) {
padding::DynamicDimsMask info_dynamic_pad;
info_dynamic_pad[concat_axis] = 1;
kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad;
GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before " << node.get_output_layout(false, 0) << "\n";
node.set_output_layout(kv_out_layout);
node.can_share_buffer(false);
GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n";

auto update_dep = [&info_dynamic_pad](program_node* dep) {
auto prev_layout = dep->get_output_layout();
auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) {
auto prev_layout = dep->get_output_layout(true, idx);
prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad;
dep->set_output_layout(prev_layout);
dep->set_output_layout(prev_layout, true, idx);
dep->can_share_buffer(false);
};

if (rv_prim) {
update_dep(rv_prim);
update_dep(rv_prim, info_dynamic_pad, 0);
}
if (gather_prim) {
update_dep(gather_prim);
update_dep(gather_prim, info_dynamic_pad, 0);
}

GPU_DEBUG_TRACE_DETAIL << "valid first? " << node.is_valid_output_layout(0) << "\n";
GPU_DEBUG_TRACE_DETAIL << "first output :" << node.get_output_layout(false, 0) << "\n";

if (node.get_primitive()->compressed) {
const auto scales_output_idx = 2;
auto scales_out_layout = node.get_output_layout(false, scales_output_idx);

const size_t scales_zp_concat_axis = 3;
padding::DynamicDimsMask info_dynamic_pad_scales;
info_dynamic_pad_scales[scales_zp_concat_axis] = 1;
GPU_DEBUG_TRACE_DETAIL << "Set this pad: " << info_dynamic_pad_scales << "\n";
scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales;
GPU_DEBUG_TRACE_DETAIL << "Pad after: " << info_dynamic_pad_scales << " " << scales_out_layout.data_padding._dynamic_dims_mask << " " << scales_out_layout.data_padding.is_dynamic() << "\n";
GPU_DEBUG_TRACE_DETAIL << scales_out_layout.to_string() << "\n";
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout before " << node.get_output_layout(false, scales_output_idx) << "\n";
node.set_output_layout(scales_out_layout, true, scales_output_idx);
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output after " << node.get_output_layout(false, scales_output_idx) << " " << node.get_output_layout(false, scales_output_idx).data_padding._dynamic_dims_mask << "\n";

if (rv_prim) {
update_dep(rv_prim, info_dynamic_pad_scales, 1);
}

GPU_DEBUG_TRACE_DETAIL << "valid 3rd? " << node.is_valid_output_layout(scales_output_idx) << "\n";
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout " << node.get_output_layout(false, scales_output_idx) << " " << info_dynamic_pad_scales << "\n";
}
}
});
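Note on the hunk above: `update_dep` now takes the pad mask and output index explicitly instead of capturing a single mask, so the same helper can mark both a dependency's KV output (dynamic pad on the sequence axis) and its new scales output (dynamic pad on axis 3). A minimal standalone model of that propagation, using `std::bitset` and simplified structs as hypothetical stand-ins for the real cldnn types:

```cpp
#include <bitset>
#include <cstddef>
#include <iostream>
#include <vector>

using DynamicDimsMask = std::bitset<8>;  // stand-in: one bit per dimension

struct Layout {
    DynamicDimsMask dynamic_pad;  // dims whose padding is resolved at runtime
};

struct Node {
    std::vector<Layout> outputs;
    bool share_buffer = true;
};

// Mirrors the reworked update_dep: mark output `idx` of the dependency as
// dynamically padded along the masked axes and forbid buffer sharing, since
// the padded region is appended to in place across iterations.
void update_dep(Node& dep, const DynamicDimsMask& mask, std::size_t idx) {
    dep.outputs[idx].dynamic_pad = mask;
    dep.share_buffer = false;
}

int main() {
    Node read_value;
    read_value.outputs.resize(2);  // output 0: KV data, output 1: scales

    DynamicDimsMask kv_mask;
    kv_mask.set(2);      // concat (sequence) axis of the KV data
    DynamicDimsMask scales_mask;
    scales_mask.set(3);  // scales_zp_concat_axis from the hunk above

    update_dep(read_value, kv_mask, 0);
    update_dep(read_value, scales_mask, 1);

    std::cout << read_value.outputs[1].dynamic_pad << "\n";  // prints 00001000
}
```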
71 changes: 37 additions & 34 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -76,7 +76,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {

cldnn::memory::ptr beam_table_prev = nullptr;
cldnn::memory::ptr beam_table_new = nullptr;
cldnn::memory::ptr compression_scale = nullptr;
// cldnn::memory::ptr compression_scale = nullptr;

void load(BinaryInputBuffer& ib) override {
parent::load(ib);
@@ -111,7 +111,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
} else if (stage == scale_concat_stage) {
// FIXME: indirectness and compression are orthogonal feature.
args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale]
args.outputs = { compression_scale };
args.outputs = { instance.output_memory_ptr(2) };
} else if (stage == dq_concat_stage) {
args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale]
args.outputs = { instance.output_memory_ptr(0), instance.output_memory_ptr(2) };
@@ -175,8 +175,9 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
// However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or
// variables memory was reallocated and we have to copy past KV-cache to new memory)
_kernels_data[concat_stage].kernels[1].skip_execution = true;
if (_kernels_data[concat_stage].kernels[0].skip_execution)
if (!_kernels_data[concat_stage].kernels[0].skip_execution) {
GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n";
}
}

execute_stage(events, instance, res_events, concat_stage);
@@ -217,39 +218,43 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
}

if (desc->compressed) {
const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false);
// const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false);
auto comp_scale_state =
dynamic_cast<ov::intel_gpu::VariableStateIndirectKVCache&>(variable).get_compression_scale_state();
auto comp_scale_layout = instance.get_impl_params()->output_layouts[2];
auto comp_scale_shape = comp_scale_layout.get_shape();
// auto comp_scale_layout = instance.get_impl_params()->output_layouts[2];
// auto comp_scale_shape = comp_scale_layout.get_shape();

bool skip_first_kernel = true;
const auto preallocation_size = instance.get_prealloc_iter_num();
// const auto preallocation_size = instance.get_prealloc_iter_num();
// const auto preallocation_size = 4;
if (compression_scale) {
GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n";
} else {
GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n";
}

if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) {
const auto concat_axis = 2;
auto alloc_shape = comp_scale_shape;
alloc_shape[concat_axis] += preallocation_size;
const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format};
GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl;
compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);

skip_first_kernel = comp_scale_state->get_layout().count() == 0;

if (comp_scale_state->get_layout().count() > 64) {
GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n";
}
}

instance.set_output_memory(compression_scale, false, 2);
GPU_DEBUG_TRACE_DETAIL << "Override Variable memory\n";
comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]);
// if (compression_scale) {
// GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n";
// } else {
// GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n";
// }

// if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) {
// const auto concat_axis = 3;
// auto alloc_shape = comp_scale_shape;
// alloc_shape[concat_axis] += preallocation_size;
// const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format};
// GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl;
// compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);

// skip_first_kernel = comp_scale_state->get_layout().count() == 0;

// if (comp_scale_state->get_layout().count() > 64) {
// GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n";
// }
// }

// instance.set_output_memory(compression_scale, false, 2);
// auto scales_layout = instance.get_impl_params()->output_layouts[2];
// size_t scale_concat_axis = 3;
// scales_layout.data_padding._upper_size[scale_concat_axis] =
// GPU_DEBUG_TRACE_DETAIL << "Override Variable memory with layout " << instance.get_impl_params()->output_layouts[2] << "\n";

// comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]);

if (!skip_first_kernel) {
GPU_DEBUG_TRACE_DETAIL << "Run copy of scales!\n";
@@ -260,8 +265,6 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
execute_stage(events, instance, res_events, scale_concat_stage);
}



auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic());
(_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]);
execute_stage(events, instance, res_events, dq_concat_stage);
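Note on the hunks above: the scale-concat stage now writes directly into `instance.output_memory_ptr(2)`, and the impl-private `compression_scale` buffer with its grow-on-demand reallocation is commented out — per the commit title, the scales move into the variable state like the KV data itself. For reference, a standalone sketch of the pattern being retired (simplified stand-in types, not the real cldnn engine API):

```cpp
#include <cstddef>
#include <vector>

struct Buffer {
    std::vector<float> data;
    std::size_t count() const { return data.size(); }
};

// Grow-on-demand pattern from the commented-out block: reallocate only when
// the required element count exceeds current capacity, over-allocating along
// the concat axis by a preallocation margin so following iterations can
// append in place without another reallocation.
void ensure_capacity(Buffer& scales, const std::vector<std::size_t>& shape,
                     std::size_t concat_axis, std::size_t prealloc) {
    std::size_t required = 1;
    for (auto d : shape) required *= d;
    if (scales.count() < required) {
        auto alloc_shape = shape;
        alloc_shape[concat_axis] += prealloc;  // room for future tokens
        std::size_t capacity = 1;
        for (auto d : alloc_shape) capacity *= d;
        scales.data.resize(capacity);
    }
}

int main() {
    Buffer scales;
    ensure_capacity(scales, {1, 32, 8, 16}, /*concat_axis=*/3, /*prealloc=*/10);
    return scales.count() == 1 * 32 * 8 * 26 ? 0 : 1;  // 16 + 10 along axis 3
}
```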
@@ -454,7 +457,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
const auto& primitive = impl_param.typed_desc<kv_cache>();
auto params = get_default_params<kernel_selector::concatenation_params>(impl_param, is_shape_agnostic);

const auto concat_axis = 2;
const auto concat_axis = 3;
params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank());

auto inputs_count = 1;
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/include/program_node.h
@@ -232,7 +232,7 @@ struct program_node {
}

void merge_output_padding(padding const& padd, size_t idx = 0) {
set_output_padding(padding::max(padd, output_layouts[idx].data_padding));
set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx);
}

// only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users
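Note on the fix above: `merge_output_padding` previously dropped `idx` when calling `set_output_padding`, so the defaulted parameter kicked in and the merged padding always landed on output 0. A minimal reproduction of the bug pattern with simplified stand-in types:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

struct NodeModel {
    std::vector<int> pad{0, 0, 0};  // one padding value per output

    void set_output_padding(int p, std::size_t idx = 0) { pad[idx] = p; }

    void merge_output_padding(int p, std::size_t idx = 0) {
        // Before the fix, idx was omitted and defaulted to 0:
        //   set_output_padding(std::max(p, pad[idx]));
        set_output_padding(std::max(p, pad[idx]), idx);  // fixed: forward idx
    }
};

int main() {
    NodeModel n;
    n.merge_output_padding(5, 2);  // merge into output 2
    return n.pad[2] == 5 ? 0 : 1;  // with the fix, output 2 is updated
}
```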
5 changes: 4 additions & 1 deletion src/plugins/intel_gpu/src/graph/kv_cache.cpp
@@ -48,6 +48,9 @@ std::vector<layout> kv_cache_inst::calc_output_layouts(kv_cache_node const& node
std::vector<ShapeType> output_shapes = desc->compressed ? shape_infer(&op, input_shapes, desc->group_sizes, desc->scales_output_order)
: shape_infer(&op, input_shapes);

if (desc->num_outputs == 3)
GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n";

static const std::map<size_t, size_t> ports_map = {{0, 0}, {1, 2}};

std::vector<layout> out_layouts;
@@ -95,7 +98,7 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
// iteration.
// - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,
// we assigned different prealloc-size for each kv cache so that we could prevent a memory peak
return 128 + kv_cache_id % 64;
return 10;
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
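Note on the hunk above: the comment describes a staggering scheme — giving each kv_cache instance a slightly different preallocation size desynchronizes the iterations at which their buffers grow, so the reallocations (and the transient old-plus-new memory peak) don't hit all instances at once. The pre-change formula as a standalone sketch; the WIP commit pins the value to 10, presumably as a temporary debug setting:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// Pre-change behavior: a 128-iteration base plus a per-instance offset in
// [0, 63], so nearby kv_cache instances reallocate on different steps.
int32_t prealloc_iter_num(std::size_t kv_cache_id) {
    return 128 + static_cast<int32_t>(kv_cache_id % 64);
}

int main() {
    for (std::size_t id = 0; id < 4; ++id)
        std::cout << "kv_cache " << id << " -> prealloc " << prealloc_iter_num(id) << "\n";
}
```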