diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 5b43caf43c43ee..2144fa4c730540 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -876,21 +876,47 @@ void prepare_buffer_fusing::run(program& p) { padding::DynamicDimsMask info_dynamic_pad; info_dynamic_pad[concat_axis] = 1; kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before " << node.get_output_layout(false, 0) << "\n"; node.set_output_layout(kv_out_layout); node.can_share_buffer(false); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n"; - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + GPU_DEBUG_TRACE_DETAIL << "valid first? " << node.is_valid_output_layout(0) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "first output: " << node.get_output_layout(false, 0) << "\n"; + + if (node.get_primitive()->compressed) { + const auto scales_output_idx = 2; + auto scales_out_layout = node.get_output_layout(false, scales_output_idx); + + const size_t scales_zp_concat_axis = 3; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + GPU_DEBUG_TRACE_DETAIL << "Set this pad: " << info_dynamic_pad_scales << "\n"; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + GPU_DEBUG_TRACE_DETAIL << "Pad after: " << info_dynamic_pad_scales << " " << scales_out_layout.data_padding._dynamic_dims_mask << " " << scales_out_layout.data_padding.is_dynamic() << "\n"; + GPU_DEBUG_TRACE_DETAIL << scales_out_layout.to_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout before " << node.get_output_layout(false, scales_output_idx) << "\n"; + node.set_output_layout(scales_out_layout, true, scales_output_idx); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout after " << node.get_output_layout(false, scales_output_idx) << " " << node.get_output_layout(false, scales_output_idx).data_padding._dynamic_dims_mask << "\n"; + + update_dep(rv_prim, info_dynamic_pad_scales, 1); + + GPU_DEBUG_TRACE_DETAIL << "valid 3rd output? 
" << node.is_valid_output_layout(scales_output_idx) << "\n"; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout " << node.get_output_layout(false, scales_output_idx) << " " << info_dynamic_pad << "\n"; } } }); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index 1b4421c2689fc2..5b59f75f6aff5d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -76,7 +76,7 @@ struct kv_cache_impl : multi_stage_primitive { cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; - cldnn::memory::ptr compression_scale = nullptr; + // cldnn::memory::ptr compression_scale = nullptr; void load(BinaryInputBuffer& ib) override { parent::load(ib); @@ -111,7 +111,7 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == scale_concat_stage) { // FIXME: indirectness and compression are orthogonal feature. args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale] - args.outputs = { compression_scale }; + args.outputs = { instance.output_memory_ptr(2) }; } else if (stage == dq_concat_stage) { args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale] args.outputs = { instance.output_memory_ptr(0), instance.output_memory_ptr(2) }; @@ -175,8 +175,9 @@ struct kv_cache_impl : multi_stage_primitive { // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or // variables memory was reallocated and we have to copy past KV-cache to new memory) _kernels_data[concat_stage].kernels[1].skip_execution = true; - if (_kernels_data[concat_stage].kernels[0].skip_execution) + if (!_kernels_data[concat_stage].kernels[0].skip_execution) { GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n"; + } } execute_stage(events, instance, res_events, concat_stage); @@ -217,39 +218,43 @@ struct kv_cache_impl : multi_stage_primitive { } if (desc->compressed) { - const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false); + // const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false); auto comp_scale_state = dynamic_cast(variable).get_compression_scale_state(); - auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; - auto comp_scale_shape = comp_scale_layout.get_shape(); + // auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; + // auto comp_scale_shape = comp_scale_layout.get_shape(); bool skip_first_kernel = true; - const auto preallocation_size = instance.get_prealloc_iter_num(); + // const auto preallocation_size = instance.get_prealloc_iter_num(); // const auto preallocation_size = 4; - if (compression_scale) { - GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n"; - } else { - GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n"; - } - - if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) { - const auto concat_axis = 2; - auto alloc_shape = comp_scale_shape; - alloc_shape[concat_axis] += preallocation_size; - const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; - GPU_DEBUG_TRACE_DETAIL << "Realloc 
compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; - compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); - - skip_first_kernel = comp_scale_state->get_layout().count() == 0; - - if (comp_scale_state->get_layout().count() > 64) { - GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n"; - } - } - - instance.set_output_memory(compression_scale, false, 2); - GPU_DEBUG_TRACE_DETAIL << "Override Variable memory\n"; - comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]); + // if (compression_scale) { + // GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n"; + // } else { + // GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n"; + // } + + // if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) { + // const auto concat_axis = 3; + // auto alloc_shape = comp_scale_shape; + // alloc_shape[concat_axis] += preallocation_size; + // const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; + // GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; + // compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); + + // skip_first_kernel = comp_scale_state->get_layout().count() == 0; + + // if (comp_scale_state->get_layout().count() > 64) { + // GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. 
Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n"; + // } + // } + + // instance.set_output_memory(compression_scale, false, 2); + // auto scales_layout = instance.get_impl_params()->output_layouts[2]; + // size_t scale_concat_axis = 3; + // scales_layout.data_padding._upper_size[scale_concat_axis] = + // GPU_DEBUG_TRACE_DETAIL << "Override Variable memory with layoyut " << instance.get_impl_params()->output_layouts[2] << "\n"; + + // comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]); if (!skip_first_kernel) { GPU_DEBUG_TRACE_DETAIL << "Run copy of scales!\n"; @@ -260,8 +265,6 @@ struct kv_cache_impl : multi_stage_primitive { execute_stage(events, instance, res_events, scale_concat_stage); } - - auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); (_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]); execute_stage(events, instance, res_events, dq_concat_stage); @@ -454,7 +457,7 @@ struct kv_cache_impl : multi_stage_primitive { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); - const auto concat_axis = 2; + const auto concat_axis = 3; params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); auto inputs_count = 1; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 029755c4733fe4..323d630732b5c4 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -232,7 +232,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 325447a5b71d31..dbaf2c55e40eb3 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -48,6 +48,9 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node std::vector output_shapes = desc->compressed ? shape_infer(&op, input_shapes, desc->group_sizes, desc->scales_output_order) : shape_infer(&op, input_shapes); + if (desc->num_outputs == 3) + GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n"; + static const std::map ports_map = {{0, 0}, {1, 2}}; std::vector out_layouts; @@ -95,7 +98,7 @@ int32_t kv_cache_inst::get_prealloc_iter_num() { // iteration. 
// - Therfore, to avoid this situation where the allocation and copying occurs simutaneously for all the kv_cache_insts, // we assigned different prealloc-size for each kv cache so that we could prevent a memory peak - return 128 + kv_cache_id % 64; + return 10; } void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 09480176df8afd..239e7f3953c547 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -490,6 +490,12 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output\n"; + auto& multi_tensors_variable = downcast(variable); + _impl_params->output_layouts[1] = multi_tensors_variable.get_compression_scale_state()->get_layout(); + } } if (get_node().is_type()) { @@ -572,6 +578,13 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + auto prim = _node->as().get_primitive(); + if (prim->compressed) { + const auto& multi_tensor_var = downcast(variable); + _outputs[2] = multi_tensor_var.get_compression_scale_state()->get_memory(); + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -719,13 +732,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2)) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + } else if (i == 2) { + seq_axis = 3; + } + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -758,11 +777,17 @@ event::ptr primitive_inst::realloc_if_needed() { // continue; std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2)) { const auto& desc = _node->as().get_primitive(); auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = static_cast(desc->concat_axis >= 0 ? 
desc->concat_axis : shape_rank + desc->concat_axis); + seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + } else if (i == 2) { + seq_axis = 3; + } + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -778,20 +803,20 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && (i == 0 || i == 2)) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) : 3; GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -853,7 +878,26 @@ event::ptr primitive_inst::realloc_if_needed() { sequence_axis, "present_layout"); if (max_pad > 0) { + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto sequence_axis = 3; + GPU_DEBUG_TRACE_DETAIL << id() << " is kv_cache => set the variable with newly allocated output memory" + << std::endl; + + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales pad (" << max_pad << " " << sequence_axis << "): " << present_scales_layout.to_string() << "\n"; + if (!axis_is_outer_most) { + _impl_params->output_layouts[2] = present_scales_layout; + } + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated data pad (" << max_pad << " " << sequence_axis << "): " << present_layout.to_string() << "\n"; if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; set_shape_change(); @@ -873,12 +917,28 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_layout(present_scales_layout); + } } } @@ -1241,6 +1301,13 @@ void primitive_inst::do_runtime_in_place_kv_cache() { return; } const auto& desc = _node->as().get_primitive(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Original layouts\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[0] << "\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[3] << "\n"; + } + auto& past_layout = _impl_params->input_layouts[0]; auto& new_layout = _impl_params->input_layouts[1]; auto& present_layout = _impl_params->output_layouts[0]; @@ -1268,11 +1335,33 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[1]!\n"; + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto 
sequence_axis = 3; + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_layout(present_scales_layout); + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + GPU_DEBUG_TRACE_DETAIL << "Updated data layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[0] << "\n"; + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = 3; + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[3] << "\n"; + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 0b7b0ca4ca2b1b..047e04f88d7e8a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -435,10 +435,15 @@ layout program_node::get_non_padded_output_layout(bool invalidate_users_if_chang } bool program_node::set_output_layout(layout& new_layout, bool invalidate_users_if_changed, size_t idx) { + // GPU_DEBUG_TRACE_DETAIL << "TEST: " << padding::max(new_layout.data_padding, output_layouts[idx].data_padding)._dynamic_dims_mask << "\n"; + merge_output_padding(new_layout.data_padding, idx); + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[1] " << new_layout.to_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[2] " << output_layouts[idx].data_padding._dynamic_dims_mask << "\n"; OPENVINO_ASSERT(idx < output_layouts.size(), id(), " has invalid index : index is ", std::to_string(idx), " but output_layouts length is ", std::to_string(output_layouts.size())); new_layout.data_padding = output_layouts[idx].data_padding; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[3] " << new_layout.to_string() << "\n"; bool changed = (new_layout != output_layouts[idx]); if (changed && invalidate_users_if_changed) // output_layout has changed! 
invalidate users invalidate_users(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl index c5dc3609133192..656699162aeba0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl @@ -62,7 +62,7 @@ KERNEL(dynamic_quantize_gpu_opt_generic)( DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); // the innermost dimension is always handled in the loop inside the kernel - const uint x = 0; + uint x = 0; half max_value = 0.0001h; half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 8e3bf05ee11f18..eddda57006bf9f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -258,11 +258,11 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; #endif @@ -584,10 +584,10 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; #endif @@ -655,10 +655,10 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint 
value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; #endif @@ -928,12 +928,18 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -944,7 +950,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -954,7 +960,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -965,7 +971,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -973,7 +979,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -1053,10 +1059,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid); #else // const uint key_scale_comp_offset = 
GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, seq_len + sglid); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); @@ -1102,10 +1108,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, seq_len + sglid); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); @@ -1185,12 +1191,7 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { -#if HAS_SCALE_INPUT - const OUTPUT_TYPE scale_val = *scale; -#else - const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); -#endif - qk_acc[i] *= scale_val; + #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -1320,10 +1321,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); @@ -1386,10 +1387,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = 
GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); @@ -1461,10 +1462,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index 906ad3dcb23d5e..6d157fe901efcb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -13,13 +13,13 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { - int USE_REF = 0; - if (const auto env_var = std::getenv("USE_REF")) { + int USE_REF_SDPA = 0; + if (const auto env_var = std::getenv("USE_REF_SDPA")) { std::istringstream ss(env_var); - ss >> USE_REF; + ss >> USE_REF_SDPA; } - if (!USE_REF) { + if (!USE_REF_SDPA) { Attach(); Attach(); #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index d7c2c0170aa3a8..1f591d9571dc2a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -126,8 +126,8 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { std::vector scales_output_order(rank, 1); scales_output_order[0] = transposed_order[0]; scales_output_order[1] = transposed_order[3]; - scales_output_order[2] = transposed_order[2]; - 
scales_output_order[3] = transposed_order[1]; + scales_output_order[2] = transposed_order[1]; + scales_output_order[3] = transposed_order[2]; return scales_output_order; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index fa7e803c1fcc74..da160e26aa6749 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -174,7 +174,9 @@ std::vector shape_infer(const KVCache* op, ov::op::internal::DynamicQuantize op; auto new_token_data_quantized_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, {input_shapes[1]}, group_sizes, scales_output_order); - const auto scales_concat_axis = 2; + std::cout << "KV input: " << input_shapes[0] << " " << input_shapes[1] << " " << input_shapes[2] << " " << input_shapes[3] << "\n"; + std::cout << "DQ output results for KV: " << new_token_data_quantized_shapes[0] << " " << new_token_data_quantized_shapes[1] << "\n"; + const auto scales_concat_axis = 3; ov::PartialShape compression_scale_shape = input_shapes[3]; compression_scale_shape[scales_concat_axis] += new_token_data_quantized_shapes[1][scales_concat_axis]; out_shapes[2] = compression_scale_shape;
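
Note: the sketch below illustrates, in plain C++, the bookkeeping this patch moves to axis 3 for the compression-scales tensor: past and new scales are concatenated along the sequence axis, and spare upper padding along that same axis lets new tokens be appended in place. It is a simplified illustration only, under assumed names: Shape, concat_scales_shape, and compute_upper_pad are hypothetical helpers, the example extents are made up, and none of this is the plugin's actual ov::PartialShape / layout-padding API.

// Minimal sketch of the scales-shape bookkeeping for a compressed KV-cache,
// assuming a rank-4 scales shape whose sequence dimension is axis 3 (was axis 2).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;
constexpr size_t scales_concat_axis = 3;  // sequence axis of the scales tensor after this change

// Shape of the concatenated scales output: the past scales grow by the new tokens'
// scales along the concat (sequence) axis; all other dimensions stay the same.
Shape concat_scales_shape(const Shape& past_scales, const Shape& new_scales) {
    assert(past_scales.size() == 4 && new_scales.size() == 4);
    Shape out = past_scales;
    out[scales_concat_axis] += new_scales[scales_concat_axis];
    return out;
}

// Upper padding that allows in-place appends: the gap between the preallocated
// capacity and the currently used extent along the concat axis.
int64_t compute_upper_pad(int64_t allocated_seq_capacity, int64_t used_seq_len) {
    return std::max<int64_t>(allocated_seq_capacity - used_seq_len, 0);
}

int main() {
    Shape past_scales{1, 1, 1, 128};  // 128 cached positions (non per-head case for simplicity)
    Shape new_scales{1, 1, 1, 1};     // one newly generated token

    Shape present_scales = concat_scales_shape(past_scales, new_scales);
    std::cout << "present scales seq extent: " << present_scales[scales_concat_axis] << "\n";  // 129

    // With a preallocated capacity of, say, 139 positions, 10 more tokens fit
    // before the scales buffer has to be reallocated.
    std::cout << "upper pad: " << compute_upper_pad(139, present_scales[scales_concat_axis]) << "\n";  // 10
    return 0;
}

In the patch itself, the same add-along-axis-3 update appears in the KVCache shape_infer (compression_scale_shape[scales_concat_axis] += ...), while the padding counterpart is handled through kv_cache_inst::get_max_pad / update_pad with sequence_axis = 3 for the scales output.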