WIP: Move scales to variable
sshlyapn committed Oct 14, 2024
1 parent a134521 commit 6d8c913
Showing 11 changed files with 215 additions and 86 deletions.
src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -876,21 +876,47 @@ void prepare_buffer_fusing::run(program& p) {
padding::DynamicDimsMask info_dynamic_pad;
info_dynamic_pad[concat_axis] = 1;
kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad;
GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before " << node.get_output_layout(false, 0) << "\n";
node.set_output_layout(kv_out_layout);
node.can_share_buffer(false);
GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n";

auto update_dep = [&info_dynamic_pad](program_node* dep) {
auto prev_layout = dep->get_output_layout();
auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) {
auto prev_layout = dep->get_output_layout(true, idx);
prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad;
dep->set_output_layout(prev_layout);
dep->set_output_layout(prev_layout, true, idx);
dep->can_share_buffer(false);
};

if (rv_prim) {
update_dep(rv_prim);
update_dep(rv_prim, info_dynamic_pad, 0);
}
if (gather_prim) {
update_dep(gather_prim);
update_dep(gather_prim, info_dynamic_pad, 0);
}

GPU_DEBUG_TRACE_DETAIL << "valid first? " << node.is_valid_output_layout(0) << "\n";
GPU_DEBUG_TRACE_DETAIL << "first output :" << node.get_output_layout(false, 0) << "\n";

if (node.get_primitive()->compressed) {
const auto scales_output_idx = 2;
auto scales_out_layout = node.get_output_layout(false, scales_output_idx);

const size_t scales_zp_concat_axis = 3;
padding::DynamicDimsMask info_dynamic_pad_scales;
info_dynamic_pad_scales[scales_zp_concat_axis] = 1;
GPU_DEBUG_TRACE_DETAIL << "Set this pad: " << info_dynamic_pad_scales << "\n";
scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales;
GPU_DEBUG_TRACE_DETAIL << "Pad after: " << info_dynamic_pad_scales << " " << scales_out_layout.data_padding._dynamic_dims_mask << " " << scales_out_layout.data_padding.is_dynamic() << "\n";
GPU_DEBUG_TRACE_DETAIL << scales_out_layout.to_string() << "\n";
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout before " << node.get_output_layout(false, scales_output_idx) << "\n";
node.set_output_layout(scales_out_layout, true, scales_output_idx);
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output after " << node.get_output_layout(false, scales_output_idx) << " " << node.get_output_layout(false, scales_output_idx).data_padding._dynamic_dims_mask << "\n";

if (rv_prim) {
update_dep(rv_prim, info_dynamic_pad_scales, 1);
}

GPU_DEBUG_TRACE_DETAIL << "valid 3rd? " << node.is_valid_output_layout(scales_output_idx) << "\n";
GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout " << node.get_output_layout(false, scales_output_idx) << " " << info_dynamic_pad_scales << "\n";
}
}
});
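Note on the hunk above: `update_dep` now takes the pad mask and output index explicitly instead of capturing a single mask, so the same helper can mark both a dependency's KV output (dynamic pad on the sequence axis) and its new scales output (dynamic pad on axis 3). A minimal standalone model of that propagation, using `std::bitset` and simplified structs as hypothetical stand-ins for the real cldnn types:

```cpp
#include <bitset>
#include <cstddef>
#include <iostream>
#include <vector>

using DynamicDimsMask = std::bitset<8>;  // stand-in: one bit per dimension

struct Layout {
    DynamicDimsMask dynamic_pad;  // dims whose padding is resolved at runtime
};

struct Node {
    std::vector<Layout> outputs;
    bool share_buffer = true;
};

// Mirrors the reworked update_dep: mark output `idx` of the dependency as
// dynamically padded along the masked axes and forbid buffer sharing, since
// the padded region is appended to in place across iterations.
void update_dep(Node& dep, const DynamicDimsMask& mask, std::size_t idx) {
    dep.outputs[idx].dynamic_pad = mask;
    dep.share_buffer = false;
}

int main() {
    Node read_value;
    read_value.outputs.resize(2);  // output 0: KV data, output 1: scales

    DynamicDimsMask kv_mask;
    kv_mask.set(2);      // concat (sequence) axis of the KV data
    DynamicDimsMask scales_mask;
    scales_mask.set(3);  // scales_zp_concat_axis from the hunk above

    update_dep(read_value, kv_mask, 0);
    update_dep(read_value, scales_mask, 1);

    std::cout << read_value.outputs[1].dynamic_pad << "\n";  // prints 00001000
}
```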
71 changes: 37 additions & 34 deletions src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -76,7 +76,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {

cldnn::memory::ptr beam_table_prev = nullptr;
cldnn::memory::ptr beam_table_new = nullptr;
cldnn::memory::ptr compression_scale = nullptr;
// cldnn::memory::ptr compression_scale = nullptr;

void load(BinaryInputBuffer& ib) override {
parent::load(ib);
@@ -111,7 +111,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
} else if (stage == scale_concat_stage) {
// FIXME: indirectness and compression are orthogonal feature.
args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale]
args.outputs = { compression_scale };
args.outputs = { instance.output_memory_ptr(2) };
} else if (stage == dq_concat_stage) {
args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale]
args.outputs = { instance.output_memory_ptr(0), instance.output_memory_ptr(2) };
@@ -175,8 +175,9 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
// However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or
// variables memory was reallocated and we have to copy past KV-cache to new memory)
_kernels_data[concat_stage].kernels[1].skip_execution = true;
if (_kernels_data[concat_stage].kernels[0].skip_execution)
if (!_kernels_data[concat_stage].kernels[0].skip_execution) {
GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n";
}
}

execute_stage(events, instance, res_events, concat_stage);
@@ -217,39 +218,43 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
}

if (desc->compressed) {
const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false);
// const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false);
auto comp_scale_state =
dynamic_cast<ov::intel_gpu::VariableStateIndirectKVCache&>(variable).get_compression_scale_state();
auto comp_scale_layout = instance.get_impl_params()->output_layouts[2];
auto comp_scale_shape = comp_scale_layout.get_shape();
// auto comp_scale_layout = instance.get_impl_params()->output_layouts[2];
// auto comp_scale_shape = comp_scale_layout.get_shape();

bool skip_first_kernel = true;
const auto preallocation_size = instance.get_prealloc_iter_num();
// const auto preallocation_size = instance.get_prealloc_iter_num();
// const auto preallocation_size = 4;
if (compression_scale) {
GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n";
} else {
GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n";
}

if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) {
const auto concat_axis = 2;
auto alloc_shape = comp_scale_shape;
alloc_shape[concat_axis] += preallocation_size;
const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format};
GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl;
compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);

skip_first_kernel = comp_scale_state->get_layout().count() == 0;

if (comp_scale_state->get_layout().count() > 64) {
GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n";
}
}

instance.set_output_memory(compression_scale, false, 2);
GPU_DEBUG_TRACE_DETAIL << "Override Variable memory\n";
comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]);
// if (compression_scale) {
// GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n";
// } else {
// GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n";
// }

// if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) {
// const auto concat_axis = 3;
// auto alloc_shape = comp_scale_shape;
// alloc_shape[concat_axis] += preallocation_size;
// const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format};
// GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl;
// compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false);

// skip_first_kernel = comp_scale_state->get_layout().count() == 0;

// if (comp_scale_state->get_layout().count() > 64) {
// GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n";
// }
// }

// instance.set_output_memory(compression_scale, false, 2);
// auto scales_layout = instance.get_impl_params()->output_layouts[2];
// size_t scale_concat_axis = 3;
// scales_layout.data_padding._upper_size[scale_concat_axis] =
// GPU_DEBUG_TRACE_DETAIL << "Override Variable memory with layout " << instance.get_impl_params()->output_layouts[2] << "\n";

// comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]);

if (!skip_first_kernel) {
GPU_DEBUG_TRACE_DETAIL << "Run copy of scales!\n";
@@ -260,8 +265,6 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
execute_stage(events, instance, res_events, scale_concat_stage);
}



auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic());
(_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]);
execute_stage(events, instance, res_events, dq_concat_stage);
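Note on the hunks above: the scale-concat stage now writes directly into `instance.output_memory_ptr(2)`, and the impl-private `compression_scale` buffer with its grow-on-demand reallocation is commented out — per the commit title, the scales move into the variable state like the KV data itself. For reference, a standalone sketch of the pattern being retired (simplified stand-in types, not the real cldnn engine API):

```cpp
#include <cstddef>
#include <vector>

struct Buffer {
    std::vector<float> data;
    std::size_t count() const { return data.size(); }
};

// Grow-on-demand pattern from the commented-out block: reallocate only when
// the required element count exceeds current capacity, over-allocating along
// the concat axis by a preallocation margin so following iterations can
// append in place without another reallocation.
void ensure_capacity(Buffer& scales, const std::vector<std::size_t>& shape,
                     std::size_t concat_axis, std::size_t prealloc) {
    std::size_t required = 1;
    for (auto d : shape) required *= d;
    if (scales.count() < required) {
        auto alloc_shape = shape;
        alloc_shape[concat_axis] += prealloc;  // room for future tokens
        std::size_t capacity = 1;
        for (auto d : alloc_shape) capacity *= d;
        scales.data.resize(capacity);
    }
}

int main() {
    Buffer scales;
    ensure_capacity(scales, {1, 32, 8, 16}, /*concat_axis=*/3, /*prealloc=*/10);
    return scales.count() == 1 * 32 * 8 * 26 ? 0 : 1;  // 16 + 10 along axis 3
}
```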
@@ -454,7 +457,7 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
const auto& primitive = impl_param.typed_desc<kv_cache>();
auto params = get_default_params<kernel_selector::concatenation_params>(impl_param, is_shape_agnostic);

const auto concat_axis = 2;
const auto concat_axis = 3;
params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank());

auto inputs_count = 1;
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/include/program_node.h
@@ -232,7 +232,7 @@ struct program_node {
}

void merge_output_padding(padding const& padd, size_t idx = 0) {
set_output_padding(padding::max(padd, output_layouts[idx].data_padding));
set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx);
}

// only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users
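Note on the fix above: `merge_output_padding` previously dropped `idx` when calling `set_output_padding`, so the defaulted parameter kicked in and the merged padding always landed on output 0. A minimal reproduction of the bug pattern with simplified stand-in types:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

struct NodeModel {
    std::vector<int> pad{0, 0, 0};  // one padding value per output

    void set_output_padding(int p, std::size_t idx = 0) { pad[idx] = p; }

    void merge_output_padding(int p, std::size_t idx = 0) {
        // Before the fix, idx was omitted and defaulted to 0:
        //   set_output_padding(std::max(p, pad[idx]));
        set_output_padding(std::max(p, pad[idx]), idx);  // fixed: forward idx
    }
};

int main() {
    NodeModel n;
    n.merge_output_padding(5, 2);  // merge into output 2
    return n.pad[2] == 5 ? 0 : 1;  // with the fix, output 2 is updated
}
```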
5 changes: 4 additions & 1 deletion src/plugins/intel_gpu/src/graph/kv_cache.cpp
@@ -48,6 +48,9 @@ std::vector<layout> kv_cache_inst::calc_output_layouts(kv_cache_node const& node
std::vector<ShapeType> output_shapes = desc->compressed ? shape_infer(&op, input_shapes, desc->group_sizes, desc->scales_output_order)
: shape_infer(&op, input_shapes);

if (desc->num_outputs == 3)
GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n";

static const std::map<size_t, size_t> ports_map = {{0, 0}, {1, 2}};

std::vector<layout> out_layouts;
@@ -95,7 +98,7 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
// iteration.
// - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,
// we assigned different prealloc-size for each kv cache so that we could prevent a memory peak
return 128 + kv_cache_id % 64;
return 10;
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
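Note on the hunk above: the comment describes a staggering scheme — giving each kv_cache instance a slightly different preallocation size desynchronizes the iterations at which their buffers grow, so the reallocations (and the transient old-plus-new memory peak) don't hit all instances at once. The pre-change formula as a standalone sketch; the WIP commit pins the value to 10, presumably as a temporary debug setting:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

// Pre-change behavior: a 128-iteration base plus a per-instance offset in
// [0, 63], so nearby kv_cache instances reallocate on different steps.
int32_t prealloc_iter_num(std::size_t kv_cache_id) {
    return 128 + static_cast<int32_t>(kv_cache_id % 64);
}

int main() {
    for (std::size_t id = 0; id < 4; ++id)
        std::cout << "kv_cache " << id << " -> prealloc " << prealloc_iter_num(id) << "\n";
}
```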