From ca53124cacbe76989ddfe97ae5022df50111095b Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 19 Oct 2023 09:49:25 +0400 Subject: [PATCH] [GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index fd6cea4b5154d5..eec7776d27e762 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -867,16 +867,16 @@ event::ptr primitive_inst::execute(const std::vector& events) { GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", " << "can_be_optimized=" << can_be_optimized() << ")" << std::endl; + const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order; if (_exec_deps.empty() && dependencies.empty()) { dependencies = events; } else { - auto queue_type = get_network().get_stream().get_queue_type(); // Prepare dependencies events in case of OOO queue, CPU implementation, // or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition) - if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) { + if (out_of_order_queue || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) { dependencies.reserve(dependencies.size() + _exec_deps.size()); for (auto& input : _exec_deps) { - if (input->is_input() && queue_type != QueueTypes::out_of_order) + if (input->is_input() && !out_of_order_queue) continue; auto id = input->id(); try { @@ -891,6 +891,14 @@ event::ptr primitive_inst::execute(const std::vector& events) { } } + // Replace multiple events with single grouped event in case of barriers synchronization to prevent `_last_barrier_ev` usage as a dependency + // event of optimized_out instance's users, which may lead to unwanted extra synchronization of CPU impls with GPU kernels + if (_node->is_in_shape_of_subgraph() && can_be_optimized() && dependencies.size() > 1 && out_of_order_queue) { + // std::cout << "Combine events for " << id() << "\n"; + auto grouped_ev = get_network().get_stream().group_events(dependencies); + dependencies = {grouped_ev}; + } + { GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference); auto ev = _impl->execute(dependencies, *this);