Skip to content

Commit

Permalink
[GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
sshlyapn committed Oct 19, 2023
1 parent 9e8a78a commit ca53124
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -867,16 +867,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", "
<< "can_be_optimized=" << can_be_optimized() << ")" << std::endl;

const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order;
if (_exec_deps.empty() && dependencies.empty()) {
dependencies = events;
} else {
auto queue_type = get_network().get_stream().get_queue_type();
// Prepare dependencies events in case of OOO queue, CPU implementation,
// or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition)
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
if (out_of_order_queue || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
dependencies.reserve(dependencies.size() + _exec_deps.size());
for (auto& input : _exec_deps) {
if (input->is_input() && queue_type != QueueTypes::out_of_order)
if (input->is_input() && !out_of_order_queue)
continue;
auto id = input->id();
try {
Expand All @@ -891,6 +891,14 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
}

// Replace multiple events with single grouped event in case of barriers synchronization to prevent `_last_barrier_ev` usage as a dependency
// event of optimized_out instance's users, which may lead to unwanted extra synchronization of CPU impls with GPU kernels
if (_node->is_in_shape_of_subgraph() && can_be_optimized() && dependencies.size() > 1 && out_of_order_queue) {
// std::cout << "Combine events for " << id() << "\n";
auto grouped_ev = get_network().get_stream().group_events(dependencies);
dependencies = {grouped_ev};
}

{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);
Expand Down

0 comments on commit ca53124

Please sign in to comment.