[GPU] Improvement of event-related primitives code (openvinotoolkit#27337)

### Details:
- Removed the `_events` map from the network class. Dependency and result events are now stored in `kernel_impl_params` for each primitive.
- User events are no longer created for CPU impls with barrier-based synchronization, which avoids useless OCL API calls (`clCreateUserEvent` -> `clSetUserEventStatus` -> `clReleaseEvent`). As a result, methods may return `nullptr` instead of a user event (see the first sketch below).
- Updated the `ocl_stream::wait_for_events` impl to deal with C event handles (`cl_event`) instead of the C++ wrapper, avoiding redundant `clRetainEvent` calls (see the second sketch below).
- Introduced an `ExecutionFlags` structure that reflects the execution status of a primitive. Methods that prepare a dynamic primitive for execution now modify/check these flags instead of assorted `primitive_inst` attributes or explicit function arguments (see the third sketch below).
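
For illustration, a rough sketch of the new CPU-impl event convention. The `run_cpu_impl` and `consume_result` names are hypothetical call sites; only `create_user_event`, `finish`, and `wait` mirror the existing stream/event API:

```cpp
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/stream.hpp"

using namespace cldnn;

// Hypothetical CPU-impl epilogue: allocate a user event only when the
// application may wait on this primitive's result.
event::ptr run_cpu_impl(stream& s, bool is_output) {
    // ... actual CPU computation happens here, synchronized via barriers ...
    if (is_output)
        return s.create_user_event(true);  // already-completed user event for the app
    return nullptr;                        // skip clCreateUserEvent/clSetUserEventStatus/clReleaseEvent
}

// Hypothetical caller: a null event means "synchronize through the stream".
void consume_result(stream& s, const event::ptr& ev) {
    if (!ev)
        s.finish();
    else
        ev->wait();
}
```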
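
The `wait_for_events` change can be illustrated with plain OpenCL C API calls (a generic sketch, not the actual `ocl_stream` code): waiting directly on raw `cl_event` handles avoids the retain/release pair that copying the C++ `cl::Event` wrapper performs.

```cpp
#include <CL/cl.h>
#include <vector>

// Generic illustration: copying cl_event values does not touch their reference
// counts, so no clRetainEvent/clReleaseEvent is issued just to build this list.
void wait_for_raw_events(const std::vector<cl_event>& handles) {
    if (handles.empty())
        return;
    clWaitForEvents(static_cast<cl_uint>(handles.size()), handles.data());
}
```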
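
A minimal sketch of how the bitset-based flags replace the old boolean attributes and explicit arguments. `ExecutionFlags` is copied from the diff below; `prepare_primitive` and `needs_memory_reset` are hypothetical helpers, not code from this change:

```cpp
#include <bitset>
#include <cstddef>

// Copied from the diff below: each bit records one aspect of the primitive's state.
struct ExecutionFlags : public std::bitset<4> {
    static const size_t SHAPE_CHANGED = 0;
    static const size_t IMPL_CHANGED = 1;
    static const size_t MEMORY_CHANGED = 2;
    static const size_t SKIP = 3;
};

// Hypothetical preparation step for a dynamic primitive: instead of passing
// booleans around or mutating primitive_inst members, it just flips bits.
void prepare_primitive(ExecutionFlags& flags, bool shape_changed, bool can_be_skipped) {
    if (shape_changed)
        flags.set(ExecutionFlags::SHAPE_CHANGED);
    if (can_be_skipped)
        flags.set(ExecutionFlags::SKIP);
}

// Later stages only consult the flags, e.g. to decide whether memory must be reset.
bool needs_memory_reset(const ExecutionFlags& flags) {
    return flags.test(ExecutionFlags::SHAPE_CHANGED) && !flags.test(ExecutionFlags::SKIP);
}
```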

---------

Signed-off-by: Vladimir Paramuzov <[email protected]>
vladimir-paramuzov authored and ababushk committed Nov 18, 2024
1 parent e461441 commit 5edec4e
Showing 39 changed files with 348 additions and 476 deletions.
@@ -25,6 +25,13 @@ struct program;
struct network;


struct ExecutionFlags : public std::bitset<4> {
static const size_t SHAPE_CHANGED = 0;
static const size_t IMPL_CHANGED = 1;
static const size_t MEMORY_CHANGED = 2;
static const size_t SKIP = 3;
};

struct kernel_impl_params final {
struct Hasher {
size_t operator()(const kernel_impl_params &k) const {
@@ -48,6 +55,11 @@ struct kernel_impl_params final {
std::shared_ptr<dnnl::primitive_attr> attrs_onednn;
#endif // ENABLE_ONEDNN_FOR_GPU

std::vector<event::ptr> dep_events = {};
event::ptr out_event = nullptr;

ExecutionFlags flags;

optional_layout weights_layout = optional_layout();

optional_layout bias_layout = optional_layout();
11 changes: 3 additions & 8 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -36,7 +36,7 @@ struct network_output {
// TODO: in_order queue doesn't create proper output event in some cases which leads to syncronization issues with user app
// So call finish for associated stream to enusre that the output data is ready.
if (do_sync) {
if (_stream->get_queue_type() == QueueTypes::in_order) {
if (_stream->get_queue_type() == QueueTypes::in_order || !_event) {
_stream->finish();
} else {
_event->wait();
@@ -167,12 +167,10 @@ struct network {
std::shared_ptr<const primitive_inst> get_primitive(const primitive_id& id) const;
std::string get_primitive_info(const primitive_id& id) const;
std::string get_implementation_info(const primitive_id& id) const;
const event::ptr& get_primitive_event(const primitive_id& id) const { return _events.at(id); }
bool has_event(const primitive_id& id) const { return _events.count(id); }
const event::ptr& get_primitive_event(const primitive_id& id) const;
bool has_event(const primitive_id& id) const;
std::vector<primitive_inst*> get_primitives(const std::vector<primitive_id>& ids);
std::vector<std::pair<primitive_inst*, int>> get_primitives(const std::vector<std::pair<program_node*, int>>& nodes);
void execute_primitive(const std::shared_ptr<primitive_inst>& primitive,
const std::vector<event::ptr>& events);
void allocate_primitives();
void configure_primitives_second_output();
void build_insts_deps();
@@ -231,9 +229,6 @@ struct network {
program::primitives_info _prims_info;
size_t _weights_cache_capacity = 1;

std::unordered_map<primitive_id, event::ptr> _events;
// This map is used to temporarily hold events that will be deallocated later
std::unordered_map<primitive_id, event::ptr> _old_events;
output_chains_map _output_chains;

std::shared_ptr<ShapePredictor> _shape_predictor;
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/stream.hpp
@@ -69,6 +69,7 @@ class stream {
virtual event::ptr aggregate_events(const std::vector<event::ptr>& events, bool group = false, bool is_output = false);

QueueTypes get_queue_type() const { return m_queue_type; }
SyncMethods get_sync_method() const { return m_sync_method; }

static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle);
static SyncMethods get_expected_sync_method(const ExecutionConfig& config);
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/common/condition.cpp
@@ -40,7 +40,6 @@ struct condition_impl : typed_primitive_impl<condition> {
events[0]->wait();

auto& stream = instance.get_network().get_stream();
auto ev = stream.create_user_event(false);
set_node_params(instance.get_node());

auto pred = condition_inst::get_pred_from_memory(instance.pred_memory_ptr(), stream);
7 changes: 3 additions & 4 deletions src/plugins/intel_gpu/src/graph/impls/common/loop.cpp
@@ -1,6 +1,7 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/graph/kernel_impl_params.hpp"
#include "loop_inst.h"
#include "impls/registry/implementation_map.hpp"
#include "register.hpp"
@@ -122,7 +123,7 @@ struct loop_impl : typed_primitive_impl<loop> {

if (is_dynamic) {
instance.update_shape();
if (instance.shape_changed()) {
if (instance.get_flag(ExecutionFlags::SHAPE_CHANGED)) {
instance.preproc_memories_done = false;
instance.reset_memory();
}
@@ -198,9 +199,7 @@ struct loop_impl : typed_primitive_impl<loop> {
// If there are concatenated_input_mem_mappings or backedge_memory_mappings we need to wait for
// previous tasks before accessing memory in get_sliced_mem() and setup_iteration() functions
if (!concatenated_input_mem_mappings.empty() || !backedge_memory_mappings.empty()) {
for (auto& e : events) {
e->wait();
}
stream.wait_for_events(events);
}

// Set sliced input data
@@ -37,8 +37,7 @@ class wait_for_events_impl : public primitive_impl {
event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) override {
auto& stream = instance.get_network().get_stream();

return events.empty() ? stream.create_user_event(true)
: stream.enqueue_marker(events);
return stream.aggregate_events(events);
}

static std::unique_ptr<primitive_impl> create_data(const data_node& data, const kernel_impl_params&) {
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/activation.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/cpu/cpu_impl_helpers.hpp"
#include "openvino/core/type/element_type_traits.hpp"
#include "register.hpp"
#include "activation_inst.h"
@@ -143,9 +144,7 @@ struct activation_impl : public typed_primitive_impl<activation> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

if (!op) {
@@ -276,14 +275,10 @@ struct activation_impl : public typed_primitive_impl<activation> {
}

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
4 changes: 1 addition & 3 deletions src/plugins/intel_gpu/src/graph/impls/cpu/assign.cpp
@@ -52,9 +52,7 @@ struct assign_impl : public typed_primitive_impl<assign> {

auto& stream = instance.get_network().get_stream();

for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);

const auto ev_set_memory = variable.get_memory()->copy_from(stream, instance.input_memory());
variable.set();
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/cpu/cpu_impl_helpers.hpp"
#include "register.hpp"
#include "broadcast_inst.h"
#include "impls/registry/implementation_map.hpp"
@@ -63,9 +64,7 @@ struct broadcast_impl : public typed_primitive_impl<broadcast> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

auto params = instance.get_impl_params();
@@ -110,14 +109,10 @@ struct broadcast_impl : public typed_primitive_impl<broadcast> {
input_mem_ptrs[i]->unlock(stream);

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/cpu/cpu_impl_helpers.hpp"
#include "register.hpp"
#include "concatenation_inst.h"
#include "impls/registry/implementation_map.hpp"
@@ -54,9 +55,7 @@ struct concatenation_impl : public typed_primitive_impl<concatenation> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

auto params = instance.get_impl_params();
@@ -97,14 +96,10 @@ struct concatenation_impl : public typed_primitive_impl<concatenation> {
input_mem_ptrs[i]->unlock(stream);

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
@@ -7,6 +7,9 @@
#include <algorithm>
#include <vector>

#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/stream.hpp"

namespace cldnn {
namespace cpu {

@@ -74,5 +77,11 @@ using vector3D = vector2D<vector1D<T>>;
template <typename T>
using vector4D = vector2D<vector2D<T>>;

inline event::ptr make_output_event(cldnn::stream& stream, bool is_output) {
if (is_output)
return stream.create_user_event(true);
return nullptr;
}

} // namespace cpu
} // namespace cldnn
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/crop.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/cpu/cpu_impl_helpers.hpp"
#include "register.hpp"
#include "crop_inst.h"
#include "impls/registry/implementation_map.hpp"
@@ -40,9 +41,7 @@ struct crop_impl : public typed_primitive_impl<crop> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

auto params = instance.get_impl_params();
@@ -97,14 +96,10 @@ struct crop_impl : public typed_primitive_impl<crop> {
"[GPU] Couldn't execute crop primitive with id ", instance.id());

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
12 changes: 3 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/detection_output.cpp
@@ -830,9 +830,7 @@ struct detection_output_impl : typed_primitive_impl<detection_output> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

const int num_of_images = instance.location_memory()->get_layout().batch(); // batch size
@@ -851,14 +849,10 @@
}

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/eltwise.cpp
@@ -3,6 +3,7 @@
//

#include "eltwise_inst.h"
#include "impls/cpu/cpu_impl_helpers.hpp"
#include "impls/registry/implementation_map.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/bitwise_and.hpp"
@@ -84,9 +85,7 @@ struct eltwise_impl : public typed_primitive_impl<eltwise> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

auto params = instance.get_impl_params();
@@ -210,14 +209,10 @@ struct eltwise_impl : public typed_primitive_impl<eltwise> {
input_mem_ptrs[i]->unlock(stream);

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
13 changes: 4 additions & 9 deletions src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "impls/cpu/cpu_impl_helpers.hpp"
#include "register.hpp"
#include "gather_inst.h"
#include "impls/registry/implementation_map.hpp"
@@ -62,9 +63,7 @@ struct gather_impl : public typed_primitive_impl<gather> {
const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
stream.wait_for_events(events);
}

auto params = instance.get_impl_params();
@@ -100,14 +99,10 @@ struct gather_impl : public typed_primitive_impl<gather> {
input_mem_ptrs[i]->unlock(stream);

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
return stream.group_events(events);
}

return stream.create_user_event(true);
return make_output_event(stream, instance.is_output());
}

void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
