From 4a33ad8a5b2268889c2f0a2c3822ae8a8151f844 Mon Sep 17 00:00:00 2001 From: Surya Siddharth Pemmaraju Date: Sun, 27 Oct 2024 11:13:42 -0700 Subject: [PATCH 001/120] Remove duplicate backend registration from torch 2.5 (#27245) ### Details: - In torch 2.5.0, pytorch is registering the backends from entrypoints and we don't have to register the backend again. If we register it twice, torch throws a duplicate registration error. ### Tickets: - *ticket-id* --- .../frontend/pytorch/torchdynamo/backend.py | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index c6c01fa98e5e99..8294927a079c7e 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -49,18 +49,26 @@ openvino_options = {} -@register_backend + @fake_tensor_unsupported def openvino(subgraph, example_inputs, options=None): - if (_get_aot_autograd(options)): + if _get_aot_autograd(options): global openvino_options openvino_options = options decompositions = _get_decompositions(options) + get_inf_decomposition_list() + get_aot_decomposition_list() - return aot_autograd(fw_compiler=fx_openvino, - bw_compiler=fx_openvino, - decompositions=get_decompositions(decompositions))(subgraph, example_inputs) + return aot_autograd(fw_compiler=fx_openvino, bw_compiler=fx_openvino, decompositions=get_decompositions(decompositions))(subgraph, example_inputs) return fx_openvino(subgraph, example_inputs, options) + +try: + from packaging import version + + if version.parse(torch.__version__) < version.parse("2.5.0"): + register_backend(compiler_fn=openvino, name="openvino") +except ImportError: + logger.warning("The 'packaging' module is required but not installed") + + def fx_openvino(subgraph, example_inputs, options=None): try: if len(openvino_options) != 0: @@ -70,7 +78,7 @@ def fx_openvino(subgraph, example_inputs, options=None): openvino_model_caching = _get_model_caching(options) if openvino_model_caching is not None and openvino_model_caching: # Create a hash to be used for caching - model_hash_str = sha256(subgraph.code.encode('utf-8')).hexdigest() + model_hash_str = sha256(subgraph.code.encode("utf-8")).hexdigest() executor_parameters = {"model_hash_str": model_hash_str} # Check if the model was fully supported and already cached example_inputs.reverse() @@ -79,15 +87,17 @@ def fx_openvino(subgraph, example_inputs, options=None): if os.path.isfile(maybe_fs_cached_name + ".xml") and os.path.isfile(maybe_fs_cached_name + ".bin"): # Model is fully supported and already cached. Run the cached OV model directly. 
compiled_model = openvino_compile_cached_model(maybe_fs_cached_name, options, *example_inputs) + def _call(*args): res = execute_cached(compiled_model, *args) return res + return _call if inputs_reversed: example_inputs.reverse() preserved_arg_indices = [] - if (_get_aot_autograd(options)): + if _get_aot_autograd(options): if tracing_context := torch._guards.TracingContext.try_get(): fw_metadata = tracing_context.fw_metadata params_flat = tracing_context.params_flat @@ -97,6 +107,7 @@ def _call(*args): model = subgraph else: from torch._subclasses.fake_tensor import FakeTensorMode + decompositions = _get_decompositions(options) + get_inf_decomposition_list() with FakeTensorMode(allow_non_fake_inputs=True): model = make_fx(subgraph, decomposition_table=get_decompositions(decompositions))(*example_inputs) @@ -106,26 +117,27 @@ def _call(*args): partitioner = Partitioner(options) compiled_model = partitioner.make_partitions(model, options) - if executor_parameters is not None and 'model_hash_str' in executor_parameters: + if executor_parameters is not None and "model_hash_str" in executor_parameters: # Check if the model is fully supported. fully_supported = partitioner.check_fully_supported(compiled_model) if fully_supported: executor_parameters["model_hash_str"] += "_fs" def _call(*args): - if(_get_aot_autograd(options)): + if _get_aot_autograd(options): args_list = args[0] args_new = [args_list[i] for i in preserved_arg_indices] args = args_new - res = execute(compiled_model, *args, executor="openvino", - executor_parameters=executor_parameters, options=options) + res = execute(compiled_model, *args, executor="openvino", executor_parameters=executor_parameters, options=options) return res - if(_get_aot_autograd(options)): - _call._boxed_call = True # type: ignore[attr-defined] + + if _get_aot_autograd(options): + _call._boxed_call = True # type: ignore[attr-defined] return _call except Exception as e: logger.debug(f"Failed in OpenVINO execution: {e}") return compile_fx(subgraph, example_inputs) + def reset(): clear_caches() From afa9231ec8f42719bb2c4f139df5adff494b1462 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Sun, 27 Oct 2024 20:31:19 +0100 Subject: [PATCH 002/120] [OVC] Fail with unsupported message when output argument is used for pytorch (#27255) ### Details: - *`output` argument cannot be used for pytorch as it is unclear what behavior is expected in this case.* ### Tickets: - *#26457 * --------- Co-authored-by: Roman Kazantsev --- .../ovc_python_api_tests/test_pytorch.py | 30 +++++++++++++++++++ .../tools/ovc/moc_frontend/pipeline.py | 9 ++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/tests/layer_tests/ovc_python_api_tests/test_pytorch.py b/tests/layer_tests/ovc_python_api_tests/test_pytorch.py index 7dc40e310330cf..1a49a989c11df2 100644 --- a/tests/layer_tests/ovc_python_api_tests/test_pytorch.py +++ b/tests/layer_tests/ovc_python_api_tests/test_pytorch.py @@ -1181,6 +1181,19 @@ def forward(self, a, b): )} +def create_pytorch_module_with_output(tmp_dir): + class PTModel(torch.nn.Module): + def forward(self, a, b): + return a + b + + net = PTModel() + return net, None, { + "example_input": ( + torch.tensor([5, 6], dtype=torch.float32), + torch.tensor([5, 6], dtype=torch.float32), + ), "output": "some_name"} + + class TestMoConvertPyTorch(CommonMOConvertTest): test_data = [ 'create_pytorch_nn_module_case1', @@ -1255,6 +1268,23 @@ def test_mo_import_from_memory(self, create_model, ie_device, precision, ir_vers self._test_by_ref_graph(temp_dir, test_params, 
graph_ref, compare_tensor_names=False) + @pytest.mark.parametrize("create_model,exception", [ + ('create_pytorch_module_with_output', AssertionError) + ]) + @pytest.mark.nightly + @pytest.mark.precommit + def test_mo_import_from_memory_negative(self, create_model, exception, + ie_device, precision, ir_version, + temp_dir, use_legacy_frontend): + fw_model, graph_ref, mo_params = eval(create_model)(temp_dir) + + test_params = {'input_model': fw_model} + if mo_params is not None: + test_params.update(mo_params) + with pytest.raises(exception): + self._test_by_ref_graph(temp_dir, test_params, + graph_ref, compare_tensor_names=False) + def create_pt_model_with_custom_op(): # diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py index 12722b5d771b75..4a297707a0e537 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pipeline.py @@ -126,16 +126,13 @@ def merge_inputs(inputs, to_set_list): res.append(p) return res iplaces = merge_inputs(model_inputs, iplaces) - # Currently this only work to reorder inputs/outputs + oplaces = [] + # Currently this only work to reorder inputs to_override_all_inputs = check_places_are_same(model_inputs, [{"node": p} for p in iplaces]) to_override_all_outputs = False if argv.output: - oplaces = [] _outputs = fe_output_user_data_repack(input_model, argv.output, moc_front_end.get_name()) - for out_desc in _outputs: - oplaces.append(out_desc["name"]) - model_outputs = input_model.get_outputs() - to_override_all_outputs = check_places_are_same(model_outputs, [{"node": p} for p in oplaces]) + assert len(_outputs) == 0, "`output` argument is not supported for PyTorch" if to_override_all_inputs and to_override_all_outputs: input_model.extract_subgraph(iplaces, oplaces) elif to_override_all_inputs: From 65dd174c34e7849a993e30980bb22d9575ae0ef9 Mon Sep 17 00:00:00 2001 From: David Nam Date: Mon, 28 Oct 2024 09:49:34 +0800 Subject: [PATCH 003/120] [GPU] Support convolution onednn activation zero points for i32 (#27261) ### Details: - Support activation zero points for i32 in convolution onednn ### Tickets: - 155423 --- .../graph/impls/onednn/convolution_onednn.cpp | 6 +- .../src/graph/impls/onednn/utils.cpp | 2 + .../unit/test_cases/convolution_gpu_test.cpp | 107 ++++++++++++++++++ 3 files changed, 113 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp index 83d2a10dc4f2f9..a11ceef8b0f2dd 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp @@ -204,14 +204,16 @@ struct convolution_onednn : typed_primitive_onednn_impl { auto& a_zp = arg.activations_zero_points(); auto a_zp_dtype = a_zp.get_output_layout().data_type; - if (!data_type_traits::is_i8_u8(a_zp_dtype)) { + if (!data_type_traits::is_i8_u8(a_zp_dtype) && a_zp_dtype != data_types::i32) { throw std::runtime_error("Unsupported data type for activations zero points for oneDNN convolution"); } if (a_zp_dtype == data_types::i8) { set_activation_zero_points_attr::value_type>(attrs, a_zp.as(), zero_point_mask); - } else { // if (a_zp_dtype == data_types::u8) + } else if (a_zp_dtype == data_types::u8) { set_activation_zero_points_attr::value_type>(attrs, a_zp.as(), zero_point_mask); + } else if (a_zp_dtype == data_types::i32) { + 
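+            // i32 zero points reuse the same path as i8/u8; convert_zp_data_to_s32<int32_t> (instantiated in utils.cpp below) keeps them in the s32 storage oneDNN expects.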
set_activation_zero_points_attr::value_type>(attrs, a_zp.as(), zero_point_mask); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index 19ea02c7c66d28..a8aa43671ed048 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -30,6 +30,7 @@ cldnn::memory::ptr convert_zp_data_to_s32(const memory::ptr zp_memory) { template cldnn::memory::ptr convert_zp_data_to_s32(const memory::ptr zp_memory); template cldnn::memory::ptr convert_zp_data_to_s32(const memory::ptr zp_memory); +template cldnn::memory::ptr convert_zp_data_to_s32(const memory::ptr zp_memory); cldnn::format default_fmt_for_dims(size_t dims, bool is_grouped) { switch (dims) { @@ -489,6 +490,7 @@ bool is_per_tensor(cldnn::data_node& node, int32_t& zp_val) { template bool is_per_tensor(cldnn::data_node& node, int32_t& zp_val); template bool is_per_tensor(cldnn::data_node& node, int32_t& zp_val); +template bool is_per_tensor(cldnn::data_node& node, int32_t& zp_val); static std::string get_external_order(const std::vector& order, bool is_weights, bool is_grouped) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 4155ac0b420e66..4f9c31064e9026 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -10134,6 +10134,113 @@ TEST(convolution_gpu_onednn, quantized_onednn_convolution_u8s8f32_weights_zp) { } } +TEST(convolution_gpu_onednn, support_activation_zero_points_for_i32) { + auto& engine = get_test_engine(); + if (!engine.get_device_info().supports_immad) + return; + + auto in_layout = layout { ov::PartialShape::dynamic(4), data_types::u8, format::bfyx }; + auto input = engine.allocate_memory({ data_types::u8, format::bfyx, { 1, 2, 5, 4 } }); + auto weights = engine.allocate_memory({ data_types::i8, format::bfyx, { 3, 2, 3, 3 } }); + auto biases = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 3, 1, 1 } }); + auto a_zp = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 3, 1, 1 } }); + auto w_zp = engine.allocate_memory({ data_types::u8, format::bfyx, { 1, 1, 1, 1 } }); + + set_values(input, { 1, 2, 3, 4, 5, + 2, 2, 3, 4, 6, + 3, 3, 3, 5, 1, + 1, 1, 1, 1, 1, + + 1, 2, 3, 4, 5, + 2, 2, 3, 4, 6, + 3, 3, 3, 5, 1, + 1, 1, 1, 1, 1 }); + + set_values(weights, { 1, 2, -1, + -2, 1, 2, + 9, 7, -1, + + 9, 0, -4, + -1, 3, 2, + 0, 2, 5, + + 1, 2, -1, + -2, 1, 2, + 9, 7, -1, + + 9, 0, -4, + -1, 3, 2, + 0, 2, 5, + + 1, 2, -1, + -2, 1, 2, + 9, 7, -1, + + 9, 0, -4, + -1, 3, 2, + 0, 2, 5 }); + set_values(a_zp, { 2, 5, 5 }); + set_values(w_zp, { 2 }); + set_values(biases, { 1.0f, -8.0f, -8.0f }); + + VVVF output_vec = { + { + { 2.0f, -5.0f, -20.0f }, + { 12.0f, 26.0f, -10.0f } + }, + { + { -7.0f, -14.0f, -29.0f }, + { 3.0f, 17.0f, -19.0f } + }, + { + { -7.0f, -14.0f, -29.0f }, + { 3.0f, 17.0f, -19.0f } + } }; + + topology topology( + input_layout("input", in_layout), + data("weights", weights), + data("biases", biases), + data("a_zp", a_zp), + data("w_zp", w_zp), + convolution("conv", input_info("input"), "weights", "biases", "w_zp", "a_zp", "", 1, + { 2, 2 }, { 1, 1 }, { 0, 0 }, { 1, 2 }, false, data_types::f32), + reorder("out", input_info("conv"), format::bfyx, data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + 
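+    // Force the oneDNN implementation for "conv" so the i32 activation zero-point path above is the code under test.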
ov::intel_gpu::ImplementationDesc conv_impl = { format::bfyx, "", impl_types::onednn }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl }})); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network network(engine, topology, config); + network.set_input_data("input", input); + + auto outputs = network.execute(); + ASSERT_EQ(outputs.begin()->first, "out"); + + auto output_memory = outputs.at("out").get_memory(); + cldnn::mem_lock output_ptr(output_memory, get_test_stream()); + + auto output_layout = output_memory->get_layout(); + int y_size = output_layout.spatial(1); + int x_size = output_layout.spatial(0); + int f_size = output_layout.feature(); + int b_size = output_layout.batch(); + ASSERT_EQ(output_layout.format, format::bfyx); + ASSERT_EQ(y_size, 2); + ASSERT_EQ(x_size, 3); + ASSERT_EQ(f_size, 3); + ASSERT_EQ(b_size, 1); + for (int f = 0; f < f_size; f++) + for (int y = 0; y < y_size; ++y) { + for (int x = 0; x < x_size; ++x) { + ASSERT_NEAR(output_vec[f][y][x], ((float)output_ptr[f * y_size * x_size + y * x_size + x]), 1e-5f) << + " x="<< x << " y=" << y << " f=" << f; + } + } +} + TEST(convolution_gpu_onednn, has_proper_synchronization) { auto& engine = get_test_engine(); if (!engine.get_device_info().supports_immad) From 78864cab37493e8f250c122070d6ef8dd0efd8eb Mon Sep 17 00:00:00 2001 From: Mang Guo Date: Mon, 28 Oct 2024 09:57:27 +0800 Subject: [PATCH 004/120] Add "if" statement for loop unrolling in rms kernel. (#27215) ### Details: - *Add "if" statement for loop unrolling in rms kernel to fix Segmentation Fault in tiny-random-sd3 model* ### Tickets: - *CVS-152057* --- .../src/nodes/kernels/x64/rms_kernel.cpp | 33 ++++++++++--------- .../instances/x64/rms_norm.cpp | 11 +++++++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp index 719e9a6e464934..30a7870a1a4b54 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/rms_kernel.cpp @@ -123,22 +123,25 @@ void jit_rms_kernel::generate() { // x * 1/Sqrt(ReduceMean(x^2,axes)+eps) * gamma // sum(x^2) align(16); - Xbyak::Label loop_4reg; - L(loop_4reg); - { - load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false); - vfmadd231ps(vmm_sum0, vmm_src, vmm_src); - load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * m_jcp.src_prc.size() * 1); - vfmadd231ps(vmm_sum1, vmm_src, vmm_src); - load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * m_jcp.src_prc.size() * 2); - vfmadd231ps(vmm_sum2, vmm_src, vmm_src); - load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * m_jcp.src_prc.size() * 3); - vfmadd231ps(vmm_sum3, vmm_src, vmm_src); - - add(reg_src, vec_size * m_jcp.src_prc.size() * 4); - dec(reg_size); - jnz(loop_4reg); + if ((m_jcp.data_size / (vec_size * 4)) != 0) { + Xbyak::Label loop_4reg; + L(loop_4reg); + { + load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false); + vfmadd231ps(vmm_sum0, vmm_src, vmm_src); + load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * m_jcp.src_prc.size() * 1); + vfmadd231ps(vmm_sum1, vmm_src, vmm_src); + load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * m_jcp.src_prc.size() * 2); + vfmadd231ps(vmm_sum2, vmm_src, vmm_src); + load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false, vec_size * 
m_jcp.src_prc.size() * 3); + vfmadd231ps(vmm_sum3, vmm_src, vmm_src); + + add(reg_src, vec_size * m_jcp.src_prc.size() * 4); + dec(reg_size); + jnz(loop_4reg); + } } + // 1 ~ 3 vmm for (size_t i = m_jcp.data_size / (vec_size * 4) * 4; i < m_jcp.data_size / vec_size; i++) { load(vmm_src, reg_src, m_jcp.src_prc, vec_size, false); diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/rms_norm.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/rms_norm.cpp index da7bbaaaf848d1..38aeedb5e451cf 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/rms_norm.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/x64/rms_norm.cpp @@ -24,6 +24,17 @@ const std::vector> shapes{ {ov::Shape{1024 + 16 + 1}, ov::Shape{1024 + 16 + 1}}} }, }, + // small data size + { + // data shape + {ov::test::InputShape{ov::PartialShape{-1, -1, 31}, + {ov::Shape{1, 8, 31}, ov::Shape{2, 3, 31}}} + }, + // scale shape + {ov::test::InputShape{ov::PartialShape{31}, + {ov::Shape{31}, ov::Shape{31}}} + }, + }, // scale is scalar { // data shape From 9b97cf9b001a76ac2163f099b6790850d78565ea Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 28 Oct 2024 08:22:01 +0400 Subject: [PATCH 005/120] [RV64] Disabled multi-stream mode when SHL is used (#27250) ### Details: - *SHL library uses static global variables as flags and counters. It means that multi-stream execution can be unsafe when SHL is used - there can be data racing. The PR manually sets `streams = 1` to avoid possible functional issues in tput scenario with using SHL* ### Tickets: - *N/A* --- src/plugins/intel_cpu/src/config.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 421dca07747932..adcaeaaaa31a6f 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -421,6 +421,13 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { streamsChanged = true; } +#if defined(OV_CPU_WITH_SHL) + // TODO: multi-stream execution is unsafe when SHL is used: + // The library uses global static variables as flags and counters. 
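+    // Running a single stream serializes inference over the SHL kernels, so the shared global state is never touched concurrently.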
+ streams = 1; + streamsChanged = true; +#endif + this->modelType = modelType; CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties()); From 8bc165595cea04745815b16148b1745151e5382f Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 28 Oct 2024 08:39:38 +0400 Subject: [PATCH 006/120] [TF FE][JAX FE] Support latest TF 2.18, JAX 0.4.35 and NumPy 2.x (#27246) **Details:** Support TF 2.18 and JAX 0.4.35 **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .github/workflows/job_tokenizers.yml | 9 +++-- .github/workflows/linux_arm64.yml | 1 + .github/workflows/mac.yml | 1 + .github/workflows/mac_arm64.yml | 1 + .github/workflows/ubuntu_22.yml | 1 + .github/workflows/ubuntu_24.yml | 24 +++++++++++- .github/workflows/windows_vs2019_release.yml | 1 + src/frontends/jax/src/op/erfc.cpp | 33 ++++++++++++++++ src/frontends/jax/src/op_table.cpp | 2 + tests/constraints.txt | 8 ++-- tests/layer_tests/jax_tests/test_erfc.py | 38 +++++++++++++++++++ .../tensorflow_tests/test_tf_Equal.py | 4 +- tests/requirements_tensorflow | 16 +++++--- tools/mo/requirements_tf.txt | 2 +- tools/mo/requirements_tf2.txt | 2 +- 15 files changed, 126 insertions(+), 17 deletions(-) create mode 100644 src/frontends/jax/src/op/erfc.cpp create mode 100644 tests/layer_tests/jax_tests/test_erfc.py diff --git a/.github/workflows/job_tokenizers.yml b/.github/workflows/job_tokenizers.yml index 14243bda13531a..5c5e59aa3bec97 100644 --- a/.github/workflows/job_tokenizers.yml +++ b/.github/workflows/job_tokenizers.yml @@ -20,12 +20,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' TARGET_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }} jobs: @@ -63,10 +66,10 @@ jobs: if: runner.os == 'macOS' run: brew install pigz - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 67cd4d0d1a5d84..0af30621a2a7fd 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -146,6 +146,7 @@ jobs: shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS CXX_Unit_Tests: diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 7c47c1c635c2f8..5492ad40aa17b4 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -258,6 +258,7 @@ jobs: runner: 'macos-13' shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS CXX_Unit_Tests: diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 81cd229d1dd9f6..8100b74734ab17 100644 --- a/.github/workflows/mac_arm64.yml +++ 
b/.github/workflows/mac_arm64.yml @@ -258,6 +258,7 @@ jobs: runner: 'macos-13-xlarge' shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS CXX_Unit_Tests: diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 753708d9b3ba51..92178fce7f5054 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -491,6 +491,7 @@ jobs: shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS iGPU: diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 9d9aba6739f22f..d874e06a189232 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -144,9 +144,31 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + TensorFlow_Layer_Tests: + name: TensorFlow Layer Tests + needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] + uses: ./.github/workflows/job_tensorflow_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + + Openvino_tokenizers: + name: OpenVINO tokenizers extension + needs: [ Build, Smart_CI, Docker ] + uses: ./.github/workflows/job_tokenizers.yml + with: + runner: 'aks-linux-4-cores-16gb' + shell: bash + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS + Overall_Status: name: ci/gha_overall_status_ubuntu_24 - needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests] + needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests, Pytorch_Layer_Tests, TensorFlow_Layer_Tests, Openvino_tokenizers] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 1a3e8753f15421..1c84db5dcda530 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -235,6 +235,7 @@ jobs: runner: 'aks-win-4-cores-8gb' shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' if: fromJSON(needs.smart_ci.outputs.affected_components).TOKENIZERS Python_Unit_Tests: diff --git a/src/frontends/jax/src/op/erfc.cpp b/src/frontends/jax/src/op/erfc.cpp new file mode 100644 index 00000000000000..5a38577f868d35 --- /dev/null +++ b/src/frontends/jax/src/op/erfc.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/erf.hpp" +#include "openvino/op/subtract.hpp" +#include "utils.hpp" + +using namespace std; +using namespace ov; +using namespace ov::op; + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +OutputVector translate_erfc(const NodeContext& context) { + 
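+    // erfc(x) = 1 - erf(x): composed from Erf and Subtract, since the opset provides Erf but no dedicated Erfc.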
num_inputs_check(context, 1, 1); + auto x = context.get_input(0); + + // create const one of the same type as x + auto const_one = create_same_type_const_scalar(x, 1); + Output res = make_shared(x); + res = make_shared(const_one, res); + return {res}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 500226594fea13..98f22452c5afab 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -45,6 +45,7 @@ OP_CONVERTER(translate_convert); OP_CONVERTER(translate_convolution); OP_CONVERTER(translate_copy); OP_CONVERTER(translate_dot_general); +OP_CONVERTER(translate_erfc); OP_CONVERTER(translate_integer_pow); OP_T_CONVERTER(translate_reduce_op); OP_CONVERTER(translate_reduce_window_max); @@ -72,6 +73,7 @@ const std::map get_supported_ops_jaxpr() { {"dot_general", op::translate_dot_general}, {"eq", op::translate_binary_op}, {"erf", op::translate_1to1_match_1_input}, + {"erfc", op::translate_erfc}, {"exp", op::translate_1to1_match_1_input}, {"ge", op::translate_binary_op}, {"gt", op::translate_binary_op}, diff --git a/tests/constraints.txt b/tests/constraints.txt index 775d3287c061a1..b800d289ce1547 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -11,7 +11,7 @@ sympy>=1.10 wheel>=0.38.1 defusedxml>=0.7.1 fastjsonschema~=2.17.1 -tensorflow>=2.5,<2.18.0 +tensorflow>=2.5,<2.19.0 requests>=2.25.1 opencv-python>=4.5 paddlepaddle==2.6.1 @@ -21,11 +21,11 @@ pytest>=5.0,<8.4 pytest-dependency==0.5.1 pytest-html==4.1.1 pytest-timeout==2.3.1 -jax<=0.4.33 -jaxlib<=0.4.33 +jax<=0.4.35 +jaxlib<=0.4.35 kornia==0.7.0 networkx<=3.3 -flax<=0.9.0 +flax<=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu torch~=2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" diff --git a/tests/layer_tests/jax_tests/test_erfc.py b/tests/layer_tests/jax_tests/test_erfc.py new file mode 100644 index 00000000000000..c84655635ba98e --- /dev/null +++ b/tests/layer_tests/jax_tests/test_erfc.py @@ -0,0 +1,38 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import jax +import numpy as np +import pytest +from jax import numpy as jnp + +from jax_layer_test_class import JaxLayerTest + +rng = np.random.default_rng(109734) + + +class TestErfc(JaxLayerTest): + def _prepare_input(self): + # erf are mostly changing in a range [-4, 4] + x = rng.uniform(-4.0, 4.0, self.input_shape).astype(self.input_type) + + x = jnp.array(x) + return [x] + + def create_model(self, input_shape, input_type): + self.input_shape = input_shape + self.input_type = input_type + + def jax_erfc(x): + return jax.lax.erfc(x) + + return jax_erfc, None, 'erfc' + + @pytest.mark.parametrize("input_shape", [[2], [3, 4]]) + @pytest.mark.parametrize("input_type", [np.float16, np.float32, np.float64]) + @pytest.mark.nightly + @pytest.mark.precommit_jax_fe + def test_erfc(self, ie_device, precision, ir_version, input_shape, input_type): + self._test(*self.create_model(input_shape, input_type), + ie_device, precision, + ir_version) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_Equal.py b/tests/layer_tests/tensorflow_tests/test_tf_Equal.py index 7d61317857ddbc..8c0b496a7b4c42 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_Equal.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_Equal.py @@ -146,8 +146,8 @@ def test_tf_equal_int64(self, params, ie_device, precision, ir_version, temp_dir # Values for checking important 
corner cases for float values # expect: false false false false false false true false true - x_corner = [1., 1., 1., np.nan, np.nan, np.nan, np.inf, np.inf, np.NINF] - y_corner = [np.nan, np.inf, np.NINF, np.nan, np.inf, np.NINF, np.inf, np.NINF, np.NINF] + x_corner = [1., 1., 1., np.nan, np.nan, np.nan, np.inf, np.inf, -np.inf] + y_corner = [np.nan, np.inf, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf, -np.inf] test_data_float16 = [ pytest.param( diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index c29aa777fda537..3ae47d81ee2c50 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -1,15 +1,21 @@ -# tensorflow-intel inside tensorflow still requires numpy<2.0.0 -numpy==1.26.4 +# test ovc with NumPy 2.x on Ubuntu 24 with default Python 3.12 +# test against NumPy 1.x with older Python versions +# tensorflow-intel 2.18.0 depends on numpy<2.1.0 and >=1.26.0 +numpy==1.26.4; python_version < "3.12" +numpy==2.0.2; python_version >= "3.12" pytest==7.0.1 pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 # install exact keras version since tensorflow depends and has no upper bound for it keras==3.6.0 -tensorflow==2.17.0; platform_system != "Darwin" or platform_machine != "x86_64" +tensorflow==2.18.0; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") +tensorflow==2.17.0; python_version < "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" +# install explicit version of wrapt to avoid "this __dict__ descriptor does not support '_DictWrapper' objects" error from TensorFlow 2.18 +wrapt==1.15.0; python_version >= "3.12" # tensorflow-text is not available for both Windows and ARM platforms -tensorflow-text==2.17.0; platform_system == "Linux" and platform_machine == "x86_64" +tensorflow-text==2.17.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 -jax==0.4.33 +jax==0.4.35 defusedxml==0.7.1 diff --git a/tools/mo/requirements_tf.txt b/tools/mo/requirements_tf.txt index 4897012177c1bb..fb19c216e955ad 100644 --- a/tools/mo/requirements_tf.txt +++ b/tools/mo/requirements_tf.txt @@ -1,6 +1,6 @@ -c ../constraints.txt h5py -tensorflow>=1.15.5,<2.18.0 +tensorflow>=1.15.5,<2.19.0 numpy>=1.16.6,<1.27 networkx defusedxml diff --git a/tools/mo/requirements_tf2.txt b/tools/mo/requirements_tf2.txt index b6f9029cdff263..50df4160c669d3 100644 --- a/tools/mo/requirements_tf2.txt +++ b/tools/mo/requirements_tf2.txt @@ -1,6 +1,6 @@ -c ../constraints.txt h5py -tensorflow>=2.5,<2.18.0 +tensorflow>=2.5,<2.19.0 numpy>=1.16.6,<1.27 networkx defusedxml From de5f765ca14dca52cb1780e6de6305f0cb9623e8 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 28 Oct 2024 10:04:15 +0400 Subject: [PATCH 007/120] [RV64] Added build support via linux package and fix the build via GNU toolchain (#27228) ### Details: - *The [GNU toolchain for RISC-V](https://github.com/riscv-collab/riscv-gnu-toolchain) now supports RVV by default - ratified RVV1.0. All intrinsics have prefix `__riscv64`. Some compilers (clang, llvm) can compile a code contained intrinsics without this prefix (have overloaded functions). But GCC from `riscv-gnu-toolchain` doesn't have them - so cannot compile max pooling primitive from oneDNN. 
The fix in oneDNN* - https://github.com/openvinotoolkit/oneDNN/pull/265 - *Added one more option to cross-compile OpenVINO for riscv64 platforms without RVV - using linux packages and new written cmake toolchain file* - *Renamed `T-Head` to `Xuantie`* - *Added the information about recommended build to tutorial* ### Tickets: - *N/A* --- ...> riscv64-071-xuantie-gnu.toolchain.cmake} | 6 ++-- ...> riscv64-100-xuantie-gnu.toolchain.cmake} | 6 ++-- cmake/toolchains/riscv64-gnu.toolchain.cmake | 7 ++-- .../toolchains/riscv64.linux.toolchain.cmake | 13 ++++++++ docs/dev/build_riscv64.md | 33 +++++++++++++++---- src/plugins/intel_cpu/CMakeLists.txt | 2 +- src/plugins/intel_cpu/thirdparty/onednn | 2 +- 7 files changed, 49 insertions(+), 20 deletions(-) rename cmake/toolchains/{riscv64-071-thead-gnu.toolchain.cmake => riscv64-071-xuantie-gnu.toolchain.cmake} (95%) rename cmake/toolchains/{riscv64-100-thead-gnu.toolchain.cmake => riscv64-100-xuantie-gnu.toolchain.cmake} (95%) create mode 100644 cmake/toolchains/riscv64.linux.toolchain.cmake diff --git a/cmake/toolchains/riscv64-071-thead-gnu.toolchain.cmake b/cmake/toolchains/riscv64-071-xuantie-gnu.toolchain.cmake similarity index 95% rename from cmake/toolchains/riscv64-071-thead-gnu.toolchain.cmake rename to cmake/toolchains/riscv64-071-xuantie-gnu.toolchain.cmake index f5e9e68aabedc6..5bc16de8df91e8 100644 --- a/cmake/toolchains/riscv64-071-thead-gnu.toolchain.cmake +++ b/cmake/toolchains/riscv64-071-xuantie-gnu.toolchain.cmake @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # -# NOTE: use T-Head compiler: +# NOTE: use Xuantie compiler: # git clone https://github.com/XUANTIE-RV/xuantie-gnu-toolchain.git # ./configure --prefix=/opt/riscv # make linux @@ -22,10 +22,10 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(RISCV64_THEAD ON) +set(RISCV64_XUANTIE ON) set(RISCV64_RVV0p7 ON) -set(RISCV_TOOLCHAIN_ROOT $ENV{RISCV_TOOLCHAIN_ROOT} CACHE PATH "Path to CLANG for RISC-V cross compiler build directory") +set(RISCV_TOOLCHAIN_ROOT $ENV{RISCV_TOOLCHAIN_ROOT} CACHE PATH "Path to GCC for RISC-V cross compiler build directory") set(CMAKE_SYSROOT "${RISCV_TOOLCHAIN_ROOT}/sysroot" CACHE PATH "RISC-V sysroot") set(CMAKE_C_COMPILER ${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc) diff --git a/cmake/toolchains/riscv64-100-thead-gnu.toolchain.cmake b/cmake/toolchains/riscv64-100-xuantie-gnu.toolchain.cmake similarity index 95% rename from cmake/toolchains/riscv64-100-thead-gnu.toolchain.cmake rename to cmake/toolchains/riscv64-100-xuantie-gnu.toolchain.cmake index e00e30f975598f..0664b38a9ba68d 100644 --- a/cmake/toolchains/riscv64-100-thead-gnu.toolchain.cmake +++ b/cmake/toolchains/riscv64-100-xuantie-gnu.toolchain.cmake @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # -# NOTE: use T-Head compiler: +# NOTE: use Xuantie compiler: # git clone https://github.com/XUANTIE-RV/xuantie-gnu-toolchain.git # ./configure --prefix=/opt/riscv # make linux @@ -22,10 +22,10 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(RISCV64_THEAD ON) +set(RISCV64_XUANTIE ON) set(RISCV64_RVV1p0 ON) -set(RISCV_TOOLCHAIN_ROOT $ENV{RISCV_TOOLCHAIN_ROOT} CACHE PATH "Path to CLANG for RISC-V cross compiler build directory") +set(RISCV_TOOLCHAIN_ROOT $ENV{RISCV_TOOLCHAIN_ROOT} CACHE PATH "Path to GCC for RISC-V cross compiler build directory") set(CMAKE_SYSROOT "${RISCV_TOOLCHAIN_ROOT}/sysroot" CACHE PATH "RISC-V sysroot") set(CMAKE_C_COMPILER ${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-gcc) diff --git 
a/cmake/toolchains/riscv64-gnu.toolchain.cmake b/cmake/toolchains/riscv64-gnu.toolchain.cmake index 994b05f66b52f6..b58dcf169fc2da 100644 --- a/cmake/toolchains/riscv64-gnu.toolchain.cmake +++ b/cmake/toolchains/riscv64-gnu.toolchain.cmake @@ -2,12 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # -# NOTE: use with the following docker image https://github.com/Incarnation-p-lee/riscv-docker-emulator#llvm-clang-tool-chain +# NOTE: use with the following docker image https://github.com/Incarnation-p-lee/riscv-docker-emulator#gnu-toolchain set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(RISCV_TOOLCHAIN_ROOT "/opt/riscv/gnu-toolchain/rv64-linux" CACHE PATH "Path to CLANG for RISC-V cross compiler build directory") +set(RISCV_TOOLCHAIN_ROOT "/opt/riscv/gnu-toolchain/rv64-linux" CACHE PATH "Path to GCC for RISC-V cross compiler build directory") set(CMAKE_SYSROOT "${RISCV_TOOLCHAIN_ROOT}/sysroot" CACHE PATH "RISC-V sysroot") set(CMAKE_C_COMPILER_TARGET riscv64-unknown-linux-gnu) @@ -26,9 +26,6 @@ set(CMAKE_OBJDUMP ${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-objdump) set(CMAKE_READELF ${RISCV_TOOLCHAIN_ROOT}/bin/riscv64-unknown-linux-gnu-readelf) set(PKG_CONFIG_EXECUTABLE "NOT-FOUND" CACHE PATH "Path to RISC-V pkg-config") -# Don't run the linker on compiler check -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) - set(CMAKE_SHARED_LINKER_FLAGS_INIT "-L${CMAKE_SYSROOT}/lib") set(CMAKE_EXE_LINKER_FLAGS_INIT "-L${CMAKE_SYSROOT}/lib") set(CMAKE_MODULE_LINKER_FLAGS_INIT "-L${CMAKE_SYSROOT}/lib") diff --git a/cmake/toolchains/riscv64.linux.toolchain.cmake b/cmake/toolchains/riscv64.linux.toolchain.cmake new file mode 100644 index 00000000000000..cb088f5eca5052 --- /dev/null +++ b/cmake/toolchains/riscv64.linux.toolchain.cmake @@ -0,0 +1,13 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +# Install compiler on debian using: +# apt-get install -y gcc-riscv64-linux-gnu g++-riscv64-linux-gnu binutils-riscv64-linux-gnu + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(CMAKE_C_COMPILER riscv64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER riscv64-linux-gnu-g++) +set(CMAKE_STRIP riscv64-linux-gnu-strip) diff --git a/docs/dev/build_riscv64.md b/docs/dev/build_riscv64.md index 13ab9e98d56958..19d6e8714b45fc 100644 --- a/docs/dev/build_riscv64.md +++ b/docs/dev/build_riscv64.md @@ -16,15 +16,23 @@ The software was validated on the following devices: - Python 3.10 for OpenVINO Runtime Python API ## How to build +Currently, there are three ways to build OpenVINO Runtime for 64-bit RISC-V platforms: + +1. **Recommended**. The build with vectorized (using RVV instructions) primitives for limited scope of operations from [`SHL`](https://github.com/XUANTIE-RV/csi-nn2) using [`xuantie-gnu-toolchain`](https://github.com/XUANTIE-RV/). This GNU Compiler Toolchain supports RVV 0.7.1, ratified RVV 1.0 and Xuantie-specific instruction sets. The vector intrinsics don't use the common prefix `__riscv_`. This method provides the best performance available at the moment. +2. The build without optimized primitives using [`riscv-gnu-toolchain`](https://github.com/riscv-collab/riscv-gnu-toolchain.git). This GNU Compiler Toolchain supports RVV 0.7.1 and ratified RVV 1.0. The vector intrinsics use the common prefix `__riscv_`. However, as mentioned earlier, this build method doesn't yet provide optimized primitives implemented using the RVV intrinsics. +3. The build without optimized primitives using installed Linux packages. 
The compilers in these packages don't support RVV intrinsics. + +### Steps + 0. Prerequisite: -- For target with RVV - build `xuantie-gnu-toolchain` and `qemu`: +- For target with vectorized primitives from `SHL` - build `xuantie-gnu-toolchain` and `qemu`: ```sh git clone https://github.com/XUANTIE-RV/xuantie-gnu-toolchain.git cd xuantie-gnu-toolchain ./configure --prefix= make linux build-qemu -j$(nproc) ``` -- For target without RVV - build `riscv-gnu-toolchain`: +- For target without optimized primitives using `riscv-gnu-toolchain`: ```sh git clone https://github.com/riscv-collab/riscv-gnu-toolchain.git cd riscv-gnu-toolchain @@ -32,6 +40,11 @@ The software was validated on the following devices: make linux build-qemu -j$(nproc) ``` > **NOTE**: The `build-qemu` target is optional, as it is used to build the `qemu` simulator. However, it is recommended to build the `qemu` simulator, since it is much more convenient to validate the software on your host than on your devices. More information can be seen [here](https://github.com/riscv-collab/riscv-gnu-toolchain). +- For target without optimized primitives using installed Linux packages: + ```sh + apt-get update + apt-get install -y gcc-riscv64-linux-gnu g++-riscv64-linux-gnu binutils-riscv64-linux-gnu + ``` 1. Clone OpenVINO repository and init submodules: ```sh @@ -50,8 +63,8 @@ The software was validated on the following devices: mkdir build && cd build ``` -4. To cross compile OpenVINO Runtime for RISC-V devices, run `cmake` with specified `CMAKE_TOOLCHAIN_FILE` and `RISCV_TOOLCHAIN_ROOT`. -- For target with RVV: +4. To cross compile OpenVINO Runtime for RISC-V devices, run `cmake` with specified `CMAKE_TOOLCHAIN_FILE` and `RISCV_TOOLCHAIN_ROOT` (the last one is needed only for build using GNU toolchain). +- For target with vectorized primitives from `SHL`: ```sh cmake .. \ -DCMAKE_BUILD_TYPE=Release \ @@ -59,8 +72,8 @@ The software was validated on the following devices: -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/ \ -DRISCV_TOOLCHAIN_ROOT= ``` - > **NOTE**: To build OpenVINO Runtime for different versions of RVV, you just need to specify corresponding toolchain files. For exmaple, you can replace `` with `riscv64-071-thead-gnu.toolchain.cmake` for RVV 0.7.1 and `riscv64-100-thead-gnu.toolchain.cmake` for RVV 1.0 respectively. -- For target without RVV: + > **NOTE**: To build OpenVINO Runtime for different versions of RVV, you just need to specify corresponding toolchain files. For example, you can replace `` with `riscv64-071-xuantie-gnu.toolchain.cmake` for RVV 0.7.1 and `riscv64-100-xuantie-gnu.toolchain.cmake` for RVV 1.0 respectively. +- For target without optimized primitives using `riscv-gnu-toolchain`: ```sh cmake .. \ -DCMAKE_BUILD_TYPE=Release \ @@ -69,7 +82,13 @@ The software was validated on the following devices: -DRISCV_TOOLCHAIN_ROOT=/opt/riscv ``` > **NOTE**: The `riscv-gnu-toolchain` is build as there are essential files used for cross compilation under `/opt/riscv/sysroot`. The latest stable versions of Clang or GCC both support compiling source code into RISC-V instructions, so it is acceptable to choose your preferable compilers by specifying `-DCMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`. But remember to add the key `-DCMAKE_SYSROOT=/opt/riscv/sysroot`, otherwise many fundamental headers and libs could not be found during cross compilation. - +- For target without optimized primitives using installed Linux packages: + ```sh + cmake .. 
\ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX= \ + -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/riscv64.linux.toolchain.cmake + ``` > **NOTE**: By default OpenVINO is built with OpenMP support on RISC-V devices. Then run `make` to build the project: diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 6965b7a25ce512..04909c7d8f5a5a 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -143,7 +143,7 @@ else() endif() ov_option(ENABLE_MLAS_FOR_CPU "Enable MLAS for OpenVINO CPU Plugin" ${ENABLE_MLAS_FOR_CPU_DEFAULT}) -if(RISCV64_THEAD) +if(RISCV64_XUANTIE) set(ENABLE_SHL_FOR_CPU_DEFAULT ON) else() set(ENABLE_SHL_FOR_CPU_DEFAULT OFF) diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn index 1ce2d722922efb..c60a9946aa2386 160000 --- a/src/plugins/intel_cpu/thirdparty/onednn +++ b/src/plugins/intel_cpu/thirdparty/onednn @@ -1 +1 @@ -Subproject commit 1ce2d722922efb80da52a6efe2152a9aecdddebf +Subproject commit c60a9946aa2386890e5c9f5587974facb7624227 From 6f001e9a37d8a3c2264733989f93abd134b58d68 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wysocki Date: Mon, 28 Oct 2024 07:17:41 +0100 Subject: [PATCH 008/120] [PyOV] Migrate from direct setup.py calls in Benchmark App and Docs CI (#27220) ### Details: - Remove direct `setup.py` calls from comments in cmake files - Change the docs building command in GHA - Migrate Benchmark App wheel building command to use `python -m pip wheel .` - Since `openvino_dev` is deprecated and set to be removed soon anyway, the `constraints.txt` file has been unpinned from Benchmark App requirements. In my opinion this simplifies the code significantly with no drawbacks, because `numpy` version for benchmark_app (the only dependency) is not even specified in `constraints.txt`, which will soon be removed. ### Tickets: - CVS-155956 - CVS-155957 --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/build_doc.yml | 2 +- .../packaging/common-libraries.cmake | 4 +- .../packaging/debian/debian.cmake | 4 +- .../developer_package/packaging/rpm/rpm.cmake | 4 +- docs/documentation_build_instructions.md | 2 +- docs/openvino_sphinx_theme/README.md | 3 +- tools/benchmark_tool/requirements.txt | 1 - tools/benchmark_tool/setup.py | 87 ++----------------- 8 files changed, 17 insertions(+), 90 deletions(-) diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 6623f5ea182da1..6f38ff0214bfaa 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -41,7 +41,7 @@ jobs: - name: Install python dependencies run: | python3 -m pip install -r docs/requirements.txt - (cd docs/openvino_sphinx_theme && python3 setup.py install) + (cd docs/openvino_sphinx_theme && python3 -m pip install .) 
python3 -m pip install docs/openvino_custom_sphinx_sitemap - name: Download and install doxygen diff --git a/cmake/developer_package/packaging/common-libraries.cmake b/cmake/developer_package/packaging/common-libraries.cmake index 0ec054da853e2c..4ac0124d3089f0 100644 --- a/cmake/developer_package/packaging/common-libraries.cmake +++ b/cmake/developer_package/packaging/common-libraries.cmake @@ -30,7 +30,7 @@ macro(ov_common_libraries_cpack_set_dirs) ov_get_pyversion(pyversion) if(pyversion) - # should not be used in production; only by setup.py install + # should not be used in production; only by pip install set(OV_CPACK_PYTHONDIR lib/${pyversion}/site-packages) endif() @@ -94,7 +94,7 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) set(OV_CPACK_COMP_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) - # we don't pack artifacts of setup.py install, because it's called explicitly in conda / brew + # we don't pack artifacts of pip install, because it's called explicitly in conda / brew # or not used at all like in cases with conan / vcpkg set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) # we don't need wheels in the distribution packages diff --git a/cmake/developer_package/packaging/debian/debian.cmake b/cmake/developer_package/packaging/debian/debian.cmake index 2b95fcfde5c145..a23d5290044e3d 100644 --- a/cmake/developer_package/packaging/debian/debian.cmake +++ b/cmake/developer_package/packaging/debian/debian.cmake @@ -95,12 +95,12 @@ macro(ov_define_component_include_rules) endif() # python if(ENABLE_PYTHON_PACKAGING) - # pack artifacts of setup.py install + # pack artifacts of pip install unset(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL) else() set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL EXCLUDE_FROM_ALL) endif() - # we don't pack python components itself, we pack artifacts of setup.py install + # we don't pack python components itself, we pack artifacts of pip install set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) set(OV_CPACK_COMP_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) diff --git a/cmake/developer_package/packaging/rpm/rpm.cmake b/cmake/developer_package/packaging/rpm/rpm.cmake index 45d9b0c0ca2121..bb4e7942d7640b 100644 --- a/cmake/developer_package/packaging/rpm/rpm.cmake +++ b/cmake/developer_package/packaging/rpm/rpm.cmake @@ -86,12 +86,12 @@ macro(ov_define_component_include_rules) endif() # python if(ENABLE_PYTHON_PACKAGING) - # pack artifacts of setup.py install + # pack artifacts of pip install unset(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL) else() set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL EXCLUDE_FROM_ALL) endif() - # we don't pack python components itself, we pack artifacts of setup.py install + # we don't pack python components itself, we pack artifacts of pip install set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) set(OV_CPACK_COMP_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) diff --git a/docs/documentation_build_instructions.md b/docs/documentation_build_instructions.md index d9219454b86a19..a1412cfc3f358d 100644 --- a/docs/documentation_build_instructions.md +++ 
b/docs/documentation_build_instructions.md @@ -26,7 +26,7 @@ $ source env/bin/activate ``` 5. Install the sphinx theme ``` -(env) $ cd docs/openvino_sphinx_theme && python setup.py install && cd - +(env) $ python -m pip install docs/openvino_sphinx_theme `````` 6. Install the custom sphinx sitemap ``` diff --git a/docs/openvino_sphinx_theme/README.md b/docs/openvino_sphinx_theme/README.md index 7931c481c308aa..2e82fa06e8c185 100644 --- a/docs/openvino_sphinx_theme/README.md +++ b/docs/openvino_sphinx_theme/README.md @@ -4,7 +4,8 @@ 1. Install the `openvino_sphinx_theme` using `python`: ``` -python setup.py install --user +cd openvino/docs/openvino_sphinx_theme +python -m pip install --user . ``` 2. Update the `html_theme` variable in your `conf.py`: diff --git a/tools/benchmark_tool/requirements.txt b/tools/benchmark_tool/requirements.txt index f8f5f92cddeb53..5594ff31cd35c5 100644 --- a/tools/benchmark_tool/requirements.txt +++ b/tools/benchmark_tool/requirements.txt @@ -1,2 +1 @@ --c ../constraints.txt numpy>=1.16.6,<2.1.0 diff --git a/tools/benchmark_tool/setup.py b/tools/benchmark_tool/setup.py index 0df9a9bc92379a..98f2a369a60360 100644 --- a/tools/benchmark_tool/setup.py +++ b/tools/benchmark_tool/setup.py @@ -6,92 +6,18 @@ """ Use this script to create a wheel with OpenVINO™ Python* tools: -$ python setup.py sdist bdist_wheel +$ python -m pip wheel . """ -import pkg_resources -import re from setuptools import setup, find_packages -from pathlib import Path -from typing import Dict, List +with open('requirements.txt', 'r', encoding='utf-8') as f: + raw_contents = f.readlines() + reqs = [line.strip() for line in raw_contents] + with open('README.md', 'r', encoding='utf-8') as f: long_description = f.read() - -def read_constraints(path: str='../constraints.txt') -> Dict[str, List[str]]: - """ - Read a constraints.txt file and return a dict - of {package_name: [required_version_1, required_version_2]}. - The dict values are a list because a package can be mentioned - multiple times, for example: - mxnet~=1.2.0; sys_platform == 'win32' - mxnet>=1.7.0; sys_platform != 'win32' - """ - constraints = {} - with open(Path(__file__).resolve().parent / path) as f: - raw_constraints = f.readlines() - for line in raw_constraints: - # skip comments - if line.startswith('#'): - continue - line = line.replace('\n', '') - # read constraints for that package - package, delimiter, constraint = re.split('(~|=|<|>|;)', line, maxsplit=1) - # if there is no entry for that package, add it - if constraints.get(package) is None: - constraints[package] = [delimiter + constraint] - # else add another entry for that package - else: - constraints[package].extend([delimiter + constraint]) - return constraints - - -def read_requirements(path: str) -> List[str]: - """ - Read a requirements.txt file and return a list - of requirements. Three cases are supported, the - list corresponds to priority: - 1. version specified in requirements.txt - 2. version specified in constraints.txt - 3. version unbound - - Putting environment markers into constraints.txt is prone to bugs. - They should be specified in requirements.txt files. 
- """ - requirements = [] - constraints = read_constraints() - with open(Path(__file__).resolve().parent / path) as f: - raw_requirements = f.readlines() - for line in raw_requirements: - # skip comments and constraints link - if line.startswith(('#', '-c')): - continue - # get rid of newlines - line = line.replace('\n', '') - # if version is specified (non-word chars present) - package_constraint = constraints.get(line.split(';')[0]) - if re.search('(~|=|<|>)', line) and len(line.split(';'))>1: - if package_constraint: # both markers and versions specified - marker_index = line.find(";") - # insert package version between package name and environment markers - line = line[:marker_index] \ - + ",".join([constraint for constraint in package_constraint]) \ - + line[marker_index:] - requirements.append(line) - # else get version from constraints - else: - constraint = constraints.get(line) - # if version found in constraints.txt - if constraint: - for marker in constraint: - requirements.append(line+marker) - # else version is unbound - else: - requirements.append(line) - return requirements - - setup( name='benchmark_tool', version='0.0.0', @@ -111,6 +37,7 @@ def read_requirements(path: str) -> List[str]: 'Operating System :: OS Independent', ], packages=find_packages(), - install_requires=read_requirements('requirements.txt'), + install_requires=reqs, + data_files=[('.', ['requirements.txt'])], python_requires='>=3.9', ) From f5ef9b64d8de41995d2db44ce278771b353829e6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:48:25 +0000 Subject: [PATCH 009/120] Bump actions/setup-node from 4.0.4 to 4.1.0 (#27266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/setup-node](https://github.com/actions/setup-node) from 4.0.4 to 4.1.0.
Release notes, sourced from actions/setup-node's releases: v4.1.0.
Full Changelog: https://github.com/actions/setup-node/compare/v4...v4.1.0
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/setup-node&package-manager=github_actions&previous-version=4.0.4&new-version=4.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---
Dependabot commands and options:

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/job_openvino_js.yml | 2 +- .github/workflows/windows_vs2019_release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/job_openvino_js.yml b/.github/workflows/job_openvino_js.yml index 6097d3e6f18bc4..ecb278fdb54ca3 100644 --- a/.github/workflows/job_openvino_js.yml +++ b/.github/workflows/job_openvino_js.yml @@ -52,7 +52,7 @@ jobs: - name: Setup Node ${{ env.NODE_VERSION }} if: runner.os != 'Linux' # Node is already installed in the Docker image - uses: actions/setup-node@0a44ba7841725637a19e28fa30b79a866c81b0a6 # v4.0.4 + uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 with: node-version: ${{ env.NODE_VERSION }} diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 1c84db5dcda530..b9b8fa76d37c34 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -189,7 +189,7 @@ jobs: path: ${{ env.OPENVINO_JS_LIBS_DIR }} - name: Setup Node ${{ env.NODE_VERSION }} - uses: actions/setup-node@0a44ba7841725637a19e28fa30b79a866c81b0a6 # v4.0.4 + uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0 with: node-version: ${{ env.NODE_VERSION }} From 08365f6fddc0449b11e50da986112c95b208fa9a Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Mon, 28 Oct 2024 09:59:07 +0100 Subject: [PATCH 010/120] [STFT] Pytorch FE enablement of STFT operation (#27186) ### Details: - Pytorch FE enablement of STFT op - This PR contains STFT CPU enablement changes (to be merged first); please review only the PyTorch-related changes within this PR: src/frontends/pytorch/src/op/stft.cpp + test_stft.py ### Tickets: - 155166, 143017 Related PR: - https://github.com/openvinotoolkit/openvino/pull/27137 --------- Co-authored-by: Michal Lukaszewski --- src/frontends/pytorch/src/op/stft.cpp | 93 +++++++++++ src/frontends/pytorch/src/op_table.cpp | 2 + tests/layer_tests/pytorch_tests/test_stft.py | 156 +++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 src/frontends/pytorch/src/op/stft.cpp create mode 100644 tests/layer_tests/pytorch_tests/test_stft.py
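For context, a minimal end-to-end sketch (hypothetical module and shapes, not part of this PR) of the `torch.stft` call pattern the new conversion currently supports, exercised through `ov.convert_model`:

```python
import torch
import openvino as ov

class STFTModel(torch.nn.Module):
    def forward(self, x, window):
        # Only this combination is currently converted: center=False,
        # normalized=False, onesided=True, return_complex=False.
        return torch.stft(x, n_fft=16, hop_length=4, win_length=16,
                          window=window, center=False, normalized=False,
                          onesided=True, return_complex=False)

# A 2D signal is required; 1D inputs are not supported yet (see the tests below).
example = (torch.randn(1, 256), torch.hann_window(16))
ov_model = ov.convert_model(STFTModel(), example_input=example)
```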
diff --git a/src/frontends/pytorch/src/op/stft.cpp b/src/frontends/pytorch/src/op/stft.cpp new file mode 100644 index 00000000000000..b7e4858c2f8fcc --- /dev/null +++ b/src/frontends/pytorch/src/op/stft.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/op/stft.hpp" + +#include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert_like.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/unsqueeze.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +OutputVector translate_stft(const NodeContext& context) { + // schema: aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool + // normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor + // + // Note: aten::stft doesn't have "center" and "pad_mode" attrs like torch.stft, so the number of the inputs is lower + // and the index of any input after the "window" is smaller accordingly + + num_inputs_check(context, 2, 8); + + auto input = context.get_input(0); + auto n_fft = context.get_input(1); + + ov::Output hop_length; + if (!context.input_is_none(2)) { + hop_length = context.get_input(2); + } else { + // Default floor(n_fft / 4) + const auto four = context.mark_node(std::make_shared(ov::element::i32, Shape{}, 4)); + const auto four_cast = context.mark_node(std::make_shared(four, n_fft)); + hop_length = context.mark_node(std::make_shared(n_fft, four_cast)); + } + + ov::Output win_length; + if (!context.input_is_none(3)) { + win_length = context.get_input(3); + } else { + win_length = n_fft; + } + + ov::Output window; + if (!context.input_is_none(4)) { + window = context.get_input(4); + } else { + const auto one = context.mark_node(std::make_shared(ov::element::i32, Shape{}, 1)); + const auto one_cast = context.mark_node(std::make_shared(one, input)); + const auto zero = context.mark_node(std::make_shared(ov::element::i32, Shape{1}, 0)); + const auto win_length_cast = + context.mark_node(std::make_shared(win_length, ov::element::i64)); + const auto win_len_vec = context.mark_node(std::make_shared(win_length_cast, zero)); + window = context.mark_node(std::make_shared(one_cast, win_len_vec)); + } + + bool normalized = false; + if (!context.input_is_none(5)) { + normalized = context.const_input(5); + } + PYTORCH_OP_CONVERSION_CHECK(!normalized, + "aten::stft conversion is currently supported with normalized=False only."); + + bool onesided = true; + if (!context.input_is_none(6)) { + onesided = context.const_input(6); + } + PYTORCH_OP_CONVERSION_CHECK(onesided, "aten::stft conversion is currently supported with onesided=True only."); + + bool return_complex = false; + if (!context.input_is_none(7)) { + return_complex = context.const_input(7); + } + PYTORCH_OP_CONVERSION_CHECK(!return_complex, + "aten::stft conversion is currently supported with return_complex=False only."); + + // Perform STFT + constexpr bool transpose_frames = true; + auto stft = context.mark_node(std::make_shared(input, window, n_fft, hop_length, transpose_frames)); + return {stft}; +}; +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 66c76e33032ef6..9b7e6134cd24ab 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -220,6 +220,7 @@ OP_CONVERTER(translate_square); OP_CONVERTER(translate_squeeze); OP_CONVERTER(translate_std); OP_CONVERTER(translate_std_mean); +OP_CONVERTER(translate_stft); OP_CONVERTER(translate_sub); OP_CONVERTER(translate_sub_); OP_CONVERTER(translate_sum); @@ -649,6 +650,7 @@ const std::unordered_map get_supported_ops_ts() { // aten::stack - Supported in limited set of patterns {"aten::std", op::translate_std}, {"aten::std_mean", op::translate_std_mean}, + {"aten::stft", op::translate_stft}, {"aten::sub", op::translate_sub}, {"aten::sub_", op::translate_sub_}, {"aten::sum", op::translate_sum}, diff --git a/tests/layer_tests/pytorch_tests/test_stft.py b/tests/layer_tests/pytorch_tests/test_stft.py new file mode 100644 index 00000000000000..29d6b94efbfd37 --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_stft.py @@ -0,0 +1,156 @@ +# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from pytorch_layer_test_class import PytorchLayerTest + + +class TestSTFT(PytorchLayerTest): + def _prepare_input(self, win_length, signal_shape, rand_data=False, out_dtype="float32"): + import numpy as np + + if rand_data: + signal = np.random.randn(*signal_shape).astype(out_dtype) + else: + num_samples = signal_shape[-1] + half_idx = num_samples // 2 + t = np.linspace(0, 1, num_samples) + signal = np.sin(2 * np.pi * 5 * t) + signal[half_idx:] += np.sin(2 * np.pi * 10 * t[half_idx:]) + signal = np.broadcast_to(signal, signal_shape).astype(out_dtype) + + window = np.hanning(win_length).reshape([win_length]) + + return (signal, window.astype(out_dtype)) + + def create_model(self, n_fft, hop_length, win_length): + import torch + + class aten_stft(torch.nn.Module): + + def __init__(self, n_fft, hop_length, win_length): + super(aten_stft, self).__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + + def forward(self, x, window): + return torch.stft( + x, + self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=window, + center=False, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + ref_net = None + + return aten_stft(n_fft, hop_length, win_length), ref_net, "aten::stft" + + @pytest.mark.nightly + @pytest.mark.precommit + @pytest.mark.parametrize(("trace_model"), [True, False]) + @pytest.mark.parametrize(("signal_shape"), [(1, 256), (2, 128), (128,)]) + @pytest.mark.parametrize(("n_fft", "hop_length", "window_size"), [ + [16, 4, 16], + [32, 32, 32], + [32, 16, 24], + [24, 32, 20], + [128, 128, 128], + ]) + def test_stft(self, n_fft, hop_length, window_size, signal_shape, ie_device, precision, ir_version, trace_model): + if ie_device == "GPU": + pytest.xfail(reason="STFT op is not supported on GPU yet") + if signal_shape == (128,): + pytest.xfail(reason="STFT op doesn't support 1D signals yet, please unsqueeze the input.") + self._test(*self.create_model(n_fft, hop_length, window_size), ie_device, precision, + ir_version, kwargs_to_prepare_input={"win_length": window_size, "signal_shape": signal_shape}, trace_model=trace_model) + + +class TestSTFTAttrs(PytorchLayerTest): + def _prepare_input(self, out=False, out_dtype="float32"): + import numpy as np + + signal = np.random.randn(2, 512).astype(out_dtype) + return (signal,) + + def create_model_with_attrs(self, n_fft, hop_length, win_length, center, pad_mode, normalized, onesided, return_complex): + import torch + + class aten_stft_attrs(torch.nn.Module): + + def __init__(self, n_fft, hop_length, win_length, center, pad_mode, normalized, onesided, return_complex): + super(aten_stft_attrs, self).__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.window = None # Default window + self.center = center + self.pad_mode = pad_mode + self.normalized = normalized + self.onesided = onesided + self.return_complex = return_complex + + def forward(self, x): + return torch.stft( + x, + self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode=self.pad_mode, + normalized=self.normalized, + onesided=self.onesided, + return_complex=self.return_complex, + ) + + ref_net = None + + return aten_stft_attrs(n_fft, hop_length, win_length, center, pad_mode, normalized, onesided, return_complex), ref_net, "aten::stft" + + @pytest.mark.nightly + @pytest.mark.precommit +
@pytest.mark.parametrize(("trace_model"), [True, False]) + @pytest.mark.parametrize(("n_fft", "hop_length", "win_length", "center", "pad_mode", "normalized", "onesided", "return_complex"), [ + [16, 4, 16, False, "reflect", False, True, False], # default window + [16, 4, 14, True, "reflect", False, True, False], # center True + [16, 4, 14, True, "reflect", False, True, False], # center True + [16, 4, 14, True, "replicate", False, True, False], # center True + [16, 4, 14, False, "replicate", False, True, False], # center False + [16, None, 16, False, "reflect", False, True, False], # hop_length None + [16, None, None, False, "reflect", False, True, False], # hop & win length None + [16, 4, None, False, "reflect", False, True, False], # win_length None + # Unsupported cases: + [16, 4, 16, False, "reflect", True, True, False], # normalized True + [16, 4, 16, False, "reflect", False, False, False], # onesided False + [16, 4, 16, False, "reflect", False, True, True], # return_complex True + ]) + def test_stft_not_supported_attrs(self, n_fft, hop_length, win_length, center, pad_mode, normalized, onesided, return_complex, ie_device, precision, ir_version, trace_model): + if ie_device == "GPU": + pytest.xfail(reason="STFT op is not supported on GPU yet") + + if center is True and trace_model is False: + pytest.xfail( + reason="torch stft uses list() for `center` subgraph before aten::stft, which leads to error: No conversion rule found for operations: aten::list") + + if normalized is True: + pytest.xfail( + reason="aten::stft conversion is currently supported with normalized=False only") + + if onesided is False: + pytest.xfail( + reason="aten::stft conversion is currently supported with onesided=True only") + + if return_complex is True: + pytest.xfail( + reason="aten::stft conversion is currently supported with return_complex=False only") + + self._test(*self.create_model_with_attrs(n_fft, hop_length, win_length, center, pad_mode, normalized, onesided, return_complex), ie_device, precision, + ir_version, kwargs_to_prepare_input={}, trace_model=trace_model) From 30a52e8f6c2db2d2da83fb4578fbe111ac20e5db Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Mon, 28 Oct 2024 10:01:06 +0100 Subject: [PATCH 011/120] [TESTS] Update pytorch version to 2.5.0 (#27117) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- src/frontends/pytorch/src/op_table.cpp | 1 + tests/constraints.txt | 2 +- tests/requirements_pytorch | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 9b7e6134cd24ab..8e490a60ffa580 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -748,6 +748,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten._native_batch_norm_legit.no_stats", op::translate_batch_norm_legit_no_stats_fx}, {"aten._native_batch_norm_legit_functional.default", op::translate_batch_norm_legit_fx}, {"aten._native_batch_norm_legit_no_training.default", op::translate_batch_norm_legit_no_training_fx}, + {"aten._safe_softmax.default", op::translate_softmax_fx}, {"aten._scaled_dot_product_flash_attention.default", op::translate_scaled_dot_product_attention_fx}, {"aten._scaled_dot_product_flash_attention_for_cpu.default", op::translate_scaled_dot_product_attention_fx}, {"aten._softmax.default", op::translate_softmax_fx}, diff --git a/tests/constraints.txt b/tests/constraints.txt index b800d289ce1547..c6e2e5e65f96fe 100644 --- a/tests/constraints.txt +++ 
b/tests/constraints.txt @@ -28,5 +28,5 @@ networkx<=3.3 flax<=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" +torch~=2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" torch~=2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index d4e59c40f56f61..c2873210003b7d 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -3,12 +3,14 @@ # optimum still requires numpy<2.0.0 numpy==1.26.4; python_version < "3.12" numpy==2.1.1; python_version >= "3.12" -torch==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" +torch==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.19.1; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.20.0; platform_system != "Darwin" or platform_machine != "x86_64" torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" +torchaudio==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 @@ -33,8 +35,6 @@ sentencepiece==0.2.0 soundfile==0.12.1 super-image==0.1.7; python_version < "3.12" timm==1.0.11 -torchaudio==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" -torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" wheel==0.44.0 PyYAML==6.0.2 kornia==0.7.3 From 443078c11b505e5acc12c10fe2016c32a6033871 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:20:01 +0000 Subject: [PATCH 012/120] Bump actions/setup-python from 5.2.0 to 5.3.0 (#27267) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.2.0 to 5.3.0.
Release notes

Sourced from actions/setup-python's releases.

v5.3.0

Full Changelog: https://github.com/actions/setup-python/compare/v5...v5.3.0

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/setup-python&package-manager=github_actions&previous-version=5.2.0&new-version=5.3.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/build_doc.yml | 2 +- .github/workflows/coverage.yml | 2 +- .github/workflows/job_gpu_tests.yml | 2 +- .github/workflows/mo.yml | 2 +- .github/workflows/ovc.yml | 2 +- .github/workflows/py_checks.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_doc.yml b/.github/workflows/build_doc.yml index 6f38ff0214bfaa..8c78375e61769c 100644 --- a/.github/workflows/build_doc.yml +++ b/.github/workflows/build_doc.yml @@ -29,7 +29,7 @@ jobs: packages: graphviz texlive liblua5.2-0 libclang1-9 libclang-cpp9 version: 3.0 - - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 id: cp310 with: python-version: '3.10' diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index fdb41226f4efb8..db5ba3de1a3c85 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -20,7 +20,7 @@ jobs: steps: - name: Setup python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.10.10' architecture: 'x64' diff --git a/.github/workflows/job_gpu_tests.yml b/.github/workflows/job_gpu_tests.yml index b9862eac09cc05..195abbbd5fb0f9 100644 --- a/.github/workflows/job_gpu_tests.yml +++ b/.github/workflows/job_gpu_tests.yml @@ -74,7 +74,7 @@ jobs: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input TZ: "Europe/London" # to prevent tzdata from waiting user input - name: Setup Python ${{ env.PYTHON_VERSION }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ env.PYTHON_VERSION }} diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index 2405103755b552..f48986d4a0d304 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -27,7 +27,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.10' diff --git a/.github/workflows/ovc.yml b/.github/workflows/ovc.yml index 7d18643def3ce6..4d69563a741d3a 100644 --- a/.github/workflows/ovc.yml +++ b/.github/workflows/ovc.yml @@ -22,7 +22,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.10' diff --git a/.github/workflows/py_checks.yml b/.github/workflows/py_checks.yml index 13ddbaaa1ec41c..caed37eee89056 100644 --- a/.github/workflows/py_checks.yml +++ b/.github/workflows/py_checks.yml @@ -31,7 +31,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.9' From 7a80fe83ef07651ba60557fe5022bd9277615a5f Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Mon, 28 
Oct 2024 02:59:41 -0700 Subject: [PATCH 013/120] [GPU] Support GeLU Tanh for Phi-2 (#27213) ### Details: - Previously GeLU Tanh was supported only for x * (0.5 * (1 + tanh)) - Support the pattern (x * 0.5) * (1 + tanh) too. ### Tickets: - 155576 --- .../common_optimizations/gelu_fusion.cpp | 36 ++++++++++-------- .../common_optimizations/gelu_fusion.cpp | 38 +++++++++++++++++++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/common/transformations/src/transformations/common_optimizations/gelu_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/gelu_fusion.cpp index 221484c75cccde..8d075f4a727758 100644 --- a/src/common/transformations/src/transformations/common_optimizations/gelu_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/gelu_fusion.cpp @@ -22,6 +22,7 @@ #include "openvino/op/parameter.hpp" #include "openvino/op/power.hpp" #include "openvino/op/tanh.hpp" +#include "openvino/pass/pattern/op/or.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "transformations/utils/utils.hpp" @@ -280,9 +281,16 @@ ov::pass::GeluFusionWithTanh::GeluFusionWithTanh() { auto add_1 = ov::pass::pattern::wrap_type({tanh, add_1_constant}); auto mul_2_constant = ov::pass::pattern::wrap_type(); - auto mul_2 = ov::pass::pattern::wrap_type({add_1, mul_2_constant}); - auto mul_3 = ov::pass::pattern::wrap_type({input, mul_2}); + // x * (0.5 * (1 + tanh)) + auto mul_2_1 = ov::pass::pattern::wrap_type({add_1, mul_2_constant}); + auto mul_3_1 = ov::pass::pattern::wrap_type({input, mul_2_1}); + + // (x * 0.5) * (1 + tanh) + auto mul_2_2 = ov::pass::pattern::wrap_type({input, mul_2_constant}); + auto mul_3_2 = ov::pass::pattern::wrap_type({add_1, mul_2_2}); + + auto mul_3 = std::make_shared(OutputVector{mul_3_1, mul_3_2}); ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); @@ -298,7 +306,6 @@ ov::pass::GeluFusionWithTanh::GeluFusionWithTanh() { ov::as_type_ptr(pattern_to_output.at(mul_2_constant).get_node_shared_ptr()); auto add_1_constant_value = ov::as_type_ptr(pattern_to_output.at(add_1_constant).get_node_shared_ptr()); - if (!pow_constant_value || !add_1_constant_value || !mul_0_constant_value || !mul_1_constant_value || !mul_2_constant_value) { return false; @@ -318,18 +325,17 @@ ov::pass::GeluFusionWithTanh::GeluFusionWithTanh() { auto gelu = std::make_shared(x_output, op::GeluApproximationMode::TANH); gelu->set_friendly_name(m.get_match_root()->get_friendly_name()); - ov::copy_runtime_info( - { - pattern_to_output.at(pow).get_node_shared_ptr(), - pattern_to_output.at(mul_0).get_node_shared_ptr(), - pattern_to_output.at(mul_1).get_node_shared_ptr(), - pattern_to_output.at(mul_2).get_node_shared_ptr(), - pattern_to_output.at(mul_3).get_node_shared_ptr(), - pattern_to_output.at(tanh).get_node_shared_ptr(), - pattern_to_output.at(add_0).get_node_shared_ptr(), - pattern_to_output.at(add_1).get_node_shared_ptr(), - }, - gelu); + + std::vector> pattern_nodes = + {pow, mul_0, mul_1, tanh, add_0, add_1, mul_2_1, mul_2_2, mul_3_1, mul_3_2}; + std::vector> cp_rt_info_nodes; + for (const auto& pattern_node : pattern_nodes) { + if (pattern_to_output.count(pattern_node)) { + cp_rt_info_nodes.push_back(pattern_to_output.at(pattern_node).get_node_shared_ptr()); + } + } + ov::copy_runtime_info(cp_rt_info_nodes, gelu); + ov::replace_node(m.get_match_root(), gelu); return true; };
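Both factorizations compute the same tanh GELU approximation; a quick numeric sanity check (illustrative sketch, not part of this PR):

```python
import math
import torch

x = torch.randn(4, 8)
inner = torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3))
v1 = x * (0.5 * (1.0 + inner))   # pattern matched before this change
v2 = (x * 0.5) * (1.0 + inner)   # pattern added by this change (Phi-2 style)
ref = torch.nn.functional.gelu(x, approximate="tanh")
assert torch.allclose(v1, v2) and torch.allclose(v1, ref, atol=1e-6)
```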
diff --git a/src/common/transformations/tests/common_optimizations/gelu_fusion.cpp b/src/common/transformations/tests/common_optimizations/gelu_fusion.cpp index 837d2ba6d4597e..dbc54f5492bffa 100644 --- a/src/common/transformations/tests/common_optimizations/gelu_fusion.cpp +++ b/src/common/transformations/tests/common_optimizations/gelu_fusion.cpp @@ -388,6 +388,44 @@ TEST_F(TransformationTestsF, GeluFusionTanhWithTanh_epsilon_pow_value) { } } +TEST_F(TransformationTestsF, GeluFusionTanhWithTanh_epsilon_pow_value_2) { + { + auto input = std::make_shared(element::f32, Shape{2, 2}); + auto pow_constant = + std::make_shared(element::f32, Shape{1}, std::vector{3.0f + 1.0e-8f}); + auto pow = std::make_shared(input, pow_constant); + auto mul_0_constant = + std::make_shared(element::f32, Shape{1}, std::vector{0.044715f}); + auto mul_0 = std::make_shared(pow, mul_0_constant); + auto add_0 = std::make_shared(input, mul_0); + + auto mul_1_constant = + std::make_shared(element::f32, + Shape{1}, + std::vector{static_cast(std::sqrt(2.0 / M_PI))}); + auto mul_1 = std::make_shared(add_0, mul_1_constant); + + auto tanh = std::make_shared(mul_1); + + auto add_1_constant = std::make_shared(element::f32, Shape{1}, std::vector{1.0f}); + auto add_1 = std::make_shared(tanh, add_1_constant); + + auto mul_2_constant = std::make_shared(element::f32, Shape{1}, std::vector{0.5f}); + auto mul_2 = std::make_shared(input, mul_2_constant); + + auto mul_3 = std::make_shared(add_1, mul_2); + + model = std::make_shared(NodeVector{mul_3}, ParameterVector{input}); + manager.register_pass(); + } + + { + auto data = std::make_shared(element::f32, Shape{2, 2}); + auto gelu = std::make_shared(data, op::GeluApproximationMode::TANH); + model_ref = std::make_shared(NodeVector{gelu}, ParameterVector{data}); + } +} + TEST_F(TransformationTestsF, GeluFusionTanhWithTanh_wrong_pow_value) { { auto input = std::make_shared(element::f32, Shape{2, 2}); From dd6ed6cdbb99461284885fa0c6b4865410f2ab19 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 10:23:47 +0000 Subject: [PATCH 014/120] Update flake8-comprehensions requirement from <=3.15.0 to <=3.16.0 in /src/bindings/python (#27268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the requirements on [flake8-comprehensions](https://github.com/adamchainz/flake8-comprehensions) to permit the latest version.
Changelog

Sourced from flake8-comprehensions's changelog.

3.16.0 (2024-10-27)

  • Drop Python 3.8 support.

  • Support Python 3.13.

3.15.0 (2024-06-29)

  • Add rule C420 to check for dict comprehensions with constant values, encouraging replacement with dict.fromkeys().

    Thanks to Tom Kuson in PR [#553](https://github.com/adamchainz/flake8-comprehensions/issues/553) <https://github.com/adamchainz/flake8-comprehensions/pull/553>__.

3.14.0 (2023-07-10)

  • Drop Python 3.7 support.

3.13.0 (2023-06-15)

  • Support Python 3.12.

3.12.0 (2023-04-13)

  • Add rule C418 to check for calls passing a dict literal or dict comprehension to dict().

  • Add rule C419 to check for calls passing a list comprehension to any()/all().

3.11.1 (2023-03-21)

  • Fix false positives in C406 “unnecessary dict literal”.

    Fixes Issue [#260](https://github.com/adamchainz/flake8-comprehensions/issues/260) <https://github.com/adamchainz/flake8-comprehensions/issues/260>__.

3.11.0 (2023-03-18)

  • Expand C416 to dict comprehensions.

    Thanks to Aaron Gokaslan in PR [#490](https://github.com/adamchainz/flake8-comprehensions/issues/490) <https://github.com/adamchainz/flake8-comprehensions/pull/490>__.

3.10.1 (2022-10-29)

  • Fix false positive in rules C402 and C404 for dict() calls with keyword arguments.

... (truncated)
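For reference, a hypothetical sketch (not code from this repository) of what the most recently added rules above flag:

```python
values = [0, 1, 2]

# C419: a list comprehension inside any()/all() builds the whole list first;
# a generator expression short-circuits instead.
any([v > 0 for v in values])   # flagged
any(v > 0 for v in values)     # preferred

# C420: a dict comprehension with a constant value duplicates dict.fromkeys().
{v: None for v in values}      # flagged
dict.fromkeys(values)          # preferred
```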

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- src/bindings/python/requirements_test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bindings/python/requirements_test.txt b/src/bindings/python/requirements_test.txt index a9ee93b1d9e9bc..1aa2ff24b1b948 100644 --- a/src/bindings/python/requirements_test.txt +++ b/src/bindings/python/requirements_test.txt @@ -7,7 +7,7 @@ flake8-annotations-complexity<=0.0.8 flake8-broken-line<=1.0.0 flake8-bugbear<=24.8.19 flake8-class-attributes-order<=0.1.3 -flake8-comprehensions<=3.15.0 +flake8-comprehensions<=3.16.0 flake8-debugger<=4.1.2 flake8-docstrings<=1.7.0 flake8-eradicate<=1.5.0 From 837128968bf5c423dd7bd5d92d9167bf833516fc Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Mon, 28 Oct 2024 19:35:48 +0800 Subject: [PATCH 015/120] [NPU] Update adapter and backend to use new level zero ext table (#27095) ### Details: - *use ze_graph_dditable_ext_t to replace local ze_graph_dditable_ext_last_t* - *use ze_graph_ext_version_t as template parameter instead of ze_graph_dditable_ext_1_x* - *update commit of level-zero-ext to use one ddi table* With the current driver and drivers released later, the plugin will use ze_graph_dditable_ext_t and ZE_extension_graph; the version is ZE_GRAPH_EXT_VERSION_CURRENT inside the current level-zero-ext commit, 1.8 for now. With an old driver, the plugin will use ze_graph_dditable_ext_t to reinterpret the old DDI table and use the largest version supported by the driver; the backend and the compiler will work based on ze_graph_ext_version_t. For example, if the largest ext version supported by the driver is 1.7, the plugin will use 1.7. ### Tickets: - *155313* --- .../src/backend/include/zero_types.hpp | 8 +- .../src/backend/src/zero_backend.cpp | 2 +- .../intel_npu/src/backend/src/zero_init.cpp | 42 ++++++++- .../include/zero_compiler_in_driver.hpp | 57 ++++++------ .../compiler/src/driver_compiler_adapter.cpp | 56 ++++++------ .../compiler/src/zero_compiler_in_driver.cpp | 88 +++++++++---------- .../intel_npu/thirdparty/level-zero-ext | 2 +- 7 files changed, 147 insertions(+), 108 deletions(-) diff --git a/src/plugins/intel_npu/src/backend/include/zero_types.hpp b/src/plugins/intel_npu/src/backend/include/zero_types.hpp index 2b8738b245a8d4..43b97b7217f512 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_types.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_types.hpp @@ -13,10 +13,6 @@ #include "intel_npu/config/runtime.hpp" -/** - * @brief Last version of Table of Graph Extension functions used within plugin - */ -using ze_graph_dditable_ext_last_t = ze_graph_dditable_ext_1_8_t; /** * @brief Last version of the Command Queue functions used within plugin */ @@ -34,7 +30,7 @@ using ze_graph_profiling_dditable_ext_last_t = ze_graph_profiling_dditable_ext_t */ struct ze_graph_dditable_ext_decorator final { private: - ze_graph_dditable_ext_last_t* const _impl; + ze_graph_dditable_ext_t* const _impl; const uint32_t _driverExtVersion; ze_graph_dditable_ext_decorator(const ze_graph_dditable_ext_decorator&) = delete; @@ -53,7 +49,7 @@ } public: - ze_graph_dditable_ext_decorator(ze_graph_dditable_ext_last_t* impl, uint32_t driverExtVersion) + ze_graph_dditable_ext_decorator(ze_graph_dditable_ext_t* impl, uint32_t driverExtVersion) : _impl(impl), _driverExtVersion(driverExtVersion), // version 1.0 diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp
b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index d74692400b0d90..86af62d414b88c 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -30,7 +30,7 @@ uint32_t ZeroEngineBackend::getGraphExtVersion() const { } bool ZeroEngineBackend::isBatchingSupported() const { - return _instance->isExtensionSupported(std::string(ZE_GRAPH_EXT_NAME_1_6), ZE_MAKE_VERSION(1, 6)); + return _instance->isExtensionSupported("ZE_extension_graph_1_6", ZE_MAKE_VERSION(1, 6)); } bool ZeroEngineBackend::isCommandQueueExtSupported() const { diff --git a/src/plugins/intel_npu/src/backend/src/zero_init.cpp b/src/plugins/intel_npu/src/backend/src/zero_init.cpp index e418fcc5f58cc2..827450718edaf2 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_init.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_init.cpp @@ -4,6 +4,8 @@ #include "zero_init.hpp" +#include + #include "intel_npu/common/itt.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" @@ -114,14 +116,50 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", // Query our graph extension version std::string graph_ext_name; uint32_t graph_ext_version = 0; + uint32_t target_graph_ext_version = ZE_GRAPH_EXT_VERSION_CURRENT; + +#if defined(NPU_PLUGIN_DEVELOPER_BUILD) + const char* extVersion = std::getenv("NPU_ZE_GRAPH_EXT_VERSION"); + if (extVersion) { + std::string extVersionString(extVersion); + std::regex extVersionRegex(R"(^(\d+)\.(\d+)$)"); + std::smatch match; + + if (std::regex_match(extVersionString, match, extVersionRegex)) { + int major = std::stoi(match[1].str()); + int minor = std::stoi(match[2].str()); + log.debug("Try to find graph ext version: %d.%d instead of %d.%d.", + major, + minor, + ZE_MAJOR_VERSION(target_graph_ext_version), + ZE_MINOR_VERSION(target_graph_ext_version)); + target_graph_ext_version = ZE_MAKE_VERSION(major, minor); + } + } +#endif + log.debug("Try to find graph ext version: %d.%d", + ZE_MAJOR_VERSION(target_graph_ext_version), + ZE_MINOR_VERSION(target_graph_ext_version)); std::tie(graph_ext_version, graph_ext_name) = - queryDriverExtensionVersion(ZE_GRAPH_EXT_NAME, ZE_GRAPH_EXT_VERSION_CURRENT, extProps, count); + queryDriverExtensionVersion(ZE_GRAPH_EXT_NAME, target_graph_ext_version, extProps, count); if (graph_ext_name.empty()) { OPENVINO_THROW("queryGraphExtensionVersion: Failed to find Graph extension in NPU Driver"); } + // Use version that plugin can support as identifier to control workflow + if (graph_ext_version > target_graph_ext_version) { + log.warning("Graph extension version from driver is %d.%d. " + "Larger than plugin max graph ext version %d.%d. 
" + "Force to use plugin ext version with the new table to control flow!", + ZE_MAJOR_VERSION(graph_ext_version), + ZE_MINOR_VERSION(graph_ext_version), + ZE_MAJOR_VERSION(target_graph_ext_version), + ZE_MINOR_VERSION(target_graph_ext_version)); + graph_ext_version = target_graph_ext_version; + } + const uint16_t supported_driver_ext_major_version = 1; const uint16_t driver_ext_major_version = ZE_MAJOR_VERSION(graph_ext_version); if (supported_driver_ext_major_version != driver_ext_major_version) { @@ -166,7 +204,7 @@ ZeroInitStructsHolder::ZeroInitStructsHolder() : log("NPUZeroInitStructsHolder", command_queue_ext_version); // Load our graph extension - ze_graph_dditable_ext_last_t* graph_ddi_table_ext = nullptr; + ze_graph_dditable_ext_t* graph_ddi_table_ext = nullptr; THROW_ON_FAIL_FOR_LEVELZERO("zeDriverGetExtensionFunctionAddress", zeDriverGetExtensionFunctionAddress(driver_handle, graph_ext_name.c_str(), diff --git a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp b/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp index 658f138a72c102..5641408dffcac0 100644 --- a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp +++ b/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp @@ -18,37 +18,34 @@ namespace driverCompilerAdapter { using SerializedIR = std::pair>; -#define NotSupportQuery(T) (std::is_same::value) +#define NotSupportQuery(T) (T == ZE_GRAPH_EXT_VERSION_1_2) // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, // pfnQueryNetworkGetSupportedLayers) -#define SupportAPIGraphQueryNetworkV1(T) \ - (std::is_same::value || std::is_same::value) +#define SupportAPIGraphQueryNetworkV1(T) (T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) #define SupportAPIGraphQueryNetworkV2(T) ((!NotSupportQuery(T) && !SupportAPIGraphQueryNetworkV1(T))) // For ext version >= 1.5, pfnCreate2 api is avaible -#define NotSupportGraph2(T) \ - (std::is_same::value || std::is_same::value || \ - std::is_same::value) +#define NotSupportGraph2(T) \ + (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) // A bug inside the driver makes the "pfnGraphGetArgumentMetadata" call not safe for use prior to // "ze_graph_dditable_ext_1_6_t". 
// See: E#117498 -#define NotSupportArgumentMetadata(T) \ - (std::is_same::value || std::is_same::value || \ - std::is_same::value || std::is_same::value) +#define NotSupportArgumentMetadata(T) \ + (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ + T == ZE_GRAPH_EXT_VERSION_1_5) -#define UseCopyForNativeBinary(T) \ - (std::is_same::value || std::is_same::value || \ - std::is_same::value || std::is_same::value || \ - std::is_same::value) +#define UseCopyForNativeBinary(T) \ + (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ + T == ZE_GRAPH_EXT_VERSION_1_5 || T == ZE_GRAPH_EXT_VERSION_1_6) /** * Adapter to use CiD through ZeroAPI */ -template +template class LevelZeroCompilerInDriver final : public ICompiler { public: LevelZeroCompilerInDriver(ze_driver_handle_t driverHandle, @@ -79,7 +76,7 @@ class LevelZeroCompilerInDriver final : public ICompiler { OPENVINO_THROW("Profiling post-processing is not implemented."); } - template = true> + template = true> std::unordered_set getQueryResultFromSupportedLayers( ze_result_t result, ze_graph_query_network_handle_t& hGraphQueryNetwork) const; @@ -111,35 +108,40 @@ class LevelZeroCompilerInDriver final : public ICompiler { ze_graph_compiler_version_info_t compilerVersion) const; std::string serializeConfig(const Config& config, ze_graph_compiler_version_info_t& compilerVersion) const; - template = true> + template = true> void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, uint32_t index, std::vector& inputs, std::vector& outputs) const; - template = true> + template = true> void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, uint32_t index, std::vector& inputs, std::vector& outputs) const; - template = true> + template = true> void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, std::vector& blob, const uint8_t*& blobPtr, size_t& blobSize) const; - template = true> + template = true> void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, std::vector& /* unusedBlob */, const uint8_t*& blobPtr, size_t& blobSize) const; - template = true> + template = true> ze_result_t seriazlideIRModelAndQueryNetworkCreateV2(const std::shared_ptr& model, const Config& config, ze_device_graph_properties_t deviceGraphProperties, @@ -147,11 +149,13 @@ class LevelZeroCompilerInDriver final : public ICompiler { ze_graph_query_network_handle_t& hGraphQueryNetwork) const; // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) - template = true> + template = true> std::unordered_set queryImpl(const std::shared_ptr& model, const Config& config) const; - template = true> + template = true> ze_result_t seriazlideIRModelAndQueryNetworkCreateV1(const std::shared_ptr& model, const Config& config, ze_device_graph_properties_t deviceGraphProperties, @@ -160,23 +164,24 @@ class LevelZeroCompilerInDriver final : public ICompiler { // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, // pfnQueryNetworkGetSupportedLayers) - template = true> + template = true> std::unordered_set queryImpl(const std::shared_ptr& model, const Config& config) const; // For ext version < 1.3 - template = true> + template = true> std::unordered_set queryImpl(const std::shared_ptr& model, const Config& config) const; - template = true> + template = true> 
ze_result_t createGraph(const ze_graph_format_t& format, const SerializedIR& serializedIR, const std::string& buildFlags, const uint32_t& flags, ze_graph_handle_t* graph) const; - template = true> + template = true> ze_result_t createGraph(const ze_graph_format_t& format, const SerializedIR& serializedIR, const std::string& buildFlags, diff --git a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp index 84bca75106483d..0406b375609044 100644 --- a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp @@ -41,46 +41,46 @@ LevelZeroCompilerAdapter::LevelZeroCompilerAdapter(std::shared_ptr>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; case ZE_GRAPH_EXT_VERSION_1_4: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; case ZE_GRAPH_EXT_VERSION_1_5: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; case ZE_GRAPH_EXT_VERSION_1_6: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; case ZE_GRAPH_EXT_VERSION_1_7: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; case ZE_GRAPH_EXT_VERSION_1_8: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; default: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); + apiAdapter = std::make_shared>(driverHandle, + deviceHandle, + zeContext, + graph_ddi_table_ext); break; } diff --git a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp index 47bc7611d132c3..8f7ac4198bb0a4 100644 --- a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp +++ b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp @@ -178,7 +178,7 @@ std::string rankToLegacyLayoutString(const size_t rank) { namespace intel_npu { namespace driverCompilerAdapter { -template +template LevelZeroCompilerInDriver::LevelZeroCompilerInDriver(ze_driver_handle_t driverHandle, ze_device_handle_t deviceHandle, ze_context_handle_t zeContext, @@ -189,7 +189,7 @@ LevelZeroCompilerInDriver::LevelZeroCompilerInDriver(ze_driver_h _graphDdiTableExt(graph_ddi_table_ext), _logger("LevelZeroCompilerInDriver", Logger::global().level()) {} -template +template LevelZeroCompilerInDriver::~LevelZeroCompilerInDriver() { _logger.debug("LevelZeroCompilerInDriver obj destroyed"); } @@ -198,7 +198,7 @@ LevelZeroCompilerInDriver::~LevelZeroCompilerInDriver() { * @brief Place xml + weights in sequential memory * @details Format of the memory: */ -template +template SerializedIR 
LevelZeroCompilerInDriver::serializeIR( const std::shared_ptr& model, ze_graph_compiler_version_info_t compilerVersion) const { @@ -258,7 +258,7 @@ SerializedIR LevelZeroCompilerInDriver::serializeIR( return std::make_pair(sizeOfSerializedIR, buffer); } -template +template std::string LevelZeroCompilerInDriver::serializeIOInfo(const std::shared_ptr& model, const bool useIndices) { const ov::ParameterVector& parameters = model->get_parameters(); @@ -348,7 +348,7 @@ std::string LevelZeroCompilerInDriver::serializeIOInfo(const std outputsPrecisionSS.str() + VALUES_SEPARATOR.data() + outputsLayoutSS.str(); } -template +template void LevelZeroCompilerInDriver::release(std::shared_ptr networkDescription) { _logger.debug("performing release networkDescription"); if (networkDescription->metadata.graphHandle != nullptr) { @@ -366,8 +366,8 @@ void LevelZeroCompilerInDriver::release(std::shared_ptr -template > +template +template > void LevelZeroCompilerInDriver::getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, std::vector& blob, @@ -389,8 +389,8 @@ void LevelZeroCompilerInDriver::getNativeBinary(ze_graph_dditabl blobPtr = blob.data(); } -template -template > +template +template > void LevelZeroCompilerInDriver::getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, std::vector& /* unusedBlob */, @@ -403,7 +403,7 @@ void LevelZeroCompilerInDriver::getNativeBinary(ze_graph_dditabl _graphDdiTableExt); } -template +template CompiledNetwork LevelZeroCompilerInDriver::getCompiledNetwork( const NetworkDescription& networkDescription) { if (networkDescription.metadata.graphHandle != nullptr && networkDescription.compiledNetwork.size() == 0) { @@ -425,7 +425,7 @@ CompiledNetwork LevelZeroCompilerInDriver::getCompiledNetwork( networkDescription.compiledNetwork); } -template +template std::string LevelZeroCompilerInDriver::serializeConfig( const Config& config, ze_graph_compiler_version_info_t& compilerVersion) const { @@ -611,8 +611,8 @@ static std::unordered_set parseQueryResult(std::vector& data) } // For ext version < 1.3, query is unsupported, return empty result and add debug log here -template -template > +template +template > std::unordered_set LevelZeroCompilerInDriver::queryImpl( const std::shared_ptr& /*model*/, const Config&) const { @@ -621,8 +621,8 @@ std::unordered_set LevelZeroCompilerInDriver::query } // For ext version == 1.3 && == 1.4 -template -template > +template +template > ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndQueryNetworkCreateV1( const std::shared_ptr& model, const Config& config, @@ -652,8 +652,8 @@ ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndQuery } // For ext version == 1.3 && == 1.4, query is supported, calling querynetwork api in _graphDdiTableExt -template -template > +template +template > std::unordered_set LevelZeroCompilerInDriver::queryImpl( const std::shared_ptr& model, const Config& config) const { @@ -675,8 +675,8 @@ std::unordered_set LevelZeroCompilerInDriver::query } // For ext version >= 1.5 -template -template > +template +template > ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndQueryNetworkCreateV2( const std::shared_ptr& model, const Config& config, @@ -708,8 +708,8 @@ ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndQuery } // For ext version >= 1.5 -template -template > +template +template > std::unordered_set LevelZeroCompilerInDriver::queryImpl( const std::shared_ptr& model, const Config& config) const { @@ -730,8 
+730,8 @@ std::unordered_set LevelZeroCompilerInDriver::query return getQueryResultFromSupportedLayers(result, hGraphQueryNetwork); } -template -template > +template +template > std::unordered_set LevelZeroCompilerInDriver::getQueryResultFromSupportedLayers( ze_result_t result, ze_graph_query_network_handle_t& hGraphQueryNetwork) const { @@ -761,7 +761,7 @@ std::unordered_set LevelZeroCompilerInDriver::getQu return parseQueryResult(supportedLayers); } -template +template ov::SupportedOpsMap LevelZeroCompilerInDriver::query(const std::shared_ptr& model, const Config& config) const { _logger.debug("query start"); @@ -784,8 +784,8 @@ ov::SupportedOpsMap LevelZeroCompilerInDriver::query(const std:: } // For ext version <1.5, calling pfnCreate api in _graphDdiTableExt -template -template > +template +template > ze_result_t LevelZeroCompilerInDriver::createGraph(const ze_graph_format_t& format, const SerializedIR& serializedIR, const std::string& buildFlags, @@ -807,8 +807,8 @@ ze_result_t LevelZeroCompilerInDriver::createGraph(const ze_grap } // For ext version >= 1.5, calling pfnCreate2 api in _graphDdiTableExt -template -template > +template +template > ze_result_t LevelZeroCompilerInDriver::createGraph(const ze_graph_format_t& format, const SerializedIR& serializedIR, const std::string& buildFlags, @@ -829,7 +829,7 @@ ze_result_t LevelZeroCompilerInDriver::createGraph(const ze_grap return result; } -template +template ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndCreateGraph( const std::shared_ptr& model, const Config& config, @@ -862,7 +862,7 @@ ze_result_t LevelZeroCompilerInDriver::seriazlideIRModelAndCreat return result; } -template +template NetworkDescription LevelZeroCompilerInDriver::compile(const std::shared_ptr& model, const Config& config) const { _logger.debug("compile start"); @@ -887,7 +887,7 @@ NetworkDescription LevelZeroCompilerInDriver::compile(const std: return networkDescription; } -template +template NetworkMetadata LevelZeroCompilerInDriver::parse(const std::vector& network, const Config& config) const { OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "LevelZeroCompilerInDriver::parse", "desc"); @@ -920,7 +920,7 @@ NetworkMetadata LevelZeroCompilerInDriver::parse(const std::vect return networkMeta; } -template +template uint32_t LevelZeroCompilerInDriver::getSupportedOpsetVersion() const { _logger.debug("getSupportedOpsetVersion start"); @@ -990,8 +990,8 @@ static IODescriptor getIODescriptor(const ze_graph_argument_properties_3_t& arg, metadata.has_value() ? 
std::optional(shapeFromIRModel) : std::nullopt}; } -template -template > +template +template > void LevelZeroCompilerInDriver::getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, uint32_t index, @@ -1014,8 +1014,8 @@ void LevelZeroCompilerInDriver::getMetadata(ze_graph_dditable_ex } } -template -template > +template +template > void LevelZeroCompilerInDriver::getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, ze_graph_handle_t graphHandle, uint32_t index, @@ -1048,7 +1048,7 @@ void LevelZeroCompilerInDriver::getMetadata(ze_graph_dditable_ex } } -template +template NetworkMetadata LevelZeroCompilerInDriver::getNetworkMeta(ze_graph_handle_t graphHandle) const { ze_graph_properties_t graphProperties{}; @@ -1069,13 +1069,13 @@ NetworkMetadata LevelZeroCompilerInDriver::getNetworkMeta(ze_gra return meta; } -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; -template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; +template class LevelZeroCompilerInDriver; } // namespace driverCompilerAdapter } // namespace intel_npu diff --git a/src/plugins/intel_npu/thirdparty/level-zero-ext b/src/plugins/intel_npu/thirdparty/level-zero-ext index cdb761dd63b1d4..a6487cc2c5da9a 160000 --- a/src/plugins/intel_npu/thirdparty/level-zero-ext +++ b/src/plugins/intel_npu/thirdparty/level-zero-ext @@ -1 +1 @@ -Subproject commit cdb761dd63b1d47230d501e631a2d725db09ba0d +Subproject commit a6487cc2c5da9aa13db9e005a320a1b6a0ee5919 From 2db556dacf5600697647f3f00a13e4c6f4f3133b Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Mon, 28 Oct 2024 14:19:47 +0100 Subject: [PATCH 016/120] [DOCS] supported llm data update mstr (#27271) port: https://github.com/openvinotoolkit/openvino/pull/27254 --- .../generative-ai-performance.rst | 11 +- .../_static/benchmarks_files/llm_models.csv | 168 +++++++++++++++--- .../llm_models_platform_list_.pdf | Bin 0 -> 27518 bytes 3 files changed, 152 insertions(+), 27 deletions(-) create mode 100644 docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index 39b27d12c970fd..d0a04f16ceb6bd 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -3,9 +3,11 @@ Most Efficient Large Language Models for AI PC This page is regularly updated to help you identify the best-performing LLMs on the Intel® Core™ Ultra processor family and AI PCs. +The current data is as of OpenVINO 2024.4, 24 Oct. 2024 -The tables below list key performance indicators for a selection of Large Language Models, -running on an Intel® Core™ Ultra 7-165H based system, on built-in GPUs. 
+The tables below list the key performance indicators for a selection of Large Language Models, +running on an Intel® Core™ Ultra 7-165H, Intel® Core™ Ultra 7-265V, and Intel® Core™ Ultra +7-288V based system, on built-in GPUs. @@ -34,18 +36,17 @@ running on an Intel® Core™ Ultra 7-165H based system, on built-in GPUs. All models listed here were tested with the following parameters: * Framework: PyTorch - * Model precision: INT4 * Beam: 1 * Batch size: 1 .. grid-item:: - .. button-link:: https://docs.openvino.ai/2024/_static/benchmarks_files/OV-2024.4-platform_list.pdf + .. button-link:: https://docs.openvino.ai/2024/_static/benchmarks_files/llm_models_platform_list_.pdf :color: primary :outline: :expand: - :material-regular:`download;1.5em` Get full system info [PDF] + :material-regular:`download;1.5em` Get system descriptions [PDF] .. button-link:: ../../_static/benchmarks_files/llm_models.csv :color: primary diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models.csv index dee8e72a9578fd..b16312fa09457c 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models.csv @@ -1,22 +1,146 @@ -Model name,"Throughput: (tokens/sec. 2nd token)",1st token latency (msec),Max RSS memory used. (MB),Input tokens,Output tokens -OPT-2.7b,"20.2",2757,7084,937,128 -Phi-3-mini-4k-instruct,"19.9",2776,7028,1062,128 -Orca-mini-3b,"19.2",2966,7032,1024,128 -Phi-2,"17.8",2162,7032,1024,128 -Stable-Zephyr-3b-dpo,"17.0",1791,7007,946,128 -ChatGLM3-6b,"16.5",3569,6741,1024,128 -Dolly-v2-3b,"15.8",6891,6731,1024,128 -Stablelm-3b-4e1t,"15.7",2051,7018,1024,128 -Red-Pajama-Incite-Chat-3b-V1,"14.8",6582,7028,1020,128 -Falcon-7b-instruct,"14.5",4552,7033,1049,128 -Codegen25-7b,"13.3",3982,6732,1024,128 -GPT-j-6b,"13.2",7213,6882,1024,128 -Stablelm-7b,"12.8",6339,7013,1020,128 -Llama-3-8b,"12.8",4356,6953,1024,128 -Llama-2-7b-chat,"12.3",4205,6906,1024,128 -Llama-7b,"11.7",4315,6927,1024,128 -Mistral-7b-v0.1,"10.5",4462,7242,1007,128 -Zephyr-7b-beta,"10.5",4500,7039,1024,128 -Qwen1.5-7b-chat,"9.9",4318,7034,1024,128 -Baichuan2-7b-chat,"9.8",4668,6724,1024,128 -Qwen-7b-chat,"9.0",5141,6996,1024,128 \ No newline at end of file +Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec +opt-125m-gptq,INT4-MIXED,1024,1610.2,146,9.4,106.38 +opt-125m-gptq,INT4-MIXED,32,1087.6,60.8,9.5,105.26 +tiny-llama-1.1b-chat,INT4-MIXED,32,1977,85.7,20.2,49.50 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1940.8,367.7,20.3,49.26 +tiny-llama-1.1b-chat,INT8-CW,32,1855.2,70.2,21.8,45.87 +qwen2-0.5b,INT4-MIXED,1024,3029.3,226.4,22.3,44.84 +qwen2-0.5b,INT8-CW,1024,3093,222,22.3,44.84 +qwen2-0.5b,FP16,1024,2509.5,234.3,22.4,44.64 +qwen2-0.5b,FP16,32,1933.8,146.4,22.4,44.64 +tiny-llama-1.1b-chat,INT8-CW,1024,2288.3,368.6,22.9,43.67 +qwen2-0.5b,INT4-MIXED,32,2670.9,115.1,23,43.48 +qwen2-0.5b,INT8-CW,32,2530,157.9,24.3,41.15 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2677.3,186.1,27.9,35.84 +qwen2-1.5b,INT4-MIXED,32,4515.1,179.8,28.7,34.84 +qwen2-1.5b,INT4-MIXED,1024,4927.5,254.3,29.1,34.36 +dolly-v2-3b,INT4-MIXED,32,2420.9,245.6,30.8,32.47 +qwen2-1.5b,INT8-CW,32,4824.9,165.1,31.2,32.05 +phi-2,INT4-MIXED,32,2523.5,233.9,31.5,31.75 +qwen2-1.5b,INT8-CW,1024,5401.8,331.1,32,31.25 +stable-zephyr-3b-dpo,INT4-MIXED,30,2816.2,151.3,32.9,30.40 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2646.7,860.6,33,30.30 +opt-2.7b,INT4-MIXED,31,2814.5,174.7,33.1,30.21 
+phi-2,INT4-MIXED,32,2363.6,236.6,34,29.41 +stablelm-3b-4e1t,INT4-MIXED,32,3079.1,220,34,29.41 +minicpm-1b-sft,INT4-MIXED,31,2971,185.1,34.1,29.33 +minicpm-1b-sft,INT8-CW,31,3103.6,233.5,34.3,29.15 +dolly-v2-3b,INT4-MIXED,1024,2152.3,876.6,34.7,28.82 +phi-3-mini-4k-instruct,INT4-MIXED,38,2951,155.4,35.9,27.86 +phi-2,INT4-MIXED,1024,2689.9,971.7,36.5,27.40 +stablelm-3b-4e1t,INT4-MIXED,1024,3335.9,519.3,37.3,26.81 +opt-2.7b,INT4-MIXED,937,3227.5,639.5,37.7,26.53 +phi-3-mini-4k-instruct,INT4-MIXED,38,3289.7,161,37.9,26.39 +gemma-2b-it,INT4-MIXED,32,4099.6,258.6,38,26.32 +tiny-llama-1.1b-chat,FP16,32,3098.7,143.9,38.2,26.18 +stable-zephyr-3b-dpo,INT4-MIXED,946,3548.5,453.9,38.8,25.77 +tiny-llama-1.1b-chat,FP16,1024,3388.6,523,39,25.64 +phi-2,INT4-MIXED,1024,2594.7,964.2,39.1,25.58 +minicpm-1b-sft,FP16,31,3597.7,164.8,39.8,25.13 +gemma-2b-it,INT4-MIXED,1024,5059.1,669.1,40.5,24.69 +phi-3-mini-4k-instruct,INT4-MIXED,1061,3431.8,840.1,40.6,24.63 +phi-3-mini-4k-instruct,INT4-MIXED,1061,3555.6,836.3,41.8,23.92 +qwen2-1.5b,FP16,32,3979.4,111.8,42.5,23.53 +red-pajama-incite-chat-3b-v1,INT8-CW,32,3639.9,199.1,43.6,22.94 +qwen2-1.5b,FP16,1024,4569.8,250.5,44.1,22.68 +dolly-v2-3b,INT8-CW,32,3727,248.2,44.5,22.47 +opt-2.7b,INT8-CW,31,3746.3,175.6,44.6,22.42 +stablelm-3b-4e1t,INT8-CW,32,3651.3,178,45.4,22.03 +chatglm3-6b,INT4-MIXED,32,4050.3,88.1,47.4,21.10 +phi-2,INT8-CW,32,3608.7,232,48.3,20.70 +red-pajama-incite-chat-3b-v1,INT8-CW,1020,2951,816.6,48.4,20.66 +stablelm-3b-4e1t,INT8-CW,1024,4142.8,658.7,48.5,20.62 +opt-2.7b,INT8-CW,937,4019,640.7,48.8,20.49 +stable-zephyr-3b-dpo,INT8-CW,30,3264.5,150.7,48.8,20.49 +gemma-2b-it,INT8-CW,32,4874.7,249.4,48.9,20.45 +chatglm3-6b,INT4-MIXED,32,3902.1,84.9,49.5,20.20 +dolly-v2-3b,INT8-CW,1024,2931.4,865.2,49.7,20.12 +gemma-2b-it,INT8-CW,1024,5834,545.4,50.7,19.72 +vicuna-7b-v1.5,INT4-MIXED,32,4560.3,119.4,50.7,19.72 +chatglm3-6b,INT4-MIXED,1024,4070.1,895.9,50.9,19.65 +chatglm3-6b,INT4-MIXED,1024,3832.1,854.4,52,19.23 +orca-mini-3b,INT4-MIXED,32,2345.5,132.8,52.2,19.16 +phi-2,INT8-CW,1024,3511.6,989.7,53.1,18.83 +chatglm2-6b,INT4-MIXED,32,4960.2,91.5,54.2,18.45 +qwen1.5-7b-chat,INT4-MIXED,32,5936.5,195.7,54.8,18.25 +stable-zephyr-3b-dpo,INT8-CW,946,3700.5,677.9,54.8,18.25 +llama-2-7b-chat-hf,INT4-MIXED,32,4010.5,113.7,55.6,17.99 +qwen-7b-chat,INT4-MIXED,32,7393,132.7,56.1,17.83 +chatglm2-6b,INT4-MIXED,1024,5234.5,747.3,56.2,17.79 +qwen2-7b,INT4-MIXED,32,7086.2,183,56.3,17.76 +phi-3-mini-4k-instruct,INT8-CW,38,4574.4,132.9,56.9,17.57 +llama-2-7b-gptq,INT4-MIXED,32,4134.1,120,58,17.24 +chatglm3-6b-gptq,INT4-MIXED,32,4288.1,99.4,58.1,17.21 +qwen2-7b,INT4-MIXED,1024,7716.4,734.9,58.3,17.15 +mistral-7b-v0.1,INT4-MIXED,31,4509.3,115,58.6,17.06 +codegen25-7b,INT4-MIXED,32,4211.8,136.5,59,16.95 +qwen1.5-7b-chat,INT4-MIXED,1024,7007.2,792.7,60.6,16.50 +chatglm3-6b-gptq,INT4-MIXED,1024,4545.4,860.3,60.9,16.42 +phi-3-mini-4k-instruct,INT8-CW,1061,5087.2,1029.5,60.9,16.42 +gpt-j-6b,INT4-MIXED,32,4013.5,316.1,61.1,16.37 +mistral-7b-v0.1,INT4-MIXED,1007,876.5,984.4,61.7,16.21 +llama-3-8b,INT4-MIXED,32,4357.1,132.8,62,16.13 +llama-2-7b-chat-hf,INT4-MIXED,1024,3564.8,1163.7,62.5,16.00 +qwen-7b-chat-gptq,INT4-MIXED,32,7384.1,217.8,62.9,15.90 +zephyr-7b-beta,INT4-MIXED,32,5331.6,125,62.9,15.90 +qwen-7b-chat,INT4-MIXED,32,6545.8,218.7,63,15.87 +llama-3.1-8b,INT4-MIXED,31,5076.3,110.4,63.4,15.77 +llama-3.1-8b,INT4-MIXED,31,4419,145.6,63.5,15.75 +llama-2-7b-gptq,INT4-MIXED,1024,3434.2,921.6,64.4,15.53 +llama-3-8b,INT4-MIXED,32,4886.7,132.3,65.4,15.29 
+stablelm-7b,INT4-MIXED,32,4768.4,132.1,65.5,15.27
+codegen25-7b,INT4-MIXED,1024,1429.7,967.5,65.7,15.22
+zephyr-7b-beta,INT4-MIXED,1024,5575.6,837.2,65.7,15.22
+llama-3-8b,INT4-MIXED,32,4888.3,161.8,66.2,15.11
+mistral-7b-v0.1,INT4-MIXED,31,4401.4,142.7,66.2,15.11
+llama-3-8b,INT4-MIXED,1024,3782.4,1091.5,66.8,14.97
+llama-3.1-8b,INT4-MIXED,31,4781.4,159.4,67,14.93
+glm-4-9b,INT4-MIXED,33,6392.6,298.7,67.2,14.88
+qwen-7b-chat,INT4-MIXED,1024,8472.8,1331.2,67.4,14.84
+gpt-j-6b,INT4-MIXED,1024,1237.8,1638.8,68.1,14.68
+llama-2-7b-chat-hf,INT4-MIXED,32,4497.4,153.2,68.7,14.56
+llama-3-8b,INT4-MIXED,1024,4526.9,1060.3,69.8,14.33
+mistral-7b-v0.1,INT4-MIXED,1007,3968.7,1033.1,69.9,14.31
+llama-3-8b,INT4-MIXED,1024,4297.9,1041.7,70,14.29
+orca-mini-3b,INT8-CW,32,3744.3,174,70.5,14.18
+stablelm-7b,INT4-MIXED,1020,4402.1,1186.4,70.5,14.18
+gemma-2b-it,FP16,32,5806.3,117.6,71.8,13.93
+glm-4-9b,INT4-MIXED,1025,7003.5,1354.2,72.5,13.79
+gemma-2b-it,FP16,1024,6804.7,490.6,73.4,13.62
+stablelm-3b-4e1t,FP16,32,6217,207.5,75.2,13.30
+llama-2-7b-chat-hf,INT4-MIXED,1024,4320.9,1247.7,75.8,13.19
+gemma-7b-it,INT4-MIXED,32,8050.6,134.6,76.1,13.14
+gemma-7b-it,INT4-MIXED,32,7992.6,146.4,76.1,13.14
+qwen-7b-chat,INT4-MIXED,1024,5712.7,1144.4,77.1,12.97
+stablelm-3b-4e1t,FP16,1024,6722.9,491.4,77.7,12.87
+chatglm2-6b,INT8-CW,32,6856.2,111.6,78.9,12.67
+opt-2.7b,FP16,31,5377.5,138,79.6,12.56
+chatglm2-6b,INT8-CW,1024,7133.8,1012.1,81,12.35
+red-pajama-incite-chat-3b-v1,FP16,32,5672.5,211,81.2,12.32
+gemma-7b-it,INT4-MIXED,1024,9399.5,1726.7,82.2,12.17
+dolly-v2-3b,FP16,32,5573,230.6,82.5,12.12
+gemma-7b-it,INT4-MIXED,1024,9460,1241.2,82.7,12.09
+opt-2.7b,FP16,937,4727.8,618.8,84.6,11.82
+baichuan2-7b-chat,INT4-MIXED,32,5782.4,274.1,84.8,11.79
+phi-2,FP16,32,5497.3,244.9,85,11.76
+stable-zephyr-3b-dpo,FP16,30,5714.8,173.1,86,11.63
+red-pajama-incite-chat-3b-v1,FP16,1020,5262.2,817.4,86.2,11.60
+dolly-v2-3b,FP16,1024,2376.1,935.5,87,11.49
+qwen-7b-chat,INT4-MIXED,32,8597.4,226.2,87.7,11.40
+phi-2,FP16,1024,4063.9,969.8,89.7,11.15
+chatglm3-6b,INT8-CW,32,6158.8,123.4,89.8,11.14
+stable-zephyr-3b-dpo,FP16,946,5337.1,781.4,90.5,11.05
+baichuan2-7b-chat,INT4-MIXED,1024,807.4,1725.7,91.8,10.89
+vicuna-7b-v1.5,INT8-CW,32,7391,171.3,92.5,10.81
+chatglm3-6b,INT8-CW,1024,550.7,1210.9,93.3,10.72
+phi-3-mini-4k-instruct,FP16,38,8299.3,142,94.1,10.63
+qwen2-7b,INT8-CW,32,9941.1,139.1,94.9,10.54
+qwen-7b-chat-gptq,INT4-MIXED,1024,6545,1103.9,95.8,10.44
+qwen2-7b,INT8-CW,1024,10575.1,1183,96.7,10.34
+qwen-7b-chat,INT4-MIXED,1024,6777.4,1309.6,96.9,10.32
+vicuna-7b-v1.5,INT8-CW,1024,8013.7,1154.6,96.9,10.32
+phi-3-medium-4k-instruct,INT4-MIXED,38,8212.8,448.3,97,10.31
+zephyr-7b-beta,INT8-CW,32,7888,144.8,97.4,10.27
+phi-3-mini-4k-instruct,FP16,1061,8814.8,1195.7,98.7,10.13
+zephyr-7b-beta,INT8-CW,1024,8136.7,1191.6,99.4,10.06
+llama-2-13b-chat-hf,INT4-MIXED,32,6927.5,165.3,99.9,10.01
diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf b/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bedd9c28286476ad46ff3ec0db7669717404115e
GIT binary patch
literal 27518
[27518 bytes of base85-encoded PDF payload omitted: the binary blob is not reproducible as text]
zn4X=T9?GU*Z~8xQC^^}ix|%?o815fn6ksf2OkkW~>|vatml=!;OeG8eMiRyYS_^@( z`BgH5wm3m+|1&QDh7*PdhWod2P{seh>+(+vJsRylSr)p~(6w;+$6BjEob6qmpeqAi zOi2$HX;l}fnV__^Di7eVHI!E61ModsAG*L&_I57NMkx-!uT^A|0s%N4UFxw7Y913+ zh>H#z)aTgLARaDkvbIpC68q~W{?|?RaS7Q(?d@oOMG==wlREsezNJ#?V0>;Nzy2*ASug4T;eRf0~(8SqPtP1?!c)d8yFFX#SM zU{hsNbAtMX!{f9~JlVum*(4xtmL?DtX)&lSYDUJ+P?o|@fd_L$uqT>AZpW8ZJib4+o4{5pix_a>;Iv)-ZKaRqVsndX;bDKt*v-?e z-4K}#+)=P9y1XLIogLX(h$%o+oVf;yBmFQW+Pnuy;HhlQ;?li7utP3c-BA5Ph82^g zA0wTBdPSc;bHX&?U^2VQ?LdX6CT;QVwIFG%UnZHK@RId2;cEM0ki1>7nUg1nYhr6z zmTo63LFNY5g=tI4`r7lWA-nH|*9OH8h9xU8(k~Tfyz|{Q!^gk7U^vowes?NA+jMCm ze8vW2i`AcT23%X{>g(R|3{*c3Q}_OoVHSpu8f$Q%@H0V~P6#V!22rY(M#`E6t6_); zB-KGxKo=$FxRDw)raAd6N<{`URu}Q;?nc@VohvdYkKFjS7wPt?kI9-gzn_kX>vJ*n zac9Bci2Av6+ThzT|NEiUm+yCGW^ZG#zT3~dI?*Yz*L296i$|aT1b}dJQO#4Qd@u@J zx{n}VUKZW=SaIG&yyp);5|Ide*iG~Y85T4li$I#yu(;-CL9^(cQFybpL@MbQS_FF& z$E89Q8oJ#6t2cEmYx&m)=fpC;5lGgi3t@Cc+Zr6qbXa&`ORYf3grPiKDNzfj*X+z# zmmL#=;6VCJW0lEq%1VMO(ha;shlE3}2;brA8E5!62gE6NC7XD|1bdm|3`lO$ie;N> z%tT{~og6G1R?dAEKa_Teyf)7$`#BOU(_k!woaNDL!_B*l@rjy-_||7nSBhVKN*n(| z^Zb+k59e)?mNTAW=VQ4s+Vj#k8gcmI6dzOG9-^`%X}lcu&s`kY9_M!~<}xDhH$6x;){&pgo8=EZ*^)x2)_rbjy56dG; zmY|q-hthkZCIcq(S`B+c?#H!VY*yFC(+XpCj)W>Ooauc-+bt6EZ{Eddb%k|f;&h}R zX`?fJCkU%Py^l!di(Qg*E1*o?yKQEoQq_AWJ{`R&CxpqF+{kYhscO9Dm{xXg8D6ae zHj7PEN9rdPy8|Bt=;^e}YVYIU&f*W7OCpY|;qCJ#BLUChbD7k$UQ9KTRk==p^MYmH z)L^PU=bgYbq>dKh`Wmg+Qbrdq#3CguKWnB*bh+%dk#Qf6XkJ<~&1aIlBlR#cq9mA1 zpd+S`Z?k4a*OW(7n}*fvxZ}~PysI~fHzRvCTK}xK`DjaU9L{YSigSR3ue}pU? z6^p(_GbAq_M;fJE6^P*s3hg68SCgyIQT9=VE40!?E=^un!Gdkze9uwjgEsmR5{XZW zxK)U5qbvS&my7;>CjRBXQ}{gDo$KA7-GONrggE(&8>qJ+;)T8!lKh{NG_p!7r>DYB zFbU-zu&jb*nl5-oF6{;15U!2~L^^)TcEjb&kL1+#Xk&oI)viQT<6j->geg3-s+?;< z3!k3GU3VQDMM4o?qk0-xL09)|^~d=UYWU=y)at7Ht* z4xEfUBKd{M7HB)q7P0j#ufJd`zwaGmcl)5gjfB%Chp~@WOjA!JF<*x$o|oUotGhQ% zU<^Mzy`d}D*y|vaF{%^{mNjC*c`GM`_&WSE#>uB#FCP+fY?B>t{^Wd}5t}{-ZRO`C zYA_{UO2{!-1WwC+@Fo1p$aeL8!IvkUWj-Q87F!qls}!kRb@BWBlP96px|3IyZ-Mzs zVt5~W&(`jEo+-kIgC-SheFke~pdXol{Iu z2*C$b{Mk~BsSk`x4$p(9hZ}k`KigzXXq79wM$KDiPX1(xi$PN-r#~&@cPej9Oz42c@db}> zV~SM6Nx(Nq5%RQbX5H)vd>d%vBiD~=WNpf4^AW9I^srWJxRB0~+)>K`>>XsY!hI>} zw+DAYMcVzdl5j3FzM=j%I2Il{#z6Ya*i39^uESQo6*AZbTbNnb4Hf!ZkCcnbsui3u zdlFis*)UJNmmv&C_#!%s`7K5ZSBY;<401rVCbaGGxkE_6Oj@#fyKlM8QwuF?o8_eG zS-?d=vup+FG!O69!t|44jm7>0fmI|=@m_>QRV=X08ob@TN?IgW8(5;ysKafy!FUA( z^FZGNmWqm`5V7+URO~x(P3)}vT%cJdZ0LDO5YdUIr0?P`QWqZ_gUJ|Hk9mxhq9Qt~ z#-iFx4tv7wokjO{eB7ClQ?)GD9BnXoSUzwkFs<8GN{ji$V--`%ZUUpZac&bh8O^KM zIDId}sW&OwJLiab$w;Mv#R6DFUaCZ!a=QFsU9(~powzyPW3d)?1MMLfb#bi1iPTnf zG%UgK`5C;1#OqrF5Hqxl7MAW6M`x!$q^%-qhZx!WnxzRmyu|_?tmq5jd`n5Y70TJf z<|qUH8Ah{@ZNBy7yAX#7NjrLq%ORpcS5#?fdrnShH<);+nxNo!QG(eGx#lQH`1;9r zxNy%!%}Rm&Pb|Y^8`LHo%XQ2`pmW9E#Ct^$R`(hu?J`Vlj1xgJpOLsboH2{k3f(K@ z{dW7=q{Ay1r`#bWoiWZr(QcbB*fG~$u9F037;EoDYYZ8R3!AOW0N`LrvCDlK9C3xr zH-i}Uh=Ss6?u44*`Ne@(Ke5*Rz*@f$%hT7L(NzhcmT2Gwk03T%q@PPRri zY$i~I^k;nwOkCOjM!+-Vs3mE)T@3%_O2-)2EaR6B;zO*zEwKKPY0NB|?olPFG z2Om2(GzkBT27)-)p)k}z24ZP$;Q|1Ic-chV%zvo@;Ns$dLQIdp3LqXH?q36G{>90~ z&d#Rw4=E5gJ2VhCdNcxnlM@=vLu@sm(!bcnexU{+R2uL&768P~4h0Y(-oK;CM~L&c z8UDM46Bqsa!u|K1np=3WTDaKSFh0VzKQYpOu)}}K zEBtpJ4EmP`i-}7}N=h<|LfzAu2E?vt*vy*c^*aBp7FAMVWuZThEsb93<$N~C)<*}f@8RE~}$iFcJHxCCZ2=v-T%fAoP3A+!NtzY3I_ilH^l#@t-<{_L;M+z{2N1XbAwsI zT)z#$4Tib`@Ba-${O{NrJbxMj`p^15!~`#mtyEoX*%YB^NoZ{G7)L>~mOMOA^1niU3!1<70AlM8h<+cbzSd!jEXZXJxXf+X>${{Lw-w~ z9>bS6oB5+7nuN@(T&bXDCB0|I)XkY0k&(=PjvP%YXh&|z$BLC2QPGuIW=<7|^02dF ztOU}VZ7WdCZ_n)~OR zkQ2l^?jZw0r(7{pqlQ7-$Vq;#72EV5e*}@!&Ndt%|uQs>BL?SF3&^__9WQ`z-eq~ 
z;i?2|NQOziLVFn|L91wxsgUg?w;I!!q#87aU5GS4TlY+=-CsoTz;n!bTgi(|~Mb1<QXSoknfc3_KWc<|UIdtlICz4OepNV7Qqc=JoZG+2$ z%5(cY)HbZ%31AJ-OI*O6D>>7{=O^E{wy!3ZygEF8c72*SPqn+gzqRk+!?FZz zp4Z+Qw7Z@eT(TeR$1#a7e;oUqi{PM5ih_&5;*Xj_X^e=@tQ&+Ms`aHiBGqsUhd@OV zNxzIXyPHuS9o0C;%Cny3@Wh+6R>TFc!`sfKT;H0rAN=(Kt~6!QHs zOW1+5VZP;CpK3mqiPzYAk{i$Q*XqKggoq`|=TWIonHh=~$rg{tF8XE7vY8mf9-1UkfjA-Y;APlEJSiLCfgKdD|57|3$7BnY8G1(SfLZnr zneB1Oa+Ph13?bdtnlf;q+!lrn1MeIGnflT*LkI)!C%+oyDO)6-s5x5r5i0zeH%J^x z+)KLcAIp>JW{a_As*KTMH0vKn%6z0AS@ANheo=55p&S{eoqUC24PyYL^5P8iG|l=N zuZ6QUI1PoalX8ub&X6G0N2+D`pOix9JiA#U25+cZKLJUvh;4%ztD*kXfKjTnSoCbOCBlLj6W$8o-YB;GH-11L@_4KHjc6&txPG4wY5<@%d6Os37ae^STeZu!BD6U)2b%=ReK_Vp*#xn~dA(KdEc46Rhg)m=jf``+Ry zD9HUa)NhLQkWJ3x`C(R!UGY(1^AVk%dchJ!0k5B3_(gU|Wx49d56|dS6B8-}IX`ku z7n(wQ;&zB|x9m{vUa&lNap82u8ggaZEq{C9MqG5_`~t+yxLeNuiPT!9hFf-~lUSd^ zT7`$3ai4_uQ)}G=!`fk;JAX@h3-WeLs;m02+ilbj!=>j-w+q7euJzZ4OO%TG&Plti z84vACKg;057z9X6>`S9sr<~1^f-Za0*&Vj?LO9H9+9=AZ2u7f`fz7~Xve6`oA8rg6N}u>sXQG&8 zOh2yA%Xt+Wec@%Cuj_o)I7F@IVp()S@K$Y?S}I=P z>gwKNkuvNCH2Y3hzhtl}dQU z8;8d_WSM?~yg!ZoPq!#|1gUYPD8L3^rI643vWNZ&sv$+d~su>X3j&BFtRW}ZO5u1A^&`t%$O?ewwkFM-F} z$02y3L;m7~wm#N9W~`u(ynoC5gXZIUgt7ne^G_9_d9L5|pQ?kP($E%Y>m&6z(r>Qc z_s0qUYX3dqUl}vdue!(j$Ls&h2h0hDng9B(2nXLkaXa+u1H6xRdHis*|EB(`=E5<+YLD$mhYhE4llz~G)sqWI6 zN4mqdhv(V0q7F8dAk^K6VsUrYgQ4CxLzee-ZZct)zst3J00 zF3ZG*#)=6A=i4T4`}V_z1)GVbhQ`K2O>p$rd(Ishl=Ww}jcs$w=fqjgX7|@S{-eU2 z>w)ZI4CEAKAp3nF(Ej14`8~6khpt01{YaHwqy6OV%nq4Z9FF(jX$mN#Ym^Ak}`Ro9bJ~i zSZatrz^tO1#N=w7&)(|?yWKN)62rSUF@L24&!J-aAsAle~YHOgeQ6W;YF?AvECs7^ydQ$ z>^a}RasAoir`K`sY@ka*5_0R=1nya4?O|~McP*j0(+7U&xY<9RYEqXmjiCZ+#*j3D z>H@n5l-XtiNaj}Z@O5$ba;vGidVNTy0x@`4iR)FgK=ZIQ+)m6&=wzgg_vtNXoj(Z{ zU!(7S5S9cUWZ(*JE2|-1!ZtC>>V>Ee9)g;f@5w$sEbLbSvD(+FsHV6#eXxBm?ZvO{ z{W;k3e&*C+_(w+eKKxSk;S;*>B`Q1J3TuM>H9^UiDM z4d-3V^g|Pt++nPBp!0nn>OlLKdn{ITD(BmumrLHG!}KvLE(hPDrR{iqzO9}GmiSMvoY? z5+19?82e%ac?h~zUzuGr3XfmY@4?>?Be%rP>IH&q$KZiT+(Sg)zZ?;B3eB}TujV5e@0zZ3qGOnmR6I#Z z#~%YbD8rPfGKl9Q$dJHU=3-)Gdj={i`y5Jgib-3rNyFCJ!UGxP@n0L6wsy*}NB5S- zfk5IP7g!xsV);UbVdd$%Vq|6|K@DhLT!lNh#)d~@io0;kf#ySBK59c0LGRMlkbFc% z*kCqsSDzW3<&%^cA7YFh;U1bsyYF?chui89qt#z~oj2`U3?aDyq!!SvNd7N6&D2EWFLyS-o!*-UgT$NLd zJ+2@-a_rX&QF^c0Wy3jw9hzj*IGo?)E6^T36CAp&UW8a!)zC&+5Om|zTGYZx)xM$b zkw-AB6~#L+geVcR%vcBPjbxe1uZ#p%&)Zd#c6OUNYa!Iv2My;x(Xp9aXQz-5r~7oL z>z0uCRD9bf&=!G$5P{2v8{-8G=s97x6d{}x3^R_I1^vM{(h2vbr7;_145&M!pyQMf z9uP2^DiIEo`s1O}15j60q+PtawB$S_3FrV;l2JAsr0t}gANo9vB?=uv1Y(b1a^ zVe$Z52~_6Juy5?&sC18pL~ujI`1%x9b<_7Ar~<&(8owZc~(mQ`w8)r zeE3Y6E7bhgF}ARdWv(dPwzZ=6uz1A*^SBtubH-@`Hb!Be`&rX$eMF^rIawxysjLr! 
zf(P;E8+44cvAM(|Mf8ATv`=a3;!_Y&WW!_;5pX_FyfiW$;;nt5&5oQ$S*0wdp`DJo-o`?|lI@Vu zID5cY-HQ}SU-@M`S^e;`Aiv48L^W(>{D1-)vVMRob2z`VD2yxiE2yJ7(jn$zzDH;d zWin1C)eU^TM+*#vlg+7C#(uIntsmAtkH|M}1!(*vHv5i&1{9Ql`64*H8L#m4RdF1j zY!2QJ%K|3apu}BiS;#Zr1i3q(p26u>lm(72PF~m49O88!wX>6dd~~*;QH=b`8m-b# ztQ!!8mcB5p;7R>frw+N>`=f_UKd(eBDD%zeFVkBm&g+$XrS#|MztcCY(mU&3b1+Mewrq+eI zF^c)^S%9)K`+ylI0us=hc-aE7Pvg$rM8`R_&ML&Ug%M|OHnIfhz>J@y^6oO(%PaYq z-rr25m6_U0C0I~?k$O`eYxYSF(<*gwYD6Y!p-2FBOiYJYUS(mZk_#k0^@bQ`6yd9t zNI!|TT=lRhc2Q)thBAt6w~H#SBW)vop-3o_gF&L$b3}e_z_9fzRCFOiqu0VlQARJM zV44%r3wcvpx4x>lD3Sn`tf?CDm0tA&M+@;hQ_2ZVebYZ~P842eh46CJjFGP6$yy`5 zGr}6O^jy_qu;|pSpUi#(V5;k>=hV_>p!tpv$XNO$RMq%8r|==N8dlQ9IhI{Q^@k?^8r0JAJYPX^m$qRtC>>E?K%OOmbVBdYbK zf}};|0zi{t&QzFDBO-cHB#Sy+mj?bgEn&!$*cW(~6#0E=bk(VQRIMhwutZ#N`mxh5 zLaAQ+j0A88FcWE2Reb1dUKcxS6NpSIC(_znJ3>bfdIsOpE%uCzi@LAOb4v*RO&3!o zL(+#Zq&_1jIS&ah?x#oJ5NYC`YX9K31@Uct!qmDX7B1HTY*4+Bydb*R4`-xac%oXZ zZNVjhV1%MV4Wo0)vc}F*Vg%18YfBlIU5HM&6Q2b;_o=^WUVyh?n0v~TcCi=zE8%Is zBjboI|L0+>MHxCW6T%OAiKO8m0ftW`KBZTj>ivL|9W+SsTHbd>Udlb1K@<~hOpD?z zZ3e`+O(AMjko0Aeq-*PpglntZ&z8J#&X%t6Qo9ny^4c`keWZwj7IHe}rHh2Y7C#fU znZ|OxnO3IqeQok7*DAkPGxc&~w#-)3t^|S=3$O5|Y_*A}T=Aq_58p`XEkr0UZR4l9 zRxszcqy&{#Hm_RhB~q1J55?}0oTa*+-;|EL_kYzW+w^{cLBf?Zn6u?vtgOZ zp#0reN!NWBX_u+NyOwf@vfftBU`xGcsw=*yvYzMiL<={>FRRSsb!Fjfl}?*gVHJ3qY*FjC-7m8OYPp7sZ(ysFZi}PJ{>T~t4UkvNW z3WiP+?o30TyPHj!UYJi>+;9R(URXu9o3;G`g zLY+jC69K+(d`7(p0)2VDPiv^zl-^3|7~R1X0pN;Bz32mOsN}*|FGU6s+xh~`gvaXO zCu3W_V%DLve=b@vGtu*TZmi5dAVya@`O(ESrrL$BBj|8`w#5YkvcijW^>iAo8k%WvMz9AHZ8tR(r>XksQO=bd8V?nt$ zVqaR!bk3e73U$19af`a1o3i_)2}HJz>3J?|dmG`!#*ZrK`?6P1otT<0}!r7!Q%~g!t*qw?k!kXb3eu$bge!qvMmwV`NrgJw`LErx(d+V-8;y4ZSICPtiPk=TctkLCOY zaV-wYJbtb#JwIQ^db+KspJ82=4U9}4^VifEdX)%sWec4w*RtKY_&6}%f4!|*{Mxed zEnDNxIAz4q z$@<3s6<;HX9Nd8X20hlsSZ+GT#<4~7!xa2*Mr-%Pvxnt(9jBA^Ui>pF!*;us)?$+{ zI}CHSq8Sjka*dr8y~d5*>~YvX2soX6yIRuz# z7Z8{njTPg48c~Aa_X+Jk+mp?M-M&x%j6J+rx0N6JQkA&758Os+@ppN8_tBR39Zg5~!uZ0Vk14)eJb+gAgQOD7-sUz43 zIy>WH1%koRCdwC1%T*F(Kaxhu7=ggy(*F0dXGYIZ!3^o#E-LC8ECZDcCB~#oTLV7V z$!A{*efI|nE?0c6&S)C#4uxFKq-Hih^kov32M2yQ!G6}O~>DT-Lsn8%A=ak%I5UmfuFSd?2sEc z#xYpJ&2^wEt3=HYGfy!xE{?0;$lDz=QJ*o5s2OvwLBt4{ln6_eTOuxZSg>23ama^H z*7VA6&C9yX0xvz<5HDX;#?Y0mokb4qWgM%RnI18@9MU+?cHWxJ{qdZ9eCdm&~q%ikeb z787V355VuiaLE~`hQRd;C)&5zgCCfR)Zci97#Ss$?Y?3fuFkx=!!qRma1U1XRv)G4 zxFK+#SaQEJf}7q}tp55AyUG*$1NZ7if{nVJWeO*{W|3*pcQHGh4lvr293S4eT!US{TA7ktl^ zL4!G_p!Zf+sNch4W!_FzCbZC;r>bM`55&9qaj4U#u%N z!cHNv4>8|aDR#Maylk?8Vcqehsq^+zm~R{N%N~8>MtNu-d;wEIv0O_b z?F3y>5Q4CtNLba=r|f#dNjIN(K6Xs#^Q@jNP#k$X?aiH@(@V9Fs*&|ZlrPb2SFUBe z&!$Hb_O|M?4J8$JW!5Tpy%4x%y0xAixsa$D@goO3H|ywZ?{7zsYyWm|u*GN1Y&>n3 z-ZtcyXlxTJ#rRpH^D4pAK7#6)L?#I^o}y`Q;r_MZ<6E)TjD{_4jCsL2U={4xQGqRA z6)viU&WQZjvf_f1V&Dbl&FK${S-tExs`C5?tlVtCYc2nYBQW!&&o7e->l5uK=8`9KCnqHpy_fc|8!ulU z@0fLqt&9!_U)DB;@_;F^Qv**=NtslR6)Y)O?g6A+#0bNRHJY6X$d$Pr4gPqO!iG8J z-|W+TZ7Awr&hKzDZNAD6BHzw2&7!!?YiSX8yLGWWtn(EYx3lXBc@q7~w!mxi?!N34 z^Y(i8%PCEE`&05P@a6f^;+$iCtkw>UKzo;2&rrWlxFSX!7&V!MnTbF#a(gizXzjIM zd%;Pir#Fz);;~eNb=W{Is zCwW=7Gl^kk2*itqsdRDH^=9?Ym|v8<_@Yj&Sq@?iW)VO54#}nNO;8nh7o`0Ch-RlD zlW5jk&Do;NcOP@Q+Pr|-=k^KE_AsOT264*nyy$rO;u`k&yUbNoo((^dz~K=%Lq~Jx z_Pt>=2)hq!S_v@hSp%=!q=hSk>K(ICZQGD-ndTeB_ngDc6?=LSEp8co+}dck0`P-P zg8+YPXQzlNl~Exf=ZIl_?Eyyzd6rLxH1vu((3?NT6{~KV#||@n_`cyHqPVNFfSfcm zgl9Tjd^^H6%X-k;YtmY(b>SdA(rv2okc-5An#;vOlWraXn`nPr!J` zkMz@lgN31{_0$9bJe5y=(WpxOKS6~g-#qYylQ#2Qm|)k7lST*PKo<(1Upi7GP~it` z6!Bs`MHz}emhzA67y#$e))6JAr<$GkFu9|<3-=xDu%{+QbGho_2rol;+r>Tna+G~69%@o2O^4}`S`;SW_N{}#V&E*j&9V7XUtUE+Ud8t zai-_pYE#yRJB{GpEW|jf$Z*bpP$=&=c+ne)cw7e`Su5K3`Z3YsI 
zHm?chkbcmT655B0sQ8+EV10*Qhvalz*j!e#sO?vinakddT;{oLG$yP}UBi*IRdS5ozPQVd3DcgSboCE9u`_!uxB z+H8sM+-;D^TDg$FYEC;kEGn}sO(-!`+;U1O_9hyjeQA*GSHAxlsDo8Mwan!i-RLtU6S&?2C-KDD1P+?%H0}{*kSA|4#7eCj>&Xlwk|>M0n_|D8r3{`0f3{MHS=njW1*O72iFb+lO`LxR(hA5*}x{A{z}NZO_yzEFh|?3D=p1(yKg9V&g+8l!nTrd z0NvlIxwoz7XYegw(u$L&=j@tD*^ccU{GEwX_L^{?f4o2Z0OY>)?)lxz+`G&Npuc*T z_*@~8#(_FQE}7b5VUwOCI0=m3p|;Juq?E0C|0=oMNsL&NT!~zD32Do2s;8V@Jzh9Z zSIt-ZTwzLB)KmJraxb-#9y<*=1P|3*YEEP1PU9RE9;yH=V5UpOkkk%75 zx0p3OjY*CW=~4!XF(eJl(&4qb%pQ`nl0StP4efAwp=07~ZeMFT)INaFa$S5UL9XPb zppcL8EN^g)(isshw*jV{m)s9ohHsBhP&Bkni|FKesTNPkH%eDLbkQn+S203P4`l%C z@>=ek*j?wufss>s3_`>$lGZdC@TB76-cuW2j9#(!FO^7JSWaJ7V399kwa z2b>&NJ4W|{7c2R8=}qrVRzTJGtptEtL!R_Zeoz5oz1N_wras<|Tc%2aHXbfE#x_rG zlJ9R2b-yS;cJ3`MMB=o}k@F39PTn~yFm80mE%3YPKsrJsm^Lb0wf6%mC(e8G)8`WF zVh$yvKtwED zaM&BtqQW^~ZWa0Ucy@xwaB3ei&-7!xCkOc2;>S z_^kq|2^~=RKg;Bwwg$XgdcN!EEqUv+AD&P~=9eEssMW=Zch)r-gdiP+>X3=qXx}bL zWi4hFHx@1~Lv+)oI8%*o%JT`A*fb!(lEqB)?O2ei5F0I7j&nKQIzQgx8i1p%2ZF`F z5JY{2kR=YsBn@$>lH9CX*6RyLzUh|L9blH_em3*F$st_QtEPR}i&XwE|^MK&zx zNA-oBcj{b#oqhVfPIXA=n``HD%EJOdiW~QI(OfVFo($HS6hJXovNTGGHTZ$n5GRcM zJBlRu?1}*b>md033^pWZJp?APQ!0pBnJQ3 zp&Xfw5+4*{oYe*dRFfZlP1wt<0oWvxlSrZgh)O`FHy||MU^A4ajDQ%zXjru*h(j)J zE^%U>3+!P6!hF%BqP|B89Af;Qq{EA^1d~V)@Ctbgg+~8uf|xL|K&#jk{w>u_ zYi0sDL5!QYcT_j7Yg*)T?=^!gNxpLpZ&!Aq5Rz**gm&4D)?-k5V_Ko&un_xs#S0ZI zRc+cK5wZy>Tha}>>w=_h%s{cCD5@{335S9^GuCsJyj@iZtnD7bNQ4q!Gusm@;n=yF-Az==o^WI{oMkXTrI*UuWCmvyBsx~jmCCy@mApu);-Ml!F@0*TbN$e@)^dh`0 z)zc=9Hhe6DK5i7G&@^S??UnG*y!*`XbJzkdqerfHj@6sL*phFI=K4LRtvG&Z#Qm0^ zjqaC^@!`633*l;V2_0ehpBawYvA+0MoI4~nh>JL~4#vxzDlf>maRzRSdbUxkNwl!? zZZbqy;Et1U$VP@kPdQgLpoY5eYOlJDvr z4Nv5uGr`EH^;~w8A~=LL`W~YiP21Yn^wu*sOD(~rUZrdKjay)zkUfHL3xCJ zmhqyqhq2m^Y7*-byJ!geV^h(?OH_9(5=o+9N1yHVOWv3X^be?4fyr`{)6{DixUwbv zc??Xw0rMzb)8Y-=hfIh%icVCaK?$42L=~SW)?rQEADXb$pnm?CEBG z{`}OAqq^$-nee30`qND9(GOo6R(S@u6SCeDG_nqEM|odSH<34<@>-;xlrQz-cn3Va zKu;AgKk{TV{5oU!Wx+9&vNIy5cft>o%~aU(N|Lfmx$1{5cNNacCBNbgt>U$_Pr8Do z?`MNG(wt9{2E8HR?*;y!i@}axQcoHmxHTDK5%E7166lsFG-)q5nm432xs|~W&PRLK zQ!seQaxFt)9bQsi2=KyOS>ZXafNJ<^tJTA zh~;`SLCrS&jeBGdIAV!OSQl`74ogZc?K=*>%gdR?OSjVYcaUV>HWIC%q1+^WvpOuz z(a--X=zRhAs}I0=2XUKE7rE45ak8{xKyK2Fo_lg?eCLG}2MTSxnIR``<*y1`cPSi! zZ8X_0PkRl9 za@-oZ0E{8#mUe>F z$E_XI083LrYE5nhb_EAC zzlR{T5%kqr5Pm5ASj|Qa_{HL4ElADD$^itkb5IMR2{@UW@vBHk{X+;^5~O}SN&Jt4 z#O|D|_D<$(9DICyZ0sO55C{n606Kfxxfpo>?VM?T_wf6WH}ps{;IT&|V|!N@L27Ei zKl*j}qhC8`)?bSKI-d+}V1u51W_#=iO7s7IeU=dP6(EVMvNYqhdc#0JvroVjmPS6X(f zg^+To6k=j1?NUm%bZlu^2yH|n0)@f@L5(p)6X*lcuqyFEi~+frcq36X7$3X@<-rF` z2pHn+%xt@~bN(nxBEEE9+MGS#H{b8fxAUKw&G(-{zG0D(qf1wc``}^?Qu{}j%nUEv z&Ii$(n=*YnT^$Lpp3P06cp~2BI#O}>6^(HaqsWoc6W zB>j(<8yeoj9S)YKP-e6vDv@k-c%+IqQckMy{Lf75Bw|l7k%_=j2xa+AFaXr`E zC8?@bPZ~S0Rd~j06^c?B@@0B0*@~@o?dIEQIV>~l88Y-`YU5uQ|G6@4Z9~ObytpFT z-qhtD-r8uStE;uSLbXV-tL*CXYP&`&*P~Xnyjs<5X4SMN>>4dsRu#x9d31JVSFW_| z^EEzSEM{&JA~$a`*p%wgS)2G4;uiNZ?xLw76P##kr;}*h%$GOytsB{^DQ)RHeC3t! 
literal 0
HcmV?d00001

From 33493b440cc76cdb55cfc768a4a993bb34e9386f Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Mon, 28 Oct 2024 22:56:51 +0900
Subject: [PATCH 017/120] [GPU] Fix bugs in cpu impls (#27260)

### Details:
 - *Select GPU implementation for reorder in shape flow with blocked output format.*
 - *Ensure CPU implementation waits for GPU kernel execution to finish when using GPU output as input.*

### Tickets:
 - *155116*
---
 .../src/graph/impls/cpu/activation.cpp        |  2 +-
 .../src/graph/impls/cpu/broadcast.cpp         |  2 +-
 .../intel_gpu/src/graph/impls/cpu/concat.cpp  |  2 +-
 .../intel_gpu/src/graph/impls/cpu/crop.cpp    |  2 +-
 .../src/graph/impls/cpu/detection_output.cpp  |  2 +-
 .../intel_gpu/src/graph/impls/cpu/eltwise.cpp |  2 +-
 .../intel_gpu/src/graph/impls/cpu/gather.cpp  |  2 +-
 .../graph/impls/cpu/non_max_suppression.cpp   |  4 ++--
 .../src/graph/impls/cpu/proposal.cpp          |  2 +-
 .../intel_gpu/src/graph/impls/cpu/range.cpp   |  2 +-
 .../intel_gpu/src/graph/impls/cpu/reduce.cpp  |  2 +-
 .../intel_gpu/src/graph/impls/cpu/reorder.cpp |  2 +-
 .../src/graph/impls/cpu/scatter_update.cpp    |  2 +-
 .../intel_gpu/src/graph/impls/cpu/select.cpp  |  2 +-
 .../src/graph/impls/cpu/shape_of.cpp          |  2 +-
 .../src/graph/impls/cpu/strided_slice.cpp     |  2 +-
 .../intel_gpu/src/graph/impls/cpu/tile.cpp    |  2 +-
 .../graph/impls/registry/reorder_impls.cpp    |  3 ++-
 .../src/graph/include/primitive_inst.h        |  1 +
 .../intel_gpu/src/graph/primitive_inst.cpp    | 19 +++++++++++++++++++
 20 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/activation.cpp
index e750303b955d77..1ec8c9458cf30b 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/activation.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/activation.cpp
@@ -140,7 +140,7 @@ struct activation_impl : public typed_primitive_impl {
         OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "activation::execute_impl");
         auto& stream = instance.get_network().get_stream();
 
-        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();
+        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl();
 
         if (!pass_through_events) {
             for (auto e : events) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp
index 2701a57001eb93..7abd3a141c15f1 100644
---
a/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/broadcast.cpp @@ -60,7 +60,7 @@ struct broadcast_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "broadcast::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp index 6b7a483bae7d8c..a7d3beb5b94cf0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/concat.cpp @@ -51,7 +51,7 @@ struct concatenation_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "concat::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/crop.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/crop.cpp index 99bfa9b9383492..c8041f44a78e86 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/crop.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/crop.cpp @@ -37,7 +37,7 @@ struct crop_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "crop::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/detection_output.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/detection_output.cpp index 7d3e4ff7c4eadf..25676ea3e9aa3d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/detection_output.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/detection_output.cpp @@ -827,7 +827,7 @@ struct detection_output_impl : typed_primitive_impl { event::ptr execute_impl(const std::vector& events, detection_output_inst& instance) override { auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/eltwise.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/eltwise.cpp index 1b18b78b041fd8..057686f9c2d6a6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/eltwise.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/eltwise.cpp @@ -81,7 +81,7 @@ struct eltwise_impl : public typed_primitive_impl { 
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "eltwise::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp index 242273a23dd000..733e143b2c2c76 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/gather.cpp @@ -59,7 +59,7 @@ struct gather_impl : public typed_primitive_impl { return stream.group_events(events); } - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp index 4783159d501404..446a277866f222 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp @@ -403,7 +403,7 @@ struct non_max_suppression_impl : typed_primitive_impl { event::ptr execute_impl(const std::vector& events, typed_primitive_inst& instance) override { auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { @@ -455,7 +455,7 @@ struct non_max_suppression_gather_impl : typed_primitive_impl& events, typed_primitive_inst& instance) override { auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/proposal.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/proposal.cpp index e49cb3a832f8ae..b295ee5f84c56f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/proposal.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/proposal.cpp @@ -388,7 +388,7 @@ struct proposal_impl : typed_primitive_impl { event::ptr execute_impl(const std::vector& events, proposal_inst& instance) override { auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp index 83142812f29e8b..439bbf5aa4ef6b 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/range.cpp @@ -37,7 +37,7 @@ struct range_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "range::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp index 5a3867f9d1582a..dd6284a3dd795d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp @@ -77,7 +77,7 @@ struct reduce_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "reduce::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp index 1b6f145c4ceb2d..6e4202f815dc96 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/reorder.cpp @@ -37,7 +37,7 @@ struct reorder_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "reorder::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp index 1a329ea495ef82..94a12dadddb407 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/scatter_update.cpp @@ -51,7 +51,7 @@ struct scatter_update_impl : public typed_primitive_impl { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "scatter_update::execute_impl"); auto& stream = instance.get_network().get_stream(); - const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph(); + const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.all_dependencies_cpu_impl(); if (!pass_through_events) { for (auto e : events) { diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp index 9c9ab75f64ad59..78df55d5c7c1f7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/select.cpp @@ -50,7 +50,7 @@ struct select_impl : public typed_primitive_impl + 56 + + + 7 + + + + + + 16 + 6 2 - 48 + + + + + +*Example 
1D signal, transpose_frames=true: * + +.. code-block:: xml + :force: + + + + + + 56 - 8 + 7 - - + + - + + 6 + 16 2 - 9 - 9 + + + + +*Example 2D signal, transpose_frames=false: * + +.. code-block:: xml + :force: + + + + + + 3 + 56 + + + 7 + + + + + + 3 + 16 + 6 + 2 + + + + + +*Example 2D signal, transpose_frames=true: * + +.. code-block:: xml + :force: + + + + + + 3 + 56 + + + 7 + + + + + + 3 + 6 + 16 2 From b2a9527bac12563e49b01a745daaa27e007c133b Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 17:49:42 +0100 Subject: [PATCH 023/120] Add commit signoff policy readme (#27282) --- docs/dev/ci/commit_signoff_policy.md | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 docs/dev/ci/commit_signoff_policy.md diff --git a/docs/dev/ci/commit_signoff_policy.md b/docs/dev/ci/commit_signoff_policy.md new file mode 100644 index 00000000000000..ec6ec446286f58 --- /dev/null +++ b/docs/dev/ci/commit_signoff_policy.md @@ -0,0 +1,72 @@ +# How to sign-off commits + +We require a sign-off commit message in the following format on each commit in pull request. + +``` +This is a commit message. + +Signed-off-by: Author Name +``` + +## How to sign-off new commits + +In a local Git environment, the sign-off message can be added to a commit either manually (as a text) +or via the **-s** flag used with the “git commit” command, for example: + +`git commit -s -m "My commit message"` + +To avoid manually adding the flag for each commit, we recommend setting up a Git hook with the following steps: + +1. Navigate to the `/.git/hooks` folder. +2. Open the `prepare-commit-msg.sample` file and paste the following content: + +``` +COMMIT_MSG_FILE=$1 +COMMIT_SOURCE=$2 +SHA1=$3 + +NAME=$(git config user.name) +EMAIL=$(git config user.email) + +if [ -z "$NAME" ]; then + echo "empty git config user.name" + exit 1 +fi + +if [ -z "$EMAIL" ]; then + echo "empty git config user.email" + exit 1 +fi + +git interpret-trailers --if-exists doNothing --trailer \ + "Signed-off-by: $NAME <$EMAIL>" \ + --in-place "$1" +``` + +3. Save the file with the name `prepare-commit-msg` (remove the .sample extension). +4. Make the file executable (on Linux: `chmod +x /.git/hooks/prepare-commit-msg`). + +**Note**: For both sign-off approaches, ensure your user name and email address are configured in Git first: + +``` +git config user.name 'FIRST_NAME LAST_NAME' +git config user.email 'MY_EMAIL@example.com' +``` + +### Sign-off web-based commits + +To enable automatic sign-off of commits made via GitHub web interface, make sure that +[Require contributors to sign off on web-based commits](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/managing-repository-settings/managing-the-commit-signoff-policy-for-your-repository#enabling-or-disabling-compulsory-commit-signoffs-for-your-repository) +setting is selected in the Settings menu of your OpenVINO repository fork. + +## How to sign-off older commits in the history + +If you forget to add the sign-off to your last commit, you can amend it and force-push to GitHub: + +``` +git commit --amend --signoff +``` + +To sign off on even older commits, use an interactive rebase, edit unsigned commits, and execute +`git commit --amend --signoff` for each. However, please note that if others have already started working based on +the commits in this branch, this will rewrite history and may cause issues for collaborators. 
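As a minimal sketch of that interactive-rebase flow (the three-commit depth and the branch name below are hypothetical; the commands themselves are standard Git):

```
# Rewrite the last three commits (hypothetical depth);
# in the editor, change "pick" to "edit" for each unsigned commit.
git rebase -i HEAD~3

# At every stop, add the sign-off trailer without changing the message, then continue:
git commit --amend --no-edit --signoff
git rebase --continue

# The rewritten history must be force-pushed:
git push --force-with-lease origin my-feature-branch
```

Here `--force-with-lease` is preferred over a plain `--force` because the push aborts if someone else has updated the branch in the meantime.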
From 65313fcff4c19c8d8abb095f8300c3e568800c5f Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 19:25:59 +0100 Subject: [PATCH 024/120] Minor updates for commit_signoff_policy.md --- docs/dev/ci/commit_signoff_policy.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/dev/ci/commit_signoff_policy.md b/docs/dev/ci/commit_signoff_policy.md index ec6ec446286f58..0328d3c3ec308c 100644 --- a/docs/dev/ci/commit_signoff_policy.md +++ b/docs/dev/ci/commit_signoff_policy.md @@ -21,6 +21,8 @@ To avoid manually adding the flag for each commit, we recommend setting up a Git 2. Open the `prepare-commit-msg.sample` file and paste the following content: ``` +#!/bin/sh + COMMIT_MSG_FILE=$1 COMMIT_SOURCE=$2 SHA1=$3 @@ -44,7 +46,7 @@ git interpret-trailers --if-exists doNothing --trailer \ ``` 3. Save the file with the name `prepare-commit-msg` (remove the .sample extension). -4. Make the file executable (on Linux: `chmod +x /.git/hooks/prepare-commit-msg`). +4. Make the file executable (on Linux / Git Bash: `chmod +x /.git/hooks/prepare-commit-msg`). **Note**: For both sign-off approaches, ensure your user name and email address are configured in Git first: From c158480ae3a94f9a1f4a711d06224bce934876ed Mon Sep 17 00:00:00 2001 From: Alina Kladieva Date: Mon, 28 Oct 2024 20:45:48 +0100 Subject: [PATCH 025/120] [GHA] Use action_path in default smart ci config path (#27287) For easier usage in 3rd-party repos Signed-off-by: Alina Kladieva --- .github/actions/smart-ci/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/smart-ci/action.yml b/.github/actions/smart-ci/action.yml index cd111d617ddc1b..4d772c8f0eeb03 100644 --- a/.github/actions/smart-ci/action.yml +++ b/.github/actions/smart-ci/action.yml @@ -30,7 +30,6 @@ inputs: components_config_schema: description: "Path to the schema file for components configuration" required: false - default: ".github/actions/smart-ci/components_schema.yml" labeler_config: description: "Path to labeler configuration file" required: false @@ -101,7 +100,7 @@ runs: -f "${{ inputs.ref_name }}" \ -p "${{ inputs.component_pattern }}" \ -c "${{ inputs.components_config }}" \ - -m "${{ inputs.components_config_schema }}" \ + -m "${{ inputs.components_config_schema || env.DEFAULT_CONFIG_SCHEMA }}" \ -l "${{ inputs.labeler_config }}" \ --enable_for_org "${{ inputs.enable_for_org }}" \ --skip-when-only-listed-labels-set "${{ inputs.skip_when_only_listed_labels_set }}" \ @@ -109,3 +108,4 @@ runs: shell: bash env: GITHUB_TOKEN: ${{ inputs.repo_token }} + DEFAULT_CONFIG_SCHEMA: "${{ github.action_path }}/components_schema.yml" From e07546d6414768c7fea3769eef40bcb544e9352a Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 29 Oct 2024 12:29:00 +0400 Subject: [PATCH 026/120] [TF FE] Deduce Switch-Merge predicate shape (#27277) **Details:** It helps to convert some TF models out-of-the-box with static rank tensors that are required by plugins for inference. 
**Ticket:** 156204 --------- Signed-off-by: Kazantsev, Roman --- .../reverse_shape_and_type_infer.cpp | 9 ++++ .../transformations/switch_merge_resolve.cpp | 3 ++ .../tensorflow_tests/test_tf_SwitchMerge.py | 45 +++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp b/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp index 211f351da34024..9a06201f688675 100644 --- a/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/reverse_shape_and_type_infer.cpp @@ -282,6 +282,15 @@ bool ov::pass::ReverseShapeAndTypeInfer::run_on_model(const std::shared_ptrget_input_tensor(0).m_element_type = element::boolean; is_changed = true; } + + // in case TensorFlow models, we can deduce predicate shape that must be a scalar + // If operations created by fusing Switch-Merge sub-graph contain tf_switch_merge_if rt-info + if (if_op->get_rt_info().count("tf_switch_merge_if") && + if_op->get_rt_info()["tf_switch_merge_if"].as() && + if_op->input_value(0).get_partial_shape().rank().is_dynamic()) { + if_op->get_input_tensor(0).m_partial_shape = ov::PartialShape({}); + is_changed = true; + } } else if (ov::as_type_ptr(op)) { is_changed |= inherit_output_shape(op, {0}); is_changed |= inherit_output_type(op, {1}); diff --git a/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp b/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp index 34b2a82152ccfc..cbdc506671aa67 100644 --- a/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp +++ b/src/frontends/tensorflow/src/transformations/switch_merge_resolve.cpp @@ -235,6 +235,9 @@ bool pass::SwitchMergeResolver::run_on_model(const shared_ptr& m) { auto else_body = make_shared(else_results, else_params); auto if_op = make_shared(cond); + // in case TensorFlow models, we can deduce predicate shape that must be a scalar + if_op->get_rt_info()["tf_switch_merge_if"] = true; + set_cf_marker(if_cf_marker, if_op); if_op->set_then_body(then_body); if_op->set_else_body(else_body); diff --git a/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py b/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py index 96b73dd2134575..3747ab7a726aec 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_SwitchMerge.py @@ -63,3 +63,48 @@ def test_merge_eliminating_several_cond_flows(self, params, cond_value, x_type, self._test(*self.merge_eliminating_several_cond_flows_net(**params, cond_value=cond_value, x_type=x_type), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestSwitchMergeWithVariablePredicate(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + assert 'x:0' in inputs_info + x_shape = inputs_info['x:0'] + inputs_data = {} + rng = np.random.default_rng() + inputs_data['x:0'] = rng.integers(-10, 10, x_shape).astype(np.float32) + inputs_data['cond:0'] = np.array(self.cond_value, dtype=bool) + return inputs_data + + def switch_merge_with_variable_predicate_net(self, x_shape, cond_shape, cond_value): + self.cond_value = cond_value + tf.compat.v1.reset_default_graph() + # Create the graph and model + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(tf.float32, x_shape, 'x') + cond = 
tf.compat.v1.placeholder(tf.bool, cond_shape, 'cond') + const_add = tf.constant(3, dtype=tf.float32) + const_sub = tf.constant(1, dtype=tf.float32) + switch_false, switch_true = tf.raw_ops.Switch(data=x, pred=cond) + add = tf.raw_ops.AddV2(x=switch_false, y=const_add) + sub = tf.raw_ops.Sub(x=switch_true, y=const_sub) + merge = tf.raw_ops.Merge(inputs=[add, sub]) + const_main = tf.constant(1, dtype=tf.float32) + tf.raw_ops.AddV2(x=merge[0], y=const_main, name='add_res') + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + @pytest.mark.parametrize('x_shape', [[], [2], [3, 2]]) + @pytest.mark.parametrize('cond_shape', [None, []]) + @pytest.mark.parametrize('cond_value', [True, False]) + @pytest.mark.precommit + @pytest.mark.nightly + def test_switch_merge_with_variable_predicate(self, x_shape, cond_shape, cond_value, + ie_device, precision, ir_version, temp_dir, + use_legacy_frontend): + if ie_device == 'GPU': + pytest.skip("156244: accuracy error on GPU") + self._test(*self.switch_merge_with_variable_predicate_net(x_shape, cond_shape, cond_value), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) From 744475b46127c74531d0271218ecaf1c5ff5b702 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Tue, 29 Oct 2024 09:51:57 +0100 Subject: [PATCH 027/120] [SPEC][Op] Align SearchSorted specification with core (#27275) ### Details: - *Align SearchSorted tensor/attribute names with ones used in core* - *Minor improvements for descriptions* - *Fix SliceScatter missing opset number* ### Tickets: - *ticket-id* --- .../operation-sets/operation-specs.rst | 2 +- .../operation-specs/sort/search-sorted-15.rst | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst index 6ecbf2695699f9..8eccea47c31dd0 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs.rst @@ -215,7 +215,7 @@ Operation Specifications Sin-1 Sinh-1 Slice-8 - SliceScatter + SliceScatter-15 SoftMax-1 SoftMax-8 SoftPlus-4 diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst index 81c592d3341a35..7a623a1e16739c 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sort/search-sorted-15.rst @@ -17,29 +17,32 @@ SearchSorted **Attributes** -* *right* +* *right_mode* - * **Description**: If False, set the first suitable index. If True, return the last suitable index for given value. Default is False. - * **Range of values**: true or false - * **Type**: boolean + * **Description**: flag to control whether output would contain leftmost or rightmost indices for given values. + * **Range of values**: + + * *true* - return the rightmost (last) suitable index for given value. + * *false* - return the leftmost (first) suitable index for given value. 
+  * **Type**: ``boolean``
+  * **Default value**: false
+  * **Required**: *no*
 
 **Inputs**:
 
-* **1**: ``sorted`` - ND input tensor of type *T* - cannot be a scalar, containing monotonically increasing sequence on the innermost dimension. **Required.**
+* **1**: ``sorted_sequence`` - ND input tensor of type *T* - cannot be a scalar, containing monotonically increasing sequence on the innermost dimension. **Required.**
 
 * **2**: ``values`` - ND input tensor of type *T*, containing the search values. If sorted sequence is 1D, then the values can have any shape, otherwise the rank should be equal to the rank of sorted input. **Required.**
 
 **Outputs**:
 
-* **1**: Tensor of type *TOut*, with the same shape as second input tensor, containing the indices.
+* **1**: Tensor of type *T_IND*, with the same shape as second input tensor ``values``, containing the indices.
 
 **Types**
 
 * *T*: any supported floating-point and integer type.
 
-* *TOut*: int64.
+* *T_IND*: ``int64``.
 
 **Example**
 
@@ -47,7 +50,7 @@ SearchSorted
    :force:
 
-   
+   
                7
                256
                200

From dd7967cf19b0241d5504f02bbf504533f0fc8326 Mon Sep 17 00:00:00 2001
From: Nikolay Shchegolev
Date: Tue, 29 Oct 2024 15:44:40 +0400
Subject: [PATCH 028/120] [CPU][OMP] Safe usage of threads num with buffers
 (#27237)

### Details:
 - *Function `parallel_get_max_threads()` may return either the number of cores or the number of threads. That can affect access to buffers whose size depends on the number of threads.*
 - *...*

### Tickets:
 - *152606*

---------

Co-authored-by: Ilya Lavrenov
---
 src/plugins/intel_cpu/src/nodes/ctc_loss.cpp  |  5 ++-
 src/plugins/intel_cpu/src/nodes/eltwise.cpp   | 17 +++++---
 src/plugins/intel_cpu/src/nodes/gather.cpp    | 12 +++---
 src/plugins/intel_cpu/src/nodes/gather.h      |  1 +
 .../intel_cpu/src/nodes/grid_sample.cpp       | 12 +++---
 .../intel_cpu/src/nodes/grid_sample.hpp       |  2 +-
 .../kernels/scaled_attn/mha_single_token.cpp  |  6 ++-
 src/plugins/intel_cpu/src/nodes/llm_mlp.cpp   | 37 ++++++++++------
 src/plugins/intel_cpu/src/nodes/mha.cpp       | 24 ++++++-----
 src/plugins/intel_cpu/src/nodes/mha.h         |  2 +
 src/plugins/intel_cpu/src/nodes/mvn.cpp       | 43 +++++++++++++------
 src/plugins/intel_cpu/src/nodes/qkv_proj.cpp  | 11 ++---
 src/plugins/intel_cpu/src/nodes/reduce.cpp    |  8 +++-
 src/plugins/intel_cpu/src/nodes/roi_align.cpp |  6 ++-
 .../intel_cpu/src/nodes/scaled_attn.cpp       | 27 ++++++++----
 .../intel_cpu/src/nodes/strided_slice.cpp     |  7 ++-
 .../intel_cpu/src/nodes/strided_slice.h       |  1 +
 17 files changed, 142 insertions(+), 79 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp
index 78bb6fc0563e60..3161c9a0e87a84 100644
--- a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp
+++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp
@@ -84,7 +84,8 @@ void CTCLoss::execute(dnnl::stream strm) {
     std::vector decodedTargetLenB(batchNum, 0);
     std::vector> targetDB(batchNum);
     std::vector>> logProbabilitiesB(batchNum);
-    std::vector errorMsgB(parallel_get_max_threads());
+    const auto threads_num = parallel_get_max_threads();
+    std::vector errorMsgB(threads_num);
 
     auto threadBody_1 = [&](const int ithr, const int nthr) {
         size_t start(0lu), end(0lu);
@@ -153,7 +154,7 @@
         } // for batch
     }; // threadBody_1
 
-    parallel_nt(0, threadBody_1);
+    parallel_nt(threads_num, threadBody_1);
     if (returnCode != 0) {
         std::string resErr("");
         for (auto& err : errorMsgB) {
diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp
index
5c3a358dff9d38..c2d23bf9adc89e 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -1503,7 +1503,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { fullWorkAmount *= jep.dims[i]; } - size_t minimalConcurrency = parallel_get_max_threads(); + m_threads_num = static_cast(parallel_get_max_threads()); size_t minimalJitWorkAmount = 256; size_t currentJitWorkAmount = jep.dims[jep.dims.size() - 1]; int collapsedDims = 0; @@ -1516,6 +1516,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { for (size_t j = 1; j < inpDims.size(); j++) { if (inpDims[j].back() != inpDims[0].back()) { hasDifferentDims = true; + break; } } @@ -1538,7 +1539,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } size_t nextJitWorkAmount = currentJitWorkAmount * jep.dims[jep.dims.size() - 2]; - if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + if (fullWorkAmount / nextJitWorkAmount >= m_threads_num) { currentJitWorkAmount = nextJitWorkAmount; collapsedDims++; @@ -1622,8 +1623,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { if (_pKernel->jep_.input_size == optimalTensorRank) { // execute Optimized 6D - parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], - [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + auto d6_loop = [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { auto args = jit_eltwise_call_args_indexes(); args.indexes[0] = i0; args.indexes[1] = i1; @@ -1632,7 +1632,11 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { args.indexes[4] = i4; (*_pKernel)(&args_ptrs, &args); - }); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_5d(ithr, nthr, dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], d6_loop); + }); } else { // execute Optimized Generic if (_pKernel->jep_.use_runtime_ptrs) { @@ -1642,7 +1646,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { _schedulerWorkAmount *= dims_out[i]; } } - parallel_nt(0, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; splitter(_schedulerWorkAmount, nthr, ithr, start, end); @@ -1676,6 +1680,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { std::unique_ptr _pKernel; size_t _schedulerWorkAmount = 0; size_t _batchDimIdx = 0; + size_t m_threads_num = 0lu; public: static const int optimalTensorRank = 6; diff --git a/src/plugins/intel_cpu/src/nodes/gather.cpp b/src/plugins/intel_cpu/src/nodes/gather.cpp index 81f6f36b84dd89..d2629fe8fe6811 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather.cpp @@ -253,6 +253,7 @@ void Gather::createPrimitive() { if (isInPlace()) { return; } + m_threads_num = parallel_get_max_threads(); #if defined(OPENVINO_ARCH_X86_64) uint64_t idxElPerVec = 1; if (!isDynamicNode()) { @@ -294,11 +295,10 @@ void Gather::createPrimitive() { if (!isDynamicNode()) { const uint64_t dataElPerVec = jitKernel->getDataElPerVec(); - const uint64_t nthr = parallel_get_max_threads(); - const uint64_t wpt = ((totalWork / dataElPerVec) / nthr + 1) * dataElPerVec; - execParamsPerThread.resize(nthr); + const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; + execParamsPerThread.resize(m_threads_num); - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { const uint64_t 
dstStart = std::min(wpt * ithr, totalWork); const uint64_t dstEnd = std::min(wpt * (ithr + 1), totalWork); @@ -469,7 +469,7 @@ void Gather::execute(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(0, threadBody); + parallel_nt(m_threads_num, threadBody); return; } @@ -543,7 +543,7 @@ void Gather::executeDynamicImpl(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(0, threadBody); + parallel_nt(m_threads_num, threadBody); return; } diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 96dad228f65b59..6ee097e9a1fbab 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -110,6 +110,7 @@ class Gather : public Node { bool have_scalar_scale = false; size_t zp_group_size = 1u; size_t scale_group_size = 1u; + size_t m_threads_num = 0lu; std::shared_ptr jitKernel; }; diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index 618d6b39105689..c8eed21bb312f5 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -149,11 +149,11 @@ void GridSample::createPrimitive() { } jitKernel->create_ker(); - nthr = parallel_get_max_threads(); - execParamsPerThread.resize(nthr); + m_threads_num = parallel_get_max_threads(); + execParamsPerThread.resize(m_threads_num); if (!x64::mayiuse(x64::avx512_core)) { const auto dataElPerVec = jitKernel->getDataElPerVec(); - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { auto& p = execParamsPerThread[ithr]; p.srcHeightF.resize(dataElPerVec); @@ -197,9 +197,9 @@ void GridSample::prepareParams() { const auto& srcDataShape = dataMemPtr->getStaticDims(); const auto& dstShape = dstMemPtr->getStaticDims(); const uint64_t totalWork = dstShape[2] * dstShape[3]; - const uint64_t wpt = ((totalWork / dataElPerVec) / nthr + 1) * dataElPerVec; + const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; - parallel_nt(nthr, [&](const int ithr, const int nthr) { + parallel_nt(m_threads_num, [&](const int ithr, const int nthr) { const uint64_t dstStart = std::min(wpt * ithr, totalWork); const uint64_t dstEnd = std::min(wpt * (ithr + 1), totalWork); @@ -303,7 +303,7 @@ void GridSample::execute(dnnl::stream strm) { (*jitKernel)(&arg); }; - parallel_nt(nthr, threadBody); + parallel_nt(m_threads_num, threadBody); } void GridSample::executeDynamicImpl(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp index 0d172bd5c3e055..b4468d58be9b52 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp @@ -62,7 +62,7 @@ class GridSample : public Node { ov::element::Type dataPrecision; ov::element::Type gridPrecision = ov::element::f32; - int nthr = 1; + size_t m_threads_num = 0lu; std::vector execParamsPerThread; static constexpr size_t IN_DATA = 0; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 1543c168403382..6b6df3c3181ee0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -1068,11 +1068,15 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, } }); - parallel_for3d(B, H, 
q_len, [&](size_t b, size_t h, size_t pq) { + auto bhl_loop = [&](size_t b, size_t h, size_t pq) { auto* temp = buf_attn_score.ptr(0, b, pq, h); size_t temp_stride = buf_attn_score.stride(0); auto* dst = has_out_transpose ? output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); attn_reduce(dst, temp, nthr, SV, temp_stride); + }; + + parallel_nt_static(nthr, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, q_len, bhl_loop); }); } diff --git a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp index 13c46a7c976cfd..8df1f5498da384 100644 --- a/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp +++ b/src/plugins/intel_cpu/src/nodes/llm_mlp.cpp @@ -53,19 +53,19 @@ class LinearKsplit2 { OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); OPENVINO_ASSERT((K % reg_blk_K_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; - works.resize(nthr); + works.resize(m_threads_num); auto K_splits = 2; // split task on more cores is better on TBB - auto valid_nthr = nthr / 2; + auto valid_nthr = m_threads_num / 2; auto blkN_per_thread = (num_blk_N) / valid_nthr; auto blkN_leftover = num_blk_N - (blkN_per_thread * valid_nthr); auto start_blkN = 0; used_nthr = 0; - for (int ithr = 0; ithr < nthr; ithr += K_splits) { + for (int ithr = 0; ithr < m_threads_num; ithr += K_splits) { auto blkN = std::min(num_blk_N - start_blkN, blkN_per_thread); if (blkN_leftover > 0) { blkN_leftover--; @@ -106,7 +106,7 @@ class LinearKsplit2 { wbuffer.alloc(works, weight_element_size); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (is_quantized) { @@ -125,7 +125,7 @@ class LinearKsplit2 { float * w_scale) { static ReduceAdd2bh jit_reduce2cvt(true, std::is_same::value); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; auto& workC = work.m_C; if (work) { @@ -165,6 +165,9 @@ class LinearKsplit2 { } }); } + +private: + int m_threads_num = 0; }; template @@ -205,18 +208,18 @@ class LinearGateUp { // in unit of 32 OPENVINO_ASSERT((N % REG_BLK_N_SIZE) == 0); OPENVINO_ASSERT((K % reg_blk_K_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_N = N / REG_BLK_N_SIZE; - works.resize(nthr); + works.resize(m_threads_num); // split task on more cores is better on TBB - auto valid_nthr = nthr; + auto valid_nthr = m_threads_num; auto blkN_per_thread = (num_blk_N) / valid_nthr; auto blkN_leftover = num_blk_N - (blkN_per_thread * valid_nthr); auto start_blkN = 0; used_nthr = 0; - for (int ithr = 0; ithr < nthr; ithr ++) { + for (int ithr = 0; ithr < m_threads_num; ithr ++) { auto blkN = std::min(num_blk_N - start_blkN, blkN_per_thread); if (blkN_leftover > 0) { blkN_leftover--; @@ -243,7 +246,7 @@ class LinearGateUp { wbuffer.alloc(works, weight_element_size); DEBUG_LOG("Linear N,K=", N, ",", K, " used_nthr=", used_nthr); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (quantized_int8) @@ -267,7 +270,7 @@ class LinearGateUp { const LLMMLPNode::Config& config, MatrixDynQuantPerRow& src_dq, float * w_scale) { - ov::parallel_nt_static(0, 
[&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { work.run(M, pA, strideA_in_bytes); @@ -303,6 +306,9 @@ class LinearGateUp { } }); } + +private: + int m_threads_num = 0; }; template @@ -384,8 +390,8 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase { reinterpret_cast(ptr)); }); - auto nthr = parallel_get_max_threads(); - for (int ithr = 0; ithr < nthr; ithr++) { + m_threads_num = parallel_get_max_threads(); + for (size_t ithr = 0lu; ithr < m_threads_num; ithr++) { auto C1_size = gate_up.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto C2_size = down.works[ithr].set_C(M, reinterpret_cast(cur_scratch_base)); auto max_C_size = std::max(C1_size, C2_size); @@ -482,6 +488,9 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase { dstC += BM * strideC / sizeof(T); } } + +private: + size_t m_threads_num = 0lu; }; #else template diff --git a/src/plugins/intel_cpu/src/nodes/mha.cpp b/src/plugins/intel_cpu/src/nodes/mha.cpp index 7d082e99fa4f6a..9364058c5d19a2 100644 --- a/src/plugins/intel_cpu/src/nodes/mha.cpp +++ b/src/plugins/intel_cpu/src/nodes/mha.cpp @@ -934,7 +934,7 @@ void MHA::prepareParams() { bool isAMXSupported = mayiuse(avx512_core_amx); - size_t numThreads = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); size_t matmulOptimalM = 32; @@ -1072,21 +1072,21 @@ void MHA::prepareParams() { bufferCompensation1Size = rnd_up(N1, N1_blk); if (brgCopyAKernel0) { - bufferMatMul0In0.resize(numThreads * bufferMatMul0In0Size); + bufferMatMul0In0.resize(m_threads_num * bufferMatMul0In0Size); } - bufferMatMul0In1.resize(numThreads * bufferMatMul0In1Size); - bufferMatMul0Out.resize(numThreads * bufferMatMul0OutSize); - bufferMatMul1In1.resize(numThreads * bufferMatMul1In1Size); - bufferMatMul1Out.resize(numThreads * bufferMatMul1OutSize); + bufferMatMul0In1.resize(m_threads_num * bufferMatMul0In1Size); + bufferMatMul0Out.resize(m_threads_num * bufferMatMul0OutSize); + bufferMatMul1In1.resize(m_threads_num * bufferMatMul1In1Size); + bufferMatMul1Out.resize(m_threads_num * bufferMatMul1OutSize); if (brgemmCtx0.is_with_comp) { - bufferCompensation0.resize(numThreads * bufferCompensation0Size); + bufferCompensation0.resize(m_threads_num * bufferCompensation0Size); } if (brgemmCtx1.is_with_comp) { - bufferCompensation1.resize(numThreads * bufferCompensation1Size); + bufferCompensation1.resize(m_threads_num * bufferCompensation1Size); } if (brgemmCtx0.is_with_amx || brgemmCtx1.is_with_amx) { - wsp.resize(numThreads * wsp_size_per_thread); + wsp.resize(m_threads_num * wsp_size_per_thread); } { @@ -1224,7 +1224,7 @@ void MHA::mhaImpl() { auto outPrcSize = outputPrecision.size(); - parallel_for2d(dimsMatMul0Out[0], dimsMatMul0Out[1], [&](size_t i0, size_t i1) { + auto spatial_loop = [&](size_t i0, size_t i1) { size_t threadNum = parallel_get_thread_num(); auto pTranspose0In0_aux = pTranspose0In0 + (i0 * strTranspose0In0[0] + i1 * strTranspose0In0[2]) * inputPrecisions[0].size(); // order 0213 @@ -1417,6 +1417,10 @@ void MHA::mhaImpl() { (*convertReorderKernel)(&call_args); } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, dimsMatMul0Out[0], dimsMatMul0Out[1], spatial_loop); }); } diff --git a/src/plugins/intel_cpu/src/nodes/mha.h b/src/plugins/intel_cpu/src/nodes/mha.h index cd272c086e2190..36afe20224299a 100644 --- a/src/plugins/intel_cpu/src/nodes/mha.h +++ 
b/src/plugins/intel_cpu/src/nodes/mha.h @@ -238,6 +238,8 @@ class MHA : public Node { std::unique_ptr mulAddSoftmaxKernel; std::unique_ptr convertReorderKernel; std::unique_ptr convertTransposeKernel; + + size_t m_threads_num = 0lu; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp index 61aa4738b8f81f..76471b0cca741d 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp @@ -2417,9 +2417,9 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c const size_t H = shape5d[3]; const size_t W = shape5d[4]; - size_t threads_num = parallel_get_max_threads(); + const size_t threads_num = parallel_get_max_threads(); size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 1 : rnd_up(C, blk_size) + blk_size; - parallel_for(N, [&](size_t b) { + auto b_loop = [&](size_t b) { std::vector mean_buffer(aux_buffer_size * threads_num, 0.f); std::vector variance_buffer; if (mvnAttrs.normalizeVariance_) { @@ -2429,7 +2429,7 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c // kernel_type: 0 for mean, 1 for variance, 2 for normalization auto worker = [&](const bool across_channel, const int kernel_type) { - parallel_nt(0, [&](const int ithr, const int nthr) { + parallel_nt(threads_num, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; splitter(D * H * W, nthr, ithr, start, end); @@ -2512,6 +2512,10 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c } worker(false, 2); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, N, b_loop); }); } @@ -2529,15 +2533,15 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co const size_t H = shape5d[3]; const size_t W = shape5d[4]; - size_t CB = div_up(C, blk_size); + const size_t CB = div_up(C, blk_size); - size_t C0 = W * blk_size; - size_t C1 = C0 * H; - size_t C2 = C1 * D; - size_t C3 = C2 * CB; - size_t C5 = C * D * H * W; + const size_t C0 = W * blk_size; + const size_t C1 = C0 * H; + const size_t C2 = C1 * D; + const size_t C3 = C2 * CB; + const size_t C5 = C * D * H * W; - size_t threads_num = parallel_get_max_threads(); + const size_t threads_num = parallel_get_max_threads(); size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 
blk_size : rnd_up(C, blk_size); aux_buffer_size += blk_size; std::vector mean_buffer(aux_buffer_size * threads_num); @@ -2562,7 +2566,11 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co // // | // // \|/ ///////////////////////////////// - auto mean_buffer_ptr = &mean_buffer[aux_buffer_size * static_cast(parallel_get_thread_num())]; + auto thread_idx = static_cast(parallel_get_thread_num()); + if (thread_idx >= threads_num) { + return mean_internal; + } + auto mean_buffer_ptr = &mean_buffer[aux_buffer_size * thread_idx]; for (size_t i = 0; i < blk_size; i++) mean_buffer_ptr[i] = 0.f; @@ -2651,7 +2659,7 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co // one thread for one C*W size(the same H) to get C size result for the same H, added to last group result // keep the compute order the same as planar - parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { + auto dh_loop = [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_offset = b_offset + cb * C2 + d * C1 + h * C0; auto mean_buffer_ptr = &mean_buffer[blk_size * cb + aux_buffer_size * thr_idx]; @@ -2665,6 +2673,10 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.post_op_data = post_ops_data_; (*mvn_mean_kernel)(&arg); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, D, H, dh_loop); }); for (size_t i = 1; i < threads_num; i++) { @@ -2678,7 +2690,7 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co for (size_t i = 0; i < variance_buffer.size(); i++) variance_buffer[i] = 0.f; - parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { + auto dh_loop = [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_offset = b_offset + cb * C2 + d * C1 + h * C0; auto mean_buffer_ptr = &mean_buffer[blk_size * cb]; @@ -2694,7 +2706,12 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.post_op_data = post_ops_data_; (*mvn_variance_kernel)(&arg); } + }; + + parallel_nt_static(threads_num, [&](const int ithr, const int nthr) { + for_2d(ithr, nthr, D, H, dh_loop); }); + for (size_t i = 1; i < threads_num; i++) { for (size_t c = 0; c < C; c++) variance_buffer[c] += variance_buffer[c + aux_buffer_size * i]; diff --git a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp index 3260b12f1b5b4b..00c8b6f9b17c0b 100644 --- a/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp +++ b/src/plugins/intel_cpu/src/nodes/qkv_proj.cpp @@ -60,6 +60,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { MemoryPtr m_scratchMem; uint8_t* m_scratch_base = nullptr; int m_M = 0; + size_t m_threads_num = 0lu; MatrixDynQuantPerRow m_quant_act; @@ -79,11 +80,11 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { auto K = w0.size(1); OPENVINO_ASSERT((K % cache_blk_k_size) == 0); - auto nthr = parallel_get_max_threads(); + m_threads_num = parallel_get_max_threads(); auto num_blk_K = K / cache_blk_k_size; int stride_in_bytes = K * weight_element_size; - works.resize(nthr); + works.resize(m_threads_num); int cur_work_id = 0; auto create_works = [&](void* pw, int output_id, int N, int valid_nthr) { @@ -119,7 +120,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { auto proj_size0 = m_node->m_config.proj_size0; auto proj_size1 = m_node->m_config.proj_size1; auto proj_size2 = 
m_node->m_config.proj_size2; - auto n_group_workers = allocate_workers({proj_size0, proj_size1, proj_size2}, nthr); + auto n_group_workers = allocate_workers({proj_size0, proj_size1, proj_size2}, m_threads_num); if (m_node->m_config.weights_combined) { auto* ptr_weights = reinterpret_cast(w0.ptr_v()); @@ -140,7 +141,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { wbuffer.alloc(works, weight_element_size); - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { if (quantized_int8) @@ -237,7 +238,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase { strideA = m_quant_act.K; } - ov::parallel_nt_static(0, [&](const size_t ithr, const size_t nthr) { + ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) { auto& work = works[ithr]; if (work) { work.run(BM, pA, strideA); diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index b40c50f957514f..6cfc94a02b9f3b 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -2742,12 +2742,12 @@ inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { (*reduce_post_kernel)(&arg); }); } else if (layout == ReduceLayoutType::reduce_nspc) { - size_t num_threads = static_cast(parallel_get_max_threads()); + const size_t num_threads = static_cast(parallel_get_max_threads()); size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD; if (OP < num_threads && OW > blk_size) OP *= OH; size_t work_amount = OB * OC * OD * OH * OW / OP; - parallel_for(OP, [&](size_t op) { + auto op_loop = [&](size_t op) { const uint8_t *in_p = in_ptr + op * work_amount * intermediate_data_size; uint8_t *out_p = out_ptr + op * work_amount * dst_data_size; auto arg = jit_reduce_post_call_args(); @@ -2759,6 +2759,10 @@ inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { arg.divisor = &divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); (*reduce_post_kernel)(&arg); + }; + + parallel_nt_static(num_threads, [&](const int ithr, const int nthr) { + for_1d(ithr, nthr, OP, op_loop); }); } else { size_t OCB = div_up(OC, blk_size); diff --git a/src/plugins/intel_cpu/src/nodes/roi_align.cpp b/src/plugins/intel_cpu/src/nodes/roi_align.cpp index eb1797279e1415..27f9426dca6af9 100644 --- a/src/plugins/intel_cpu/src/nodes/roi_align.cpp +++ b/src/plugins/intel_cpu/src/nodes/roi_align.cpp @@ -1076,7 +1076,7 @@ void ROIAlign::executeSpecified() { int bufSize = rnd_up(C, 16); size_t threadsNum = parallel_get_max_threads(); workingBuf.resize(bufSize * threadsNum, 0.f); - parallel_for3d(realRois, pooledH, pooledW, [&](int n, int yBinInd, int xBinInd) { + auto rhw_loop = [&](int n, int yBinInd, int xBinInd) { int numSamplesROI = numSamples[n]; // each sample have 4 values for srcAddressList and weight size_t binOffset = numSamplesROI * BLIParamsNum * pooledW * yBinInd + numSamplesROI * BLIParamsNum * xBinInd; @@ -1095,6 +1095,10 @@ void ROIAlign::executeSpecified() { arg.dst = static_cast(&dst[dstOffset]); arg.src_stride = lastBlockDim * W * H; // only valid for blk, nspc generate inside (*roi_align_kernel)(&arg); + }; + + parallel_nt_static(threadsNum, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, realRois, pooledH, pooledW, rhw_loop); }); } else { // one lane for one sample generation, then pooling all samples. 
diff --git a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp index e229ff4bb72c57..f9f853230c4dd6 100644 --- a/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/scaled_attn.cpp @@ -217,6 +217,7 @@ struct MHAKernel { size_t wsp_size_per_thread = 0; using tag = dnnl::memory::format_tag; using dt = dnnl::memory::data_type; + size_t m_threads_num = 0lu; struct brgemmKey { size_t M; size_t N; @@ -315,21 +316,21 @@ struct MHAKernel { wv_gemm_ptr = wv_result.first; - size_t nthr = static_cast(parallel_get_max_threads()); + m_threads_num = static_cast(parallel_get_max_threads()); // wsp is used to compute beta when K is blocked wsp_size_per_thread = wv_gemm_ptr->get_wsp_size(); - wsp.resize(nthr * wsp_size_per_thread); + wsp.resize(m_threads_num * wsp_size_per_thread); // allocate scratch a/b, notice get_scratch_a_size/get_scratch_b_size returns in bytes size_t data_size = sizeof(T); - qk_scratch_a.resize({nthr, qk_gemm_ptr->get_scratch_a_size() / data_size}); - wv_scratch_a.resize({nthr, wv_gemm_ptr->get_scratch_a_size() / data_size}); + qk_scratch_a.resize({m_threads_num, qk_gemm_ptr->get_scratch_a_size() / data_size}); + wv_scratch_a.resize({m_threads_num, wv_gemm_ptr->get_scratch_a_size() / data_size}); qk_scratch_b.resize({B, Hk, qk_gemm_ptr->get_scratch_b_size() / data_size}); wv_scratch_b.resize({B, Hk, wv_gemm_ptr->get_scratch_b_size() / data_size}); const size_t m_block_size = qk_gemm_ptr->get_mblk_size(); - weight_score.resize({static_cast(parallel_get_max_threads()), H, m_block_size, kv_len}); + weight_score.resize({m_threads_num, H, m_block_size, kv_len}); if (has_out_transpose) { fp32_out.resize({B, q_len, H, head_size_v}); } else { @@ -367,7 +368,7 @@ struct MHAKernel { }); // attention - parallel_for3d(B, H, m_blocks, [&](size_t ithr, size_t b, size_t h, size_t m_blk) { + auto bhb_loop = [&](size_t ithr, size_t b, size_t h, size_t m_blk) { auto m_start = m_blk * m_block_size; auto m_end = std::min(m_start + m_block_size, q_len); auto m_cnt = m_end - m_start; @@ -456,6 +457,10 @@ struct MHAKernel { 1); } } + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, m_blocks, bhb_loop); }); } @@ -652,12 +657,14 @@ struct MHAKernel { size_t m_block_size; // buffer to hold qk temp std::vector qk_buffers; + size_t m_threads_num = 0lu; MHAKernel() = delete; explicit MHAKernel(GraphContext::CPtr ctx): context(ctx) { m_block_size = 4; select_nfltmax_at_0 = false; - qk_buffers.resize(parallel_get_max_threads()); + m_threads_num = parallel_get_max_threads(); + qk_buffers.resize(m_threads_num); } PlainTensor causal_mask; @@ -699,7 +706,7 @@ struct MHAKernel { auto m_blocks = (q_len + m_block_size - 1) / m_block_size; - parallel_for3d(B, H, m_blocks, [&](size_t b, size_t h, size_t m_blk) { + auto bhb_loop = [&](size_t b, size_t h, size_t m_blk) { auto thread_id = parallel_get_thread_num(); if (thread_id < 0) OPENVINO_THROW("The calling thread isn't initialized!"); @@ -801,6 +808,10 @@ struct MHAKernel { has_out_transpose ? &output_emb.at({b, m_start, h * head_size_v}) : &output_emb.at({b, h, m_start}), has_out_transpose ? 
output_emb.stride(1) : output_emb.stride(2), 1); + }; + + parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { + for_3d(ithr, nthr, B, H, m_blocks, bhb_loop); }); } }; diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp index 4f974cfe5e9748..13671c22d102ae 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.cpp +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.cpp @@ -348,6 +348,7 @@ StridedSlice::StridedSliceCommonExecutor::StridedSliceCommonExecutor(const Strid dimsNormalization(); dimsGluing(); indicesCalculation(); + m_threads_num = parallel_get_max_threads(); } void StridedSlice::StridedSliceCommonExecutor::orderParametersByLayouts(const BlockedMemoryDescCPtr& blockedMemoryDesc) { @@ -642,8 +643,7 @@ void StridedSlice::StridedSliceCommonExecutor::dimsGluing() { for (size_t idx = secondDim.first + 1; idx < secondDim.second; idx++) params.attrs.begin[1] /= dstBlockedDimsBefore[idx]; - const size_t maxThreads = parallel_get_max_threads(); - if (params.dstBlockedDims[0] < maxThreads) { + if (params.dstBlockedDims[0] < m_threads_num) { params.dstBlockedDims[1] /= realDstDim; params.srcBlockedDims[1] /= realSrcDim; params.dstBlockedDims.insert(params.dstBlockedDims.begin() + 1, realDstDim); @@ -682,8 +682,7 @@ void StridedSlice::StridedSliceCommonExecutor::indicesCalculation() { dstIndices.resize(workAmount, 0); // should choose more optimal thread count - const size_t nthr = parallel_get_max_threads(); - nThreads = nthr > workAmount ? workAmount : nthr; + nThreads = m_threads_num > workAmount ? workAmount : m_threads_num; if (params.isOptimized) { indicesCalculationForOptimized(); diff --git a/src/plugins/intel_cpu/src/nodes/strided_slice.h b/src/plugins/intel_cpu/src/nodes/strided_slice.h index 5c5950520bda7d..bf698643271d7a 100644 --- a/src/plugins/intel_cpu/src/nodes/strided_slice.h +++ b/src/plugins/intel_cpu/src/nodes/strided_slice.h @@ -122,6 +122,7 @@ class StridedSlice : public Node { size_t workAmount = 0lu; size_t lastDstDim = 0lu; size_t srcShift = 0lu; + size_t m_threads_num = 0lu; }; using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; From c7d8e038f2662dcbe8a81adc6691c594811e5e90 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 12:45:52 +0100 Subject: [PATCH 029/120] [CPU] Drop redundant MemoryOutput nodes (#27189) ### Details: In direct ReadValue->Assign pairs the Assign node is practically useless as there are no other layers that might modify data in between. Therefore, it does make sense to remove corresponding MemoryOutput nodes to eliminate additional overheads on their processing. 
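For illustration, a minimal sketch of the targeted pattern, built with the public OpenVINO C++ API (the model and variable names are illustrative; the shape mirrors the functional test added below). The Assign is fed directly by the ReadValue, so nothing can modify the state in between and the MemoryOutput node generated for the Assign carries no useful work:

```cpp
#include <openvino/openvino.hpp>
#include <openvino/op/ops.hpp>
#include <openvino/op/util/variable.hpp>

// Param -> ReadValue(var) -> Add -> Result, with Assign(var) connected
// directly to the ReadValue output: the direct pair this pass optimizes.
std::shared_ptr<ov::Model> make_direct_read_assign_model() {
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 32});
    auto variable = std::make_shared<ov::op::util::Variable>(
        ov::op::util::VariableInfo{ov::PartialShape{-1, 32}, ov::element::f32, "state0"});
    auto read = std::make_shared<ov::op::v6::ReadValue>(param, variable);  // param acts as the init subgraph
    auto assign = std::make_shared<ov::op::v6::Assign>(read, variable);    // direct ReadValue->Assign pair
    auto add = std::make_shared<ov::op::v1::Add>(read, param);
    auto result = std::make_shared<ov::op::v0::Result>(add);
    return std::make_shared<ov::Model>(ov::ResultVector{result},
                                       ov::SinkVector{assign},
                                       ov::ParameterVector{param},
                                       "DirectReadAssign");
}
```

After this change the CPU plugin executes such a model without MemoryOutput/Assign nodes; the state update responsibility is taken over by the MemoryInputSingle node itself.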
### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/graph_optimizer.cpp | 116 +++++++++++ src/plugins/intel_cpu/src/graph_optimizer.h | 1 + src/plugins/intel_cpu/src/memory_state.cpp | 43 +++++ src/plugins/intel_cpu/src/memory_state.h | 21 ++ src/plugins/intel_cpu/src/nodes/memory.cpp | 119 +++++++++++- src/plugins/intel_cpu/src/nodes/memory.hpp | 59 +++++- .../src/common/read_value_assign.cpp | 182 ++++++++++++++++++ .../template/src/sync_infer_request.cpp | 2 +- src/plugins/template/src/variable_state.hpp | 32 ++- .../include/subgraph_tests/lora_pattern.hpp | 10 + .../subgraph/lora_pattern.hpp | 3 +- .../src/subgraph/lora_pattern.cpp | 74 +++++++ 12 files changed, 640 insertions(+), 22 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 6b3175e24d9dcb..ab7eb223ba17ce 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -183,6 +183,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { MatchSdpaKvCache(graph); graph.RemoveDroppedNodes(); + OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "DropRedundantMemoryOutput"); + DropRedundantMemoryOutput(graph); + graph.RemoveDroppedNodes(); + OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveDroppedEdges"); graph.RemoveDroppedEdges(); } @@ -3186,5 +3190,117 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } +void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { + // When we have a MemoryInput->MemoryOutput pair, that means that the state is immediately populated with the init + // subgraph values when the init subgraph exists. In all the other cases the state is simply a read only object. 
+ // We can optimize such a case by removing the MemoryOutput node and transferring the state value update + // responsibility to a special type of the MemoryInput node - MemoryInputSingle + auto& graphNodes = graph.GetNodes(); + + auto isSuitableMemInput = [](const NodePtr& node) -> bool { + if (Type::MemoryInput != node->getType()) { + return false; + } + + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_isSuitableMemInput); + + auto memInputBase = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInputBase, + "Unexpectedly wrong dynamic type of node: ", + node->getName(), + " of type: ", + node->getTypeStr()); + + auto id = memInputBase->getId(); + + NodePtr MemoryOutput = nullptr; + auto&& childEdges = node->getChildEdgesAtPort(0); + for (auto&& item : childEdges) { + auto childNode = item->getChild(); + + if (Type::MemoryOutput == childNode->getType()) { + auto memOutputBase = std::dynamic_pointer_cast(childNode); + OPENVINO_ASSERT(memOutputBase, + "Unexpectedly wrong dynamic type of node: ", + childNode->getName(), + " of type: ", + childNode->getTypeStr()); + + if (memOutputBase->getId() != id) { + return false; // an Assign node from a different Variable is attached + } + + if (MemoryOutput && MemoryOutput != childNode) { + // only one child MemoryOutput is expected + return false; + } + MemoryOutput = childNode; + } + } + return nullptr != MemoryOutput; + }; + + for (size_t i = 0; i < graphNodes.size(); i++) { + auto node = graphNodes[i]; + if (!isSuitableMemInput(node)) { + continue; + } + + CPU_GRAPH_OPTIMIZER_SCOPE(DropRedundantMemoryOutput_Node); + + auto memInputNode = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(memInputNode, "MemoryInput node ", node->getName(), " has unexpected dynamic type"); + + ov::optional inputShape; + ov::optional inputPrc; + + if (!node->getParentEdges().empty()) { + inputShape = ov::optional(node->getInputShapeAtPort(0)); + inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); + } + + // search for the MemoryOutput node + NodePtr memoryOutputNode; + for (auto&& edge : node->getChildEdgesAtPort(0)) { + auto child = edge->getChild(); + if (Type::MemoryOutput == child->getType()) { + memoryOutputNode = child; + break; + } + } + OPENVINO_ASSERT(memoryOutputNode, "Corresponding MemoryOutput has not been found"); + + graph.RemoveEdge(memoryOutputNode->getParentEdgeAt(0)); + // there are no output edges from MemoryOutput nodes + + // now replace the existing MemoryInput with a special type that works without the corresponding MemoryOutput + auto memInputSingle = std::make_shared(memInputNode->getId(), + memInputNode->getName(), + memInputNode->getTypeStr(), + memInputNode->getOutputShapeAtPort(0), + memInputNode->getOriginalOutputPrecisionAtPort(0), + graph.getGraphContext(), + inputShape, + inputPrc); + + graph.AddNode(memInputSingle); + + if (!memInputNode->getParentEdges().empty()) { + auto parentEdge = memInputNode->getParentEdgeAt(0); + auto parent = parentEdge->getParent(); + const auto inputNum = parentEdge->getInputNum(); + graph.RemoveEdge(parentEdge); + graph.CreateEdge(parent, memInputSingle, inputNum, 0); + } + + for (auto&& edge : memInputNode->getChildEdgesAtPort(0)) { + auto child = edge->getChild(); + const auto outputNum = edge->getOutputNum(); + graph.RemoveEdge(edge); + graph.CreateEdge(memInputSingle, child, 0, outputNum); + } + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 0a85a253ba8d66..886296a7c0053b 100644 ---
a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -52,6 +52,7 @@ class GraphOptimizer { void RemoveMemoryInputConvert(Graph &graph); void RemoveConvertMemoryOutput(Graph &graph); void MatchSdpaKvCache(Graph &graph); + void DropRedundantMemoryOutput(Graph &graph); bool canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode); // Method checks that after the sequential execution of Transpose and Reorder nodes, diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp index f5f76fe42feb48..bf77917497de77 100644 --- a/src/plugins/intel_cpu/src/memory_state.cpp +++ b/src/plugins/intel_cpu/src/memory_state.cpp @@ -156,6 +156,49 @@ MemoryPtr VariableStateDoubleBuffer::internal_state_mem() const { return prime_mem(); } +VariableStateSingleBuffer::VariableStateSingleBuffer(const std::string& name, + const MemoryPtr& external_buffer, + const MemoryDescPtr& external_desc) + : VariableStateBase(name, external_desc) { + OPENVINO_ASSERT(external_buffer); + m_internal_mem = external_buffer; + m_internal_desc = m_internal_mem->getDescPtr(); + auto&& shape = m_internal_desc->getShape(); + + if (shape.isStatic()) { + m_internal_mem->nullify(); + } else { + // in the case of the original desc has dynamic shape we create an empty tensor + auto new_desc = to_static(m_internal_desc); + m_internal_mem->redefineDesc(new_desc); + } +} +MemoryPtr VariableStateSingleBuffer::input_mem() { + return m_internal_mem; +} +MemoryPtr VariableStateSingleBuffer::output_mem() { + return m_internal_mem; +} +MemoryDescPtr VariableStateSingleBuffer::internal_desc() const { + return m_internal_desc; +} + +void VariableStateSingleBuffer::reset_impl() { + auto new_desc = to_static(m_internal_desc); + if (m_internal_mem) { + m_internal_mem->redefineDesc(new_desc); + m_internal_mem->nullify(); + } +} + +MemoryPtr VariableStateSingleBuffer::internal_state_mem() const { + return m_internal_mem; +} + +void VariableStateSingleBuffer::commit_impl() { + // nothing to do +} + VariableStateKVcache::VariableStateKVcache( const std::string& name, const MemoryDescPtr& external_desc, diff --git a/src/plugins/intel_cpu/src/memory_state.h b/src/plugins/intel_cpu/src/memory_state.h index b4c52903d12f31..e7493f327e93fa 100644 --- a/src/plugins/intel_cpu/src/memory_state.h +++ b/src/plugins/intel_cpu/src/memory_state.h @@ -94,6 +94,27 @@ class VariableStateDoubleBuffer : public VariableStateBase { size_t buffer_num = 0; }; +class VariableStateSingleBuffer : public VariableStateBase { +public: + VariableStateSingleBuffer(const std::string& name, + const MemoryPtr& external_buffer, + const MemoryDescPtr& external_desc); + + MemoryPtr input_mem() override; + MemoryPtr output_mem() override; + MemoryDescPtr internal_desc() const override; + +private: + void reset_impl() override; + void commit_impl() override; + + MemoryPtr internal_state_mem() const override; + +private: + MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryPtr m_internal_mem; +}; + class VariableStateKVcache : public VariableStateBase { public: VariableStateKVcache(const std::string& name, diff --git a/src/plugins/intel_cpu/src/nodes/memory.cpp b/src/plugins/intel_cpu/src/nodes/memory.cpp index 756fbc5b578f61..565597bdcc2a9e 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.cpp +++ b/src/plugins/intel_cpu/src/nodes/memory.cpp @@ -377,7 +377,8 @@ bool MemoryInputBase::isSupportedOperation(const std::shared_ptr } MemoryInputBase::MemoryInputBase(const 
std::shared_ptr& op, const GraphContext::CPtr ctx) - : Input(op, ctx), MemoryStateNode(op) { + : Input(op, ctx), + MemoryStateNode(op) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -385,6 +386,7 @@ MemoryInputBase::MemoryInputBase(const std::shared_ptr& op, const Grap if (created()) { context->getMemoryStatesRegister()->registerInput(this); } + executeHook = &MemoryInputBase::assignState; } MemoryInputBase::MemoryInputBase(const std::string id, @@ -394,8 +396,10 @@ MemoryInputBase::MemoryInputBase(const std::string id, const ov::element::Type& output_prc, const GraphContext::CPtr context, const ov::optional& input_shape, - const ov::optional& input_prc) : - Input(output_shape, output_prc, name, type, context), MemoryStateNode(id) { + const ov::optional& input_prc, + MemoryInputBase::mode mode) + : Input(output_shape, output_prc, name, type, context), + MemoryStateNode(id) { outputShapes.emplace_back(output_shape); addOriginalOutputPrecision(output_prc); if (input_shape) { @@ -411,6 +415,17 @@ MemoryInputBase::MemoryInputBase(const std::string id, if (created()) { context->getMemoryStatesRegister()->registerInput(this); } + + // this important to prevent identifying it as a const when it's on a const path + constant = ConstantType::StrictNoConst; + + if (mode::read_value_assign == mode) { + executeHook = &MemoryInputBase::assignState; + } else if (mode::single_read_value == mode) { + executeHook = &MemoryInputBase::bypassAssignState; + } else { + THROW_CPU_NODE_ERR("Unexpected MemoryInput mode"); + } } MemoryInputBase::~MemoryInputBase() { @@ -513,15 +528,26 @@ void MemoryInputBase::assignState(MemStatePtr newState) { } void MemoryInputBase::execute(dnnl::stream strm) { - getOutputNode().assignState(getAssignedState()); + assert(executeHook && "executeHook is not initialized!"); + (this->*executeHook)(); runStatic(strm); } void MemoryInputBase::executeDynamicImpl(dnnl::stream strm) { - getOutputNode().assignState(getAssignedState()); + assert(executeHook && "executeHook is not initialized!"); + (this->*executeHook)(); runDynamic(strm); } +void MemoryInputBase::assignState() { + getOutputNode().assignState(getAssignedState()); +} + +void MemoryInputBase::bypassAssignState() { + // nothing to do + return; +} + bool MemoryInput::needInitGraphProcessing() const { return !getParentEdges().empty() && getAssignedState()->is_reset_state(); } @@ -828,6 +854,89 @@ void MemoryInputSDPA::resolveInPlaceEdges(Edge::LOOK look) { } } +MemoryInputSingle::MemoryInputSingle(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc) + : MemoryInput(id, + name, + type, + output_shape, + output_prc, + context, + input_shape, + input_prc, + MemoryInputBase::mode::single_read_value) {} + +MemStatePtr MemoryInputSingle::makeState() const { + // assume ov::Tensor is always dense + auto original_desc = + std::make_shared(getOriginalOutputPrecisionAtPort(0), outputShapes.at(0)); + + auto mem_desc = getBaseMemDescAtOutputPort(0); + const auto& eng = getEngine(); + + auto state_name = getId(); + + // Remove suffix with pair ID. Internal information. 
+ auto suffix_idx = state_name.find("/id="); + if (suffix_idx != std::string::npos) { + state_name = state_name.substr(0, suffix_idx); + } + + return std::make_shared(state_name, + std::make_shared(eng, mem_desc), + original_desc); +} + +void MemoryInputSingle::runStatic(dnnl::stream strm) { + MemoryInput::runStatic(strm); + if (needInitGraphProcessing()) { + // since there is no corresponding MemoryOutput node, we need to update the state here + auto result = getDstMemoryAtPort(0); // only one output port + auto stateMem = getAssignedState()->output_mem(); + CPU_NODE_ASSERT(stateMem, " state memory has nullptr"); + if (result->getData() != stateMem->getData()) { + stateMem->load(*result); + } + } + getAssignedState()->commit(); // since we don't use MemoryOutput, commit must be called to change the reset state +} + +void MemoryInputSingle::runDynamic(dnnl::stream strm) { + MemoryInput::runDynamic(strm); + if (needInitGraphProcessing()) { + // since there is no corresponding MemoryOutput node, we need to update the state here + auto result = getDstMemoryAtPort(0); // only one output port + auto state = getAssignedState(); + auto stateMem = state->output_mem(); + CPU_NODE_ASSERT(stateMem, " state memory has nullptr"); + + const auto& newShape = result->getShape(); + const auto& stateShape = stateMem->getShape(); + + if (stateShape.isDynamic() || stateShape.getStaticDims() != newShape.getStaticDims()) { + auto extMemDesc = state->internal_desc(); + auto newExternDesc = extMemDesc->cloneWithNewDims(newShape.getStaticDims()); + stateMem->redefineDesc(newExternDesc); + } + + if (result->getData() != stateMem->getData()) { + stateMem->load(*result); + } + } + getAssignedState()->commit(); // since we don't use MemoryOutput, commit must be called to change the reset state +} + +bool MemoryInputSingle::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { + return MemoryInput::isSupportedOperation(op, errorMessage); +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/memory.hpp b/src/plugins/intel_cpu/src/nodes/memory.hpp index c158d738a36148..f503a8d58386a5 100644 --- a/src/plugins/intel_cpu/src/nodes/memory.hpp +++ b/src/plugins/intel_cpu/src/nodes/memory.hpp @@ -120,16 +120,14 @@ class MemoryOutputStub : public MemoryOutputBase { }; class MemoryInputBase : public Input, public MemoryStateNode { +public: + enum class mode { + read_value_assign, + single_read_value + }; + public: MemoryInputBase(const std::shared_ptr& op, const GraphContext::CPtr context); - MemoryInputBase(const std::string id, - const std::string& name, - const std::string& type, - const Shape& output_shape, - const ov::element::Type& output_prc, - const GraphContext::CPtr context, - const ov::optional& input_shape, - const ov::optional& input_prc); ~MemoryInputBase() override; @@ -152,6 +150,17 @@ class MemoryInputBase : public Input, public MemoryStateNode { MemoryOutputBase& getOutputNode(); void assignState(MemStatePtr newState) override final; // NOLINT +protected: + MemoryInputBase(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc, + mode mode = mode::read_value_assign); + protected: virtual void runStatic(dnnl::stream strm) = 0; virtual void runDynamic(dnnl::stream strm) = 0; @@ -160,12 +169,20 @@ class MemoryInputBase : public Input, 
public MemoryStateNode { return state; } +private: + using executeHookPtr = void (MemoryInputBase::*)(void); + +private: + void assignState(); + void bypassAssignState(); + private: /** * @brief keeps reference to output sibling node */ MemoryOutputBase* outputNode = nullptr; MemStatePtr state = nullptr; + executeHookPtr executeHook; }; class MemoryInput : public MemoryInputBase { @@ -179,16 +196,38 @@ class MemoryInput : public MemoryInputBase { MemStatePtr makeState() const override; -private: +protected: + bool needInitGraphProcessing() const; void runStatic(dnnl::stream strm) override; void runDynamic(dnnl::stream strm) override; + +private: void assignStateHook() override {/*pass*/} - bool needInitGraphProcessing() const; private: ProxyMemoryBlockPtr memBlock = nullptr; }; +class MemoryInputSingle : public MemoryInput { +public: + MemoryInputSingle(const std::string id, + const std::string& name, + const std::string& type, + const Shape& output_shape, + const ov::element::Type& output_prc, + const GraphContext::CPtr context, + const ov::optional& input_shape, + const ov::optional& input_prc); + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + MemStatePtr makeState() const override; + +private: + void runStatic(dnnl::stream strm) override; + void runDynamic(dnnl::stream strm) override; +}; + class MemoryInputSDPA : public MemoryInputBase { public: MemoryInputSDPA(const std::string id, diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp new file mode 100644 index 00000000000000..c6e976b321f703 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/read_value_assign.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/node_builders/constant.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" + +/* The main purpose of this test set is to test ReadValue->Assign direct connection optimizations, i.e. + dropping the MemoryOutput node. 
+*/ + +namespace ov { +namespace test { + +using namespace CPUTestUtils; + +// ┌────────┐ ┌────────┐ +// │ Param2 │ │ Param1 │ |---------------| +// └───┬────┘ └────┬───┘ | | +// │ |-----------|┌─────────┐ | +// │ | │ │Constant │ | +// │ | │ └───┬─────┘ | +// │ | ┌───┴────┐ │ | +// │ | │Multiply├─────┘ | +// │ | └───┬────┘ | <- Optional Init Subgraph +// │ | │ ┌─────────┐ | +// │ | │ │Constant │ | +// │ | │ └───┬─────┘ | +// │ | ┌───┴────┐ │ | +// │ | │ Add ├─────┘ | +// │ | └───┬────┘ | +// │ | │ | +// │ |---------------------------| +// │ │ +// │ │ +// │ │ +// │ ┌─────┴─────┐ +// │ │ ReadValue │ +// │ └─────┬─────┘ +// │ │ \ +// │ ┌──┴──┐ \ +// └────────┤ Add │ \┌────────┐ +// └──┬──┘ │ Assign │ +// │ └────────┘ +// │ +// ┌────┴────┐ +// │ Result1 │ +// └─────────┘ + +typedef std::tuple< + bool, // include init subgraph + CPUSpecificParams +> ReadValueAssignTestParams; + +class ReadValueAssignTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + bool use_init_subgraph = false; + CPUSpecificParams cpu_params; + std::tie(use_init_subgraph, cpu_params) = obj.param; + + std::ostringstream results; + results << "Init_Graph=" << (use_init_subgraph ? "True" : "False") << "_"; + results << CPUTestsBase::getTestCaseName(cpu_params); + return results.str(); + } + + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + + const ov::Shape tensor_shape = {3, 32, 7, 7}; + + InputShape param1_shape = {{-1, 32, -1, -1}, {tensor_shape}}; + InputShape param2_shape = {{-1, -1, -1, -1}, {tensor_shape}}; + + bool use_init_subgraph = false; + CPUSpecificParams cpu_params; + std::tie(use_init_subgraph, cpu_params) = this->GetParam(); + std::tie(inFmts, outFmts, priority, selectedType) = cpu_params; + selectedType = makeSelectedTypeStr(selectedType, net_prc); + + init_input_shapes({param1_shape, param2_shape}); + + ov::ParameterVector params; + params.push_back(std::make_shared(net_prc, inputDynamicShapes[0])); + params.push_back(std::make_shared(net_prc, inputDynamicShapes[1])); + std::shared_ptr last_node = params.front(); + + if (use_init_subgraph) { + //build init subgraph + auto const1 = utils::make_constant(net_prc, tensor_shape); + auto const2 = utils::make_constant(net_prc, tensor_shape); + auto multiply = utils::make_eltwise(last_node, const1, utils::EltwiseTypes::MULTIPLY); + auto add = utils::make_eltwise(multiply, const2, utils::EltwiseTypes::ADD); + last_node = add; + } + + const std::string variable_name("variable0"); + auto variable = std::make_shared( + ov::op::util::VariableInfo{inputDynamicShapes[0], net_prc, variable_name}); + + auto read = std::make_shared(last_node, variable); + auto assign = std::make_shared(read, variable); + auto add = utils::make_eltwise(params[1], read, utils::EltwiseTypes::ADD); + + add->get_rt_info() = getCPUInfo(); + auto res = std::make_shared(add); + + function = + std::make_shared(ov::ResultVector({res}), ov::SinkVector({assign}), params, "ReadValueAssign"); + } + +protected: + const ov::Shape tensor_shape = {3, 32, 7, 7}; + const ElementType net_prc = element::f32; +}; + +TEST_P(ReadValueAssignTest, CompareWithRefs) { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, ov::test::utils::DEVICE_TEMPLATE); + auto inferRequestRef = 
compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr int infer_count = 3lu; + + auto&& states = inferRequest.query_state(); + auto&& refStates = inferRequestRef.query_state(); + + for (int i = 0; i < infer_count; ++i) { + // set states + + if (i & 0x1) { + //reset every odd iteration + states.front().reset(); + refStates.front().reset(); + } else { + // generate and set state tensors every even iteration + using ov::test::utils::InputGenerateData; + + auto tensor = + ov::test::utils::create_and_fill_tensor(net_prc, tensor_shape, InputGenerateData{0, 10, 1, i}); + states.front().set_state(tensor); + refStates.front().set_state(tensor); + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto result = inferRequest.get_tensor(outputs[0]); + + auto result_ref = inferRequestRef.get_tensor(outputs[0]); + + ov::test::utils::compare(result, result_ref, 1e-4, 1e-4); + } + CheckNumberOfNodesWithTypes(compiledModel, {"MemoryOutput", "Assign"}, 0); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_ReadValue_Assign, + ReadValueAssignTest, + ::testing::Combine(::testing::Values(true, false), + ::testing::Values(CPUSpecificParams{{nchw, nchw}, {nchw}, {""}, "any_type"}, + CPUSpecificParams{{nhwc, nhwc}, {nhwc}, {""}, "any_type"})), + ReadValueAssignTest::getTestCaseName); +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/template/src/sync_infer_request.cpp b/src/plugins/template/src/sync_infer_request.cpp index 41881e9839adaf..418f8f1b717a99 100644 --- a/src/plugins/template/src/sync_infer_request.cpp +++ b/src/plugins/template/src/sync_infer_request.cpp @@ -59,7 +59,7 @@ void collect_variables(const std::shared_ptr& ov_model, ov::Tensor tensor = ov::Tensor(variable->get_info().data_type, shape); variable_context.set_variable_value(variable, std::make_shared(tensor)); auto state = - std::make_shared(variable->get_info().variable_id, + std::make_shared(variable->get_info(), variable_context.get_variable_value(variable)); list_of_variables.emplace_back(state); } diff --git a/src/plugins/template/src/variable_state.hpp b/src/plugins/template/src/variable_state.hpp index 8227a22c0fe93c..d6f0972f8675f3 100644 --- a/src/plugins/template/src/variable_state.hpp +++ b/src/plugins/template/src/variable_state.hpp @@ -4,6 +4,7 @@ #pragma once +#include "openvino/op/util/variable.hpp" #include "openvino/runtime/itensor.hpp" #include "openvino/runtime/ivariable_state.hpp" #include "openvino/runtime/so_ptr.hpp" @@ -13,16 +14,35 @@ namespace template_plugin { class VariableState : public ov::IVariableState { public: - VariableState(const std::string& name, const std::shared_ptr& variable_value) - : ov::IVariableState(name), + VariableState(const ov::op::util::VariableInfo& variable_info, + const std::shared_ptr& variable_value) + : ov::IVariableState(variable_info.variable_id), + m_data_shape(variable_info.data_shape), + m_data_type(variable_info.data_type), m_variable_value(variable_value) { m_state = get_tensor_impl(variable_value->get_state()); } void set_state(const ov::SoPtr& state) override { - OPENVINO_ASSERT(state->get_shape() == m_state->get_shape(), "Wrong tensor shape."); - OPENVINO_ASSERT(state->get_element_type() == m_state->get_element_type(), "Wrong tensor type."); - 
OPENVINO_ASSERT(state->get_byte_size() == m_state->get_byte_size(), "Blob size of tensors are not equal."); + OPENVINO_ASSERT(m_data_shape.compatible(state->get_shape()), + "Wrong tensor shape: ", + state->get_shape(), + " is not compatible with expected: ", + m_data_shape, + " in a variable with ID: ", + this->get_name()); + OPENVINO_ASSERT(m_data_type.compatible(state->get_element_type()), + "Wrong tensor type: ", + state->get_element_type(), + " expected: ", + m_data_type, + " in a variable with ID: ", + this->get_name()); + m_state->set_shape(state->get_shape()); + OPENVINO_ASSERT(state->get_byte_size() == m_state->get_byte_size(), + "Blob size of tensors are not equal. Variable with ID: ", + this->get_name()); std::memcpy(m_state->data(), state->data(), state->get_byte_size()); + m_variable_value->set_reset(false); } void reset() override { @@ -33,6 +53,8 @@ class VariableState : public ov::IVariableState { ~VariableState() override = default; private: + PartialShape m_data_shape; // original shape + element::Type m_data_type; // original type std::shared_ptr m_variable_value; }; } // namespace template_plugin diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp index 8f9687b7b93b2a..42f70aa92474a3 100644 --- a/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/lora_pattern.hpp @@ -19,5 +19,15 @@ TEST_P(LoraPatternConvolution, empty_tensors) { run_test_empty_tensors(); } +TEST_P(LoraPatternMatmul, random_tensors) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_random_tensors(); +} + +TEST_P(LoraPatternConvolution, random_tensors) { + targetStaticShapes = {{{1, num_channels, 64, 64}}}; + run_test_random_tensors(); +} + } // namespace test } // namespace ov \ No newline at end of file diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp index 16764d37dcf688..9b38ca059f1aba 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/lora_pattern.hpp @@ -15,6 +15,7 @@ class LoraPatternBase : public SubgraphBaseTest { protected: void run_test_empty_tensors(); + void run_test_random_tensors(); protected: static constexpr auto t4_name = "lora/MatMul.B"; @@ -37,7 +38,7 @@ class LoraPatternConvolution : public LoraPatternBase, public testing::WithParam void SetUp() override; protected: - static constexpr size_t num_channels = 320ul; + static constexpr size_t num_channels = 64ul; }; } // namespace test diff --git a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp index 6f74fd09b022a6..d40872f0756d6e 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/lora_pattern.cpp @@ -8,6 +8,7 @@ #include "common_test_utils/node_builders/convolution.hpp" #include "common_test_utils/ov_tensor_utils.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" +#include "template/properties.hpp" namespace ov { namespace test { @@ -37,6 +38,79 @@ void LoraPatternBase::run_test_empty_tensors() { ov::test::utils::compare(tx_result, tz_result, 
1e-4, 1e-4); } +void LoraPatternBase::run_test_random_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, + ov::test::utils::DEVICE_TEMPLATE, + {{ov::template_plugin::disable_transformations(true)}}); + auto inferRequestRef = compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr size_t lora_order = 25lu; + constexpr int infer_count = 6lu; + + std::unordered_map stateShapes; + + auto&& vars = function->get_variables(); + + for (auto&& var : vars) { + auto var_info = var->get_info(); + auto var_shape = var_info.data_shape; + + std::for_each(var_shape.begin(), var_shape.end(), [=](ov::PartialShape::value_type& x) { + if (x.is_dynamic()) { + x = lora_order; + } + }); + stateShapes.insert({var_info.variable_id, var_shape.to_shape()}); + } + + for (int i = 0; i < infer_count; ++i) { + // set states + + auto&& states = inferRequest.query_state(); + if (!(i & 0x1)) { // every even call + // generate and set state tensors + for (auto&& item : states) { + auto&& refStates = inferRequestRef.query_state(); + using ov::test::utils::InputGenerateData; + const auto& shape = stateShapes.at(item.get_name()); + auto tensor = ov::test::utils::create_and_fill_tensor(netType, shape, InputGenerateData{0, 10, 1, i}); + item.set_state(tensor); + auto itr = std::find_if(refStates.begin(), refStates.end(), [&](const ov::VariableState& state) { + return state.get_name() == item.get_name(); + }); + ASSERT_FALSE(itr == refStates.end()); + itr->set_state(tensor); + } + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + + auto tx_result_ref = inferRequestRef.get_tensor(outputs[0]); + auto tz_result_ref = inferRequestRef.get_tensor(outputs[1]); + + ov::test::utils::compare(tx_result, tx_result_ref, 1e-4, 1e-4); + ov::test::utils::compare(tz_result, tz_result_ref, 1e-4, 1e-4); + } +} + void LoraPatternMatmul::SetUp() { targetDevice = this->GetParam(); From af18322643b2df57345a8e312bcf8d70bb185dbf Mon Sep 17 00:00:00 2001 From: Sungeun Kim Date: Tue, 29 Oct 2024 20:50:34 +0900 Subject: [PATCH 030/120] [GPU] update onednn_3.7pc: 32ad05ab (#27264) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index e99a84e4914a81..32ad05ab263b78 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit e99a84e4914a818c64165a4b52785f606e405c2b +Subproject commit 32ad05ab263b782d4a4455ea85f5de009cf607c4 From 015de6d6de046a49a1c4f421eff0e3039d9a8a45 Mon Sep 17 00:00:00 2001 From: darksapien23151 <141660450+darksapien23151@users.noreply.github.com> Date: Tue, 29 Oct 2024 17:36:48 +0530 Subject: [PATCH 031/120] Update android_x64.yml (#27257) ### Details: - Enable test building by setting `ENABLE_TESTS=ON` ### Tickets: - 149906 --------- Co-authored-by: Ilya Lavrenov --- .github/workflows/android_x64.yml | 1 + 1 file changed, 1 insertion(+) diff --git
a/.github/workflows/android_x64.yml b/.github/workflows/android_x64.yml index a667a07da5bd3e..1cdb2023784979 100644 --- a/.github/workflows/android_x64.yml +++ b/.github/workflows/android_x64.yml @@ -135,6 +135,7 @@ jobs: -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \ -DENABLE_LTO=ON \ -DENABLE_PYTHON=OFF \ + -DENABLE_TESTS=ON \ -DOPENVINO_EXTRA_MODULES=${{ env.OPENVINO_GENAI_REPO }} \ -S ${OPENVINO_REPO} \ -B ${BUILD_DIR} From a7ccc5e0efcc55455e4f2988489a64d70e6be0f7 Mon Sep 17 00:00:00 2001 From: Attila Csok Date: Tue, 29 Oct 2024 14:07:58 +0200 Subject: [PATCH 032/120] [intel-npu] Bugfix for the total allocatable memory property (#27270) ### Details: - Bugfix in zero_device to return the correct maximum allocatable memory size in the NPU_DEVICE_TOTAL_MEM_SIZE property. - For old drivers we return a hardcoded 2GB value (compiler limitation) - For graph_ext 1.8 Windows drivers we just convert KB to B - For graph_ext >= 1.9 drivers we return the values from the driver as is ### Tickets: - *EISW-143246* --- .../intel_npu/src/backend/src/zero_device.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index ac60e4741947bd..439b5fbd59f4f9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -162,12 +162,22 @@ uint64_t ZeroDevice::getAllocMemSize() const { } uint64_t ZeroDevice::getTotalMemSize() const { +#define LEGACY_MAX_MEM_ALLOC_SIZE_BYTES (2147483648) // 2GB in base-2 + ze_graph_memory_query_t query{}; ze_result_t result = _graph_ddi_table_ext.pfnQueryContextMemory(_initStructs->getContext(), ZE_GRAPH_QUERY_MEMORY_DDR, &query); THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnQueryContextMemory", result, _graph_ddi_table_ext); - return query.total; + // For drivers with graph_extension < 1.9 we report a fixed 2GB max allocation size (old drivers don't support more) + // For drivers with graph_extension >= 1.9 we report the value the driver returns + if (_initStructs->isExtensionSupported(std::string(ZE_GRAPH_EXT_NAME), ZE_MAKE_VERSION(1, 9))) { + // we are safe here, we can return the value directly from the driver + return query.total; + } + + // Default for older drivers: return 2GB + return LEGACY_MAX_MEM_ALLOC_SIZE_BYTES; } ov::device::PCIInfo ZeroDevice::getPciInfo() const { From fc5f897442bdf8a43e08301e938e8cec56b0b1f2 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 13:18:51 +0100 Subject: [PATCH 033/120] [CPU] Disable parallel UpdateShapes/PrepareParams processing if there are many sync nodes (#27280) ### Details: If there are too many sync nodes in a model, it is not beneficial to run the UpdateShapes and PrepareParams stages in parallel, as the overheads of spawning and synchronizing the parallel tasks outweigh the performance gain of the parallel execution. As a quantitative characteristic that defines the boundary between the parallel and sequential node processing strategies, we use the ratio of the total number of nodes to the number of synchronous nodes.
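For reference, a simplified standalone sketch of the selection rule (distilled from the graph.cpp change below; the threshold of 10 is the value introduced by this patch):

```cpp
#include <cstddef>

enum class DynShapeStrategy { Parallel, Sequential };

// Parallel shape inference pays off only when sync points are rare relative to
// the total number of executable nodes and more than one thread is available.
DynShapeStrategy select_strategy(std::size_t exec_nodes, std::size_t sync_nodes, int max_threads) {
    const std::size_t exec2sync = exec_nodes / sync_nodes;  // sync_nodes >= 1 by construction
    return (exec2sync < 10 || max_threads < 2) ? DynShapeStrategy::Sequential
                                               : DynShapeStrategy::Parallel;
}
```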
### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/graph.cpp | 17 ++++++++++++++--- src/plugins/intel_cpu/src/graph.h | 18 ++++++++++++++---- src/plugins/intel_cpu/src/infer_request.cpp | 12 ++++++------ .../intel_cpu/src/nodes/tensoriterator.cpp | 2 +- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 45118763a3eaf9..f9bfa9334eae8f 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -371,9 +371,20 @@ void Graph::Activate(const std::vector& externalInputMemory, std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); - status = hasDynNodes ? (parallel_get_max_threads() > 1 ? Status::ReadyDynamic : Status::ReadyDynamicSeq) - : Status::ReadyStatic; - + if (hasDynNodes) { + status = Status::ReadyDynamic; + // Here we use the following heuristic: if the number of executable nodes is less than 10 times the number of + // sync nodes, it makes sense to use sequential dynamic shapes processing due to the high overheads on context + // switching when the dynamic shapes are being processed in parallel and there are a lot of sync points. Also, + // this rule works for short graphs (usually subgraphs) when the number of nodes is too low to process them in + // parallel. + const auto exec2sync = m_executableGraphNodes.size() / m_executableSyncNodesInds.size(); + if (exec2sync < 10 || parallel_get_max_threads() < 2) { + status = Status::ReadyDynamicSeq; + } + } else { + status = Status::ReadyStatic; + } CPU_DEBUG_CAP_ENABLE(serialize(*this)); } diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index b3634800fb2e05..d50ccc152c9186 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -49,8 +49,16 @@ class Graph { ~Graph(); - bool IsReady() { - return one_of(status, Status::ReadyStatic, Status::ReadyDynamic, Status::ReadyDynamicSeq); + bool IsStatic() const { + return Status::ReadyStatic == status; + } + + bool IsDynamic() const { + return one_of(status, Status::ReadyDynamic, Status::ReadyDynamicSeq); + } + + bool IsReady() const { + return IsStatic() || IsDynamic(); } const Config & getConfig() const { @@ -193,7 +201,6 @@ class Graph { return graphHasDynamicInput; } - Status getStatus() const {return status;} const std::unordered_map& getInternalStateNodes() const; /** @@ -210,6 +217,10 @@ class Graph { void Activate(const std::vector& externalInputMemory = {}, const std::vector& externalOutputMemory = {}); + const std::unordered_map& getOutputNodesMemBlocksMap() const { + return outputNodesMemBlocksMap; + } + protected: void ForgetGraphData() { status = Status::NotReady; @@ -273,7 +284,6 @@ class Graph { template void InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update); - friend class intel_cpu::SyncInferRequest; friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); private: diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index f255a46efe7d0a..f0b817dcda859c 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -140,7 +140,7 @@ void SyncInferRequest::infer() { throw_if_canceled(); // update output control blocks, if any, in order to refresh internal buffers - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { + if (m_graph->IsDynamic()) { for (auto&& item :
m_outputControlBlocks) { item.second.update(); } @@ -178,7 +178,7 @@ void SyncInferRequest::change_default_ptr() { std::unordered_set inputPtrs; std::function& tensor)> changeInpPtr; - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { + if (m_graph->IsDynamic()) { changeInpPtr = [&inputPtrs](const EdgePtr &edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); inputPtrs.insert(tensor->data()); @@ -278,8 +278,8 @@ void SyncInferRequest::change_default_ptr() { change_edge_ptr(parentEdge, it.second); } - if (Graph::Status::ReadyDynamic == m_graph->getStatus()) { - const auto &outMemBlocksMap = m_graph->outputNodesMemBlocksMap; + if (m_graph->IsDynamic()) { + const auto &outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); for (auto&& item : outMemBlocksMap) { const auto& name = item.first; @@ -476,7 +476,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn ov::SoPtr tensor; if (type == ov::ISyncInferRequest::FoundPort::Type::INPUT) { - OPENVINO_ASSERT(m_graph->inputNodesMap.find(port_index) != m_graph->inputNodesMap.end(), + OPENVINO_ASSERT(m_graph->GetInputNodesMap().find(port_index) != m_graph->GetInputNodesMap().end(), "Tensor with index: ", port_index, " exists in CPU plugin graph, but absents in model inputs"); @@ -509,7 +509,7 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } if (type == ov::ISyncInferRequest::FoundPort::Type::OUTPUT) { - const auto& outMap = m_graph->outputNodesMap; + const auto& outMap = m_graph->GetOutputNodesMap(); auto output = outMap.find(port_index); OPENVINO_ASSERT(output != outMap.end(), "Tensor with index: ", diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index 9a3b9788b838d2..dcf2b0f8ffd5ee 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -938,7 +938,7 @@ int TensorIterator::getNumIteration(const std::vector& inputPortMap, co } bool TensorIterator::runAsDynamic() const { - return isDynamicNode() || Graph::Status::ReadyDynamic == sub_graph.getStatus(); + return isDynamicNode() || sub_graph.IsDynamic(); } bool TensorIterator::created() const { From cde0429991ef8746a7c73c36dd6afbd1bf9b2951 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 29 Oct 2024 20:21:02 +0800 Subject: [PATCH 034/120] [CPU] enable channel first format support for rank=3 Deconv to use amx fp16 kernel (#27085) ### Details: - *enable channel first format support for rank=3 Deconv to use amx fp16 kernel to benefit model performance on GNR, e.g. 
hifigan* - *fix acl_convert UNKNOWN DataLayout accuracy issues* ### Tickets: - *153089* --- src/plugins/intel_cpu/src/nodes/deconv.cpp | 6 ++++-- .../intel_cpu/src/nodes/executors/acl/acl_convert.cpp | 11 +++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 8a7f95268b4f3a..cb340afc029304 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -426,8 +426,10 @@ std::vector Deconvolution::getAvailableFormatsForDims(const else if (dims.getRank() == 2) return {memory::format_tag::nc}; else if (dims.getRank() == 3) - return {memory::format_tag::tnc, memory::format_tag::ntc, - memory::format_tag::ncw, memory::format_tag::nCw8c, memory::format_tag::nCw16c }; + return {memory::format_tag::ncw, + memory::format_tag::nCw8c, + memory::format_tag::nCw16c, + memory::format_tag::nwc}; else if (dims.getRank() == 4) return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp index 440af52749bc9c..1bc0585930387f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp @@ -27,12 +27,11 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams, if (!isCopyOp && dstPrecision == DataType::S8) { dstPrecision = DataType::QASYMM8_SIGNED; } - auto srcDims = srcDesc->getShape().getStaticDims(); - auto dstDims = dstDesc->getShape().getStaticDims(); - auto srcDataLayout = getAclDataLayoutByMemoryDesc(srcDesc); - auto dstDataLayout = getAclDataLayoutByMemoryDesc(dstDesc); - auto srcTensorInfo = TensorInfo(shapeCast(collapse_dims_to_max_rank(srcDims)), 1, srcPrecision, srcDataLayout); - auto dstTensorInfo = TensorInfo(shapeCast(collapse_dims_to_max_rank(dstDims)), 1, dstPrecision, dstDataLayout); + // Use 1D TensorInfo, since UNKNOWN DataLayout may have accuracy issues + auto srcDims1D = convertParams.size; + auto dstDims1D = convertParams.size; + auto srcTensorInfo = TensorInfo(TensorShape(srcDims1D), 1, srcPrecision); + auto dstTensorInfo = TensorInfo(TensorShape(dstDims1D), 1, dstPrecision); if (isCopyOp) { Status s = NECopy::validate(&srcTensorInfo, &dstTensorInfo); if (!s) { From 583925c5de910fe4e0e2729b215e513424b84f06 Mon Sep 17 00:00:00 2001 From: M Date: Tue, 29 Oct 2024 05:55:36 -0700 Subject: [PATCH 035/120] [CPU][ARM] Fix ARM tests failing because of overflow (#27074) ### Details: - Fixes ARM test overflow for Multiple Query SDP. 
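For context on the overflow (an editorial note, not part of the patch): f16 has a maximum finite value of 65504, so test inputs whose magnitude grows with a strided fill can leave the representable range, either directly or in downstream f16 accumulation. A minimal sketch of such a fill helper (a hypothetical reimplementation; the name and semantics are assumptions about the test utility, not the actual code):

```cpp
#include <cstddef>

// Fills n elements starting at `start`, adding `stride` per element.
void strided_iota(float* first, std::size_t n, float start, float stride) {
    for (std::size_t i = 0; i < n; ++i)
        first[i] = start + stride * static_cast<float>(i);
}

// With stride 0.1f the values reach the f16 maximum of 65504 after roughly
// 655k elements and saturate to +inf once converted to f16, which breaks the
// reference comparison; a stride of 0.0f keeps every value in range, which is
// what the diff below does for the f16 input path.
```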
--- .../src/nodes/kernels/scaled_attn/mha_single_token.cpp | 2 +- .../subgraph_tests/src/common/concat_multiple_query_sdp.cpp | 2 +- .../functional/shared_tests_instances/skip_tests_config.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 6b6df3c3181ee0..25ddbb1b4246b1 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -1148,7 +1148,7 @@ void mha_single_token(const ov::intel_cpu::PlainTensor& query, past_v_scale_zp, head_sum); } else { - OPENVINO_THROW("Unsupported precision: ", query.get_precision()); + OPENVINO_THROW("Unsupported precision: ", present_key.get_precision()); } #else if (present_key.get_precision() == ov::element::u8) { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp index d05e7840562191..d74ab99fb3d5ab 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp @@ -238,7 +238,7 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterfaceget_element_type() == element::f16) { ov::Tensor t{ov::element::f16, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + strided_iota(static_cast(t.data()), t.get_size(), val, 0.0f); inputs.insert({param, t}); } else { ov::Tensor t{ov::element::bf16, shape}; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index e7c006ab97427f..6edc4f062536d0 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -358,8 +358,7 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(smoke_VariableState/OVInferRequestVariableStateTest.*)"); // Issue: 141705 retVector.emplace_back(R"(.*smoke_arm_Deconv_2D_Planar_FP16/DeconvolutionLayerCPUTest.*INFERENCE_PRECISION_HINT=f16.*)"); - // Issue: 154882 - retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*u8.*)"); #endif #if defined(OPENVINO_ARCH_ARM) @@ -539,6 +538,7 @@ std::vector disabledTestPatterns() { // Skip fp16 tests for paltforms that don't support fp16 precision retVector.emplace_back(R"(.*INFERENCE_PRECISION_HINT=(F|f)16.*)"); retVector.emplace_back(R"(.*Prc=f16.*)"); + retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*HasShapeOf=1.*)"); } else { // Issue 117407 retVector.emplace_back( From 08c6672eda563aa737672487e605a3e55ff60143 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Tue, 29 Oct 2024 14:02:38 +0100 Subject: [PATCH 036/120] [TESTS] Disable lerp test for torch.export on older versions (#27302) ### Details: - *Disable `lerp_` test for `torch.export` on older versions of `torch`* ### Tickets: - *CVS-156278* Signed-off-by: Maxim Vafin --- tests/layer_tests/pytorch_tests/test_lerp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/layer_tests/pytorch_tests/test_lerp.py 
b/tests/layer_tests/pytorch_tests/test_lerp.py index 0f85fac8569c95..d689efb3c77252 100644 --- a/tests/layer_tests/pytorch_tests/test_lerp.py +++ b/tests/layer_tests/pytorch_tests/test_lerp.py @@ -4,6 +4,7 @@ import numpy as np import pytest import torch +from packaging import version from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -44,6 +45,9 @@ def forward2(self, lhs, rhs): @pytest.mark.precommit_fx_backend def test_lerp(self, ie_device, precision, ir_version, weight, input_shape_rhs, op_type): + if (op_type == "lerp_" and PytorchLayerTest.use_torch_export() and + version.parse(torch.__version__) < version.parse("2.5")): + pytest.skip("Not supported in PyTorch versions earlier than 2.5.") self.input_rhs = np.random.randn(*input_shape_rhs).astype(np.float32) if isinstance(weight, list): weight = torch.rand(weight) From 60e348b85c15f3e3a260555377b13aa6e8d5085c Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Tue, 29 Oct 2024 14:25:07 +0100 Subject: [PATCH 037/120] [CPU] Introduce LoRA macro operation (#27110) ### Details: To minimize the overheads on the LoRA subgraph operation processing (shape update, memory allocation, etc.) it does make sense to merge such subgraphs into a specific LoRA macro operation, which may exploit some LoRA properties to optimize performance. ### Tickets: - CVS-153035 - CVS-155112 --- src/plugins/intel_cpu/src/cpu_types.cpp | 4 +- src/plugins/intel_cpu/src/cpu_types.h | 1 + src/plugins/intel_cpu/src/nodes/composite.cpp | 23 +- src/plugins/intel_cpu/src/nodes/input.cpp | 1 + src/plugins/intel_cpu/src/nodes/input.h | 12 +- src/plugins/intel_cpu/src/nodes/lora.cpp | 110 ++++++++ src/plugins/intel_cpu/src/nodes/lora.h | 41 +++ src/plugins/intel_cpu/src/nodes/reference.cpp | 2 +- src/plugins/intel_cpu/src/nodes_factory.cpp | 2 + .../transformation_pipeline.cpp | 2 + .../src/common/lora_pattern.cpp | 266 ++++++++++++++++++ 11 files changed, 453 insertions(+), 11 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/lora.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/lora.h create mode 100644 src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 953f94cb3d5776..e20369c9cca215 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -256,7 +256,8 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"LLMMLP", Type::LLMMLP}, {"QKVProjection", Type::QKVProjection}, {"RMS", Type::RMS}, - {"SearchSorted", Type::SearchSorted} + {"SearchSorted", Type::SearchSorted}, + {"LoraSubgraph", Type::LoRA} }; return type_to_name_tbl; } @@ -389,6 +390,7 @@ std::string NameFromType(const Type type) { CASE(QKVProjection); CASE(RMS); CASE(SearchSorted); + CASE(LoRA); CASE(Unknown); } #undef CASE diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index c0a2acc3329a9c..d6ac9947a8fb5d 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -134,6 +134,7 @@ enum class Type { QKVProjection, RMS, SearchSorted, + LoRA }; enum class Algorithm { diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index b38a56649bd60a..a1ceabd6942db1 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -15,11 +15,23 @@ namespace intel_cpu { namespace node { bool 
Composite::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - return ov::is_type(op); + try { + if (!ov::is_type(op)) { + errorMessage = "Unknown SubGraph operation : " + std::string(op->get_type_info().name) + " with name '" + + op->get_friendly_name() + "'"; + } + } catch (...) { + return false; + } + return true; } Composite::Composite(const std::shared_ptr& op, const GraphContext::CPtr& context) : Node(op, context, InternalDynShapeInferFactory()) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + } const auto& subModel = ov::as_type_ptr(op); OPENVINO_ASSERT(subModel, "Attempt to create SubGraph node from an invalid op type: ", op); @@ -27,7 +39,7 @@ Composite::Composite(const std::shared_ptr& op, const GraphContext::CP } void Composite::selectOptimalPrimitiveDescriptor() { - // for the input configution, just always use the parent configuration + // for the input configuration, just always use the parent configuration std::vector inConfs; std::vector graphInputConfig; @@ -38,14 +50,14 @@ void Composite::selectOptimalPrimitiveDescriptor() { } std::vector graphOutputConfig; - for (size_t i = 0; i < getParentEdges().size(); i++) { + for (size_t i = 0; i < outputShapes.size(); i++) { graphOutputConfig.emplace_back(node::Input::OutputConfig{true, true}); } // configure the inner graph to get the information about output memory descriptors m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); - // for the output decriptors, use the configuration of the graph's output nodes + // for the output descriptors, use the configuration of the graph's output nodes auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); std::vector outConfs; @@ -89,9 +101,6 @@ void Composite::execute(dnnl::stream) { void Composite::executeDynamicImpl(dnnl::stream strm) { execute(strm); - if (!inputShapesModified()) - return; - // since the shape inference is not performed for the composite node // a memory of the extra child edges, attached to the output ports // has to be updated after an inference of the inner graph finished diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 4ee5707e0a9e76..1f650bd8c5de17 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -430,6 +430,7 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config) : Input(op, context) { + extMemDesc = config.desc; m_useParentMemoryDescForOutput = config.useParentMemoryDescForOutput; m_isInPlace = config.inPlace; } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index a954ce56665d61..4d7febb17ad4b7 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -19,9 +19,17 @@ class Input : public Node { }; struct OutputConfig { + OutputConfig() = default; + OutputConfig(bool useParentMemoryDesc_, bool inPlace_) + : useParentMemoryDescForOutput(useParentMemoryDesc_), + inPlace(inPlace_) {} + + OutputConfig(MemoryDescPtr desc_, bool inPlace_) : desc(std::move(desc_)), inPlace(inPlace_) {} + // @todo better to use memory desc with any layout and undefined precision - bool useParentMemoryDescForOutput; - bool inPlace; + MemoryDescPtr desc = nullptr; + bool useParentMemoryDescForOutput = false; + bool inPlace = false; }; Input(const std::shared_ptr& op, const GraphContext::CPtr context); diff 
--git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp new file mode 100644 index 00000000000000..2c69bc347b6139 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lora.h" + +#include "nodes/input.h" +#include "cpu_memory.h" +#include "ov_ops/lora_subgraph.hpp" +#include "utils/debug_capabilities.h" +#include "shape_inference/shape_inference_pass_through.hpp" + +namespace ov { +namespace intel_cpu { +namespace node { + +bool LoRA::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + if (!ov::is_type(op)) { + errorMessage = "Unknown LoRA operation : " + std::string(op->get_type_info().name) + " with name '" + + op->get_friendly_name() + "'"; + } + } catch (...) { + return false; + } + return true; +} + +LoRA::LoRA(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, PassThroughShapeInferFactory()) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + } + const auto& loraModel = ov::as_type_ptr(op); + OPENVINO_ASSERT(loraModel, + "Attempt to create LoRA node from an invalid op type: ", + op, + " with name ", + op->get_friendly_name()); + + m_body = loraModel->get_function(); +} + +void LoRA::selectOptimalPrimitiveDescriptor() { + // for the input configuration, just always use the parent configuration + std::vector inConfs; + std::vector graphInputConfig; + + for (size_t i = 0; i < getParentEdges().size(); i++) { + auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); + inConfs.emplace_back(desc); + graphInputConfig.emplace_back(node::Input::InputConfig{desc, true}); + } + + std::vector graphOutputConfig; + // enforce the same memory descriptor on the output as on the input to allow inPlace memory + graphOutputConfig.emplace_back(node::Input::OutputConfig{inConfs.front().getMemDesc(), true}); + + // configure the inner graph to get the information about output memory descriptors + m_graph.Init(m_body, context, graphInputConfig, graphOutputConfig); + + // for the output descriptors, use the configuration of the graph's output nodes + auto outputDescriptors = m_graph.getOutputMemoryDescriptors(); + + const auto& desc = outputDescriptors.front(); + + // just a sanity check + CPU_NODE_ASSERT(desc->isCompatible(*(inConfs.front().getMemDesc())), "Unexpected input/output descriptor mismatch"); + + std::vector outConfs; + + outConfs.emplace_back(desc, BlockedMemoryDesc::FULL_MASK, 0); // use the memory from the first input inPlace + + const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); +} + +// @todo add ascii diagram for memory mapping / reuse +void LoRA::createPrimitive() { + CPU_NODE_ASSERT(getOriginalInputsNumber() == m_graph.GetInputNodesMap().size(), + "Number of node inputs must be equal the number of inner graph's inputs"); + + std::vector inputMemory; + for (size_t i = 0; i < getOriginalInputsNumber(); i++) { + inputMemory.emplace_back(getSrcMemoryAtPort(i)); + } + + CPU_NODE_ASSERT(getOriginalOutputsNumber() == m_graph.GetOutputNodesMap().size(), + "Number of node outputs must be equal the number of inner graph's outputs"); + + std::vector outputMemory{getDstMemoryAtPort(0)}; + m_graph.Activate(inputMemory, outputMemory); +} + 
+void LoRA::execute(dnnl::stream) { + m_graph.Infer(); +} + +void LoRA::executeDynamicImpl(dnnl::stream strm) { + execute(strm); +} + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/lora.h b/src/plugins/intel_cpu/src/nodes/lora.h new file mode 100644 index 00000000000000..89a1bc15c2bf17 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/lora.h @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "graph.h" +#include "node.h" + +namespace ov { +namespace intel_cpu { +namespace node { + +class LoRA : public Node { +public: + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + LoRA(const std::shared_ptr& op, const GraphContext::CPtr& context); + + bool created() const override { + return getType() == Type::LoRA; + } + + bool needPrepareParams() const override { + return false; + } + + void getSupportedDescriptors() override{}; + void selectOptimalPrimitiveDescriptor() override; + void createPrimitive() override; + void execute(dnnl::stream) override; + void executeDynamicImpl(dnnl::stream strm) override; + +private: + std::shared_ptr m_body; + Graph m_graph; +}; + +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 43b8f041184a70..185815acd8c294 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -14,7 +14,7 @@ Reference::Reference(const std::shared_ptr& op, const GraphContext::CP Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)), ovCoreNode(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { OPENVINO_THROW_NOT_IMPLEMENTED( - "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented"); + "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented)"); } setType(Type::Reference); diff --git a/src/plugins/intel_cpu/src/nodes_factory.cpp b/src/plugins/intel_cpu/src/nodes_factory.cpp index 16cf1b974d8561..4a8e8205510fcf 100644 --- a/src/plugins/intel_cpu/src/nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/nodes_factory.cpp @@ -108,6 +108,7 @@ #include "nodes/transpose.h" #include "nodes/unique.hpp" #include "nodes/causal_mask_preprocess.h" +#include "nodes/lora.h" namespace ov { namespace intel_cpu { @@ -221,6 +222,7 @@ Node::NodesFactory::NodesFactory() : Factory("NodesFactory") { INTEL_CPU_NODE(Composite, Type::SubModel); INTEL_CPU_NODE(ScaledDotProductAttention, Type::ScaledDotProductAttention); INTEL_CPU_NODE(SearchSorted, Type::SearchSorted); + INTEL_CPU_NODE(LoRA, Type::LoRA); #if defined(OPENVINO_ARCH_X86_64) INTEL_CPU_NODE(FakeQuantize, Type::FakeQuantize); INTEL_CPU_NODE(GridSample, Type::GridSample); diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index fcf38440b8aa4b..9dd1da2d471e5a 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -37,6 +37,7 @@ #include "transformations/common_optimizations/move_eltwise_up_data_movement.hpp" #include "transformations/common_optimizations/mark_rope_input_to_keep_in_mixed_precision.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" +#include 
"transformations/common_optimizations/lora_subgraph_fusion.hpp" #include "transformations/control_flow/unroll_tensor_iterator.hpp" #include "transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp" #include "transformations/fp16_compression/mark_floatpoint_range.hpp" @@ -693,6 +694,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_REGISTER_PASS_COMMON(manager, ov::pass::EnableDecompressionConvertConstantFolding); CPU_REGISTER_PASS_COMMON(manager, ov::pass::KeepConstAndDecompression); CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConstantFolding); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::LoraSubgraphFusion); manager.run_passes(model); } diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp new file mode 100644 index 00000000000000..4f4b05ef56750c --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp @@ -0,0 +1,266 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/node_builders/convolution.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "utils/cpu_test_utils.hpp" + +namespace ov { +namespace test { + +namespace { +constexpr auto t4_name = "lora/MatMul.B"; +constexpr auto t5_name = "lora/MatMul.alpha"; +constexpr auto t6_name = "lora/MatMul.A"; +constexpr auto netType = ov::element::f32; +} // namespace + +class LoraPatternBaseCPUTest : public SubgraphBaseTest { +protected: + void run_test_empty_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + + inferRequest.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + ov::test::utils::compare(tx_result, tz_result, 1e-4, 1e-4); + } + + void run_test_random_tensors() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + + // use the Template plugin as a reference + + auto compiledReferenceModel = core->compile_model(function, ov::test::utils::DEVICE_TEMPLATE); + auto inferRequestRef = compiledReferenceModel.create_infer_request(); + ASSERT_TRUE(inferRequestRef); + + generate_inputs(targetStaticShapes.front()); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + inferRequestRef.set_tensor(input.first, input.second); + } + + constexpr size_t lora_order = 25lu; + constexpr int infer_count = 6lu; + + std::unordered_map stateShapes; + std::unordered_map initStateShapes; + + auto&& states = inferRequest.query_state(); + for (auto&& state : states) { + auto shape = state.get_state().get_shape(); + initStateShapes.insert({state.get_name(), shape}); + std::for_each(shape.begin(), shape.end(), [=](ov::Shape::value_type& x) { + if (0 == x) { + x = lora_order; + } + }); + stateShapes.insert({state.get_name(), std::move(shape)}); + } + + for (int i = 0; i < infer_count; ++i) { + // set states + + if (i == 3) { + // reset states on the 3rd iteration + for (auto&& item : states) { + item.reset(); + } + + for (auto&& item : 
inferRequestRef.query_state()) { + // Template plugin doesn't support reset state for dynamic shape states + item.get_state().set_shape(initStateShapes.at(item.get_name())); + } + } else if (!(i & 0x1)) { // every even call + // generate and set state tensors + for (auto&& item : states) { + auto&& refStates = inferRequestRef.query_state(); + using ov::test::utils::InputGenerateData; + const auto& shape = stateShapes.at(item.get_name()); + auto tensor = + ov::test::utils::create_and_fill_tensor(netType, shape, InputGenerateData{0, 10, 1, i}); + item.set_state(tensor); + auto itr = std::find_if(refStates.begin(), refStates.end(), [&](const ov::VariableState& state) { + return state.get_name() == item.get_name(); + }); + ASSERT_FALSE(itr == refStates.end()); + itr->set_state(tensor); + } + } + + inferRequest.infer(); + inferRequestRef.infer(); + auto outputs = function->outputs(); + + auto tx_result = inferRequest.get_tensor(outputs[0]); + auto tz_result = inferRequest.get_tensor(outputs[1]); + + auto tx_result_ref = inferRequestRef.get_tensor(outputs[0]); + auto tz_result_ref = inferRequestRef.get_tensor(outputs[1]); + + ov::test::utils::compare(tx_result, tx_result_ref, 1e-4, 1e-4); + ov::test::utils::compare(tz_result, tz_result_ref, 1e-4, 1e-4); + } + } +}; + +class LoraPatternMatmulCPUTest : public LoraPatternBaseCPUTest { +public: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + + ov::PartialShape shape_x = {-1, -1, K}; + ov::PartialShape shape_w = {N, K}; + + auto param_y = std::make_shared(netType, shape_x); + auto param_w = std::make_shared(netType, shape_w); + + // "Main" matrix multiplication from the original transformer model + auto tx = std::make_shared(param_y, param_w, false, true); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({N, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, K}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // Apply LoRA parameters to the current activations + auto t5810 = std::make_shared(param_y, t6, false, true); + auto t5811 = std::make_shared(t5810, t5); + auto t5812 = std::make_shared(t5811, t4, false, true); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t5812); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y, param_w})); + } + +protected: + static constexpr size_t K = 563ul; // Weights matrix K dimension + static constexpr size_t N = 2048ul; // Weights matrix N dimension +}; + +class LoraPatternConvolutionCPUTest : public LoraPatternBaseCPUTest { +public: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_CPU; + ov::PartialShape shape_x = {-1, num_channels, -1, -1}; + + auto param_y = std::make_shared(netType, shape_x); + + // Original Convolution that is modified by LoRA adapter later + auto tx = 
ov::test::utils::make_convolution(param_y, + netType, + {1, 1}, + {1, 1}, + {0, 0}, + {0, 0}, + {1, 1}, + ov::op::PadType::EXPLICIT, + num_channels); + + // LoRA parameters from states + auto variable_t4 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({num_channels, -1}), netType, t4_name}); + auto t4 = std::make_shared(variable_t4); + auto t4_assign = std::make_shared(t4, variable_t4); + + auto variable_t5 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({1, -1}), netType, t5_name}); + auto t5 = std::make_shared(variable_t5); + auto t5_assign = std::make_shared(t5, variable_t5); + + auto variable_t6 = std::make_shared( + ov::op::util::VariableInfo{ov::PartialShape({-1, num_channels}), netType, t6_name}); + auto t6 = std::make_shared(variable_t6); + auto t6_assign = std::make_shared(t6, variable_t6); + + // LoRA pattern with additional Transposes to move channel dimensions into positions where MatMul can be applied + auto t4940 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + + auto t4941 = std::make_shared(param_y, t4940); + auto t4942 = std::make_shared(t4941, t6, false, true); + auto t4943 = std::make_shared(t4942, t5); + auto t4944 = std::make_shared(t4943, t4, false, true); + + auto t4945 = + std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{2, 3, 0, 1}); + auto t4946 = std::make_shared(t4944, t4945); + + // Mix LoRA part into normally computed activations after the "main" MatMul + auto tz = std::make_shared(tx, t4946); + + auto result_x = std::make_shared(tx); + auto result_z = std::make_shared(tz); + + function = std::make_shared(ov::ResultVector({result_x, result_z}), + ov::SinkVector({t4_assign, t5_assign, t6_assign}), + ov::ParameterVector({param_y})); + } + +protected: + static constexpr size_t num_channels = 64ul; +}; + +TEST_F(LoraPatternMatmulCPUTest, smoke_LoRA_CPU_MatMul_empty) { + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_empty_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 1); +} + +TEST_F(LoraPatternConvolutionCPUTest, smoke_LoRA_CPU_Conv_empty) { + targetStaticShapes = {{{1, num_channels, 10, 15}}}; + run_test_empty_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 0); +} + +TEST_F(LoraPatternMatmulCPUTest, smoke_LoRA_CPU_MatMul_random) { + GTEST_SKIP(); + targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; + run_test_random_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 1); +} + +TEST_F(LoraPatternConvolutionCPUTest, smoke_LoRA_CPU_Conv_random) { + GTEST_SKIP(); + targetStaticShapes = {{{1, num_channels, 10, 15}}}; + run_test_random_tensors(); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); + CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "MatMul", 0); +} + +} // namespace test +} // namespace ov \ No newline at end of file From 3ced1c18a365d3f7ed6232b457d66076c8536dda Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 29 Oct 2024 18:15:57 +0400 Subject: [PATCH 038/120] [TF FE] Update tensorflow-text version and fix jax version for MacOS x86 (#27295) **Details:** Update tensorflow-text version and fix jax version for MacOS x86 **Ticket:** 156277 --------- Signed-off-by: Kazantsev, Roman --- tests/requirements_tensorflow | 9 +++++---- 1 file 
changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 3ae47d81ee2c50..954bba7944245f 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -9,13 +9,14 @@ pytest-html==4.1.1 transformers==4.45.1 # install exact keras version since tensorflow depends and has no upper bound for it keras==3.6.0 -tensorflow==2.18.0; python_version >= "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") -tensorflow==2.17.0; python_version < "3.12" and (platform_system != "Darwin" or platform_machine != "x86_64") +tensorflow==2.18.0; platform_system != "Darwin" or platform_machine != "x86_64" tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" # install explicit version of wrapt to avoid "this __dict__ descriptor does not support '_DictWrapper' objects" error from TensorFlow 2.18 wrapt==1.15.0; python_version >= "3.12" # tensorflow-text is not available for both Windows and ARM platforms -tensorflow-text==2.17.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" +tensorflow-text==2.18.0; python_version < "3.12" and platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 -jax==0.4.35 +jax==0.4.35; platform_system != "Darwin" or platform_machine != "x86_64" +# tensorflow 2.16.2 depends on ml-dtypes~=0.3.1 and jax 0.4.35 depends on ml-dtypes>=0.4.0 +jax==0.4.33; platform_system == "Darwin" and platform_machine == "x86_64" defusedxml==0.7.1 From 669537a21ed0809d75725d2e770c8218b9b6d308 Mon Sep 17 00:00:00 2001 From: Surya Siddharth Pemmaraju Date: Tue, 29 Oct 2024 09:21:50 -0700 Subject: [PATCH 039/120] Disabled regional compilation (#27289) ### Details: - Torch 2.5.0 enabled regional compilation by default which degrades the performance of openvino backend ### Tickets: - (https://jira.devtools.intel.com/browse/CVS-156251) --------- Co-authored-by: Maxim Vafin --- .../frontend/pytorch/torchdynamo/backend.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py index 8294927a079c7e..9f2ef019769875 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/backend.py @@ -49,6 +49,9 @@ openvino_options = {} +# Disable regional compilation which was enabled by default from Torch 2.5.0 +if hasattr(torch._dynamo.config, "inline_inbuilt_nn_modules"): + torch._dynamo.config.inline_inbuilt_nn_modules=False @fake_tensor_unsupported def openvino(subgraph, example_inputs, options=None): @@ -59,15 +62,8 @@ def openvino(subgraph, example_inputs, options=None): return aot_autograd(fw_compiler=fx_openvino, bw_compiler=fx_openvino, decompositions=get_decompositions(decompositions))(subgraph, example_inputs) return fx_openvino(subgraph, example_inputs, options) - -try: - from packaging import version - - if version.parse(torch.__version__) < version.parse("2.5.0"): - register_backend(compiler_fn=openvino, name="openvino") -except ImportError: - logger.warning("The 'packaging' module is required but not installed") - +if "openvino" not in torch.compiler.list_backends(): + register_backend(compiler_fn=openvino, name="openvino") def fx_openvino(subgraph, example_inputs, options=None): try: From 9235543beb6f214cbb2857e99ddccb1eb2970451 Mon Sep 17 00:00:00 2001 From: 
Piotr Kowalczyk Date: Tue, 29 Oct 2024 18:40:00 +0100 Subject: [PATCH 040/120] [def/transformations]: Fix for failing roblox model at ConvertPrecision transformation (#27298) ### Details: - Fix for failing roblox model on ConvertPrecision transformation. ### Tickets: - CVS-156058 --------- Co-authored-by: Andrii Staikov Co-authored-by: Michal Lukaszewski --- .../src/transformations/convert_precision.cpp | 17 +++++++++++++- .../tests/utils/convert_precision.cpp | 23 +++++++++++++++++++ .../include/openvino/op/search_sorted.hpp | 19 ++++++++++++++- src/core/src/op/search_sorted.cpp | 17 ++++++++++---- src/core/tests/visitors/op/sorted_search.cpp | 2 +- 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index c34e91f835301a..3ab2c694be40ef 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -62,6 +62,8 @@ bool fuse_type_to_ctc_greedy_decoder_seq_len(const std::shared_ptr& no bool fuse_type_to_random_uniform_v8(const std::shared_ptr& node, const precisions_map& precisions); +bool fuse_type_to_search_sorted_v15(const std::shared_ptr& node, const precisions_map& precisions); + bool extend_select_type(const std::shared_ptr& node, const precisions_map& precisions); bool extend_reverse_type(const std::shared_ptr& node, const precisions_map& precisions); @@ -468,7 +470,8 @@ bool ov::pass::ConvertPrecision::run_on_model(const std::shared_ptr& {ov::op::v13::Multinomial::get_type_info_static(), fuse_type_to_multinomial_v13}, {ov::op::v0::PriorBox::get_type_info_static(), fuse_type_to_prior_box}, {ov::op::v8::PriorBox::get_type_info_static(), fuse_type_to_prior_box}, - {ov::op::v0::PriorBoxClustered::get_type_info_static(), fuse_type_to_prior_box}}; + {ov::op::v0::PriorBoxClustered::get_type_info_static(), fuse_type_to_prior_box}, + {ov::op::v15::SearchSorted::get_type_info_static(), fuse_type_to_search_sorted_v15}}; for (const auto& it : m_additional_type_to_fuse_map) { type_to_fuse[it.first] = it.second; @@ -553,6 +556,18 @@ bool fuse_type_to_unique_v10(const std::shared_ptr& node, const precisions return res; } +bool fuse_type_to_search_sorted_v15(const std::shared_ptr& node, const precisions_map& precisions) { + bool res = false; + if (auto op = ov::as_type_ptr(node)) { + auto it = precisions.find(node->get_output_element_type(0)); + if (it != precisions.end()) { + op->set_output_type_attr(it->second); + res = true; + } + } + return res; +} + bool fuse_type_to_range_v4(const std::shared_ptr& node, const precisions_map& precisions) { auto it = precisions.find(node->get_output_element_type(0)); if (it == precisions.end()) diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp index 9554cf09162d45..2aa4d4d2fac9e9 100644 --- a/src/common/transformations/tests/utils/convert_precision.cpp +++ b/src/common/transformations/tests/utils/convert_precision.cpp @@ -15,6 +15,7 @@ #include "openvino/core/model.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/opsets/opset10.hpp" +#include "openvino/opsets/opset15.hpp" #include "openvino/opsets/opset3.hpp" #include "openvino/opsets/opset4.hpp" #include "openvino/opsets/opset5.hpp" @@ -1036,6 +1037,28 @@ TEST(TransformationTests, ConvertPrecision_TypeRelaxed) { } } +TEST(TransformationTests, 
ConvertPrecision_SearchSorted) { + std::shared_ptr f(nullptr); + { + auto search_sorted_input = opset15::Constant::create(ov::element::i64, {5}, {1, 2, 3, 4, 5}); + auto indices = std::make_shared(ov::element::i64, Shape{3}); + auto search_sorted = std::make_shared(search_sorted_input, indices); + + auto less_input = opset15::Constant::create(ov::element::i64, {3}, {4, 5, 6}); + auto less = std::make_shared(search_sorted, less_input); + + f = std::make_shared(OutputVector{less}, ParameterVector{indices}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(precisions_map{{element::i64, element::i32}}); + manager.run_passes(f); + } + OV_ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_FALSE(has_type(f)); + ASSERT_TRUE(has_type(f)); +} + TEST(TransformationTests, ConvertPrecision_Variables) { std::shared_ptr f(nullptr); { diff --git a/src/core/include/openvino/op/search_sorted.hpp b/src/core/include/openvino/op/search_sorted.hpp index c370ba46b2f182..efb1f8491e0882 100644 --- a/src/core/include/openvino/op/search_sorted.hpp +++ b/src/core/include/openvino/op/search_sorted.hpp @@ -22,7 +22,15 @@ class OPENVINO_API SearchSorted : public Op { /// \param values Values to search indexs for. /// \param right_mode If False, return the first suitable index that is found for given value. If True, return /// the last such index. - SearchSorted(const Output& sorted_sequence, const Output& values, bool right_mode = false); + /// \param output_type The element type of the output tensor. This is purely an implementation flag, which + /// is used to convert the output type for CPU plugin in ConvertPrecision transformation (and potentially other + /// plugins as well). Setting this flag to element::i32 will result in the output tensor of i32 element type. + /// Setting this flag to element::i64 will generally not give any effect, since it will be converted to i32 anyway, + /// at least for CPU plugin. 
+ SearchSorted(const Output& sorted_sequence, + const Output& values, + bool right_mode = false, + const element::Type& output_type = element::i64); void validate_and_infer_types() override; bool visit_attributes(AttributeVisitor& visitor) override; @@ -36,8 +44,17 @@ class OPENVINO_API SearchSorted : public Op { m_right_mode = right_mode; } + void set_output_type_attr(const element::Type& output_type) { + m_output_type = output_type; + } + + element::Type get_output_type_attr() const { + return m_output_type; + } + private: bool m_right_mode{}; + element::Type m_output_type = element::i64; }; } // namespace v15 } // namespace op diff --git a/src/core/src/op/search_sorted.cpp b/src/core/src/op/search_sorted.cpp index 8b9bb012b27106..65b5ff31861d8e 100644 --- a/src/core/src/op/search_sorted.cpp +++ b/src/core/src/op/search_sorted.cpp @@ -12,9 +12,13 @@ namespace ov { namespace op { namespace v15 { -SearchSorted::SearchSorted(const Output& sorted_sequence, const Output& values, bool right_mode) +SearchSorted::SearchSorted(const Output& sorted_sequence, + const Output& values, + bool right_mode, + const element::Type& output_type) : Op({sorted_sequence, values}), - m_right_mode(right_mode) { + m_right_mode(right_mode), + m_output_type(output_type) { constructor_validate_and_infer_types(); } @@ -23,20 +27,25 @@ void SearchSorted::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, get_input_element_type(0).compatible(get_input_element_type(1)), "Sorted sequence and values must have the same element type."); + NODE_VALIDATION_CHECK(this, + m_output_type == element::i32 || m_output_type == element::i64, + "The element type of the last output can only be set to i32 or i64."); + const auto& output_shapes = shape_infer(this, ov::util::get_node_input_partial_shapes(*this)); - set_output_type(0, ov::element::i64, output_shapes[0]); + set_output_type(0, m_output_type, output_shapes[0]); } bool SearchSorted::visit_attributes(AttributeVisitor& visitor) { OV_OP_SCOPE(v15_SearchSorted_visit_attributes); visitor.on_attribute("right_mode", m_right_mode); + visitor.on_attribute("output_type", m_output_type); return true; } std::shared_ptr SearchSorted::clone_with_new_inputs(const OutputVector& new_args) const { OV_OP_SCOPE(v15_SearchSorted_clone_with_new_inputs); check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), new_args.at(1), get_right_mode()); + return std::make_shared(new_args.at(0), new_args.at(1), get_right_mode(), get_output_type_attr()); } } // namespace v15 } // namespace op diff --git a/src/core/tests/visitors/op/sorted_search.cpp b/src/core/tests/visitors/op/sorted_search.cpp index 860c9528d0e9aa..10d544527f3714 100644 --- a/src/core/tests/visitors/op/sorted_search.cpp +++ b/src/core/tests/visitors/op/sorted_search.cpp @@ -22,7 +22,7 @@ TEST(attributes, search_sorted_op) { auto g_op = ov::as_type_ptr(builder.create()); // attribute count - const auto expected_attr_count = 1; + const auto expected_attr_count = 2; EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); // space_to_depth attributes From 99f3a91f2ad4347c660a3103a2aca0748323a58f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 29 Oct 2024 21:50:32 +0400 Subject: [PATCH 041/120] [Wheel] return back JAX FE (#27309) ### Details: - Regression after https://github.com/openvinotoolkit/openvino/pull/26610 ### Tickets: - CVS-156317 --- cmake/developer_package/frontends/frontends.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/cmake/developer_package/frontends/frontends.cmake b/cmake/developer_package/frontends/frontends.cmake index d2aa0410476245..0815297a11a5eb 100644 --- a/cmake/developer_package/frontends/frontends.cmake +++ b/cmake/developer_package/frontends/frontends.cmake @@ -304,6 +304,9 @@ macro(ov_add_frontend) # then we need to mark it to be CXX ABI free ov_abi_free_target(${TARGET_NAME}) + # public target name + set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME frontend::${OV_FRONTEND_NAME}) + # installation if(NOT OV_FRONTEND_SKIP_INSTALL) @@ -351,9 +354,6 @@ macro(ov_add_frontend) COMPONENT ${dev_component} ${OV_CPACK_COMP_CORE_DEV_EXCLUDE_ALL} FILES_MATCHING PATTERN "*.hpp") - - # public target name - set_target_properties(${TARGET_NAME} PROPERTIES EXPORT_NAME frontend::${OV_FRONTEND_NAME}) endif() else() # skipped frontend has to be installed in static libraries case From fc105b06be4fb554f0cb075534cb4f1baa95605d Mon Sep 17 00:00:00 2001 From: Katarzyna Mitrus Date: Tue, 29 Oct 2024 19:14:14 +0100 Subject: [PATCH 042/120] [STFT][Op][Python] Fix STFT Python API to pass attribute (#27311) ### Details: - Fix STFT Python API to pass "transpose_frames" attribute ### Tickets: - 147160 --- .../python/src/openvino/runtime/opset15/ops.py | 2 +- src/bindings/python/tests/test_graph/test_create_op.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index 45b01a11bc3588..b3a131602af703 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -326,7 +326,7 @@ def stft( :return: The new node performing STFT operation. """ inputs = as_nodes(data, window, frame_size, frame_step, name=name) - return _get_node_factory_opset15().create("STFT", inputs) + return _get_node_factory_opset15().create("STFT", inputs, {"transpose_frames": transpose_frames}) @nameable_op diff --git a/src/bindings/python/tests/test_graph/test_create_op.py b/src/bindings/python/tests/test_graph/test_create_op.py index 87787e1e29bc32..98d0ec3583882c 100644 --- a/src/bindings/python/tests/test_graph/test_create_op.py +++ b/src/bindings/python/tests/test_graph/test_create_op.py @@ -2492,8 +2492,8 @@ def test_stft(): window = ov.parameter([7], name="window", dtype=np.float32) frame_size = ov.constant(np.array(11, dtype=np.int32)) frame_step = ov.constant(np.array(3, dtype=np.int32)) - transpose_frames = True + transpose_frames = False op = ov_opset15.stft(data, window, frame_size, frame_step, transpose_frames) assert op.get_type_name() == "STFT" @@ -2501,6 +2501,14 @@ def test_stft(): assert op.get_output_element_type(0) == Type.f32 assert op.get_output_shape(0) == [4, 13, 6, 2] + transpose_frames = True + op = ov_opset15.stft(data, window, frame_size, frame_step, transpose_frames) + + assert op.get_type_name() == "STFT" + assert op.get_output_size() == 1 + assert op.get_output_element_type(0) == Type.f32 + assert op.get_output_shape(0) == [4, 6, 13, 2] + def test_search_sorted(): sorted_sequence = ov.parameter([7, 256, 200, 200], name="sorted", dtype=np.float32) From bd6cf01d4fa0bf6ecc250b8293fba08a518c2805 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Tue, 29 Oct 2024 11:16:15 -0700 Subject: [PATCH 043/120] [GPU] Fix sdpa opt accuracy (#27262) ### Details: - Fix accuracy for sdpa_opt ### Tickets: - 154583 --- .../src/kernel_selector/cl_kernels/sdpa_opt.cl | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 8e6be800f37cf0..c114332f393c0e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -190,7 +190,7 @@ KERNEL(sdpa_opt)( // SLM for query inputs __local INPUT0_TYPE query_local[HEAD_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; // SLM for intermediate QK results - __local OUTPUT_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; + __local SOFTMAX_ACCUMULATOR_TYPE qk_local[SEQ_LEN_PARTITION_SIZE * TARGET_SEQ_LEN_BLOCK_SIZE]; // SLM buffers for SoftMax calculation and qk_max/qk_sums results aggregation across all WG __local SOFTMAX_ACCUMULATOR_TYPE qk_max_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; __local SOFTMAX_ACCUMULATOR_TYPE qk_sum_vals[SUBGROUPS_PER_WG * TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -259,7 +259,7 @@ KERNEL(sdpa_opt)( uint key_offset = INPUT1_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, 0); #endif - INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; + SOFTMAX_ACCUMULATOR_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; #if IS_KV_COMPRESSED const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); @@ -294,7 +294,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -326,7 +326,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -358,7 +358,7 @@ KERNEL(sdpa_opt)( } unroll_for(uint i = 0; i < KEY_BLOCK_SIZE; i++) { - acc[seq_idx] = mad(query_vals_reg[i], key_vals[i], acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg[i]), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals[i]), acc[seq_idx]); } query_offset += HEAD_SIZE; @@ -389,7 +389,7 @@ KERNEL(sdpa_opt)( query_vals_reg = query_local[query_offset + i * SUBGROUP_SIZE]; } - acc[seq_idx] = mad(query_vals_reg, key_vals, acc[seq_idx]); + acc[seq_idx] = mad(TO_SOFTMAX_ACCUMULATOR_TYPE(query_vals_reg), TO_SOFTMAX_ACCUMULATOR_TYPE(key_vals), acc[seq_idx]); query_offset += HEAD_SIZE; } } @@ -405,7 +405,7 @@ KERNEL(sdpa_opt)( // Wait until all SG finishes their calculations and apply scale and attention mask to the results barrier(CLK_LOCAL_MEM_FENCE); - INPUT0_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; + SOFTMAX_ACCUMULATOR_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { // Iterate over all values QK values in SLM and apply scale and attention mask From 4d29e2ecc959d82fb4c2fc8b1fd974f39e9f1501 Mon Sep 17 00:00:00 2001 From: Septimiu Neaga <111509085+SeptimiuIoachimNeagaIntel@users.noreply.github.com> Date: Tue, 29 Oct 2024 21:09:11 +0200 Subject: [PATCH 044/120] Enable disabling ORT optimizations flag in protopipe app (#27182) ### Details: - Enable disabling the ORT optimizations flag in the protopipe app Co-authored-by: Maksim Doronin ---
src/plugins/intel_npu/tools/protopipe/README.md | 1 + src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp | 3 +++ .../intel_npu/tools/protopipe/src/scenario/inference.hpp | 1 + .../intel_npu/tools/protopipe/src/simulation/simulation.cpp | 3 +++ 4 files changed, 8 insertions(+) diff --git a/src/plugins/intel_npu/tools/protopipe/README.md b/src/plugins/intel_npu/tools/protopipe/README.md index afe6e8cffbc8c3..00849ad8bddc9a 100644 --- a/src/plugins/intel_npu/tools/protopipe/README.md +++ b/src/plugins/intel_npu/tools/protopipe/README.md @@ -97,6 +97,7 @@ The dependency graph in Protopipe is specified by: - `tag` - **Required**. The unique name of operation. - `type` - **Optional**. The operation type: _Infer_, _CPU_, _Compound_ (**Default**: _Infer_) - `repeat_count` - **Optional**. Runs operation over specified number of iterations. + - `opt_level` - **Optional**. Configures optimization level for ONNX Runtime. - `connections` - The list of connections between operations. Supported operation types diff --git a/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp index 34099d36a69fdb..c2a1bd6415d595 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp +++ b/src/plugins/intel_npu/tools/protopipe/src/parser/config.cpp @@ -404,6 +404,9 @@ struct convert { if (node["ep"]) { params.ep = node["ep"].as(); } + if (node["opt_level"]) { + params.opt_level = node["opt_level"].as(); + } return true; } }; diff --git a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp index c4fd85aa26721a..e4568c671438bc 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp +++ b/src/plugins/intel_npu/tools/protopipe/src/scenario/inference.hpp @@ -104,6 +104,7 @@ struct ONNXRTParams { }; // NB: std::monostate stands for the default MLAS Execution provider using EP = std::variant; + std::optional opt_level; EP ep; }; diff --git a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp index 52f57c2881a3b6..5b1743651b6ef1 100644 --- a/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp +++ b/src/plugins/intel_npu/tools/protopipe/src/simulation/simulation.cpp @@ -79,6 +79,9 @@ static void cfgExecutionProvider(cv::gapi::onnx::Params& netw static cv::gapi::GNetPackage getNetPackage(const std::string& tag, const ONNXRTParams& params) { cv::gapi::onnx::Params network{tag, params.model_path}; network.cfgSessionOptions(params.session_options); + if (params.opt_level.has_value()) { + network.cfgOptLevel(params.opt_level.value()); + } cfgExecutionProvider(network, params.ep); return cv::gapi::networks(network); } From 874bf8a120359ff4fc0b4e934de0b3e251835425 Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Tue, 29 Oct 2024 23:23:16 +0000 Subject: [PATCH 045/120] [NPUW] Fix optimized out check (#27313) --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 4a9a3e06a0aa16..0070e6be2d2041 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -378,7 +378,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const 
std::shared_ptrm_compiled_submodels[i]; - if (!comp_model_desc.compiled_model || !comp_model_desc.replaced_by) { + if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { continue; } const auto real_idx = comp_model_desc.replaced_by.value(); From 95a6f183d11286c4296c122c221f1faa4b3d9b06 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 30 Oct 2024 08:00:22 +0400 Subject: [PATCH 046/120] Revert "[GPU] Fixes for hybrid quantization (#27127)" (#27308) This reverts commit c21f572cc45193232d76aa21e821e92445b18725. Signed-off-by: Vladimir Paramuzov --- .../impls/onednn/fully_connected_onednn.hpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 59 ++++--------------- 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index 39423980521042..f4495fb5dd1645 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { bool compressed_case = fc_prim->compressed_weights && one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) && one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) && - one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8}); + one_of(out_dt, {data_types::f16, data_types::f32}); if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case) return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 770aa387da8a60..305e21a5000149 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -15,11 +15,8 @@ #include "intel_gpu/plugin/transformations_pipeline.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" -#include "low_precision/add.hpp" #include "low_precision/convolution.hpp" #include "low_precision/convolution_backprop_data.hpp" -#include "low_precision/fold_convert.hpp" -#include "low_precision/fuse_convert.hpp" #include "low_precision/group_convolution.hpp" #include "low_precision/low_precision.hpp" #include "low_precision/mat_mul.hpp" @@ -28,9 +25,7 @@ #include "low_precision/pull_reshape_through_dequantization.hpp" #include "low_precision/pull_transpose_through_dequantization.hpp" #include "low_precision/recurrent_cell.hpp" -#include "low_precision/rt_info/bias_attribute.hpp" #include "low_precision/strided_slice.hpp" -#include "low_precision/transpose.hpp" #include "openvino/core/deprecated.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/core/validation_util.hpp" @@ -51,7 +46,6 @@ #include "openvino/op/reshape.hpp" #include "openvino/op/rnn_cell.hpp" #include "openvino/op/rnn_sequence.hpp" -#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/squeeze.hpp" #include "openvino/op/unsqueeze.hpp" #include "openvino/op/util/sub_graph_base.hpp" @@ -319,9 +313,13 @@ void TransformationsPipeline::apply(std::shared_ptr func) { // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression // types are not supported by oneDNN) manager.register_pass(supported_woq_types, !device_info.supports_immad); - 
pass_config->set_callback([&](const std::shared_ptr node) { - return !is_decompression_multiply(node); - }); + + // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time. + if (!is_model_quantized) { + pass_config->set_callback([&](const std::shared_ptr node) { + return !is_decompression_multiply(node); + }); + } const bool keep_precision_sensitive_in_fp32_1 = true; const bool convert_input_output_precision = false; @@ -690,6 +688,12 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto lptPassConfig = lptManager.get_pass_config(); // quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation lptPassConfig->disable(); + lptPassConfig->set_callback([](const_node_ptr& node) -> bool { + if (const auto mulitply = std::dynamic_pointer_cast(node)) { + return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply); + } + return false; + }); lptPassConfig->set_callback([func, defaultPrecisions](const_node_ptr& node) -> bool { auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool { const auto rank = shape.rank(); @@ -726,43 +730,6 @@ void TransformationsPipeline::apply(std::shared_ptr func) { || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions); }); - lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { - for (auto& user : node->get_users()) { - if (ov::is_type(user)) - return true; - } - - return false; - }); - - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - return ov::is_type(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node); - }); - - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - return ov::marked_as_bias(node); - }); - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - const auto& consumers = node->get_output_target_inputs(0); - if (consumers.size() == 1) { - const auto consumer = consumers.begin()->get_node()->shared_from_this(); - return ov::is_type(consumer) && is_decompression_multiply(consumer); - } - return false; - }); - lptPassConfig->set_callback([](const_node_ptr& node) -> bool { - if (ov::is_type(node)) { - return ov::is_type(node) && is_decompression_multiply(node); - } else if (ov::is_type(node)) { - const auto& consumers = node->get_output_target_inputs(0); - if (consumers.size() == 1) { - const auto consumer = consumers.begin()->get_node()->shared_from_this(); - return ov::is_type(consumer) && is_decompression_multiply(consumer); - } - } - return false; - }); - lptPassConfig->set_callback([&](const_node_ptr& node) -> bool { // disable MultiplyToGroupConvolution if Multiply with Constant can be fused From 961b891ea423427e74b47d24629b33efd866c793 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 14:37:18 +0900 Subject: [PATCH 047/120] [GPU] update onednn (#27322) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 32ad05ab263b78..062d247e7853b1 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 32ad05ab263b782d4a4455ea85f5de009cf607c4 +Subproject commit 062d247e7853b14ed287a130cc2dc221187430aa From f12f086d4185765b06f2a659e0d01406b31fd634 Mon Sep 17 00:00:00 2001 From: Andrew Kwangwoong Park Date: Wed, 30 Oct 2024 15:05:57 
+0900 Subject: [PATCH 048/120] [GPU] Add per layer scaling for FC to fix accuracy issue regarding fp16 overflow (#27291) ### Details: - Fix LLM accuracy issue due to fp16 overflow when using decompression_post_opt in fully_connected_gpu_bf_tiled_opt kernel - In the fc kernel, to optimize grouped scale, we calculate acc first as mad of activation (fp16) * weight (int4) first , and then apply scale value. This can cause accuracy issue when only multiply of activation and weight overflows. - In this case we can resolve the issue by applying scale down to the activation. - Implement per layer scaling for FCs ### Tickets: - 154583 --------- Signed-off-by: Andrew Park --- .../runtime/properties/hint/__init__.py | 1 + .../pyopenvino/core/properties/properties.cpp | 1 + .../tests/test_runtime/test_properties.py | 5 + .../include/openvino/runtime/properties.hpp | 6 + .../intel_gpu/src/plugin/compiled_model.cpp | 1 + src/plugins/intel_gpu/src/plugin/plugin.cpp | 4 +- .../transformations/fc_per_layer_scaling.cpp | 81 ++++++++++++ .../transformations/fc_per_layer_scaling.hpp | 19 +++ .../src/plugin/transformations_pipeline.cpp | 2 + .../src/runtime/execution_config.cpp | 1 + .../fc_per_layer_scaling_test.cpp | 117 ++++++++++++++++++ 11 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py index dd90ded374ca11..d1dce289d09941 100644 --- a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py @@ -23,3 +23,4 @@ from openvino._pyopenvino.properties.hint import allow_auto_batching from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size from openvino._pyopenvino.properties.hint import kv_cache_precision +from openvino._pyopenvino.properties.hint import activations_scale_factor diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index a6b30bd773001f..564e5f69f5ee14 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -101,6 +101,7 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching"); wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size"); wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision"); + wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor"); // Submodule intel_cpu py::module m_intel_cpu = diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 32eb48f6765f41..6065d72196b44b 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -335,6 +335,11 @@ def test_properties_ro(ov_property_ro, expected_value): ((64, 64),), ), (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)), + ( + hints.activations_scale_factor, + 
"ACTIVATIONS_SCALE_FACTOR", + ((0.0, 0.0),), + ), ( intel_cpu.denormals_optimization, "CPU_DENORMALS_OPTIMIZATION", diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 627314748bbe9c..5674c75dd546d7 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -580,6 +580,12 @@ static constexpr Property dynamic_quantization */ static constexpr Property kv_cache_precision{"KV_CACHE_PRECISION"}; +/** + * @brief This property scales down activations to prevent overflows when inference precision is f16. + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property activations_scale_factor{"ACTIVATIONS_SCALE_FACTOR"}; + } // namespace hint /** diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 15ff4447b4bafe..233bc97c249cd4 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -257,6 +257,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { ov::PropertyName{ov::hint::num_requests.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RO}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RO}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RO}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RO}, ov::PropertyName{ov::execution_devices.name(), PropertyMutability::RO}, }; diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 9aba7ee1a117eb..d3d70ec92cd23c 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -540,6 +540,7 @@ std::vector Plugin::get_caching_properties() const { ov::PropertyName{ov::hint::execution_mode.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::performance_mode.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW}, }; return caching_properties; @@ -585,7 +586,8 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::inference_precision.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, - ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW} + ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW} }; return supported_properties; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp new file mode 100644 index 00000000000000..618578919d4024 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fc_per_layer_scaling.hpp" + +#include "intel_gpu/op/fully_connected_compressed.hpp" +#include "intel_gpu/op/placeholder.hpp" + +#include "openvino/op/multiply.hpp" +#include 
"openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +FullyConnectedPerLayerScaling::FullyConnectedPerLayerScaling(float scale_factor) { + using namespace ov::pass::pattern; + + auto data_m = any_input(); + auto weights_m = any_input(); + auto bias_m = any_input(); + auto fc_compressed_wo_zp_m = wrap_type({data_m, weights_m, bias_m, any_input()}, consumers_count(1)); + auto fc_compressed_w_zp_m = wrap_type({data_m, weights_m, bias_m, any_input(), any_input()}, consumers_count(1)); + auto fc_compressed_m = std::make_shared(OutputVector{fc_compressed_wo_zp_m, fc_compressed_w_zp_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](Matcher& m) { + if (scale_factor == 0.f || scale_factor == 1.f) + return false; + auto fc = std::dynamic_pointer_cast(m.get_match_root()); + if (!fc || transformation_callback(fc)) + return false; + + const auto& pattern_map = m.get_pattern_value_map(); + const auto& data = pattern_map.at(data_m).get_node_shared_ptr(); + const auto& bias = pattern_map.at(bias_m).get_node_shared_ptr(); + + ov::Shape scale_const_shape = {1}; + std::vector scale_down_value = {(1.f / scale_factor)}; + std::vector scale_up_value = {scale_factor}; + std::shared_ptr scale_down_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_down_value); + std::shared_ptr scale_down_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_down_value); + std::shared_ptr scale_up_const_f16 = std::make_shared(ov::element::f16, scale_const_shape, scale_up_value); + std::shared_ptr scale_up_const_f32 = std::make_shared(ov::element::f32, scale_const_shape, scale_up_value); + + std::shared_ptr scale_down_const = (data->get_element_type() == ov::element::f16) ? scale_down_const_f16 : scale_down_const_f32; + auto scale_down = std::make_shared(data, scale_down_const); + scale_down->set_friendly_name(fc->get_friendly_name() + "_scale_down"); + ov::copy_runtime_info(fc, scale_down); + fc->input(0).replace_source_output(scale_down); + + // If FC has bias as input, scaling must be applied to bias as well + if (!std::dynamic_pointer_cast(bias)) { + std::shared_ptr bias_scale_down_const = (bias->get_element_type() == ov::element::f16) ? scale_down_const_f16 : scale_down_const_f32; + auto bias_scale_down = std::make_shared(bias, bias_scale_down_const); + bias_scale_down->set_friendly_name(fc->get_friendly_name() + "_bias_scale_down"); + ov::copy_runtime_info(fc, bias_scale_down); + fc->input(2).replace_source_output(bias_scale_down); + } + + auto target_inputs = fc->get_output_target_inputs(0); + std::shared_ptr scale_up_const = (fc->get_element_type() == ov::element::f16) ? 
scale_up_const_f16 : scale_up_const_f32; + auto scale_up = std::make_shared(fc, scale_up_const); + scale_up->set_friendly_name(fc->get_friendly_name() + "_scale_up"); + ov::copy_runtime_info(fc, scale_up); + for (auto& in : target_inputs) { + in.replace_source_output(scale_up); + } + + return true; + }; + + auto m = std::make_shared(fc_compressed_m, "FullyConnectedPerLayerScaling"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp new file mode 100644 index 00000000000000..5c0d7d07f5b411 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_per_layer_scaling.hpp @@ -0,0 +1,19 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +class FullyConnectedPerLayerScaling: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FullyConnectedPerLayerScaling", "0"); + FullyConnectedPerLayerScaling(float scale_factor); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 305e21a5000149..a33a15fbbe6a1a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -65,6 +65,7 @@ #include "plugin/transformations/move_fc_reshape_to_weights.hpp" #include "plugin/transformations/bcast_and_pad_zp_buffers.hpp" #include "plugin/transformations/print_model_statistics.hpp" +#include "plugin/transformations/fc_per_layer_scaling.hpp" #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" @@ -846,6 +847,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(device_info.supports_immad); + manager.register_pass(config.get_property(ov::hint::activations_scale_factor)); if (!device_info.supports_immad) { manager.register_pass(); diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index c48f3f02fa9f6a..f3b9058f7ebdc8 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -61,6 +61,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), + std::make_tuple(ov::hint::activations_scale_factor, 0.f), // Legacy API properties std::make_tuple(ov::intel_gpu::nv12_two_inputs, false), diff --git a/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp new file mode 100644 index 00000000000000..2d2f21b57d7152 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/fc_per_layer_scaling_test.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "common_test_utils/graph_comparator.hpp" +#include "common_test_utils/ov_test_utils.hpp" + +#include +#include + +#include 
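+// NB (illustrative summary, not part of the original patch): the tests below
+// exercise FullyConnectedPerLayerScaling from above. With scale_factor == 2,
+// a Multiply by 1/2 is expected in front of the compressed FC (and in front
+// of its bias input, when one exists) plus a Multiply by 2 behind it; the
+// third test passes 1.f and expects no graph change, matching the pass's
+// early return for factors 0 and 1.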
"openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/pass/manager.hpp" + +#include +#include "plugin/transformations/fc_per_layer_scaling.hpp" +#include "intel_gpu/op/placeholder.hpp" +#include "intel_gpu/op/fully_connected_compressed.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +namespace ov { +namespace test { +namespace intel_gpu { + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest1) { + float scale_factor = 2.f; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, no_bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto scale_down = std::make_shared(input, scale_down_const); + auto fc_compressed = std::make_shared(scale_down, weights_const, no_bias, scale_const, zp_const); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto scale_up = std::make_shared(fc_compressed, scale_up_const); + auto convert = std::make_shared(scale_up, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest2) { + float scale_factor = 2.f; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto bias = std::make_shared(ov::element::f16, ov::Shape{ 1, 32 }); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(scale_factor); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto bias = std::make_shared(ov::element::f16, ov::Shape{ 1, 32 }); + auto scale_const = 
ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto scale_down = std::make_shared(input, scale_down_const); + auto bias_scale_down_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { 1.f / scale_factor }); + auto bias_scale_down = std::make_shared(bias, scale_down_const); + auto fc_compressed = std::make_shared(scale_down, weights_const, bias_scale_down, scale_const, zp_const); + auto scale_up_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1 }, { scale_factor }); + auto scale_up = std::make_shared(fc_compressed, scale_up_const); + auto convert = std::make_shared(scale_up, ov::element::f32); + auto result = std::make_shared(convert); + + model_ref = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} + +TEST_F(TransformationTestsF, FullyConnectedPerLayerScalingTest3) { + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 }); + auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); + auto no_bias = std::make_shared(); + auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto zp_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); + auto fc_compressed = std::make_shared(input, weights_const, no_bias, scale_const, zp_const); + auto convert = std::make_shared(fc_compressed, ov::element::f32); + auto result = std::make_shared(convert); + + model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{input}); + manager.register_pass(1.f); + } +} + +} // namespace intel_gpu +} // namespace test +} // namespace ov \ No newline at end of file From 9036b592d36a40808a8c9e0ab22a9dbb75e33cfc Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Wed, 30 Oct 2024 10:20:10 +0400 Subject: [PATCH 049/120] [CPU][OMP] Handle exception outside parallel region (#27303) ### Details: - *Handle exception inside OMP threads to avoid immediate program interruption.* ### Tickets: - *152606* Co-authored-by: Ilya Lavrenov --- src/plugins/intel_cpu/src/graph.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index f9bfa9334eae8f..f3f3a379fc2af7 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -1297,23 +1297,40 @@ class UpdateNodes : public UpdateNodesBase { if (origin_nested_levels < 2) { set_max_nested_levels(2); } + // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by the same thread. + // Therefore, need to pass the error message and throw a new exception outside the parallel region. + const char* what = nullptr; #pragma omp parallel #pragma omp sections { #pragma omp section { - updateDynParams(startCounter, stopIndx); + try { + updateDynParams(startCounter, stopIndx); + } catch (std::exception& e) { + what = e.what(); + } catch (...) { + what = "[ CPU ] Could not update dynamic parameters."; + } } #pragma omp section { - updateShapes(startCounter, stopIndx); + try { + updateShapes(startCounter, stopIndx); + } catch (std::exception& e) { + what = e.what(); + } catch (...) 
{ + what = "[ CPU ] Could not update shapes."; + } } } if (origin_nested_levels != 2) { set_max_nested_levels(origin_nested_levels); } + + OPENVINO_ASSERT(what == nullptr, what); } }; #endif From 8da8a300994a3b3c52dd7a0d7c75e4db74f845f2 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 15:26:17 +0900 Subject: [PATCH 050/120] [GPU] model cache fix from kv cache compression (#27323) ### Details: - model cache was not working because of load/save mismatch --- .../intel_gpu/primitives/scaled_dot_product_attention.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index 1fd5b43824d0a7..77e1c5ae71099e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -116,6 +116,7 @@ struct scaled_dot_product_attention : public primitive_base::save(ob); ob << is_causal; + ob << is_kv_compressed; ob << has_attn_mask_input; ob << has_scale_input; ob << indirect_axis; @@ -123,7 +124,6 @@ struct scaled_dot_product_attention : public primitive_base Date: Wed, 30 Oct 2024 10:14:58 +0200 Subject: [PATCH 051/120] [intel-npu] max memalloc quickfix for grext 1.8 windows drivers (#27317) ### Details: - another quickfix for maximum memory allocation property, to enable UD44 windows drivers too - addition to https://github.com/openvinotoolkit/openvino/pull/27270 ### Tickets: - *EISW-143246* --- src/plugins/intel_npu/src/backend/src/zero_device.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 439b5fbd59f4f9..58bcd0eb7cc944 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -175,6 +175,14 @@ uint64_t ZeroDevice::getTotalMemSize() const { // we are safe here, can return the value directly from driver return query.total; } +#if defined(_WIN32) || defined(__CYGWIN__) + // Special case for windows drivers with graph_extension v 1.8 + if (_initStructs->isExtensionSupported(std::string("ZE_extension_graph_1_8"), ZE_MAKE_VERSION(1, 8))) { + // query here returns total system memory in KB, which we need to + // divide by 2 (OS limitation) and convert to bytes + return (query.total << 9); + } +#endif // Default for older drivers: return 2GB return LEGACY_MAX_MEM_ALLOC_SIZE_BYTES; From 9263641442f513758f7f7d6a772f598f888f20b2 Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Wed, 30 Oct 2024 09:25:14 +0100 Subject: [PATCH 052/120] [DOCS] Fixing formatting in the STFT article. (#27312) Fixing formatting issues in the `Short Time Fourier Transformation for real-valued input` article. 
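As a quick sanity check of the shape formulas quoted in the fixed article (the numbers below are made up for illustration, not taken from the patch):

```cpp
// Output-shape arithmetic from the STFT spec: with transpose_frames == false,
// a 1D signal yields [frames, fft_results, 2].
#include <iostream>

int main() {
    const int signal_size = 48, frame_size = 16, frame_step = 16;
    const int frames = (signal_size - frame_size) / frame_step + 1;  // 3
    const int fft_results = frame_size / 2 + 1;                      // 9
    std::cout << "[" << frames << ", " << fft_results << ", 2]\n";   // [3, 9, 2]
    return 0;
}
```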
--- .../operation-specs/signals/stft-15.rst | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst index 581c5062f67520..bcc420f5db25c9 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/signals/stft-15.rst @@ -31,25 +31,25 @@ Short Time Fourier Transformation for real-valued input (STFT) **Inputs** -* **1**: ``signal`` - Tensor of type *T* and 1D shape [signal_size] or 2D shape [batch, signal_size] with signal data for the STFT. **Required.** -* **2**: ``window`` - Tensor of type *T* and 1D shape [window_length], specifying the window values for the signal slice multiplication. **Required.** -* **3**: ``frame_size`` - Scalar tensor of type *T_INT* describing the size of a single frame of the signal to be provided as input to FFT. **Required.** -* **4**: ``frame_step`` - Scalar tensor of type *T_INT* describing The distance (number of samples) between successive frames. **Required.** +* **1**: ``signal`` - Tensor of type *T* and 1D shape [signal_size] or 2D shape [batch, signal_size] with signal data for the STFT. **Required.** +* **2**: ``window`` - Tensor of type *T* and 1D shape [window_length], specifying the window values for the signal slice multiplication. **Required.** +* **3**: ``frame_size`` - Scalar tensor of type *T_INT* describing the size of a single frame of the signal to be provided as input to FFT. **Required.** +* **4**: ``frame_step`` - Scalar tensor of type *T_INT* describing The distance (number of samples) between successive frames. **Required.** **Outputs** -* **1**: The result of STFT operation, tensor of the same type as input ``signal`` tensor and shape: +* **1**: The result of STFT operation, tensor of the same type as input ``signal`` tensor and shape: - + When ``transpose_frames == false`` the output shape is ``[frames, fft_results, 2]`` for 1D signal input or [batch, frames, fft_results, 2] for 2D signal input. - + When ``transpose_frames == true`` the output shape is [fft_results, frames, 2] for 1D signal input or [batch, fft_results, frames, 2]`` for 2D signal input. + * When ``transpose_frames == false`` the output shape is ``[frames, fft_results, 2]`` for 1D signal input or ``[batch, frames, fft_results, 2]`` for 2D signal input. + * When ``transpose_frames == true`` the output shape is ``[fft_results, frames, 2]`` for 1D signal input or ``[batch, fft_results, frames, 2]`` for 2D signal input. - where: + where: - + ``batch`` is a batch size dimension - + ``frames`` is a number calculated as ``(signal_shape[-1] - frame_size) / frame_step) + 1`` - + ``fft_results`` is a number calculated as ``(frame_size / 2) + 1`` - + ``2`` is the last dimension is for complex value real and imaginary part + * ``batch`` is a batch size dimension + * ``frames`` is a number calculated as ``(signal_shape[-1] - frame_size) / frame_step) + 1`` + * ``fft_results`` is a number calculated as ``(frame_size / 2) + 1`` + * ``2`` is the last dimension is for complex value real and imaginary part **Types** @@ -61,7 +61,7 @@ Short Time Fourier Transformation for real-valued input (STFT) **Examples**: -*Example 1D signal, transpose_frames=false: * +*Example 1D signal, transpose_frames=false:* .. 
code-block:: xml :force: @@ -87,7 +87,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 1D signal, transpose_frames=true: * +*Example 1D signal, transpose_frames=true:* .. code-block:: xml :force: @@ -112,7 +112,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 2D signal, transpose_frames=false: * +*Example 2D signal, transpose_frames=false:* .. code-block:: xml :force: @@ -140,7 +140,7 @@ Short Time Fourier Transformation for real-valued input (STFT) -*Example 2D signal, transpose_frames=true: * +*Example 2D signal, transpose_frames=true:* .. code-block:: xml :force: From 22c6740f79e8de50f6653cec7266fe9a5186caa9 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 30 Oct 2024 12:44:08 +0400 Subject: [PATCH 053/120] [PT FE] Unify conversion pipeline for ExportedProgram from memory and disk (#27324) **Details:** Before the fix, conversions of ExportedProgram from memory and disk use different decomposition sets **Tickets:** TBD Signed-off-by: Kazantsev, Roman --- .../moc_frontend/pytorch_frontend_utils.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index d3b77c9a61f566..486f72d87fd89d 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -21,6 +21,22 @@ def extract_module_extensions(args): return {extension.module: extension for extension in extensions if isinstance(extension, ModuleExtension)} +def get_decoder_for_exported_program(model): + from openvino.frontend.pytorch.fx_decoder import TorchFXPythonDecoder + import torch + + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.2"): + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + model = model.run_decompositions(decomp_table=decomp) + gm = model.module() + log.debug(gm.code) + decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) + return decoder + + def get_pytorch_decoder(model, example_inputs, args): try: from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder @@ -49,15 +65,7 @@ def get_pytorch_decoder(model, example_inputs, args): inputs = prepare_torch_inputs(example_inputs) if not isinstance(model, (TorchScriptPythonDecoder, TorchFXPythonDecoder)): if hasattr(torch, "export") and isinstance(model, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= version.parse("2.2"): - from torch._decomp import get_decompositions - from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list - decomp = get_decompositions(get_export_decomposition_list()) - model = model.run_decompositions(decomp_table=decomp) - gm = model.module() - log.debug(gm.code) - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) + decoder = get_decoder_for_exported_program(model) else: decoder = TorchScriptPythonDecoder( model, @@ -111,12 +119,7 @@ def get_pytorch_decoder_for_model_on_disk(argv, args): try: exported_program = torch.export.load(input_model) if hasattr(torch, "export") and isinstance(exported_program, (torch.export.ExportedProgram)): - from packaging import version - if version.parse(torch.__version__) >= 
version.parse("2.2"): - exported_program = exported_program.run_decompositions() - gm = exported_program.module() - decoder = TorchFXPythonDecoder(gm, dynamic_shapes=True) - argv.input_model = decoder + argv.input_model = get_decoder_for_exported_program(exported_program) argv.framework = 'pytorch' return True except: From 11cf409183ee45e930240c00cd3526f62db14abb Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 30 Oct 2024 13:29:55 +0400 Subject: [PATCH 054/120] [GPU] Disable OneDNN for unknown arch via dpas flag faking (#27326) ### Details: - This patch enforces dpas availability flag to false when HW architecture is unknown to onednn to fallback to OCL kernels which are supposed to be more generic and more forward compatible. - Also, added an architecture check in each onednn-based impl if in the future we'll stop relying on `supports_immad` flag when decide whether to use onednn or not. Signed-off-by: Vladimir Paramuzov --- .../src/graph/impls/onednn/concatenation_onednn.hpp | 2 +- .../src/graph/impls/onednn/convolution_onednn.hpp | 2 +- .../src/graph/impls/onednn/deconvolution_onednn.hpp | 2 +- .../src/graph/impls/onednn/fully_connected_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp | 2 +- .../intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp | 2 +- src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp | 8 ++++++++ 9 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp index e85bda18a034da..9e0a3fa5cfb390 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/concatenation_onednn.hpp @@ -19,7 +19,7 @@ struct ConcatenationImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; static const std::vector supported_types = { ov::element::f16, ov::element::u8, ov::element::i8 }; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp index a5616167506f70..c3f599fc5db9f6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.hpp @@ -24,7 +24,7 @@ struct ConvolutionImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& conv_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp index 949c979ed77e80..039cf36261caa0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.hpp @@ -20,7 +20,7 @@ struct DeconvolutionImplementationManager : public ImplementationManager { bool validate_impl(const 
program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& deconv_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp index f4495fb5dd1645..a601b2c74c09e3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp @@ -22,7 +22,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& fc_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp index f89d3e588735e2..6c576d177043ee 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.hpp @@ -19,7 +19,7 @@ struct GemmImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& gemm_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp index 343fe66771de25..4710b0c77b83c7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/pooling_onednn.hpp @@ -20,7 +20,7 @@ struct PoolingImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& in_layout = node.get_input_layout(0); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp index fbdf64131ff384..68d963fd9e369f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/reduce_onednn.hpp @@ -49,7 +49,7 @@ struct ReduceImplementationManager : public ImplementationManager { bool validate_impl(const program_node& node) const override { assert(node.is_type()); const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; const auto& reduce_node = node.as(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp index b671f5e210e75c..ad08c516e939d8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/reorder_onednn.hpp @@ -57,7 +57,7 @@ struct ReorderImplementationManager : public ImplementationManager 
{ return true; const auto& info = node.get_program().get_engine().get_device_info(); - if (!info.supports_immad) + if (!info.supports_immad || info.arch == gpu_arch::unknown) return false; if (!one_of(input_fmt.value, supported_formats) || !one_of(output_fmt.value, supported_formats)) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 88801b8b2b4e61..7ab48308cfeaf7 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -330,6 +330,14 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex ngen::Product product = {ngen::ProductFamily::Unknown, 0}; jit_generator::detectHWInfo(context.get(), device.get(), hw, product); info.arch = convert_ngen_arch(hw); + // We change the value of this flag to avoid OneDNN usage for the platforms unknown to OneDNN + // This is required to guarantee some level of forward compatibility for the new HW generations + // as OneDNN code generators are not generic and typically requires some updates for the new architectures + // Ideally, we shouldn't do that as OCL impls sometimes also check this flag, but in order to avoid that + // we need to ensure that graph transformations are not relying on this flag as indicator that onednn will be used + if (product.family == ngen::ProductFamily::Unknown) { + info.supports_immad = false; + } #else // ENABLE_ONEDNN_FOR_GPU info.arch = gpu_arch::unknown; #endif // ENABLE_ONEDNN_FOR_GPU From cb292c750056a956b66bc7871dcf8688e4ca0a1e Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 30 Oct 2024 13:53:21 +0400 Subject: [PATCH 055/120] add support aten::__ior__ (#27315) ### Details: - *add support `aten::__ior__`, `aten::__iand__`, `aten::__ixor__`* ### Tickets: - *CVS-156301* --- src/frontends/pytorch/src/op_table.cpp | 3 + .../pytorch_tests/test_bitwise_ops.py | 58 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index d0e388b5d08cf1..607f0bd32db80d 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -331,12 +331,15 @@ OP_CONVERTER(translate_zeros_like_fx); const std::unordered_map get_supported_ops_ts() { return { {"aten::__and__", op::translate_bitwise_and}, + {"aten::__iand__", op::inplace_op}, {"aten::__derive_index", op::translate_derive_index}, {"aten::__getitem__", op::translate_getitem}, {"aten::__not__", op::translate_1to1_match_1_inputs}, {"aten::__or__", op::translate_bitwise_or}, + {"aten::__ior__", op::inplace_op}, {"aten::__range_length", op::translate_range_length}, {"aten::__xor__", op::translate_bitwise_xor}, + {"aten::__ixor__", op::inplace_op}, {"aten::_convolution", op::translate_convolution}, {"aten::_convolution_mode", op::translate_convolution_mode}, {"aten::_native_multi_head_attention", op::translate_native_multi_head_attention}, diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index a400f6dcd76d17..125402b4dbec17 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -140,3 +140,61 @@ def test_bitwise_operators(self, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_ trace_model=True, freeze_model=False, ) + + +class TestBitwiseInplaceOp(PytorchLayerTest): + def _prepare_input(self, lhs_shape, rhs_shape, dtype): + choices = np.array([0, 1, 
255, 7])
+        x = np.random.choice(choices, lhs_shape).astype(dtype)
+        y = np.random.choice(choices, rhs_shape).astype(dtype)
+        return x, y
+
+    def create_model(self, op):
+        class aten_bitwise(torch.nn.Module):
+            def __init__(self, op) -> None:
+                super().__init__()
+                if op == "aten::__ior__":
+                    self.forward = self.forward_or
+                if op == "aten::__iand__":
+                    self.forward = self.forward_and
+                if op == "aten::__ixor__":
+                    self.forward = self.forward_xor
+
+            def forward_or(self, lhs, rhs):
+                return lhs.__ior__(rhs)
+
+            def forward_and(self, lhs, rhs):
+                return lhs.__iand__(rhs)
+
+            def forward_xor(self, lhs, rhs):
+                return lhs.__ixor__(rhs)
+
+        return aten_bitwise(op), None, op
+
+    @pytest.mark.nightly
+    @pytest.mark.precommit
+    @pytest.mark.parametrize("dtype", ["bool", "int32"])
+    @pytest.mark.parametrize(
+        ("lhs_shape", "rhs_shape"),
+        [
+            ([2, 3], [2, 3]),
+            ([2, 3], []),
+        ],
+    )
+    @pytest.mark.parametrize("op", ["aten::__ior__", "aten::__iand__", "aten::__ixor__"])
+    def test_bitwise_operators(self, op, dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version):
+        if ie_device == "GPU" and dtype != "bool":
+            pytest.xfail(reason="bitwise ops are not supported on GPU")
+        self._test(
+            *self.create_model(op),
+            ie_device,
+            precision,
+            ir_version,
+            kwargs_to_prepare_input={
+                "dtype": dtype,
+                "lhs_shape": lhs_shape,
+                "rhs_shape": rhs_shape,
+            },
+            trace_model=True,
+            freeze_model=False,
+        )
\ No newline at end of file

From a7ff891fa552532091576ea39e883c30c5fcf241 Mon Sep 17 00:00:00 2001
From: David Nam
Date: Wed, 30 Oct 2024 18:41:08 +0800
Subject: [PATCH 056/120] [GPU] Init tensor.data when allocating inputs for
 string type (#27269)

### Details:
- In case the element type is string, inference produces a segmentation fault when the input data is an empty string, unless each element of tensor.data is initialized.

### Tickets:
- 148921

---
 src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp     | 6 ++++++
 .../layer_tests/tensorflow_tests/test_tf_LookupTableSize.py | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index 26771117e2e786..985336b801b9d3 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -592,6 +592,12 @@ void SyncInferRequest::allocate_input(const ov::Output<const ov::Node>& port, si
     auto element_type = port.get_element_type();

     m_user_inputs[input_idx] = { create_host_tensor(shape, element_type), TensorOwner::PLUGIN };
+    if (element_type == ov::element::string) {
+        // In case the element type is string and the input data is an empty string,
+        // it produces a segmentation fault unless each element of tensor.data is initialized.
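+        // NB (illustrative aside, not part of the original patch): the host
+        // tensor's storage is raw memory at this point, so the std::string
+        // elements must be constructed before anything assigns to them;
+        // std::uninitialized_fill_n below copy-constructs each element in
+        // place from an empty std::string, whereas a plain std::fill_n would
+        // invoke operator= on objects that do not exist yet and crash.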
+ auto data = m_user_inputs.at(input_idx).ptr->data(); + std::uninitialized_fill_n(data, m_user_inputs.at(input_idx).ptr->get_size(), std::string()); + } ov::ISyncInferRequest::set_tensor(port, m_user_inputs.at(input_idx).ptr); } diff --git a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py index e0050c245f1321..4cd5b05f3e86d4 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableSize.py @@ -69,8 +69,6 @@ def create_lookup_table_size_net(self, hash_table_type, keys_type, values_type, def test_lookup_table_size(self, hash_table_type, params, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): keys_type = params['keys_type'] - if ie_device == 'GPU' and keys_type == str: - pytest.skip("148921: Segmentation fault on GPU") self._test(*self.create_lookup_table_size_net(hash_table_type=hash_table_type, **params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) From 2441dcdbcf2f9e996679f72f70faf7ba611fe928 Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Wed, 30 Oct 2024 16:09:03 +0400 Subject: [PATCH 057/120] [TF FE] Stabilize tests for UnsortedSegmentSum operation on all platforms (#27325) **Details:** Stabilize tests for UnsortedSegmentSum operation on all platforms **Ticket:** TBD --------- Signed-off-by: Kazantsev, Roman --- .../test_tf_UnsortedSegmentSum.py | 37 +++++++------------ 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py index 3369aeb8aad231..ccf7c16896270c 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_UnsortedSegmentSum.py @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import numpy as np -import platform import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(23254) + class TestUnsortedSegmentSum(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -15,10 +16,10 @@ def _prepare_input(self, inputs_info): data_shape = inputs_info['data:0'] segment_ids_shape = inputs_info['segment_ids:0'] inputs_data = {} - inputs_data['data:0'] = np.random.randint(-50, 50, data_shape).astype(self.data_type) + inputs_data['data:0'] = rng.integers(-10, 10, data_shape).astype(self.data_type) # segment_ids can have negative values - inputs_data['segment_ids:0'] = np.random.randint(-self.num_segments_val, self.num_segments_val, - segment_ids_shape) + inputs_data['segment_ids:0'] = rng.integers(-self.num_segments_val, self.num_segments_val, + segment_ids_shape).astype(self.segment_ids_type) return inputs_data def create_unsorted_segment_sum_net(self, data_shape, segment_ids_shape, num_segments_val, data_type, @@ -48,28 +49,18 @@ def create_unsorted_segment_sum_net(self, data_shape, segment_ids_shape, num_seg ] @pytest.mark.parametrize("params", test_data_basic) - @pytest.mark.parametrize("data_type", [ - np.float32, np.int32 - ]) - @pytest.mark.parametrize("segment_ids_type", [ - np.int32, np.int64 - ]) - @pytest.mark.parametrize("num_segments_type", [ - np.int32, np.int64 - ]) + @pytest.mark.parametrize("data_type", [np.float32, np.int32]) + @pytest.mark.parametrize("segment_ids_type", [np.int32, np.int64]) + @pytest.mark.parametrize("num_segments_type", [np.int32, np.int64]) 
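    # NB (illustrative note, not part of the original patch): TensorFlow's
    # UnsortedSegmentSum ignores entries whose segment id falls outside
    # [0, num_segments), which is why _prepare_input above deliberately
    # draws negative segment ids as well.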
@pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', - reason='Ticket - 122716') def test_unsorted_segment_sum_basic(self, params, data_type, segment_ids_type, num_segments_type, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): - if use_legacy_frontend: - pytest.skip("UnsortedSegmentSum operation is not supported via legacy frontend.") if ie_device == 'GPU': - pytest.skip("GPU error: Can't choose implementation for embedding_segment_sum:UnsortedSegmentSum node") - self._test( - *self.create_unsorted_segment_sum_net(**params, data_type=data_type, segment_ids_type=segment_ids_type, - num_segments_type=num_segments_type), - ie_device, precision, ir_version, temp_dir=temp_dir, - use_legacy_frontend=use_legacy_frontend) + pytest.skip("156362: No layout format available for embeddingsegmentssum:UnsortedSegmentSum on GPU") + self._test(*self.create_unsorted_segment_sum_net(**params, + data_type=data_type, segment_ids_type=segment_ids_type, + num_segments_type=num_segments_type), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) From 118efc85baaa1c1ea04b280ead81e540b24acf15 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Wed, 30 Oct 2024 13:35:40 +0100 Subject: [PATCH 058/120] Docs Added searching by new coveo Category (#27335) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../templates/layout.html | 2 +- .../sphinx_setup/_static/css/coveo_custom.css | 19 +- docs/sphinx_setup/_static/js/custom.js | 41 ++-- docs/sphinx_setup/_templates/layout.html | 11 +- docs/sphinx_setup/_templates/search.html | 221 ++++++++---------- docs/sphinx_setup/conf.py | 1 - 6 files changed, 134 insertions(+), 161 deletions(-) diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html index 25acb3c1e5cbda..a2ab53c6a57a83 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/layout.html @@ -28,7 +28,7 @@ {# The data-cfasync attribute disables CloudFlare's Rocket loader so that #} {# mode/theme are correctly set before the browser renders the page. #} {# https://github.com/pydata/pydata-sphinx-theme/pull/1045 #} - + - - - - + + - - - - {% endblock %} - {% block docs_navbar %} {{ super() }} {% include 'baner.html' %} diff --git a/docs/sphinx_setup/_templates/search.html b/docs/sphinx_setup/_templates/search.html index 3519f6e7e02f19..5430f24f74aa8c 100644 --- a/docs/sphinx_setup/_templates/search.html +++ b/docs/sphinx_setup/_templates/search.html @@ -2,133 +2,100 @@ {% set title = _('Search') %} {%- block content %} - {% block docs_navbar %} - {{ super() }} - {% include 'baner.html' %} - {% endblock %} +{% block docs_navbar %} +{{ super() }} +{% include 'baner.html' %} +{% endblock %} - {% block body %} - - - - - - - - - - - -
-   [legacy Coveo search markup removed here; the HTML tag content of the
-    deleted lines was stripped during text extraction and is not recoverable]
- {% endblock %} - - {%- block scripts_end %} - {{ _webpack.body_post() }} - {%- endblock %} +{% block body %} + + + + + + + + +
+   [new Coveo Atomic search markup added here; the HTML tag content of the
+    added lines was likewise stripped during extraction]
+{% endblock %} +{%- block scripts_end %} +{{ _webpack.body_post() }} {%- endblock %} + +{%- endblock %} \ No newline at end of file diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index def41af5943b3c..01c74de0175bcf 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -193,7 +193,6 @@ 'css/textfield.css', 'css/tabs.css', 'css/coveo_custom.css', - 'https://static.cloud.coveo.com/atomic/v2/themes/coveo.css', 'https://cdn.jsdelivr.net/npm/@splidejs/splide@4.1.4/dist/css/splide.min.css', ] From a5a09418cf4f047be8a58a1484da1b45488f8dba Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Wed, 30 Oct 2024 22:23:42 +0900 Subject: [PATCH 059/120] [GPU] Fix cache mode and weights path interaction (#27328) ### Details: - Currently ov::CacheMode::OPTIMIZE_SIZE behaves like ov::CacheMode::OPTIMIZE_SPEED if weights_path is provided. This change fixes that. - Additionally, after this change if cache is saved with OPTIMIZE_SIZE and the user tries to load with OPTIMIZE_SPEED (or vice versa), import_model() will fail and the workload will behave like during the first launch, according to the cache mode set by the user. - This change also tightens the weights_path value validation - only files with ".bin" extension will be accepted. However, if the user provides the path to the wrong bin file, the execution will still fail - there's no way to validate if the bin file is correct without storing information about it in the cache. ### Tickets: - 156265 --------- Co-authored-by: Tomasz Krupa --- .../util/include/openvino/util/weights_path.hpp | 15 +++++++++++++++ src/common/util/src/weights_path.cpp | 14 ++++++++++++++ src/plugins/intel_gpu/src/graph/program.cpp | 4 +++- .../intel_gpu/src/plugin/compiled_model.cpp | 8 ++++++-- src/plugins/intel_gpu/src/plugin/plugin.cpp | 11 ++++++++++- .../intel_gpu/src/plugin/program_builder.cpp | 15 +++++++++------ 6 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 src/common/util/include/openvino/util/weights_path.hpp create mode 100644 src/common/util/src/weights_path.cpp diff --git a/src/common/util/include/openvino/util/weights_path.hpp b/src/common/util/include/openvino/util/weights_path.hpp new file mode 100644 index 00000000000000..db97484be98d35 --- /dev/null +++ b/src/common/util/include/openvino/util/weights_path.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/util/common_util.hpp" + +namespace ov { +namespace util { + +bool validate_weights_path(std::string& weights_path); + +} // namespace ov +} // namespace util diff --git a/src/common/util/src/weights_path.cpp b/src/common/util/src/weights_path.cpp new file mode 100644 index 00000000000000..9cf2336f064dd0 --- /dev/null +++ b/src/common/util/src/weights_path.cpp @@ -0,0 +1,14 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + + +#include "openvino/util/weights_path.hpp" + +bool ov::util::validate_weights_path(std::string& weights_path) { + if (weights_path.empty() || !ov::util::ends_with(weights_path, ".bin")) { + return false; + } + + return true; +} diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 1e2e84043dc82b..07fad4873659cd 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -7,6 +7,7 @@ #include "openvino/core/type.hpp" #include "openvino/runtime/system_conf.hpp" #include 
"openvino/runtime/threading/cpu_streams_info.hpp" +#include "openvino/util/weights_path.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -1839,7 +1840,8 @@ void program::load(cldnn::BinaryInputBuffer& ib) { std::shared_ptr mapped_memory = nullptr; std::string weights_path = _config.get_property(ov::weights_path); - if (!weights_path.empty()) { + if (_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && + ov::util::validate_weights_path(weights_path)) { mapped_memory = ov::load_mmap_object(weights_path); } diff --git a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp index 233bc97c249cd4..527e08f07432ef 100644 --- a/src/plugins/intel_gpu/src/plugin/compiled_model.cpp +++ b/src/plugins/intel_gpu/src/plugin/compiled_model.cpp @@ -5,6 +5,7 @@ #include "openvino/runtime/iplugin.hpp" #include "openvino/runtime/intel_gpu/properties.hpp" #include "openvino/runtime/internal_properties.hpp" +#include "openvino/util/weights_path.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" #include "intel_gpu/runtime/itt.hpp" @@ -169,14 +170,17 @@ std::shared_ptr CompiledModel::create_infer_request() co void CompiledModel::export_model(std::ostream& model) const { // If ov::CacheMode::OPTIMIZE_SIZE is set, do the export iff it's possible to do weightless caching // which requires the weights_path. - if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && - m_config.get_property(ov::weights_path).empty()) + ov::CacheMode cache_mode = m_config.get_property(ov::cache_mode); + std::string weights_path = m_config.get_property(ov::weights_path); + if (cache_mode == ov::CacheMode::OPTIMIZE_SIZE && + !ov::util::validate_weights_path(weights_path)) return; OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "CompiledModel::export_model"); OPENVINO_ASSERT(!m_graphs.empty(), "[GPU] Model not loaded"); cldnn::BinaryOutputBuffer ob(model); + ob << cldnn::make_data(&cache_mode, sizeof(ov::CacheMode)); // Inputs { diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index d3d70ec92cd23c..7d010a9b590e2e 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -35,6 +35,7 @@ #include "openvino/runtime/performance_heuristics.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/util/common_util.hpp" +#include "openvino/util/weights_path.hpp" #include "transformations/common_optimizations/dimension_tracking.hpp" #include "transformations/init_node_info.hpp" #include "transformations/rt_info/fused_names_attribute.hpp" @@ -330,8 +331,16 @@ std::shared_ptr Plugin::import_model(std::istream& model, cldnn::BinaryInputBuffer ib(model, context_impl->get_engine()); + ov::CacheMode cache_mode; + ib >> cldnn::make_data(&cache_mode, sizeof(ov::CacheMode)); + + if (cache_mode != config.get_property(ov::cache_mode)) { + return nullptr; + } + + std::string weights_path = config.get_property(ov::weights_path); if (config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE && - config.get_property(ov::weights_path).empty()) { + !ov::util::validate_weights_path(weights_path)) { return nullptr; } diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 510d715e7ac805..899110872ba633 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ 
-305,12 +305,15 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptrorigin_op_name = op.get_friendly_name(); prim->origin_op_type_name = op.get_type_name(); - if (auto data_prim = dynamic_cast(prim.get())) { - auto rt_info = op.get_rt_info(); - auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); - if (weightless_cache_attr != rt_info.end()) { - data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; - data_prim->original_size = weightless_cache_attr->second.as().original_size; + if (this->m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE) { + if (auto data_prim = dynamic_cast(prim.get())) { + auto rt_info = op.get_rt_info(); + auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); + if (weightless_cache_attr != rt_info.end()) { + data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; + data_prim->original_size = + weightless_cache_attr->second.as().original_size; + } } } From 37f5dd3455279eed6708783a4f8fce880290bc06 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Wed, 30 Oct 2024 14:35:58 +0100 Subject: [PATCH 060/120] Docs Unhide columns in datatableJs (#27338) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../generative-ai-performance.rst | 15 ++++++--------- .../sphinx_setup/_static/js/openVinoDataTables.js | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index d0a04f16ceb6bd..b8256af650e2f8 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -17,16 +17,13 @@ running on an Intel® Core™ Ultra 7-165H, Intel® Core™ Ultra 7-265V, and In -.. tab-set:: - - .. tab-item:: OpenVINO - - .. csv-table:: - :class: modeldata stripe - :name: supportedModelsTableOv - :header-rows: 1 - :file: ../../_static/benchmarks_files/llm_models.csv +.. csv-table:: + :class: modeldata stripe + :name: supportedModelsTableOv + :header-rows: 1 + :file: ../../_static/benchmarks_files/llm_models.csv +| .. 
grid:: 1 1 2 2 :gutter: 4 diff --git a/docs/sphinx_setup/_static/js/openVinoDataTables.js b/docs/sphinx_setup/_static/js/openVinoDataTables.js index 6f7231db424e89..b3f56b4a8de3e0 100644 --- a/docs/sphinx_setup/_static/js/openVinoDataTables.js +++ b/docs/sphinx_setup/_static/js/openVinoDataTables.js @@ -1,4 +1,17 @@ $(document).ready(function () { + var pageTitle = document.title; + var columnDefs; + if(pageTitle.includes('Most Efficient Large Language Models for AI PC')) + { + columnDefs= [ + { "visible": false, "targets": [1, 2, 3, 4, 5] } + ] + } + else + { + columnDefs=[] + } + var table = $('table.modeldata').DataTable({ responsive: true, "autoWidth": false, @@ -12,6 +25,7 @@ $(document).ready(function () { [10, 25, 50, -1], ['10 rows', '25 rows', '50 rows', 'Show all rows'] ], + "columnDefs": columnDefs, layout: { topStart: { buttons: [ From b60449ea7c7255004fd1087e2f5aaad9f4404eb8 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Thu, 31 Oct 2024 00:08:20 +0800 Subject: [PATCH 061/120] [DOCS] Fix submodule update for building OV on Windows (#26549) - Add missing --recursive args for submodule update - Referenece guide for linux: https://github.com/openvinotoolkit/openvino/blob/6dc3f5538057caed5dd2eda0797aec445b6105bf/docs/dev/build_linux.md?plain=1#L23 --- docs/dev/build_windows.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/build_windows.md b/docs/dev/build_windows.md index 10049485202cca..4a9761f5364046 100644 --- a/docs/dev/build_windows.md +++ b/docs/dev/build_windows.md @@ -25,7 +25,7 @@ Supported configurations: ```sh git clone https://github.com/openvinotoolkit/openvino.git cd openvino - git submodule update --init + git submodule update --init --recursive ``` 2. Create build directory: From 967a730722fa69853da1cc3c62b4ef672ae03b7a Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Wed, 30 Oct 2024 16:42:17 +0000 Subject: [PATCH 062/120] [NPUW] Support 3d gather in head (#27258) --- .../plugin/npuw/partitioning/partitioning.cpp | 2 +- .../plugin/npuw/partitioning/patterns/opt.cpp | 66 ++++++++++++++----- .../plugin/npuw/partitioning/patterns/opt.hpp | 4 +- .../intel_npu/src/plugin/npuw/util.cpp | 28 +++++++- 4 files changed, 77 insertions(+), 23 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 954df49e39f99b..99705fef30e8a8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1788,7 +1788,7 @@ void Partitioner::optimize(const std::string& func_name) { // Run Head/Tail passes ov::pass::GraphRewrite rewr; - rewr.add_matcher(std::ref(ctx)); + rewr.add_matcher(std::ref(ctx)); rewr.add_matcher(std::ref(ctx)); rewr.add_matcher(std::ref(ctx)); // NB: This pass is disabled for reason! 
It doesn't make things better diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 997d0e5108f8b9..9693e2e8f2b753 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -59,11 +59,21 @@ Context::PPtr Context::concat(ov::ParameterVector&& v, std::size_t dim) { } Context::PPtr Context::unpack(Context::PPtr w, Context::PPtr z, Context::PPtr s, ov::element::Type type) { - // FIXME: Assume CW only - NPUW_ASSERT(w->get_shape().size() == 2); - NPUW_ASSERT(z->get_shape().size() == 2); - NPUW_ASSERT(s->get_shape().size() == 2); - auto new_param = std::make_shared(type, w->get_shape()); + const auto& w_shape = w->get_shape(); + const auto& s_shape = s->get_shape(); + + Context::PPtr new_param; + if (w_shape.size() == 3 && s_shape.size() == 3) { + // Assume already reshaped tensor (as it does with unpack) + ov::Shape new_shape = {w_shape[0], w_shape[1] * w_shape[2]}; + new_param = std::make_shared(type, new_shape); + } else if (w_shape.size() == 2 && s_shape.size() == 2) { + new_param = std::make_shared(type, w_shape); + } else { + NPUW_ASSERT(false && "Yet unsupported combination"); + } + + NPUW_ASSERT(new_param); params_to_unpack[new_param] = {w, z, s}; return new_param; } @@ -350,8 +360,8 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 && ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 && - act_shape.size() == 3 && act_shape[1] == 1 && qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 && - qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() && + act_shape.size() == 3 && act_shape[0] == 1 && act_shape[1] == 1 && qcoeff_shape[0] == qweight_shape[0] && + qcoeff_shape[2] == 1 && qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { // Mark W closure to transpose, and transpose the respective parameter ctx.get().permute(matched_qweight, {1, 0, 2}); @@ -378,9 +388,6 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) { auto split_a = std::make_shared(rshp_act, split_axis, NSPLIT); auto split_w = std::make_shared(matched_qweight, split_axis, NSPLIT); - std::vector rshp_scale_v = {1, 1, qcoeff_shape[0]}; - auto rshp_scale_c = std::make_shared(ov::element::i32, ov::Shape{3}, rshp_scale_v); - // Do the CW MM for every split std::vector> to_concat; for (std::size_t i = 0; i < NSPLIT; i++) { @@ -583,9 +590,13 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { auto qcoeff_shape = matched_qcoeff->output(0).get_shape(); auto act_shape = matched_out_mmi.get_shape(); + const auto just_one = [](std::size_t a, std::size_t b) { + return (a == 1 && b > 1) || (a > 1 && b == 1); + }; + if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 && ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 && - act_shape.size() == 3 && act_shape[1] > 1 && // multi-token case + act_shape.size() == 3 && just_one(act_shape[0], act_shape[1]) && // multi-token case qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == qweight_shape[1] && qcoeff_shape[2] == 1 && !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) { // Mark W closure to transpose, and transpose the respective parameter @@ -601,9 +612,12 @@ 
DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { matched_qcoeff->set_partial_shape(ts_shape); matched_qcoeff->validate_and_infer_types(); + // Select proper activation shape + std::size_t act_dim = act_shape[0] > act_shape[1] ? 0 : 1; + // Reshape the Act to group format const auto NSPLIT = qweight_shape[1]; - std::vector rshp_act_v = {act_shape[1], NSPLIT, act_shape[2] / NSPLIT}; + std::vector rshp_act_v = {act_shape[act_dim], NSPLIT, act_shape[2] / NSPLIT}; auto rshp_act_c = std::make_shared(ov::element::i32, ov::Shape{3}, rshp_act_v); auto rshp_act = std::make_shared(matched_out_mmi, rshp_act_c, false); @@ -615,7 +629,7 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { auto split_w = std::make_shared(matched_qweight, split_axis_w, NSPLIT); auto split_s = std::make_shared(matched_qcoeff, split_axis_w, NSPLIT); - std::vector r_a_v = {1, act_shape[1], act_shape[2] / NSPLIT}; + std::vector r_a_v = {1, act_shape[act_dim], act_shape[2] / NSPLIT}; auto r_a_c = std::make_shared(ov::element::i32, ov::Shape{3}, r_a_v); // Do the CW MM for every split @@ -642,6 +656,13 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) { out = std::make_shared(out, ov::element::f32); } + if (act_shape[0] > act_shape[1]) { + std::vector new_out_size = {act_shape[0], act_shape[1], qweight_shape[0]}; + auto new_out_shape = + std::make_shared(ov::element::i32, ov::Shape{3}, new_out_size); + out = std::make_shared(out, new_out_shape, false); + } + // Now.. Reconnect the matmul readers to the new output (reducesum) for (auto&& r : matched_matmul->output(0).get_target_inputs()) { r.replace_source_output(out); @@ -933,7 +954,7 @@ DQLiftGatherSymGQ::DQLiftGatherSymGQ() { // the respective block (mainly, a head) was turned a function // (e.g. with FUNCALL_FOR_ALL) As in this case the DQDictMatMulCWu // compile-time converts asymmetric MM to fp16, do the same thing here -DQUnpackDictGatherCWu::DQUnpackDictGatherCWu(Context::Ref ctx) { +DQUnpackDictGatheru::DQUnpackDictGatheru(Context::Ref ctx) { auto pids = opp::wrap_type(); auto cvtids = opp::optional({pids->output(0)}); @@ -966,14 +987,23 @@ DQUnpackDictGatherCWu::DQUnpackDictGatherCWu(Context::Ref ctx) { // Strip down the DQ subgraph, replace the original Q-ed closure tensor with unpacked fp16 auto new_wi = ctx.get().unpack(matched_qweight, matched_qzerop, matched_qcoeff, ov::element::f16); - auto gather_c = std::make_shared(ov::element::i32, ov::Shape{}, 0); - auto new_g = std::make_shared(new_wi, matched_out_ids, gather_c); + auto w_shape = matched_node_qweight->get_shape(); + auto new_w_shape = new_wi->get_shape(); + std::shared_ptr gather_in = new_wi; + if (new_w_shape.size() == 2 && w_shape.size() == 3) { + NPUW_ASSERT(new_w_shape[0] == w_shape[0] && w_shape[1] * w_shape[2] == new_w_shape[1]); + auto new_const = std::make_shared(ov::element::i32, ov::Shape{3}, w_shape); + gather_in = std::make_shared(new_wi, new_const, false); + } + NPUW_ASSERT(gather_in); + auto gather_c = std::make_shared(ov::element::i32, ov::Shape{}, 0); + auto new_g = std::make_shared(gather_in, matched_out_ids, gather_c); matched_node_cvt->input(0).replace_source_output(new_g); return true; // root has changed }; - register_matcher(std::make_shared(qcvtm, "DQDictGatherCWu"), std::move(callback)); + register_matcher(std::make_shared(qcvtm, "DQDictGatheru"), std::move(callback)); } // This is a follow-up to DQLiftGatherSymGQ step, which happens if the respective @@ -1013,7 +1043,7 @@ DQUnpackDictGatherGQi::DQUnpackDictGatherGQi(Context::Ref ctx) { return true; // root has changed }; - 
register_matcher(std::make_shared(qcvtm, "DQDictGatherCWu"), std::move(callback)); + register_matcher(std::make_shared(qcvtm, "DQDictGatherGQu"), std::move(callback)); } // Identify the case* where the FP16/32 vocab tensor is gathered with diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index a66012d4a85fb8..323d443fa781f4 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -112,9 +112,9 @@ class DQLiftGatherSymGQ : public ov::pass::MatcherPass { // Head vocab unpacks -class DQUnpackDictGatherCWu : public ov::pass::MatcherPass { +class DQUnpackDictGatheru : public ov::pass::MatcherPass { public: - DQUnpackDictGatherCWu(Context::Ref ctx); + DQUnpackDictGatheru(Context::Ref ctx); }; class DQUnpackDictGatherGQi : public ov::pass::MatcherPass { diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index e9deb34ee2ded7..99a53430295a89 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -176,6 +176,7 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, const auto& from_shape = from->get_shape(); const auto& scale_shape = scale->get_shape(); + const auto& zerop_shape = zerop->get_shape(); if (type_from == ov::element::u4) { if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1 && @@ -194,8 +195,31 @@ void ov::npuw::util::unpack(const ov::SoPtr& from, NPUW_ASSERT(false); } } else if (type_from == ov::element::u8) { - // Only support CW for now - if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { + if (scale_shape.size() == 3 && scale_shape[1] == 1 && scale_shape[2] == 1) { + // Special case for broadcasting vocab by 2 dimensions + // FIXME: all this logic probably should be in some specific unpack or another util function + const auto& from_strides = from->get_strides(); + const auto& zerop_strides = zerop->get_strides(); + const auto& scale_strides = scale->get_strides(); + ov::Tensor wraped_from(from->get_element_type(), + ov::Shape{from_shape[0], from_shape[1] * from_shape[2]}, + from->data(), + ov::Strides{from_strides[0], from_strides[2]}); + ov::Tensor wraped_zerop(zerop->get_element_type(), + ov::Shape{zerop_shape[0], zerop_shape[1] * zerop_shape[2]}, + zerop->data(), + ov::Strides{zerop_strides[0], zerop_strides[2]}); + ov::Tensor wraped_scale(scale->get_element_type(), + ov::Shape{scale_shape[0], scale_shape[1] * scale_shape[2]}, + scale->data(), + ov::Strides{scale_strides[0], scale_strides[2]}); + + ov::npuw::util::XARCH::unpack_u8f16(ov::get_tensor_impl(wraped_from), + ov::get_tensor_impl(wraped_zerop), + ov::get_tensor_impl(wraped_scale), + to, + unpack_options); + } else if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { ov::npuw::util::XARCH::unpack_u8f16(from, zerop, scale, to, unpack_options); } else { NPUW_ASSERT(false); From 6e350492163a983aa10eb879e9d1179866f9ebf8 Mon Sep 17 00:00:00 2001 From: barnasm1 Date: Wed, 30 Oct 2024 18:14:32 +0100 Subject: [PATCH 063/120] squeeze v15 implementation (#26995) ### Details: - Add v15::Squeeze class with support dynamic rank result based on v0::Squeeze ### Tickets: - [*154023*](https://jira.devtools.intel.com/browse/CVS-154023) --- src/core/include/openvino/op/squeeze.hpp | 57 ++- 
.../include/openvino/op/util/squeeze_base.hpp | 39 ++ .../include/openvino/opsets/opset15_tbl.hpp | 2 +- .../include/squeeze_shape_inference.hpp | 119 +++++- src/core/src/op/squeeze.cpp | 112 +++-- src/core/src/op/util/squeeze_base.cpp | 91 ++++ src/core/tests/CMakeLists.txt | 1 + src/core/tests/type_prop/squeeze.cpp | 399 ++++++++++++++---- src/core/tests/visitors/op/squeeze.cpp | 9 + src/core/tests/visitors/op/unary_ops.hpp | 9 +- .../src/shape_inference/shape_inference.cpp | 1 + .../squeeze_shape_inference_test.cpp | 136 +++++- .../tests/functional/op_reference/squeeze.cpp | 63 ++- .../src/op_impl_check/single_op_graph.cpp | 10 +- .../include/common_test_utils/type_prop.hpp | 21 + 15 files changed, 887 insertions(+), 182 deletions(-) create mode 100644 src/core/include/openvino/op/util/squeeze_base.hpp create mode 100644 src/core/src/op/util/squeeze_base.cpp diff --git a/src/core/include/openvino/op/squeeze.hpp b/src/core/include/openvino/op/squeeze.hpp index 8c27f29d66df66..dde456aa2eef47 100644 --- a/src/core/include/openvino/op/squeeze.hpp +++ b/src/core/include/openvino/op/squeeze.hpp @@ -4,7 +4,7 @@ #pragma once -#include "openvino/op/op.hpp" +#include "openvino/op/util/squeeze_base.hpp" namespace ov { namespace op { @@ -12,30 +12,65 @@ namespace v0 { /// \brief Squeeze operation. /// /// \ingroup ov_ops_cpp_api -class OPENVINO_API Squeeze : public Op { +class OPENVINO_API Squeeze : public util::SqueezeBase { public: OPENVINO_OP("Squeeze", "opset1"); Squeeze(); - Squeeze(const Output& data, const Output& axes); + /// \brief Constructs a squeeze v0 operation. + /// + /// \param data Input tensor with data Squeeze(const Output& data); + /// \brief Constructs a squeeze v0 operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. + Squeeze(const Output& data, const Output& axes); void validate_and_infer_types() override; bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; - bool has_evaluate() const override; - bool evaluate_lower(TensorVector& outputs) const override; - bool evaluate_upper(TensorVector& outputs) const override; - bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; - bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; - bool can_constant_fold(const OutputVector& inputs_values) const override; std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - bool is_dynamic() const override; - private: Output get_default_axes_input() const; }; } // namespace v0 + +namespace v15 { +/// \brief Squeeze operation. +/// +/// \ingroup ov_ops_cpp_api +class OPENVINO_API Squeeze : public util::SqueezeBase { +public: + OPENVINO_OP("Squeeze", "opset15"); + + Squeeze(); + /// \brief Constructs a squeeze v15 operation. + /// + /// \param data Input tensor with data + /// \param allow_axis_skip Shape inference result dynamic rank if selected axis has 1 in range of its dynamic + Squeeze(const Output& data, const bool allow_axis_skip = false); + /// \brief Constructs a squeeze v15 operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. 
+ /// \param allow_axis_skip Shape inference result dynamic rank if selected axis has 1 in range of its dynamic + Squeeze(const Output& data, const Output& axes, const bool allow_axis_skip = false); + + void validate_and_infer_types() override; + bool evaluate(TensorVector& outputs, const TensorVector& inputs) const override; + + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + + bool visit_attributes(AttributeVisitor& visitor) override; + + bool get_allow_axis_skip() const; + +private: + Output get_default_axes_input() const; + bool m_allow_axis_skip{}; +}; +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/include/openvino/op/util/squeeze_base.hpp b/src/core/include/openvino/op/util/squeeze_base.hpp new file mode 100644 index 00000000000000..50d960824e10d2 --- /dev/null +++ b/src/core/include/openvino/op/util/squeeze_base.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov { +namespace op { +namespace util { +/// \brief Squeeze operation. +/// +/// \ingroup ov_ops_cpp_api +class OPENVINO_API SqueezeBase : public Op { +public: + OPENVINO_OP("Squeeze", "util"); + SqueezeBase() = default; + /// \brief Constructs a squeeze operation. + /// + /// \param data Input tensor with data + SqueezeBase(const Output& data); + /// \brief Constructs a squeeze operation. + /// + /// \param data Input tensor with data + /// \param axis The axis along which to squeeze the input tensor. + SqueezeBase(const Output& data, const Output& axes); + + bool has_evaluate() const override; + bool evaluate_lower(TensorVector& outputs) const override; + bool evaluate_upper(TensorVector& outputs) const override; + bool evaluate_symbol(TensorSymbolVector& output_symbols) const override; + bool can_constant_fold(const OutputVector& inputs_values) const override; + bool constant_fold(OutputVector& output_values, const OutputVector& inputs_values) override; + bool is_dynamic() const override; +}; +} // namespace util +} // namespace op +} // namespace ov diff --git a/src/core/include/openvino/opsets/opset15_tbl.hpp b/src/core/include/openvino/opsets/opset15_tbl.hpp index 9a49e421f9ad8e..8d12420719bb6b 100644 --- a/src/core/include/openvino/opsets/opset15_tbl.hpp +++ b/src/core/include/openvino/opsets/opset15_tbl.hpp @@ -97,7 +97,7 @@ _OPENVINO_OP_REG(Sqrt, ov::op::v0) _OPENVINO_OP_REG(SpaceToDepth, ov::op::v0) _OPENVINO_OP_REG(Split, ov::op::v1) _OPENVINO_OP_REG(SquaredDifference, ov::op::v0) -_OPENVINO_OP_REG(Squeeze, ov::op::v0) +_OPENVINO_OP_REG(Squeeze, ov::op::v15) _OPENVINO_OP_REG(StridedSlice, ov::op::v1) _OPENVINO_OP_REG(Subtract, ov::op::v1) _OPENVINO_OP_REG(Tan, ov::op::v0) diff --git a/src/core/shape_inference/include/squeeze_shape_inference.hpp b/src/core/shape_inference/include/squeeze_shape_inference.hpp index ee71b5452db1c3..31eeea5d36a9ea 100644 --- a/src/core/shape_inference/include/squeeze_shape_inference.hpp +++ b/src/core/shape_inference/include/squeeze_shape_inference.hpp @@ -11,6 +11,117 @@ namespace ov { namespace op { namespace v0 { +template > +std::vector shape_infer(const Squeeze* op, + const std::vector& input_shapes, + const ITensorAccessor& ta = make_tensor_accessor()) { + using DimType = typename T::value_type; + + const auto number_of_inputs = input_shapes.size(); + OPENVINO_ASSERT(!input_shapes.empty()); + + const auto& arg_shape = input_shapes[0]; + const auto& arg_rank = arg_shape.rank(); + auto 
output_shapes = std::vector(1); + auto& output_shape = output_shapes[0]; + + std::unique_ptr> unique_axes; + + if (number_of_inputs == 1) { + unique_axes.reset(new std::set()); + } else if (number_of_inputs == 2) { + const auto& axes_shape = input_shapes[1]; + NODE_VALIDATION_CHECK(op, + axes_shape.is_dynamic() || ov::util::is_rank_compatible_any_of(axes_shape.rank(), {0, 1}), + "Second input (axes) should not be of rank higher than 1. Got: ", + axes_shape.rank().get_length()); + + std::vector axes; + if (arg_rank.is_static() && axes_shape.is_static()) { + if (auto axes = get_input_const_data_as(op, 1, ta)) { + // The values of `axes` input are known + ov::util::try_normalize_axes(*axes, arg_rank, *op); + unique_axes.reset(new std::set(axes->cbegin(), axes->cend())); + } else if (arg_rank.get_length() > 0 && shape_size(axes_shape.to_shape()) == 1) { + // The `axes` input is a single element tensor which is unique by definition, deducing output rank + const auto has_squeezable_dim = + std::any_of(arg_shape.cbegin(), arg_shape.cend(), [](const DimType& dim) { + return dim.compatible(1); + }); + if (has_squeezable_dim) { + output_shape = PartialShape::dynamic(arg_rank.get_length() - 1); + } else { + output_shape = arg_shape; + } + return output_shapes; + } + } + } else { + // Invalid number of inputs, empty error message for backward compatibility. + NODE_VALIDATION_CHECK(op, false); + } + + if (arg_rank.is_static() && (unique_axes != nullptr)) { + output_shape.resize(0); + if (unique_axes->empty()) { + // if only first input provided or axes are empty remove all dimensions equal to 1. + if (std::any_of(arg_shape.cbegin(), arg_shape.cend(), [](const DimType& d) { + return d.is_dynamic() && d.compatible(1); + })) { + // we are unsure if dynamic dimensions would be equal to 1 or not, so we set dynamic output rank + output_shape = PartialShape::dynamic(); + return output_shapes; + } else { + std::copy_if(arg_shape.cbegin(), + arg_shape.cend(), + std::back_inserter(output_shape), + [](const DimType& dim) { + return !dim.compatible(1); + }); + } + } else { + int64_t idx = 0; + auto rm_axis_iter = unique_axes->cbegin(); + auto rm_axis_end = unique_axes->cend(); + + // Returns true if dimension not squeezable on axis from input axes. + const auto not_squeezable_at_axis = [&rm_axis_iter, &rm_axis_end, &idx](const DimType& dim) { + if ((rm_axis_iter != rm_axis_end) && (*rm_axis_iter == idx++)) { + ++rm_axis_iter; + // Ignore: Pointed by axis, but not squeezable + return !dim.compatible(1); + } else { + return true; + } + }; + + std::copy_if(arg_shape.cbegin(), + arg_shape.cend(), + std::back_inserter(output_shape), + not_squeezable_at_axis); + } + } else { + output_shape = PartialShape::dynamic(); + } + return output_shapes; +} +} // namespace v0 + +namespace v15 { +template +bool apply_allow_axis_skip(const ov::op::v15::Squeeze* const op, + const std::unique_ptr>& unique_axes, + const T& arg_shape) { + using DimType = typename T::value_type; + int64_t i{-1}; + + return op->get_allow_axis_skip() && + std::any_of(arg_shape.cbegin(), arg_shape.cend(), [&unique_axes, &i](const DimType& d) { + ++i; + // Squeeze result with dynamic rank if 1 is in range of selected dynamic dimension. + return d.is_dynamic() && d.compatible(1) && unique_axes->find(i) != unique_axes->end(); + }); +} /** * \brief Do Squeeze shape inference. 
@@ -59,7 +170,7 @@ std::vector shape_infer(const Squeeze* op, return dim.compatible(1); }); if (has_squeezable_dim) { - output_shape = PartialShape::dynamic(arg_rank.get_length() - 1); + output_shape = PartialShape::dynamic(); } else { output_shape = arg_shape; } @@ -71,7 +182,9 @@ std::vector shape_infer(const Squeeze* op, NODE_VALIDATION_CHECK(op, false); } - if (arg_rank.is_static() && (unique_axes != nullptr)) { + if (!arg_rank.is_static() || (unique_axes == nullptr) || apply_allow_axis_skip(op, unique_axes, arg_shape)) { + output_shape = PartialShape::dynamic(); + } else if (arg_rank.is_static() && (unique_axes != nullptr)) { output_shape.resize(0); if (unique_axes->empty()) { // if only first input provided or axes are empty remove all dimensions equal to 1. @@ -115,6 +228,6 @@ std::vector shape_infer(const Squeeze* op, } return output_shapes; } -} // namespace v0 +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/squeeze.cpp b/src/core/src/op/squeeze.cpp index 1b34a4e48a4faf..b79165ca4f5543 100644 --- a/src/core/src/op/squeeze.cpp +++ b/src/core/src/op/squeeze.cpp @@ -6,31 +6,19 @@ #include -#include "bound_evaluate.hpp" #include "itt.hpp" -#include "openvino/core/validation_util.hpp" -#include "openvino/op/constant.hpp" #include "squeeze_shape_inference.hpp" namespace ov { namespace op { namespace v0 { -namespace validate { -namespace { +Squeeze::Squeeze() : util::SqueezeBase() {} -bool axes_has_and_set_bound(const Node& op) { - return (op.get_input_size() < 2) || op.get_input_tensor(1).has_and_set_bound(); -} -} // namespace -} // namespace validate - -Squeeze::Squeeze() : Op() {} - -Squeeze::Squeeze(const Output& data, const Output& axes) : Op({data, axes}) { +Squeeze::Squeeze(const Output& data, const Output& axes) : util::SqueezeBase(data, axes) { constructor_validate_and_infer_types(); } -Squeeze::Squeeze(const Output& data) : Op({data}) { +Squeeze::Squeeze(const Output& data) : util::SqueezeBase(data) { constructor_validate_and_infer_types(); } @@ -69,62 +57,68 @@ bool Squeeze::evaluate(TensorVector& outputs, const TensorVector& inputs) const return true; } -bool Squeeze::has_evaluate() const { - OV_OP_SCOPE(v0_Squeeze_has_evaluate); - const auto validate_axes_type = [](const element::Type& et) -> bool { - switch (et) { - case element::i8: - case element::i16: - case element::i32: - case element::i64: - case element::u8: - case element::u16: - case element::u32: - case element::u64: - return true; - default: - return false; - } - }; - - return (get_input_size() < 2) || validate_axes_type(get_input_element_type(1)); -} +} // namespace v0 -bool Squeeze::evaluate_lower(TensorVector& output_values) const { - OV_OP_SCOPE(v0_Squeeze_evaluate_lower); - return validate::axes_has_and_set_bound(*this) && default_lower_bound_evaluator(this, output_values); +namespace v15 { +Squeeze::Squeeze() : util::SqueezeBase() {} + +Squeeze::Squeeze(const Output& data, const bool allow_axis_skip) + : util::SqueezeBase(data), + m_allow_axis_skip{allow_axis_skip} { + constructor_validate_and_infer_types(); } -bool Squeeze::evaluate_upper(TensorVector& output_values) const { - OV_OP_SCOPE(v0_Squeeze_evaluate_upper); - return validate::axes_has_and_set_bound(*this) && default_upper_bound_evaluator(this, output_values); +Squeeze::Squeeze(const Output& data, const Output& axes, const bool allow_axis_skip) + : util::SqueezeBase(data, axes), + m_allow_axis_skip{allow_axis_skip} { + constructor_validate_and_infer_types(); } -bool Squeeze::evaluate_symbol(TensorSymbolVector& 
output_symbols) const { - return validate::axes_has_and_set_bound(*this) && ov::util::default_symbol_evaluator(this, output_symbols); +std::shared_ptr Squeeze::clone_with_new_inputs(const OutputVector& new_args) const { + OV_OP_SCOPE(v15_Squeeze_clone_with_new_inputs); + check_new_args_count(this, new_args); + + switch (new_args.size()) { + case 1: + return std::make_shared(new_args[0], m_allow_axis_skip); + case 2: + return std::make_shared(new_args[0], new_args[1], m_allow_axis_skip); + default: + OPENVINO_THROW("Incorrect number of new arguments"); + } } -bool Squeeze::can_constant_fold(const OutputVector& inputs_values) const { - return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +void Squeeze::validate_and_infer_types() { + OV_OP_SCOPE(v15_Squeeze_validate_and_infer_types); + + const auto input_shapes = ov::util::get_node_input_partial_shapes(*this); + const auto output_shapes = shape_infer(this, input_shapes); + + set_output_type(0, get_input_element_type(0), output_shapes[0]); } -bool Squeeze::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { - OV_OP_SCOPE(v0_Squeeze_constant_fold); - if (!can_constant_fold(inputs_values)) { - return false; - } +bool Squeeze::evaluate(TensorVector& outputs, const TensorVector& inputs) const { + OV_OP_SCOPE(v15_Squeeze_evaluate); + OPENVINO_ASSERT(outputs.size() == 1); - if (auto data_const = std::dynamic_pointer_cast(inputs_values[0].get_node_shared_ptr())) { - const auto& shape = get_output_shape(0); - output_values[0] = std::make_shared(*data_const, shape); - return true; - } - return false; + const auto output_shapes = + shape_infer(this, ov::util::get_tensors_partial_shapes(inputs), make_tensor_accessor(inputs)); + outputs[0].set_shape(output_shapes.front().get_shape()); + + std::memcpy(outputs[0].data(), inputs[0].data(), outputs[0].get_byte_size()); + return true; } -bool Squeeze::is_dynamic() const { - return get_output_partial_shape(0).is_dynamic(); +bool Squeeze::visit_attributes(AttributeVisitor& visitor) { + OV_OP_SCOPE(v15_Squeeze_visit_attributes); + visitor.on_attribute("allow_axis_skip", m_allow_axis_skip); + return true; } -} // namespace v0 + +bool Squeeze::get_allow_axis_skip() const { + OV_OP_SCOPE(v15_Squeeze_get_allow_axis_skip); + return m_allow_axis_skip; +} +} // namespace v15 } // namespace op } // namespace ov diff --git a/src/core/src/op/util/squeeze_base.cpp b/src/core/src/op/util/squeeze_base.cpp new file mode 100644 index 00000000000000..be5a20cbb58620 --- /dev/null +++ b/src/core/src/op/util/squeeze_base.cpp @@ -0,0 +1,91 @@ +#include "openvino/op/util/squeeze_base.hpp" + +#include "bound_evaluate.hpp" +#include "itt.hpp" +#include "openvino/core/validation_util.hpp" +#include "openvino/op/constant.hpp" + +namespace ov { +namespace op { + +namespace validate { +namespace { + +bool axes_has_and_set_bound(const Node& op) { + return (op.get_input_size() < 2) || op.get_input_tensor(1).has_and_set_bound(); +} +} // namespace +} // namespace validate + +namespace util { +SqueezeBase::SqueezeBase(const Output& data, const Output& axes) : Op({data, axes}) { + constructor_validate_and_infer_types(); +} + +SqueezeBase::SqueezeBase(const Output& data) : Op({data}) { + constructor_validate_and_infer_types(); +} + +bool SqueezeBase::has_evaluate() const { + OV_OP_SCOPE(util_SqueezeBase_has_evaluate); + const auto validate_axes_type = [](const element::Type& et) -> bool { + switch (et) { + case element::i8: + case element::i16: + case element::i32: + case element::i64: + case 
element::u8: + case element::u16: + case element::u32: + case element::u64: + return true; + default: + return false; + } + }; + + return (get_input_size() < 2) || validate_axes_type(get_input_element_type(1)); +} + +bool SqueezeBase::evaluate_lower(TensorVector& output_values) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_lower); + return validate::axes_has_and_set_bound(*this) && default_lower_bound_evaluator(this, output_values); +} + +bool SqueezeBase::evaluate_upper(TensorVector& output_values) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_upper); + return validate::axes_has_and_set_bound(*this) && default_upper_bound_evaluator(this, output_values); +} + +bool SqueezeBase::evaluate_symbol(TensorSymbolVector& output_symbols) const { + OV_OP_SCOPE(util_SqueezeBase_evaluate_symbol); + return validate::axes_has_and_set_bound(*this) && ov::util::default_symbol_evaluator(this, output_symbols); +} + +bool SqueezeBase::can_constant_fold(const OutputVector& inputs_values) const { + OV_OP_SCOPE(util_SqueezeBase_can_constant_fold); + return get_output_partial_shape(0).is_static() && !is_const_fold_disabled(); +} + +bool SqueezeBase::constant_fold(OutputVector& output_values, const OutputVector& inputs_values) { + OV_OP_SCOPE(util_SqueezeBase_constant_fold); + if (!can_constant_fold(inputs_values)) { + return false; + } + + if (auto data_const = std::dynamic_pointer_cast(inputs_values[0].get_node_shared_ptr())) { + const auto& shape = get_output_shape(0); + output_values[0] = std::make_shared(*data_const, shape); + return true; + } + return false; +} + +bool SqueezeBase::is_dynamic() const { + OV_OP_SCOPE(util_SqueezeBase_is_dynamic); + return get_output_partial_shape(0).is_dynamic(); +} + +} // namespace util +} // namespace op +} // namespace ov diff --git a/src/core/tests/CMakeLists.txt b/src/core/tests/CMakeLists.txt index c3ed58783ac946..89acd7bd1809d0 100644 --- a/src/core/tests/CMakeLists.txt +++ b/src/core/tests/CMakeLists.txt @@ -18,6 +18,7 @@ set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/threading.cpp if(SUGGEST_OVERRIDE_SUPPORTED) set_source_files_properties(ov_tensor_test.cpp type_prop/multiclass_nms.cpp + type_prop/squeeze.cpp PROPERTIES COMPILE_OPTIONS -Wno-suggest-override) endif() diff --git a/src/core/tests/type_prop/squeeze.cpp b/src/core/tests/type_prop/squeeze.cpp index c7d81fd97c2786..7be05de1876d9f 100644 --- a/src/core/tests/type_prop/squeeze.cpp +++ b/src/core/tests/type_prop/squeeze.cpp @@ -7,193 +7,261 @@ #include "common_test_utils/test_assertions.hpp" #include "common_test_utils/type_prop.hpp" #include "openvino/op/broadcast.hpp" -#include "openvino/op/constant.hpp" -#include "openvino/op/gather.hpp" -#include "openvino/op/shape_of.hpp" -#include "openvino/op/unsqueeze.hpp" #include "sequence_generator.hpp" using namespace std; using namespace ov; using namespace testing; -TEST(type_prop, squeeze_axes_invalid_value) { +namespace { + +template +class SqueezelOperator : public TypePropOpTest {}; + +using SqueezeTypes = ::testing::Types; + +TYPED_TEST_SUITE(SqueezelOperator, SqueezeTypes); + +TYPED_TEST(SqueezelOperator, squeeze_axes_invalid_value) { auto param = make_shared(element::f32, Shape{1, 2, 3, 4}); auto axes_node = make_shared(element::u64, Shape{2}, vector{0, 2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), (PartialShape{2, 3, 4})); } -TEST(type_prop, squeeze_single_input) { 
+TYPED_TEST(SqueezelOperator, squeeze_single_input) { auto param = make_shared(element::f32, PartialShape{1, -1, 3, 4}); - auto s = make_shared(param); - EXPECT_EQ(s->get_output_partial_shape(0), PartialShape::dynamic()); + const auto squeeze = this->make_op(param); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_axes_invalid_rank) { +TYPED_TEST(SqueezelOperator, squeeze_axes_invalid_rank) { auto param = make_shared(element::f32, Shape{1, 2, 3, 4}); auto axes_node = make_shared(element::i32, Shape{2, 1}, vector{0, 2}); - OV_EXPECT_THROW(auto s = make_shared(param, axes_node), + OV_EXPECT_THROW(const auto squeeze = this->make_op(param, axes_node), NodeValidationFailure, HasSubstr("Second input (axes) should not be of rank higher than 1.")); } -TEST(type_prop, squeeze_incorrect_negative_axes) { +TYPED_TEST(SqueezelOperator, squeeze_incorrect_negative_axes) { auto param = make_shared(element::f32, Shape{1, 4, 1, 4, 1, 8}); auto axes_node = make_shared(element::i64, Shape{2}, vector{-6, -10}); - OV_EXPECT_THROW(auto s = make_shared(param, axes_node), + OV_EXPECT_THROW(const auto squeeze = this->make_op(param, axes_node), ov::Exception, HasSubstr("Axis -10 out of the tensor rank range")); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_no_squeezable_dims) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_single_elem_static_shape_no_squeezable_dims) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 2, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), (PartialShape{2, 2, 4})); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_two) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_two) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); 
EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(2)); } -TEST(type_prop, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_one) { + auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(2)); } -TEST(type_prop, squeeze_data_scalar_param_axes_1D_single_elem_static_shape) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_one) { + auto param = std::make_shared(ov::element::f32, PartialShape{2, 1, 4}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_scalar_param_axes_1D_single_elem_static_shape) { auto param = std::make_shared(ov::element::f32, PartialShape{}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_equal) { +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_equal) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { +TYPED_TEST(SqueezelOperator, squeeze_data_static_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV0, 
squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(4)); } -TEST(type_prop, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(4)); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_static_param_axes_scalar_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{1, 2, 1, 3, 1}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_two_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } -TEST(type_prop, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 
3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } -TEST(type_prop, squeeze_data_dyamic_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_scalar_static_shape_squeezable_dims_more) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + +TYPED_TEST(SqueezelOperator, squeeze_data_dyamic_param_axes_1D_two_elem_static_shape_squeezable_dims_one) { auto param = std::make_shared(ov::element::f32, PartialShape{2, -1, 4}); const auto axes_node = std::make_shared(element::u64, PartialShape{2}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_three_elem_static_shape_squeezable_dims_two) { +TYPED_TEST(SqueezelOperator, squeeze_data_dynamic_param_axes_1D_three_elem_static_shape_squeezable_dims_two) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{3}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = this->make_op(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); } -TEST(type_prop, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { +TEST(TypePropSqueezelOperatorV0, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); const auto axes_node = std::make_shared(element::u64, PartialShape{1}); - const auto squeeze = std::make_shared(param, axes_node); + const auto squeeze = std::make_shared(param, axes_node); EXPECT_EQ(squeeze->get_element_type(), element::f32); EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic(3)); } +TEST(TypePropSqueezelOperatorV15, squeeze_data_dynamic_param_axes_1D_single_elem_static_shape_squeezable_dims_less) { + auto param = std::make_shared(ov::element::f32, PartialShape{-1, {2, 8}, {1, 3}, {4, -1}}); + const auto axes_node = std::make_shared(element::u64, PartialShape{1}); + const auto squeeze = std::make_shared(param, axes_node); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + 
EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); +} + using SqueezeTypePropTestParam = std::tuple, // Squeeze axis PartialShape // Expected shape @@ -288,26 +356,44 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_default_axes, TEST_P(SqueezeTest, partial_shape_dimension_propagation_const_axis_i32) { const auto axes_node = std::make_shared(element::i32, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } TEST_P(SqueezeTest, partial_shape_dimension_propagation_parameter_axes_no_data) { const auto axes_node = std::make_shared(element::u64, PartialShape{Shape{axes.size()}}); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_TRUE(squeeze->get_output_partial_shape(0).compatible(exp_shape)); + } } TEST_P(SqueezeTest, partial_shape_dimension_propagation_dynamic_axes) { const auto axes_node = std::make_shared(element::u64, PartialShape::dynamic()); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), PartialShape::dynamic()); + } } TEST_P(SqueezeTest, symbols_propagation) { @@ -321,9 +407,14 @@ TEST_P(SqueezeTest, symbols_propagation) { param = make_shared(element::f32, p_shape); const auto axes_node = std::make_shared(element::i32, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(get_shape_symbols(squeeze->get_output_partial_shape(0)), exp_symbols); + } } using SqueezeShapeTests = SqueezeTest; @@ -336,10 +427,16 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_no_axes, TEST_P(SqueezeShapeTests, shape_dimension_propagation_const_axis_i64) { param = std::make_shared(element::f64, p_shape.to_shape()); const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - 
EXPECT_EQ(squeeze->get_element_type(), element::f64); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f64); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f64); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape.to_shape()); + } } using SqueezeNoAxesTest = SqueezeTest; @@ -350,10 +447,16 @@ INSTANTIATE_TEST_SUITE_P(type_prop_shrink_shape_no_axes, PrintToStringParamName()); TEST_P(SqueezeNoAxesTest, partial_shape_dimension_propagation_no_axes) { - const auto squeeze = std::make_shared(param); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } using SqueezeScalarAxisTest = SqueezeTest; @@ -368,25 +471,35 @@ INSTANTIATE_TEST_SUITE_P( TEST_P(SqueezeScalarAxisTest, axis_value_as_vector) { const auto axes_node = std::make_shared(element::i32, Shape{}, axes); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } TEST_P(SqueezeScalarAxisTest, axis_value_as_integer) { const auto axes_node = std::make_shared(element::i32, Shape{}, axes.front()); - const auto squeeze = std::make_shared(param, axes_node); - - EXPECT_EQ(squeeze->get_element_type(), element::f32); - EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } + { + const auto squeeze = std::make_shared(param, axes_node); + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + } } using SqueezeBoundTest = UnSqueezeBoundTest; -INSTANTIATE_TEST_SUITE_P( - type_prop_bounds_propagate, - SqueezeBoundTest, +const auto test_values_in = Values(std::make_tuple(PartialShape::dynamic(6), PartialShape::dynamic(1)), std::make_tuple(PartialShape{Dimension(-1)}, PartialShape{Dimension(-1)}), std::make_tuple(PartialShape{Dimension::dynamic(), 8}, PartialShape{Dimension::dynamic()}), @@ -394,34 +507,136 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(PartialShape{Dimension(20, -1), Dimension::dynamic()}, PartialShape{{20, -1}}), std::make_tuple(PartialShape{Dimension(-1, 5), Dimension::dynamic()}, PartialShape{Dimension(-1, 5)}), std::make_tuple(PartialShape{15}, PartialShape{15}), - std::make_tuple(PartialShape{2, 6}, PartialShape{2})), - PrintToStringParamName()); + std::make_tuple(PartialShape{2, 6}, PartialShape{2})); + 
+INSTANTIATE_TEST_SUITE_P(type_prop_bounds_propagate, SqueezeBoundTest, test_values_in, PrintToStringParamName()); /** * \brief Check symbol and dynamic value propagation. * * Test use evaluate symbol, lower/upper. */ -TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value) { +TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value_squeeze_v0) { PartialShape symboled_shape = PartialShape{p_shape}; in_symbols = set_shape_symbols(symboled_shape); - constexpr auto et = element::i64; - const auto symboled_param = std::make_shared(et, symboled_shape); - const auto symboled_shape_of = std::make_shared(symboled_param); + const auto squeeze = create_squeeze(symboled_shape); + const auto bc = std::make_shared(param, squeeze); + + EXPECT_EQ(bc->get_output_partial_shape(0), exp_shape); + const auto symbols = get_shape_symbols(bc->get_output_partial_shape(0)); + EXPECT_THAT(symbols, ElementsAre(in_symbols.front())); +} - const auto zero = std::vector{0}; - const auto axis = std::make_shared(et, Shape{}, zero); - const auto indices = std::make_shared(et, Shape{}, zero); - const auto gather = std::make_shared(symboled_shape_of, indices, axis); - const auto axis_1 = std::make_shared(et, Shape{2}, std::vector{0, 1}); - const auto unsqueeze = std::make_shared(gather, axis_1); - const auto squeeze = std::make_shared(unsqueeze, axis); +/** + * \brief Check symbol and dynamic value propagation. + * + * Test use evaluate symbol, lower/upper. + */ +TEST_P(SqueezeBoundTest, propagate_symbol_and_dynamic_value_squeeze_v15) { + PartialShape symboled_shape = PartialShape{p_shape}; + + in_symbols = set_shape_symbols(symboled_shape); + const auto squeeze = create_squeeze(symboled_shape); const auto bc = std::make_shared(param, squeeze); EXPECT_EQ(bc->get_output_partial_shape(0), exp_shape); const auto symbols = get_shape_symbols(bc->get_output_partial_shape(0)); EXPECT_THAT(symbols, ElementsAre(in_symbols.front())); } + +using SqueezeAxesDynamicRankTestParam = decltype(std::tuple_cat(SqueezeTypePropTestParam{}, std::make_tuple(false))); +class SqueezeAxesDynamicRank : public ::testing::TestWithParam { +protected: + ov::PartialShape p_shape{}, exp_shape{}; + std::vector axes{}; + bool allow_axis_skip{}; +}; + +INSTANTIATE_TEST_SUITE_P( + SqueezeAxesDynamicRankTests, + SqueezeAxesDynamicRank, + ::testing::Values( + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{}, PartialShape::dynamic(), false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0}, PartialShape{2, -1, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0}, PartialShape{2, -1, 4}, true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{2}, PartialShape{1, 2, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{2}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0, 2}, PartialShape{2, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{0, 2}, PartialShape::dynamic(), true), + + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{1}, PartialShape{1, 2, -1, 4}, false), + std::make_tuple(PartialShape{1, 2, -1, 4}, std::vector{1}, PartialShape{1, 2, -1, 4}, true), + + std::make_tuple(PartialShape{2, 4}, std::vector{1}, PartialShape{2, 4}, false), + std::make_tuple(PartialShape{2, 4}, std::vector{1}, PartialShape{2, 4}, true), + + std::make_tuple(PartialShape{2, {3, 5}}, std::vector{}, PartialShape{2, {3, 5}}, false), + 
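/*
 * Reading these tuples as (input shape, axes, expected shape, allow_axis_skip),
 * the pairs that differ only in the flag show its effect. For axis 2 of
 * {1, 2, -1, 4} the dimension is dynamic, so with allow_axis_skip=false the
 * axis is assumed squeezable and the result is {1, 2, 4}, while with
 * allow_axis_skip=true the axis may or may not be removed at runtime and the
 * output collapses to PartialShape::dynamic(). The test body below builds this
 * through the three-argument v15 constructor:
 *
 *   const auto squeeze = std::make_shared<ov::op::v15::Squeeze>(param, axes_node, allow_axis_skip);
 */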
std::make_tuple(PartialShape{2, {3, 5}}, std::vector{}, PartialShape{2, {3, 5}}, true), + + std::make_tuple(PartialShape{1, 2, -1}, std::vector{0, 1}, PartialShape{2, -1}, false), + std::make_tuple(PartialShape{1, 2, -1}, std::vector{0, 1}, PartialShape{2, -1}, true), + + std::make_tuple(PartialShape{1, 2, -1}, std::vector{1}, PartialShape{1, 2, -1}, false), + std::make_tuple(PartialShape{1, 2, -1}, std::vector{1}, PartialShape{1, 2, -1}, true), + + std::make_tuple(PartialShape{1, 1, -1}, std::vector{0, 1}, PartialShape{-1}, false), + std::make_tuple(PartialShape{1, 1, -1}, std::vector{0, 1}, PartialShape{-1}, true), + + std::make_tuple(PartialShape{1, 1, -1}, std::vector{1}, PartialShape{1, -1}, false), + std::make_tuple(PartialShape{1, 1, -1}, std::vector{1}, PartialShape{1, -1}, true), + + std::make_tuple(PartialShape{1, 2, 3}, std::vector{}, PartialShape{2, 3}, false), + std::make_tuple(PartialShape{1, 2, 3}, std::vector{}, PartialShape{2, 3}, true))); + +TEST_P(SqueezeAxesDynamicRank, squeeze_axes_dynamic_rank_param) { + const auto& params = GetParam(); + p_shape = std::get<0>(params); + axes = std::get<1>(params); + exp_shape = std::get<2>(params); + allow_axis_skip = std::get<3>(params); + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::u64, Shape{axes.size()}, axes); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +TEST(SqueezeDynamicAxis, squeeze_dynamic_non_const_single_axis) { + auto p_shape = PartialShape{1, 2, -1, 4}; + auto exp_shape = PartialShape::dynamic(); + auto allow_axis_skip = true; + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::i32, Shape{1}); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +TEST(SqueezeDynamicAxis, squeeze_dynamic_non_const_axes) { + auto p_shape = PartialShape{1, 2, -1, 4}; + auto exp_shape = PartialShape::dynamic(); + auto allow_axis_skip = true; + + auto param = make_shared(element::f32, p_shape); + auto axes_node = make_shared(element::i32, PartialShape{-1}); + const auto squeeze = std::make_shared(param, axes_node, allow_axis_skip); + + EXPECT_EQ(squeeze->get_element_type(), element::f32); + EXPECT_EQ(squeeze->get_output_partial_shape(0), exp_shape); + EXPECT_EQ(squeeze->get_allow_axis_skip(), allow_axis_skip); +} + +} // namespace diff --git a/src/core/tests/visitors/op/squeeze.cpp b/src/core/tests/visitors/op/squeeze.cpp index 6eb1674b26329a..be596a5fb1dc67 100644 --- a/src/core/tests/visitors/op/squeeze.cpp +++ b/src/core/tests/visitors/op/squeeze.cpp @@ -6,7 +6,16 @@ #include "unary_ops.hpp" +namespace v0 { using Types = ::testing::Types, UnaryOperatorType>; INSTANTIATE_TYPED_TEST_SUITE_P(visitor_without_attribute, UnaryOperatorVisitor, Types, UnaryOperatorTypeName); +} // namespace v0 + +namespace v15 { +using Types = ::testing::Types, + UnaryOperatorTypeWithAttribute>; + +INSTANTIATE_TYPED_TEST_SUITE_P(visitor_single_attribute, UnaryOperatorVisitor, Types, UnaryOperatorTypeName); +} // namespace v15 diff --git a/src/core/tests/visitors/op/unary_ops.hpp b/src/core/tests/visitors/op/unary_ops.hpp index 
3bef2429983e9f..6cc2afda62e253 100644 --- a/src/core/tests/visitors/op/unary_ops.hpp +++ b/src/core/tests/visitors/op/unary_ops.hpp @@ -9,12 +9,17 @@ #include "openvino/op/parameter.hpp" #include "visitors/visitors.hpp" -template +template class UnaryOperatorType { public: using op_type = T; static constexpr ov::element::Type_t element_type = ELEMENT_TYPE; + static constexpr int expected_attr_count = ATTRIBUTES_COUNT; }; + +template +using UnaryOperatorTypeWithAttribute = UnaryOperatorType; + template class UnaryOperatorVisitor : public testing::Test {}; @@ -43,7 +48,7 @@ TYPED_TEST_P(UnaryOperatorVisitor, No_Attribute_4D) { EXPECT_NO_THROW(auto g_op_func = ov::as_type_ptr(builder.create())); - const auto expected_attr_count = 0; + const auto expected_attr_count = TypeParam::expected_attr_count; EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); } diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index bb2d5e5e84b267..1921169f83afd7 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -407,6 +407,7 @@ using IStaticShapeInferFactory = template <> const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ // opset15 + _OV_OP_SHAPE_INFER_MASK_REG(op::v15::Squeeze, ShapeInferTA, util::bit::mask(1)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::SearchSorted, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorUnpack, ShapeInferTA, util::bit::mask(0)), _OV_OP_SHAPE_INFER_MASK_REG(op::v15::StringTensorPack, ShapeInferTA, util::bit::mask(0, 1)), diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp index 69da74b10a2f45..5f790135780013 100644 --- a/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/squeeze_shape_inference_test.cpp @@ -16,14 +16,16 @@ using namespace ov; using namespace ov::intel_cpu; using namespace testing; -class SqueezeStaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { +namespace v0 { + +class SqueezeV0StaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { protected: void SetUp() override { output_shapes = ShapeVector(1); } }; -TEST_F(SqueezeStaticShapeInferenceAssertTest, no_axes) { +TEST_F(SqueezeV0StaticShapeInferenceAssertTest, no_axes) { const auto arg = std::make_shared(element::f64, PartialShape{-1, -1}); const auto axes = std::make_shared(element::i64, PartialShape{1}); const auto op = make_op(arg, axes); @@ -35,7 +37,7 @@ TEST_F(SqueezeStaticShapeInferenceAssertTest, no_axes) { HasSubstr("Check 'constant != nullptr'")); } -TEST_F(SqueezeStaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { +TEST_F(SqueezeV0StaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { const auto arg = std::make_shared(element::f64, Shape{2, 1, 3, 1}); const auto axes = std::make_shared(element::i64, Shape{2}); const auto op = make_op(arg, axes); @@ -52,11 +54,11 @@ using TestParams = std::tuple; -class SqueezeStaticShapeInferenceTest : public SqueezeStaticShapeInferenceAssertTest, +class SqueezeV0StaticShapeInferenceTest : public SqueezeV0StaticShapeInferenceAssertTest, public WithParamInterface { protected: void SetUp() override { - 
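/*
 * Background for the unary_ops.hpp change above: the visitor test previously
 * hard-coded expected_attr_count = 0, which only fits attribute-less ops like
 * v0::Squeeze. v15::Squeeze serializes one attribute (allow_axis_skip), so the
 * expected count is now threaded through the type list as an extra template
 * parameter, with UnaryOperatorTypeWithAttribute as the one-attribute
 * shorthand used by the new visitor_single_attribute suite.
 */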
SqueezeStaticShapeInferenceAssertTest::SetUp(); + SqueezeV0StaticShapeInferenceAssertTest::SetUp(); std::tie(input_shapes, axes, exp_shape) = GetParam(); output_shapes = ShapeVector(1); @@ -68,7 +70,7 @@ class SqueezeStaticShapeInferenceTest : public SqueezeStaticShapeInferenceAssert }; INSTANTIATE_TEST_SUITE_P(1d_shapes, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{1}, {1}}, std::vector{-1}, StaticShape({})), make_tuple(ShapeVector{{6}, {1}}, std::vector{-1}, StaticShape({6})), make_tuple(ShapeVector{{1}, {1}}, std::vector{0}, StaticShape({}))), @@ -76,7 +78,7 @@ INSTANTIATE_TEST_SUITE_P(1d_shapes, INSTANTIATE_TEST_SUITE_P( multi_dim_shapes, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{1, 2, 3, 1}, {2}}, std::vector{0, 3}, StaticShape({2, 3})), make_tuple(ShapeVector{{2, 1, 1, 4}, {2}}, std::vector{2, 1}, StaticShape({2, 4})), make_tuple(ShapeVector{{2, 1, 1, 4, 1}, {2}}, std::vector{0, 1, -2, -1}, StaticShape({2, 1, 4})), @@ -92,7 +94,7 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( multi_dim_shapes_repeated_axis, - SqueezeStaticShapeInferenceTest, + SqueezeV0StaticShapeInferenceTest, Values(make_tuple(ShapeVector{{2, 1, 3}, {2}}, std::vector{1, 1}, StaticShape({2, 3})), make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1}, StaticShape({3, 2})), make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1, -1}, StaticShape({3, 2})), @@ -100,7 +102,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(ShapeVector{{2, 6, 7, 8, 1}, {2}}, std::vector{-1, -1}, StaticShape({2, 6, 7, 8}))), PrintToStringParamName()); -TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_empty_const_map) { +TEST_P(SqueezeV0StaticShapeInferenceTest, shape_inference_empty_const_map) { const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); const auto op = make_op(arg, axes_node); @@ -109,8 +111,8 @@ TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_empty_const_map) { ASSERT_EQ(output_shapes.front(), exp_shape); } -TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_with_const_map) { - const auto axes_node = std::make_shared(element::i64, Shape{1}); +TEST_P(SqueezeV0StaticShapeInferenceTest, shape_inference_with_const_map) { + const auto axes_node = std::make_shared(element::i64, ov::PartialShape::dynamic()); const auto op = make_op(arg, axes_node); const auto axes_tensor = axes.empty() ? 
ov::Tensor(element::i64, ov::Shape{axes.size()}) @@ -121,3 +123,115 @@ TEST_P(SqueezeStaticShapeInferenceTest, shape_inference_with_const_map) { ASSERT_EQ(output_shapes.front(), exp_shape); } + +} // namespace v0 + +namespace v15 { + +class SqueezeV15StaticShapeInferenceAssertTest : public OpStaticShapeInferenceTest { +protected: + void SetUp() override { + output_shapes = ShapeVector(1); + } +}; + +TEST_F(SqueezeV15StaticShapeInferenceAssertTest, no_axes) { + const auto arg = std::make_shared(element::f64, PartialShape{-1, -1}); + const auto axes = std::make_shared(element::i64, PartialShape{1}); + const auto op = make_op(arg, axes); + + input_shapes = ShapeVector{{5, 6}, axes->get_shape()}; + + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes), + NodeValidationFailure, + HasSubstr("Check 'constant != nullptr'")); +} + +TEST_F(SqueezeV15StaticShapeInferenceAssertTest, parameter_static_shape_axes_no_data) { + const auto arg = std::make_shared(element::f64, Shape{2, 1, 3, 1}); + const auto axes = std::make_shared(element::i64, Shape{2}); + const auto op = make_op(arg, axes); + + input_shapes = ShapeVector{arg->get_shape(), axes->get_shape()}; + + OV_EXPECT_THROW(shape_inference(op.get(), input_shapes), + NodeValidationFailure, + HasSubstr("Check 'constant != nullptr'")); +} + +using TestParams = std::tuple, // Squeeze axes + StaticShape // Expected shape + >; + +class SqueezeV15StaticShapeInferenceTest : public SqueezeV15StaticShapeInferenceAssertTest, + public WithParamInterface { +protected: + void SetUp() override { + SqueezeV15StaticShapeInferenceAssertTest::SetUp(); + std::tie(input_shapes, axes, exp_shape) = GetParam(); + + output_shapes = ShapeVector(1); + arg = std::make_shared(element::f32, input_shapes.front().get_shape()); + } + + std::vector axes; + std::shared_ptr arg; +}; + +INSTANTIATE_TEST_SUITE_P(1d_shapes, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{1}, {1}}, std::vector{-1}, StaticShape({})), + make_tuple(ShapeVector{{6}, {1}}, std::vector{-1}, StaticShape({6})), + make_tuple(ShapeVector{{1}, {1}}, std::vector{0}, StaticShape({}))), + PrintToStringParamName()); + +INSTANTIATE_TEST_SUITE_P( + multi_dim_shapes, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{1, 2, 3, 1}, {2}}, std::vector{0, 3}, StaticShape({2, 3})), + make_tuple(ShapeVector{{2, 1, 1, 4}, {2}}, std::vector{2, 1}, StaticShape({2, 4})), + make_tuple(ShapeVector{{2, 1, 1, 4, 1}, {2}}, std::vector{0, 1, -2, -1}, StaticShape({2, 1, 4})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{0, 2, 4}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{4, 2, 0}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, std::vector{2, 0, 4}, StaticShape({3, 2})), + make_tuple(ShapeVector{{10, 1, 0, 1, 3, 1, 1}, {4}}, + std::vector{1, -1, 3, -2}, + StaticShape({10, 0, 3})), + make_tuple(ShapeVector{{10, 1, 0, 1, 3, 1, 1}, {}}, std::vector{}, StaticShape({10, 0, 3})), + make_tuple(ShapeVector{{2, 1, 7, 8, 3}, {1}}, std::vector{1}, StaticShape({2, 7, 8, 3}))), + PrintToStringParamName()); + +INSTANTIATE_TEST_SUITE_P( + multi_dim_shapes_repeated_axis, + SqueezeV15StaticShapeInferenceTest, + Values(make_tuple(ShapeVector{{2, 1, 3}, {2}}, std::vector{1, 1}, StaticShape({2, 3})), + make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1}, StaticShape({3, 2})), + make_tuple(ShapeVector{{3, 1, 2, 1}, {3}}, std::vector{1, -1, 1, -1}, StaticShape({3, 2})), + make_tuple(ShapeVector{{1, 3, 1, 2, 1}, {3}}, 
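/*
 * Note on the shape_inference_with_const_map variants in this file: the axes
 * Parameter deliberately carries no static data (its shape is even dynamic
 * after this patch), so the axes values reach shape inference only through the
 * constant-data map, keyed by input port index (port 1 is the axes input):
 *
 *   const auto constant_data = std::unordered_map<size_t, ov::Tensor>{{1, axes_tensor}};
 *   output_shapes = shape_inference(op.get(), input_shapes, constant_data);
 */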
std::vector{2, -1, 2, -1, 0}, StaticShape({3, 2})), + make_tuple(ShapeVector{{2, 6, 7, 8, 1}, {2}}, std::vector{-1, -1}, StaticShape({2, 6, 7, 8}))), + PrintToStringParamName()); + +TEST_P(SqueezeV15StaticShapeInferenceTest, shape_inference_empty_const_map) { + const auto axes_node = std::make_shared(element::i64, Shape{axes.size()}, axes); + const auto op = make_op(arg, axes_node); + + output_shapes = shape_inference(op.get(), input_shapes); + + ASSERT_EQ(output_shapes.front(), exp_shape); +} + +TEST_P(SqueezeV15StaticShapeInferenceTest, shape_inference_with_const_map) { + const auto axes_node = std::make_shared(element::i64, ov::PartialShape::dynamic()); + const auto op = make_op(arg, axes_node); + + const auto axes_tensor = axes.empty() ? ov::Tensor(element::i64, ov::Shape{axes.size()}) + : ov::Tensor(element::i64, ov::Shape{axes.size()}, axes.data()); + const auto constant_data = std::unordered_map{{1, axes_tensor}}; + + output_shapes = shape_inference(op.get(), input_shapes, constant_data); + + ASSERT_EQ(output_shapes.front(), exp_shape); +} + +} // namespace v15 diff --git a/src/plugins/template/tests/functional/op_reference/squeeze.cpp b/src/plugins/template/tests/functional/op_reference/squeeze.cpp index 8bf1902c403a1b..e397c7c403cec8 100644 --- a/src/plugins/template/tests/functional/op_reference/squeeze.cpp +++ b/src/plugins/template/tests/functional/op_reference/squeeze.cpp @@ -63,7 +63,7 @@ struct SqueezeParams { bool m_axes_node; }; -class ReferenceSqueezeLayerTest : public testing::TestWithParam, public CommonReferenceTest { +class ReferenceSqueezeLayerTestBase : public testing::TestWithParam, public CommonReferenceTest { public: void SetUp() override { const auto params = GetParam(); @@ -90,7 +90,12 @@ class ReferenceSqueezeLayerTest : public testing::TestWithParam, } private: - static std::shared_ptr CreateFunction(const SqueezeParams& params) { + virtual std::shared_ptr CreateFunction(const SqueezeParams&) = 0; +}; + +class ReferenceSqueezeLayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { const auto in = std::make_shared(params.m_input_type, params.m_input_shape); std::shared_ptr axes_node = NULL; std::shared_ptr squeeze = NULL; @@ -180,4 +185,58 @@ INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, ::testing::ValuesIn(generateCombinedParamsForSqueeze()), ReferenceSqueezeLayerTest::getTestCaseName); +class ReferenceSqueezeV15LayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { + const auto in = std::make_shared(params.m_input_type, params.m_input_shape); + std::shared_ptr axes_node = NULL; + std::shared_ptr squeeze = NULL; + if (params.m_axes_node) { + axes_node = + std::make_shared(params.m_axes_type, params.m_axes_shape, params.m_axes_value.data()); + squeeze = std::make_shared(in, axes_node); + } else { + squeeze = std::make_shared(in); + } + + return std::make_shared(squeeze, ParameterVector{in}); + } +}; + +TEST_P(ReferenceSqueezeV15LayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, + ReferenceSqueezeV15LayerTest, + ::testing::ValuesIn(generateCombinedParamsForSqueeze()), + ReferenceSqueezeV15LayerTest::getTestCaseName); + +class ReferenceSqueezeV15AttributeSetLayerTest : public ReferenceSqueezeLayerTestBase { +private: + std::shared_ptr CreateFunction(const SqueezeParams& params) override { + const auto in = 
std::make_shared(params.m_input_type, params.m_input_shape); + std::shared_ptr axes_node = NULL; + std::shared_ptr squeeze = NULL; + if (params.m_axes_node) { + axes_node = + std::make_shared(params.m_axes_type, params.m_axes_shape, params.m_axes_value.data()); + squeeze = std::make_shared(in, axes_node, true); + } else { + squeeze = std::make_shared(in, true); + } + + return std::make_shared(squeeze, ParameterVector{in}); + } +}; + +TEST_P(ReferenceSqueezeV15AttributeSetLayerTest, CompareWithHardcodedRefs) { + Exec(); +} + +INSTANTIATE_TEST_SUITE_P(smoke_Squeeze_With_Hardcoded_Refs, + ReferenceSqueezeV15AttributeSetLayerTest, + ::testing::ValuesIn(generateCombinedParamsForSqueeze()), + ReferenceSqueezeV15AttributeSetLayerTest::getTestCaseName); + } // namespace diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp index f38427b7b192ed..bcb259cd49b60f 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/single_op_graph.cpp @@ -1193,7 +1193,15 @@ std::shared_ptr generate(const std::shared_ptr & const auto axes = std::make_shared(ov::element::i64, ov::Shape{2}, std::vector{0, 2}); auto Node = std::make_shared(params.at(0), axes); ov::ResultVector results{std::make_shared(Node)}; - return std::make_shared(results, params, "SqueezeGraph"); + return std::make_shared(results, params, "SqueezeV0Graph"); +} + +std::shared_ptr generate(const std::shared_ptr &node) { + ov::ParameterVector params{std::make_shared(ov::element::f32, ov::Shape{{1, 4, 1, 1, 2}})}; + const auto axes = std::make_shared(ov::element::i64, ov::Shape{2}, std::vector{0, 2}); + auto Node = std::make_shared(params.at(0), axes); + ov::ResultVector results{std::make_shared(Node)}; + return std::make_shared(results, params, "SqueezeV15Graph"); } std::shared_ptr generate(const std::shared_ptr &node) { diff --git a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp index 2a953ad27740e6..e097a3ab957d13 100644 --- a/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp +++ b/src/tests/test_utils/common_test_utils/include/common_test_utils/type_prop.hpp @@ -7,7 +7,11 @@ #include "gmock/gmock.h" #include "openvino/core/dimension.hpp" #include "openvino/core/partial_shape.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/gather.hpp" #include "openvino/op/parameter.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/unsqueeze.hpp" #define EXPECT_HAS_SUBSTRING(haystack, needle) EXPECT_PRED_FORMAT2(testing::IsSubstring, needle, haystack) @@ -56,6 +60,23 @@ class UnSqueezeBoundTest : public testing::WithParamInterface(ov::element::f32, ov::PartialShape{1}); } + template + auto create_squeeze(ov::PartialShape symboled_shape) -> std::shared_ptr { + constexpr auto et = ov::element::i64; + const auto symboled_param = std::make_shared(et, symboled_shape); + const auto symboled_shape_of = std::make_shared(symboled_param); + + const auto zero = std::vector{0}; + const auto axis = std::make_shared(et, ov::Shape{}, zero); + const auto indices = std::make_shared(et, ov::Shape{}, zero); + const auto gather = std::make_shared(symboled_shape_of, 
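/*
 * create_squeeze builds the value-propagation chain
 *   ShapeOf(param) -> Gather(index 0, axis 0) -> Unsqueeze(axes {0, 1}) -> Squeeze(axis 0)
 * so that SqueezeBoundTest can check that lower/upper bounds and symbols of
 * the first input dimension survive the Unsqueeze/Squeeze round trip and land
 * in the Broadcast target shape. The template parameter selects which Squeeze
 * opset terminates the chain, letting the v0 and v15 bound tests share one
 * helper.
 */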
indices, axis); + const auto axis_1 = std::make_shared(et, ov::Shape{2}, std::vector{0, 1}); + const auto unsqueeze = std::make_shared(gather, axis_1); + const auto squeeze = std::make_shared(unsqueeze, axis); + + return squeeze; + } + ov::TensorSymbol in_symbols; }; From 1ec99ee520cdd6fe5dd25c83a36d440170b4f888 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Wed, 30 Oct 2024 20:37:51 +0100 Subject: [PATCH 064/120] [PT FE] Support torch 2.5.1 (#27334) ### Details: - *Support torch 2.5.1* - *Update test requirements* ### Tickets: - *ticket-id* Signed-off-by: Maxim Vafin --- tests/constraints.txt | 2 +- tests/requirements_pytorch | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/constraints.txt b/tests/constraints.txt index c6e2e5e65f96fe..2272151565ca8a 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -28,5 +28,5 @@ networkx<=3.3 flax<=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torch~=2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torch~=2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index c2873210003b7d..56446beba12600 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -3,13 +3,13 @@ # optimum still requires numpy<2.0.0 numpy==1.26.4; python_version < "3.12" numpy==2.1.1; python_version >= "3.12" -torch==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torch==2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.20.0; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.20.1; platform_system != "Darwin" or platform_machine != "x86_64" torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" -torchaudio==2.5.0; platform_system != "Darwin" or platform_machine != "x86_64" +torchaudio==2.5.1; platform_system != "Darwin" or platform_machine != "x86_64" torchaudio==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 From a6eb53506e9b7fe6a29f128f16f3988da870bec6 Mon Sep 17 00:00:00 2001 From: virajwad <84867530+virajwad@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:50:41 -0700 Subject: [PATCH 065/120] [ONNX] Added QuickGelu from com.microsoft domain (#27238) ### Details: - Microsoft Contrib Operator "QuickGelu" for ONNX RT ### Tickets: - CVS-152783 --------- Co-authored-by: Georgy Krivoruchko --- .../src/op/com.microsoft/quick_gelu.cpp | 58 +++++++++++++++++++ .../models/com.microsoft/quick_gelu.prototxt | 52 +++++++++++++++++ .../tests/onnx_import_com_microsoft.in.cpp | 26 +++++++++ 3 files changed, 136 insertions(+) create mode 100644 src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp create mode 100644 src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt diff --git a/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp b/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp new file mode 100644 index 00000000000000..c4144be9b5ff44 --- /dev/null +++ b/src/frontends/onnx/frontend/src/op/com.microsoft/quick_gelu.cpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "core/operator_set.hpp" 
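/*
 * QuickGelu computes x * Sigmoid(alpha * x). A minimal scalar reference of the
 * math this graph builds (a sketch for clarity with a hypothetical helper
 * name; not part of the patch):
 *
 *   #include <cmath>
 *   float quick_gelu_ref(float x, float alpha) {
 *       return x / (1.0f + std::exp(-alpha * x));  // x * sigmoid(alpha * x)
 *   }
 *
 * With alpha = 0.9974269f, the value used in the test model below,
 * quick_gelu_ref(1.0f, alpha) is roughly 0.7305524f, matching the first
 * expected output in the new test.
 */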
+#include "exceptions.hpp"
+#include "openvino/frontend/exception.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/sigmoid.hpp"
+#include "utils/common.hpp"
+
+using namespace ov::op;
+
+namespace ov {
+namespace frontend {
+namespace onnx {
+namespace com_microsoft {
+namespace opset_1 {
+ov::OutputVector quick_gelu(const ov::frontend::onnx::Node& node) {
+    // Original Documentation:
+    // https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.QuickGelu
+    // Goal: Compute x * Sigmoid(alpha * x)
+    common::default_op_checks(node, 1);
+
+    const auto inputs = node.get_ov_inputs();
+    const auto& x = inputs[0];
+
+    // Constrain input type to float16, float, double (f64), bfloat16
+    auto element_type = x.get_element_type();
+    CHECK_VALID_NODE(node,
+                     element_type == ov::element::f16 || element_type == ov::element::f32 ||
+                         element_type == ov::element::f64 || element_type == ov::element::bf16,
+                     "Unsupported input x type, accepted FP16, FP32, FP64, BFP16 but got: ",
+                     element_type);
+
+    // Get attribute from node
+    const float alpha = node.get_attribute_value<float>("alpha");
+
+    // Numpy broadcasting rule is automatically applied with mismatched shapes according to:
+    // https://docs.openvino.ai/2022.3/openvino_docs_ops_arithmetic_Multiply_1.html "Tensor with dimension of size 1
+    // will be implicitly broadcasted to match the size of the second tensor." Convert alpha to tensor with size 1
+    const auto alpha_tensor = std::make_shared<v0::Constant>(ov::element::f32, Shape{1}, alpha);
+
+    auto alpha_x = std::make_shared<v0::Multiply>(alpha_tensor, x);
+    auto sig_alpha_x = std::make_shared<v0::Sigmoid>(alpha_x);
+    auto result = std::make_shared<v0::Multiply>(x, sig_alpha_x);
+
+    return {result};
+} // func end
+
+ONNX_OP("QuickGelu", OPSET_SINCE(1), com_microsoft::opset_1::quick_gelu, MICROSOFT_DOMAIN);
+
+} // namespace opset_1
+} // namespace com_microsoft
+} // namespace onnx
+} // namespace frontend
+} // namespace ov
diff --git a/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt b/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt
new file mode 100644
index 00000000000000..4fb110fd485833
--- /dev/null
+++ b/src/frontends/onnx/tests/models/com.microsoft/quick_gelu.prototxt
@@ -0,0 +1,52 @@
+ir_version: 3
+producer_name: "OpenVINO ONNX Frontend"
+graph {
+  name: "test_quick_gelu"
+  node {
+    input: "X"
+    output: "Y"
+    op_type: "QuickGelu"
+    attribute {
+      name: "alpha"
+      f: 0.9974269270896912
+      type: FLOAT
+    }
+    domain: "com.microsoft"
+  }
+  input {
+    name: "X"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 2
+          }
+          dim {
+            dim_value: 5
+          }
+        }
+      }
+    }
+  }
+  output {
+    name: "Y"
+    type {
+      tensor_type {
+        elem_type: 1
+        shape {
+          dim {
+            dim_value: 2
+          }
+          dim {
+            dim_value: 5
+          }
+        }
+      }
+    }
+  }
+}
+opset_import {
+  domain: "com.microsoft"
+  version: 1
+}
\ No newline at end of file
diff --git a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
index da8189926a4546..900fc025d8d9ab 100644
--- a/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
+++ b/src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1330,3 +1330,29 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17) {
     }
     test_case.run();
 }
+
+OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) {
+    const auto model = convert_model("com.microsoft/quick_gelu.onnx");
+    auto test_case = ov::test::TestCase(model, s_device);
+
+    const std::vector<float> input_X{1, 2, 3, 
4, 5, 6, 7, 8, 9, 10}; + const std::vector output{0.7305524f, + 1.7605114f, + 2.8566725f, + 3.9273243f, + 4.9661055f, + 5.984934f, + 6.9935064f, + 7.997261f, + 8.998864f, + 9.999535f}; + + test_case.add_input(Shape{2, 5}, input_X); + test_case.add_expected_output(Shape{2, 5}, output); + + if (std::string("${BACKEND_NAME}") == std::string("IE_GPU")) { + test_case.run_with_tolerance_as_fp(0.0001f); + } else { + test_case.run(); + } +} From e44ea54991ffd6fe3b1e6b085676672f0fb4dc7b Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Wed, 30 Oct 2024 21:58:17 +0000 Subject: [PATCH 066/120] [NPUW] Revert check in preemptive tensor set (#27345) https://github.com/openvinotoolkit/openvino/pull/27313 --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0070e6be2d2041..26363e66e55d2a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -378,7 +378,9 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_compiled_submodels[i]; - if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) { + // FIXME: figure out our cases and if this should be replaced with && + // Note: replaced_by is utilized below unconditionally + if (!comp_model_desc.compiled_model || !comp_model_desc.replaced_by) { continue; } const auto real_idx = comp_model_desc.replaced_by.value(); From b5a59532a8b36a8bcc5066d78c3c83ff258f5f01 Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Thu, 31 Oct 2024 09:49:13 +0900 Subject: [PATCH 067/120] [GPU] Support int8 dyn-quan FC (#27027) ### Details: - Support FC dynamic quantize for 8Bit Asym model - Enable SLM for 8bit weight ### Tickets: - CVS-152990 --------- Signed-off-by: Min, Byung-il Signed-off-by: Min, Byungil --- .../fully_connected_gpu_bf_tiled.cl | 372 +++++++++++------- .../include/batch_headers/int4_utils.cl | 7 + .../fully_connected_kernel_bf_tiled.cpp | 93 +++-- .../test_cases/fully_connected_gpu_test.cpp | 160 +++++++- 4 files changed, 456 insertions(+), 176 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 70c55bfb73b8f5..ef4cc76650e0f3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -22,36 +22,47 @@ #if FC_KERNEL_DYNAMIC_QUANTIZE KERNEL(quantize_input)( const __global INPUT0_TYPE* input, - __global char* quantized_input, - __global INPUT0_TYPE* de_quan_scale) { + __global DQ_TYPE* quantized_input, + __global INPUT0_TYPE* quan_var +) { const uint offset = get_global_id(0); const uint input_offset = offset * QUANTIZE_GROUP_SIZE; const uint quantize_block = QUANTIZE_GROUP_SIZE / 4; - half4 input_0[quantize_block]; - char4 quantized_value[quantize_block]; - half max[quantize_block]; + MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_0[quantize_block]; + MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value[quantize_block]; + INPUT0_TYPE max[quantize_block]; unroll_for (uint i = 0 ; i < quantize_block ; ++i) { input_0[i] = vload4(0, &input[input_offset + i * 4]); max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), 
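/*
 * quantize_input produces, per group of QUANTIZE_GROUP_SIZE activations,
 *   scale = max(|x_i|) / 127   (kept away from zero by the 0.001 seed)
 *   q_i   = round_rte(x_i / scale)
 * and, for asymmetric int8 weights, additionally sum(q_i). quan_var then
 * stores these as (scale, quantized_sum) pairs per group. A scalar sketch of
 * the same math (hypothetical helper, not part of this kernel):
 *
 *   void quantize_group(const half* x, int n, char* q, half* scale, half* qsum) {
 *       half m = 0.001h;
 *       for (int i = 0; i < n; ++i) m = fmax(m, fabs(x[i]));
 *       *scale = m / 127;
 *       half s = 0;
 *       for (int i = 0; i < n; ++i) { q[i] = convert_char_rte(x[i] / *scale); s += q[i]; }
 *       *qsum = s;
 *   }
 */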
fmax(fabs(input_0[i][2]), fabs(input_0[i][3]))); } - half max_value = 0.001; - for (uint i = 0 ; i < quantize_block; i+=8) { - half temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])), + INPUT0_TYPE max_value = 0.001; + for (uint i = 0 ; i < quantize_block ; i+=8) { + INPUT0_TYPE temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])), fmax(fmax(max[i+4], max[i+5]), fmax(max[i+6], max[i+7]))); max_value = fmax(max_value, temp); } - half quan_scale = max_value / 128; - - unroll_for (uint i = 0 ; i < quantize_block ; ++i) { - quantized_value[i] = CAT(convert_, MAKE_VECTOR_TYPE(char, INPUT_LOAD_SIZE))(input_0[i] / (half4)quan_scale); + half quan_scale = (half)max_value / 127; + #if COMPRESSED_WEIGHTS_INT8 + half quantized_sum = 0; + #endif + for (uint i = 0 ; i < quantize_block ; ++i) { + half4 buff = input_0[i] / (half4)quan_scale; + quantized_value[i] = CAT(CAT(convert_, MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE)), _rte)(buff); + #if COMPRESSED_WEIGHTS_INT8 + quantized_sum += (buff[0] + buff[1] + buff[2] + buff[3]); + #endif vstore4(quantized_value[i], 0, &quantized_input[input_offset + i * 4]); } - de_quan_scale[offset] = quan_scale; + // Pair of quantizing_scale and quantized activation_sum for each group + quan_var[offset * 2] = quan_scale; + #if COMPRESSED_WEIGHTS_INT8 + quan_var[(offset * 2) + 1] = quantized_sum; + #endif } #else // !FC_KERNEL_DYNAMIC_QUANTIZE @@ -189,6 +200,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #else uint gid = (uint)get_group_id(0); #endif + uint sglid = (uint)get_sub_group_local_id(); // Dispatch as bs_fs_bsv_fsv, where bsv = DISPATCH_BSV and fsv = DISPATCH_FSV. @@ -212,10 +224,9 @@ inline void FUNC(fc_bf_tiled_kernel_default)( ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; INPUT_VEC_TYPE in_0[TILE_B] = { }; -#if !USE_SLM - FILTER_VEC_TYPE wei = 0; -#endif - + #if !USE_SLM || !COMPRESSED_WEIGHTS_INT4 + FILTER_VEC_TYPE wei = 0; + #endif #if OUTPUT_3D uint out_b0 = out_b / OUTPUT_FEATURE_NUM; @@ -743,19 +754,31 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // ===================================================================================================================================== } + + + // Dyc Quantize #if USE_SLM && DYNAMIC_QUANTIZE -#define PACKED_DQ_TYPE int -#define DQ_VEC_TYPE MAKE_VECTOR_TYPE(DQ_TYPE, TILE_IFM) -#define DQ_SLM_FILTER_VEC MAKE_VECTOR_TYPE(DQ_TYPE, 4) + +#if COMPRESSED_WEIGHTS_INT4 + #define SLM_WEIGHT_TYPE DQ_TYPE +#else + #define SLM_WEIGHT_TYPE FILTER_TYPE +#endif + +#define PACKED_DQ_TYPE uint +#define ACCUM_DQ_TYPE int #define DQ_SLM_FILTER_PACKED_VEC MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE) -#define DQ_SLM_FILTER_UNPACKED_VEC MAKE_VECTOR_TYPE(DQ_TYPE, FILTER_ELEMENTS_PER_LOAD) -#define DQ_FILTER_VEC_TYPE MAKE_VECTOR_TYPE(DQ_TYPE, TILE_K_OFM) +#define SLM_WEIGHT_VEC MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, INPUT_LOAD_SIZE) +#define SLM_WEIGHT_UNPACKED_VEC MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, FILTER_ELEMENTS_PER_LOAD) +#define WEIGHT_VEC_TYPE MAKE_VECTOR_TYPE(SLM_WEIGHT_TYPE, TILE_K_OFM) +#define MAKE_DQ_TYPE_VEC(x) MAKE_VECTOR_TYPE(DQ_TYPE, x) #define TO_DQ_TYPE(x) CAT(CAT(convert_, DQ_TYPE),_sat)(x) #define TO_DQ_VEC_TYPE(x) CAT(convert_, DQ_VEC_TYPE)(x) -#define TO_DQ_SLM_FILTER_UNPACKED_VEC(x) CAT(convert_, DQ_SLM_FILTER_UNPACKED_VEC)(x) -#define TO_DQ_FILTER_VEC_TYPE(x) CAT(convert_, DQ_FILTER_VEC_TYPE)(x) +#define TO_ACCUM_DQ_TYPE(x) CAT(convert_, ACCUM_DQ_TYPE)(x) +#define TO_SLM_WEIGHT_UNPACKED_VEC(x) CAT(convert_, SLM_WEIGHT_UNPACKED_VEC)(x) +#define TO_WEIGHT_VEC_TYPE(x) 
CAT(convert_, WEIGHT_VEC_TYPE)(x) #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) @@ -764,8 +787,8 @@ inline void FUNC(fc_bf_tiled_kernel_default)( inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, - __global char* quantized_input, - __global INPUT0_TYPE* scale, + __global DQ_TYPE* quantized_input, + __global INPUT0_TYPE* quan_var, // pair of params for each quantizing group : scale, activation_sum #if DECOMPRESSION_SCALE_TERM const __global DECOMPRESSION_SCALE_TYPE* decompression_scale, #endif @@ -774,7 +797,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif __global OUTPUT_TYPE* output, const __global FILTER_TYPE* weights - , __local int* wei_local_mem + , __local uint* wei_local_mem #if BIAS_TERM , const __global BIAS_TYPE* biases #endif @@ -801,28 +824,32 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint out_f = gid * (TILE_OFM * SIMD); uint out_b = LWS_BATCHES * TILE_B * (uint)get_group_id(2) + local_id * TILE_B; -#if OUTPUT_3D - uint out_b0 = out_b / OUTPUT_FEATURE_NUM; - uint out_b1 = out_b % OUTPUT_FEATURE_NUM; - uint input_offset = out_b0 * INPUT0_BATCH_PITCH + out_b1 * INPUT0_FEATURE_PITCH + INPUT0_OFFSET; -#else - uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; -#endif + #if OUTPUT_3D + uint out_b0 = out_b / OUTPUT_FEATURE_NUM; + uint out_b1 = out_b % OUTPUT_FEATURE_NUM; + uint input_offset = out_b0 * INPUT0_BATCH_PITCH + out_b1 * INPUT0_FEATURE_PITCH + INPUT0_OFFSET; + #else + uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; + #endif -#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - const int power_of_two_for_simd = 5; - const int power_of_two_for_osv = 6; - const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); - const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); - const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; - // out_f(32) : 0 * osv_weight_stride + 32; - // out_f(64) : 64 * osv_weight_stride + 0; - // out_f(128) : 64 * osv_weight_stride + 32; - // ... - uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; -#else - uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); -#endif + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + const int power_of_two_for_simd = 5; + const int power_of_two_for_osv = 6; + const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); + const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); + const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; + // out_f(32) : 0 * osv_weight_stride + 32; + // out_f(64) : 64 * osv_weight_stride + 0; + // out_f(128) : 64 * osv_weight_stride + 32; + // ... 
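/*
 * The macro block above is what generalizes the SLM path to both compressed
 * formats: SLM_WEIGHT_TYPE is the element type staged in shared local memory,
 * DQ_TYPE (char) for int4 weights that must be unpacked first, and the native
 * FILTER_TYPE for int8 weights, with SLM_WEIGHT_VEC / WEIGHT_VEC_TYPE building
 * the matching vector types on top of it.
 */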
+ uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; + #else + uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); + #endif + #else + uint weights_offset = out_f * INPUT_ELEMENTS_COUNT; + #endif ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; @@ -831,38 +858,42 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( PACKED_DQ_TYPE packed_in_0[HALF_TILE_B] = { }; // Packing char4 inputs to 1 integer INPUT0_TYPE de_quantize_scale[TILE_B]; -#if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 - #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); - #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_scale = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_SCALE_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; + #if COMPRESSED_WEIGHTS_INT8 + INPUT0_TYPE activation_sum[TILE_B] = { }; #endif - ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); -#endif + #if COMPRESSED_WEIGHTS && DECOMPRESSION_SCALE_GROUPS_NUM == 1 + #if DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_scale = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_SCALE_TYPE, TILE_OFM, decompression_scale, out_f)); + #elif DECOMPRESSION_SCALE_LENGTH > 1 && DECOMPRESSION_SCALE_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_scale = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_SCALE_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_scale))[of] = decompression_scale[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_scale = decompression_scale[0]; + #endif -#if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR - #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 - ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); - #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 - ACCUMULATOR_VEC_TYPE d_zp = 0; - unroll_for(uint of = 0; of < TILE_OFM; ++of) { - uint offset = out_f + of*SIMD + get_sub_group_local_id(); - if (offset < DECOMPRESSION_ZP_LENGTH) - ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; - } - #else - ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; + ACCUMULATOR_TYPE* d_scales = (ACCUMULATOR_TYPE*)(&d_scale); + #endif + + #if COMPRESSED_WEIGHTS && DECOMPRESSION_ZP_TERM && DECOMPRESSION_ZP_GROUPS_NUM == 1 && !DECOMPRESSION_ZP_SCALAR + #if DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) == 0 + ACCUMULATOR_VEC_TYPE d_zp = TO_ACCUMULATOR_VEC_TYPE(BLOCK_READN(DECOMPRESSION_ZP_TYPE, TILE_OFM, decompression_zp, out_f)); + #elif DECOMPRESSION_ZP_LENGTH > 1 && DECOMPRESSION_ZP_LENGTH % (TILE_OFM * SIMD) != 0 + ACCUMULATOR_VEC_TYPE d_zp = 0; + unroll_for(uint of = 0; of < TILE_OFM; ++of) { + uint offset = out_f + of*SIMD + get_sub_group_local_id(); + if (offset < DECOMPRESSION_ZP_LENGTH) + ((ACCUMULATOR_TYPE*)(&d_zp))[of] = decompression_zp[offset]; + } + #else + ACCUMULATOR_VEC_TYPE d_zp = decompression_zp[0]; 
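/*
 * Per-row weight pitch above: int4 weights pack two values per byte, so each
 * output channel advances by INPUT_ELEMENTS_COUNT / 2 bytes, while the new
 * int8 branch simply uses the full INPUT_ELEMENTS_COUNT per output channel.
 */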
+ #endif + ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); #endif - ACCUMULATOR_TYPE* d_zps = (ACCUMULATOR_TYPE*)(&d_zp); -#endif // ===================================================================================================================================== // Main computation loop @@ -871,7 +902,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction - const uint scale_pitch = TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE; + const uint scale_pitch = (TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE); MAKE_VECTOR_TYPE(int, TILE_B) acc_tmp[TILE_OFM] = { }; __attribute__((opencl_unroll_hint(1))) for (uint ni = 0; ni < iterations; ++ni) { @@ -881,14 +912,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // Load quantizing info from pre-quantizing kernel tiled_input_0[bi] = vload4(0, &quantized_input[in_offset]); // Packing : Get 4(B)x4(K) integer vector (packing to 4x1 vector) - packed_in_0[bi] = as_int(tiled_input_0[bi]); + packed_in_0[bi] = as_uint(tiled_input_0[bi]); // Next batch in_offset += (TILE_IN_B_PITCH * 2); #if NUM_LOOP_IN_DYN_QUAN_GROUP == 1 - de_quantize_scale[bi * 2] = scale[scale_offset]; - de_quantize_scale[bi * 2 + 1] = scale[scale_offset+ scale_pitch]; + de_quantize_scale[bi * 2] = quan_var[scale_offset * 2]; + de_quantize_scale[bi * 2 + 1] = quan_var[scale_offset * 2 + scale_pitch * 2]; + #if COMPRESSED_WEIGHTS_INT8 + // Need additional accumulation of quantized activation along the dyn-quan group + // to use i8 multiplier for int8 weight + activation_sum[bi * 2] = quan_var[scale_offset * 2 + 1]; + activation_sum[bi * 2 + 1] = quan_var[scale_offset * 2 + 1 + scale_pitch * 2]; + #endif scale_offset += (scale_pitch * 2); #endif } @@ -896,7 +933,10 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #if NUM_LOOP_IN_DYN_QUAN_GROUP > 1 if (ni % NUM_LOOP_IN_DYN_QUAN_GROUP == 0) { unroll_for (uint bi = 0; bi < TILE_B; ++bi) { - de_quantize_scale[bi] = scale[scale_offset]; + de_quantize_scale[bi] = quan_var[scale_offset * 2]; + #if COMPRESSED_WEIGHTS_INT8 + activation_sum[bi] = quan_var[scale_offset * 2 + 1]; + #endif scale_offset += scale_pitch; } } @@ -916,49 +956,64 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( barrier(CLK_LOCAL_MEM_FENCE); #endif - __local int* char_slm_weight = (__local int*)wei_local_mem; + __local uint* char_slm_weight = (__local uint*)wei_local_mem; - #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #else + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif #else - uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * TILE_K_OFM_PACKED; #endif uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2; - // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE + #if COMPRESSED_WEIGHTS_INT8 + ACCUMULATOR_TYPE wei_zp[TILE_OFM] = { }; + #endif + + // DQ_DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : 
scale is ACCUMULATOR_VAL_ONE unroll_for(uint load_iter = 0; load_iter < FILTER_LOAD_ITERS; ++load_iter) { - #if FILTER_LAYOUT_OS_IYX_OSV16 - SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); - SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16))); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; - // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking - dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); - dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); - #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 - SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); - SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD))); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp; - dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); - dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); - dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01; - dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45; - dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23; - dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67; - #else - SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); - DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)); + #if COMPRESSED_WEIGHTS_INT4 + #if FILTER_LAYOUT_OS_IYX_OSV16 + SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16))); + SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked; + // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. 
So no need to transpose while unpacking
+                    dq_wei_unpacked.s0123 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)));
+                    dq_wei_unpacked.s4567 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)));
+                #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+                    SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
+                    SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
+                    SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked;
+                    SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked_tmp;
+                    dq_wei_unpacked_tmp.s0123 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)));
+                    dq_wei_unpacked_tmp.s4567 = (UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)));
+                    dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
+                    dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
+                    dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
+                    dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
+                #else
+                    SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
+                    SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked = (UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)));
+                #endif
+            #else  // COMPRESSED_WEIGHTS_INT8
+                SLM_WEIGHT_UNPACKED_VEC dq_wei_unpacked;
+                WEIGHT_VEC_TYPE wei_packed = TO_WEIGHT_VEC_TYPE(FILTER_BLOCK_READ(weights, weights_idx));
+                dq_wei_unpacked.s0123 = wei_packed.s0246;
+                dq_wei_unpacked.s4567 = wei_packed.s1357;
             #endif

-            // Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
+            // Calculate zero-point and scale only for DQ_DECOMPRESSION_SCALE_POST_OP enabled
             // Calculate weight : w = (w - dzp) * ds
             // if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO.
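
Worth spelling out the algebra the INT8 path relies on here: since sum(a_i * (w_i - dzp)) = sum(a_i * w_i) - dzp * sum(a_i), an integer IMAD accumulator plus a precomputed per-group activation sum is enough, and the zero-point can be folded out after the inner loop. A minimal C++ check of that identity (values are illustrative, not taken from the kernel):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
        const std::vector<int32_t> act = {3, -1, 4, 1};  // quantized activations
        const std::vector<int32_t> wei = {7, 2, -5, 6};  // quantized weights
        const int32_t zp = 3;                            // weight zero-point

        int32_t per_element = 0, imad = 0, act_sum = 0;
        for (std::size_t i = 0; i < act.size(); ++i) {
            per_element += act[i] * (wei[i] - zp);  // naive zero-point handling
            imad        += act[i] * wei[i];         // what IMAD accumulates
            act_sum     += act[i];                  // the precomputed activation_sum
        }
        // One subtraction per group replaces one subtraction per element.
        assert(per_element == imad - zp * act_sum);
        return 0;
    }
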
- #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_TERM && !COMPRESSED_WEIGHTS_INT8 #if DECOMPRESSION_ZP_SCALAR - DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE); + SLM_WEIGHT_UNPACKED_VEC dzp = (SLM_WEIGHT_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE); dq_wei_unpacked -= dzp; #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 - DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + SLM_WEIGHT_TYPE* w = (SLM_WEIGHT_TYPE*)(&dq_wei_unpacked); const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE; unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { const uint offset_ofm = out_f + fi*SIMD + sglid; @@ -966,11 +1021,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE + kii; const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; - w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - TO_DQ_TYPE(decompression_zp[zp_offset]); + w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - CAT(CAT(convert_, SLM_WEIGHT_TYPE),_rte)(decompression_zp[zp_offset]); } } #else - DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked); + SLM_WEIGHT_TYPE* w = (SLM_WEIGHT_TYPE*)(&dq_wei_unpacked); unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) { w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - d_zps[fi % DECOMPRESSION_ZP_LENGTH]; @@ -979,29 +1034,58 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif #endif + #if COMPRESSED_WEIGHTS_INT8 + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + #if DECOMPRESSION_ZP_TERM + #if DECOMPRESSION_ZP_SCALAR + wei_zp[fi] = (TO_ACCUMULATOR_TYPE)(DECOMPRESSION_ZP_VALUE); + #elif DECOMPRESSION_ZP_GROUPS_NUM > 1 + #if FILTER_LOAD_BLOCK_SIZE % DECOMPRESSION_ZP_GROUP_SIZE != 0 + #error "FC bf_tiled kernel: Not support DECOMPRESSION_ZP_GROUPS_NUM > 1" + #endif + + const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE; + const uint offset_ofm = out_f + fi*SIMD + sglid; + const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE; + const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH + + (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH; + wei_zp[fi] = TO_ACCUMULATOR_TYPE(decompression_zp[zp_offset]); + #else + wei_zp[fi] = TO_ACCUMULATOR_TYPE(d_zps[fi % DECOMPRESSION_ZP_LENGTH]); + #endif + #else + wei_zp[fi] = ACCUMULATOR_VAL_ZERO; + #endif + } + #endif + #if FILTER_LOAD_BLOCK_SIZE == 2 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); #elif FILTER_LOAD_BLOCK_SIZE == 4 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); - DQ_SLM_FILTER_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; - char_slm_weight[wei_local_idx+1] = as_int(wei_2); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); + SLM_WEIGHT_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; + char_slm_weight[wei_local_idx+1] = as_uint(wei_2); #elif FILTER_LOAD_BLOCK_SIZE == 8 - DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; - char_slm_weight[wei_local_idx] = as_int(wei_1); - DQ_SLM_FILTER_VEC wei_2 
= {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; - char_slm_weight[wei_local_idx+1] = as_int(wei_2); - DQ_SLM_FILTER_VEC wei_3 = {dq_wei_unpacked.s89, dq_wei_unpacked.sab}; - char_slm_weight[wei_local_idx+2] = as_int(wei_3); - DQ_SLM_FILTER_VEC wei_4 = {dq_wei_unpacked.scd, dq_wei_unpacked.sef}; - char_slm_weight[wei_local_idx+3] = as_int(wei_4); + SLM_WEIGHT_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23}; + char_slm_weight[wei_local_idx] = as_uint(wei_1); + SLM_WEIGHT_VEC wei_2 = {dq_wei_unpacked.s45, dq_wei_unpacked.s67}; + char_slm_weight[wei_local_idx+1] = as_uint(wei_2); + SLM_WEIGHT_VEC wei_3 = {dq_wei_unpacked.s89, dq_wei_unpacked.sab}; + char_slm_weight[wei_local_idx+2] = as_uint(wei_3); + SLM_WEIGHT_VEC wei_4 = {dq_wei_unpacked.scd, dq_wei_unpacked.sef}; + char_slm_weight[wei_local_idx+3] = as_uint(wei_4); #else #error "FC bf_tiled kernel: unsupported FILTER_LOAD_BLOCK_SIZE for SLM kernel" #endif wei_local_idx += SIMD * (FILTER_LOAD_BLOCK_SIZE/2); - weights_idx += SIMD * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #if COMPRESSED_WEIGHTS_INT8 + weights_idx += SIMD * TILE_K_OFM_PACKED; + #else + weights_idx += SIMD * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif } wei_local_idx = sglid * 2; @@ -1014,11 +1098,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( #endif // Compute input * weight : packed char4 type - char8 weight = vload8(0, (__local char *)(&char_slm_weight[wei_local_idx + 16*2*ki])); - char4 first_weight = weight.s0123; - char4 second_weight = weight.s4567; + WEIGHT_VEC_TYPE weight = vload8(0, (__local SLM_WEIGHT_TYPE *)(&char_slm_weight[wei_local_idx + 16*2*ki])); + SLM_WEIGHT_VEC first_weight = weight.s0123; + SLM_WEIGHT_VEC second_weight = weight.s4567; unroll_for (uint bi = 0; bi < TILE_B; ++bi) { - char4 input_val = as_char4(_sub_group_shuffle(packed_in_0[bi / 2], (bi % 2) * 8 + ki)); + MAKE_DQ_TYPE_VEC(4) input_val = AS_DQ_TYPE_4(_sub_group_shuffle(packed_in_0[bi / 2], (bi % 2) * 8 + ki)); acc_tmp[0][bi] = imad_SW(acc_tmp[0][bi], input_val, first_weight); acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight); } @@ -1038,7 +1122,12 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #if COMPRESSED_WEIGHTS_INT8 + ACCUM_DQ_TYPE modified_calc_buff = ((int *)(&acc_tmp[fi]))[bi] - ((float)(wei_zp[fi]) * (convert_float)(activation_sum[bi])); + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += (convert_half)(convert_float(modified_calc_buff) * (float)ds * (float)de_quantize_scale[bi]); + #else + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #endif acc_tmp[fi][bi] = 0; } } @@ -1060,7 +1149,12 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH]; #endif - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #if COMPRESSED_WEIGHTS_INT8 + ACCUM_DQ_TYPE modified_calc_buff = ((int *)(&acc_tmp[fi]))[bi] - ((float)(wei_zp[fi]) * (convert_float)(activation_sum[bi])); + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += (convert_half)(convert_float(modified_calc_buff) * (float)ds * (float)de_quantize_scale[bi]); + #else + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += convert_half(((int *)(&acc_tmp[fi]))[bi]) * ds * de_quantize_scale[bi]; + #endif acc_tmp[fi][bi] = 0; } } @@ -1169,13 +1263,13 @@ KERNEL(fc)( , FUSED_OPS_DECLS #endif #if 
DYNAMIC_QUANTIZE - , __global char* quantized_input - , __global INPUT0_TYPE* de_quan_scale + , __global DQ_TYPE* quantized_input + , __global INPUT0_TYPE* quan_var #endif ) { #if USE_SLM #if DYNAMIC_QUANTIZE - __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD]; + __local uint dq_wei_local_mem[SIMD * TILE_OFM * SIMD]; #else __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD]; #endif @@ -1321,7 +1415,7 @@ KERNEL(fc)( OPTIONAL_SHAPE_INFO_TENSOR input, quantized_input, - de_quan_scale, + quan_var, #if DECOMPRESSION_SCALE_TERM decompression_scale, #endif @@ -1368,7 +1462,7 @@ KERNEL(fc)( OPTIONAL_SHAPE_INFO_TENSOR input, quantized_input, - de_quan_scale, + quan_var, #if DECOMPRESSION_SCALE_TERM decompression_scale, #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl index 68d778475f5601..99ff124e3a39f9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/int4_utils.cl @@ -59,6 +59,13 @@ inline char4 unpack_to_char(uint4x4_t v) __attribute__((overloadable)) { return (char4)(v0.s0, v0.s1, v1.s0, v1.s1); } +inline uchar4 unpack_to_uchar(uint4x4_t v) __attribute__((overloadable)) { + uchar2 v0 = unpack_to_uchar(v.s0); + uchar2 v1 = unpack_to_uchar(v.s1); + return (uchar4)(v0.s0, v0.s1, v1.s0, v1.s1); +} + + inline char4 unpack_transposed_to_char(int4x4_t v) __attribute__((overloadable)) { char2 v0 = unpack_to_char(v.s0); char2 v1 = unpack_to_char(v.s1); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 6604def1a69093..178e1ea405b6bb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -8,6 +8,7 @@ #include #include "common_types.h" +static constexpr size_t lws_batches = 8; static constexpr size_t simd = 16; static constexpr size_t min_quantize_grp_size = 32; static constexpr size_t min_slm_size = 256; @@ -50,6 +51,17 @@ static std::pair get_output_aligned_bf_size(const fully_connecte return {output_b, output_f}; } +static bool is_weight_dyn_quantizable(const fully_connected_params& params) { + auto weight_type = params.weights.GetDType(); + if (weight_type == WeightsType::INT4 || weight_type == WeightsType::UINT4) + return true; + // UINT8 weight type is supported by FC dyn-quantize(with SLM). 
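
The predicate being introduced reduces to a small type whitelist. Sketched standalone in C++ (the enum is a simplification of the kernel selector's WeightsType, not the real API):

    #include <cstdio>

    enum class WeightsType { INT4, UINT4, UINT8, INT8, F16 };

    static bool is_weight_dyn_quantizable(WeightsType t) {
        switch (t) {
        case WeightsType::INT4:
        case WeightsType::UINT4:
        case WeightsType::UINT8:   // newly allowed by this patch
            return true;
        default:                   // INT8, F16, ... stay on the regular path
            return false;
        }
    }

    int main() {
        std::printf("UINT8 eligible: %d\n", (int)is_weight_dyn_quantizable(WeightsType::UINT8));
    }
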
+ if (weight_type == WeightsType::UINT8) + return true; + + return false; +} + // DYNAMIC_QUANTIZE static size_t get_dynamic_quantize_group_size(const fully_connected_params& params) { auto dynamic_quantization_group_size = params.dynamic_quantization_group_size; @@ -91,7 +103,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para return 0; } -static bool should_dynamic_quantize(const fully_connected_params& params) { +static bool should_dynamic_quantize(const fully_connected_params& params, bool print_log = false) { size_t dynamic_quantization_group_size = get_dynamic_quantize_group_size(params); if (params.inputs[0].GetFirstElementOffset() != 0) @@ -110,11 +122,17 @@ static bool should_dynamic_quantize(const fully_connected_params& params) { const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) && (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) && - params.inputs[0].GetDType() == Datatype::F16 && - (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4)) { - GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size " << scale_group_size << ", Input (" << - kernel_selector::toString(params.inputs[0].GetDType()) << ", " << kernel_selector::toString(params.outputs[0].GetLayout()) << - ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << ", Y: " << params.inputs[0].Y().v << std ::endl; + params.inputs[0].GetDType() == Datatype::F16 && is_weight_dyn_quantizable(params)) { + if (print_log) { + GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size: " << scale_group_size << + ", Dyn-quan group size: " << dynamic_quantization_group_size << + ", Type(I:" << kernel_selector::toString(params.inputs[0].GetDType()) << + ", O:" << kernel_selector::toString(params.outputs[0].GetDType()) << + ", W:" << kernel_selector::toString(params.weights.GetDType()) << + "), Format(W:" << kernel_selector::toString(params.weights.GetLayout()) << + ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << + ", Y: " << params.inputs[0].Y().v << std ::endl; + } return true; } @@ -204,8 +222,9 @@ DeviceFeaturesKey FullyConnected_bf_tiled::get_required_device_features_key(cons } bool FullyConnected_bf_tiled::Validate(const Params& params) const { - if (!Parent::Validate(params)) + if (!Parent::Validate(params)) { return false; + } auto& fc_params = static_cast(params); auto& input = fc_params.inputs[0]; @@ -314,21 +333,21 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, if (tparams.tile_ofm * simd > 64) return false; - bool is_i4_u4 = (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4); + bool is_dyn_quantable_type = is_weight_dyn_quantizable(params); if (tparams.kernel_type == FullyConnected_bf_tiled::KernelType::SLM) { const auto required_batch_alignment = 64; if (!params.is_shape_agnostic && (!IsAligned(output_b, required_batch_alignment) || output_b < min_slm_size)) return false; const auto required_tile_b = 8; - if ((tparams.tile_b != required_tile_b) && !is_i4_u4) + if ((tparams.tile_b != required_tile_b) && !is_dyn_quantable_type) return false; const auto required_tile_ofm = 2; if (tparams.tile_ofm != required_tile_ofm) return false; - if (params.weights.GetDType() != WeightsType::INT4 && 
params.weights.GetDType() != WeightsType::UINT4) + if (!is_dyn_quantable_type) return false; if (params.engineInfo.deviceType != dev_type::integrated_gpu) @@ -340,7 +359,7 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, return true; } - if (params.compressed && is_i4_u4) { + if (params.compressed && is_dyn_quantable_type) { if (!(tparams.tile_ofm == 2 || tparams.tile_ofm == 4)) return false; if (tparams.tile_ofm == 4 && tparams.outer_ofm == 2 && !is_suitable_outer_ofm(params, output_f)) @@ -382,11 +401,10 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, while (max_tile_ofm * 2 * simd <= output_f && max_tile_ofm < 4) max_tile_ofm *= 2; - if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) { + if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4 || + (is_weight_dyn_quantizable(params) && should_dynamic_quantize(params))) { + // Only 4bit weight type is fully optimized to use SLM. In default kernel, SLM is not applied to 8bit weight. if (!params.is_shape_agnostic && batch == 1) { - if (should_dynamic_quantize(params)) - return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT)); - // Tuning for Meteor Lake if (is_weight_vertical(params, output_f)) { if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { @@ -411,9 +429,11 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params, selector.Case(tune_params(16, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) .Case(tune_params(16, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } + selector.Case(tune_params(8, 2, 2, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)) .Case(tune_params(8, 2, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT, KernelType::SLM)); } + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) return selector.Default(tune_params(8, 1, 1, 4, 1, 1, 1, EXE_MODE_DEFAULT)); else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) @@ -501,7 +521,6 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au auto batch_threads = threads.first; auto feature_threads = threads.second; - const size_t lws_batches = 8; const size_t aligned_batch = Align(batch_threads, lws_batches); // Each WG calculates 8x8 batches (TILE_B x LWS[2] size) const bool can_use_slm = tparams.kernel_type == KernelType::SLM; @@ -550,7 +569,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE", weights_dt, tile_k_ofm)); const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance @@ -581,7 +599,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para if (dispatchData.use_slm) { OPENVINO_ASSERT(dispatchData.tile_n == 2, "[GPU] Unsupported TILE_OFM size for SLM kernel configuration"); - OPENVINO_ASSERT(weights_dt == WeightsType::INT4 || weights_dt == WeightsType::UINT4, "[GPU] Unsupported FC weights type for SLM kernel configuration"); + OPENVINO_ASSERT(is_weight_dyn_quantizable(params), "[GPU] Unsupported FC weights type for SLM kernel configuration"); auto lws_batches = dispatchData.lws[2]; auto 
total_weights_elements = simd * dispatchData.tile_n * simd * dispatchData.tile_mk; // SIMD * TILE_OFM * SIMD * TILE_IFM @@ -608,15 +626,19 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("LWS_BATCHES", lws_batches)); jit.AddConstant(MakeJitConstant("FILTER_LOAD_ITERS", weights_load_iters)); - if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { - jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); - } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { - jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + if (weights_dt == WeightsType::INT4 || weights_dt == WeightsType::UINT4) { + if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + } else if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size / 2)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load / 2)); + } else { + jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size)); + jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load)); + } } else { jit.AddConstant(MakeJitConstant("FILTER_ACTUAL_LOAD_BLOCK_SIZE", block_read_size)); - jit.Merge(make_int4_packed_type_jit_constant("INT4_PACKED_TYPE_PRELOAD", params.weights.GetDType(), weights_elements_per_load)); } jit.AddConstant(MakeJitConstant("FILTER_LOAD_BLOCK_SIZE", block_read_size)); @@ -629,7 +651,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para if (should_dynamic_quantize(params)) { jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1)); jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1)); - jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { if (add_decompress_scale_post_op) @@ -637,6 +658,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } + jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second)); jit.AddConstant(MakeJitConstant("SIMD", simd)); @@ -659,9 +681,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para } auto max_tile_b_size = dispatchData.tile_m; - if (params.compressed && - params.is_shape_agnostic && - (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4)) + if (params.compressed && params.is_shape_agnostic && is_weight_dyn_quantizable(params)) max_tile_b_size = std::max(max_tile_b_size, (uint32_t)8); jit.Merge(MakeConstantLoopUnrollJitConstants(max_tile_b_size)); @@ -772,8 +792,10 @@ void 
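
The branching above only assembles string constants that are later turned into -D defines for the OpenCL build. A toy C++ version of that pattern (names and values illustrative, not the kernel selector API):

    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, std::string> jit;
        const bool is_int4 = false;  // e.g. UINT8 weights take the new branch
        jit["FILTER_LOAD_BLOCK_SIZE"] = "4";
        if (is_int4) {
            jit["FILTER_ACTUAL_LOAD_BLOCK_SIZE"] = "2";  // two 4-bit values per byte
            jit["INT4_PACKED_TYPE_PRELOAD"] = "uint4x8_t";
        } else {
            jit["FILTER_ACTUAL_LOAD_BLOCK_SIZE"] = "4";  // one byte per value
        }
        jit["DQ_TYPE"] = "char";  // added unconditionally, as in the patch

        for (const auto& kv : jit)
            std::printf("-D%s=%s ", kv.first.c_str(), kv.second.c_str());
        std::printf("\n");
    }
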
FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const { if (kd.internalBufferSizes[0] < input_size) { kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(input_size); // quantized input is char type - kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); // de_quan_scale is half type + // quantized input is char type + kd.internalBufferSizes.push_back(input_size); + // half type of de_quan_scale and activation sum for each quantized group + kd.internalBufferSizes.push_back((input_size / quantize_grp_size) * 2 * 2); } kd.kernels[0].params.workGroups.global = {std::max((input_size / quantize_grp_size), (size_t)1), 1, 1}; @@ -800,7 +822,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { - // Large N + Small K case (horizontal weight) to use [osv64_isv2] + TILE_OFM 4 for batch 1 + // Large N + small K case (horizontal weight) to use [osv64_isv2] + TILE_OFM 4 for batch 1 weights_layout = WeightsLayout::os_is_yx_osv64_isv2; } else if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) @@ -947,6 +969,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, auto& quan_kernel = kd.kernels[0]; DispatchData dyn_quan_dispatch = dispatchData; auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second); + if (!params.is_shape_agnostic) + input_size = std::max(input_size, Align(get_input_bf_size(fc_params).first, lws_batches) * get_input_bf_size(fc_params).second); dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1}; dyn_quan_dispatch.lws = {16, 1, 1}; quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws; @@ -958,7 +982,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, quan_cldnn_jit.AddConstant(MakeJitConstant("FC_KERNEL_DYNAMIC_QUANTIZE", 1)); auto quan_jit = CreateJit(kernelName, quan_cldnn_jit, quan_entry_point); - FillCLKernelData(quan_kernel, dyn_quan_dispatch, params.engineInfo, @@ -977,8 +1000,10 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params ¶ms, quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + // char type quantized input kd.internalBufferSizes.push_back(input_size); - kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); + // half type of de_quan_scale and activation sum for each quantized group + kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2 * 2); kernel_number++; } kd.internalBufferDataType = Datatype::F16; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index eed9760348f669..6bf44a31add0f4 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1589,7 +1589,7 @@ class fully_connected_gpu_tests: 
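
The doubled factor in the second internal buffer is easy to misread: per quantization group the kernel now stores two half-precision values, a scale and an activation sum, hence groups * 2 values * 2 bytes. A quick C++ sketch of the arithmetic (sizes illustrative):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const std::size_t input_size = 320 * 4096;  // elements in the activation
        const std::size_t group_size = 128;         // dyn-quan group size
        const std::size_t groups = input_size / group_size;

        const std::size_t quantized_bytes = input_size;  // one char per element
        const std::size_t scale_bytes = groups * 2 * 2;  // (scale + act_sum) * sizeof(half)

        std::printf("quantized: %zu B, scales+sums: %zu B\n",
                    quantized_bytes, scale_bytes);
    }
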
public ::testing::Test { count++; OPENVINO_ASSERT(abs_diff < 256); } - GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + std::cout << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; } void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) { @@ -2903,7 +2903,7 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); - ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); config.set_property(ov::hint::dynamic_quantization_group_size(0)); @@ -2965,11 +2965,137 @@ class fully_connected_gpu_tests: public ::testing::Test { max_diff = abs_diff; avg += abs_diff; count++; - OPENVINO_ASSERT(abs_diff < 5); + OPENVINO_ASSERT(abs_diff < 6); } GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; OPENVINO_ASSERT((avg/count) < 0.5); } + + void test_compressed_int8_scale_dyn_quan_weight_u8(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048, + int quantize_group_size = 32, int scales_group_size = 128, + bool is_wzp_test = false, bool is_wzp_scalar = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + if (engine.get_device_info().dev_type == device_type::discrete_gpu) + GTEST_SKIP(); + + long int batch_num = batch; + long int ifm_num = ifm; + long int ofm_num = ofm; + long int wzp_num = is_wzp_scalar ? 1 : ofm_num; + + auto input_ps = ov::PartialShape{ batch_num, 1, ifm_num }; + auto input_mem = engine.allocate_memory({ input_ps, data_types::f16, format::bfyx }); + + auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u8, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx }); + auto dcomp_zp_mem = engine.allocate_memory({ {wzp_num, 1}, data_types::u8, format::bfyx }); + + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -2.f, 2.f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num, 0, 4); + set_values(weights_mem, weigths_data); + + auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -2.f, 2.f); + set_values(scale_mem, scale_data); + + if (is_wzp_test) { + auto zp_data = rg.generate_random_1d(wzp_num, 0, 2); + set_values(dcomp_zp_mem, zp_data); + } + + auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1, -1, -1 }, data_types::f16, format::bfyx } + : layout{ input_ps, data_types::f16, format::bfyx }; + + auto dcomp_zp_name = is_wzp_test ? "wzp" : ""; + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 3, 2); + + if (is_wzp_test) { + fc_prim.compressed_weights = true; + fc_prim.decompression_zero_point = is_wzp_test ? 
"wzp" : ""; + } + + // Implemented dynamic quantize kernel + auto get_ref_results = [&]() { + topology topo; + topo.add(input_layout("input", in_layout)); + topo.add(data("weights", weights_mem)); + topo.add(data("scale", scale_mem)); + topo.add(data("wzp", dcomp_zp_mem)); + topo.add(fc_prim); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); + config.set_property(ov::hint::dynamic_quantization_group_size(0)); + + network network(engine, topo, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("wzp", dcomp_zp_mem), + fc_prim + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::hint::dynamic_quantization_group_size(quantize_group_size)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), false); + + if (is_dynamic && !engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + auto kernel_num = (is_dynamic) ? 3 : 2; + kernel_num = (quantize_group_size < 32) ? 
2 : kernel_num; + ASSERT_EQ(impl->get_kernels().size(), size_t(kernel_num)); + } + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + + size_t count = 0; + float max_diff = 0.f; + float avg = 0.f; + for (size_t i = 0; i < output_ptr_ref.size(); ++i) { + auto abs_diff = std::abs((float)output_ptr_ref[i] - (float)output_ptr[i]); + if (max_diff < abs_diff) + max_diff = abs_diff; + avg += abs_diff; + count++; + OPENVINO_ASSERT(abs_diff < 8); + } + GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; + OPENVINO_ASSERT((avg/count) < 0.8); + } }; using shared_dims = std::tuple; @@ -4064,6 +4190,34 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_sta this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true); } +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 128, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_1024) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 1024, 1024, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_2048) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 2048, 2048, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_ifm_4096) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_32_large_unaligned) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 310, 1024, 1024, 32, 32, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_small) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 16, 1024, 1024, 128, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_single) { + this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 1, 1024, 1024, 128, 128, true); +} + TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { this->test_compressed_scale_bias(false); } From 9ec63be1be3c3d5005367fcd708c629456982bac Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 31 Oct 2024 05:29:24 +0400 Subject: [PATCH 068/120] [GPU] Fix per-token dynamic quantization (#27332) ### Details: - Allow the DynamicQuantizeKernelOpt kernel to be selected with the default scales order - Relax DynamicQuantizeKernelRef kernel validation function --- .../dynamic_quantize/dynamic_quantize_kernel_opt.cpp | 9 +++++++-- .../dynamic_quantize/dynamic_quantize_kernel_ref.cpp | 11 +++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index b610ac2076def4..52a648679499f2 100644 --- 
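
The accuracy checks in these tests all follow one pattern: accumulate per-element absolute differences against a reference run, then gate on the max and the average. A standalone C++ distillation (tolerances copied from the u8 test above; data illustrative):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<float> ref = {1.0f, 2.0f, 3.0f};  // reference kernel output
        const std::vector<float> out = {1.1f, 1.9f, 3.2f};  // tested kernel output

        float max_diff = 0.f, avg = 0.f;
        for (std::size_t i = 0; i < ref.size(); ++i) {
            const float abs_diff = std::fabs(ref[i] - out[i]);
            max_diff = std::max(max_diff, abs_diff);
            avg += abs_diff;
        }
        avg /= static_cast<float>(ref.size());
        std::printf("max_diff: %f, avg_diff: %f\n", max_diff, avg);
        return (max_diff < 8.f && avg < 0.8f) ? 0 : 1;  // per-element and average gates
    }
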
a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -160,8 +160,13 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.group_sizes.back() != UINT64_MAX) return false; - if (!dq_params.scales_output_order.empty()) - return false; + // Allow only default scales order + const auto& scales_output_order = dq_params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) + if (scales_output_order[i] != i) + return false; + } return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index b7a9b40191da4e..bd3d0f87cdc931 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -54,7 +54,9 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); - const auto& group_sizes = params.group_sizes; + auto group_sizes = params.group_sizes; + group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); + for (size_t i = 0; i < group_sizes.size(); i++) { jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); } @@ -68,7 +70,8 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - const auto& group_sizes = params.group_sizes; + auto group_sizes = params.group_sizes; + group_sizes.resize(std::min((size_t)4, group_sizes.size()), 1); auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; auto y_size = group_sizes[2] == 1 ? 
params.outputs[0].Y().v : 1; @@ -134,10 +137,6 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; - const auto& prim_params = static_cast(params); - if (prim_params.group_sizes.size() != 4) - return false; - return true; } } // namespace kernel_selector From f60b9c41446594bda517eb86b8eb99d244d7318b Mon Sep 17 00:00:00 2001 From: Wenjing Kang Date: Thu, 31 Oct 2024 13:27:54 +0800 Subject: [PATCH 069/120] Add /MT[d] to CMAKE_LANG_FLAGS_CONFIG_INIT (#27173) ### Details: Add /MT[d] to CMAKE_LANG_FLAGS_CONFIG_INIT to avoid the missing of /O2 /Ob2 /DNDEBUG flags in CMAKE_LANG_FLAGS_CONFIG ### Tickets: - *152927* Signed-off-by: Kang Wenjing --- cmake/toolchains/mt.runtime.win32.toolchain.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/toolchains/mt.runtime.win32.toolchain.cmake b/cmake/toolchains/mt.runtime.win32.toolchain.cmake index 7dd4e1e7f96ded..9a99781eac0426 100644 --- a/cmake/toolchains/mt.runtime.win32.toolchain.cmake +++ b/cmake/toolchains/mt.runtime.win32.toolchain.cmake @@ -27,6 +27,11 @@ if(use_static_runtime) foreach(build_type "" "_DEBUG" "_MINSIZEREL" "_RELEASE" "_RELWITHDEBINFO") set(flag_var "CMAKE_${lang}_FLAGS${build_type}_INIT") string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + if (build_type STREQUAL "_DEBUG") + set(${flag_var} "${${flag_var}} /MTd") + else() + set(${flag_var} "${${flag_var}} /MT") + endif() endforeach() endforeach() endif() From 689e04320819f0784b40de633da4908b65579415 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 31 Oct 2024 09:54:16 +0400 Subject: [PATCH 070/120] [GPU] Add subsequent reshapes optimization and dynamic paddings support for RoPE and PagedAttention (#27329) ### Details: - Added subsequent reshapes optimization - Added dynamic paddings support for RoPE and PagedAttention ### Tickets: - [CVS-156124](https://jira.devtools.intel.com/browse/CVS-156124) --- .../graph_optimizer/prepare_buffer_fusing.cpp | 60 ++++++++--- .../src/graph/include/reshape_inst.h | 15 +-- .../cl_kernels/pa_kv_cache_update_ref.cl | 47 +++++---- .../kernel_selector/cl_kernels/rope_ref.cl | 30 +++--- .../kernel_selector/cl_kernels/sdpa_opt.cl | 31 ++++-- .../kernels/rope/rope_kernel_base.cpp | 25 +---- .../optimize_subsequent_reshapes.cpp | 99 +++++++++++++++++++ .../optimize_subsequent_reshapes.hpp | 23 +++++ .../src/plugin/transformations_pipeline.cpp | 3 + .../optimize_subsequent_reshapes_test.cpp | 97 ++++++++++++++++++ 10 files changed, 341 insertions(+), 89 deletions(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 6d7d609d232947..e94714c84fdebf 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -660,23 +660,34 @@ void crop_in_place_optimization::update_in_place_crop_padding_simple_data_format if (user_info.first && user_info.first->is_type()) { auto reshape_desc = user_info.first->as().get_primitive(); auto reshape_mode = reshape_desc->mode; + auto reshape_axis = crop_axis; if (reshape_mode == 
reshape::reshape_mode::base) { - user_info.second.data_padding._dynamic_dims_mask = dyn_pad_sizes; + auto reshape_ps = user_info.second.get_partial_shape(); + auto crop_dim_val = crop_layout.get_partial_shape()[crop_axis].get_length(); + + auto mul = 1; + reshape_axis = reshape_ps.size() - 1; + for (size_t i = reshape_ps.size(); i > 1; i--) { + if (reshape_ps[i - 1].is_dynamic() || mul == crop_dim_val) + break; + + mul *= reshape_ps[i - 1].get_length(); + reshape_axis = i - 1; + } } else if (reshape_mode == reshape::reshape_mode::unsqueeze || reshape_mode == reshape::reshape_mode::squeeze) { auto reshape_ps = user_info.second.get_partial_shape(); auto output_pattern = reshape_desc->output_pattern; - auto reshape_axis = crop_axis; for (size_t i = 0; i < output_pattern.size(); i++) { if (output_pattern[i] <= static_cast(reshape_axis)) { reshape_axis += reshape_mode == reshape::reshape_mode::unsqueeze ? 1 : -1; } } - - padding::DynamicDimsMask dyn_pad_mask; - dyn_pad_mask[reshape_axis] = 1; - user_info.second.data_padding._dynamic_dims_mask = dyn_pad_mask; } + + auto reshape_dyn_pad_mask = padding::DynamicDimsMask(); + reshape_dyn_pad_mask[reshape_axis] = 1; + user_info.second.data_padding._dynamic_dims_mask = reshape_dyn_pad_mask; } return; } @@ -704,13 +715,36 @@ void crop_in_place_optimization::update_in_place_crop_padding_simple_data_format auto reshape_desc = user_info.first->as().get_primitive(); auto reshape_mode = reshape_desc->mode; if (reshape_mode == reshape::reshape_mode::base) { - auto reshape_rank = user_info.second.get_partial_shape().size(); - auto reshape_last_dim = user_info.second.get_partial_shape().to_shape()[reshape_rank - 1]; - if (lower_sizes[crop_axis]) - lower_sizes[crop_axis] /= reshape_last_dim; - if (upper_sizes[crop_axis]) - upper_sizes[crop_axis] /= reshape_last_dim; - user_info.second.data_padding = padding(lower_sizes, upper_sizes, dyn_pad_sizes); + auto reshape_ps = user_info.second.get_partial_shape(); + auto crop_dim_val = crop_layout.get_partial_shape()[crop_axis].get_length(); + + auto divider = 1; + auto reshape_axis = reshape_ps.size(); + for (size_t i = reshape_ps.size(); i > 1; i--) { + const auto& dim_value = reshape_ps[i - 1].get_length(); + if (divider * dim_value == crop_dim_val) + break; + + divider *= dim_value; + reshape_axis = i - 1; + } + reshape_axis -= 1; + + const auto output_rank = std::max(reshape_ps.size(), static_cast(4)); + std::vector reshape_lower_sizes(output_rank, 0); + std::vector reshape_upper_sizes(output_rank, 0); + padding::DynamicDimsMask reshape_dyn_pad_mask; + + reshape_lower_sizes[reshape_axis] = lower_sizes[crop_axis]; + reshape_upper_sizes[reshape_axis] = upper_sizes[crop_axis]; + reshape_dyn_pad_mask[reshape_axis] = 1; + + if (reshape_lower_sizes[reshape_axis]) + reshape_lower_sizes[reshape_axis] /= divider; + if (reshape_upper_sizes[reshape_axis]) + reshape_upper_sizes[reshape_axis] /= divider; + + user_info.second.data_padding = padding(reshape_lower_sizes, reshape_upper_sizes, reshape_dyn_pad_mask); } else { auto reshape_ps = user_info.second.get_partial_shape(); auto output_pattern = reshape_desc->output_pattern; diff --git a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h index 1bbfd94256a50c..d6a71c20fcac8d 100644 --- a/src/plugins/intel_gpu/src/graph/include/reshape_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reshape_inst.h @@ -59,7 +59,7 @@ struct typed_program_node : public typed_program_node_base { return false; // TODO: If user is RoPE 
or MVN and dynamic padding exists, ouput padding propagation is not supported in the base mode - if (get_users().size() == 1 && (get_users().front()->is_type() || get_users().front()->is_type())) + if (get_users().size() == 1 && get_users().front()->is_type()) return false; auto axis = input().as().get_primitive()->axis; @@ -73,14 +73,17 @@ struct typed_program_node : public typed_program_node_base { const auto& output_pshape = prim->output_partial_shape; // TODO: If the reshape's output shape is non constant, issue occurs // during shape inference due to execution order at runtime - if ((output_pshape.size() != input_rank + 1) || prim->output_pattern.empty()) + if (prim->output_pattern.empty()) return false; + // Iteratively check the total product of all static innermost dimensions + // until the crop dimension value matches or the first dynamic dimension is encountered int64_t mul = 1; - for (size_t i = input_rank - 1; i < output_pshape.size() ; i++) { - if (output_pshape[i].is_dynamic()) - return false; - mul *= output_pshape[i].get_length(); + for (size_t i = output_pshape.size(); i > 1 ; i--) { + if (output_pshape[i - 1].is_dynamic() || mul == input_last_dim_val) + break; + + mul *= output_pshape[i - 1].get_length(); } if (input_last_dim_val != mul) return false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl index ef2f78496b2cf2..8426baf719f990 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl @@ -34,10 +34,14 @@ KERNEL(pa_kv_cache_update)( const uint seq_block_idx = block_indices_begins[seq_idx] + seq_len / PAGED_ATTENTION_BLOCK_SIZE; const uint block_idx = block_indices[seq_block_idx]; - uint key_value_in_offset = seq_idx * KV_HEADS_NUM * HEAD_SIZE + head_idx * HEAD_SIZE; + uint key_in_offset = INPUT0_OFFSET + + seq_idx * (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM) + + head_idx * HEAD_SIZE; + uint value_in_offset = INPUT1_OFFSET + + seq_idx * (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM) + + head_idx * HEAD_SIZE; uint key_out_offset = block_idx * KV_HEADS_NUM * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + head_idx * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + current_token_pos_in_block; - uint value_out_offset = block_idx * KV_HEADS_NUM * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + head_idx * HEAD_SIZE * PAGED_ATTENTION_BLOCK_SIZE + current_token_pos_in_block * HEAD_SIZE; #define READ_BLOCK_SIZE GENERATE_STAGE_BLOCK_SIZE @@ -45,7 +49,7 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; @@ -56,7 +60,7 @@ KERNEL(pa_kv_cache_update)( #endif } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + 
SUBGROUP_SIZE * i; @@ -83,8 +87,13 @@ KERNEL(pa_kv_cache_update)( const uint token_start_pos = (past_len + block_start_pos - subsequence_begin_idx) % PAGED_ATTENTION_BLOCK_SIZE; - uint key_value_in_offset = block_start_pos * KV_HEADS_NUM * HEAD_SIZE + - head_idx * HEAD_SIZE; + uint key_in_offset = INPUT0_OFFSET + + block_start_pos * (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM) + + head_idx * HEAD_SIZE; + + uint value_in_offset = INPUT1_OFFSET + + block_start_pos * (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM) + + head_idx * HEAD_SIZE; const uint current_block_idx = (past_len + block_start_pos - subsequence_begin_idx) / PAGED_ATTENTION_BLOCK_SIZE; @@ -106,14 +115,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -126,14 +135,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -146,14 +155,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data[i]; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -166,14 +175,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = 
BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -181,7 +190,8 @@ KERNEL(pa_kv_cache_update)( } } - key_value_in_offset += KV_HEADS_NUM * HEAD_SIZE; + key_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM); + value_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM); key_out_offset += 1; value_out_offset += HEAD_SIZE; } @@ -194,14 +204,14 @@ KERNEL(pa_kv_cache_update)( #define BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, READ_BLOCK_SIZE, ptr, offset); #define DATA_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, READ_BLOCK_SIZE) - DATA_VEC input_data = BLOCK_READ(key_data, key_value_in_offset + head_idx_index); + DATA_VEC input_data = BLOCK_READ(key_data, key_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint key_offset = key_out_offset + (head_idx_index + sglid + SUBGROUP_SIZE * i) * PAGED_ATTENTION_BLOCK_SIZE; key_cache_data[key_offset] = input_data; } - input_data = BLOCK_READ(value_data, key_value_in_offset + head_idx_index); + input_data = BLOCK_READ(value_data, value_in_offset + head_idx_index); unroll_for (uint i = 0; i < READ_BLOCK_SIZE; i++) { uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i; @@ -209,7 +219,8 @@ KERNEL(pa_kv_cache_update)( } } - key_value_in_offset += KV_HEADS_NUM * HEAD_SIZE; + key_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM); + value_in_offset += (KV_HEADS_NUM * HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM); key_out_offset += 1; value_out_offset += HEAD_SIZE; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl index 38066b4461def4..133440a21301f2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rope_ref.cl @@ -28,14 +28,11 @@ KERNEL(rope_ref)( uint r = rf < HALF_ROTARY_NDIMS ? rf * 2 : 0; uint f = rf < HEAD_SIZE - ROTARY_NDIMS ? rf * 2 : 0; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, p, b, h * HEAD_SIZE, 0); - - input_idx += SLICED_FROM_START * (p * INPUT0_FEATURE_NUM + b + 1) - + SLICED_FROM_END * (p * INPUT0_FEATURE_NUM + b); -#else uint input_idx = INPUT0_GET_INDEX(p, b, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; #endif + uint cos_sin_p = p < INPUT1_BATCH_NUM ? p : 0; uint cos_sin_b = b < INPUT1_FEATURE_NUM ? 
b : 0; uint cos_sin_idx = INPUT1_GET_INDEX(cos_sin_p, cos_sin_b, 0, 0); @@ -69,14 +66,11 @@ KERNEL(rope_ref)( const uint h = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, b, p, h * HEAD_SIZE, 0); - - input_idx += SLICED_FROM_START * (b * INPUT0_FEATURE_NUM + p + 1) - + SLICED_FROM_END * (b * INPUT0_FEATURE_NUM + p); -#else uint input_idx = INPUT0_GET_INDEX(b, p, h * HEAD_SIZE, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; #endif + uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; uint cos_sin_p = p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM < INPUT1_FEATURE_NUM ? p + INPUT1_FEATURE_NUM - INPUT0_FEATURE_NUM : 0; uint cos_sin_h = h < INPUT1_SIZE_Y ? h : 0; @@ -119,15 +113,13 @@ KERNEL(rope_ref)( const uint p = (uint)get_global_id(2) / HALF_ROTARY_NDIMS; const uint r = (uint)get_global_id(2) % HALF_ROTARY_NDIMS; -#ifdef ENABLE_SLICE - uint input_idx = GET_DATA_INDEX(SLICED_INPUT0, b, h, p, 0); - - input_idx += SLICED_FROM_START * (b * INPUT0_FEATURE_NUM + h + 1) - + SLICED_FROM_END * (b * INPUT0_FEATURE_NUM + h); -#elif ENABLE_TRANSPOSE - uint input_idx = GET_DATA_INDEX(TRANSPOSED_INPUT0, b, h, p, 0); +#if ENABLE_TRANSPOSE + uint input_idx = INPUT0_GET_INDEX(b, p, h, 0); #else uint input_idx = INPUT0_GET_INDEX(b, h, p, 0); +#ifdef ENABLE_SLICE + input_idx += SLICED_FROM_START; +#endif #endif uint cos_sin_b = b < INPUT1_BATCH_NUM ? b : 0; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index c114332f393c0e..55f87e4189d9fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -885,9 +885,10 @@ KERNEL(sdpa_opt)( #if IS_PAGED_ATTENTION const uint block_start_pos = blocked_indexes_start[target_seq_dim]; const uint block_end_pos = blocked_indexes_end[target_seq_dim]; - - uint query_offset = block_start_pos * HEAD_SIZE * NUM_HEADS + num_heads_dim * HEAD_SIZE + head_size_idx; - const uint query_pitch = HEAD_SIZE * NUM_HEADS; + uint query_offset = INPUT0_OFFSET + + block_start_pos * (HEAD_SIZE * NUM_HEADS + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM) + + num_heads_dim * HEAD_SIZE + head_size_idx; + const uint query_pitch = (HEAD_SIZE * NUM_HEADS + INPUT0_PAD_BEFORE_FEATURE_NUM + INPUT0_PAD_AFTER_FEATURE_NUM); const uint cur_target_seq_len_size = block_end_pos - block_start_pos; #else @@ -996,8 +997,11 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif #define KEY_SEQ_OFFSET subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]] - uint key_offset = KEY_SEQ_OFFSET * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + seq_len * HEAD_SIZE * NUM_KV_HEADS; - const uint key_pitch = HEAD_SIZE * NUM_KV_HEADS; + const uint key_pitch = (HEAD_SIZE * NUM_KV_HEADS + INPUT1_PAD_BEFORE_FEATURE_NUM + INPUT1_PAD_AFTER_FEATURE_NUM); + uint key_offset = INPUT1_OFFSET + + KEY_SEQ_OFFSET * key_pitch + + heads_dim * HEAD_SIZE + + seq_len * key_pitch; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + sglid, 0)]; @@ -1225,7 +1229,7 @@ KERNEL(sdpa_opt)( // QK*V calculation MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) acc_output_res = OUTPUT_VAL_ZERO; #if IS_PAGED_ATTENTION - const uint value_pitch = HEAD_SIZE * NUM_KV_HEADS; + const uint value_pitch = 
(HEAD_SIZE * NUM_KV_HEADS + INPUT2_PAD_BEFORE_FEATURE_NUM + INPUT2_PAD_AFTER_FEATURE_NUM); #else #ifdef INPUT2_DIMS_ORDER uint value_offset_base = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, 0, 0); @@ -1246,7 +1250,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + (seq_len)) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + (seq_len)) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE)]; @@ -1311,7 +1318,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + (seq_len * SUBGROUP_SIZE)) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + (seq_len * SUBGROUP_SIZE)) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; @@ -1379,7 +1389,10 @@ KERNEL(sdpa_opt)( const uint heads_dim = num_heads_dim; #endif const uint value_seq_offset = subsequence_begins[gws_seq_indexes_correspondence[target_seq_dim]]; - uint value_offset = value_seq_offset * HEAD_SIZE * NUM_KV_HEADS + heads_dim * HEAD_SIZE + (start_partition_idx + seq_len_leftovers_start) * HEAD_SIZE * NUM_KV_HEADS + head_size_idx; + uint value_offset = INPUT2_OFFSET + + value_seq_offset * value_pitch + + heads_dim * HEAD_SIZE + + (start_partition_idx + seq_len_leftovers_start) * value_pitch + head_size_idx; #else #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE)]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp index a48632f6c45509..130c5a69d4262c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/rope/rope_kernel_base.cpp @@ -29,38 +29,15 @@ JitConstants RoPEKernelBase::GetJitConstants(const rope_params& params, RoPEKern if (params.slice_stop > params.slice_start) { jit.AddConstant(MakeJitConstant("ENABLE_SLICE", true)); - - auto f = toCodeString(params.inputs[0].Feature(), 1); - auto x = toCodeString(params.inputs[0].X(), 2); - auto y = toCodeString(params.inputs[0].Y(), 3); - - auto sliced_val = toCodeString(params.slice_stop - params.slice_start); - auto sliced_x = params.axis == 3 ? sliced_val : x; - auto sliced_y = params.axis == 2 ? 
sliced_val : y; - - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_X_PITCH", 1)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_Y_PITCH", sliced_x)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_FEATURE_PITCH", sliced_x + "*" + sliced_y)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_BATCH_PITCH", sliced_x + "*" + sliced_y + "*" + f)); - jit.AddConstant(MakeJitConstant("SLICED_INPUT0_OFFSET", 0)); jit.AddConstant(MakeJitConstant("SLICED_FROM_START", toCodeString(params.slice_start))); - if (params.axis == 2) { - jit.AddConstant(MakeJitConstant("SLICED_FROM_END", "(" + y + "-" + toCodeString(params.slice_stop) + ")")); - } else if (params.axis == 3) { - jit.AddConstant(MakeJitConstant("SLICED_FROM_END", "(" + x + "-" + toCodeString(params.slice_stop) + ")")); - } else { + if (params.axis != 2 && params.axis != 3) { OPENVINO_THROW("[GPU] Invalid axis value for RoPE operation"); } } if (params.transposed_input) { jit.AddConstant(MakeJitConstant("ENABLE_TRANSPOSE", true)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_OFFSET", 0)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_X_PITCH", 1)); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_Y_PITCH", "INPUT0_FEATURE_PITCH")); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_FEATURE_PITCH", "INPUT0_Y_PITCH")); - jit.AddConstant(MakeJitConstant("TRANSPOSED_INPUT0_BATCH_PITCH", "INPUT0_BATCH_PITCH")); } if (!params.is_chatglm && (params.inputs[1].has_dynamic_pad() || params.inputs[2].has_dynamic_pad())) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp new file mode 100644 index 00000000000000..b87600ed36e347 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "optimize_subsequent_reshapes.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +namespace ov { +namespace intel_gpu { + +OptimizeSubsequentReshapes::OptimizeSubsequentReshapes() { + using namespace ov::pass::pattern; + using ov::pass::pattern::op::Or; + + auto dynamic_batch_only = [](Output output) { + const auto& shape = output.get_partial_shape(); + + if (shape.rank().is_dynamic()) + return false; + + if (shape.size() <= 1) + return false; + + if (shape[0].is_static()) + return false; + + for (size_t i = 1; i < shape.size(); i++) + if (shape[i].is_dynamic()) + return false; + + return true; + }; + + auto first_reshape_data = any_input(dynamic_batch_only); + auto first_reshape_pattern = ov::pass::pattern::wrap_type(); + auto first_reshape = wrap_type({ first_reshape_data, first_reshape_pattern }, + ov::pass::pattern::all_of({ dynamic_batch_only, ov::pass::pattern::consumers_count(1) })); + + auto second_reshape_pattern = ov::pass::pattern::wrap_type(); + auto second_reshape = wrap_type({ first_reshape, second_reshape_pattern }, dynamic_batch_only); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto input_node = pattern_map.at(first_reshape_data).get_node_shared_ptr(); + auto first_reshape_node = 
pattern_map.at(first_reshape).get_node_shared_ptr(); + auto second_reshape_node = pattern_map.at(second_reshape).get_node_shared_ptr(); + + auto input_ps = first_reshape_node->input(0).get_partial_shape(); + auto first_reshape_ps = first_reshape_node->get_output_partial_shape(0); + auto second_reshape_ps = second_reshape_node->get_output_partial_shape(0); + + auto static_dims_product = [](ov::PartialShape& ps) { + int64_t total_dims = 1; + + for (auto& dim : ps) { + if (dim.is_static()) + total_dims *= dim.get_length(); + } + + return total_dims; + }; + + if (static_dims_product(input_ps) != static_dims_product(first_reshape_ps) || + static_dims_product(first_reshape_ps) != static_dims_product(second_reshape_ps)) + return false; + + std::vector new_pattern; + for (auto& dim : second_reshape_ps) { + if (dim.is_dynamic()) { + new_pattern.push_back(0); + } else { + new_pattern.push_back(dim.get_length()); + } + } + + auto new_pattern_const = std::make_shared(ov::element::i32, ov::Shape{new_pattern.size()}, new_pattern); + auto new_reshape = std::make_shared(first_reshape_node->input(0).get_source_output(), new_pattern_const, true); + new_reshape->set_friendly_name(second_reshape_node->get_friendly_name()); + + ov::replace_node(second_reshape_node, new_reshape); + copy_runtime_info(first_reshape_node, new_reshape); + + return true; + }; + + auto m = std::make_shared(second_reshape, "OptimizeSubsequentReshapes"); + this->register_matcher(m, callback); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp new file mode 100644 index 00000000000000..3a38bb92ad5167 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/optimize_subsequent_reshapes.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + +/** + * @brief This pass looks for `Reshape [ dynamic dim, n static dims] -> Reshape [dynamic dim, n static dims]` patterns + * and replaces them with a single `Reshape [dynamic dim, n static dims]` operation. 
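 * A minimal illustration, taken from the unit tests added in this patch and
 * assuming only the batch dimension is dynamic:
 *     Parameter [-1, 1, 4096] -> Reshape [-1, 1, 32, 128] -> Reshape [-1, 4096]
 * collapses into the single equivalent
 *     Parameter [-1, 1, 4096] -> Reshape [-1, 4096]
 * which is safe because the product of the static dimensions stays the same
 * (4096 = 32 * 128) at every step.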
+ */ +class OptimizeSubsequentReshapes : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("OptimizeSubsequentReshapes", "0"); + OptimizeSubsequentReshapes(); +}; + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index a33a15fbbe6a1a..bfc348d135a813 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -76,6 +76,7 @@ #include "plugin/transformations/increase_position_ids_precision.hpp" #include "plugin/transformations/group_norm_composition.hpp" #include "plugin/transformations/dynamic_quantize_fully_connected.hpp" +#include "plugin/transformations/optimize_subsequent_reshapes.hpp" #include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/rms_fusion.hpp" #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp" @@ -875,6 +876,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { pass_config->disable(); pass_config->disable(); + manager.register_pass(); + manager.register_pass(); // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp new file mode 100644 index 00000000000000..732a14be03bf39 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/transformations/optimize_subsequent_reshapes_test.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include "openvino/pass/manager.hpp" +#include "openvino/core/model.hpp" +#include "openvino/core/coordinate_diff.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/reshape.hpp" + +#include +#include +#include + +#include "common_test_utils/ov_test_utils.hpp" + +using namespace testing; +using namespace ov::intel_gpu; + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes1) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 0, 32, 128 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, -1 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, 4096 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes2) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto 
first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 0, 32, 128 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 32, 1, 0 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 1, 4096 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 32, 1, 128 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} + +TEST_F(TransformationTestsF, OptimizeSubsequentReshapes3) { + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 32, 1, 128 }); + auto first_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{ 0, 1, 32, 0 }); + auto first_reshape = std::make_shared(input, first_reshape_pattern, true); + + auto second_reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, -1 }); + auto second_reshape = std::make_shared(first_reshape, second_reshape_pattern, true); + auto result = std::make_shared(second_reshape); + + model = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::i64, ov::PartialShape{ -1, 32, 1, 128 }); + auto reshape_pattern = std::make_shared(ov::element::i32, ov::Shape{2}, std::vector{ 0, 4096 }); + auto reshape = std::make_shared(input, reshape_pattern, true); + auto result = std::make_shared(reshape); + + model_ref = std::make_shared(ov::NodeVector{ result }, ov::ParameterVector{ input }); + } + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); +} From 2e87aad94024f310a4d2338482fe1bb1167d3c2a Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 31 Oct 2024 10:34:40 +0400 Subject: [PATCH 071/120] [GPU] Handling the case where get_state was called before set_state (#27276) ### Tickets: - *[156193](https://jira.devtools.intel.com/browse/CVS-156193)* --- src/plugins/intel_gpu/src/plugin/variable_state.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp index 6b1c8d0cfc993f..2b7a26ba35a292 100644 --- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp @@ -123,6 +123,12 @@ ov::element::Type VariableState::get_user_specified_type() const { } ov::SoPtr VariableState::get_state() const { + if (m_memory == nullptr) { + const auto& pshape = m_layout.get_partial_shape(); + const auto& shape = get_tensor_shape(pshape); + return m_context->create_host_tensor(get_user_specified_type(), shape); + } + auto tensor = m_context->create_host_tensor(get_user_specified_type(), m_memory->get_layout().get_shape()); convert_and_copy(m_memory, tensor._ptr.get(), m_context->get_engine().get_service_stream()); From a6a113ce01a49c8259c709a312df9e0f49eac970 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 31 Oct 2024 10:35:01 +0400 Subject: [PATCH 072/120] 
[GPU] Use onednn impl for dynamic gemm (#27212) ### Details: - *Performance improvement for LoRA* --- .../src/graph/impls/registry/gemm_impls.cpp | 5 ++- .../tests/unit/fusions/gemm_fusion_test.cpp | 4 +-- .../tests/unit/test_cases/gemm_gpu_test.cpp | 33 +++++++++++++------ 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp b/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp index 66947ef1a84a00..436a3bb257b483 100644 --- a/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/registry/gemm_impls.cpp @@ -19,7 +19,10 @@ const std::vector>& Registry static const std::vector> impls = { OV_GPU_CREATE_INSTANCE_ONEDNN(onednn::GemmImplementationManager, shape_types::static_shape) OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::static_shape) - OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::dynamic_shape) + OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::dynamic_shape, + [](const program_node& node) { + return !node.can_use(impl_types::onednn); + }) }; return impls; diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp index baed5400181130..659ccaf9d8a723 100644 --- a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp @@ -455,8 +455,8 @@ TEST_P(gemm_2in_dynamic_add, add) { } INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_dynamic_add, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::batch, eltwise_mode::sum }, - gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 5, "", broadcast_kinds::feature, eltwise_mode::sum }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 4, "gemm_tiled_opt", broadcast_kinds::batch, eltwise_mode::sum }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3D_1, 4, 4, "gemm_tiled_opt", broadcast_kinds::feature, eltwise_mode::sum }, })); class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index 3b41f44050e527..df493544624b64 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -473,6 +473,9 @@ class gemm_gpu_tests: public ::testing::Test { config.set_property(ov::intel_gpu::optimize_data(true)); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, "", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm_ref", gemm_impl} })); + network network(engine, topology, config); network.set_input_data("input1", input1_mem); network.set_input_data("input2", input2_mem); @@ -498,6 +501,10 @@ class gemm_gpu_tests: public ::testing::Test { ExecutionConfig config = get_test_default_config(engine); config.set_property(ov::intel_gpu::optimize_data(true)); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + ov::intel_gpu::ImplementationDesc gemm_impl = { format::bfyx, "", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"gemm", gemm_impl} })); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); network->set_input_data("input1", input1_mem); 
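    // Why these tests pin the implementation (illustrative note; the predicate
    // below is quoted from the registry change earlier in this patch): the OCL
    // dynamic-shape gemm is now registered behind
    //     OV_GPU_GET_INSTANCE_OCL(gemm, shape_types::dynamic_shape,
    //         [](const program_node& node) { return !node.can_use(impl_types::onednn); })
    // so on immad-capable devices oneDNN wins implementation selection by
    // default, and forcing impl_types::ocl keeps the OCL kernels under test.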
network->set_input_data("input2", input2_mem); @@ -1246,10 +1253,12 @@ class gemm_gpu_tests: public ::testing::Test { network->set_input_data("input0", input0_mem); network->set_input_data("input1", input1_mem); - auto inst = network->get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + if (!engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + } auto outputs = network->execute(); @@ -1533,10 +1542,12 @@ class gemm_gpu_tests: public ::testing::Test { network->set_input_data("input0", input0_mem); network->set_input_data("input1", input1_mem); - auto inst = network->get_primitive("gemm"); - auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + if (!engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("gemm"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic() == is_input_dynamic); + } auto outputs = network->execute(); @@ -2853,8 +2864,10 @@ class gemm_onednn: public ::testing::Test { auto inst = network->get_primitive("gemm"); auto impl = inst->get_impl(); - ASSERT_TRUE(impl != nullptr); - ASSERT_TRUE(impl->is_dynamic()); + if (!engine.get_device_info().supports_immad) { + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + } auto outputs = network->execute(); From 9f6826ad4058dca14b3065f9448f2aaf0bd76f07 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 07:35:47 +0100 Subject: [PATCH 073/120] Bump paddlepaddle from 2.6.0 to 2.6.2 in /src/bindings/python (#26966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [paddlepaddle](https://github.com/paddlepaddle/paddle) from 2.6.0 to 2.6.2.
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 src/bindings/python/constraints.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt
index c7837798c8aca7..a0fbf982105ad6 100644
--- a/src/bindings/python/constraints.txt
+++ b/src/bindings/python/constraints.txt
@@ -18,7 +18,7 @@ patchelf<=0.17.2.1
 # Frontends
 h5py>=3.1.0,<3.13.0
 docopt~=0.6.2
-paddlepaddle==2.6.0
+paddlepaddle==2.6.2
 tensorflow>=1.15.5,<2.18.0
 six~=1.16.0
 protobuf>=3.18.1,<4.0.0

From 2f78f6f9ca0f9da93de1751948621513b50e57fa Mon Sep 17 00:00:00 2001
From: Anastasia Kuporosova
Date: Thu, 31 Oct 2024 09:04:54 +0100
Subject: [PATCH 074/120] [PyOV] allow constant to accept keyword args (#27346)

### Details:
- the use of singledispatch breaks the scenario where a user passes keyword
  arguments only: singledispatch dispatches on the type of the first
  positional argument, so a keyword-only call such as
  `ops.constant(value=data, dtype=ov_type)` fails before the implementation
  is ever reached

### Tickets:
- CVS-153553

---
 src/bindings/python/src/openvino/runtime/opset13/ops.py | 8 ++++----
 src/bindings/python/tests/test_graph/test_constant.py | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/bindings/python/src/openvino/runtime/opset13/ops.py b/src/bindings/python/src/openvino/runtime/opset13/ops.py
index cb201d3d4263dd..a624ffb4f79873 100644
--- a/src/bindings/python/src/openvino/runtime/opset13/ops.py
+++ b/src/bindings/python/src/openvino/runtime/opset13/ops.py
@@ -15,7 +15,7 @@
 from openvino.runtime.op import Constant, Result
 from openvino.runtime.opset1 import convert_like
 from openvino.runtime.opset_utils import _get_node_factory
-from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op
+from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op, overloading
 from openvino.runtime.utils.types import (
     NumericData,
     NodeInput,
@@ -271,7 +271,7 @@ def scaled_dot_product_attention(
     return _get_node_factory_opset13().create("ScaledDotProductAttention", inputs, attributes)


-@singledispatch
+@overloading(Union[NumericData, np.number, bool, np.bool_, list], Union[NumericType, Type], Optional[str], bool) # type: ignore
 @nameable_op
 def constant(
     value: Union[NumericData, np.number, bool, np.bool_, list],
@@ -339,9 +339,9 @@ def display_shared_memory_warning(warning_message: str) -> None:
     return Constant(_value, shared_memory=_shared_memory)


-@constant.register
+@overloading(Tensor, bool, Optional[str]) # type: ignore
 @nameable_op
-def _(
+def constant( # noqa: F811
     tensor: Tensor,
     shared_memory: bool = False,
     name: Optional[str] = None,
diff --git a/src/bindings/python/tests/test_graph/test_constant.py b/src/bindings/python/tests/test_graph/test_constant.py
index e28a4ad05510f2..131654855b380a 100644
--- a/src/bindings/python/tests/test_graph/test_constant.py
+++ b/src/bindings/python/tests/test_graph/test_constant.py
@@ -87,7 +87,7 @@ def test_init_with_array(src_dtype, dst_dtype, shared_flag, data_getter):
     data = np.ascontiguousarray(data)

     # Create constant based on numpy dtype or openvino type
-    ov_const = ops.constant(data, dtype=dst_dtype, shared_memory=shared_flag)
+    ov_const = ops.constant(data, dst_dtype, shared_memory=shared_flag)

     # Check shape and element type of Constant class
     assert isinstance(ov_const, Constant)
@@ -842,7 +842,7 @@ def test_get_data_casting_bf16(src_dtype, dst_dtype, copy_flag):
 )
 def test_get_data_casting_packed(src_dtype, ov_type, dst_dtype, copy_flag):
     data = np.array([[0, 0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1]], dtype=src_dtype)
-    ov_const = ops.constant(data,
dtype=ov_type) + ov_const = ops.constant(value=data, dtype=ov_type) arr = ov_const.get_data(dtype=dst_dtype, copy=copy_flag) if dst_dtype is None: @@ -867,7 +867,7 @@ def test_const_from_tensor(shared_flag): shape = [1, 3, 32, 32] arr = np.ones(shape).astype(np.float32) ov_tensor = Tensor(arr, shape, Type.f32) - ov_const = ops.constant(ov_tensor, shared_memory=shared_flag) + ov_const = ops.constant(tensor=ov_tensor, shared_memory=shared_flag) assert isinstance(ov_const, Constant) assert np.all(list(ov_const.shape) == shape) From 66b14678502d1aada20d4d6357157b3dff2adcf8 Mon Sep 17 00:00:00 2001 From: Dmitry Matveev Date: Thu, 31 Oct 2024 08:32:49 +0000 Subject: [PATCH 075/120] NPUW: Eliminate unnecessary kvcache tensors copy (#27347) ### Details: - We mistakenly copy input parameters when we shouldn't - Yet another `||` -> `&&` change, hopefully less destructive this time ### Tickets: - *ticket-id* --- .../intel_npu/src/plugin/npuw/just_sync_infer_request.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 26363e66e55d2a..0e0b96582a663c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -597,7 +597,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { LOG_BLOCK(); if (!is_spatial_param(sub_in_idx)) { // Input parameter is non-spatial, do normal handling - if (do_copy || m_input_allocated.count(g_tnsr->data()) == 0) { + if (m_input_allocated.count(g_tnsr->data()) == 0 && do_copy) { LOG_DEBUG("Will be copied"); copy_list.emplace_back(g_tnsr, s_port); } else { From 44b86a860ecb0a3e79e6f75627d6cc5270226e7a Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 31 Oct 2024 12:54:52 +0400 Subject: [PATCH 076/120] benchmark_app/cpp: report an error if no files were found. (#26663) Python version already reports an error in that case. benchmark_app is the only user of `readInputFilesArguments()`. It could make sense earlier to emit the warning instead of the error because other samples. Ticket 152614 --- samples/cpp/common/utils/src/args_helper.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/cpp/common/utils/src/args_helper.cpp b/samples/cpp/common/utils/src/args_helper.cpp index f4a3d10ceb0b5b..ba58f98e498e90 100644 --- a/samples/cpp/common/utils/src/args_helper.cpp +++ b/samples/cpp/common/utils/src/args_helper.cpp @@ -29,8 +29,7 @@ void readInputFilesArguments(std::vector& files, const std::string& arg) { struct stat sb; if (stat(arg.c_str(), &sb) != 0) { - slog::warn << "File " << arg << " cannot be opened!" << slog::endl; - return; + throw std::invalid_argument(arg + " file or directory not found."); } if (S_ISDIR(sb.st_mode)) { struct CloseDir { @@ -43,17 +42,20 @@ void readInputFilesArguments(std::vector& files, const std::string& using Dir = std::unique_ptr; Dir dp(opendir(arg.c_str())); if (dp == nullptr) { - slog::warn << "Directory " << arg << " cannot be opened!" << slog::endl; - return; + throw std::invalid_argument(arg + " directory cannot be opened!"); } struct dirent* ep; + size_t files_size = files.size(); while (nullptr != (ep = readdir(dp.get()))) { std::string fileName = ep->d_name; if (fileName == "." 
|| fileName == "..") continue; files.push_back(arg + "/" + ep->d_name); } + if (files.size() == files_size) { + throw std::invalid_argument("No files were found in directory " + arg); + } } else { files.push_back(arg); } From 86083e0dbf8d173451a8ee47fa40496a62aea893 Mon Sep 17 00:00:00 2001 From: Mateusz Mikolajczyk Date: Thu, 31 Oct 2024 10:23:59 +0100 Subject: [PATCH 077/120] [Transformations] Add Squeeze-15 downgrade transformation (#27286) ### Details: - *Add Squeeze-15 downgrade transformation to Squeeze-0 for compatible attribute* - *...* ### Tickets: - *CVS-154027* ### PR requires [PR-26995](https://github.com/openvinotoolkit/openvino/pull/26995) to be merged --------- Co-authored-by: Michal Lukaszewski --- .../convert_squeeze15_downgrade.hpp | 23 ++++ .../common_optimizations.cpp | 2 + .../convert_squeeze15_downgrade.cpp | 40 +++++++ .../convert_squeeze15_downgrade_test.cpp | 112 ++++++++++++++++++ 4 files changed, 177 insertions(+) create mode 100644 src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp create mode 100644 src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp create mode 100644 src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp diff --git a/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp b/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp new file mode 100644 index 00000000000000..c2ebfbc0f3138b --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_squeeze15_downgrade.hpp @@ -0,0 +1,23 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { +/** + * @ingroup ov_transformation_common_api + * @brief Converts Squeeze v15 to Squeeze v0. 
+ */ +class TRANSFORMATIONS_API ConvertSqueeze15ToSqueeze0 : public MatcherPass { +public: + OPENVINO_RTTI("ConvertSqueeze15ToSqueeze0", "0"); + ConvertSqueeze15ToSqueeze0(); +}; + +} // namespace pass +} // namespace ov diff --git a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp index 9d46b583a828f2..37ee2d12d9aebb 100644 --- a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -98,6 +98,7 @@ #include "transformations/op_conversions/convert_softmax_downgrade.hpp" #include "transformations/op_conversions/convert_softmax_upgrade.hpp" #include "transformations/op_conversions/convert_space_to_depth.hpp" +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" #include "transformations/op_conversions/convert_subtract.hpp" #include "transformations/op_conversions/convert_topk11_downgrade.hpp" #include "transformations/op_conversions/convert_xor_to_logical_xor.hpp" @@ -235,6 +236,7 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr(); ADD_MATCHER(fq_fusions, FakeQuantizeMulFusion) diff --git a/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp b/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp new file mode 100644 index 00000000000000..50701d3d6acd56 --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_squeeze15_downgrade.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" + +#include "itt.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/squeeze.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertSqueeze15ToSqueeze0::ConvertSqueeze15ToSqueeze0() { + MATCHER_SCOPE(ConvertSqueeze15ToSqueeze0); + + const auto& squeeze_v15_pattern = pattern::wrap_type(); + + const matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) { + const auto& squeeze_v15 = ov::as_type_ptr(m.get_match_root()); + if (!squeeze_v15 || transformation_callback(squeeze_v15)) { + return false; + } + std::shared_ptr squeeze_v0; + if (squeeze_v15->get_input_size() == 1) { + squeeze_v0 = std::make_shared(squeeze_v15->input_value(0)); + } else if (squeeze_v15->get_input_size() == 2 && !squeeze_v15->get_allow_axis_skip()) { + squeeze_v0 = std::make_shared(squeeze_v15->input_value(0), squeeze_v15->input_value(1)); + } else { + return false; + } + squeeze_v0->set_friendly_name(squeeze_v15->get_friendly_name()); + copy_runtime_info(squeeze_v15, squeeze_v0); + replace_node(squeeze_v15, squeeze_v0); + + return true; + }; + + auto m = std::make_shared(squeeze_v15_pattern, matcher_name); + register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp b/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp new file mode 100644 index 00000000000000..f3d90ab2c748bd --- /dev/null +++ b/src/common/transformations/tests/op_conversions/convert_squeeze15_downgrade_test.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include "transformations/op_conversions/convert_squeeze15_downgrade.hpp" + +#include + +#include + +#include "common_test_utils/ov_test_utils.hpp" +#include "openvino/opsets/opset1.hpp" +#include "openvino/opsets/opset15.hpp" +#include "openvino/pass/manager.hpp" +#include "transformations/utils/utils.hpp" +using namespace ov; +using namespace testing; + +namespace { + +enum class IndicesMode { NONE, CONST, PARAM }; + +std::shared_ptr create_v15_model(const IndicesMode indices_mode, + const std::vector indices_const_val, + const bool allow_axis_skip) { + const PartialShape data_shape{-1, {2, 5}, 1, {1, 5}, 4}; + const auto& data = std::make_shared(ov::element::f32, data_shape); + ov::ParameterVector params = {data}; + std::shared_ptr squeeze; + if (indices_mode == IndicesMode::NONE) { + squeeze = std::make_shared(data, allow_axis_skip); + } else if (indices_mode == IndicesMode::PARAM) { + const auto& indices = + std::make_shared(ov::element::i32, PartialShape({data_shape.rank()})); + params.push_back(indices); + squeeze = std::make_shared(data, indices, allow_axis_skip); + } else if (indices_mode == IndicesMode::CONST) { + const auto& indices = + ov::opset15::Constant::create(ov::element::i32, Shape({indices_const_val.size()}), indices_const_val); + squeeze = std::make_shared(data, indices, allow_axis_skip); + } + squeeze->set_friendly_name("squeeze15"); + return std::make_shared(squeeze->outputs(), params); +} + +std::shared_ptr create_v1_model(const IndicesMode indices_mode, const std::vector indices_const_val) { + const PartialShape data_shape{-1, {2, 5}, 1, {1, 5}, 4}; + const auto& data = std::make_shared(ov::element::f32, data_shape); + ov::ParameterVector params = {data}; + std::shared_ptr squeeze; + if (indices_mode == IndicesMode::NONE) { + squeeze = std::make_shared(data); + } else if (indices_mode == IndicesMode::PARAM) { + const auto& indices = + std::make_shared(ov::element::i32, PartialShape({data_shape.rank()})); + params.push_back(indices); + squeeze = std::make_shared(data, indices); + } else if (indices_mode == IndicesMode::CONST) { + const auto& indices = + ov::opset1::Constant::create(ov::element::i32, Shape({indices_const_val.size()}), indices_const_val); + squeeze = std::make_shared(data, indices); + } + squeeze->set_friendly_name("squeeze15"); + return std::make_shared(squeeze->outputs(), params); +} + +} // namespace + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_no_indices_no_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::NONE, {}, false); + model_ref = create_v1_model(IndicesMode::NONE, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_no_indices_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::NONE, {}, true); + model_ref = create_v1_model(IndicesMode::NONE, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_const_indices_no_skip) { + manager.register_pass(); + model = 
create_v15_model(IndicesMode::CONST, {0, -4, 3}, false); + model_ref = create_v1_model(IndicesMode::CONST, {0, -4, 3}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_dynamic_indices_no_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::PARAM, {}, false); + model_ref = create_v1_model(IndicesMode::PARAM, {}); + EXPECT_EQ(model->output(0).get_partial_shape(), model_ref->output(0).get_partial_shape()); + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); + comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES); + comparator.enable(FunctionsComparator::CmpValues::NAMES); +} + +TEST_F(TransformationTestsF, ConvertSqueeze15ToSqueeze1_unsupported_skip) { + manager.register_pass(); + model = create_v15_model(IndicesMode::PARAM, {}, true); +} From c685d44493f5a4b0403038f6f1ce9f350cfc0581 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 31 Oct 2024 13:24:38 +0400 Subject: [PATCH 078/120] [Snippets][CPU][Tests] Added tests for dynamic BF16/INT8 MHA (#27169) ### Details: - *Added more tests for the validation of INT8/BF16 MHA in CPU Plugin* - *Split the large "mha.cpp" file into the several small files with the same test semantic (comment https://github.com/openvinotoolkit/openvino/pull/26547#discussion_r1796616083)* ### Tickets: - *N/A* --- .../skip_tests_config.cpp | 8 +- .../snippets/matmul.cpp | 38 +- .../shared_tests_instances/snippets/mha.cpp | 543 +++--------------- .../snippets/mha_extracted_reshape.cpp | 40 ++ .../snippets/mha_fma.cpp | 33 ++ .../snippets/mha_quantized.cpp | 103 ++++ .../snippets/mha_select.cpp | 41 ++ .../snippets/mha_split_dim_m.cpp | 121 ++++ .../snippets/mha_transposed_b.cpp | 50 ++ .../snippets/mha_with_dyn_mul.cpp | 68 +++ .../snippets/mha_wo_transpose.cpp | 151 +++++ .../snippets/transpose_matmul.cpp | 32 +- .../shared_tests_instances/snippets/utils.hpp | 48 ++ .../plugin/shared/include/snippets/mha.hpp | 3 + .../plugin/shared/src/snippets/mha.cpp | 20 +- .../include/subgraph_mha.hpp | 15 +- .../ov_snippets_models/src/subgraph_mha.cpp | 113 ++-- 17 files changed, 807 insertions(+), 620 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 6edc4f062536d0..90820d550df179 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -237,7 +237,6 @@ std::vector disabledTestPatterns() { R"(.*smoke_FakeQuantize.*/FakeQuantizeLayerTest.Inference.*TS=.*3.4.2.5.*LEVELS=255.*)", R"(.*smoke_FakeQuantizePerChannel.*/FakeQuantizeLayerTest.Inference.*TS=.*11.10.22.19.*LEVELS=(255|256).*netPRC=f32.*)", R"(.*smoke_MVN_5D/Mvn6LayerTest.Inference.*TS=.*3.4.2.5.*LEVELS=255.*netPRC=f16.*)", - R"(.*smoke_Snippets_MHAINT8MatMul/MHAINT8MatMul.*)", R"(.*smoke_static/ConvertFqRnnToQuantizedRnn.*2.1.5.*2.1.1.*2.1.1.*)", R"(.*smoke_InterpolateBicubicPillow_Layout_Test/InterpolateLayerCPUTest.CompareWithRefs/ShapeCalcMode=sizes_IS=\[?.2..20.?.?\]_TS.*1.17.4.4.*2.3.10.12.*1.17.4.4.*Sizes.*4.4.*10.20.*10.4.*PARAMETER.*0.0.0.0.*0.0.1.1.*2.3.*)", R"(.*smoke_LoopForCommon/LoopLayerCPUTest.CompareWithRefs/.*_netType=bf16.*)", @@ -563,7 +562,7 @@ std::vector disabledTestPatterns() { // ignored for not supported bf16 platforms retVector.emplace_back(R"(.*smoke_Snippets_EnforcePrecision_bf16.*)"); retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16.*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MHAEnforceBF16.*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*EnforceBF16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul on AMX systems @@ -572,6 +571,11 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); retVector.emplace_back(R"(.*smoke_Snippets_MatMulTransposeB.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); retVector.emplace_back(R"(.*smoke_Snippets_MatMulBias.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); + + retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16_3D.*IS\[1\]=\[2.64.\?\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[(\?|1).(\?|4).(\?|12).(\?|64)\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[\?.\?.\?\].*)"); + retVector.emplace_back(R"(.*smoke_Snippets_(MHAINT8MatMul|MHAQuantMatMul0|MHAFQAfterMatMul_4D|smoke_Snippets_MHAFQ).*IS\[0\]=\[\?.\?.\?\.\?].*)"); } #ifdef SNIPPETS_LIBXSMM_TPP // GN in TPP requires exposing tmp Buffer results outside the loop (ticket: 151234) diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp index f5057137f9b65c..176f0cb4d46aed 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp @@ -4,44 +4,26 @@ #include "snippets/matmul.hpp" -#include "common_test_utils/test_constants.hpp" -#include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) 
static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) - namespace { -static inline std::vector> quantized_precisions() { - std::vector> prc = {}; - // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms - if (ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8()) { - prc.emplace_back(std::vector{element::i8, element::i8}); - prc.emplace_back(std::vector{element::u8, element::i8}); - } - return prc; -} - static inline std::vector> precisions() { - std::vector> prc = { - {element::f32, element::f32}, - }; + std::vector> prc = precision_f32(2); // Note: TPP doesn't support low precisions yet #ifndef SNIPPETS_LIBXSMM_TPP - auto quant = quantized_precisions(); + auto quant = quantized_precisions_if_supported(); std::copy(quant.begin(), quant.end(), std::back_inserter(prc)); - // In Snippets MatMul BF16 is supported only on bf16/AMX platforms - if (ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16()) { - prc.emplace_back(std::vector{element::bf16, element::bf16}); - } + auto bfloat = precision_bf16_if_supported(2); + std::copy(bfloat.begin(), bfloat.end(), std::back_inserter(prc)); #endif return prc; } - std::vector> input_shapes{ { {{}, {{2, 1, 3, 5}}}, {{}, {{1, 3, 5, 3}}} }, { {{}, {{3, 1, 32, 14}}}, {{}, {{1, 3, 14, 37}}} }, @@ -158,7 +140,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBias, MatMulBias, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBiasQuantized, MatMulBiasQuantized, ::testing::Combine( ::testing::ValuesIn(input_shapes_bias), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(1), // Subgraph ::testing::Values(1), // Tokenized MatMul+Bias @@ -167,8 +149,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulBiasQuantized, MatMulBiasQuantized INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantized, MatMulsQuantized, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(3), // Subgraph + Reshape + Subgraph ::testing::Values(2), // Tokenized [MatMul+FQ+Matmul] and [FQ] @@ -177,8 +159,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantized, MatMulsQuantized, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMulsQuantizedSoftmax, MatMulsQuantizedSoftmax, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), - ::testing::ValuesIn(quantized_precisions()), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 16, 128, 64}, {1, 16, 64, 128}, {128, 64}})), + ::testing::ValuesIn(quantized_precisions_if_supported()), ::testing::Values(MatMulType::MatMul), ::testing::Values(3), // Subgraph + Reshape + Subgraph ::testing::Values(2), // Tokenized [MatMul+FQ+Matmul] and [FQ] diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 79db0b1546b2a8..63f5176684ccc1 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -1,60 +1,70 @@ -// Copyright (C) 2022 Intel Corporation +// Copyright (C) 2024 Intel 
Corporation // SPDX-License-Identifier: Apache-2.0 // #include "snippets/mha.hpp" -#include "common_test_utils/test_constants.hpp" -#include "internal_properties.hpp" -#include "utils/cpu_test_utils.hpp" -#include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) namespace { -const auto& inputShapes_4D = STATIC_SHAPES( - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, - {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, - {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, - {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); - -const auto& inputShapes_3D = STATIC_SHAPES( - {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, - {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, {68, 6, 92}}, - {{16, 2, 92}, {68, 2, 92}, {1, 16, 68}, {68, 2, 92}}); - -static inline bool is_bf16_supported() { - return ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16(); -} - -static inline std::vector> precision_f32(size_t count) { - std::vector> prc; - prc.emplace_back(std::vector(count, element::f32)); - return prc; -} - -static inline std::vector> precision_bf16(size_t count) { - std::vector> prc; - if (is_bf16_supported()) - prc.emplace_back(std::vector(count, element::bf16)); - return prc; +std::vector> transposedShape_4D(bool with_dynamic = true) { + auto shapes = SNIPPETS_TESTS_STATIC_SHAPES( + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 16, 1, 1}, {1, 128, 16, 64}}, + {{1, 128, 16, 64}, {1, 128, 16, 64}, {1, 1, 1, 128}, {1, 128, 16, 64}}, + {{2, 68, 6, 92}, {2, 68, 6, 92}, {1, 1, 68, 68}, {2, 68, 6, 92}}, + {{1, 58, 16, 34}, {1, 58, 16, 34}, {1, 1, 1, 58}, {1, 58, 16, 34}}); + if (with_dynamic) { + std::vector> dynamic_shapes = {{ + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {2, 2, 16, 128}, {2, 1, 128, 128}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}}}, + }, + { + {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, + {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, + }}; + shapes.insert(shapes.end(), dynamic_shapes.begin(), dynamic_shapes.end()); + } + return shapes; } -static ov::AnyMap enable_callback() { - return ov::AnyMap({ov::intel_cpu::snippets_mode(ov::intel_cpu::SnippetsMode::ENABLE)}); +std::vector> transposedShape_3D(bool with_dynamic = true) { + 
auto shapes = SNIPPETS_TESTS_STATIC_SHAPES( + {{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, + {{68, 6, 92}, {68, 6, 92}, {1, 68, 68}, {68, 6, 92}}, + {{16, 2, 92}, {68, 2, 92}, {1, 16, 68}, {68, 2, 92}}); + if (with_dynamic) { + shapes.push_back({ + {PartialShape{-1, -1, -1}, {{128, 3, 64}, {128, 3, 64}, {68, 6, 87}}}, + {PartialShape{-1, -1, -1}, {{128, 1, 64}, {128, 1, 64}, {13, 6, 87}}}, + {PartialShape{-1, -1, -1}, {{1, 128, 128}, {1, 128, 128}, {1, 68, 13}}}, + {PartialShape{-1, -1, -1}, {{128, 3, 64}, {128, 3, 64}, {13, 6, 87}}}, + }); + } + return shapes; } INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), @@ -62,27 +72,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); -std::vector> inputShapes_4D_dynamic{ - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, - }, - { - {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, - {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, - } -}; - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D, +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_4D_WithScalarMul, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D_dynamic), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D(false)), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false}), + ::testing::Values(true), ::testing::Values(MHA::default_thread_count), ::testing::Values(1), ::testing::Values(1), @@ -90,13 +85,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); - INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_3D), + ::testing::Combine(::testing::ValuesIn(transposedShape_3D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false, true}), + ::testing::Values(false), ::testing::Values(MHA::default_thread_count), ::testing::Values(5), // [122706]: Subgraph + 4 Transpose ::testing::Values(2), // decomposed Transpose + MHA @@ -104,111 +98,23 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D, ::testing::Values(CPUTestUtils::empty_plugin_config)), MHA::getTestCaseName); -const auto& splitm_static_shapes = STATIC_SHAPES({{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_4D_SplitDimensionM_static, - MHA, - 
::testing::Combine(::testing::ValuesIn(splitm_static_shapes), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(true), - ::testing::Values(4), // 4 Threads - ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(enable_callback())), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_3D_SplitDimensionM_static, - MHA, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}})), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(true), - ::testing::Values(4), // 4 Threads - ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes - ::testing::Values(1), // MHA - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(enable_callback())), - MHA::getTestCaseName); - -std::vector> splitm_dynamic_shapes_4d = { - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 1, 1, 128}, {1, 1, 1, 17}, {1, 1, 1, 128}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, - }, - { - {PartialShape{-1, 128, -1, -1}, {{1, 128, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 128, -1}, {{1, 1, 128, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, - { - {PartialShape{-1, 32, -1, -1}, {{1, 32, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 32, -1}, {{1, 1, 32, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, - { - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, - {PartialShape{-1, -1, 16, -1}, {{1, 1, 16, 16}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_4D_SplitDimensionM_dynamic, - MHA, - ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_4d), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), - ::testing::Values(4), // 4 Threads - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> splitm_dynamic_shapes_3d = { - { - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, -1, -1}, {{1, 1, 128}, {1, 1, 17}, {1, 1, 128}}}, - {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, - }, - { - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - {PartialShape{1, 1, -1}, {{1, 1, 128}, {1, 1, 64}, {1, 1, 128}}}, - {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA_3D_SplitDimensionM_dynamic, - MHA, - ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_3d), - ::testing::ValuesIn(precision_f32(4)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), - ::testing::Values(4), // 4 Threads - ::testing::Values(5), // Subgraph + 4 Transpose - 
::testing::Values(2), // MHA + one of the transposes is executed via Subgraph (because callback is disabled) - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHA_3D_WithScalarMul, + MHA, + ::testing::Combine(::testing::ValuesIn(transposedShape_3D(false)), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // [122706]: Subgraph + 4 Transpose + ::testing::Values(2), // decomposed Transpose + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), - ::testing::ValuesIn(precision_bf16(4)), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), + ::testing::ValuesIn(precision_bf16_if_supported(4)), ::testing::Values(ov::element::f32), ::testing::ValuesIn({false, true}), ::testing::Values(MHA::default_thread_count), @@ -220,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHABF16_4D, INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, MHA, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), + ::testing::Combine(::testing::ValuesIn(transposedShape_4D()), ::testing::ValuesIn(precision_f32(4)), ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), @@ -231,321 +137,6 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), MHA::getTestCaseName); -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAMulAdd, - MHAMulAdd, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 10, 12, 16}, {1, 10, 12, 16}, {1, 10, 12, 16}})), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({false}), // Need to support True for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapeSelect = STATIC_SHAPES( - // without broadcast - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, - {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, - // with broadcast - {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, - {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} -); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHA, - MHASelect, - ::testing::Combine(::testing::ValuesIn(inputShapeSelect), - ::testing::ValuesIn(precision_f32(6)), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // Need to support True for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(2), // Less + MHA - ::testing::Values(2), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapesWOTranspose_4D = STATIC_SHAPES( - {{1, 12, 197, 64}, {1, 12, 64, 197}, {1, 12, 197, 64}}, - {{1, 12, 12, 64}, {1, 12, 64, 48}, {1, 12, 48, 64}}); -const auto& inputShapesWOTranspose_3D = STATIC_SHAPES( - {{12, 
197, 64}, {12, 64, 197}, {12, 197, 64}}, - {{12, 128, 100}, {12, 100, 128}, {12, 128, 100}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeOnInputs_4D, - MHAWOTransposeOnInputs, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(true), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTranspose_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTranspose_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapesWOTranspose_3D_dynamic{ - { - {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, - {PartialShape{-1, -1, -1}, {{1, 85, 19}, {2, 36, 40}}}, - {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, - }, - { - {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 2, 64}, {2, 9, 64}}}, - {PartialShape{2, 64, -1}, {{2, 64, 9}, {2, 64, 2}, {2, 64, 9}}}, - {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 2, 64}, {2, 9, 64}}}, - }, -}; - - - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_DynMHAWOTranspose_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D_dynamic), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeBF16_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_bf16(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeBF16_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - 
::testing::ValuesIn(precision_bf16(3)), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeEnforceBF16_4D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_4D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::bf16), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWOTransposeEnforceBF16_3D, - MHAWOTranspose, - ::testing::Combine(::testing::ValuesIn(inputShapesWOTranspose_3D), - ::testing::ValuesIn(precision_f32(3)), - ::testing::Values(ov::element::bf16), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(5), // MHA + 4 extra Converts on inputs and output - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAINT8MatMul, - MHAINT8MatMul, - ::testing::Combine(::testing::ValuesIn(std::vector>(inputShapes_4D.begin(), - inputShapes_4D.begin() + 2)), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul - ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAQuantMatMul0, - MHAQuantMatMul0, - ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 128, 768}, {1, 128, 768}, {1, 1, 1, 128}, {1, 128, 768}})), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(9), // FQx2 on inputs + MHA + Transpose on output + 4 Reshapes + Deq Mul - ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAFQAfterMatMul_4D, - MHAFQAfterMatMul, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // MHA + Transpose on output + Deq Mul - ::testing::Values(2), // MHA + Deq Mul - 
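A note on the EnforceBF16 suites nearby: the model stays f32 and bf16 is requested through the plugin configuration (CPUTestUtils::cpu_bf16_plugin_config) rather than through bf16 input precisions, which is why these suites count 4 extra Converts around the MHA body. A sketch of such a config map, assuming it reduces to the public inference_precision hint:

#include "openvino/openvino.hpp"

ov::AnyMap bf16_config_sketch() {
    // Assumption: cpu_bf16_plugin_config effectively asks the CPU plugin
    // to run inference in bf16 wherever possible.
    return {ov::hint::inference_precision(ov::element::bf16)};
}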
::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAFQ, - MHAFQ, - ::testing::Combine(::testing::ValuesIn(STATIC_SHAPES({{1, 64, 12, 64}, - {1, 64, 12, 64}, - {1, 1, 1, 64}, - {1, 64, 12, 64}})), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::Values(false), // The graph doesn't contain Multiply - ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // Transposex2 + Subgraphsx5 - ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapesTransposedB { - { - {{}, {{1, 12, 12, 64}}}, - {{}, {{1, 12, 48, 64}}}, - {{}, {{1, 12, 48, 64}}} - }, - { - {PartialShape{-1, 3, -1, 64}, {{1, 3, 12, 64}, {2, 3, 36, 64}}}, - {PartialShape{-1, 3, -1, 64}, {{1, 3, 14, 64}, {2, 3, 42, 64}}}, - {PartialShape{-1, 3, -1, -1}, {{1, 3, 14, 36}, {2, 3, 42, 36}}}, - }, - { - {PartialShape{2, -1, 32, -1}, {{2, 1, 32, 70}, {2, 2, 32, 96}}}, - {PartialShape{2, -1, 49, -1}, {{2, 3, 49, 70}, {2, 1, 49, 96}}}, - {PartialShape{2, -1, 49, -1}, {{2, 1, 49, 17}, {2, 2, 49, 81}}}, - }, -}; - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHATransposedB, - MHATransposedB, - ::testing::Combine(::testing::ValuesIn(inputShapesTransposedB), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // Need to support False for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -const auto& inputShapesExtractedReshape = STATIC_SHAPES( - {{2, 196, 64}, {2, 64, 196}, {2, 14, 14, 14, 1}, {2, 14, 14, 1, 14}, {2, 196, 64}}, - {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 1}, {1, 4, 4, 1, 4}, {1, 16, 10}}, - {{1, 16, 10}, {1, 10, 16}, {1, 1, 1, 1, 1}, {1, 4, 4, 4, 4}, {1, 16, 10}}, - {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 16, 10}}, - {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 4, 256}, {1, 4, 256}, {1, 4, 16, 10}}, - {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 1, 256}, {1, 4, 1}, {1, 4, 16, 10}}); - -INSTANTIATE_TEST_SUITE_P( - smoke_Snippets_MHAWithExtractedReshape, - MHAWithExtractedReshape, - ::testing::Combine(::testing::ValuesIn(inputShapesExtractedReshape), - ::testing::Values(std::vector{}), - ::testing::Values(ov::element::f32), - ::testing::ValuesIn({true}), // False is not supported for graph builder in tests - ::testing::Values(MHA::default_thread_count), - ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA - ::testing::Values(2), // Extracted Add + MHA - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHA::getTestCaseName); - -std::vector> inputShapes_4D_WithMul_dynamic{ - { - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, - {PartialShape{1}, {{1}, {1}, {1}, {1} }}, - {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, - {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, - }, - { - 
{PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, - {PartialShape{-1, 12, 64, -1}, {{1, 12, 64, 35}, {1, 12, 64, 10}, {1, 12, 64, 10}, {1, 12, 64, 1}, {1, 12, 64, 35}}}, - {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, - {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, - } -}; - -INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynMHA_4D_WithMul, - MHAWithDynamicMul, - ::testing::Combine(::testing::ValuesIn(inputShapes_4D_WithMul_dynamic), - ::testing::ValuesIn(precision_f32(5)), - ::testing::Values(ov::element::f32), - ::testing::Values(MHA::default_thread_count), - ::testing::Values(1), - ::testing::Values(1), - ::testing::Values(ov::test::utils::DEVICE_CPU), - ::testing::Values(CPUTestUtils::empty_plugin_config)), - MHAWithDynamicMul::getTestCaseName); - } // namespace } // namespace snippets } // namespace test diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp new file mode 100644 index 00000000000000..f3c1439395650a --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_extracted_reshape.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +const auto& inputShapesExtractedReshape = SNIPPETS_TESTS_STATIC_SHAPES( + {{2, 196, 64}, {2, 64, 196}, {2, 14, 14, 14, 1}, {2, 14, 14, 1, 14}, {2, 196, 64}}, + {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 1}, {1, 4, 4, 1, 4}, {1, 16, 10}}, + {{1, 16, 10}, {1, 10, 16}, {1, 1, 1, 1, 1}, {1, 4, 4, 4, 4}, {1, 16, 10}}, + {{1, 16, 10}, {1, 10, 16}, {1, 4, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 16, 10}}, + {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 4, 256}, {1, 4, 256}, {1, 4, 16, 10}}, + {{1, 4, 16, 10}, {1, 4, 10, 16}, {1, 1, 256}, {1, 4, 1}, {1, 4, 16, 10}}); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWithExtractedReshape, + MHAWithExtractedReshape, + ::testing::Combine(::testing::ValuesIn(inputShapesExtractedReshape), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({true}), // False is not supported for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), // Extracted Add + Extracted Reshape + MHA + ::testing::Values(2), // Extracted Add + MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp new file mode 100644 index 00000000000000..4bf35e2daa690d --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_fma.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { 
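Before the first suite in this new file: MHAMulAdd targets an MHA-like MatMul -> Multiply -> Add -> MatMul chain. The exact topology lives in MHAMulAddFunction (ov_snippets_models); the sketch below only illustrates the kind of graph that is expected to collapse into a single Snippets Subgraph, using the suite's static shape and made-up scale/shift constants.

#include "openvino/openvino.hpp"

// Sketch only: MatMul -> Multiply -> Add -> MatMul over the suite's
// {1, 10, 12, 16} inputs; the constants are illustrative.
std::shared_ptr<ov::Model> make_mul_add_mha_sketch() {
    using namespace ov;
    const PartialShape shape{1, 10, 12, 16};
    auto q = std::make_shared<op::v0::Parameter>(element::f32, shape);
    auto k = std::make_shared<op::v0::Parameter>(element::f32, shape);
    auto v = std::make_shared<op::v0::Parameter>(element::f32, shape);
    auto mm0 = std::make_shared<op::v0::MatMul>(q, k, false, true);  // Q x K^T -> [1, 10, 12, 12]
    auto scale = op::v0::Constant::create(element::f32, Shape{1}, {0.25f});
    auto shift = op::v0::Constant::create(element::f32, Shape{1}, {1.0f});
    auto mul = std::make_shared<op::v1::Multiply>(mm0, scale);
    auto add = std::make_shared<op::v1::Add>(mul, shift);
    auto mm1 = std::make_shared<op::v0::MatMul>(add, v);             // -> [1, 10, 12, 16]
    return std::make_shared<Model>(OutputVector{mm1}, ParameterVector{q, k, v});
}

The suite pins both expected node and subgraph counts to 1, i.e. the whole chain above should end up inside one Subgraph.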
+namespace snippets { + +namespace { + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAMulAdd, + MHAMulAdd, + ::testing::Combine( + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 10, 12, 16}, {1, 10, 12, 16}, {1, 10, 12, 16}})), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({false}), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp new file mode 100644 index 00000000000000..0c731b74565863 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> inputShapesQuantized { + { + {{}, {{1, 128, 16, 64}}}, + {{}, {{1, 128, 16, 64}}}, + {{}, {{1, 16, 1, 1}}}, + {{}, {{1, 128, 16, 64}}} + }, + { + {{}, {{2, 68, 6, 92}}}, + {{}, {{2, 68, 6, 92}}}, + {{}, {{1, 1, 68, 68}}}, + {{}, {{2, 68, 6, 92}}} + }, + // K, N are static + { + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + {PartialShape{-1, -1, -1, 128}, {{1, 4, 64, 128}, {2, 2, 16, 128}, {1, 4, 72, 128}}}, + {PartialShape{-1, 128, -1, 100}, {{1, 128, 4, 100}, {2, 128, 2, 100}, {1, 128, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 16, 2, 100}, {1, 128, 3, 64}, {1, 128, 12, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 128, 2, 100}, {1, 128, 1, 64}, {1, 128, 12, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 1, 128}, {2, 1, 128, 128}, {1, 12, 1, 1}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {2, 128, 2, 100}, {1, 128, 3, 64}, {1, 128, 12, 600}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAINT8MatMul, + MHAINT8MatMul, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAQuantMatMul0, + MHAQuantMatMul0, + ::testing::Combine( + ::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // FQx2 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(4), // FQx2 on inputs + MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + 
::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAFQAfterMatMul_4D, + MHAFQAfterMatMul, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(3), // MHA + Transpose on output + Deq Mul + ::testing::Values(2), // MHA + Deq Mul + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAFQ, + MHAFQ, + ::testing::Combine(::testing::ValuesIn(inputShapesQuantized), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // The graph doesn't contain Multiply + ::testing::Values(MHA::default_thread_count), + ::testing::Values(7), // Transposex2 + Subgraphsx5 + ::testing::Values(5), // MHA + Deq Mul on output + Deqs on inputs + 2 xFQ on inputs + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp new file mode 100644 index 00000000000000..3fc1417d20b102 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_select.cpp @@ -0,0 +1,41 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +const auto& inputShapeSelect = SNIPPETS_TESTS_STATIC_SHAPES( + // without broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 12, 128, 128}, {1, 128, 12, 64}}, + {{1, 94, 12, 54}, {1, 94, 12, 54}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 12, 94, 94}, {1, 94, 12, 54}}, + // with broadcast + {{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 12, 1, 1}, {1, 12, 1, 1}, {1, 128, 12, 64}}, + {{2, 52, 6, 102}, {2, 52, 6, 102}, {1, 6, 52, 52}, {1, 6, 1, 1}, {1, 6, 1, 1}, {2, 52, 6, 102}} +); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA, + MHASelect, + ::testing::Combine(::testing::ValuesIn(inputShapeSelect), + ::testing::ValuesIn(precision_f32(6)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), // Need to support True for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(2), // Less + MHA + ::testing::Values(2), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp new file mode 100644 index 00000000000000..bb5f7fe2fa5b52 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_split_dim_m.cpp @@ -0,0 +1,121 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + 
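Before the suites in this new file: SplitDimensionM pins the thread count to 4 and expects extra Reshapes, because when the natural parallel domain (batch x heads) is too small, Snippets splits the M dimension to occupy all threads. The helper below is an invented toy version of that idea, purely for illustration; the real heuristic lives in the Snippets SplitDimensionM optimization.

#include <cstddef>
#include <utility>

// Toy heuristic: pick m_outer so that m = m_outer * m_inner and the outer
// parallel domain (parallel_work * m_outer) is a multiple of n_threads.
static std::pair<std::size_t, std::size_t> split_m_sketch(std::size_t parallel_work,
                                                          std::size_t m,
                                                          std::size_t n_threads) {
    for (std::size_t m_outer = 2; m_outer <= m; ++m_outer)
        if (m % m_outer == 0 && (parallel_work * m_outer) % n_threads == 0)
            return {m_outer, m / m_outer};
    return {1, m};  // no profitable split found
}

For the static case {1, 128, 2, 64}: parallel_work = 1 batch x 2 heads = 2, M = 128, 4 threads, so the split is {2, 64}, and the Reshapes the suite counts around the Subgraph implement exactly that re-layout.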
+#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +static ov::AnyMap enable_callback() { + return ov::AnyMap({ov::intel_cpu::snippets_mode(ov::intel_cpu::SnippetsMode::ENABLE)}); +} + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_SplitDimensionM_static, + MHA, + ::testing::Combine(::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 128, 2, 64}, {1, 128, 2, 64}, {1, 1, 1, 1}, {1, 128, 2, 64}})), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(6), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_3D_SplitDimensionM_static, + MHA, + ::testing::Combine( + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{384, 2, 64}, {384, 2, 64}, {1, 384, 384}, {384, 2, 64}})), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), + ::testing::Values(4), // 4 Threads + ::testing::Values(10), // Subgraph + 4 Reshapes on inputs and 1 Reshape on output + 4 Transposes + ::testing::Values(1), // MHA + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(enable_callback())), + MHA::getTestCaseName); + +std::vector> splitm_dynamic_shapes_4d = { + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 1, 1, 128}, {1, 1, 1, 17}, {1, 1, 1, 128}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 2, 64}, {1, 17, 2, 64}, {1, 128, 2, 64}}}, + }, + { + {PartialShape{-1, 128, -1, -1}, {{1, 128, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 128, -1}, {{1, 1, 128, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, + { + {PartialShape{-1, 32, -1, -1}, {{1, 32, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 32, -1}, {{1, 1, 32, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 64}}}, + {PartialShape{-1, -1, 16, -1}, {{1, 1, 16, 16}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 16, 2, 32}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_SplitDimensionM_dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_4d), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), + ::testing::Values(4), // 4 Threads + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +std::vector> splitm_dynamic_shapes_3d = { + { + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, -1, -1}, {{1, 1, 128}, {1, 1, 17}, {1, 1, 128}}}, + {PartialShape{-1, -1, -1}, {{128, 2, 64}, {17, 2, 64}, {128, 2, 64}}}, + }, + { + {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, + {PartialShape{-1, 2, 64}, {{128, 2, 64}, {64, 2, 64}, {128, 2, 64}}}, + {PartialShape{1, 1, -1}, {{1, 1, 128}, {1, 1, 64}, {1, 1, 128}}}, + {PartialShape{-1, 2, 64}, {{128, 
2, 64}, {64, 2, 64}, {128, 2, 64}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_3D_SplitDimensionM_dynamic, + MHA, + ::testing::Combine(::testing::ValuesIn(splitm_dynamic_shapes_3d), + ::testing::ValuesIn(precision_f32(4)), + ::testing::Values(ov::element::f32), + ::testing::Values(false), + ::testing::Values(4), // 4 Threads + ::testing::Values(5), // Subgraph + 4 Transpose + ::testing::Values(2), // MHA + one of the transposes is executed via Subgraph (because callback is disabled) + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp new file mode 100644 index 00000000000000..45260df3cab280 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_transposed_b.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> inputShapesTransposedB { + { + {{}, {{1, 12, 12, 64}}}, + {{}, {{1, 12, 48, 64}}}, + {{}, {{1, 12, 48, 64}}} + }, + { + {PartialShape{-1, 3, -1, 64}, {{1, 3, 12, 64}, {2, 3, 36, 64}}}, + {PartialShape{-1, 3, -1, 64}, {{1, 3, 14, 64}, {2, 3, 42, 64}}}, + {PartialShape{-1, 3, -1, -1}, {{1, 3, 14, 36}, {2, 3, 42, 36}}}, + }, + { + {PartialShape{2, -1, 32, -1}, {{2, 1, 32, 70}, {2, 2, 32, 96}}}, + {PartialShape{2, -1, 49, -1}, {{2, 3, 49, 70}, {2, 1, 49, 96}}}, + {PartialShape{2, -1, 49, -1}, {{2, 1, 49, 17}, {2, 2, 49, 81}}}, + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHATransposedB, + MHATransposedB, + ::testing::Combine(::testing::ValuesIn(inputShapesTransposedB), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::ValuesIn({true}), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp new file mode 100644 index 00000000000000..7876d737af2281 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> transposedShape_4D_WithMul { + { + {PartialShape{-1, -1, -1, 100}, {{1, 64, 4, 100}, {2, 16, 2, 100}, {1, 72, 4, 100}}}, + {PartialShape{-1, 200, -1, 100}, {{1, 200, 4, 100}, {2, 200, 2, 100}, {1, 200, 4, 100}}}, + {PartialShape{-1, -1, 100, 200}, {{1, 4, 100, 200}, {2, 2, 100, 200}, {1, 4, 100, 200}}}, + {PartialShape{-1, -1, -1, 200}, {{1, 4, 64, 200}, {2, 2, 16, 200}, {1, 4, 72, 200}}}, + {PartialShape{-1, 200, -1, 100}, {{1, 200, 4, 100}, {2, 
200, 2, 100}, {1, 200, 4, 100}}}, + }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 70, 3, 19}, {1, 128, 3, 64}, {1, 68, 6, 87}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 1, 64}, {2, 49, 1, 19}, {1, 128, 1, 64}, {2, 13, 6, 87}}}, + {PartialShape{1}, {{1}, {1}, {1}, {1} }}, + {PartialShape{-1, -1, -1, -1}, {{2, 1, 128, 128}, {1, 1, 70, 49}, {2, 1, 128, 128}, {1, 1, 68, 13}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 128, 3, 64}, {1, 49, 3, 19}, {1, 128, 3, 64}, {2, 13, 6, 87}}}, + }, + { + {PartialShape{-1, -1, 12, 64}, {{1, 70, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 20, 12, 64}, {1, 70, 12, 64}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {2, 10, 12, 64}, {2, 1, 12, 64}, {2, 10, 12, 64}, {1, 35, 12, 64}}}, + {PartialShape{-1, 12, 64, -1}, {{1, 12, 64, 35}, {1, 12, 64, 10}, {1, 12, 64, 10}, {1, 12, 64, 1}, {1, 12, 64, 35}}}, + {PartialShape{-1, 12, -1, -1}, {{2, 12, 70, 35}, {1, 12, 20, 10}, {1, 12, 20, 10}, {1, 12, 20, 1}, {2, 12, 70, 35}}}, + {PartialShape{-1, -1, 12, 64}, {{1, 35, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 10, 12, 64}, {1, 35, 12, 64}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_WithDynamicMul, + MHAWithDynamicMul, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D_WithMul), + ::testing::ValuesIn(precision_f32(5)), + ::testing::Values(ov::element::f32), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHAWithDynamicMul::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHA_4D_WithDynamicMul_EnforceBF16, + MHAWithDynamicMul, + ::testing::Combine(::testing::ValuesIn(transposedShape_4D_WithMul), + ::testing::ValuesIn(precision_f32(5)), + ::testing::Values(ov::element::bf16), + ::testing::Values(MHA::default_thread_count), + ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around + ::testing::Values(7), // MHA + 6 Converts around + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHAWithDynamicMul::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp new file mode 100644 index 00000000000000..0967ef27087674 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_wo_transpose.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha.hpp" + +#include "utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +namespace { + +std::vector> originalShape_4D { + { {{}, {{1, 12, 197, 64}}}, {{}, {{1, 12, 64, 197}}}, {{}, {{1, 12, 197, 64}}} }, + { {{}, {{1, 12, 12, 64}}}, {{}, {{1, 12, 64, 48}}}, {{}, {{1, 12, 48, 64}}} }, + { + {PartialShape{-1, -1, -1, -1}, {{1, 3, 128, 64}, {1, 12, 197, 100}, {1, 3, 128, 64}, {1, 12, 197, 600}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 3, 64, 128}, {1, 12, 100, 197}, {1, 3, 64, 128}, {1, 12, 600, 197}}}, + {PartialShape{-1, -1, -1, -1}, {{1, 3, 128, 64}, {1, 12, 197, 100}, {1, 3, 128, 64}, {1, 12, 197, 600}}}, + }, + { + {PartialShape{1, 4, -1, -1}, {{1, 4, 384, 64}, {1, 4, 197, 64}, {1, 4, 384, 560}}}, + {PartialShape{1, 4, -1, -1}, {{1, 4, 64, 128}, {1, 4, 64, 197}, {1, 4, 560, 
384}}}, + {PartialShape{1, 4, -1, 64}, {{1, 4, 128, 64}, {1, 4, 197, 64}, {1, 4, 384, 64}}}, + } +}; + +std::vector> originalShape_3D { + { {{}, {{12, 197, 64}}}, {{}, {{12, 64, 197}}}, {{}, {{12, 197, 64}}} }, + { {{}, {{12, 128, 100}}}, {{}, {{12, 100, 128}}}, {{}, {{12, 128, 100}}} }, + { + {PartialShape{-1, -1, 64}, {{2, 9, 64}, {1, 64, 64}, {2, 64, 64}}}, + {PartialShape{-1, 64, 124}, {{2, 64, 124}, {1, 64, 124}, {2, 64, 124}}}, + {PartialShape{-1, 124, 64}, {{2, 124, 64}, {1, 124, 64}, {2, 124, 64}}}, + }, + { + {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, + {PartialShape{-1, -1, -1}, {{1, 85, 19}, {2, 36, 40}}}, + {PartialShape{-1, -1, -1}, {{12, 19, 85}, {1, 40, 36}}}, + }, + { + {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 4, 64}, {2, 9, 64}}}, + {PartialShape{2, 64, -1}, {{2, 64, 9}, {2, 64, 4}, {2, 64, 9}}}, + {PartialShape{2, -1, 64}, {{2, 9, 64}, {2, 4, 64}, {2, 9, 64}}}, + } +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeOnInputs_4D, + MHAWOTransposeOnInputs, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::Values(std::vector{}), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTranspose_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTranspose_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(1), + ::testing::Values(1), + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeBF16_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_bf16_if_supported(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeBF16_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_bf16_if_supported(3)), + ::testing::Values(ov::element::f32), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // 
MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::empty_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeEnforceBF16_4D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_4D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::bf16), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), + MHA::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_Snippets_MHAWOTransposeEnforceBF16_3D, + MHAWOTranspose, + ::testing::Combine(::testing::ValuesIn(originalShape_3D), + ::testing::ValuesIn(precision_f32(3)), + ::testing::Values(ov::element::bf16), + ::testing::Values(true), // Need to support False for graph builder in tests + ::testing::Values(MHA::default_thread_count), + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(5), // MHA + 4 extra Converts on inputs and output + ::testing::Values(ov::test::utils::DEVICE_CPU), + ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), + MHA::getTestCaseName); + +} // namespace +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp index c05087283305e4..ea7de9ccb209ad 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/transpose_matmul.cpp @@ -6,36 +6,28 @@ #include "common_test_utils/test_constants.hpp" #include "openvino/runtime/system_conf.hpp" +#include "utils.hpp" namespace ov { namespace test { namespace snippets { -#define STATIC_SHAPES(...) 
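A note on the mha_wo_transpose.cpp suites just above: they feed MHA graphs that contain no Transpose nodes at all, so the second input already holds K^T. A minimal sketch with the first static case's dimensions (the suites' real builders are MHAWOTransposeFunction and relatives):

#include "openvino/openvino.hpp"

// Sketch only: Q x K^T -> Softmax -> x V, with K supplied pre-transposed,
// which is why the suites expect a single fused Subgraph (node count 1).
std::shared_ptr<ov::Model> make_mha_wo_transpose_sketch() {
    using namespace ov;
    auto q  = std::make_shared<op::v0::Parameter>(element::f32, PartialShape{1, 12, 197, 64});
    auto kt = std::make_shared<op::v0::Parameter>(element::f32, PartialShape{1, 12, 64, 197});
    auto v  = std::make_shared<op::v0::Parameter>(element::f32, PartialShape{1, 12, 197, 64});
    auto mm0 = std::make_shared<op::v0::MatMul>(q, kt);        // [1, 12, 197, 197]
    auto sm  = std::make_shared<op::v8::Softmax>(mm0, -1);
    auto mm1 = std::make_shared<op::v0::MatMul>(sm, v);        // [1, 12, 197, 64]
    return std::make_shared<Model>(OutputVector{mm1}, ParameterVector{q, kt, v});
}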
static_shapes_to_test_representation(std::vector>{__VA_ARGS__}) - namespace { static inline std::vector> precisions(bool only_fp32 = true) { - std::vector> prc = { - {element::f32, element::f32}, - }; -// Note: low precisions are not supported by TPP yet (ticker: 130010) + std::vector> prc = precision_f32(2); +// Note: TPP doesn't support low precisions yet #ifndef SNIPPETS_LIBXSMM_TPP if (!only_fp32) { - // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms - if (ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8()) { - prc.emplace_back(std::vector{element::i8, element::i8}); - prc.emplace_back(std::vector{element::u8, element::i8}); - } - // In Snippets MatMul BF16 is supported only on bf16/AMX platforms - if (ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16()) { - prc.emplace_back(std::vector{element::bf16, element::bf16}); - } + auto quant = quantized_precisions_if_supported(); + std::copy(quant.begin(), quant.end(), std::back_inserter(prc)); + auto bfloat = precision_bf16_if_supported(2); + std::copy(bfloat.begin(), bfloat.end(), std::back_inserter(prc)); } #endif return prc; } namespace transpose_zero_input { -const auto& transpose_input_shapes = STATIC_SHAPES({{1, 49, 2, 23}, {2, 2, 23, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{1, 49, 2, 23}, {2, 2, 23, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( ::testing::ValuesIn(transpose_input_shapes), @@ -84,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_FullyConnected, TransposeMatMul, } // namespace transpose_zero_input namespace transpose_first_input { -const auto& transpose_input_shapes = STATIC_SHAPES({{2, 1, 49, 13}, {1, 13, 3, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{2, 1, 49, 13}, {1, 13, 3, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( ::testing::ValuesIn(transpose_input_shapes), @@ -126,7 +118,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulFQ, TransposeMatMulFQ, } // namespace transpose_first_input namespace transpose_output { -const auto& transpose_input_shapes = STATIC_SHAPES({{2, 1, 49, 13}, {1, 2, 13, 39}}); +const auto& transpose_input_shapes = SNIPPETS_TESTS_STATIC_SHAPES({{2, 1, 49, 13}, {1, 2, 13, 39}}); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MatMult, TransposeMatMul, ::testing::Combine( @@ -195,7 +187,7 @@ static inline std::vector> precisions(bool only_fp32 } INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ExplicitTransposeMatMul, ExplicitTransposeMatMul, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}})), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}})), ::testing::Values(1), // Transpose on second input ::testing::ValuesIn(precisions()), ::testing::Values(MatMulType::MatMul), @@ -223,7 +215,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_DynExplicitTransposeMatMul, ExplicitTran INSTANTIATE_TEST_SUITE_P(smoke_Snippets_TransposeMatMulBias, ExplicitTransposeMatMulBias, ::testing::Combine( - ::testing::ValuesIn(STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}})), + ::testing::ValuesIn(SNIPPETS_TESTS_STATIC_SHAPES({{1, 2, 69, 43}, {2, 49, 2, 43}, {1, 1, 69, 49}})), ::testing::Values(1), // Transpose on second input ::testing::ValuesIn(precisions()), ::testing::Values(MatMulType::MatMul), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp 
b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp
new file mode 100644
index 00000000000000..6c0d54da973086
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/utils.hpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "internal_properties.hpp"
+#include "utils/cpu_test_utils.hpp"
+#include "openvino/runtime/system_conf.hpp"
+
+namespace ov {
+namespace test {
+namespace snippets {
+
+#define SNIPPETS_TESTS_STATIC_SHAPES(...) static_shapes_to_test_representation(std::vector<std::vector<ov::Shape>>{__VA_ARGS__})
+
+static inline bool is_bf16_supported_by_brgemm() {
+    return ov::with_cpu_x86_bfloat16() || ov::with_cpu_x86_avx512_core_amx_bf16();
+}
+
+static inline bool is_i8_supported_by_brgemm() {
+    return ov::with_cpu_x86_avx512_core_vnni() || ov::with_cpu_x86_avx512_core_amx_int8();
+}
+
+static inline std::vector<std::vector<ov::element::Type>> precision_f32(size_t count) {
+    std::vector<std::vector<ov::element::Type>> prc;
+    prc.emplace_back(std::vector<ov::element::Type>(count, element::f32));
+    return prc;
+}
+
+static inline std::vector<std::vector<ov::element::Type>> precision_bf16_if_supported(size_t count) {
+    std::vector<std::vector<ov::element::Type>> prc;
+    if (is_bf16_supported_by_brgemm())
+        prc.emplace_back(std::vector<ov::element::Type>(count, element::bf16));
+    return prc;
+}
+
+static inline std::vector<std::vector<ov::element::Type>> quantized_precisions_if_supported() {
+    std::vector<std::vector<ov::element::Type>> prc = {};
+    // In Snippets MatMul INT8 is supported only on VNNI/AMX platforms
+    if (is_i8_supported_by_brgemm()) {
+        prc.emplace_back(std::vector<ov::element::Type>{element::i8, element::i8});
+        prc.emplace_back(std::vector<ov::element::Type>{element::u8, element::i8});
+    }
+    return prc;
+}
+
+}  // namespace snippets
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/functional/plugin/shared/include/snippets/mha.hpp b/src/tests/functional/plugin/shared/include/snippets/mha.hpp
index f8198dee0218ee..34cb4d452bfb15 100644
--- a/src/tests/functional/plugin/shared/include/snippets/mha.hpp
+++ b/src/tests/functional/plugin/shared/include/snippets/mha.hpp
@@ -44,6 +44,7 @@ class MHABase : virtual public SnippetsTestsCommon {
     void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
     virtual std::shared_ptr<SnippetsFunctionBase> get_subgraph() const = 0;
     virtual void init_params(std::vector<InputShape>& input_shapes, ov::element::Type& prc, ov::AnyMap& additional_config) = 0;
+    virtual void init_thresholds();
 
     size_t m_thread_count;
     std::vector<ov::element::Type> m_input_types;
@@ -88,6 +89,7 @@ class MHATransposedB : public MHA {
 class MHAINT8MatMul : public MHA {
 protected:
     std::shared_ptr<SnippetsFunctionBase> get_subgraph() const override;
+    void init_thresholds() override;
 };
 
 class MHAQuantMatMul0 : public MHA {
@@ -103,6 +105,7 @@ class MHAFQAfterMatMul : public MHA {
 class MHAFQ : public MHA {
 protected:
     std::shared_ptr<SnippetsFunctionBase> get_subgraph() const override;
+    void init_thresholds() override;
 };
 
 class MHAWithExtractedReshape : public MHA {
diff --git a/src/tests/functional/plugin/shared/src/snippets/mha.cpp b/src/tests/functional/plugin/shared/src/snippets/mha.cpp
index 351cd50856357d..8d0cb8613bc47e 100644
--- a/src/tests/functional/plugin/shared/src/snippets/mha.cpp
+++ b/src/tests/functional/plugin/shared/src/snippets/mha.cpp
@@ -53,15 +53,19 @@ void MHABase::SetUp() {
         configuration.insert({"SNIPPETS_MODE", "IGNORE_CALLBACK"});
     }
 
-    setInferenceType(prc);
     inType = outType = prc;
+    setInferenceType(prc);
+    init_thresholds();
+}
+
+void MHABase::init_thresholds() {
     // Note: Libxsmm calculates Exp in a slightly different way, so the abs values might differ a bit.
Ticket: 130699 #ifdef SNIPPETS_LIBXSMM_TPP abs_threshold = 1e-6; #endif - if (prc == ov::element::bf16) + if (inType == ov::element::bf16) rel_threshold = 0.05f; -} + } std::string MHA::getTestCaseName(testing::TestParamInfo obj) { std::vector input_shapes; @@ -194,6 +198,11 @@ std::shared_ptr MHAINT8MatMul::get_subgraph() const { return std::make_shared(inputDynamicShapes); } +void MHAINT8MatMul::init_thresholds() { + MHABase::init_thresholds(); + abs_threshold = 4e-6; +} + std::shared_ptr MHAQuantMatMul0::get_subgraph() const { return std::make_shared(inputDynamicShapes); } @@ -206,6 +215,11 @@ std::shared_ptr MHAFQ::get_subgraph() const { return std::make_shared(inputDynamicShapes); } +void MHAFQ::init_thresholds() { + MHABase::init_thresholds(); + abs_threshold = 0.016; +} + std::shared_ptr MHAMulAdd::get_subgraph() const { return std::make_shared(inputDynamicShapes); } diff --git a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp index 90ab47214effee..f54f92c598a45f 100644 --- a/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp +++ b/src/tests/ov_helpers/ov_snippets_models/include/subgraph_mha.hpp @@ -235,9 +235,7 @@ class MHAWOTransposeSplitMFunction : public MHAWOTransposeFunction { * FakeQuantize i8 * \ / * Add - * Reshape0 - * Softmax - * Reshape1 Transpose2[0,2,1,3] + * Softmax Transpose2[0,2,1,3] * \ / * MatMul1 * FakeQuantize i8 @@ -261,9 +259,7 @@ class MHAFQAfterMatMulFunction : public SnippetsFunctionBase { * FakeQuantize i8 * \ / * Add - * Reshape0 - * Softmax - * Reshape1 FakeQuantize i8 + * Softmax FakeQuantize i8 * FakeQuantize u8 Transpose2[0,2,1,3] * \ / * MatMul1 @@ -281,20 +277,17 @@ class MHAINT8MatMulFunction : public SnippetsFunctionBase { }; /* Graph: - * FakeQuantize i8 Reshape1 - * Reshape0 Transpose1[0,2,3,1] + * FakeQuantize i8 Transpose1[0,2,3,1] * Transpose0[0,2,1,3] FakeQuantize i8 * \ / * MatMul0 * \ / - * Add Reshape2 + * Add * Softmax Transpose2[0,2,1,3] * \ / * MatMul1 * FakeQuantize i8 * Transpose3[0,2,1,3] - * Reshape3 - * Note: Reshapes are tosplit Tokenization between FQs and deq Mul and MHA since Snippets::Ignore_Callback may be enabled */ class MHAQuantMatMul0Function : public SnippetsFunctionBase { public: diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index 1dbf8d7d22ed26..34f42ec838aa6d 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -598,38 +598,25 @@ std::shared_ptr MHAFQAfterMatMulFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), 
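The threshold plumbing above is worth spelling out: SetUp() fixes inType first and then calls the virtual init_thresholds(), so each derived MHA test adjusts tolerances without re-implementing SetUp(). A condensed sketch of the pattern (the 0.05 and 4e-6 values come from this patch; the f32 defaults here are placeholders, and the real classes derive from SnippetsTestsCommon):

#include "openvino/core/type/element_type.hpp"

struct MHABaseSketch {
    double abs_threshold = 1e-8;   // placeholder default
    double rel_threshold = 1e-4;   // placeholder default
    ov::element::Type inType = ov::element::f32;
    virtual ~MHABaseSketch() = default;
    // Called at the end of SetUp(), once the inference type is known.
    virtual void init_thresholds() {
        if (inType == ov::element::bf16)
            rel_threshold = 0.05;  // bf16 keeps only ~8 mantissa bits
    }
};

struct MHAINT8MatMulSketch : MHABaseSketch {
    void init_thresholds() override {
        MHABaseSketch::init_thresholds();  // keep the base adjustments
        abs_threshold = 4e-6;              // absorb FakeQuantize rounding noise
    }
};

MHAFQ does the same with abs_threshold = 0.016, since its graph accumulates a much larger dequantization error.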
- -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {reshape0ConstData.size()}, reshape0ConstData); - - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); auto fq0 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto add = std::make_shared(fq0, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); + const auto softMax = std::make_shared(add, -1); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); - const auto matMul1 = std::make_shared(reshape1, transpose2, transA, transB); + const auto matMul1 = std::make_shared(softMax, transpose2, transA, transB); auto fq1 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto transpose3 = std::make_shared(fq1, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -642,46 +629,33 @@ std::shared_ptr MHAINT8MatMulFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {reshape0ConstData.size()}, reshape0ConstData); - - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, - {-35.0172004}, 
{34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); auto fq2 = ov::test::utils::make_fake_quantize(transpose2Param, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); bool transA = false; bool transB = false; const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto transpose1 = std::make_shared(fq1, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); auto fq3 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto add = std::make_shared(fq3, addParam); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); - auto fq4 = ov::test::utils::make_fake_quantize(reshape1, ov::element::f32, 256, {1}, - {0}, {0.820726}, {0}, {0.820726}); + const auto softMax = std::make_shared(add, -1); + auto fq4 = ov::test::utils::make_fake_quantize(softMax, ov::element::f32, 256, {1}, + {0}, {0.820726}, {0}, {0.820726}); const auto transpose2 = std::make_shared(fq2, transpose2Const); const auto matMul1 = std::make_shared(fq4, transpose2, transA, transB); auto fq5 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); + {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); const auto transpose3 = std::make_shared(fq5, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -694,34 +668,20 @@ std::shared_ptr MHAQuantMatMul0Function::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto channel = int64_t(12); - const auto last_dim = input_shapes[0].get_shape().back(); - OPENVINO_ASSERT(last_dim % channel == 0, "Incorrect test configuration"); - const auto new_shape = std::vector{0, 0, channel, static_cast(last_dim) / channel}; - - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape2Const = ov::op::v0::Constant::create(ov::element::i64, {new_shape.size()}, new_shape); - auto reshape3Const = ov::op::v0::Constant::create(ov::element::i64, {input_shapes[0].size()}, std::vector{0, 0, -1}); - - auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 3, 1}); - auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); - - const auto reshape1 = std::make_shared(transpose1Param, reshape1Const, true); - const auto reshape2 = std::make_shared(transpose2Param, reshape2Const, true); + const auto shape_rank = input_shapes[0].size(); + auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 
2, 3, 1}); + auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); + auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - const auto transpose1 = std::make_shared(reshape1, transpose1Const); - const auto transpose2 = std::make_shared(reshape2, transpose2Const); + const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); + const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-12.5187311}, {12.4209289}, {-12.5187311}, {12.4209289}); + {-12.5187311}, {12.4209289}, {-12.5187311}, {12.4209289}); auto fq1 = ov::test::utils::make_fake_quantize(transpose1, ov::element::f32, 256, {1}, - {-1.43326699}, {1.42206954}, {-1.43326699}, {1.42206954}); - - const auto reshape0 = std::make_shared(fq0, reshape0Const, true); - const auto transpose0 = std::make_shared(reshape0, transpose0Const); + {-1.43326699}, {1.42206954}, {-1.43326699}, {1.42206954}); + const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto matMul0 = std::make_shared(transpose0, fq1); const auto add = std::make_shared(matMul0, addParam); @@ -729,11 +689,10 @@ std::shared_ptr MHAQuantMatMul0Function::initOriginal() const { const auto matMul1 = std::make_shared(softMax, transpose2); auto fq2 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, - {-1.81826221}, {1.804057}, {-1.81826221}, {1.804057}); + {-1.81826221}, {1.804057}, {-1.81826221}, {1.804057}); const auto transpose3 = std::make_shared(fq2, transpose3Const); - const auto reshape3 = std::make_shared(transpose3, reshape3Const, true); - ov::ResultVector results{std::make_shared(reshape3)}; + ov::ResultVector results{std::make_shared(transpose3)}; return std::make_shared(results, ngraphParam, "mha"); } std::shared_ptr MHAFQFunction::initOriginal() const { @@ -743,18 +702,15 @@ std::shared_ptr MHAFQFunction::initOriginal() const { auto transpose2Param = std::make_shared(precision, input_shapes[3]); ov::ParameterVector ngraphParam = {transpose0Param, transpose1Param, addParam, transpose2Param}; - const auto shape_rank = input_shapes[0].get_shape().size(); + const auto shape_rank = input_shapes[0].size(); auto transpose0Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose1Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 3, 1}); auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - const auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, - {-5.217694}, {6.661877}, {-5.217694}, {6.661877}); - const auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, - {-6.40245}, {6.45286}, {-6.40245}, {6.45286}); - const auto fq_add = ov::test::utils::make_fake_quantize(addParam, ov::element::f32, 256, {1}, - {-1000}, {0}, {-1000}, {0}); + const auto fq0 = ov::test::utils::make_fake_quantize(transpose0Param, ov::element::f32, 256, {1}, {-5.217694}, {6.661877}, {-5.217694}, {6.661877}); + const auto fq1 = ov::test::utils::make_fake_quantize(transpose1Param, ov::element::f32, 256, {1}, {-6.40245}, {6.45286}, {-6.40245}, {6.45286}); + const auto fq_add = 
ov::test::utils::make_fake_quantize(addParam, ov::element::f32, 256, {1}, {-1000}, {0}, {-1000}, {0});
 
     bool transA = false;
     bool transB = false;
@@ -766,16 +722,13 @@ std::shared_ptr MHAFQFunction::initOriginal() const {
     const auto mul_deq_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, std::vector{0.00098425});
     const auto mul_deq = std::make_shared(convert, mul_deq_const);
     const auto mul = std::make_shared(transpose1, mul_deq);
-    auto fq1_1 = ov::test::utils::make_fake_quantize(mul, ov::element::f32, 256, {1},
-                                                     {-0.8003067}, {0.8066083}, {-0.8003067}, {0.8066083});
+    const auto fq1_1 = ov::test::utils::make_fake_quantize(mul, ov::element::f32, 256, {1}, {-0.8003067}, {0.8066083}, {-0.8003067}, {0.8066083});
     const auto matMul0 = std::make_shared(transpose0, fq1_1, transA, transB);
-    auto fq2 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1},
-                                                   {-14.50351}, {17.65645}, {-14.50351}, {17.65645});
+    const auto fq2 = ov::test::utils::make_fake_quantize(matMul0, ov::element::f32, 256, {1}, {-14.50351}, {17.65645}, {-14.50351}, {17.65645});
     const auto add = std::make_shared(fq2, fq_add);
     const auto softMax = std::make_shared(add, 3);
     const auto matMul1 = std::make_shared(softMax, transpose2, transA, transB);
-    auto fq3 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1},
-                                                   {-1.895786}, {2.0028071}, {-1.895786}, {2.0028071});
+    auto fq3 = ov::test::utils::make_fake_quantize(matMul1, ov::element::f32, 256, {1}, {-1.895786}, {2.0028071}, {-1.895786}, {2.0028071});
     const auto transpose3 = std::make_shared(fq3, transpose3Const);
 
     ov::ResultVector results{std::make_shared(transpose3)};

From f2640a2d7ee57432b66b085540709904c8525afb Mon Sep 17 00:00:00 2001
From: Wenjing Kang
Date: Thu, 31 Oct 2024 17:41:31 +0800
Subject: [PATCH 079/120] Update CMAKE_LANG_FLAGS_CONFIG_INIT appending in toolchain to avoid flag repetition (#27352)

### Details:
- Currently, when using this toolchain, printing the following flags in [CMakeLists](https://github.com/openvinotoolkit/openvino/blob/master/CMakeLists.txt) gives:
```
CMAKE_CXX_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre
CMAKE_C_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre
CMAKE_CXX_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG
CMAKE_C_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /MT /O2 /Ob2 /DNDEBUG
```
So the `/MT` flag is repeated. The change in this PR fixes this problem.
The flags will be: ``` CMAKE_CXX_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_C_FLAGS_RELEASE in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG /Zi /FS /Zf /ZH:SHA_256 /guard:cf /Qspectre CMAKE_CXX_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG CMAKE_C_FLAGS_RELEASE_INIT in OV CMakeLists.txt::: /MT /O2 /Ob2 /DNDEBUG ``` ### Tickets: - *152927* Signed-off-by: Kang Wenjing --- cmake/toolchains/mt.runtime.win32.toolchain.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/toolchains/mt.runtime.win32.toolchain.cmake b/cmake/toolchains/mt.runtime.win32.toolchain.cmake index 9a99781eac0426..b331d370bfe7bf 100644 --- a/cmake/toolchains/mt.runtime.win32.toolchain.cmake +++ b/cmake/toolchains/mt.runtime.win32.toolchain.cmake @@ -28,9 +28,9 @@ if(use_static_runtime) set(flag_var "CMAKE_${lang}_FLAGS${build_type}_INIT") string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") if (build_type STREQUAL "_DEBUG") - set(${flag_var} "${${flag_var}} /MTd") + set(${flag_var} "/MTd") else() - set(${flag_var} "${${flag_var}} /MT") + set(${flag_var} "/MT") endif() endforeach() endforeach() From 272843d81ad242f2622b8951d922baa299ccdfc1 Mon Sep 17 00:00:00 2001 From: Artemy Skrebkov Date: Thu, 31 Oct 2024 09:44:05 +0000 Subject: [PATCH 080/120] Add support for shape and data_shape parameters (#27314) ### Details: - Move helper function for reshaping to `npu_tools_utils` - Introduce `shape` and `data_shape` params ### Tickets: - E144161 --------- Signed-off-by: Skrebkov, Artemy --- .../tools/common/include/tools_helpers.hpp | 181 ++++++++++++++++++ .../tools/compile_tool/CMakeLists.txt | 3 +- .../intel_npu/tools/compile_tool/main.cpp | 109 +---------- .../tools/compile_tool/tools_helpers.hpp | 81 -------- .../tools/single-image-test/main.cpp | 132 +++++-------- 5 files changed, 236 insertions(+), 270 deletions(-) create mode 100644 src/plugins/intel_npu/tools/common/include/tools_helpers.hpp delete mode 100644 src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp diff --git a/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp b/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp new file mode 100644 index 00000000000000..e9743594ad8711 --- /dev/null +++ b/src/plugins/intel_npu/tools/common/include/tools_helpers.hpp @@ -0,0 +1,181 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include "openvino/openvino.hpp" + +struct InputInfo { + ov::element::Type type; + ov::PartialShape partialShape; + ov::Shape dataShape; + ov::Layout layout; +}; +using InputsInfo = std::map; + +std::string parameterNameToTensorName(std::string& name, std::vector>& inputs_info) { + auto count_name = std::any_of(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return port.get_names().count(name) > 0; + }); + if (count_name) { + return name; + } else { + auto inputInfo = std::find_if(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { + return name == port.get_node()->get_friendly_name(); + }); + if (inputInfo == inputs_info.end()) { + throw std::runtime_error("Provided I/O name \"" + name + + "\" is not found neither in tensor names nor in nodes names."); + } + return inputInfo->get_any_name(); + } +} + +std::map> parseInputParameters(std::string& parameter_string, + std::vector>& input_info) { + // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to 
all + // inputs) + std::map> return_value; + std::string search_string = parameter_string; + auto start_pos = search_string.find_first_of('['); + auto input_name = search_string.substr(0, start_pos); + while (start_pos != std::string::npos) { + auto end_pos = search_string.find_first_of(']'); + if (end_pos == std::string::npos) + break; + input_name = search_string.substr(0, start_pos); + auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); + if (!input_name.empty()) { + return_value[parameterNameToTensorName(input_name, input_info)].push_back(input_value); + } else { + for (auto& item : input_info) { + return_value[item.get_any_name()].push_back(input_value); + } + } + search_string = search_string.substr(end_pos + 1); + if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) + break; + if (search_string.front() == ',') { + if (search_string.length() > 1) + search_string = search_string.substr(1); + else + throw std::logic_error("Can't parse input parameter string, there is nothing after the comma " + + parameter_string); + } + start_pos = search_string.find_first_of('['); + } + if (!search_string.empty()) + throw std::logic_error("Can't parse input parameter string: " + parameter_string); + return return_value; +} + +void boundDynamicShape(std::shared_ptr& model) { + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + if (shape.is_static()) { + continue; + } + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + " Setting batch to 1 forcibly" + << std::endl; + ov::set_batch(model, 1); + } + shape = item->get_partial_shape(); + if (shape.is_dynamic()) { + throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + + " is dynamic which is not supported by NPU"); + } + } +} + +void setModelBatch(std::shared_ptr& model, uint32_t batch = 1) { + if (batch == 1) { + return; + } + for (auto&& item : model->get_parameters()) { + auto shape = item->get_partial_shape(); + auto rank = shape.rank(); + if (rank.is_dynamic()) { + throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + + "\" is dynamic which is not supported by NPU"); + } + auto layout = item->get_layout(); + if (!ov::layout::has_batch(layout)) { + item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); + layout = item->get_layout(); + } + if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { + throw std::logic_error("ERROR: Shape \"" + shape.to_string() + "\"" + + " has dynamic batch size which is not supported by NPU\n" + "Cannot apply fixed batch: " + + std::to_string(batch) + + ". 
Please remove the parameter from config: \"override_model_batch_size\""); + } + ov::set_batch(model, batch); + } +} + +void reshape(ov::OutputVector inputsInfo, InputsInfo& infoMap, std::shared_ptr& model, + std::string& shapeString, int overrideModelBatchSize, std::string_view device) { + std::vector infoMaps; + if (!shapeString.empty()) { + std::map> shapesMap = parseInputParameters(shapeString, inputsInfo); + + if (overrideModelBatchSize != 1) { + throw std::logic_error(R"(Incompatible params: "shape" and "override_model_batch_size")"); + } + for (auto& item : inputsInfo) { + InputInfo info; + auto name = item.get_any_name(); + + if (!shapesMap.empty()) { + if (shapesMap.count(name)) { + if (shapesMap.at(name).size() > 1) { + // Example: -shape input1[..][..] + throw std::logic_error("shape command line parameter doesn't support multiple " + "shapes for one input."); + } + info.partialShape = shapesMap.at(name)[0]; + } else { + info.partialShape = item.get_partial_shape(); + } + } + infoMap[name] = std::move(info); + infoMaps.push_back(infoMap); + } + std::map newShapes; + for (auto& item : infoMaps) { + for (auto& map : item) { + if (!newShapes.count(map.first)) { + newShapes[map.first] = map.second.partialShape; + } + } + } + model->reshape(newShapes); + } else { + if (device.find("NPU") != std::string::npos || + // FIXME: SIT on CPU also requires to bound dynamic shapes + device.find("CPU") != std::string::npos || device.find("TEMPLATE") != std::string::npos) { + boundDynamicShape(model); + } + + setModelBatch(model, overrideModelBatchSize); + } +} diff --git a/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt index 66ff751b9f5162..fc485030359428 100644 --- a/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt +++ b/src/plugins/intel_npu/tools/compile_tool/CMakeLists.txt @@ -24,7 +24,8 @@ ov_add_target(ADD_CPPLINT PRIVATE openvino::runtime gflags - Threads::Threads) + Threads::Threads + npu_tools_utils) set_target_properties(${TARGET_NAME} PROPERTIES FOLDER ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/src/plugins/intel_npu/tools/compile_tool/main.cpp b/src/plugins/intel_npu/tools/compile_tool/main.cpp index 471fd55bb82b3f..7a088d1afc69e2 100644 --- a/src/plugins/intel_npu/tools/compile_tool/main.cpp +++ b/src/plugins/intel_npu/tools/compile_tool/main.cpp @@ -14,11 +14,12 @@ #include -#include "openvino/core/partial_shape.hpp" -#include "openvino/openvino.hpp" +#include +#include #include "tools_helpers.hpp" + static constexpr char help_message[] = "Optional. Print the usage message."; static constexpr char model_message[] = "Required. 
Path to the XML model."; @@ -168,64 +169,6 @@ bool isFP32(const ov::element::Type& type) { return type == ov::element::f32; } -void boundDynamicShape(std::shared_ptr& model) { - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - if (shape.is_static()) { - continue; - } - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by NPU"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by NPU\n" - " Setting batch to 1 forcibly" - << std::endl; - ov::set_batch(model, 1); - } - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by NPU"); - } - } -} - -void setModelBatch(std::shared_ptr& model, uint32_t batch = 1) { - if (batch == 1) { - return; - } - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by NPU"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - throw std::logic_error("ERROR: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by NPU\n" - "Cannot apply fixed batch: " + - std::to_string(batch) + - ". Please remove the parameter from config: \"override_model_batch_size\""); - } - ov::set_batch(model, batch); - } -} - void configurePrePostProcessing(std::shared_ptr& model, const std::string& ip, const std::string& op, const std::string& iop, const std::string& il, const std::string& ol, const std::string& iol, const std::string& iml, const std::string& oml, @@ -475,50 +418,6 @@ std::string getFileNameFromPath(const std::string& path, using TimeDiff = std::chrono::milliseconds; -void reshape(ov::OutputVector inputs_info, InputsInfo& info_map, std::shared_ptr& model) { - std::vector info_maps; - if (!FLAGS_shape.empty()) { - std::map> shapes_map = parseInputParameters(FLAGS_shape, inputs_info); - - if (FLAGS_override_model_batch_size != 1) { - throw std::logic_error("Incompatible params: \"shape\" and \"override_model_batch_size\""); - } - for (auto& item : inputs_info) { - InputInfo info; - auto name = item.get_any_name(); - - if (!shapes_map.empty()) { - if (shapes_map.count(name)) { - if (shapes_map.at(name).size() > 1) { - // Example: -shape input1[..][..] 
- throw std::logic_error("shape command line parameter doesn't support multiple " - "shapes for one input."); - } - info.partialShape = shapes_map.at(name)[0]; - } else { - info.partialShape = item.get_partial_shape(); - } - } - info_map[name] = std::move(info); - info_maps.push_back(info_map); - } - std::map newShapes; - for (auto& item : info_maps) { - for (auto& map : item) { - if (!newShapes.count(map.first)) { - newShapes[map.first] = map.second.partialShape; - } - } - } - model->reshape(newShapes); - } else { - if (FLAGS_d.find("NPU") != std::string::npos) { - boundDynamicShape(model); - } - - setModelBatch(model, FLAGS_override_model_batch_size); - } -} int main(int argc, char* argv[]) { try { @@ -552,7 +451,7 @@ int main(int argc, char* argv[]) { InputsInfo info_map; std::cout << "Performing reshape" << std::endl; - reshape(std::move(inputs_info), info_map, model); + reshape(std::move(inputs_info), info_map, model, FLAGS_shape, FLAGS_override_model_batch_size, FLAGS_d); std::cout << "Configuring model pre & post processing" << std::endl; configurePrePostProcessing(model, FLAGS_ip, FLAGS_op, FLAGS_iop, FLAGS_il, FLAGS_ol, FLAGS_iol, FLAGS_iml, diff --git a/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp b/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp deleted file mode 100644 index 6d42fd142b8971..00000000000000 --- a/src/plugins/intel_npu/tools/compile_tool/tools_helpers.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "openvino/openvino.hpp" - -struct InputInfo { - ov::element::Type type; - ov::PartialShape partialShape; - ov::Shape dataShape; - ov::Layout layout; -}; -using InputsInfo = std::map; - -std::string parameterNameToTensorName(std::string& name, std::vector>& inputs_info) { - auto count_name = std::any_of(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { - return port.get_names().count(name) > 0; - }); - if (count_name) { - return name; - } else { - auto inputInfo = std::find_if(inputs_info.begin(), inputs_info.end(), [name](ov::Output& port) { - return name == port.get_node()->get_friendly_name(); - }); - if (inputInfo == inputs_info.end()) { - throw std::runtime_error("Provided I/O name \"" + name + - "\" is not found neither in tensor names nor in nodes names."); - } - return inputInfo->get_any_name(); - } -} - -std::map> parseInputParameters(std::string& parameter_string, - std::vector>& input_info) { - // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to all - // inputs) - std::map> return_value; - std::string search_string = parameter_string; - auto start_pos = search_string.find_first_of('['); - auto input_name = search_string.substr(0, start_pos); - while (start_pos != std::string::npos) { - auto end_pos = search_string.find_first_of(']'); - if (end_pos == std::string::npos) - break; - input_name = search_string.substr(0, start_pos); - auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); - if (!input_name.empty()) { - return_value[parameterNameToTensorName(input_name, input_info)].push_back(input_value); - } else { - for (auto& item : input_info) { - return_value[item.get_any_name()].push_back(input_value); - } - } - search_string = search_string.substr(end_pos + 1); - if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) - break; - if 
(search_string.front() == ',') { - if (search_string.length() > 1) - search_string = search_string.substr(1); - else - throw std::logic_error("Can't parse input parameter string, there is nothing after the comma " + - parameter_string); - } - start_pos = search_string.find_first_of('['); - } - if (!search_string.empty()) - throw std::logic_error("Can't parse input parameter string: " + parameter_string); - return return_value; -} diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp index 4018982b022ed3..5658c18650243b 100644 --- a/src/plugins/intel_npu/tools/single-image-test/main.cpp +++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp @@ -4,9 +4,11 @@ // #include "image_quality_helper.hpp" +#include "openvino/core/partial_shape.hpp" #include "semantic_segmentation_helpers.hpp" #include "tensor_utils.hpp" #include "yolo_helpers.hpp" +#include "tools_helpers.hpp" #include #include @@ -31,7 +33,8 @@ using TensorMap = std::map; struct TensorDescriptor { ov::element::Type precision; - ov::Shape shape; + ov::PartialShape shape; + ov::Shape dataShape; ov::Layout layout; }; @@ -83,6 +86,15 @@ DEFINE_string(oml, "", " is supported"); DEFINE_bool(img_as_bin, false, "Force binary input even if network expects an image"); DEFINE_bool(pc, false, "Report performance counters"); +DEFINE_string( + shape, "", + "Optional. Set shape for model input. For example, \"input1[1,3,224,224],input2[1,4]\" or \"[1,3,224,224]\"" + " in case of one input size. This parameter affects model input shape and can be dynamic." + " For dynamic dimensions use symbol `?` or '-1'. Ex. [?,3,?,?]." + " For bounded dimensions specify range 'min..max'. Ex. [1..10,3,?,?]."); +DEFINE_string(data_shape, "", + "Required for models with dynamic shapes. Set shape for input blobs. Only one shape can be set." 
+ "In case of one input size: \"[1,3,224,224]\""); // for using input image mean and scale static constexpr char mean_values_message[] = @@ -1450,65 +1462,6 @@ std::pair runInfer(ov::InferRequest& inferRequest, ov::Compi return std::make_pair(out, profData); } -void boundDynamicShape(std::shared_ptr& model) { - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - if (shape.is_static()) { - continue; - } - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by SIT"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - layout = item->get_layout(); - } - if (shape[ov::layout::batch_idx(layout)].is_dynamic()) { - std::cout << "WARNING: Shape \"" + shape.to_string() + "\"" + - " has dynamic batch size which is not supported by SIT\n" - " Setting batch to 1 forcibly" - << std::endl; - ov::set_batch(model, 1); - } - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by SIT"); - } - } -} - -void setModelBatch(std::shared_ptr& model, uint32_t batch) { - if (batch == 1) { - return; - } - - // New batch value is applicable if the model has non dynamic inputs/outputs only - // Amend layout by adding N if it has no batch dimension - for (auto&& item : model->get_parameters()) { - auto shape = item->get_partial_shape(); - auto rank = shape.rank(); - if (rank.is_dynamic()) { - throw std::logic_error("Rank \"" + rank.to_string() + "\" of the shape \"" + shape.to_string() + - "\" is dynamic which is not supported by SIT"); - } - auto layout = item->get_layout(); - if (!ov::layout::has_batch(layout)) { - item->set_layout(ov::Layout(layout.to_string().insert(1, "N,"))); - } - - shape = item->get_partial_shape(); - if (shape.is_dynamic()) { - throw std::logic_error("Model's input shape \"" + shape.to_string() + "\"" + - " is dynamic which is not supported by SIT"); - } - } - ov::set_batch(model, batch); -} - // FIXME: User must provide layout explicitly. // No "default" layout for IRv11 models. 
static ov::Layout getLayoutByRank(const size_t rank) { @@ -1558,8 +1511,8 @@ bool testSSDDetection(const TensorMap& outputs, const TensorMap& references, const ov::Tensor& reference = references.begin()->second; const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); auto confThresh = FLAGS_confidence_threshold; auto probTolerance = FLAGS_prob_tolerance; @@ -1592,8 +1545,8 @@ bool testYoloV2(const TensorMap& outputs, const TensorMap& references, const Ten const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; double boxTolerance = FLAGS_box_tolerance; @@ -1624,8 +1577,8 @@ bool testYoloV3(const TensorMap& outputs, const TensorMap& references, const Ten "Mismatch between the number of model outputs and the number of references"); const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; @@ -1663,8 +1616,8 @@ bool testYoloV4(const TensorMap& outputs, const TensorMap& references, const Ten "Mismatch between the number of model outputs and the number of references"); const TensorDescriptor& inputDescriptor = inputDescriptors.begin()->second; - const auto imgWidth = inputDescriptor.shape.at(ov::layout::width_idx(inputDescriptor.layout)); - const auto imgHeight = inputDescriptor.shape.at(ov::layout::height_idx(inputDescriptor.layout)); + const auto imgWidth = inputDescriptor.dataShape.at(ov::layout::width_idx(inputDescriptor.layout)); + const auto imgHeight = inputDescriptor.dataShape.at(ov::layout::height_idx(inputDescriptor.layout)); double confThresh = FLAGS_confidence_threshold; double probTolerance = FLAGS_prob_tolerance; @@ -1733,6 +1686,16 @@ bool testMeanIoU(const TensorMap& outputs, const TensorMap& references, const La return compare_mean_IoU(iou, semSegThreshold, classes); } +static ov::Shape parseDataShape(const std::string& dataShapeStr) { + std::vector dataShape; + std::istringstream ss(dataShapeStr); + std::string token; + while (std::getline(ss, token, ',')) { + dataShape.push_back(std::stoul(token)); + } + return ov::Shape(dataShape); +} + static int runSingleImageTest() { std::cout << "Run single image test" << std::endl; try { @@ -1814,12 +1777,12 @@ static 
int runSingleImageTest() { auto model = core.read_model(FLAGS_network); nameIOTensors(model); - setModelBatch(model, FLAGS_override_model_batch_size); - if (FLAGS_device.find("NPU") != std::string::npos || - // FIXME: SIT on CPU also requires to bound dynamic shapes - FLAGS_device.find("CPU") != std::string::npos || FLAGS_device.find("TEMPLATE") != std::string::npos) { - boundDynamicShape(model); - } + auto inputs_info = std::const_pointer_cast(model)->inputs(); + InputsInfo info_map; + + std::cout << "Performing reshape" << std::endl; + reshape(std::move(inputs_info), info_map, model, FLAGS_shape, + FLAGS_override_model_batch_size, FLAGS_device); ov::preprocess::PrePostProcessor ppp(model); @@ -1856,11 +1819,11 @@ static int runSingleImageTest() { inModelLayout.has_value()) { inLayerModelLayout = inModelLayout.value(); } else { - const auto shape = inputInfo[i].get_shape(); + const auto shape = inputInfo[i].get_partial_shape(); inLayerModelLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Configuring preprocessing. Since --iml option isn't set, input model " "layout for layer \"" - << inputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << inputInfo[i].get_any_name() << "\" is infered from shape: " << shape.to_string() << " rank (" << shape.size() << ") as " << inLayerModelLayout.to_string() << std::endl; } @@ -1917,11 +1880,11 @@ static int runSingleImageTest() { outModelLayout.has_value()) { outLayerModelLayout = outModelLayout.value(); } else { - const auto shape = outputInfo[i].get_shape(); + const auto shape = outputInfo[i].get_partial_shape(); outLayerModelLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Configuring preprocessing. Since --oml option isn't set, output model " "layout for layer \"" - << outputInfo[i].get_any_name() << "\" is infered from shape: " << toString(shape) + << outputInfo[i].get_any_name() << "\" is infered from shape: " << shape.to_shape() << " rank (" << shape.size() << ") as " << outLayerModelLayout.to_string() << std::endl; } @@ -1933,6 +1896,7 @@ static int runSingleImageTest() { } } + std::cout << "Compile model" << std::endl; compiledModel = core.compile_model(ppp.build(), FLAGS_device); } else { std::cout << "Import network " << FLAGS_network << std::endl; @@ -1994,7 +1958,8 @@ static int runSingleImageTest() { // Load the input data for (const auto& inputInfo : inputsInfo) { - const ov::Shape& shape = inputInfo.get_shape(); + const auto& shape = inputInfo.get_partial_shape(); + const auto dataShape = shape.is_static() ? shape.get_shape() : parseDataShape(FLAGS_data_shape); const ov::element::Type& precision = inputInfo.get_element_type(); // Determine the input layout @@ -2012,19 +1977,20 @@ static int runSingleImageTest() { inputLayout = getLayoutByRank(shape.size()); std::cout << "WARNING: Loading input data. 
Since --iml option isn't set, input model layout for " "layer \"" - << inputInfo.get_any_name() << "\" is infered from shape: " << toString(shape) + << inputInfo.get_any_name() << "\" is infered from shape: " << shape.to_shape() << " rank (" << shape.size() << ") as " << inputLayout.to_string() << std::endl; } - inputDescriptors.emplace(inputInfo.get_any_name(), TensorDescriptor{precision, shape, inputLayout}); + inputDescriptors.emplace(inputInfo.get_any_name(), TensorDescriptor{precision, shape, + dataShape, inputLayout}); std::cout << "Load input #" << inputInd << " from " << inputFiles[inputInd] << " as " << precision << " " << inputLayout.to_string() << " " << shape << std::endl; const ov::Tensor tensor = !FLAGS_img_as_bin - ? loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format) - : loadInput(precision, shape, inputLayout, inputFiles[inputInd], FLAGS_color_format, + ? loadInput(precision, dataShape, inputLayout, inputFiles[inputInd], FLAGS_color_format) + : loadInput(precision, dataShape, inputLayout, inputFiles[inputInd], FLAGS_color_format, inputBinPrecisionForOneInfer[numberOfTestCase][inputInd]); std::ostringstream ostr; ostr << netFileName << "_input_" << inputInd << "_case_" << numberOfTestCase << ".blob"; From c902a0144a45aff068c15726fb27773feaa1f2ea Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Thu, 31 Oct 2024 19:33:21 +0900 Subject: [PATCH 081/120] [GPU] update onednn (#27349) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 062d247e7853b1..1722066ad4c0f1 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 062d247e7853b14ed287a130cc2dc221187430aa +Subproject commit 1722066ad4c0f15495f2d0fcbe9deb2bfd188c36 From a0b73e0a7a69873582301a460365792183101ab3 Mon Sep 17 00:00:00 2001 From: Przemyslaw Wysocki Date: Thu, 31 Oct 2024 12:07:18 +0100 Subject: [PATCH 082/120] [PyOV] Extend Python API with `Squeeze-15` (#27281) ### Details: - This PR includes commits from https://github.com/openvinotoolkit/openvino/pull/26995 ### Tickets: - CVS-154024 --------- Signed-off-by: p-wysocki Co-authored-by: Michal Barnas Co-authored-by: Roman Kazantsev Co-authored-by: Michal Lukaszewski --- .../src/openvino/runtime/opset15/__init__.py | 2 +- .../src/openvino/runtime/opset15/ops.py | 39 ++++++++++++++ .../python/tests/test_graph/test_ops_fused.py | 11 ---- .../python/tests/test_graph/test_squeeze.py | 51 +++++++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) create mode 100644 src/bindings/python/tests/test_graph/test_squeeze.py diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index 6cc9c24827a85f..c4dd48d9087ae1 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -188,7 +188,7 @@ from openvino.runtime.opset1.ops import split from openvino.runtime.opset1.ops import sqrt from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze +from openvino.runtime.opset15.ops import squeeze from openvino.runtime.opset15.ops import stft from openvino.runtime.opset1.ops import strided_slice from openvino.runtime.opset1.ops import subtract diff --git 
a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/runtime/opset15/ops.py index b3a131602af703..93aacb29572340 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset15/ops.py @@ -348,3 +348,42 @@ def search_sorted( inputs = as_nodes(sorted_sequence, values, name=name) attributes = {"right_mode": right_mode} return _get_node_factory_opset15().create("SearchSorted", inputs, attributes) + + +@nameable_op +def squeeze( + data: NodeInput, + axes: Optional[NodeInput] = None, + allow_axis_skip: bool = False, + name: Optional[str] = None, +) -> Node: + """Perform squeeze operation on input tensor. + + :param data: The node with data tensor. + :param axes: Optional list of integers, indicating the dimensions to squeeze. + Negative indices are supported. One of: input node or array. + :param allow_axis_skip: If true, shape inference results in a dynamic rank, when + selected axis has value 1 in its dynamic range. Used only if axes input + is given. Defaults to false. + :param name: Optional new name for output node. + :return: The new node performing a squeeze operation on input tensor. + + Remove single-dimensional entries from the shape of a tensor. + Takes an optional parameter `axes` with a list of axes to squeeze. + If `axes` is not provided, all the single dimensions will be removed from the shape. + + For example: + + Inputs: tensor with shape [1, 2, 1, 3, 1, 1], axes=[2, 4] + + Result: tensor with shape [1, 2, 3, 1] + """ + if axes is None: + inputs = as_nodes(data, name=name) + else: + inputs = as_nodes(data, axes, name=name) + return _get_node_factory_opset15().create( + "Squeeze", + inputs, + {"allow_axis_skip": allow_axis_skip} + ) diff --git a/src/bindings/python/tests/test_graph/test_ops_fused.py b/src/bindings/python/tests/test_graph/test_ops_fused.py index bdbf4a1a9f1f9c..2bab743bfd7afb 100644 --- a/src/bindings/python/tests/test_graph/test_ops_fused.py +++ b/src/bindings/python/tests/test_graph/test_ops_fused.py @@ -110,17 +110,6 @@ def test_clamp_operator(): assert list(model.get_output_shape(0)) == [2, 2] -def test_squeeze_operator(): - data_shape = [1, 2, 1, 3, 1, 1] - parameter_data = ov.parameter(data_shape, name="Data", dtype=np.float32) - axes = [2, 4] - model = ov.squeeze(parameter_data, axes) - - assert model.get_type_name() == "Squeeze" - assert model.get_output_size() == 1 - assert list(model.get_output_shape(0)) == [1, 2, 3, 1] - - def test_squared_difference_operator(): x1_shape = [1, 2, 3, 4] x2_shape = [2, 3, 4] diff --git a/src/bindings/python/tests/test_graph/test_squeeze.py b/src/bindings/python/tests/test_graph/test_squeeze.py new file mode 100644 index 00000000000000..869d84a0414841 --- /dev/null +++ b/src/bindings/python/tests/test_graph/test_squeeze.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino.runtime.opset1 as ov_opset1 +import openvino.runtime.opset15 as ov_opset15 +import numpy as np +import pytest + + +def test_squeeze_v1_operator(): + data_shape = [1, 2, 1, 3, 1, 1] + parameter_data = ov_opset1.parameter(data_shape, name="Data", dtype=np.float32) + axes = [2, 4] + model = ov_opset1.squeeze(parameter_data, axes) + + assert model.get_type_name() == "Squeeze" + assert model.get_output_size() == 1 + assert list(model.get_output_shape(0)) == [1, 2, 3, 1] + + +@pytest.mark.parametrize(("input_shape", "axes", "allow_axis_skip", "expected_shape"), [ + ((1, 
2, 1, 3, 1, 1), [1, 2, 4], True, [1, 2, 3, 1]),
+    ((1, 2, 1, 3, 1, 1), [1, 2, 4], False, [1, 2, 3, 1]),
+    ((2, -1, 3), [1], False, [2, 3])
+])
+def test_squeeze_v15_operator(input_shape, axes, allow_axis_skip, expected_shape):
+    parameter_data = ov_opset15.parameter(input_shape, name="Data", dtype=np.float32)
+    model = ov_opset15.squeeze(parameter_data, axes, allow_axis_skip, name="Squeeze")
+
+    assert model.get_type_name() == "Squeeze"
+    assert model.get_output_size() == 1
+    assert list(model.get_output_shape(0)) == expected_shape
+
+
+def test_squeeze_v15_dynamic_rank_output():
+    parameter_data = ov_opset15.parameter((2, -1, 3), name="Data", dtype=np.float32)
+    model = ov_opset15.squeeze(parameter_data, [1], True, name="Squeeze")
+
+    assert model.get_type_name() == "Squeeze"
+    assert model.get_output_size() == 1
+    assert model.get_output_partial_shape(0).to_string() == "[...]"
+
+
+def test_squeeze_v15_axes_not_given():
+    parameter_data = ov_opset15.parameter((1, 3, 1, 1, 3, 5), name="Data", dtype=np.float32)
+    model = ov_opset15.squeeze(data=parameter_data, name="Squeeze")
+
+    assert model.get_type_name() == "Squeeze"
+    assert model.get_output_size() == 1
+    assert list(model.get_output_shape(0)) == [3, 3, 5]

From b9a94c3f8b83deb41ba2e748150d70157784f96b Mon Sep 17 00:00:00 2001
From: Ivan Tikhonov
Date: Thu, 31 Oct 2024 15:08:57 +0400
Subject: [PATCH 083/120] [ONNX] Update DequantizeLinear21 converter (#27351)

### Details:
Aligned the converter with the canonical form of the dequantization subgraph. The Reshape op has been moved up, right after the Constant; it will be const-folded in MOC, which is fine, since Reshape constant folding doesn't copy the constant's data, it just copies a pointer. ConvertLike ops were replaced with Convert. This may be a somewhat rough change: we probably need a check here that the scale is a constant, and only in that case use Convert instead of ConvertLike; if the scale is not a constant, maybe we should keep ConvertLike.
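For illustration, here is a minimal sketch of the canonical ordering described above, written with OpenVINO's C++ op API. The shapes, block size and quantization values are hypothetical placeholders, not taken from the converter's actual code:
```cpp
// Canonical dequantization pattern: Reshape sits directly on the quantized
// constant (so MOC can fold it by re-pointing at the same data), and plain
// Convert lifts the weights and zero point to the scale's element type
// before Subtract and Multiply.
#include <openvino/openvino.hpp>
#include <openvino/op/ops.hpp>

std::shared_ptr<ov::Model> make_dequantize_pattern() {
    using namespace ov;
    // Hypothetical u8 weights, reshaped into per-block form first.
    auto weights = op::v0::Constant::create(element::u8, Shape{64, 16},
                                            std::vector<uint8_t>(64 * 16, 128));
    auto target_shape = op::v0::Constant::create(element::i64, Shape{3}, {4, 16, 16});
    auto reshaped = std::make_shared<op::v1::Reshape>(weights, target_shape, false);

    // Convert (not ConvertLike) to the scale's type -- valid when the scale's
    // element type is known up front, i.e. the scale is a constant.
    auto as_f32 = std::make_shared<op::v0::Convert>(reshaped, element::f32);

    auto zero_point = op::v0::Constant::create(element::u8, Shape{4, 1, 16},
                                               std::vector<uint8_t>(4 * 16, 128));
    auto zp_f32 = std::make_shared<op::v0::Convert>(zero_point, element::f32);
    auto shifted = std::make_shared<op::v1::Subtract>(as_f32, zp_f32);

    auto scale = op::v0::Constant::create(element::f32, Shape{4, 1, 16},
                                          std::vector<float>(4 * 16, 0.1f));
    auto dequantized = std::make_shared<op::v1::Multiply>(shifted, scale);

    return std::make_shared<Model>(OutputVector{dequantized}, ParameterVector{});
}
```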
### Tickets: - *https://jira.devtools.intel.com/browse/CVS-156329* --- .../frontend/src/op/dequantize_linear.cpp | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp index b09bc73467bc10..d7b5214f3e53f4 100644 --- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp +++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp @@ -221,19 +221,8 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { FRONT_END_GENERAL_CHECK(src_x.get_partial_shape().is_static(), "DequantizeLinear cannot operate with dynamic shapes of input X"); - const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); - - if (inputs.size() > 2) { - zp = inputs[2]; - if (zp.get_element_type() != scale.get_element_type()) { - zp = std::make_shared(zp, scale); - } - zp = std::make_shared(zp, unsqueezed_axes); - } - const auto axis = node.get_attribute_value("axis", 1); const auto block_size = static_cast(node.get_attribute_value("block_size", 0)); - const auto scale_type = scale.get_element_type(); FRONT_END_GENERAL_CHECK(axis == 0, "Axis != 0 isn't supported"); FRONT_END_GENERAL_CHECK(block_size > 0, "block_size must be greater than zero"); @@ -241,16 +230,30 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) { src_x.get_shape()[0] % block_size == 0, "DequantizeLinear doesn't support case when first dimension of X cannot be divided by block_size"); - const auto& x = src_x.get_element_type() == scale_type ? src_x : std::make_shared(src_x, scale); + ov::Output broadcastable_x = op::util::reshape( + src_x, + Shape{static_cast(src_x.get_shape()[0]) / block_size, block_size, src_x.get_shape()[1]}); + + const auto& unsqueezed_axes = std::make_shared(ov::element::i64, Shape{1}, std::vector{1}); + + const auto scale_type = scale.get_element_type(); + if (inputs.size() > 2) { + zp = inputs[2]; + if (zp.get_element_type() != scale.get_element_type()) { + zp = std::make_shared(zp, scale_type); + } + zp = std::make_shared(zp, unsqueezed_axes); + } + + const auto& x = src_x.get_element_type() == scale_type ? 
broadcastable_x
+                                                           : std::make_shared(broadcastable_x, scale_type);
 
     // For further broadcasting scales and zp - reshape input to a shape [x.shape[0]/block_size, block_size, x.shape[1]]
-    ov::Output broadcastable_x =
-        op::util::reshape(x, Shape{static_cast(x.get_shape()[0]) / block_size, block_size, x.get_shape()[1]});
     // Adding additional dimension for broadcasting
     scale = std::make_shared(scale, unsqueezed_axes);
 
     if (zp.get_node_shared_ptr()) {
-        broadcastable_x = std::make_shared(broadcastable_x, zp);
+        broadcastable_x = std::make_shared(x, zp);
     }
 
     const auto& scaled_x = std::make_shared(broadcastable_x, scale);

From a488aec3812c8998028bab7e5996bb1c057f162e Mon Sep 17 00:00:00 2001
From: Roman Kazantsev
Date: Fri, 1 Nov 2024 09:20:33 +0400
Subject: [PATCH 084/120] [TF FE] Run string ops tests on ARM (#27367)

**Details:** Since openvino-tokenizers is built for ARM in the precommit, we are ready to enable the String ops tests

**Ticket:** TBD

Signed-off-by: Kazantsev, Roman
---
 .../tensorflow_tests/test_tf_LookupTableFind.py | 8 --------
 .../tensorflow_tests/test_tf_RaggedTensorToSparse.py | 6 ------
 .../tensorflow_tests/test_tf_RaggedTensorToTensor.py | 10 ----------
 .../tensorflow_tests/test_tf_StaticRegexReplace.py | 6 ------
 .../tensorflow_tests/test_tf_StringLower.py | 10 ----------
 .../tensorflow_tests/test_tf_StringSplitV2.py | 6 ------
 .../tensorflow_tests/test_tf_StringToHashBucketFast.py | 6 ------
 7 files changed, 52 deletions(-)

diff --git a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py
index bd1422f8719cea..97177a5adeec13 100644
--- a/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py
+++ b/tests/layer_tests/tensorflow_tests/test_tf_LookupTableFind.py
@@ -1,8 +1,6 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import platform
-
 import numpy as np
 import pytest
 import tensorflow as tf
@@ -91,12 +89,6 @@ def test_lookup_table_find(self, hash_table_type, keys_shape, params, ie_device,
         if ie_device == 'GPU' or run_in_jenkins():
             pytest.skip("operation extesion is not supported on GPU or "
                         "No layout format available for gather:LookupTableFind issue")
-        if params['keys_type'] == str:
-            if platform.system() in ('Darwin') or platform.machine() in ['arm', 'armv7l',
-                                                                         'aarch64',
-                                                                         'arm64',
-                                                                         'ARM64']:
-                pytest.xfail(reason='126314, 132699: Build tokenizers for ARM and MacOS')
         self._test(*self.create_lookup_table_find_net(hash_table_type=hash_table_type,
                                                       keys_shape=keys_shape, **params),
                    ie_device, precision, ir_version, temp_dir=temp_dir,
diff --git a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py
index 621b8430f64fdc..f0832676f0f982 100644
--- a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py
+++ b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToSparse.py
@@ -1,8 +1,6 @@
 # Copyright (C) 2022-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import platform
-
 import numpy as np
 import pytest
 import tensorflow as tf
@@ -55,10 +53,6 @@ def create_ragged_tensor_to_sparse_net(self, rt_dense_values_shape, rt_dense_val
     ])
     @pytest.mark.precommit
     @pytest.mark.nightly
-    @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l',
-                                                                                                     'aarch64',
-                                                                                                     'arm64', 'ARM64'],
-                       reason='126314, 132699: Build tokenizers for ARM and MacOS')
     def test_ragged_tensor_to_sparse(self, rt_dense_values_shape,
rt_dense_values_type, rt_nested_splits, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU' or run_in_jenkins(): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py index 39afde0a2c6b08..0267874eb98b35 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_RaggedTensorToTensor.py @@ -1,8 +1,6 @@ # Copyright (C) 2022-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -52,10 +50,6 @@ def create_ragged_tensor_to_tensor_net(self, shape_type, shape_value, values_sha @pytest.mark.parametrize('row_partition_types', [["ROW_SPLITS"]]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_ragged_tensor_to_tensor(self, shape_type, shape_value, values_shape, values_type, default_value, row_partition_tensors, row_partition_types, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): @@ -110,10 +104,6 @@ def create_ragged_tensor_to_tensor_net(self, shape_type, shape_value, values_sha @pytest.mark.parametrize('row_partition_types', [["FIRST_DIM_SIZE", "VALUE_ROWIDS"]]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_ragged_tensor_to_tensor(self, shape_type, shape_value, values_shape, values_type, default_value, row_partition_tensors, row_partition_types, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py b/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py index ef5e135537eb84..a3fa91ad0976f5 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StaticRegexReplace.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -41,10 +39,6 @@ def create_static_regex_replace_net(self, input_shape, pattern, rewrite, replace @pytest.mark.parametrize('replace_global', [None, True, False]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_static_regex_replace(self, input_shape, pattern, rewrite, replace_global, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py index f4c9e7260d7afb..5787c0b8318801 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py @@ -3,7 +3,6 @@ import numpy as np import os -import platform import pytest import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest @@ -46,10 +45,6 @@ def create_string_lower_net(self, input_shape, encoding, 
strings_dictionary): ['第一句話在這裡', '第二句話在這裡', '第三句話在這裡']]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_lower(self, input_shape, encoding, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if ie_device == 'GPU' or run_in_jenkins(): @@ -78,10 +73,6 @@ def create_string_lower_model(self, output_dir): @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_lower_with_ovc(self, ie_device, temp_dir, precision): if ie_device == 'GPU' or run_in_jenkins(): pytest.skip("operation extension is not supported on GPU") @@ -90,7 +81,6 @@ def test_string_lower_with_ovc(self, ie_device, temp_dir, precision): return_code, _, _ = generate_ir_ovc(input_model_path, {'output_model': output_model_path}) assert return_code == 0, "OVC tool is failed for conversion model {}".format(input_model_path) - import openvino_tokenizers import openvino as ov core = ov.Core() compiled_model = core.compile_model(output_model_path, ie_device) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py b/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py index 3745d07926bc43..84d7c269ce598f 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringSplitV2.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -42,10 +40,6 @@ def create_string_split_v2_net(self, input_shape, sep, maxsplit): @pytest.mark.parametrize('maxsplit', [None, -1, 5, 10]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='126314, 132699: Build tokenizers for ARM and MacOS') def test_string_split_v2(self, input_shape, sep, maxsplit, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py b/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py index 08812fe7b46228..5fefb8117f3dcf 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringToHashBucketFast.py @@ -1,8 +1,6 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import platform - import numpy as np import pytest import tensorflow as tf @@ -45,10 +43,6 @@ def create_string_to_hash_bucket_fast_net(self, input_shape, strings_dictionary, ['', ' ', '12345 ']]) @pytest.mark.precommit @pytest.mark.nightly - @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', - 'aarch64', - 'arm64', 'ARM64'], - reason='Ticket - 126314, 132699') def test_string_to_hash_bucket_fast(self, input_shape, num_buckets, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): From 5833781ddbc476d77cf5593f1f8b34758988b9a8 Mon Sep 17 00:00:00 2001 From: Georgy Krivoruchko Date: Fri, 1 Nov 2024 12:03:18 +0400 Subject: [PATCH 085/120] [ONNX] Disabled 
constant folding for Subtract branch of DequantizeLinear-21 (#27359)

### Details:
 - Disabled constant folding for Subtract branch of DequantizeLinear-21

### Tickets:
 - 156329
---
 src/frontends/onnx/frontend/src/op/dequantize_linear.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp
index d7b5214f3e53f4..47fcc7af60bf61 100644
--- a/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp
+++ b/src/frontends/onnx/frontend/src/op/dequantize_linear.cpp
@@ -18,6 +18,7 @@
 #include "openvino/op/subtract.hpp"
 #include "openvino/op/transpose.hpp"
 #include "openvino/op/unsqueeze.hpp"
+#include "transformations/rt_info/disable_constant_folding.hpp"
 #include "utils/common.hpp"
 #include "utils/reshape.hpp"
 using namespace ov::op;
@@ -241,6 +242,7 @@ ov::OutputVector dequantize_linear(const ov::frontend::onnx::Node& node) {
         zp = inputs[2];
         if (zp.get_element_type() != scale.get_element_type()) {
             zp = std::make_shared<v0::Convert>(zp, scale_type);
+            disable_constant_folding(zp.get_node_shared_ptr());
         }
         zp = std::make_shared<v0::Unsqueeze>(zp, unsqueezed_axes);
     }
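For context on the attribute used above: ov::disable_constant_folding() attaches run-time info that tells the ConstantFolding pass to leave the marked node in place. A minimal sketch of the pattern follows; it is illustrative only and not part of the patch, and the u8 zero point, the f16 target precision, and the helper name are assumptions:

```cpp
// Hedged sketch, not from the patch: keep a zero-point Convert in the graph
// so constant folding does not materialize it in the scale precision.
#include <memory>

#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "transformations/rt_info/disable_constant_folding.hpp"

std::shared_ptr<ov::Node> make_unfoldable_zero_point() {
    // A low-precision zero point, as DequantizeLinear-21 provides it.
    auto zp_u8 = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {128});
    // Convert it so the Subtract against the dequantized data type-checks...
    auto zp = std::make_shared<ov::op::v0::Convert>(zp_u8, ov::element::f16);
    // ...but forbid folding: a folded Convert would bake the zero point into
    // a full-precision constant and hide the compressed-weight pattern.
    ov::disable_constant_folding(zp);
    return zp;
}
```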
From af389b482381b445a3d7bb6ed6c7de3a5320da87 Mon Sep 17 00:00:00 2001
From: Evgenya Nugmanova
Date: Fri, 1 Nov 2024 12:14:46 +0400
Subject: [PATCH 086/120] Broadcast: symbol propagation (#27357)

### Details:
 - *Improves symbol propagation in LLMs and allows for better ShapeOf optimization*

Signed-off-by: Evgeniia Nugmanova
---
 src/core/include/openvino/op/util/broadcast_base.hpp | 1 +
 src/core/src/op/util/broadcast_base.cpp              | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/src/core/include/openvino/op/util/broadcast_base.hpp b/src/core/include/openvino/op/util/broadcast_base.hpp
index 2e500eb611c04c..6300559ac8cf00 100644
--- a/src/core/include/openvino/op/util/broadcast_base.hpp
+++ b/src/core/include/openvino/op/util/broadcast_base.hpp
@@ -63,6 +63,7 @@ class OPENVINO_API BroadcastBase : public Op {
     bool evaluate_lower(TensorVector& outputs) const override;
     bool evaluate_upper(TensorVector& outputs) const override;
+    bool evaluate_symbol(ov::TensorSymbolVector& output_symbols) const override;
 
     PartialShape get_result_shape_pdpd(const PartialShape& arg0_shape,
                                        const PartialShape& target_shape,
diff --git a/src/core/src/op/util/broadcast_base.cpp b/src/core/src/op/util/broadcast_base.cpp
index 59154e45e2b37a..c2c838afeb38bd 100644
--- a/src/core/src/op/util/broadcast_base.cpp
+++ b/src/core/src/op/util/broadcast_base.cpp
@@ -471,3 +471,10 @@ bool ov::op::util::BroadcastBase::evaluate_upper(ov::TensorVector& output_values
         return false;
     return default_upper_bound_evaluator(this, output_values);
 }
+
+bool ov::op::util::BroadcastBase::evaluate_symbol(ov::TensorSymbolVector& output_symbols) const {
+    if (!input_value(1).get_tensor().has_and_set_bound() ||
+        (get_input_size() > 2 && !input_value(2).get_tensor().has_and_set_bound()))
+        return false;
+    return default_symbol_evaluator(this, {0}, output_symbols);
+}

From caa1e6af13139692a34cf37787c9c79f949bcaaa Mon Sep 17 00:00:00 2001
From: Bogdan Pereanu
Date: Fri, 1 Nov 2024 10:42:50 +0200
Subject: [PATCH 087/120] [NPU] Create compiler adapter class (#27006)

### Details:
 - *Create a new CompilerAdapter interface that hides different implementations of CIP and CID*
 - *iCompiler remains an interface only for CIP. This keeps CIP (developed in another repository) decoupled from L0*
 - we still use NetworkMetadata in the plugin flow; whether it is still needed or can be removed is to be decided later
 - *Graph object is created by compiler_adapter*
 - *The backend no longer creates/initializes the graph*
 - *Moving common objects for backend and compiler_adapter to utils/zero/*
 - *Destroy blob on the import path after we load the weights into the NPU memory*
 - *Create a new property, NPU_DEFER_WEIGHTS_LOAD, to postpone weights loading until the creation of the first inference request; by default, loading is performed right after the model is compiled*

A short description of the new design:

![Screenshot 2024-10-30 151129](https://github.com/user-attachments/assets/89f86c36-f3e8-4906-8394-7cd0ae5617a2)

### Tickets:
 - *CVS-153081*

---------

Signed-off-by: Bogdan Pereanu
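For illustration, a minimal sketch of how the new property could be exercised. This is not part of the patch: the include path is an assumption, the property is defined as ov::Property<bool> in the diff below, and since DEFER_WEIGHTS_LOAD is registered as a non-public option its availability from application code is not guaranteed:

```cpp
// Hedged sketch, not from the patch: compile first, load weights into NPU
// memory only when the first inference request is created.
#include "intel_npu/npu_private_properties.hpp"  // assumed include path
#include "openvino/runtime/core.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder model path

    // With defer_weights_load(true), compile_model() returns once the level
    // zero graph handle exists; the weights are loaded later.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::intel_npu::defer_weights_load(true));

    auto request = compiled.create_infer_request();  // weights loaded here
    return 0;
}
```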
---
 src/plugins/intel_npu/README.md | 2 +-
 src/plugins/intel_npu/cmake/features.cmake | 25 +-
 src/plugins/intel_npu/src/CMakeLists.txt | 13 +-
 .../al/include/intel_npu/config/runtime.hpp | 32 +
 .../src/al/include/intel_npu/icompiler.hpp | 162 +--
 .../al/include/intel_npu/network_metadata.hpp | 127 ++
 .../intel_npu/npu_private_properties.hpp | 7 +
 .../intel_npu/src/al/src/config/runtime.cpp | 1 +
 .../intel_npu/src/backend/CMakeLists.txt | 29 -
 .../src/backend/include/zero_backend.hpp | 9 +-
 .../src/backend/include/zero_device.hpp | 10 +-
 .../src/backend/include/zero_executor.hpp | 86 --
 .../src/backend/include/zero_host_tensor.hpp | 2 +-
 .../backend/include/zero_infer_request.hpp | 9 +-
 .../src/backend/include/zero_memory.hpp | 2 +-
 .../src/backend/include/zero_pipeline.hpp | 13 +-
 .../src/backend/include/zero_profiling.hpp | 2 +-
 .../backend/include/zero_remote_tensor.hpp | 2 +-
 .../src/backend/src/zero_backend.cpp | 28 +-
 .../intel_npu/src/backend/src/zero_device.cpp | 36 +-
 .../src/backend/src/zero_executor.cpp | 187 ---
 .../src/backend/src/zero_infer_request.cpp | 74 +-
 .../src/backend/src/zero_pipeline.cpp | 62 +-
 .../intel_npu/src/common/CMakeLists.txt | 2 +-
 .../intel_npu/common/icompiled_model.hpp | 11 +-
 .../include/intel_npu/common/igraph.hpp | 103 ++
 .../common/include/intel_npu/common/npu.hpp | 16 +-
 .../intel_npu/common/sync_infer_request.hpp | 3 +-
 .../src/common/src/sync_infer_request.cpp | 2 +-
 .../include/driver_compiler_adapter.hpp | 50 -
 .../include/zero_compiler_in_driver.hpp | 201 ---
 .../compiler/src/driver_compiler_adapter.cpp | 130 --
 .../compiler/src/zero_compiler_in_driver.cpp | 1081 -----------------
 .../CMakeLists.txt | 7 +-
 .../include/custom_stream_buffer.hpp | 4 +-
 .../include/driver_compiler_adapter.hpp | 64 +
 .../compiler_adapter/include/driver_graph.hpp | 50 +
 .../include/ir_serializer.hpp} | 4 +-
 .../include/plugin_compiler_adapter.hpp | 37 +
 .../compiler_adapter/include/plugin_graph.hpp | 49 +
 .../include/ze_graph_ext_wrappers.hpp | 159 +++
 .../ze_graph_ext_wrappers_interface.hpp | 42 +
 .../src/driver_compiler_adapter.cpp | 606 +++++++++
 .../src/compiler_adapter/src/driver_graph.cpp | 164 +++
 .../src/ir_serializer.cpp} | 8 +-
 .../src/plugin_compiler_adapter.cpp | 160 +++
 .../src/compiler_adapter/src/plugin_graph.cpp | 132 ++
 .../src/precomp.hpp | 0
 .../src/ze_graph_ext_wrappers.cpp | 568 +++
 .../intel_npu/src/plugin/CMakeLists.txt | 30 +-
 .../src/plugin/include/compiled_model.hpp | 37 +-
 .../intel_npu/src/plugin/include/compiler.hpp | 20 -
 .../intel_npu/src/plugin/include/plugin.hpp | 4 +-
 .../intel_npu/src/plugin/src/backends.cpp | 7 +-
 .../src/plugin/src/compiled_model.cpp | 138 +--
 .../intel_npu/src/plugin/src/compiler.cpp | 101 --
 .../intel_npu/src/plugin/src/plugin.cpp | 82 +-
 .../intel_npu/utils/zero}/zero_init.hpp | 6 +-
 .../intel_npu/utils/zero}/zero_types.hpp | 2 -
 .../intel_npu/utils/zero/zero_utils.hpp | 31 +-
 .../intel_npu/utils/zero}/zero_wrappers.hpp | 23 +-
 .../intel_npu/src/utils/src/CMakeLists.txt | 3 +-
 .../src/utils/src/zero/CMakeLists.txt | 34 +-
 .../src => utils/src/zero}/zero_init.cpp | 9 +-
 .../src => utils/src/zero}/zero_wrappers.cpp | 28 +-
 .../intel_npu/tests/functional/CMakeLists.txt | 10 +-
 .../custom_stream.cpp | 5 +-
 .../ov_infer_request/compile_and_infer.cpp | 4 +-
 .../functional/behavior/work_with_devices.hpp | 2 +-
 .../internal/overload/compile_and_infer.hpp | 8 +-
 .../overload/compiled_model/property.cpp | 2 +-
 .../behavior/compiled_model/properties.cpp | 2 +-
 .../intel_npu/thirdparty/CMakeLists.txt | 3 +-
 73 files changed, 2620 insertions(+), 2544 deletions(-)
 create mode 100644 src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp
 delete mode 100644 src/plugins/intel_npu/src/backend/include/zero_executor.hpp
 delete mode 100644 src/plugins/intel_npu/src/backend/src/zero_executor.cpp
 create mode 100644 src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
 delete mode 100644 src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp
 delete mode 100644 src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp
 delete mode 100644 src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp
 delete mode 100644 src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp
 rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/CMakeLists.txt (85%)
 rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/include/custom_stream_buffer.hpp (95%)
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
 rename src/plugins/intel_npu/src/{compiler/include/graph_transformations.hpp => compiler_adapter/include/ir_serializer.hpp} (93%)
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers_interface.hpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
 rename src/plugins/intel_npu/src/{compiler/src/graph_transformations.cpp => compiler_adapter/src/ir_serializer.cpp} (94%)
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
 rename src/plugins/intel_npu/src/{compiler => compiler_adapter}/src/precomp.hpp (100%)
 create mode 100644 src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp
 delete mode 100644 src/plugins/intel_npu/src/plugin/include/compiler.hpp
 delete mode 100644 src/plugins/intel_npu/src/plugin/src/compiler.cpp
 rename src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_init.hpp (95%)
 rename src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_types.hpp (99%)
 rename
src/plugins/intel_npu/src/{backend/include => utils/include/intel_npu/utils/zero}/zero_wrappers.hpp (90%) rename src/plugins/intel_npu/src/{backend/src => utils/src/zero}/zero_init.cpp (98%) rename src/plugins/intel_npu/src/{backend/src => utils/src/zero}/zero_wrappers.cpp (91%) diff --git a/src/plugins/intel_npu/README.md b/src/plugins/intel_npu/README.md index b7508c68704e32..980faa71a15937 100644 --- a/src/plugins/intel_npu/README.md +++ b/src/plugins/intel_npu/README.md @@ -78,7 +78,7 @@ There is currently no support for multiple devices, which means only one level-z ### Inference pipeline -The result of the model compilation is represented through a NetworkDescription. This model description is passed by the plugin to the driver to create a level zero graph instance and obtain a graph handle that can later be used to execute multiple inferences in parallel for the same model. Since the same model instance is shared across all subsequent inference objects, this initialization step is performed by default right after the model is compiled and it can be postponed until the creation of the first inference request through the use of an environment variable: "IE_NPU_CREATE_EXECUTOR" (IE_NPU_CREATE_EXECUTOR=0 to postpone the initialization). +The result of the model compilation is represented through an IGraph object, which contains a valid level zero graph handle that can later be used to execute multiple inferences in parallel for the same model. By default, weights are loaded into the NPU memory right after the model is compiled, but this step can be postponed until the creation of the first inference request through the use of an internal NPU property: "NPU_DEFER_WEIGHTS_LOAD". Users can create one or more inference requests for a compiled model using OpenVINO API: diff --git a/src/plugins/intel_npu/cmake/features.cmake b/src/plugins/intel_npu/cmake/features.cmake index 0dde0f9d67f6e5..7d34c52c6d1292 100644 --- a/src/plugins/intel_npu/cmake/features.cmake +++ b/src/plugins/intel_npu/cmake/features.cmake @@ -4,29 +4,10 @@ ov_option(ENABLE_MLIR_COMPILER "Enable compilation of npu_mlir_compiler libraries" ON) -ov_option(ENABLE_NPU_RUNTIME_COMMON "Enable compilation of npu runtime common libraries" ON) +ov_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON) -# if ENABLE_ZEROAPI_BACKEND=ON, it adds the ze_loader dependency for driver compiler -ov_dependent_option(ENABLE_NPU_PLUGIN_ENGINE "Enable compilation of NPU plugin engine" ON "ENABLE_NPU_RUNTIME_COMMON" OFF) - -ov_dependent_option(ENABLE_ZEROAPI_BACKEND "Enable zero-api as a plugin backend" ON "ENABLE_NPU_RUNTIME_COMMON;ENABLE_NPU_PLUGIN_ENGINE" OFF) - -ov_dependent_option(ENABLE_DRIVER_COMPILER_ADAPTER "Enable NPU Compiler inside driver" ON "ENABLE_ZEROAPI_BACKEND" OFF) - -if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_TESTS) - message(FATAL_ERROR "Tests depends on npu plugin engine and npu runtime common libraries!") -endif() - -if((NOT ENABLE_NPU_PLUGIN_ENGINE OR NOT ENABLE_NPU_RUNTIME_COMMON) AND ENABLE_ZEROAPI_BACKEND) - message(FATAL_ERROR "Zero backend depends on npu plugin engine and npu common libraries!") -endif() - -if(NOT ENABLE_ZEROAPI_BACKEND AND ENABLE_DRIVER_COMPILER_ADAPTER) - message(FATAL_ERROR "Compiler adapter depends on zero backend to use same context!") -endif() - -if(NOT BUILD_SHARED_LIBS AND NOT ENABLE_MLIR_COMPILER AND NOT ENABLE_DRIVER_COMPILER_ADAPTER) - message(FATAL_ERROR "No compiler found for static build!") +if(NOT ENABLE_NPU_PLUGIN_ENGINE AND 
ENABLE_TESTS) + message(FATAL_ERROR "Tests depends on npu plugin engine!") endif() ov_dependent_option(ENABLE_IMD_BACKEND "Enable InferenceManagerDemo based NPU AL backend" OFF "NOT WIN32;NOT CMAKE_CROSSCOMPILING" OFF) diff --git a/src/plugins/intel_npu/src/CMakeLists.txt b/src/plugins/intel_npu/src/CMakeLists.txt index 5530eb1f3e59e5..f5d1fd5b41226c 100644 --- a/src/plugins/intel_npu/src/CMakeLists.txt +++ b/src/plugins/intel_npu/src/CMakeLists.txt @@ -9,18 +9,9 @@ add_subdirectory(utils) add_subdirectory(al) -if (ENABLE_NPU_RUNTIME_COMMON) +if (ENABLE_NPU_PLUGIN_ENGINE) add_subdirectory(common) -endif() - -if(ENABLE_DRIVER_COMPILER_ADAPTER AND ENABLE_ZEROAPI_BACKEND) - add_subdirectory(compiler) -endif() - -if(ENABLE_ZEROAPI_BACKEND) + add_subdirectory(compiler_adapter) add_subdirectory(backend) -endif() - -if (ENABLE_NPU_PLUGIN_ENGINE) add_subdirectory(plugin) endif() diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index d52c25f6a3e6a5..510ab7fc43b0c8 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -131,6 +131,38 @@ struct CREATE_EXECUTOR final : OptionBase { } }; +// +// DEFER_WEIGHTS_LOAD +// + +struct DEFER_WEIGHTS_LOAD final : OptionBase { + static std::string_view key() { + return ov::intel_npu::defer_weights_load.name(); + } + + static int64_t defaultValue() { + return false; + } + + static constexpr std::string_view getTypeName() { + return "bool"; + } + +#ifdef NPU_PLUGIN_DEVELOPER_BUILD + static std::string_view envVar() { + return "OV_NPU_DEFER_WEIGHTS_LOAD"; + } +#endif + + static bool isPublic() { + return false; + } + + static OptionMode mode() { + return OptionMode::RunTime; + } +}; + // // NUM_STREAMS // diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp index e0a02f12aa2e17..53696396603d9a 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/icompiler.hpp @@ -6,128 +6,12 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include - #include "intel_npu/config/config.hpp" -#include "openvino/core/partial_shape.hpp" -#include "openvino/core/type/element_type.hpp" -#include "openvino/runtime/common.hpp" +#include "intel_npu/network_metadata.hpp" #include "openvino/runtime/profiling_info.hpp" namespace intel_npu { -/** - * @brief A helper structure used for storing metadata corresponding to one input/output entry. - */ -struct IODescriptor { - /** - * @brief The name of the input/output assigned by the compiler. - * @details This value may differ from other name attributes: - * - The compiler could have created additional inputs/outputs (e.g. for representing states). These are not - * found in the original IR model. - * - The compiler may append indices to names in the case where duplicate names are found. - * @note The prefixes introduced by the compiler in order to differentiate the special cases (e.g. states and shape - * tensors) were removed prior to initializing this field. - */ - std::string nameFromCompiler; - - ov::element::Type precision; - - ov::PartialShape shapeFromCompiler; - - /** - * @brief If set to "true", the current object describes a buffer which may be used for altering a state tensor. 
- * @details This flag is set if the compiler prefixed the name using a "read value" prefix. The state input and - * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isStateInput = false; - - /** - * @brief If set to "true", the current object describes a buffer which reflects the value of a state tensor. - * @details This flag is set if the compiler prefixed the name using an "assign" prefix. The state input and - * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isStateOutput = false; - - /** - * @brief If set to "true", the buffer of the tensor described here contains as value the shape of the referenced - * tensor. - * @details This flag is set if the compiler prefixed the name using a "shape" prefix. - * - * The referenced tensor bears the same name ("nameFromCompiler"), but its "isShapeTensor" value is set to - * "false". The two descriptors are also tied using the "relatedDescriptorIndex" attribute. - */ - bool isShapeTensor = false; - - /** - * @brief Points towards a related descriptor. - * @details The related descriptors are defined by (state input, state output) or (dynamic tensor, shape tensor) - * pairs. - */ - std::optional relatedDescriptorIndex; - - /** - * @brief The friendly name of the node extracted from the IR model. - * @details In some cases, this field is required for constructing a dummy model which uses the same input/output - * metadata as the original IR model. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the - * compiler). - */ - std::string nodeFriendlyName; - - /** - * @brief The names of the output tensors extracted from the IR model. - * @details In some cases, this field is required for constructing a dummy model which uses the same input/output - * metadata as the original IR model. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the - * compiler). - */ - std::unordered_set outputTensorNames; - - /** - * @brief The shape extracted from the IR model. - * @details The values may differ from the ones found in "shapeFromCompiler" if batching is to be handled by the - * plugin. - * - * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added - * by the compiler). - */ - std::optional shapeFromIRModel = std::nullopt; -}; - -struct NetworkMetadata final { - std::string name; - - std::vector inputs; - std::vector outputs; - std::vector profilingOutputs; - - size_t numStreams = 1; - - // Used primarily in the CID path to pass the level zero graph handle from compiler to the backend executor - void* graphHandle = nullptr; - - /** - * @brief Binds the (state input, state output) and (dynamic tensor, shape tensor) pairs using the - * "relatedDescriptorIndex" attribute. - * @details For state inputs, the "relatedDescriptorIndex" value is set to the index of the output which bears the - * same name. The reverse is also applied. - * - * For shape tensors, the lookup is performed in the same container (inputs or outputs). The value is once again set - * to the index of the entry which bears the same name. 
- */ - void bindRelatedDescriptors(); - -}; // namespace intel_npu - /** * @struct NetworkDescription * @brief The object returned by the compiler @@ -138,7 +22,6 @@ struct NetworkDescription final { NetworkDescription(std::vector&& compiledNetwork, NetworkMetadata&& metadata) : compiledNetwork(std::move(compiledNetwork)), metadata(std::move(metadata)) {} - NetworkDescription(NetworkMetadata&& metadata) : metadata(std::move(metadata)) {} // Force move semantics to prevent blob copies NetworkDescription(const NetworkDescription&) = delete; NetworkDescription(NetworkDescription&&) = default; @@ -151,32 +34,6 @@ struct NetworkDescription final { NetworkMetadata metadata; }; -/** - * @struct CompiledNetwork - * @brief Custom container for compiled network, used for export - * @var CompiledNetwork::data - * Pointer to the address of compiled network - * @var CompiledNetwork:size - * Size of the compiled network - * @var CompiledNetwork::ownedStorage - * Plugin owned compiled network storage that is required in case of a driver that - * doesn't support graph extension 1.7, as in this case plugin must create a copy of the compiled network. - * @note It's unsafe to store either data or size outside of the compiled network object as its destructor - * would release the owning container - */ - -struct CompiledNetwork { - const uint8_t* data; - size_t size; - CompiledNetwork(const uint8_t* data, size_t size, std::vector storage) - : data(data), - size(size), - ownedStorage(std::move(storage)) {} - -private: - std::vector ownedStorage; -}; - /** * @interface ICompiler * @brief An interface to be implemented by a concrete compiler to provide @@ -184,12 +41,6 @@ struct CompiledNetwork { */ class ICompiler : public std::enable_shared_from_this { public: - /** - * @brief Returns the maximum OpenVino opset version supported by the compiler - * @return opset version e.g. 
11 for opset11 - */ - virtual uint32_t getSupportedOpsetVersion() const = 0; - /** * @brief Transforms a network from the OpenVINO model representation to a format executable * by a NPU device @@ -216,8 +67,6 @@ class ICompiler : public std::enable_shared_from_this { * @param config a reference to NPUConfig containing plugin config options * Note: compilation options will be ignored, * since the network is already compiled - * @param netName a reference to the string describing network name - * to be used for creating network description * @return a shared pointer on an object implementing NetworkDescription interface */ virtual NetworkMetadata parse(const std::vector& network, const Config& config) const = 0; @@ -226,15 +75,6 @@ class ICompiler : public std::enable_shared_from_this { const std::vector& network, const Config& config) const = 0; - // Driver compiler can use this to release graphHandle, if we do not have executor - virtual void release([[maybe_unused]] std::shared_ptr networkDescription){}; - - virtual CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) { - return CompiledNetwork(networkDescription.compiledNetwork.data(), - networkDescription.compiledNetwork.size(), - networkDescription.compiledNetwork); - } - protected: virtual ~ICompiler() = default; }; diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp new file mode 100644 index 00000000000000..b7a78b3dfd43e1 --- /dev/null +++ b/src/plugins/intel_npu/src/al/include/intel_npu/network_metadata.hpp @@ -0,0 +1,127 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// Compiler Interface + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "intel_npu/config/config.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/common.hpp" + +namespace intel_npu { + +/** + * @brief A helper structure used for storing metadata corresponding to one input/output entry. + */ +struct IODescriptor { + /** + * @brief The name of the input/output assigned by the compiler. + * @details This value may differ from other name attributes: + * - The compiler could have created additional inputs/outputs (e.g. for representing states). These are not + * found in the original IR model. + * - The compiler may append indices to names in the case where duplicate names are found. + * @note The prefixes introduced by the compiler in order to differentiate the special cases (e.g. states and shape + * tensors) were removed prior to initializing this field. + */ + std::string nameFromCompiler; + + ov::element::Type precision; + + ov::PartialShape shapeFromCompiler; + + /** + * @brief If set to "true", the current object describes a buffer which may be used for altering a state tensor. + * @details This flag is set if the compiler prefixed the name using a "read value" prefix. The state input and + * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. + */ + bool isStateInput = false; + + /** + * @brief If set to "true", the current object describes a buffer which reflects the value of a state tensor. + * @details This flag is set if the compiler prefixed the name using an "assign" prefix. The state input and + * state output descriptors are also tied using the "relatedDescriptorIndex" attribute. 
+ */ + bool isStateOutput = false; + + /** + * @brief If set to "true", the buffer of the tensor described here contains as value the shape of the referenced + * tensor. + * @details This flag is set if the compiler prefixed the name using a "shape" prefix. + * + * The referenced tensor bears the same name ("nameFromCompiler"), but its "isShapeTensor" value is set to + * "false". The two descriptors are also tied using the "relatedDescriptorIndex" attribute. + */ + bool isShapeTensor = false; + + /** + * @brief Points towards a related descriptor. + * @details The related descriptors are defined by (state input, state output) or (dynamic tensor, shape tensor) + * pairs. + */ + std::optional relatedDescriptorIndex; + + /** + * @brief The friendly name of the node extracted from the IR model. + * @details In some cases, this field is required for constructing a dummy model which uses the same input/output + * metadata as the original IR model. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the + * compiler). + */ + std::string nodeFriendlyName; + + /** + * @brief The names of the output tensors extracted from the IR model. + * @details In some cases, this field is required for constructing a dummy model which uses the same input/output + * metadata as the original IR model. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added by the + * compiler). + */ + std::unordered_set outputTensorNames; + + /** + * @brief The shape extracted from the IR model. + * @details The values may differ from the ones found in "shapeFromCompiler" if batching is to be handled by the + * plugin. + * + * This field may be empty if the I/O entry is not found in the original IR model (i.e. the entry was added + * by the compiler). + */ + std::optional shapeFromIRModel = std::nullopt; +}; + +struct NetworkMetadata final { + std::string name; + + std::vector inputs; + std::vector outputs; + std::vector profilingOutputs; + + size_t numStreams = 1; + + /** + * @brief Binds the (state input, state output) and (dynamic tensor, shape tensor) pairs using the + * "relatedDescriptorIndex" attribute. + * @details For state inputs, the "relatedDescriptorIndex" value is set to the index of the output which bears the + * same name. The reverse is also applied. + * + * For shape tensors, the lookup is performed in the same container (inputs or outputs). The value is once again set + * to the index of the entry which bears the same name. 
+ */ + void bindRelatedDescriptors(); + +}; // namespace intel_npu + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp index 0c5a04ce0c0d83..d8fabee177b2b9 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp @@ -351,6 +351,13 @@ static constexpr ov::Property batch_mode{"NPU_BATCH_MODE"}; */ static constexpr ov::Property create_executor{"NPU_CREATE_EXECUTOR"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false + * This option allows to omit loading the weights until inference is created + */ +static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; + /** * @brief Read-only property to get the name of used backend */ diff --git a/src/plugins/intel_npu/src/al/src/config/runtime.cpp b/src/plugins/intel_npu/src/al/src/config/runtime.cpp index 10f9b4a7c7222b..759956b6f597df 100644 --- a/src/plugins/intel_npu/src/al/src/config/runtime.cpp +++ b/src/plugins/intel_npu/src/al/src/config/runtime.cpp @@ -21,6 +21,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/backend/CMakeLists.txt b/src/plugins/intel_npu/src/backend/CMakeLists.txt index 01465a8179dc24..5a1585c0a63073 100644 --- a/src/plugins/intel_npu/src/backend/CMakeLists.txt +++ b/src/plugins/intel_npu/src/backend/CMakeLists.txt @@ -25,7 +25,6 @@ target_link_libraries(${TARGET_NAME} PRIVATE openvino::npu_al openvino::npu_common - openvino_npu_zero_result_parser ze_loader ) @@ -33,31 +32,3 @@ target_link_libraries(${TARGET_NAME} # targets install # ov_install_static_lib(${TARGET_NAME} ${NPU_INTERNAL_COMPONENT}) - -if(TARGET ze_loader) - if(NOT BUILD_SHARED_LIBS) - # Support link of static runtime in case system does not have ze_loader - install(TARGETS ze_loader EXPORT OpenVINOTargets - RUNTIME DESTINATION ${OV_CPACK_RUNTIMEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - ARCHIVE DESTINATION ${OV_CPACK_ARCHIVEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - LIBRARY DESTINATION ${OV_CPACK_LIBRARYDIR} COMPONENT ${NPU_PLUGIN_COMPONENT}) - - install(TARGETS utils EXPORT OpenVINOTargets - RUNTIME DESTINATION ${OV_CPACK_RUNTIMEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - ARCHIVE DESTINATION ${OV_CPACK_ARCHIVEDIR} COMPONENT ${NPU_PLUGIN_COMPONENT} - LIBRARY DESTINATION ${OV_CPACK_LIBRARYDIR} COMPONENT ${NPU_PLUGIN_COMPONENT}) - - # export to local tree to build against static build tree - export(TARGETS ze_loader NAMESPACE openvino:: - APPEND FILE "${CMAKE_BINARY_DIR}/OpenVINOTargets.cmake") - - export(TARGETS utils NAMESPACE openvino:: - APPEND FILE "${CMAKE_BINARY_DIR}/OpenVINOTargets.cmake") - endif() - - # Support tests to run with ze_loader - install(TARGETS ze_loader - RUNTIME DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL - LIBRARY DESTINATION tests COMPONENT tests EXCLUDE_FROM_ALL) -endif() - diff --git a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp index 68e4f9434418a6..038c7c1d2d9bf9 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_backend.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_backend.hpp @@ -9,7 +9,7 @@ #include "intel_npu/common/npu.hpp" #include "intel_npu/utils/logger/logger.hpp" -#include "zero_init.hpp" +#include 
"intel_npu/utils/zero/zero_init.hpp" namespace intel_npu { class ZeroEngineBackend final : public IEngineBackend { @@ -29,15 +29,14 @@ class ZeroEngineBackend final : public IEngineBackend { bool isCommandQueueExtSupported() const override; bool isLUIDExtSupported() const override; + const std::shared_ptr& getInitStruct() const; + void* getContext() const override; - void* getDriverHandle() const; - void* getDeviceHandle() const; - ze_graph_dditable_ext_curr_t& getGraphDdiTable() const; void updateInfo(const Config& config) override; private: - std::shared_ptr _instance; + std::shared_ptr _initStruct; std::map> _devices{}; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/include/zero_device.hpp b/src/plugins/intel_npu/src/backend/include/zero_device.hpp index e87a602613a92a..50f0d28ed210cd 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_device.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_device.hpp @@ -10,9 +10,9 @@ #include "intel_npu/common/icompiled_model.hpp" #include "intel_npu/common/npu.hpp" #include "intel_npu/utils/logger/logger.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" -#include "zero_init.hpp" -#include "zero_types.hpp" namespace intel_npu { @@ -20,9 +20,6 @@ class ZeroDevice : public IDevice { public: ZeroDevice(const std::shared_ptr& initStructs); - std::shared_ptr createExecutor(const std::shared_ptr& networkDescription, - const Config& config) override; - std::string getName() const override; std::string getFullDeviceName() const override; Uuid getUuid() const override; @@ -36,7 +33,6 @@ class ZeroDevice : public IDevice { ov::device::Type getDeviceType() const override; std::shared_ptr createInferRequest(const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) override; void updateInfo(const Config& config) override { log.setLevel(config.get()); @@ -76,8 +72,6 @@ class ZeroDevice : public IDevice { {ov::element::u8, 0.f}, {ov::element::i8, 0.f}}; - uint32_t _group_ordinal; - Logger log; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_executor.hpp b/src/plugins/intel_npu/src/backend/include/zero_executor.hpp deleted file mode 100644 index eeb96defc16441..00000000000000 --- a/src/plugins/intel_npu/src/backend/include/zero_executor.hpp +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include - -#include - -#include "intel_npu/common/npu.hpp" -#include "intel_npu/utils/logger/logger.hpp" -#include "openvino/runtime/properties.hpp" -#include "zero_init.hpp" -#include "zero_wrappers.hpp" - -namespace intel_npu { - -class ZeroExecutor final : public IExecutor { -public: - ZeroExecutor(const std::shared_ptr& initStructs, - const std::shared_ptr& networkDescription, - const Config& config, - const uint32_t& group_ordinal); - - ZeroExecutor(const ZeroExecutor&) = delete; - ZeroExecutor& operator=(const ZeroExecutor&) = delete; - - ~ZeroExecutor() override; - - struct ArgumentDescriptor { - ze_graph_argument_properties_3_t info; - uint32_t idx; - }; - - void setArgumentValue(uint32_t argi_, const void* argv_) const; - void setWorkloadType(const ov::WorkloadType workloadType) const override; - void mutexLock() const; - void mutexUnlock() const; - inline ze_graph_handle_t graph() const { - return _graph; - } - inline std::shared_ptr getInitStructs() const { 
- return _initStructs; - } - inline const std::shared_ptr& getNetworkDesc() const { - return _networkDesc; - } - inline const std::shared_ptr& getCommandQueue() const { - return _command_queues; - } - inline const uint32_t& get_group_ordinal() const { - return _group_ordinal; - } - inline const std::vector& get_input_descriptors() const { - return _input_descriptors; - } - inline const std::vector& get_output_descriptors() const { - return _output_descriptors; - } - -private: - void initialize_graph_through_command_list() const; - - const Config _config; - Logger _logger; - - const std::shared_ptr _initStructs; - std::shared_ptr _networkDesc; - - ze_graph_dditable_ext_curr_t& _graph_ddi_table_ext; - - const uint32_t _group_ordinal; - - ze_graph_handle_t _graph = nullptr; - - std::vector _input_descriptors; - std::vector _output_descriptors; - - std::shared_ptr _command_queues; - - mutable std::mutex _mutex; -}; - -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp index 52000930e2a751..a214c8e2cb2b5d 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp @@ -5,8 +5,8 @@ #pragma once #include "intel_npu/config/config.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/itensor.hpp" -#include "zero_init.hpp" #include "zero_remote_tensor.hpp" namespace intel_npu { diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 48aad52010a4c2..31248b582250da 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -11,19 +11,17 @@ #include "intel_npu/common/sync_infer_request.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" #include "zero_pipeline.hpp" #include "zero_profiling.hpp" #include "zero_remote_tensor.hpp" -#include "zero_wrappers.hpp" namespace intel_npu { class ZeroInferRequest final : public SyncInferRequest { public: - explicit ZeroInferRequest(const std::shared_ptr& backendPtr, + explicit ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config); ov::SoPtr get_tensor(const ov::Output& port) const override; @@ -85,8 +83,7 @@ class ZeroInferRequest final : public SyncInferRequest { std::vector>& get_input_tensors_data(size_t index) const; const std::shared_ptr _initStructs; - const std::shared_ptr _executorPtr; - const ZeroExecutor* _executor; + const std::shared_ptr _graph; const Config _config; Logger _logger; diff --git a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp index 6ecbde0d546110..992f409b86a928 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_memory.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_memory.hpp @@ -11,7 +11,7 @@ #include #include "intel_npu/utils/logger/logger.hpp" -#include "zero_init.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" namespace { diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 62c8481d28ac1a..92a473a9fc412c 100644 --- 
a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -4,11 +4,11 @@ #pragma once +#include "intel_npu/common/igraph.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" #include "zero_memory.hpp" #include "zero_profiling.hpp" -#include "zero_wrappers.hpp" namespace intel_npu { @@ -21,13 +21,15 @@ struct TensorData { struct Pipeline { public: Pipeline(const Config& config, - const std::shared_ptr& executorPtr, + const std::shared_ptr& initStructs, + const std::shared_ptr& graph, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, - const size_t numberOfCommandLists); + size_t numberOfCommandLists, + uint32_t group_ordinal); Pipeline(const Pipeline&) = delete; Pipeline& operator=(const Pipeline&) = delete; @@ -42,8 +44,7 @@ struct Pipeline { protected: const Config _config; - const ZeroExecutor* _executor; - CommandQueue& _command_queue; + std::shared_ptr _command_queue; std::vector> _command_lists; std::vector> _fences; EventPool _event_pool; diff --git a/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp b/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp index 505a7f0185e135..17e263a7aaf620 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_profiling.hpp @@ -12,8 +12,8 @@ #include "intel_npu/config/compiler.hpp" #include "intel_npu/utils/logger/logger.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" #include "openvino/runtime/profiling_info.hpp" -#include "zero_types.hpp" namespace intel_npu { namespace zeroProfiling { diff --git a/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp index 0211bd5bd08962..5b08643704b651 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp @@ -9,8 +9,8 @@ #include #include "intel_npu/common/remote_tensor.hpp" +#include "intel_npu/utils/zero/zero_init.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" -#include "zero_init.hpp" namespace intel_npu { diff --git a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp index 86af62d414b88c..55aaad102e8b8f 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_backend.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_backend.cpp @@ -14,31 +14,31 @@ namespace intel_npu { ZeroEngineBackend::ZeroEngineBackend(const Config& config) : _logger("ZeroEngineBackend", Logger::global().level()) { _logger.debug("ZeroEngineBackend - initialize started"); - _instance = std::make_shared(); + _initStruct = std::make_shared(); - auto device = std::make_shared(_instance); + auto device = std::make_shared(_initStruct); _devices.emplace(std::make_pair(device->getName(), device)); _logger.debug("ZeroEngineBackend - initialize completed"); } uint32_t ZeroEngineBackend::getDriverVersion() const { - return _instance->getDriverVersion(); + return _initStruct->getDriverVersion(); } uint32_t ZeroEngineBackend::getGraphExtVersion() const { - return _instance->getGraphDdiTable().version(); + return _initStruct->getGraphDdiTable().version(); } bool ZeroEngineBackend::isBatchingSupported() const 
{ - return _instance->isExtensionSupported("ZE_extension_graph_1_6", ZE_MAKE_VERSION(1, 6)); + return _initStruct->isExtensionSupported("ZE_extension_graph_1_6", ZE_MAKE_VERSION(1, 6)); } bool ZeroEngineBackend::isCommandQueueExtSupported() const { - return _instance->isExtensionSupported(std::string(ZE_COMMAND_QUEUE_NPU_EXT_NAME), ZE_MAKE_VERSION(1, 0)); + return _initStruct->isExtensionSupported(std::string(ZE_COMMAND_QUEUE_NPU_EXT_NAME), ZE_MAKE_VERSION(1, 0)); } bool ZeroEngineBackend::isLUIDExtSupported() const { - return _instance->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0)); + return _initStruct->isExtensionSupported(std::string(ZE_DEVICE_LUID_EXT_NAME), ZE_MAKE_VERSION(1, 0)); } ZeroEngineBackend::~ZeroEngineBackend() = default; @@ -69,19 +69,11 @@ const std::vector ZeroEngineBackend::getDeviceNames() const { } void* ZeroEngineBackend::getContext() const { - return _instance->getContext(); + return _initStruct->getContext(); } -void* ZeroEngineBackend::getDriverHandle() const { - return _instance->getDriver(); -} - -void* ZeroEngineBackend::getDeviceHandle() const { - return _instance->getDevice(); -} - -ze_graph_dditable_ext_curr_t& ZeroEngineBackend::getGraphDdiTable() const { - return _instance->getGraphDdiTable(); +const std::shared_ptr& ZeroEngineBackend::getInitStruct() const { + return _initStruct; } void ZeroEngineBackend::updateInfo(const Config& config) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 58bcd0eb7cc944..6e16dde3b120bf 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -7,7 +7,6 @@ #include "intel_npu/common/itt.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" -#include "zero_executor.hpp" #include "zero_host_tensor.hpp" #include "zero_infer_request.hpp" #include "zero_remote_tensor.hpp" @@ -64,38 +63,6 @@ ZeroDevice::ZeroDevice(const std::shared_ptr& initStructs device_gops[ov::element::i8] = gops; device_gops[ov::element::f16] = 0.5f * gops; } - - std::vector command_group_properties; - uint32_t command_queue_group_count = 0; - // Discover all command queue groups - THROW_ON_FAIL_FOR_LEVELZERO( - "zeDeviceGetCommandQueueGroupProperties", - zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), &command_queue_group_count, nullptr)); - - log.debug("ZeroDevice::ZeroDevice - resize command_queue_group_count"); - command_group_properties.resize(command_queue_group_count); - - for (auto& prop : command_group_properties) { - prop.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES; - prop.pNext = nullptr; - } - - THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetCommandQueueGroupProperties", - zeDeviceGetCommandQueueGroupProperties(_initStructs->getDevice(), - &command_queue_group_count, - command_group_properties.data())); - - // Find the corresponding command queue group. 
- log.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); - _group_ordinal = zeroUtils::findGroupOrdinal(command_group_properties, device_properties); - log.debug("ZeroDevice::ZeroDevice - init completed"); -} - -std::shared_ptr ZeroDevice::createExecutor( - const std::shared_ptr& networkDescription, - const Config& config) { - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Device::createExecutor"); - return std::make_shared(_initStructs, networkDescription, config, _group_ordinal); } std::string ZeroDevice::getName() const { @@ -205,9 +172,8 @@ ov::device::Type ZeroDevice::getDeviceType() const { std::shared_ptr ZeroDevice::createInferRequest( const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) { - return std::make_shared(_initStructs, compiledModel, executor, config); + return std::make_shared(_initStructs, compiledModel, config); } ov::SoPtr ZeroDevice::createRemoteTensor(std::shared_ptr context, diff --git a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp b/src/plugins/intel_npu/src/backend/src/zero_executor.cpp deleted file mode 100644 index 32da2b2e0e4189..00000000000000 --- a/src/plugins/intel_npu/src/backend/src/zero_executor.cpp +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "zero_executor.hpp" - -#include - -#include -#include -#include -#include - -#include "intel_npu/common/itt.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/prefix.hpp" -#include "intel_npu/utils/zero/zero_utils.hpp" -#include "openvino/runtime/properties.hpp" -#include "ze_command_queue_npu_ext.h" -#include "zero_device.hpp" - -using namespace intel_npu; - -ZeroExecutor::ZeroExecutor(const std::shared_ptr& initStructs, - const std::shared_ptr& networkDescription, - const Config& config, - const uint32_t& group_ordinal) - : _config(config), - _logger("Graph", _config.get()), - _initStructs(initStructs), - _networkDesc(networkDescription), - _graph_ddi_table_ext(_initStructs->getGraphDdiTable()), - _group_ordinal(group_ordinal), - _command_queues{std::make_shared(_initStructs->getDevice(), - _initStructs->getContext(), - zeroUtils::toZeQueuePriority(_config.get()), - _initStructs->getCommandQueueDdiTable(), - _config, - group_ordinal)} { - _logger.debug("ZeroExecutor::ZeroExecutor - create graph"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, itt::domains::LevelZeroBackend, "Executor::ZeroExecutor", "graphCreate"); - - // _graph is a nullptr for CIP path, a new handle will be obtained from the driver based on the given - // compiledNetwork _graph gets (reuses) graphHandle from the compiler for CID path - if (_networkDesc->metadata.graphHandle == nullptr) { - _logger.debug("create graph handle on executor"); - ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NATIVE, - _networkDesc->compiledNetwork.size(), - _networkDesc->compiledNetwork.data(), - nullptr}; - ze_result_t result = - _graph_ddi_table_ext.pfnCreate(_initStructs->getContext(), _initStructs->getDevice(), &desc, &_graph); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnCreate", result, _graph_ddi_table_ext); - - } else { - _logger.debug("reuse graph handle created from compiler"); - _graph = static_cast(_networkDesc->metadata.graphHandle); - } - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetProperties"); - _logger.debug("performing pfnGetProperties"); - ze_graph_properties_t props{}; - props.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES; - - ze_result_t result = 
_graph_ddi_table_ext.pfnGetProperties(_graph, &props); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetProperties", result, _graph_ddi_table_ext); - - auto targetDriverExtVersion = _graph_ddi_table_ext.version(); - if (targetDriverExtVersion <= ZE_GRAPH_EXT_VERSION_1_1) { - OPENVINO_THROW("Incompatibility between the NPU plugin and driver! The driver version is too old, please " - "update the driver version"); - } - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGetArgumentProperties3"); - _logger.debug("performing pfnGetArgumentProperties3"); - for (uint32_t index = 0; index < props.numGraphArgs; ++index) { - ze_graph_argument_properties_3_t arg3{}; - arg3.stype = ZE_STRUCTURE_TYPE_GRAPH_ARGUMENT_PROPERTIES; - ze_result_t result = _graph_ddi_table_ext.pfnGetArgumentProperties3(_graph, index, &arg3); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnGetArgumentProperties3", result, _graph_ddi_table_ext); - - if (arg3.type == ZE_GRAPH_ARGUMENT_TYPE_INPUT) { - _input_descriptors.push_back(ArgumentDescriptor{arg3, index}); - } else { - _output_descriptors.push_back(ArgumentDescriptor{arg3, index}); - } - } - - if (_graph_ddi_table_ext.version() < ZE_GRAPH_EXT_VERSION_1_8) { - initialize_graph_through_command_list(); - } else { - ze_graph_properties_2_t properties = {}; - properties.stype = ZE_STRUCTURE_TYPE_GRAPH_PROPERTIES; - _graph_ddi_table_ext.pfnGetProperties2(_graph, &properties); - - if (properties.initStageRequired & ZE_GRAPH_STAGE_INITIALIZE) { - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "pfnGraphInitialize"); - _graph_ddi_table_ext.pfnGraphInitialize(_graph); - } - - if (properties.initStageRequired & ZE_GRAPH_STAGE_COMMAND_LIST_INITIALIZE) { - initialize_graph_through_command_list(); - } - } - - if (config.has()) { - setWorkloadType(config.get()); - } -} - -void ZeroExecutor::initialize_graph_through_command_list() const { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_GRAPH, - itt::domains::LevelZeroBackend, - "Executor::ZeroExecutor", - "initialize_graph_through_command_list"); - - _logger.debug("ZeroExecutor::ZeroExecutor init start - create graph_command_list"); - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Executor::ZeroExecutor"); - CommandList graph_command_list(_initStructs->getDevice(), - _initStructs->getContext(), - _graph_ddi_table_ext, - _config, - _group_ordinal); - _logger.debug("ZeroExecutor::ZeroExecutor - create graph_command_queue"); - CommandQueue graph_command_queue(_initStructs->getDevice(), - _initStructs->getContext(), - ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - _initStructs->getCommandQueueDdiTable(), - _config, - _group_ordinal); - _logger.debug("ZeroExecutor::ZeroExecutor - create fence"); - Fence fence(graph_command_queue, _config); - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "appendGraphInitialize"); - _logger.debug("ZeroExecutor::ZeroExecutor - performing appendGraphInitialize"); - graph_command_list.appendGraphInitialize(_graph); - _logger.debug("ZeroExecutor::ZeroExecutor - closing graph command list"); - graph_command_list.close(); - - OV_ITT_TASK_NEXT(ZERO_EXECUTOR_GRAPH, "queue_execute"); - _logger.debug("ZeroExecutor::ZeroExecutor - performing executeCommandList"); - graph_command_queue.executeCommandList(graph_command_list, fence); - _logger.debug("ZeroExecutor::ZeroExecutor - performing hostSynchronize"); - fence.hostSynchronize(); - _logger.debug("ZeroExecutor::ZeroExecutor - hostSynchronize completed"); -} - -void ZeroExecutor::setWorkloadType(const ov::WorkloadType workloadType) const { - ze_command_queue_workload_type_t zeWorkloadType; - switch (workloadType) { - case 
ov::WorkloadType::DEFAULT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; - break; - case ov::WorkloadType::EFFICIENT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; - break; - default: - OPENVINO_THROW("Unknown value for WorkloadType!"); - } - - _command_queues->setWorkloadType(zeWorkloadType); -} - -void ZeroExecutor::setArgumentValue(uint32_t argi_, const void* argv_) const { - ze_result_t result = _graph_ddi_table_ext.pfnSetArgumentValue(_graph, argi_, argv_); - if (ZE_RESULT_SUCCESS != result) { - THROW_ON_FAIL_FOR_LEVELZERO_EXT("zeGraphSetArgumentValue", result, _graph_ddi_table_ext); - } -} - -void ZeroExecutor::mutexLock() const { - _mutex.lock(); -} - -void ZeroExecutor::mutexUnlock() const { - _mutex.unlock(); -} - -ZeroExecutor::~ZeroExecutor() { - _logger.debug("~ZeroExecutor() - pfnDestroy _graph "); - auto result = _graph_ddi_table_ext.pfnDestroy(_graph); - if (ZE_RESULT_SUCCESS != result) { - _logger.error("_graph_ddi_table_ext.pfnDestroy failed %#X", uint64_t(result)); - } -} diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index dd2629372dc7d8..1c5ceecfac1961 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -31,8 +31,7 @@ constexpr bool OUTPUT = false; * @param ioDescriptor The OpenVINO API specific I/O descriptor which shall be compared. * @param zeDescriptor The Level Zero specific structure used for comparison. */ -void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, - const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { +void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, const ArgumentDescriptor& zeDescriptor) { std::string zeDescriptorName = zeDescriptor.info.name; if (isStateInputName(zeDescriptorName)) { @@ -158,38 +157,35 @@ std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& me //------------------------------------------------------------------------------ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& initStructs, const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) : SyncInferRequest(compiledModel, config), _initStructs(initStructs), - _executorPtr(executor), - _executor(static_cast(_executorPtr.get())), + _graph(compiledModel->get_graph()), _config(config), _logger("ZeroInferRequest", config.get()), _levelZeroInputTensors(_metadata.inputs.size(), std::vector>(1, nullptr)), _levelZeroOutputTensors(_metadata.outputs.size(), nullptr), _inputTensorsData(_metadata.inputs.size(), std::vector>(1, std::nullopt)), _outputTensorsData(_metadata.outputs.size(), std::nullopt), - _profilingPool(_executor->graph(), zeroProfiling::POOL_SIZE, _executor->getInitStructs()->getProfilingDdiTable()), - _profilingQuery(0, - _executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getProfilingDdiTable()) { + _profilingPool(static_cast(_graph->get_handle()), + zeroProfiling::POOL_SIZE, + _initStructs->getProfilingDdiTable()), + _profilingQuery(0, _initStructs->getDevice(), _initStructs->getProfilingDdiTable()) { _logger.debug("ZeroInferRequest::ZeroInferRequest - SyncInferRequest"); - const std::vector& executorInputDescriptors = _executor->get_input_descriptors(); - const std::vector& executorOutputDescriptors = - _executor->get_output_descriptors(); + const std::vector& executorInputDescriptors = 
_graph->get_input_descriptors(); + const std::vector& executorOutputDescriptors = _graph->get_output_descriptors(); auto proftype = config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { _logger.debug("ZeroInferRequest::ZeroInferRequest - profiling type == ov::intel_npu::ProfilingType::INFER"); - _npuProfiling = std::make_shared(_executor->getInitStructs()->getContext(), - _executor->getInitStructs()->getDevice(), + _npuProfiling = std::make_shared(_initStructs->getContext(), + _initStructs->getDevice(), _config.get()); } _properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", - zeDeviceGetProperties(_executor->getInitStructs()->getDevice(), &_properties)); + zeDeviceGetProperties(_initStructs->getDevice(), &_properties)); _outputAllocator = std::make_shared(_initStructs); _inputAllocator = @@ -278,17 +274,24 @@ void ZeroInferRequest::create_pipeline() { _levelZeroOutputTensors.at(outputIndex)->get_byte_size()}); } + // Find the corresponding command queue group. + _logger.debug("ZeroDevice::ZeroDevice - findGroupOrdinal"); + auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties); + _logger.debug("ZeroDevice::ZeroDevice - init completed"); + _logger.debug("ZeroInferRequest::create_pipeline - constructing pipeline"); - // Construct pipeline + // Construct pipeline _pipeline = std::make_unique(_config, - _executorPtr, + _initStructs, + _graph, _profilingPool, _profilingQuery, _npuProfiling, _inputTensorsData, _outputTensorsData, - _numberOfCommandLists); + _numberOfCommandLists, + groupOrdinal); _logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed"); } @@ -338,8 +341,8 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList"); _pipeline->updateCommandList(*tensorsData, - isInput ? _executor->get_input_descriptors().at(index).idx - : _executor->get_output_descriptors().at(index).idx); + isInput ? _graph->get_input_descriptors().at(index).idx + : _graph->get_output_descriptors().at(index).idx); } } } @@ -370,9 +373,9 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptrupdateCommandList(*tensorsData, - isInput ? _executor->get_input_descriptors().at(index).idx - : _executor->get_output_descriptors().at(index).idx); + _pipeline->updateCommandList( + *tensorsData, + isInput ? 
_graph->get_input_descriptors().at(index).idx : _graph->get_output_descriptors().at(index).idx); } } @@ -390,13 +393,17 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const if (foundPort.is_input()) { if (get_user_input(foundPort.idx)._ptr == tensor._ptr) { // Got set_tensor with the same object - do nothing + _logger.debug("ZeroInferRequest::set_tensor - got the same tensor, do nothing"); return; } if (is_batched_input(foundPort.idx)) { // resize vector size to 1 if set_tensor is called after set_tensors get_input_tensors_data(foundPort.idx).resize(1); + get_input_tensors_data(foundPort.idx).shrink_to_fit(); get_level_zero_inputs(foundPort.idx).resize(1); + get_level_zero_inputs(foundPort.idx).shrink_to_fit(); get_user_inputs(foundPort.idx).resize(1); + get_user_inputs(foundPort.idx).shrink_to_fit(); } get_user_input(foundPort.idx) = tensor; @@ -485,7 +492,7 @@ void ZeroInferRequest::set_tensors(const ov::Output& port, if (_pipelineIsCreated) { OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList"); _pipeline->updateCommandList(*get_input_tensor_data(foundPort.idx, i), - _executor->get_input_descriptors().at(foundPort.idx).idx, + _graph->get_input_descriptors().at(foundPort.idx).idx, i); } } @@ -537,14 +544,16 @@ void ZeroInferRequest::infer_async() { _logger.debug("InferRequest::infer_async started"); OV_ITT_TASK_CHAIN(ZERO_INFER, itt::domains::LevelZeroBackend, "infer_async", "start"); - _executor->mutexLock(); - if (!_pipelineIsCreated) { - OV_ITT_TASK_NEXT(ZERO_INFER, "create_pipeline"); - create_pipeline(); + { + std::lock_guard lock(_graph->get_mutex()); - _pipelineIsCreated = true; + if (!_pipelineIsCreated) { + OV_ITT_TASK_NEXT(ZERO_INFER, "create_pipeline"); + create_pipeline(); + + _pipelineIsCreated = true; + } } - _executor->mutexUnlock(); size_t inputIndex = 0; for (const auto& userTensor : _userInputTensors) { @@ -740,12 +749,9 @@ std::vector ZeroInferRequest::get_profiling_info() const { if (compilerType == ov::intel_npu::CompilerType::MLIR) { // For plugin compiler retreive raw profiling data from backend and delegate // processing to the compiler - const auto& networkDesc = compiledModel.get_network_description(); - const auto& compiler = compiledModel.get_compiler(); - const auto& blob = networkDesc->compiledNetwork; auto profData = get_raw_profiling_data(); _logger.debug("InferRequest::get_profiling_info complete with compiler->process_profiling_output()."); - return compiler->process_profiling_output(profData, blob, compilerConfig); + return _graph->process_profiling_output(profData, compilerConfig); } else { auto proftype = _config.get(); if (proftype == ov::intel_npu::ProfilingType::INFER) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 009eee6541e8ef..34eb71eaf112f7 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -11,25 +11,25 @@ #include "intel_npu/prefix.hpp" #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_api.hpp" -#include "zero_types.hpp" +#include "intel_npu/utils/zero/zero_types.hpp" namespace intel_npu { Pipeline::Pipeline(const Config& config, - const std::shared_ptr& executorPtr, + const std::shared_ptr& initStructs, + const std::shared_ptr& graph, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, const std::vector>>& inputTensorsData, const std::vector>& 
outputTensorsData, - const size_t numberOfCommandLists) + size_t numberOfCommandLists, + uint32_t group_ordinal) : _config(config), - _executor(static_cast(executorPtr.get())), - _command_queue(*_executor->getCommandQueue()), - _event_pool{_executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getContext(), - numberOfCommandLists ? static_cast(numberOfCommandLists) : 1, - _config}, + _command_queue(graph->get_command_queue()), + _event_pool{initStructs->getDevice(), + initStructs->getContext(), + numberOfCommandLists ? static_cast(numberOfCommandLists) : 1}, _npu_profiling(std::move(npu_profiling)), _logger("Pipeline", _config.get()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); @@ -45,38 +45,37 @@ Pipeline::Pipeline(const Config& config, _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < numberOfCommandLists; i++) { _command_lists.emplace_back( - std::make_unique(_executor->getInitStructs()->getDevice(), - _executor->getInitStructs()->getContext(), - _executor->getInitStructs()->getGraphDdiTable(), - _config, - _executor->get_group_ordinal(), - _executor->getInitStructs()->getMutableCommandListVersion() ? true : false)); - _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i), _config)); - _fences.emplace_back(std::make_unique(_command_queue, _config)); + std::make_unique(initStructs->getDevice(), + initStructs->getContext(), + initStructs->getGraphDdiTable(), + group_ordinal, + initStructs->getMutableCommandListVersion() ? true : false)); + _events.emplace_back(std::make_unique(_event_pool.handle(), static_cast(i))); + _fences.emplace_back(std::make_unique(*_command_queue)); } for (size_t i = 0; i < numberOfCommandLists; i++) { size_t ioIndex = 0; - for (const auto& desc : _executor->get_input_descriptors()) { + for (const auto& desc : graph->get_input_descriptors()) { if (inputTensorsData.at(ioIndex).size() > 1) { - _executor->setArgumentValue(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); + graph->set_argument_value(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); ++ioIndex; continue; } - _executor->setArgumentValue(desc.idx, - static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + - (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + graph->set_argument_value(desc.idx, + static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + + (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); ++ioIndex; } ioIndex = 0; - for (const auto& desc : _executor->get_output_descriptors()) { - _executor->setArgumentValue(desc.idx, - static_cast(outputTensorsData.at(ioIndex)->mem) + - (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + for (const auto& desc : graph->get_output_descriptors()) { + graph->set_argument_value(desc.idx, + static_cast(outputTensorsData.at(ioIndex)->mem) + + (i * outputTensorsData.at(ioIndex)->size) / numberOfCommandLists); ++ioIndex; } @@ -86,7 +85,8 @@ Pipeline::Pipeline(const Config& config, _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_start)); } - _command_lists.at(i)->appendGraphExecute(_executor->graph(), profiling_query.getHandle()); + _command_lists.at(i)->appendGraphExecute(static_cast(graph->get_handle()), + profiling_query.getHandle()); /// append timestamp command if feature was activated if (_npu_profiling != nullptr) { @@ -108,11 +108,11 @@ void Pipeline::push() { _logger.debug("Pipeline - push() started"); for (size_t i = 0; i < 
_command_lists.size(); ++i) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); + OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { - _command_queue.executeCommandList(*_command_lists.at(i), *_fences.at(i)); + _command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { - _command_queue.executeCommandList(*_command_lists.at(i)); + _command_queue->executeCommandList(*_command_lists.at(i)); } } @@ -121,7 +121,7 @@ void Pipeline::push() { void Pipeline::pull() { _logger.debug("Pipeline - pull() started"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); + OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { diff --git a/src/plugins/intel_npu/src/common/CMakeLists.txt b/src/plugins/intel_npu/src/common/CMakeLists.txt index 2d1f5d9cbb39ea..1aa93cce1bc291 100644 --- a/src/plugins/intel_npu/src/common/CMakeLists.txt +++ b/src/plugins/intel_npu/src/common/CMakeLists.txt @@ -20,7 +20,7 @@ target_link_libraries(${TARGET_NAME} PUBLIC openvino::npu_al openvino::npu_logger_utils - openvino::runtime::dev + openvino::npu_zero_utils ) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp index eb6a3de57e41fc..19023a1fca883f 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiled_model.hpp @@ -7,8 +7,8 @@ #include #include +#include "intel_npu/common/igraph.hpp" #include "intel_npu/config/common.hpp" -#include "intel_npu/icompiler.hpp" #include "openvino/runtime/icompiled_model.hpp" namespace intel_npu { @@ -17,17 +17,10 @@ class ICompiledModel : public ov::ICompiledModel { public: using ov::ICompiledModel::ICompiledModel; - virtual const std::shared_ptr& get_network_description() const = 0; + virtual const std::shared_ptr& get_graph() const = 0; virtual const Config& get_config() const = 0; - // Compiler is used for post-processing profiling data when using PERF_COUNT property - virtual const ov::SoPtr& get_compiler() const = 0; - - const NetworkMetadata& get_network_metadata() const { - return get_network_description()->metadata; - } - protected: std::shared_ptr shared_from_this() const { return std::dynamic_pointer_cast(ov::ICompiledModel::shared_from_this()); diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp new file mode 100644 index 00000000000000..51c4a4cf26eafd --- /dev/null +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -0,0 +1,103 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "intel_npu/network_metadata.hpp" +#include "intel_npu/utils/zero/zero_utils.hpp" +#include "intel_npu/utils/zero/zero_wrappers.hpp" +#include "openvino/runtime/profiling_info.hpp" + +namespace intel_npu { + +class IGraph : public std::enable_shared_from_this { +public: + IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, std::optional> blob) + : 
_handle(handle), + _metadata(std::move(metadata)) { + if (blob.has_value()) { + _blob = std::move(*blob); + } + } + + virtual void export_blob(std::ostream& stream) const = 0; + + virtual std::vector process_profiling_output(const std::vector& profData, + const Config& config) const = 0; + + virtual void set_argument_value(uint32_t argi, const void* argv) const = 0; + + virtual void initialize(const Config& config) = 0; + + virtual ~IGraph() = default; + + const NetworkMetadata& get_metadata() const { + return _metadata; + } + + ze_graph_handle_t get_handle() const { + return _handle; + } + + void update_network_name(std::string_view name) { + _metadata.name = name; + } + + inline const std::vector& get_input_descriptors() const { + return _input_descriptors; + } + + inline const std::vector& get_output_descriptors() const { + return _output_descriptors; + } + + inline const std::shared_ptr& get_command_queue() const { + return _command_queue; + } + + void set_workload_type(const ov::WorkloadType workloadType) const { + if (_command_queue == nullptr) { + return; + } + + ze_command_queue_workload_type_t zeWorkloadType; + switch (workloadType) { + case ov::WorkloadType::DEFAULT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; + break; + case ov::WorkloadType::EFFICIENT: + zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; + break; + default: + OPENVINO_THROW("Unknown value for WorkloadType!"); + } + + _command_queue->setWorkloadType(zeWorkloadType); + } + + std::mutex& get_mutex() { + return _mutex; + } + +protected: + ze_graph_handle_t _handle = nullptr; + NetworkMetadata _metadata; + + std::vector _input_descriptors; + std::vector _output_descriptors; + + std::shared_ptr _command_queue; + + // Used to protect zero pipeline creation in the graph. 
The pipeline should be created only once per graph when the + // first inference starts running + std::mutex _mutex; + + std::vector _blob; +}; + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp index 8c1eb57fe34fc3..b34f2deee6c61e 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp @@ -7,9 +7,9 @@ #include #include "intel_npu/common/icompiled_model.hpp" +#include "intel_npu/common/igraph.hpp" #include "intel_npu/common/sync_infer_request.hpp" #include "intel_npu/config/config.hpp" -#include "intel_npu/icompiler.hpp" #include "openvino/runtime/intel_npu/remote_properties.hpp" #include "openvino/runtime/iremote_context.hpp" #include "openvino/runtime/properties.hpp" @@ -54,11 +54,14 @@ class IEngineBackend : public std::enable_shared_from_this { //------------------------------------------------------------------------------ -class IExecutor { +class ICompilerAdapter { public: - virtual ~IExecutor() = default; + virtual std::shared_ptr compile(const std::shared_ptr& model, + const Config& config) const = 0; + virtual std::shared_ptr parse(std::vector network, const Config& config) const = 0; + virtual ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const = 0; - virtual void setWorkloadType(const ov::WorkloadType workloadType) const = 0; + virtual ~ICompilerAdapter() = default; }; //------------------------------------------------------------------------------ @@ -67,10 +70,6 @@ class IDevice : public std::enable_shared_from_this { public: using Uuid = ov::device::UUID; - virtual std::shared_ptr createExecutor( - const std::shared_ptr& networkDescription, - const Config& config) = 0; - virtual std::string getName() const = 0; virtual std::string getFullDeviceName() const = 0; virtual Uuid getUuid() const; @@ -85,7 +84,6 @@ class IDevice : public std::enable_shared_from_this { virtual std::shared_ptr createInferRequest( const std::shared_ptr& compiledModel, - const std::shared_ptr& executor, const Config& config) = 0; virtual void updateInfo(const Config& config) = 0; diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp index 99f9ce7cb0eb28..788ce87136a04d 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/sync_infer_request.hpp @@ -5,8 +5,9 @@ #pragma once #include "intel_npu/common/icompiled_model.hpp" +#include "intel_npu/common/igraph.hpp" #include "intel_npu/common/variable_state.hpp" -#include "intel_npu/icompiler.hpp" +#include "intel_npu/network_metadata.hpp" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/iplugin.hpp" diff --git a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp index 0ae0832fe29d72..0eeefccf43906d 100644 --- a/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/common/src/sync_infer_request.cpp @@ -21,7 +21,7 @@ namespace intel_npu { SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config) : _compiledModel(compiledModel), - _metadata(compiledModel->get_network_metadata()), + 
_metadata(compiledModel->get_graph()->get_metadata()), _logger("SyncInferRequest", config.get()), _userInputTensors(_metadata.inputs.size(), std::vector>(1, {nullptr})), _userOutputTensors(_metadata.outputs.size(), {nullptr}) { diff --git a/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp deleted file mode 100644 index addd9ca5308c65..00000000000000 --- a/src/plugins/intel_npu/src/compiler/include/driver_compiler_adapter.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "intel_npu/common/npu.hpp" -#include "intel_npu/icompiler.hpp" -#include "intel_npu/utils/logger/logger.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -/** - * @brief Adapter for Compiler in driver - * @details Wrap compiler in driver calls and do preliminary actions (like opset conversion) - */ -class LevelZeroCompilerAdapter final : public ICompiler { -public: - LevelZeroCompilerAdapter(std::shared_ptr iEngineBackend); - - uint32_t getSupportedOpsetVersion() const override final; - - NetworkDescription compile(const std::shared_ptr& model, - const Config& config) const override final; - - ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override final; - - NetworkMetadata parse(const std::vector& network, const Config& config) const override final; - - std::vector process_profiling_output(const std::vector& profData, - const std::vector& network, - const Config& config) const override final; - - void release(std::shared_ptr networkDescription) override; - - CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) override; - -private: - /** - * @brief Separate externals calls to separate class - */ - std::shared_ptr apiAdapter; - Logger _logger; -}; - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp b/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp deleted file mode 100644 index 5641408dffcac0..00000000000000 --- a/src/plugins/intel_npu/src/compiler/include/zero_compiler_in_driver.hpp +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include - -#include -#include - -#include "intel_npu/icompiler.hpp" -#include "intel_npu/utils/logger/logger.hpp" -#include "intel_npu/utils/zero/zero_api.hpp" -#include "zero_executor.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -using SerializedIR = std::pair>; - -#define NotSupportQuery(T) (T == ZE_GRAPH_EXT_VERSION_1_2) - -// ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, -// pfnQueryNetworkGetSupportedLayers) -#define SupportAPIGraphQueryNetworkV1(T) (T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) -#define SupportAPIGraphQueryNetworkV2(T) ((!NotSupportQuery(T) && !SupportAPIGraphQueryNetworkV1(T))) - -// For ext version >= 1.5, pfnCreate2 api is avaible -#define NotSupportGraph2(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4) - -// A bug inside the driver makes the "pfnGraphGetArgumentMetadata" call not safe for use prior to -// "ze_graph_dditable_ext_1_6_t". 
-// See: E#117498 -#define NotSupportArgumentMetadata(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5) - -#define UseCopyForNativeBinary(T) \ - (T == ZE_GRAPH_EXT_VERSION_1_2 || T == ZE_GRAPH_EXT_VERSION_1_3 || T == ZE_GRAPH_EXT_VERSION_1_4 || \ - T == ZE_GRAPH_EXT_VERSION_1_5 || T == ZE_GRAPH_EXT_VERSION_1_6) - -/** - * Adapter to use CiD through ZeroAPI - */ -template -class LevelZeroCompilerInDriver final : public ICompiler { -public: - LevelZeroCompilerInDriver(ze_driver_handle_t driverHandle, - ze_device_handle_t deviceHandle, - ze_context_handle_t zeContext, - ze_graph_dditable_ext_curr_t& graph_ddi_table_ext); - LevelZeroCompilerInDriver(const LevelZeroCompilerInDriver&) = delete; - LevelZeroCompilerInDriver& operator=(const LevelZeroCompilerInDriver&) = delete; - ~LevelZeroCompilerInDriver() override; - - uint32_t getSupportedOpsetVersion() const override final; - - ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; - - NetworkDescription compile(const std::shared_ptr& model, - const Config& config) const override final; - - ze_result_t seriazlideIRModelAndCreateGraph(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - ze_graph_handle_t& graphHandle) const; - - NetworkMetadata parse(const std::vector& network, const Config& config) const override final; - - std::vector process_profiling_output(const std::vector& profData, - const std::vector& network, - const Config& config) const override final { - OPENVINO_THROW("Profiling post-processing is not implemented."); - } - - template = true> - std::unordered_set getQueryResultFromSupportedLayers( - ze_result_t result, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - /** - * @brief Serialize input / output information to string format. - * @details Format: - * --inputs_precisions="0: [1:]" - * --inputs_layouts="0: [1:]" - * --outputs_precisions="0:" - * --outputs_layouts="0:" - * - * For older compiler versions, the name of the inputs/outputs may be used instead of their indices. - * - * Since the layout information is no longer an important part of the metadata values when using the 2.0 OV - * API, the layout fields shall be filled with default values in order to assure the backward compatibility - * with the driver. 
- */ - static std::string serializeIOInfo(const std::shared_ptr& model, const bool useIndices); - - void release(std::shared_ptr networkDescription) override; - - CompiledNetwork getCompiledNetwork(const NetworkDescription& networkDescription) override; - -private: - NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const; - - SerializedIR serializeIR(const std::shared_ptr& model, - ze_graph_compiler_version_info_t compilerVersion) const; - std::string serializeConfig(const Config& config, ze_graph_compiler_version_info_t& compilerVersion) const; - - template = true> - void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - uint32_t index, - std::vector& inputs, - std::vector& outputs) const; - - template = true> - void getMetadata(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - uint32_t index, - std::vector& inputs, - std::vector& outputs) const; - - template = true> - void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - std::vector& blob, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - void getNativeBinary(ze_graph_dditable_ext_curr_t& graphDdiTableExt, - ze_graph_handle_t graphHandle, - std::vector& /* unusedBlob */, - const uint8_t*& blobPtr, - size_t& blobSize) const; - - template = true> - ze_result_t seriazlideIRModelAndQueryNetworkCreateV2(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - const ze_device_handle_t& _deviceHandle, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version >= 1.5, support API (pfnCreate2, pfnQueryNetworkCreate2, pfnQueryContextMemory) - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - template = true> - ze_result_t seriazlideIRModelAndQueryNetworkCreateV1(const std::shared_ptr& model, - const Config& config, - ze_device_graph_properties_t deviceGraphProperties, - const ze_device_handle_t& _deviceHandle, - ze_graph_query_network_handle_t& hGraphQueryNetwork) const; - - // ext version == 1.3 && 1.4, support API (pfnQueryNetworkCreate, pfnQueryNetworkDestroy, - // pfnQueryNetworkGetSupportedLayers) - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - // For ext version < 1.3 - template = true> - std::unordered_set queryImpl(const std::shared_ptr& model, - const Config& config) const; - - template = true> - ze_result_t createGraph(const ze_graph_format_t& format, - const SerializedIR& serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - - template = true> - ze_result_t createGraph(const ze_graph_format_t& format, - const SerializedIR& serializedIR, - const std::string& buildFlags, - const uint32_t& flags, - ze_graph_handle_t* graph) const; - -private: - ze_driver_handle_t _driverHandle = nullptr; - ze_device_handle_t _deviceHandle = nullptr; - ze_context_handle_t _context = nullptr; - - ze_graph_dditable_ext_curr_t& _graphDdiTableExt; - Logger _logger; -}; - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp deleted file mode 100644 index 0406b375609044..00000000000000 --- a/src/plugins/intel_npu/src/compiler/src/driver_compiler_adapter.cpp +++ /dev/null @@ -1,130 +0,0 
@@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "driver_compiler_adapter.hpp" - -#include "graph_transformations.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/utils/zero/zero_api.hpp" -#include "intel_npu/utils/zero/zero_result.hpp" -#include "ze_intel_npu_uuid.h" -#include "zero_backend.hpp" -#include "zero_compiler_in_driver.hpp" -#include "zero_init.hpp" - -namespace intel_npu { -namespace driverCompilerAdapter { - -LevelZeroCompilerAdapter::LevelZeroCompilerAdapter(std::shared_ptr iEngineBackend) - : _logger("LevelZeroCompilerAdapter", Logger::global().level()) { - _logger.debug("initialize LevelZeroCompilerAdapter start"); - - std::shared_ptr zeroBackend = nullptr; - zeroBackend = std::dynamic_pointer_cast(iEngineBackend); - if (!zeroBackend) { - OPENVINO_THROW("LevelZeroCompilerAdapter init failed to cast zeroBackend, zeroBackend is a nullptr"); - } - - ze_context_handle_t zeContext = static_cast(zeroBackend->getContext()); - ze_driver_handle_t driverHandle = static_cast(zeroBackend->getDriverHandle()); - ze_device_handle_t deviceHandle = static_cast(zeroBackend->getDeviceHandle()); - ze_graph_dditable_ext_curr_t& graph_ddi_table_ext = zeroBackend->getGraphDdiTable(); - - uint32_t graphExtVersion = graph_ddi_table_ext.version(); - - if (driverHandle == nullptr) { - OPENVINO_THROW("LevelZeroCompilerAdapter failed to get properties about zeDriver"); - } - - _logger.info("LevelZeroCompilerAdapter creating adapter using graphExtVersion"); - - switch (graphExtVersion) { - case ZE_GRAPH_EXT_VERSION_1_3: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_4: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_5: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_6: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_7: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - case ZE_GRAPH_EXT_VERSION_1_8: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - default: - apiAdapter = std::make_shared>(driverHandle, - deviceHandle, - zeContext, - graph_ddi_table_ext); - break; - } - - _logger.info("initialize LevelZeroCompilerAdapter complete, using graphExtVersion: %d.%d", - ZE_MAJOR_VERSION(graphExtVersion), - ZE_MINOR_VERSION(graphExtVersion)); -} - -uint32_t LevelZeroCompilerAdapter::getSupportedOpsetVersion() const { - return apiAdapter->getSupportedOpsetVersion(); -} - -NetworkDescription LevelZeroCompilerAdapter::compile(const std::shared_ptr& model, - const Config& config) const { - _logger.debug("compile start"); - return apiAdapter->compile(model, config); -} - -ov::SupportedOpsMap LevelZeroCompilerAdapter::query(const std::shared_ptr& model, - const Config& config) const { - _logger.debug("query start"); - return apiAdapter->query(model, config); -} - -NetworkMetadata LevelZeroCompilerAdapter::parse(const std::vector& network, const Config& config) const { - _logger.debug("parse start"); - return apiAdapter->parse(network, config); -} - -std::vector LevelZeroCompilerAdapter::process_profiling_output(const std::vector&, - const std::vector&, - const 
Config&) const { - OPENVINO_THROW("Profiling post-processing is not implemented."); -} - -void LevelZeroCompilerAdapter::release(std::shared_ptr networkDescription) { - _logger.info("release - using adapter to release networkDescription"); - apiAdapter->release(std::move(networkDescription)); -} - -CompiledNetwork LevelZeroCompilerAdapter::getCompiledNetwork(const NetworkDescription& networkDescription) { - _logger.info("getCompiledNetwork - using adapter to perform getCompiledNetwork(networkDescription)"); - return apiAdapter->getCompiledNetwork(networkDescription); -} - -} // namespace driverCompilerAdapter -} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp b/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp deleted file mode 100644 index 8f7ac4198bb0a4..00000000000000 --- a/src/plugins/intel_npu/src/compiler/src/zero_compiler_in_driver.cpp +++ /dev/null @@ -1,1081 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "zero_compiler_in_driver.hpp" - -#include -#include - -#include "graph_transformations.hpp" -#include "intel_npu/common/itt.hpp" -#include "intel_npu/config/common.hpp" -#include "intel_npu/config/compiler.hpp" -#include "intel_npu/config/runtime.hpp" -#include "intel_npu/prefix.hpp" -#include "intel_npu/utils/zero/zero_result.hpp" -#include "openvino/core/model.hpp" - -namespace { - -constexpr std::string_view INPUTS_PRECISIONS_KEY = "--inputs_precisions"; -constexpr std::string_view INPUTS_LAYOUTS_KEY = "--inputs_layouts"; -constexpr std::string_view OUTPUTS_PRECISIONS_KEY = "--outputs_precisions"; -constexpr std::string_view OUTPUTS_LAYOUTS_KEY = "--outputs_layouts"; - -//
Release notes

Sourced from paddlepaddle's releases.

PaddlePaddle 2.6.1 Release Note

Release Notes

In terms of new features, this release introduces support for Fake GroupWise Quant, helping users quantize models more effectively. It also adds PGLBox, a graph neural network training engine that supports efficient multi-machine, multi-GPU training of ultra-large-scale graph models, and it adds custom device support, further extending the scope of PaddlePaddle's functionality. On the bug-fix side, it resolves issues in core functionality, data loading, and network communication. Several security issues, including potential vulnerabilities, were fixed to further harden the framework code, and the security advisories were updated.

New Features

  • Support Fake GroupWise quantization (#61900): adds support for a quantization method that can improve model performance and efficiency.
  • Support GPU training for graph neural networks (#60495, #62111): adds PGLBox, a graph neural network training engine that supports efficient multi-machine, multi-GPU training of ultra-large-scale graph models.
  • Other improvements: support int8-mode inference for the tile op and add vlog statements (#60261); repeat_interleave now accepts Tensor inputs with the bfloat16 data type (#61854, see the sketch after this list); custom devices support the c_embedding operator in dynamic graph mode (#60774); in the CINN framework, IntrinsicOps were added to ir_codes_collector.
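
A minimal sketch of the repeat_interleave change from #61854, assuming PaddlePaddle >= 2.6.1 and a device with bfloat16 support; the shapes and values below are illustrative, not taken from the release notes:

```python
import paddle

# Hypothetical usage sketch: repeat_interleave on a bfloat16 tensor (#61854).
# Assumes PaddlePaddle >= 2.6.1 and bfloat16 support on the current device.
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]]).astype("bfloat16")
y = paddle.repeat_interleave(x, repeats=2, axis=0)  # repeat each row twice
print(y.shape)  # [4, 2]
```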

Bug Fixes

  • Fix weight-quantization kernel error (#60184): resolves an issue in the weight quantization kernel when n is not divisible by 64.
  • Fix quantization-aware test issue (#61211): fixes problems in the quantization-aware training (QAT) tests to ensure they run correctly.
  • Fix Paddle-TRT integration issues (#61806, #61605, #61966): multiple fixes to the Paddle-TRT integration, including cached key-value (KV) quantization and unit-test failures.
  • Disable LLM_INT8 UT (#62282): disables the unit tests for large language model (LLM) INT8 precision to avoid unnecessary runtime.
  • Fix test_benchmark compilation failure (#61427): fixes the compilation failure of the test_benchmark unit test (#60092).
  • Fix the toolkit's data loader (#61867): applies necessary corrections to the toolkit's data loader.
  • Fix a series of put_along_axis issues (#62065): adds min/max/mean as newly supported values of the reduce parameter, fixes the backward gradient computation bug under reduce=add/mul, fixes the GPU forward computation bug under reduce=mul, and fixes a forward computation bug for very large sizes (see the sketch after this list).
  • Fix a Windows compilation bug (#60308): fixes a build failure on Windows where the common library could not be found.
  • Fix OpenSSL-CPU compilation error (#62079): fixes a build error in the cpu-openblas build scenario caused by the Python library not being linked correctly.
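
A minimal sketch of the extended put_along_axis reduce parameter from #62065, assuming PaddlePaddle >= 2.6.1; the index pattern is a hypothetical example chosen so each position receives exactly one update:

```python
import paddle

# Hypothetical usage sketch: put_along_axis with the newly supported
# reduce="mean" (#62065). Assumes PaddlePaddle >= 2.6.1.
arr = paddle.ones([2, 3])
# Each row scatters one value into every column (a permutation of indices),
# so each element becomes the mean of the original and its update: (1+4)/2.
indices = paddle.to_tensor([[0, 2, 1], [1, 0, 2]], dtype="int64")
values = paddle.full([2, 3], 4.0)
out = paddle.put_along_axis(arr, indices, values, axis=1, reduce="mean")
print(out.numpy())  # expected: all elements 2.5
```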

Security Fixes

Documentation

  • Documentation style improvements (#61688): improves the style and formatting of the documentation.
  • Security advisory updates (#60532, #60649): updates the 2023 security advisories to notify users of potential security issues.