diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
index a601b2c74c09e3..17498831a542d1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.hpp
@@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager {
         bool compressed_case = fc_prim->compressed_weights &&
                                one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) &&
                                one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) &&
-                               one_of(out_dt, {data_types::f16, data_types::f32});
+                               one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8});
 
         if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case)
             return false;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
index 6df614c3328dd9..3aafc2c727b345 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/permute_f_y_axes.cl
@@ -43,7 +43,7 @@ KERNEL (permute_f_y_axes)(
     result = FUSED_OPS_RESULT_VEC;
 #else
     IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
-    OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
+    OUT_VEC_TYPE result = TO_OUT_VEC_TYPE(ACTIVATION(res, ACTIVATION_PARAMS));
 #endif
     const int output_idx = OUTPUT_GET_INDEX(b_idx, f_out_idx, y_out_idx, x_idx);
     WRITE_VEC(result, 0, &output[output_idx]);
diff --git a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp
index 7c0c570f7cf54c..7b0aa921ef3ad5 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp
@@ -68,6 +68,34 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share
     }
 
     p.add_primitive(*op, fc);
+
+    if (op->get_input_partial_shape(0).size() > 3 && !p.use_new_shape_infer()) {
+        auto lastLayerName = primitive_name;
+        auto outReshapeName = primitive_name + "_cldnn_out_reshape";
+
+        // add reorder
+        auto outDims = op->get_output_shape(0);
+        auto outTensor = tensor_from_dims(outDims);
+
+        if (outDims.size() > 4) {
+            cldnn::format outputFormat = cldnn::format::bfyx;
+            switch (outDims.size()) {
+            case 5: outputFormat = cldnn::format::bfzyx; break;
+            case 6: outputFormat = cldnn::format::bfwzyx; break;
+            default: break;
+            }
+
+            cldnn::primitive_id reorderId = "reorder:" + outReshapeName + "_reorder";
+            cldnn::layout outputLayout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outTensor);
+            auto reorder_prim = cldnn::reorder(reorderId, cldnn::input_info(primitive_name), outputLayout);
+            p.add_primitive(*op, reorder_prim);
+            lastLayerName = reorderId;
+        }
+
+        // add reshape
+        auto outReshapePrim = cldnn::reshape(outReshapeName, cldnn::input_info(lastLayerName), outTensor);
+        p.add_primitive(*op, outReshapePrim);
+    }
 }
 
 static void CreateFullyConnectedOp(ProgramBuilder& p, const std::shared_ptr<op::FullyConnected>& op) {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index db93696865a971..5d5f901deeaf1f 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -15,8 +15,11 @@
 #include "intel_gpu/plugin/transformations_pipeline.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "intel_gpu/runtime/itt.hpp"
+#include "low_precision/add.hpp"
 #include "low_precision/convolution.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
+#include "low_precision/fold_convert.hpp"
+#include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
 #include "low_precision/low_precision.hpp"
 #include "low_precision/mat_mul.hpp"
@@ -25,7 +28,9 @@
 #include "low_precision/pull_reshape_through_dequantization.hpp"
 #include "low_precision/pull_transpose_through_dequantization.hpp"
 #include "low_precision/recurrent_cell.hpp"
+#include "low_precision/rt_info/bias_attribute.hpp"
 #include "low_precision/strided_slice.hpp"
+#include "low_precision/transpose.hpp"
 #include "openvino/core/deprecated.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/validation_util.hpp"
@@ -46,6 +51,7 @@
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/rnn_cell.hpp"
 #include "openvino/op/rnn_sequence.hpp"
+#include "openvino/op/scaled_dot_product_attention.hpp"
 #include "openvino/op/squeeze.hpp"
 #include "openvino/op/unsqueeze.hpp"
 #include "openvino/op/util/sub_graph_base.hpp"
@@ -169,7 +175,17 @@ static bool disable_reduce_decomposition(const std::shared_ptr<const ov::Node> n
     return false;
 }
 
-static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node) {
+static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node, bool supports_immad) {
+    std::vector<ov::DiscreteTypeInfo> target_consumers = { ov::opset1::MatMul::get_type_info_static(),
+                                                           ov::op::v8::Gather::get_type_info_static(),
+                                                           ov::op::v1::Convolution::get_type_info_static(),
+                                                           ov::opset1::Convolution::get_type_info_static(),
+                                                           ov::opset1::GroupConvolution::get_type_info_static() };
+
+    std::vector<ov::DiscreteTypeInfo> convolutions = { ov::op::v1::Convolution::get_type_info_static(),
+                                                       ov::opset1::Convolution::get_type_info_static(),
+                                                       ov::opset1::GroupConvolution::get_type_info_static() };
+
     auto all_has_types = [](const std::set<ov::Input<ov::Node>>& consumers, const std::vector<ov::DiscreteTypeInfo>& types) {
         return std::all_of(consumers.begin(), consumers.end(), [&types](const ov::Input<ov::Node>& input) {
             return cldnn::one_of(input.get_node()->get_type_info(), types);
@@ -177,27 +193,47 @@
     };
 
     const auto consumers = node->get_output_target_inputs(0);
-    if (all_has_types(consumers, { ov::op::v0::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-        return true;
-    auto are_multiply_from_decompression = [&all_has_types](const ov::Input<ov::Node> consumer) {
+    for (const auto& consumer : consumers) {
+        const auto& type_info = consumer.get_node()->get_type_info();
+        if (cldnn::one_of(type_info, target_consumers)) {
+            if (cldnn::one_of(type_info, convolutions) && consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                return false;
+            }
+            return true;
+        }
+    }
+
+    auto are_multiply_from_decompression = [&](const ov::Input<ov::Node> consumer) {
         if (!cldnn::one_of(consumer.get_node()->get_type_info(), { ov::op::v1::Multiply::get_type_info_static() }))
             return false;
         const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
-        if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-            return true;
+
+        for (const auto& child_consumer : child_consumers) {
+            const auto& type_info = child_consumer.get_node()->get_type_info();
+            if (cldnn::one_of(type_info, target_consumers)) {
+                if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                    return false;
+                }
+                return true;
+            }
+        }
         return false;
     };
 
-    auto are_converts_from_decompression = [&all_has_types, &are_multiply_from_decompression](const std::set<ov::Input<ov::Node>>& consumers) {
+    auto are_converts_from_decompression = [&](const std::set<ov::Input<ov::Node>>& consumers) {
         if (!all_has_types(consumers, { ov::opset1::Convert::get_type_info_static() }))
             return false;
         for (const auto& consumer : consumers) {
             const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
             for (const auto& child_consumer : child_consumers) {
                 const auto& type_info = child_consumer.get_node()->get_type_info();
-                if (cldnn::one_of(type_info, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-                    continue;
+                if (cldnn::one_of(type_info, target_consumers)) {
+                    if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                        return false;
+                    }
+                    return true;
+                }
                 if (are_multiply_from_decompression(child_consumer)) {
                     continue;
                 }
@@ -210,9 +246,16 @@ static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node
     if (all_has_types(consumers, { ov::opset1::Reshape::get_type_info_static() })) {
         for (const auto& consumer : consumers) {
             const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
-            if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }) ||
-                are_converts_from_decompression(child_consumers)) {
-                return true;
+            for (const auto& child_consumer : child_consumers) {
+                const auto& type_info = child_consumer.get_node()->get_type_info();
+                if (cldnn::one_of(type_info, target_consumers)) {
+                    if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                        return false;
+                    }
+                    return true;
+                } else if (are_converts_from_decompression(child_consumers)) {
+                    return true;
+                }
             }
         }
     }
@@ -330,13 +373,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression
         // types are not supported by oneDNN)
         manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, !device_info.supports_immad);
-
-        // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time.
-        if (!is_model_quantized) {
-            pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
-                return !is_decompression_multiply(node);
-            });
-        }
+        pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
+            return !is_decompression_multiply(node, device_info.supports_immad);
+        });
 
         const bool keep_precision_sensitive_in_fp32_1 = true;
         const bool convert_input_output_precision = false;
@@ -705,12 +744,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         auto lptPassConfig = lptManager.get_pass_config();
         // quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation
         lptPassConfig->disable<RecurrentCellTransformation>();
-        lptPassConfig->set_callback([](const_node_ptr& node) -> bool {
-            if (const auto mulitply = std::dynamic_pointer_cast<const ov::opset1::Multiply>(node)) {
-                return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
-            }
-            return false;
-        });
         lptPassConfig->set_callback<ConvolutionBackpropDataTransformation>([func, defaultPrecisions](const_node_ptr& node) -> bool {
             auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool {
                 const auto rank = shape.rank();
@@ -747,6 +780,40 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                 || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
         });
 
+        lptPassConfig->set_callback<AddTransformation>([&](const_node_ptr& node) -> bool {
+            for (auto& user : node->get_users()) {
+                if (ov::is_type<ov::op::v13::ScaledDotProductAttention>(user))
+                    return true;
+            }
+
+            return false;
+        });
+
+        lptPassConfig->set_callback([](const_node_ptr& node) -> bool {
+            return ov::is_type<ov::opset1::Multiply>(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node);
+        });
+
+        lptPassConfig->set_callback<FoldConvertTransformation>([&](const_node_ptr& node) -> bool {
+            const auto& consumers = node->get_output_target_inputs(0);
+            if (consumers.size() == 1) {
+                const auto consumer = consumers.begin()->get_node()->shared_from_this();
+                return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
+            }
+            return false;
+        });
+        lptPassConfig->set_callback<FuseConvertTransformation>([&](const_node_ptr& node) -> bool {
+            if (ov::is_type<ov::opset1::Multiply>(node)) {
+                return ov::is_type<ov::opset1::Multiply>(node) && is_decompression_multiply(node, device_info.supports_immad);
+            } else if (ov::is_type<ov::opset1::Subtract>(node)) {
+                const auto& consumers = node->get_output_target_inputs(0);
+                if (consumers.size() == 1) {
+                    const auto consumer = consumers.begin()->get_node()->shared_from_this();
+                    return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
+                }
+            }
+            return false;
+        });
+
         lptPassConfig->set_callback<MultiplyToGroupConvolutionTransformation>([&](const_node_ptr& node) -> bool {
             // disable MultiplyToGroupConvolution if Multiply with Constant can be fused
diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/hybrid.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/hybrid.cpp
new file mode 100644
index 00000000000000..f7de00d77a9bd9
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/hybrid.cpp
@@ -0,0 +1,379 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "transformations/rt_info/decompression.hpp"
+
+#include "openvino/op/parameter.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/result.hpp"
+#include "openvino/op/matmul.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/op/convert.hpp"
+#include "openvino/op/subtract.hpp"
+#include "openvino/op/transpose.hpp"
+
+namespace {
+using ov::test::InputShape;
+
+struct ShapeParams {
+    ShapeParams() = default;
+    ShapeParams(InputShape data_shape, ov::Shape weights_shape, int weights_group_size = -1)
+        : data_shape(std::move(data_shape)),
+          weights_shape(std::move(weights_shape)),
+          weights_group_size(weights_group_size) {}
+
+    InputShape data_shape;
+    ov::Shape weights_shape;
+    // Decompression group size. If the value is equal to -1, ordinary decompression is used
+    int weights_group_size;
+};
+
+const std::vector<ov::element::Type> activations_precisions = {ov::element::f32, ov::element::f16};
+const std::vector<ov::element::Type> weights_precisions = {ov::element::u8, ov::element::u4, ov::element::i4};
+const std::vector<bool> transpose_weights = {true, false};
+
+const std::vector<ShapeParams> input_shapes_basic = {
+    {{{-1, -1, -1}, {{1, 8, 4}, {1, 8, 4}}}, {4, 4}},
+};
+
+const std::vector<bool> add_decompression_sub = {true, false};
+const std::vector<bool> reshape_on_decompression = {true, false};
+const std::vector<bool> per_tensor_zp = {true, false};
+
+using MatmulWeightsDecompressionQuantizeConvolutionParams = std::tuple<ShapeParams,
+                                                                       ov::element::Type,  // weights precision
+                                                                       ov::element::Type,  // activations precision
+                                                                       bool,               // transpose weights
+                                                                       bool,               // decompression subtract
+                                                                       bool,               // reshape on decompression constants
+                                                                       bool,               // extra multiply
+                                                                       bool,               // per-tensor zero-point
+                                                                       uint64_t>;          // dynamic quantization group size
+
+class MatmulWeightsDecompressionQuantizeConvolution : public testing::WithParamInterface<MatmulWeightsDecompressionQuantizeConvolutionParams>,
+                                                      virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string get_test_case_name(testing::TestParamInfo<MatmulWeightsDecompressionQuantizeConvolutionParams> obj) {
+        ShapeParams shape_params;
+        ov::element::Type weights_precision;
+        ov::element::Type activations_precision;
+        bool transpose;
+        bool decompression_sub;
+        bool reshape_on_decompression;
+        bool extra_multiply;
+        bool per_tensor_zp;
+        uint64_t dyn_quan_group_size;
+
+        std::tie(shape_params,
+                 weights_precision,
+                 activations_precision,
+                 transpose,
+                 decompression_sub,
+                 reshape_on_decompression,
+                 extra_multiply,
+                 per_tensor_zp,
+                 dyn_quan_group_size) = obj.param;
+
+        std::ostringstream result;
+        result << "data_shape=";
+        result << ov::test::utils::partialShape2str({shape_params.data_shape.first}) << "_";
+        for (const auto& actual_shape : shape_params.data_shape.second) {
+            result << ov::test::utils::partialShape2str({actual_shape}) << "_";
+        }
+        result << "_" << "weights_shape=" << shape_params.weights_shape << "_";
+        result << "group_size=" << (shape_params.weights_group_size == -1 ? 1111 : shape_params.weights_group_size) << "_";
+        result << "weights_precision=" << weights_precision << "_";
+        result << "activations_precision=" << activations_precision << "_";
+        result << "transpose_weights=" << transpose << "_";
+        result << "decompression_subtract=" << decompression_sub << "_";
+        result << "reshape_on_decompression=" << reshape_on_decompression << "_";
+        result << "extra_multiply=" << extra_multiply << "_";
+        result << "per_tensor_zp=" << per_tensor_zp << "_";
+        result << "dyn_quan_group_size=" << dyn_quan_group_size << "_";
+        result << "quantize_conv";
+
+        return result.str();
+    }
+
+protected:
+    std::shared_ptr<ov::Model> init_subgraph(const ov::PartialShape& data_shape,
+                                             const ov::Shape& weights_shape,
+                                             const int group_size,
+                                             const ov::element::Type data_precision,
+                                             const ov::element::Type weights_precision,
+                                             const bool transpose_weights,
+                                             const bool add_subtract,
+                                             const bool reshape_on_decompression,
+                                             const bool extra_multiply,
+                                             const bool per_tensor_zp) {
+        ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(data_precision, data_shape)};
+        const auto weights_subgraph = init_compressed_weights_subgraph(weights_shape,
+                                                                       group_size,
+                                                                       data_precision,
+                                                                       weights_precision,
+                                                                       transpose_weights,
+                                                                       add_subtract,
+                                                                       reshape_on_decompression,
+                                                                       extra_multiply,
+                                                                       per_tensor_zp);
+
+        auto mat_mul = std::make_shared<ov::op::v0::MatMul>(params[0], weights_subgraph);
+
+        std::vector<int64_t> shape_pattern = {1, -1, 2, 4};
+        auto shapePatternsNode = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape({4}), shape_pattern);
+        auto reshape = std::make_shared<ov::op::v1::Reshape>(mat_mul, shapePatternsNode, false);
+        reshape->set_friendly_name("reshape");
+
+        auto conv = init_quantized_convolution_subgraph(reshape);
+        return std::make_shared<ov::Model>(ov::NodeVector{conv}, params, "MatmulWeightsDecompressionQuantizeConvolution");
+    }
+
+    std::shared_ptr<ov::Node> init_compressed_weights_subgraph(const ov::Shape& weights_shape,
+                                                               const int group_size,
+                                                               const ov::element::Type data_precision,
+                                                               const ov::element::Type weights_precision,
+                                                               const bool transpose_weights,
+                                                               const bool add_subtract,
+                                                               const bool reshape_on_decompression_constant,
+                                                               const bool extra_multiply,
+                                                               const bool per_tensor_zp) {
+        auto transpose_if_necessary = [&](const ov::Shape& shape) {
+            auto result_shape = shape;
+            if (transpose_weights)
+                std::swap(*result_shape.rbegin(), *(result_shape.rbegin() + 1));
+            return result_shape;
+        };
+
+        const bool group_decompression = group_size != -1;
+        // Weights has shape [I, O], where
+        // I - input channels
+        // O - output channels
+        // In case of group decompression, input channels dimension is split into 2: I -> [N, G], where
+        // N - number of groups
+        // G - group size
+        auto transformed_weights_shape = transpose_if_necessary(weights_shape);
+        if (group_decompression) {
+            OPENVINO_ASSERT(weights_shape[0] % group_size == 0,
+                            "Weights output channels count (",
+                            weights_shape[0],
+                            ") must be divisible by decompression group size (",
+                            group_size,
+                            ").");
+            auto in_channel_idx = transpose_weights ? transformed_weights_shape.size() - 1 : transformed_weights_shape.size() - 2;
+            transformed_weights_shape[in_channel_idx] = weights_shape[0] / group_size;
+            transformed_weights_shape.insert(transformed_weights_shape.begin() + in_channel_idx + 1, group_size);
+        }
+        auto weights_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, transformed_weights_shape);
+        auto weights = std::make_shared<ov::op::v0::Constant>(weights_tensor);
+        weights->set_friendly_name("Compressed_weights");
+        auto weights_convert = std::make_shared<ov::op::v0::Convert>(weights, data_precision);
+
+        std::shared_ptr<ov::Node> mul_parent = weights_convert;
+        auto output_channels = *weights_shape.rbegin();
+
+        // Decompression constants shape:
+        // Ordinary decompression: [O, 1]
+        // Group decompression: [O, N, 1]
+        ov::Shape scaleshift_target_shape{output_channels};
+        scaleshift_target_shape.insert(scaleshift_target_shape.begin(), group_decompression ? weights_shape[0] / group_size : 1);
+        scaleshift_target_shape = transpose_if_necessary(scaleshift_target_shape);
+        if (group_decompression) {
+            auto in_channel_idx = transpose_weights ? scaleshift_target_shape.size() - 1 : scaleshift_target_shape.size() - 2;
+            scaleshift_target_shape.insert(scaleshift_target_shape.begin() + in_channel_idx + 1, 1);
+        }
+
+        auto scaleshift_const_shape = scaleshift_target_shape;
+        if (reshape_on_decompression_constant)
+            scaleshift_const_shape.erase(std::remove(scaleshift_const_shape.begin(), scaleshift_const_shape.end(), 1), scaleshift_const_shape.end());
+        if (add_subtract) {
+            auto shift_tensor_shape = per_tensor_zp ? ov::Shape{1} : scaleshift_const_shape;
+            auto shift_tensor = ov::test::utils::create_and_fill_tensor(weights_precision, shift_tensor_shape);
+            if (per_tensor_zp && weights_precision.bitwidth() == 4) {
+                static_cast<uint8_t*>(shift_tensor.data())[0] = 0x88;
+            }
+            auto shift_const = std::make_shared<ov::op::v0::Constant>(shift_tensor);
+            std::shared_ptr<ov::Node> shift_convert = std::make_shared<ov::op::v0::Convert>(shift_const, data_precision);
+            if (reshape_on_decompression_constant && !per_tensor_zp) {
+                auto shift_reshape_const = ov::op::v0::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
+                auto shift_reshape = std::make_shared<ov::op::v1::Reshape>(shift_convert, shift_reshape_const, false);
+                shift_convert = shift_reshape;
+            }
+            mul_parent = std::make_shared<ov::op::v1::Subtract>(weights_convert, shift_convert);
+        }
+
+        ov::test::utils::InputGenerateData in_data;
+        in_data.start_from = -0.5;
+        in_data.range = 1;
+        in_data.resolution = 30000;
+        auto scale_tensor = ov::test::utils::create_and_fill_tensor(data_precision, scaleshift_const_shape, in_data);
+        for (size_t i = 0; i < scale_tensor.get_size(); i++) {
+            if (data_precision == ov::element::f16)
+                scale_tensor.data<ov::float16>()[i] /= ov::float16(16.f);
+            else if (data_precision == ov::element::f32)
+                scale_tensor.data<float>()[i] /= 16.f;
+        }
+        std::shared_ptr<ov::Node> scale_const = std::make_shared<ov::op::v0::Constant>(scale_tensor);
+        if (reshape_on_decompression_constant) {
+            auto scale_reshape_const = ov::op::v0::Constant::create(ov::element::i32, {scaleshift_target_shape.size()}, scaleshift_target_shape);
+            auto scale_reshape = std::make_shared<ov::op::v1::Reshape>(scale_const, scale_reshape_const, false);
+            scale_const = scale_reshape;
+        }
+        std::shared_ptr<ov::Node> last_node = std::make_shared<ov::op::v1::Multiply>(mul_parent, scale_const);
+
+        if (group_decompression) {
+            auto reshape_target_shape = transpose_weights ? std::vector<int>{-1, static_cast<int>(weights_shape[0])}
+                                                          : std::vector<int>{static_cast<int>(weights_shape[0]), -1};
+            auto target_shape_node = ov::op::v0::Constant::create(ov::element::i32, {reshape_target_shape.size()}, reshape_target_shape);
+            last_node = std::make_shared<ov::op::v1::Reshape>(last_node, target_shape_node, false);
+        }
+        if (transpose_weights) {
+            const size_t rank = last_node->get_output_partial_shape(0).size();
+            std::vector<int> order(rank);
+            std::iota(order.begin(), order.end(), 0);
+            std::swap(*order.rbegin(), *(order.rbegin() + 1));
+            auto transpose_constant = ov::op::v0::Constant::create(ov::element::i32, {rank}, order);
+            last_node = std::make_shared<ov::op::v1::Transpose>(last_node, transpose_constant);
+        } else if (extra_multiply) {
+            last_node = std::make_shared<ov::op::v1::Multiply>(last_node, scale_const);
+        }
+        return last_node;
+    }
+
+    std::shared_ptr<ov::Node> init_quantized_convolution_subgraph(std::shared_ptr<ov::Node> data) {
+        size_t input_channels = 4;
+        size_t output_channels = 4;
+        ov::Shape input_intervals_shape;
+        ov::Shape weights_intervals_shape;
+        ov::Shape weights_shape1{output_channels, input_channels, 1, 1};
+
+        auto low_act = ov::op::v0::Constant::create(ov::element::f32, input_intervals_shape, {0});
+        auto high_act = ov::op::v0::Constant::create(ov::element::f32, input_intervals_shape, {20});
+        auto low_weights = ov::op::v0::Constant::create(ov::element::f32, weights_intervals_shape, {-0.72519057});
+        auto high_weights = ov::op::v0::Constant::create(ov::element::f32, weights_intervals_shape, {0.72519057});
+        std::shared_ptr<ov::Node> activations = nullptr;
+        auto weights_tensor = ov::test::utils::create_and_fill_tensor_real_distribution(ov::element::f32, weights_shape1, -0.5f, 0.5f, 1);
+        std::shared_ptr<ov::Node> weights = std::make_shared<ov::op::v0::Constant>(weights_tensor);
+
+        auto output_low_act = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {0});
+        auto output_high_act = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {255});
+        activations = std::make_shared<ov::op::v0::FakeQuantize>(data, low_act, high_act, output_low_act, output_high_act, 256);
+        activations = std::make_shared<ov::op::v0::Convert>(activations, ov::element::u8);
+        activations = std::make_shared<ov::op::v0::Convert>(activations, ov::element::f32);
+        std::shared_ptr<ov::Node> shift_act = ov::op::v0::Constant::create(ov::element::u8, input_intervals_shape, {0});
+        shift_act = std::make_shared<ov::op::v0::Convert>(shift_act, ov::element::f32);
+        activations = std::make_shared<ov::op::v1::Subtract>(activations, shift_act);
+        auto scale_act = ov::op::v0::Constant::create(ov::element::f32, input_intervals_shape, {20.0 / 255.0});
+        activations = std::make_shared<ov::op::v1::Multiply>(activations, scale_act);
+
+        auto output_low_weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {0});
+        auto output_high_weights = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {254});
+        weights = std::make_shared<ov::op::v0::FakeQuantize>(weights, low_weights, high_weights, output_low_weights, output_high_weights, 255);
+        weights = std::make_shared<ov::op::v0::Convert>(weights, ov::element::i8);
+        weights = std::make_shared<ov::op::v0::Convert>(weights, ov::element::f32);
+        std::shared_ptr<ov::Node> shift_weights = ov::op::v0::Constant::create(ov::element::i8, weights_intervals_shape, {0});
+        shift_weights = std::make_shared<ov::op::v0::Convert>(shift_weights, ov::element::f32);
+        weights = std::make_shared<ov::op::v1::Subtract>(weights, shift_weights);
+        auto scale_weights = ov::op::v0::Constant::create(ov::element::f32, weights_intervals_shape, {2.0 / 255.0});
+        weights = std::make_shared<ov::op::v1::Multiply>(weights, scale_weights);
+
+        std::shared_ptr<ov::Node> conv;
+        conv = std::make_shared<ov::op::v1::Convolution>(activations, weights, ov::Strides{1, 1},
+                                                         ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0}, ov::Strides{1, 1});
+        return conv;
+    }
+
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_GPU;
+
+        ShapeParams shape_params;
+        ov::element::Type weights_precision;
+        ov::element::Type activations_precision;
+        bool transpose_weights;
+        bool decompression_sub;
+        bool reshape_on_decompression;
+        bool extra_multiply;
+        bool per_tensor_zp;
+        uint64_t dyn_quan_group_size;
+
+        std::tie(shape_params,
+                 weights_precision,
+                 activations_precision,
+                 transpose_weights,
+                 decompression_sub,
+                 reshape_on_decompression,
+                 extra_multiply,
+                 per_tensor_zp,
+                 dyn_quan_group_size) = GetParam();
+
+        init_input_shapes({shape_params.data_shape, {{}, {{shape_params.weights_shape}}}});
+
+        inType = outType = activations_precision;
+
+        function = init_subgraph(inputDynamicShapes[0],
+                                 shape_params.weights_shape,
+                                 shape_params.weights_group_size,
+                                 activations_precision,
+                                 weights_precision,
+                                 transpose_weights,
+                                 decompression_sub,
+                                 reshape_on_decompression,
+                                 extra_multiply,
+                                 per_tensor_zp);
+
+
+        if (activations_precision == ov::element::f16) {
+            abs_threshold = 1.0f;
+        } else {
+            abs_threshold = 1e-4f;
+        }
+
+        this->configuration.insert({ov::hint::dynamic_quantization_group_size(dyn_quan_group_size)});
+    }
+
+    void generate_inputs(const std::vector<ov::Shape>& target_input_static_shapes) override {
+        inputs.clear();
+        const auto& model_inputs = function->inputs();
+        for (size_t i = 0; i < model_inputs.size(); ++i) {
+            const auto& model_input = model_inputs[i];
+            ov::test::utils::InputGenerateData in_data;
+            in_data.start_from = -1;
+            in_data.range = 2;
+            in_data.resolution = 10000;
+            ov::Tensor tensor = ov::test::utils::create_and_fill_tensor(model_input.get_element_type(), target_input_static_shapes[i], in_data);
+            inputs.insert({model_input.get_node_shared_ptr(), tensor});
+        }
+    }
+
+    void check_results() {
+        const auto& test_param = GetParam();
+        ov::element::Type weights_precision = std::get<1>(test_param);
+        for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
+            if (n->get_friendly_name() == "Compressed_weights") {
+                ASSERT_EQ(n->get_output_element_type(0), weights_precision);
+            }
+        }
+    }
+};
+
+TEST_P(MatmulWeightsDecompressionQuantizeConvolution, Inference) {
+    run();
+    check_results();
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_MatmulWeightsDecompressionQuantizeConvolution_basic,
+                         MatmulWeightsDecompressionQuantizeConvolution,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes_basic),
+                                            ::testing::ValuesIn(weights_precisions),
+                                            ::testing::ValuesIn(activations_precisions),
+                                            ::testing::ValuesIn(transpose_weights),
+                                            ::testing::Values(true),
+                                            ::testing::Values(true),
+                                            ::testing::Values(false),
+                                            ::testing::Values(false),
+                                            ::testing::Values(0)),
+                         MatmulWeightsDecompressionQuantizeConvolution::get_test_case_name);
+
+}  // namespace