Skip to content

Commit

Permalink
[GPU] Fix hybrid quantization reg issue (#27404)
Browse files Browse the repository at this point in the history
Many daily int8 models show a performance regression (incorrect convolution data type and bias).
This change fixes the kernel selection issue.

### Tickets:
 - *156564, 155267, 156243*

---------

Signed-off-by: hyunback <[email protected]>
Co-authored-by: Vladimir Paramuzov <[email protected]>
  • Loading branch information
hyunback and vladimir-paramuzov authored Nov 19, 2024
1 parent 9163fb0 commit 40152d1
Show file tree
Hide file tree
Showing 5 changed files with 501 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager {
bool compressed_case = fc_prim->compressed_weights &&
one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) &&
one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) &&
one_of(out_dt, {data_types::f16, data_types::f32});
one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8});
if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case)
return false;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ KERNEL (permute_f_y_axes)(
result = FUSED_OPS_RESULT_VEC;
#else
IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
OUT_VEC_TYPE result = TO_OUT_VEC_TYPE(ACTIVATION(res, ACTIVATION_PARAMS));
#endif
const int output_idx = OUTPUT_GET_INDEX(b_idx, f_out_idx, y_out_idx, x_idx);
WRITE_VEC(result, 0, &output[output_idx]);
Expand Down
28 changes: 28 additions & 0 deletions src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,34 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share
}

p.add_primitive(*op, fc);

if (op->get_input_partial_shape(0).size() > 3 && !p.use_new_shape_infer()) {
auto lastLayerName = primitive_name;
auto outReshapeName = primitive_name + "_cldnn_out_reshape";

// add reorder
auto outDims = op->get_output_shape(0);
auto outTensor = tensor_from_dims(outDims);

if (outDims.size() > 4) {
cldnn::format outputFormat = cldnn::format::bfyx;
switch (outDims.size()) {
case 5: outputFormat = cldnn::format::bfzyx; break;
case 6: outputFormat = cldnn::format::bfwzyx; break;
default: break;
}

cldnn::primitive_id reorderId = "reorder:" + outReshapeName + "_reorder";
cldnn::layout outputLayout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outTensor);
auto reorder_prim = cldnn::reorder(reorderId, cldnn::input_info(primitive_name), outputLayout);
p.add_primitive(*op, reorder_prim);
lastLayerName = reorderId;
}

// add reshape
auto outReshapePrim = cldnn::reshape(outReshapeName, cldnn::input_info(lastLayerName), outTensor);
p.add_primitive(*op, outReshapePrim);
}
}

static void CreateFullyConnectedOp(ProgramBuilder& p, const std::shared_ptr<op::FullyConnected>& op) {
Expand Down
117 changes: 92 additions & 25 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
#include "intel_gpu/plugin/transformations_pipeline.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "low_precision/add.hpp"
#include "low_precision/convolution.hpp"
#include "low_precision/convolution_backprop_data.hpp"
#include "low_precision/fold_convert.hpp"
#include "low_precision/fuse_convert.hpp"
#include "low_precision/group_convolution.hpp"
#include "low_precision/low_precision.hpp"
#include "low_precision/mat_mul.hpp"
Expand All @@ -25,7 +28,9 @@
#include "low_precision/pull_reshape_through_dequantization.hpp"
#include "low_precision/pull_transpose_through_dequantization.hpp"
#include "low_precision/recurrent_cell.hpp"
#include "low_precision/rt_info/bias_attribute.hpp"
#include "low_precision/strided_slice.hpp"
#include "low_precision/transpose.hpp"
#include "openvino/core/deprecated.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/core/validation_util.hpp"
Expand All @@ -46,6 +51,7 @@
#include "openvino/op/reshape.hpp"
#include "openvino/op/rnn_cell.hpp"
#include "openvino/op/rnn_sequence.hpp"
#include "openvino/op/scaled_dot_product_attention.hpp"
#include "openvino/op/squeeze.hpp"
#include "openvino/op/unsqueeze.hpp"
#include "openvino/op/util/sub_graph_base.hpp"
Expand Down Expand Up @@ -169,35 +175,65 @@ static bool disable_reduce_decomposition(const std::shared_ptr<const ov::Node> n
return false;
}

static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node) {
static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node, bool supports_immad) {
std::vector<ov::DiscreteTypeInfo> target_consumers = { ov::opset1::MatMul::get_type_info_static(),
ov::op::v8::Gather::get_type_info_static(),
ov::op::v1::Convolution::get_type_info_static(),
ov::opset1::Convolution::get_type_info_static(),
ov::opset1::GroupConvolution::get_type_info_static() };

std::vector<ov::DiscreteTypeInfo> convolutions = { ov::op::v1::Convolution::get_type_info_static(),
ov::opset1::Convolution::get_type_info_static(),
ov::opset1::GroupConvolution::get_type_info_static() };

auto all_has_types = [](const std::set<ov::Input<ov::Node>>& consumers, const std::vector<ov::DiscreteTypeInfo>& types) {
return std::all_of(consumers.begin(), consumers.end(), [&types](const ov::Input<ov::Node>& input) {
return cldnn::one_of(input.get_node()->get_type_info(), types);
});
};

const auto consumers = node->get_output_target_inputs(0);
if (all_has_types(consumers, { ov::op::v0::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
return true;

auto are_multiply_from_decompression = [&all_has_types](const ov::Input<ov::Node> consumer) {
for (const auto& consumer : consumers) {
const auto& type_info = consumer.get_node()->get_type_info();
if (cldnn::one_of(type_info, target_consumers)) {
if (cldnn::one_of(type_info, convolutions) && consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
return false;
}
return true;
}
}

auto are_multiply_from_decompression = [&](const ov::Input<ov::Node> consumer) {
if (!cldnn::one_of(consumer.get_node()->get_type_info(), { ov::op::v1::Multiply::get_type_info_static() }))
return false;
const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
return true;

for (const auto& child_consumer : child_consumers) {
const auto& type_info = child_consumer.get_node()->get_type_info();
if (cldnn::one_of(type_info, target_consumers)) {
if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
return false;
}
return true;
}
}
return false;
};

auto are_converts_from_decompression = [&all_has_types, &are_multiply_from_decompression](const std::set<ov::Input<ov::Node>>& consumers) {
auto are_converts_from_decompression = [&](const std::set<ov::Input<ov::Node>>& consumers) {
if (!all_has_types(consumers, { ov::opset1::Convert::get_type_info_static() }))
return false;
for (const auto& consumer : consumers) {
const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
for (const auto& child_consumer : child_consumers) {
const auto& type_info = child_consumer.get_node()->get_type_info();
if (cldnn::one_of(type_info, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
continue;
if (cldnn::one_of(type_info, target_consumers)) {
if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
return false;
}
return true;
}
if (are_multiply_from_decompression(child_consumer)) {
continue;
}
Expand All @@ -210,9 +246,16 @@ static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node
if (all_has_types(consumers, { ov::opset1::Reshape::get_type_info_static() })) {
for (const auto& consumer : consumers) {
const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }) ||
are_converts_from_decompression(child_consumers)) {
return true;
for (const auto& child_consumer : child_consumers) {
const auto& type_info = child_consumer.get_node()->get_type_info();
if (cldnn::one_of(type_info, target_consumers)) {
if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
return false;
}
return true;
} else if (are_converts_from_decompression(child_consumers)) {
return true;
}
}
}
}
Expand Down Expand Up @@ -330,13 +373,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
// it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression
// types are not supported by oneDNN)
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, !device_info.supports_immad);

// Need to check if transformations work correctly for mixed models with both compression and quantization at the same time.
if (!is_model_quantized) {
pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
return !is_decompression_multiply(node);
});
}
pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
return !is_decompression_multiply(node, device_info.supports_immad);
});

const bool keep_precision_sensitive_in_fp32_1 = true;
const bool convert_input_output_precision = false;
Expand Down Expand Up @@ -705,12 +744,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
auto lptPassConfig = lptManager.get_pass_config();
// quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation
lptPassConfig->disable<ov::pass::low_precision::RecurrentCellTransformation>();
lptPassConfig->set_callback<ov::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
if (const auto mulitply = std::dynamic_pointer_cast<const ov::op::v1::Multiply>(node)) {
return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
}
return false;
});
lptPassConfig->set_callback<ConvolutionBackpropDataTransformation>([func, defaultPrecisions](const_node_ptr& node) -> bool {
auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool {
const auto rank = shape.rank();
Expand Down Expand Up @@ -747,6 +780,40 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
|| WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
});

lptPassConfig->set_callback<TransposeTransformation>([&](const_node_ptr& node) -> bool {
for (auto& user : node->get_users()) {
if (ov::is_type<ov::op::v13::ScaledDotProductAttention>(user))
return true;
}

return false;
});

lptPassConfig->set_callback<MarkupPrecisions>([](const_node_ptr& node) -> bool {
return ov::is_type<ov::opset1::Multiply>(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node);
});

lptPassConfig->set_callback<FoldConvertTransformation>([&](const_node_ptr& node) -> bool {
const auto& consumers = node->get_output_target_inputs(0);
if (consumers.size() == 1) {
const auto consumer = consumers.begin()->get_node()->shared_from_this();
return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
}
return false;
});
lptPassConfig->set_callback<FuseConvertTransformation>([&](const_node_ptr& node) -> bool {
if (ov::is_type<ov::opset1::Multiply>(node)) {
return ov::is_type<ov::opset1::Multiply>(node) && is_decompression_multiply(node, device_info.supports_immad);
} else if (ov::is_type<ov::opset1::Subtract>(node)) {
const auto& consumers = node->get_output_target_inputs(0);
if (consumers.size() == 1) {
const auto consumer = consumers.begin()->get_node()->shared_from_this();
return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
}
}
return false;
});

lptPassConfig->set_callback<MultiplyToGroupConvolutionTransformation>([&](const_node_ptr& node) -> bool {
// disable MultiplyToGroupConvolution if Multiply with Constant can be fused

Expand Down
Loading

0 comments on commit 40152d1

Please sign in to comment.