[GPU] Fix hybrid quantization reg issue #27404

Merged
Changes from 7 commits
@@ -50,7 +50,7 @@ struct FullyConnectedImplementationManager : public ImplementationManager {
         bool compressed_case = fc_prim->compressed_weights &&
                                one_of(in0_dt, {data_types::f16, data_types::f32, data_types::i8}) &&
                                one_of(wei_dt, {data_types::u8, data_types::i8, data_types::u4, data_types::i4}) &&
-                               one_of(out_dt, {data_types::f16, data_types::f32});
+                               one_of(out_dt, {data_types::f16, data_types::f32, data_types::u8, data_types::i8});
         if (!f16f16_case && !f32f32_case && !u8s8_case && !compressed_case)
             return false;
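A note on the predicate above: `one_of` is read here as a simple membership test over the candidate data types, so the changed line widens the accepted output precisions for the compressed-weights case to include quantized u8/i8 outputs, which is the hybrid-quantization pattern this PR fixes. A minimal sketch of the assumed semantics (not the actual cldnn implementation):

    #include <algorithm>
    #include <initializer_list>

    // Assumed behavior of one_of: true iff `value` compares equal to any candidate.
    template <typename T>
    bool one_of(const T& value, std::initializer_list<T> candidates) {
        return std::any_of(candidates.begin(), candidates.end(),
                           [&](const T& c) { return c == value; });
    }

Under that reading, a compressed fully connected primitive whose output is already quantized no longer trips the early `return false` in this implementation check.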
@@ -43,7 +43,7 @@ KERNEL (permute_f_y_axes)(
         result = FUSED_OPS_RESULT_VEC;
 #else
         IN_VEC_TYPE res = READ_VEC(0, &input[INPUT0_GET_INDEX(b_idx, f_idx, y_idx, x_idx)]);
-        OUT_VEC_TYPE result = ACTIVATION(res, ACTIVATION_PARAMS);
+        OUT_VEC_TYPE result = TO_OUT_VEC_TYPE(ACTIVATION(res, ACTIVATION_PARAMS));
 #endif
         const int output_idx = OUTPUT_GET_INDEX(b_idx, f_out_idx, y_out_idx, x_idx);
         WRITE_VEC(result, 0, &output[output_idx]);
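The added conversion matters once the kernel's input and output element types differ, which is exactly what hybrid quantization produces (for example, f16 activations written to a u8 output). A rough sketch of the assumed macro expansion for such a case, with illustrative names rather than the actual jitted definitions:

    // Hypothetical expansion for an f16 -> u8 permute with vector width 8:
    //   IN_VEC_TYPE            half8
    //   OUT_VEC_TYPE           uchar8
    //   ACTIVATION(res, ...)   yields half8 (same element type as its argument)
    //   TO_OUT_VEC_TYPE(x)     convert_uchar8(x)
    //
    // OpenCL C has no implicit conversions between vector types, so assigning a
    // half8 activation result to a uchar8 variable does not compile without the
    // explicit convert; when input and output types match, the cast is a no-op.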
src/plugins/intel_gpu/src/plugin/ops/fully_connected.cpp (28 additions, 0 deletions)
@@ -68,6 +68,34 @@ static void CreateFullyConnectedCompressedOp(ProgramBuilder& p, const std::share
     }
 
     p.add_primitive(*op, fc);
+
+    if (op->get_input_partial_shape(0).size() > 3 && !p.use_new_shape_infer()) {
+        auto lastLayerName = primitive_name;
+        auto outReshapeName = primitive_name + "_cldnn_out_reshape";
+
+        // add reorder
+        auto outDims = op->get_output_shape(0);
+        auto outTensor = tensor_from_dims(outDims);
+
+        if (outDims.size() > 4) {
+            cldnn::format outputFormat = cldnn::format::bfyx;
+            switch (outDims.size()) {
+            case 5: outputFormat = cldnn::format::bfzyx; break;
+            case 6: outputFormat = cldnn::format::bfwzyx; break;
+            default: break;
+            }
+
+            cldnn::primitive_id reorderId = "reorder:" + outReshapeName + "_reorder";
+            cldnn::layout outputLayout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), outputFormat, outTensor);
+            auto reorder_prim = cldnn::reorder(reorderId, cldnn::input_info(primitive_name), outputLayout);
+            p.add_primitive(*op, reorder_prim);
+            lastLayerName = reorderId;
+        }
+
+        // add reshape
+        auto outReshapePrim = cldnn::reshape(outReshapeName, cldnn::input_info(lastLayerName), outTensor);
+        p.add_primitive(*op, outReshapePrim);
+    }
 }
 
 static void CreateFullyConnectedOp(ProgramBuilder& p, const std::shared_ptr<op::FullyConnected>& op) {
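The rank-to-format switch above follows the cldnn convention that a layout name encodes its axis count. A standalone sketch of the same mapping (hypothetical helper; the header path is an assumption):

    #include <cstddef>
    #include "intel_gpu/runtime/format.hpp"  // assumed location of cldnn::format

    // Mirrors the switch in the hunk above: pick the cldnn layout whose rank
    // matches the FC output dimensions; ranks of 4 and below stay on bfyx.
    static cldnn::format format_for_rank(std::size_t rank) {
        switch (rank) {
        case 5:  return cldnn::format::bfzyx;   // adds the z axis
        case 6:  return cldnn::format::bfwzyx;  // adds the w and z axes
        default: return cldnn::format::bfyx;
        }
    }

The reorder-plus-reshape pair reflects the legacy (non-new-shape-infer) path, where fully connected outputs are handled as flattened, at most 4-D tensors: the reorder moves the result into a layout with enough axes, and the reshape restores the original higher-rank output shape for downstream consumers.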
src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp (96 additions, 25 deletions)
@@ -15,8 +15,11 @@
 #include "intel_gpu/plugin/transformations_pipeline.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "intel_gpu/runtime/itt.hpp"
+#include "low_precision/add.hpp"
 #include "low_precision/convolution.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
+#include "low_precision/fold_convert.hpp"
+#include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
 #include "low_precision/low_precision.hpp"
 #include "low_precision/mat_mul.hpp"
@@ -25,7 +28,9 @@
 #include "low_precision/pull_reshape_through_dequantization.hpp"
 #include "low_precision/pull_transpose_through_dequantization.hpp"
 #include "low_precision/recurrent_cell.hpp"
+#include "low_precision/rt_info/bias_attribute.hpp"
 #include "low_precision/strided_slice.hpp"
+#include "low_precision/transpose.hpp"
 #include "openvino/core/deprecated.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/core/validation_util.hpp"
@@ -46,6 +51,7 @@
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/rnn_cell.hpp"
 #include "openvino/op/rnn_sequence.hpp"
+#include "openvino/op/scaled_dot_product_attention.hpp"
 #include "openvino/op/squeeze.hpp"
 #include "openvino/op/unsqueeze.hpp"
 #include "openvino/op/util/sub_graph_base.hpp"
@@ -169,35 +175,67 @@ static bool disable_reduce_decomposition(const std::shared_ptr<const ov::Node> n
     return false;
 }
 
-static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node) {
+static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node, bool supports_immad) {
+    std::vector<ov::DiscreteTypeInfo> target_consumers = { ov::opset1::MatMul::get_type_info_static(),
+                                                           ov::op::v8::Gather::get_type_info_static(),
+                                                           ov::op::v1::Convolution::get_type_info_static(),
+                                                           ov::opset1::Convolution::get_type_info_static(),
+                                                           ov::op::v1::GroupConvolution::get_type_info_static(),
+                                                           ov::opset1::GroupConvolution::get_type_info_static() };
+
+    std::vector<ov::DiscreteTypeInfo> convolutions = { ov::op::v1::Convolution::get_type_info_static(),
+                                                       ov::opset1::Convolution::get_type_info_static(),
+                                                       ov::op::v1::GroupConvolution::get_type_info_static(),
+                                                       ov::opset1::GroupConvolution::get_type_info_static() };
+
     auto all_has_types = [](const std::set<ov::Input<ov::Node>>& consumers, const std::vector<ov::DiscreteTypeInfo>& types) {
         return std::all_of(consumers.begin(), consumers.end(), [&types](const ov::Input<ov::Node>& input) {
             return cldnn::one_of(input.get_node()->get_type_info(), types);
         });
     };
 
     const auto consumers = node->get_output_target_inputs(0);
-    if (all_has_types(consumers, { ov::op::v0::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-        return true;
 
-    auto are_multiply_from_decompression = [&all_has_types](const ov::Input<ov::Node> consumer) {
+    for (const auto& consumer : consumers) {
+        const auto& type_info = consumer.get_node()->get_type_info();
+        if (cldnn::one_of(type_info, target_consumers)) {
+            if (cldnn::one_of(type_info, convolutions) && consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                return false;
+            }
+            return true;
+        }
+    }
+
+    auto are_multiply_from_decompression = [&](const ov::Input<ov::Node> consumer) {
         if (!cldnn::one_of(consumer.get_node()->get_type_info(), { ov::op::v1::Multiply::get_type_info_static() }))
             return false;
         const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
-        if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-            return true;
 
+        for (const auto& child_consumer : child_consumers) {
+            const auto& type_info = child_consumer.get_node()->get_type_info();
+            if (cldnn::one_of(type_info, target_consumers)) {
+                if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                    return false;
+                }
+                return true;
+            }
+        }
         return false;
     };
 
-    auto are_converts_from_decompression = [&all_has_types, &are_multiply_from_decompression](const std::set<ov::Input<ov::Node>>& consumers) {
+    auto are_converts_from_decompression = [&](const std::set<ov::Input<ov::Node>>& consumers) {
         if (!all_has_types(consumers, { ov::opset1::Convert::get_type_info_static() }))
             return false;
         for (const auto& consumer : consumers) {
             const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
             for (const auto& child_consumer : child_consumers) {
                 const auto& type_info = child_consumer.get_node()->get_type_info();
-                if (cldnn::one_of(type_info, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }))
-                    continue;
+                if (cldnn::one_of(type_info, target_consumers)) {
+                    if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                        return false;
+                    }
+                    return true;
+                }
                 if (are_multiply_from_decompression(child_consumer)) {
                     continue;
                 }
@@ -210,9 +248,16 @@ static bool is_decompression_multiply(const std::shared_ptr<const ov::Node> node
     if (all_has_types(consumers, { ov::opset1::Reshape::get_type_info_static() })) {
         for (const auto& consumer : consumers) {
             const auto child_consumers = consumer.get_node()->get_output_target_inputs(0);
-            if (all_has_types(child_consumers, { ov::opset1::MatMul::get_type_info_static(), ov::op::v8::Gather::get_type_info_static() }) ||
-                are_converts_from_decompression(child_consumers)) {
-                return true;
+            for (const auto& child_consumer : child_consumers) {
+                const auto& type_info = child_consumer.get_node()->get_type_info();
+                if (cldnn::one_of(type_info, target_consumers)) {
+                    if (cldnn::one_of(type_info, convolutions) && child_consumer.get_node()->input_value(0).get_partial_shape().is_dynamic()) {
+                        return false;
+                    }
+                    return true;
+                } else if (are_converts_from_decompression(child_consumers)) {
+                    return true;
+                }
             }
         }
     }
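Condensed, the rewrite replaces the old "all consumers are MatMul/Gather" test with a per-consumer walk that also admits convolutions, rejecting only convolutions whose data input is dynamic. A simplified standalone restatement (illustrative, not code from this PR):

    #include <memory>
    #include <vector>
    #include "openvino/core/node.hpp"

    // Returns true if `node` feeds one of the target ops; a convolution consumer
    // with a dynamic data input is rejected, and anything else falls through.
    static bool feeds_supported_consumer(const std::shared_ptr<const ov::Node>& node,
                                         const std::vector<ov::DiscreteTypeInfo>& targets,
                                         const std::vector<ov::DiscreteTypeInfo>& convolutions) {
        for (const auto& input : node->get_output_target_inputs(0)) {
            const auto& type_info = input.get_node()->get_type_info();
            if (cldnn::one_of(type_info, targets)) {  // assumed membership helper
                const bool dynamic_conv = cldnn::one_of(type_info, convolutions) &&
                    input.get_node()->input_value(0).get_partial_shape().is_dynamic();
                return !dynamic_conv;
            }
        }
        return false;
    }

Since the same consumer/dynamic-shape pattern now appears in all three lambdas, a shared helper along these lines would be one way to reduce the duplication.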
@@ -330,13 +375,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression
         // types are not supported by oneDNN)
         manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, !device_info.supports_immad);
-
-        // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time.
-        if (!is_model_quantized) {
-            pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
-                return !is_decompression_multiply(node);
-            });
-        }
+        GPU_DEBUG_TRACE << "is_model_quantized: " << is_model_quantized << std::endl;
+        pass_config->set_callback<ov::pass::MarkDequantizationSubgraph>([&](const std::shared_ptr<const ov::Node> node) {
+            GPU_DEBUG_TRACE_DETAIL << node->get_friendly_name() << ": " << !is_decompression_multiply(node, device_info.supports_immad) << std::endl;
+            return !is_decompression_multiply(node, device_info.supports_immad);
+        });
 
         const bool keep_precision_sensitive_in_fp32_1 = true;
         const bool convert_input_output_precision = false;
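For context, `is_model_quantized` is presumably computed earlier in this function via the LPT entry-point helper; this is an assumption based on the usual pattern, since the defining line is outside the visible hunks:

    // Assumed origin of is_model_quantized: true when the model already
    // contains FakeQuantize-based quantization subgraphs.
    bool is_model_quantized =
        ov::pass::low_precision::LowPrecision::isFunctionQuantized(func);

The behavioral change in the hunk is that the callback is now registered unconditionally: previously, quantized models skipped the decompression-multiply distinction entirely, so mixed models carrying both weight compression and activation quantization, the hybrid case this PR targets, did not get their weight-decompression subgraphs treated specially.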
@@ -705,12 +748,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         auto lptPassConfig = lptManager.get_pass_config();
         // quantized LSTMSequence / GPUSequence are not supported yet. Avoid extra transformation
         lptPassConfig->disable<ov::pass::low_precision::RecurrentCellTransformation>();
-        lptPassConfig->set_callback<ov::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
-            if (const auto mulitply = std::dynamic_pointer_cast<const ov::op::v1::Multiply>(node)) {
-                return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
-            }
-            return false;
-        });
         lptPassConfig->set_callback<ConvolutionBackpropDataTransformation>([func, defaultPrecisions](const_node_ptr& node) -> bool {
             auto fillStaticChannel = [func](const ov::PartialShape& shape, size_t& channel) -> bool {
                 const auto rank = shape.rank();
@@ -747,6 +784,40 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                    || WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
         });
 
+        lptPassConfig->set_callback<TransposeTransformation>([&](const_node_ptr& node) -> bool {
+            for (auto& user : node->get_users()) {
+                if (ov::is_type<ov::op::v13::ScaledDotProductAttention>(user))
+                    return true;
+            }
+
+            return false;
+        });
+
+        lptPassConfig->set_callback<MarkupPrecisions>([](const_node_ptr& node) -> bool {
+            return ov::is_type<ov::opset1::Multiply>(node) && !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(node);
+        });
+
+        lptPassConfig->set_callback<FoldConvertTransformation>([&](const_node_ptr& node) -> bool {
+            const auto& consumers = node->get_output_target_inputs(0);
+            if (consumers.size() == 1) {
+                const auto consumer = consumers.begin()->get_node()->shared_from_this();
+                return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
+            }
+            return false;
+        });
+        lptPassConfig->set_callback<FuseConvertTransformation>([&](const_node_ptr& node) -> bool {
+            if (ov::is_type<ov::opset1::Multiply>(node)) {
+                return ov::is_type<ov::opset1::Multiply>(node) && is_decompression_multiply(node, device_info.supports_immad);
+            } else if (ov::is_type<ov::opset1::Subtract>(node)) {
+                const auto& consumers = node->get_output_target_inputs(0);
+                if (consumers.size() == 1) {
+                    const auto consumer = consumers.begin()->get_node()->shared_from_this();
+                    return ov::is_type<ov::opset1::Multiply>(consumer) && is_decompression_multiply(consumer, device_info.supports_immad);
+                }
+            }
+            return false;
+        });
+
         lptPassConfig->set_callback<MultiplyToGroupConvolutionTransformation>([&](const_node_ptr& node) -> bool {
             // disable MultiplyToGroupConvolution if Multiply with Constant can be fused
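Taken together, the FoldConvert and FuseConvert callbacks keep LPT from collapsing the weight-decompression subgraph that the GPU plugin matches later. Schematically (a sketch of the common pattern, not code from this PR):

    // Weight-decompression pattern the two callbacks preserve:
    //
    //   Constant (u8/u4 packed weights)
    //     -> Convert (to f16/f32)
    //     -> Subtract (zero point, optional)
    //     -> Multiply (scale)   <- is_decompression_multiply() is true here
    //     -> MatMul / Gather / Convolution
    //
    // FoldConvertTransformation would constant-fold the Convert away and
    // FuseConvertTransformation would merge it into the surrounding arithmetic;
    // either rewrite destroys the pattern, so both passes are asked to skip the
    // node whenever the Multiply qualifies as decompression.

The TransposeTransformation callback similarly bails out when a Transpose feeds ScaledDotProductAttention, presumably so such transposes are left to the SDPA fusion path rather than rewritten by LPT.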