[GPU] Resolve FC dyn-quan issue
+ Resolved unit-test failures on debug build
+ Clear exception logic in FC dyn-quan

Signed-off-by: Min, Byung-il <[email protected]>
byungilm committed Oct 6, 2024
1 parent 890f2e1 commit c474247
Showing 6 changed files with 50 additions and 16 deletions.
@@ -952,6 +952,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

// Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
// Calculate weight : w = (w - dzp) * ds
+// if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO.
#if DECOMPRESSION_ZP_TERM
#if DECOMPRESSION_ZP_SCALAR
DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
@@ -976,8 +977,6 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
}
}
#endif
-#else
-DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
#endif

#if FILTER_LOAD_BLOCK_SIZE == 2
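The comment above is the whole dequantization contract: w = (w - dzp) * ds, with dzp collapsing to zero when DECOMPRESSION_ZP_TERM is off. A minimal host-side sketch of that formula, assuming int8-unpacked weights (names and types are illustrative, not the kernel's):

```cpp
#include <cstdint>
#include <vector>

// Sketch of w = (w - dzp) * ds. When the zero-point term is disabled,
// dzp falls back to zero (ACCUMULATOR_VAL_ZERO), matching the comment
// added in the hunk above.
std::vector<float> dequantize(const std::vector<int8_t>& w_quant,
                              float ds, float dzp, bool has_zp_term) {
    const float zp = has_zp_term ? dzp : 0.0f;
    std::vector<float> w(w_quant.size());
    for (std::size_t i = 0; i < w_quant.size(); ++i)
        w[i] = (static_cast<float>(w_quant[i]) - zp) * ds;
    return w;
}
```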
@@ -1026,7 +1025,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

-#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
+#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
const uint offset_ofm = out_f + fi*SIMD + sglid;
@@ -1046,7 +1045,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
#endif
} // Whole tile_k elements of each iteration : ki

-#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
+#if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
// Dynamic-quantizing group size set to same or smaller than scale group size
if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
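Both renamed guards split on the same comparison, so the scale post-op runs in exactly one of two places. A hedged sketch of that decision, with an illustrative helper name:

```cpp
// If one K-tile spans more elements than a single decompression-scale
// group, every partial sum needs its own scale inside the K loop;
// otherwise the scale can be applied once per dynamic-quantization
// group, after NUM_LOOP_IN_DYN_QUAN_GROUP iterations complete.
constexpr bool scale_inside_k_loop(unsigned tile_ifm_elements,
                                   unsigned scale_group_size) {
    return tile_ifm_elements > scale_group_size;
}
```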
@@ -1175,7 +1174,7 @@ KERNEL(fc)(
#endif
) {
#if USE_SLM
-#if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+#if DYNAMIC_QUANTIZE
__local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
#else
__local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
@@ -1317,7 +1316,7 @@ KERNEL(fc)(
#endif
);
} else {
-#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
@@ -1364,7 +1363,7 @@ KERNEL(fc)(
#endif
}
#else
-#if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+#if USE_SLM && DYNAMIC_QUANTIZE
FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
OPTIONAL_SHAPE_INFO_TENSOR
input,
@@ -375,6 +375,9 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,

if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) {
if (!params.is_shape_agnostic && batch == 1) {
+if (should_dynamic_quantize(params))
+return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT));
+
// Tuning for Meteor Lake
if (is_weight_vertical(params, output_f)) {
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
@@ -616,7 +619,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
// Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication
if (should_dynamic_quantize(params)) {
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
} else {
@@ -24,7 +24,7 @@
namespace ov {
namespace intel_gpu {

-ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8) {
+ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_zp_to_u8) {
using namespace ov::pass::pattern;

auto compressed_constant = [](const ov::Output<ov::Node>& output) {
@@ -97,9 +97,9 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
};

-auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr<ov::Node> node) {
+auto convert_u4const_to_u8 = [convert_zp_to_u8](std::shared_ptr<ov::Node> node) {
auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
-if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8)
+if (constant->get_element_type() == ov::element::u8 || !convert_zp_to_u8)
return std::dynamic_pointer_cast<ov::Node>(constant);
return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
};
@@ -12,7 +12,7 @@ namespace intel_gpu {
class ConvertFullyConnectedToFullyConnectedCompressed: public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0");
-ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8 = false);
+ConvertFullyConnectedToFullyConnectedCompressed(bool convert_zp_to_u8 = false);
};

} // namespace intel_gpu
9 changes: 5 additions & 4 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -810,7 +810,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::ClampFP16Output>();
manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
-manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(device_info.supports_immad);
+manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(true);

bool disable_horizontal_fc_fusion = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -819,10 +819,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

if (!disable_horizontal_fc_fusion)
manager.register_pass<ov::intel_gpu::FullyConnectedHorizontalFusion>();

-// ZP should not be folded for FC. But still, ZP should be folded for Gather.
-// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
-manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (device_info.supports_immad) {
+// For OneDNN, ZP should not be folded for FC. But still, ZP should be folded for Gather.
+// Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
+manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
if (disable_horizontal_fc_fusion)
manager.register_pass<ov::pass::ConstantFolding>();
}
@@ -429,6 +429,37 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
}
}

+TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
+{
+auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 16 });
+auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f16);
+auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+auto zp_convert = std::make_shared<ov::op::v0::Convert>(zp_const, ov::element::f16);
+auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert);
+auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
+auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
+
+model = std::make_shared<ov::Model>(ov::NodeVector{ fc }, ov::ParameterVector{ input1 });
+manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
+}
+{
+auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 16 });
+auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f16);
+auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
+auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
+auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
+
+model_ref = std::make_shared<ov::Model>(ov::NodeVector{ fc_compressed }, ov::ParameterVector{ input1 });
+}
+}
+
} // namespace intel_gpu
} // namespace test
} // namespace ov
