From c4742479c9a10ea0326239727d6fa8094b15174f Mon Sep 17 00:00:00 2001
From: "Min, Byung-il"
Date: Thu, 26 Sep 2024 13:52:31 +0900
Subject: [PATCH] [GPU] Resolve FC dyn-quan issue

+ Resolved unit-test failures on debug build
+ Cleaned up exception logic in FC dyn-quan

Signed-off-by: Min, Byung-il
---
 .../fully_connected_gpu_bf_tiled.cl           | 13 ++++----
 .../fully_connected_kernel_bf_tiled.cpp       |  5 ++-
 .../convert_fc_to_compressed.cpp              |  6 ++--
 .../convert_fc_to_compressed.hpp              |  2 +-
 .../src/plugin/transformations_pipeline.cpp   |  9 +++---
 .../convert_fc_to_compressed_test.cpp         | 31 +++++++++++++++++++
 6 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index 57545b0df37cff..70c55bfb73b8f5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -952,6 +952,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
 
         // Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
         // Calculate weight : w = (w - dzp) * ds
+        // if DECOMPRESSION_ZP_TERM is not enabled, then dzp is ACCUMULATOR_VAL_ZERO.
         #if DECOMPRESSION_ZP_TERM
             #if DECOMPRESSION_ZP_SCALAR
                 DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
@@ -976,8 +977,6 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
                     }
                 }
             #endif
-        #else
-            DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
         #endif
 
         #if FILTER_LOAD_BLOCK_SIZE == 2
@@ -1026,7 +1025,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
 
             weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;
 
-            #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
+            #if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
             unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
                 unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
                     const uint offset_ofm = out_f + fi*SIMD + sglid;
@@ -1046,7 +1045,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             #endif
         }  // Whole tile_k elements of each iteration : ki
 
-        #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
+        #if DQ_DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE <= DECOMPRESSION_SCALE_GROUP_SIZE)
         // Dynamic-quantizing group size set to same or smaller than scale group size
         if ((ni % NUM_LOOP_IN_DYN_QUAN_GROUP) == (NUM_LOOP_IN_DYN_QUAN_GROUP - 1)) {
             const uint ni_offset = ((ni*TILE_IFM*SIMD) / DECOMPRESSION_SCALE_GROUP_SIZE)*DECOMPRESSION_SCALE_FEATURE_PITCH;
@@ -1175,7 +1174,7 @@ KERNEL(fc)(
 #endif
 ) {
 #if USE_SLM
-    #if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+    #if DYNAMIC_QUANTIZE
         __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
     #else
         __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
@@ -1317,7 +1316,7 @@ KERNEL(fc)(
         #endif
         );
     } else {
-        #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+        #if USE_SLM && DYNAMIC_QUANTIZE
             FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
                 OPTIONAL_SHAPE_INFO_TENSOR
                 input,
@@ -1364,7 +1363,7 @@ KERNEL(fc)(
         #endif
     }
 #else
-    #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
+    #if USE_SLM && DYNAMIC_QUANTIZE
         FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
             OPTIONAL_SHAPE_INFO_TENSOR
             input,
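Note on the kernel hunks above: the dyn-quan path now keys off its own
DQ_DECOMPRESSION_SCALE_POST_OP define, so it can be toggled independently of the
regular tiled path's DECOMPRESSION_SCALE_POST_OP, and the TILE_OFM == 2 guard on
the dyn-quan entry points is gone. A minimal host-side sketch of that gating;
make_jit_defines() is a hypothetical helper for illustration, not a function in
this patch:

    #include <map>
    #include <string>

    // Hypothetical helper: the dyn-quan flag emits its own define instead of
    // reusing DECOMPRESSION_SCALE_POST_OP, so the two code paths inside
    // fully_connected_gpu_bf_tiled.cl can be enabled independently.
    std::map<std::string, int> make_jit_defines(bool dynamic_quantize, bool scale_post_op) {
        std::map<std::string, int> defines;
        if (scale_post_op)
            defines["DECOMPRESSION_SCALE_POST_OP"] = 1;     // regular bf_tiled path
        if (dynamic_quantize) {
            defines["DYNAMIC_QUANTIZE"] = 1;
            defines["DQ_DECOMPRESSION_SCALE_POST_OP"] = 1;  // dyn-quan path only
        }
        return defines;
    }
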
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index c4115d74f54a92..7a99c881854700 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -375,6 +375,9 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,
 
     if (params.weights.GetDType() == WeightsType::UINT4 || params.weights.GetDType() == WeightsType::INT4) {
         if (!params.is_shape_agnostic && batch == 1) {
+            if (should_dynamic_quantize(params))
+                return selector.Default(tune_params(1, 2, 4, 2, 1, 1, 1, EXE_MODE_DEFAULT));
+
             // Tuning for Meteor Lake
             if (is_weight_vertical(params, output_f)) {
                 if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
@@ -616,7 +619,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
     // Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication
     if (should_dynamic_quantize(params)) {
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
-        jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
+        jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
     } else {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
index 885da895b91166..3042a097e995f1 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
@@ -24,7 +24,7 @@
 namespace ov {
 namespace intel_gpu {
 
-ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8) {
+ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(bool convert_zp_to_u8) {
     using namespace ov::pass::pattern;
 
     auto compressed_constant = [](const ov::Output<ov::Node>& output) {
@@ -97,9 +97,9 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
         return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
     };
 
-    auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr<ov::Node> node) {
+    auto convert_u4const_to_u8 = [convert_zp_to_u8](std::shared_ptr<ov::Node> node) {
         auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
-        if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8)
+        if (constant->get_element_type() == ov::element::u8 || !convert_zp_to_u8)
             return std::dynamic_pointer_cast<ov::Node>(constant);
         return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
     };
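The updated lambda above widens the old u4-only rule: with convert_zp_to_u8
set, any zero-point constant that is not already u8 is wrapped in a Convert to
u8, while u8 constants pass through untouched. A standalone sketch of that rule
(the helper name normalize_zp is illustrative, not from the patch):

    #include <memory>
    #include "openvino/core/node.hpp"
    #include "openvino/op/constant.hpp"
    #include "openvino/op/convert.hpp"

    // Illustrative helper mirroring convert_u4const_to_u8 after this patch.
    std::shared_ptr<ov::Node> normalize_zp(const std::shared_ptr<ov::Node>& zp, bool convert_zp_to_u8) {
        auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(zp);
        if (!constant || constant->get_element_type() == ov::element::u8 || !convert_zp_to_u8)
            return zp;  // already u8, not a constant, or conversion disabled: keep as-is
        return std::make_shared<ov::op::v0::Convert>(zp, ov::element::u8);
    }
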
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.hpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.hpp
index 641f55ead5fdaf..d63d208eb5c95a 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.hpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.hpp
@@ -12,7 +12,7 @@ namespace intel_gpu {
 class ConvertFullyConnectedToFullyConnectedCompressed: public ov::pass::MatcherPass {
 public:
     OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0");
-    ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8 = false);
+    ConvertFullyConnectedToFullyConnectedCompressed(bool convert_zp_to_u8 = false);
 };
 
 }   // namespace intel_gpu
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 40c7ab48c486cb..22a6509b7e17e7 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -810,7 +810,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ov::intel_gpu::ClampFP16Output>();
         manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
         manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
-        manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(device_info.supports_immad);
+        manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(true);
 
         bool disable_horizontal_fc_fusion = false;
         GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -819,10 +819,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
 
         if (!disable_horizontal_fc_fusion)
             manager.register_pass<ov::intel_gpu::FullyConnectedHorizontalFusion>();
+
+        // ZP should not be folded for FC. But still, ZP should be folded for Gather.
+        // Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
+        manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
         if (device_info.supports_immad) {
-            // For OneDNN, ZP should not be folded for FC. But still, ZP should be folded for Gather.
-            // Therefore, run MarkDequantizationSubgraph again to fold ZP constant.
-            manager.register_pass<ov::pass::MarkDequantizationSubgraph>(supported_woq_types, true);
             if (disable_horizontal_fc_fusion)
                 manager.register_pass<ov::pass::ConstantFolding>();
         }
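With the pipeline change above, the compression pass always requests u8
zero-points and the MarkDequantizationSubgraph re-run is no longer immad-only.
A usage sketch that runs just the conversion pass over a model (the include
path is assumed from this repo's layout; everything else follows the code in
this patch):

    #include <memory>
    #include "openvino/core/model.hpp"
    #include "openvino/pass/manager.hpp"
    #include "plugin/transformations/convert_fc_to_compressed.hpp"  // path assumed

    void compress_fc(const std::shared_ptr<ov::Model>& model) {
        ov::pass::Manager manager;
        // true == convert_zp_to_u8: zero-points are normalized to u8 on all devices now
        manager.register_pass<ov::intel_gpu::ConvertFullyConnectedToFullyConnectedCompressed>(true);
        manager.run_passes(model);
    }
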
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
index 12398c8221f4b7..cecdb5cfefa13d 100644
--- a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
@@ -429,6 +429,37 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
     }
 }
 
+TEST_F(TransformationTestsF, ConvertFCToCompressed10) {
+    {
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 16 });
+        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f16);
+        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+        auto zp_convert = std::make_shared<ov::op::v0::Convert>(zp_const, ov::element::f16);
+        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_convert);
+        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc = std::make_shared<ov::intel_gpu::op::FullyConnected>(input1, scale, no_bias);
+
+        model = std::make_shared<ov::Model>(ov::NodeVector{ fc }, ov::ParameterVector{ input1 });
+        manager.register_pass<ConvertFullyConnectedToFullyConnectedCompressed>();
+    }
+    {
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, 16 });
+        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f16);
+        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+        auto sub = std::make_shared<ov::op::v1::Subtract>(convert, zp_const);
+        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+        auto scale = std::make_shared<ov::op::v1::Multiply>(sub, scale_const);
+        auto no_bias = std::make_shared<ov::intel_gpu::op::Placeholder>();
+        auto fc_compressed = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input1, scale, no_bias);
+
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{ fc_compressed }, ov::ParameterVector{ input1 });
+    }
+}
+
 }  // namespace intel_gpu
 }  // namespace test
 }  // namespace ov
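One practical consequence of dropping the TILE_OFM == 2 guard in the kernel:
the dyn-quan SLM buffer is declared as SIMD * TILE_OFM * SIMD ints, so local
memory use now scales with whatever TILE_OFM the tuner selects. A
back-of-envelope helper, assuming a 4-byte int (illustrative, not part of the
patch):

    #include <cstddef>

    // Bytes used by __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD].
    constexpr std::size_t dq_slm_bytes(std::size_t simd, std::size_t tile_ofm) {
        return simd * tile_ofm * simd * sizeof(int);
    }
    static_assert(dq_slm_bytes(16, 2) == 2048, "SIMD=16, TILE_OFM=2 -> 2 KiB");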