diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl
index ca5c1ea3646d02..baa65d0c86a88a 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/fully_connected_gpu_bf_tiled_common.cl
@@ -147,9 +147,7 @@ inline void (FUNC_NAME)(
         // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
         //       but significantly degrades readability and generality of code.
         //       It doesn't also show noticable performance improvement on tested configurations.
-        #if DECOMPRESSION_SCALE_POST_OP
-            ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { };
-        #endif
+        ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { };
 
         unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
             #if COMPRESSED_WEIGHTS_INT4
@@ -201,11 +199,7 @@ inline void (FUNC_NAME)(
             unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) {
                 INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
                 unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
-#if DECOMPRESSION_SCALE_POST_OP
                     ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
-#else
-                    ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
-#endif
                 }
             }
         }
@@ -243,6 +237,16 @@ inline void (FUNC_NAME)(
             }
         }
 #endif
+
+#if !DECOMPRESSION_SCALE_POST_OP
+    unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) {
+        unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
+            ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi];
+        }
+    }
+#endif
+
+
     }
     // =====================================================================================================================================
     // Leftovers
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index d0f881adcd88b1..3846a6bc563589 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -698,14 +698,14 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         jit.AddConstant(MakeJitConstant("USE_SLM", 0));
     }
 
+    if (add_decompress_scale_post_op)
+        jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
     // Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication
     if (should_dynamic_quantize(params)) {
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
         jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
     } else {
-        if (add_decompress_scale_post_op)
-            jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
     }
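For reference, below is a minimal standalone sketch of the accumulation pattern this patch introduces, written in plain C with scalar stand-ins for the kernel's vector types. The tile sizes, weights, and input values are illustrative assumptions, not taken from the kernel: partial products now always accumulate into a temporary, which is folded into the main accumulator once per tile.

    /* Sketch only: acc_tmp is unconditional, as in the patched kernel.
     * With DECOMPRESSION_SCALE_POST_OP enabled, the real kernel scales
     * acc_tmp at the fold point; without it, the fold is the plain
     * addition added by the new #if !DECOMPRESSION_SCALE_POST_OP block. */
    #include <stdio.h>

    #define TILE_B 2   /* stand-in for FORCED_TILE_B */
    #define TILE_K 4   /* stand-in for the unrolled K extent */

    int main(void) {
        float in[TILE_B][TILE_K] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
        float wei[TILE_K]        = {0.5f, 0.5f, 0.5f, 0.5f};
        float acc[TILE_B]        = {0};   /* main accumulator */
        float acc_tmp[TILE_B]    = {0};   /* temporary, now unconditional */

        /* Inner multiply loop: always targets acc_tmp (no #if/#else). */
        for (int bi = 0; bi < TILE_B; ++bi)
            for (int ki = 0; ki < TILE_K; ++ki)
                acc_tmp[bi] += in[bi][ki] * wei[ki];

        /* Fold-back loop: the new #if !DECOMPRESSION_SCALE_POST_OP path. */
        for (int bi = 0; bi < TILE_B; ++bi)
            acc[bi] += acc_tmp[bi];

        for (int bi = 0; bi < TILE_B; ++bi)
            printf("acc[%d] = %g\n", bi, acc[bi]);
        return 0;
    }

Keeping a single accumulation path removes the #if/#else duplication from the hot inner loop, at the cost of one extra add per tile element when the scale post-op is disabled.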