diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
index b2ee40243968db..b27cccaba870c5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
@@ -9,6 +9,17 @@ namespace kernel_selector {
 // how many workitems we use to calculate item classes for one output, only 16 supported right now
 static const auto workitems_per_classes = 16;
 
+inline static size_t get_class_pitch(const DataTensor& tensor, SoftmaxDim dim) {
+    switch (dim) {
+        case SoftmaxDim::X: return tensor.X().pitch;
+        case SoftmaxDim::Y: return tensor.Y().pitch;
+        case SoftmaxDim::Z: return tensor.Z().pitch;
+        case SoftmaxDim::FEATURE: return tensor.Feature().pitch;
+        case SoftmaxDim::BATCH: return tensor.Batch().pitch;
+        default: return 0;
+    }
+}
+
 inline static size_t GetItemClassCount(const DataTensor& input, SoftmaxDim dim) {
     size_t item_class_count = 0;
 
@@ -109,10 +120,12 @@ KernelsPriority SoftmaxKerneItemsClassOptimized::GetKernelsPriority(const Params
 JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
     auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData);
 
-    // sub_group_block_write requires aligned memory,
-    // therefore it can be utilized if either memory is aligned by 16 bytes
-    bool isSubGroupBlockIOEnabled = params.dim != SoftmaxDim::BATCH &&
-                                    (dispatchData.dataSetSize * params.outputs[0].ElementSize()) % 16 == 0;
+    // sub_group_block_write requires:
+    // 1. aligned memory, therefore it can be utilized only if memory is aligned by 16 bytes
+    // 2. the class dimension to be innermost, or all other dims to be equal to 1
+    bool isSubGroupBlockIOEnabled = get_class_pitch(params.outputs[0], params.dim) == 1 &&
+                                    get_class_pitch(params.inputs[0], params.dim) == 1 &&
+                                    (dispatchData.dataSetSize * params.outputs[0].ElementSize()) % 16 == 0;
 
     jit.AddConstants({
         MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
index 00b844defb0686..89d8b52d69e8a3 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
@@ -44,10 +44,11 @@ INSTANTIATE_TEST_SUITE_P(
 
 const std::vector inputShapes3D = {
     {16, 64, 64},
+    {1, 256, 256},
 };
 
 const std::vector axis3D = {
-    -1
+    -1, 1
 };
 
 const auto params3D = testing::Combine(
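
For reference, below is a minimal standalone sketch (not part of the patch) of the logic the change introduces: get_class_pitch returns the pitch of the softmax (class) dimension, and block reads/writes are enabled only when that dimension is contiguous (pitch == 1) in both input and output and the data set is 16-byte aligned. FakeTensor and is_block_io_enabled are simplified stand-ins assumed here for illustration; they are not the kernel_selector DataTensor API or the actual kernel code.

// Standalone illustration only: FakeTensor and is_block_io_enabled are simplified
// stand-ins, not types or functions from kernel_selector.
#include <cstddef>
#include <iostream>

enum class SoftmaxDim { X, Y, Z, FEATURE, BATCH };

// Simplified tensor: only per-dimension pitches and element size matter here.
struct FakeTensor {
    size_t x_pitch, y_pitch, z_pitch, f_pitch, b_pitch;
    size_t element_size;  // bytes per element, e.g. 2 for fp16
};

// Mirrors the new get_class_pitch() helper: pitch of the dimension softmax runs over.
static size_t get_class_pitch(const FakeTensor& t, SoftmaxDim dim) {
    switch (dim) {
        case SoftmaxDim::X:       return t.x_pitch;
        case SoftmaxDim::Y:       return t.y_pitch;
        case SoftmaxDim::Z:       return t.z_pitch;
        case SoftmaxDim::FEATURE: return t.f_pitch;
        case SoftmaxDim::BATCH:   return t.b_pitch;
        default:                  return 0;
    }
}

// Mirrors the patched isSubGroupBlockIOEnabled condition: block IO is only safe when
// the class dimension is contiguous in both input and output and the data set size
// in bytes is a multiple of 16.
static bool is_block_io_enabled(const FakeTensor& in, const FakeTensor& out,
                                SoftmaxDim dim, size_t data_set_size) {
    return get_class_pitch(out, dim) == 1 &&
           get_class_pitch(in, dim) == 1 &&
           (data_set_size * out.element_size) % 16 == 0;
}

int main() {
    // Hypothetical fp16 tensor: X is innermost (pitch 1), FEATURE has pitch 256.
    FakeTensor t{/*x*/ 1, /*y*/ 256, /*z*/ 256, /*f*/ 256, /*b*/ 65536, /*elem*/ 2};

    std::cout << std::boolalpha
              << is_block_io_enabled(t, t, SoftmaxDim::X, 256) << '\n'          // true
              << is_block_io_enabled(t, t, SoftmaxDim::FEATURE, 256) << '\n';   // false: pitch != 1
    return 0;
}

The new {1, 256, 256} shape with axis 1 in the functional test presumably exercises exactly the second case: the data set size passes the 16-byte alignment check, but the softmax dimension is not innermost, so block IO must stay disabled.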