diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
index b2ee40243968db..b27cccaba870c5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/softmax/softmax_kernel_items_class_optimized.cpp
@@ -9,6 +9,17 @@ namespace kernel_selector {
 // how many workitems we use to calculate item classes for one output, only 16 supported right now
 static const auto workitems_per_classes = 16;
 
+inline static size_t get_class_pitch(const DataTensor& tensor, SoftmaxDim dim) {
+    switch (dim) {
+        case SoftmaxDim::X: return tensor.X().pitch;
+        case SoftmaxDim::Y: return tensor.Y().pitch;
+        case SoftmaxDim::Z: return tensor.Z().pitch;
+        case SoftmaxDim::FEATURE: return tensor.Feature().pitch;
+        case SoftmaxDim::BATCH: return tensor.Batch().pitch;
+        default: return 0;
+    }
+}
+
 inline static size_t GetItemClassCount(const DataTensor& input, SoftmaxDim dim) {
     size_t item_class_count = 0;
 
@@ -109,10 +120,12 @@ KernelsPriority SoftmaxKerneItemsClassOptimized::GetKernelsPriority(const Params
 JitConstants SoftmaxKerneItemsClassOptimized::GetJitConstants(const softmax_params& params, DispatchData dispatchData) const {
     auto jit = SoftmaxItemsClassKernelBase::GetJitConstants(params, dispatchData);
 
-    // sub_group_block_write requires aligned memory,
-    // therefore it can be utilized if either memory is aligned by 16 bytes
-    bool isSubGroupBlockIOEnabled = params.dim != SoftmaxDim::BATCH &&
-                                    (dispatchData.dataSetSize * params.outputs[0].ElementSize()) % 16 == 0;
+    // sub_group_block_write requires:
+    // 1. aligned memory, therefore it can be utilized only if memory is aligned by 16 bytes
+    // 2. the class dimension to be innermost, or all other dims to be equal to 1
+    bool isSubGroupBlockIOEnabled = get_class_pitch(params.outputs[0], params.dim) == 1 &&
+                                    get_class_pitch(params.inputs[0], params.dim) == 1 &&
+                                    (dispatchData.dataSetSize * params.outputs[0].ElementSize()) % 16 == 0;
 
     jit.AddConstants({
         MakeJitConstant("LEFTOVERS", dispatchData.leftovers),
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
index 00b844defb0686..89d8b52d69e8a3 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp
@@ -44,10 +44,11 @@ INSTANTIATE_TEST_SUITE_P(
 
 const std::vector inputShapes3D = {
     {16, 64, 64},
+    {1, 256, 256},
 };
 
 const std::vector axis3D = {
-    -1
+    -1, 1
 };
 
 const auto params3D = testing::Combine(
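
For reference, below is a minimal standalone sketch (not part of the patch) of the logic the change introduces: get_class_pitch returns the pitch of the softmax (class) dimension, and block reads/writes are enabled only when that dimension is contiguous (pitch == 1) in both input and output and the data set is 16-byte aligned. FakeTensor and is_block_io_enabled are simplified stand-ins assumed here for illustration; they are not the kernel_selector DataTensor API or the actual kernel code.

// Standalone illustration only: FakeTensor and is_block_io_enabled are simplified
// stand-ins, not types or functions from kernel_selector.
#include <cstddef>
#include <iostream>

enum class SoftmaxDim { X, Y, Z, FEATURE, BATCH };

// Simplified tensor: only per-dimension pitches and element size matter here.
struct FakeTensor {
    size_t x_pitch, y_pitch, z_pitch, f_pitch, b_pitch;
    size_t element_size;  // bytes per element, e.g. 2 for fp16
};

// Mirrors the new get_class_pitch() helper: pitch of the dimension softmax runs over.
static size_t get_class_pitch(const FakeTensor& t, SoftmaxDim dim) {
    switch (dim) {
        case SoftmaxDim::X:       return t.x_pitch;
        case SoftmaxDim::Y:       return t.y_pitch;
        case SoftmaxDim::Z:       return t.z_pitch;
        case SoftmaxDim::FEATURE: return t.f_pitch;
        case SoftmaxDim::BATCH:   return t.b_pitch;
        default:                  return 0;
    }
}

// Mirrors the patched isSubGroupBlockIOEnabled condition: block IO is only safe when
// the class dimension is contiguous in both input and output and the data set size
// in bytes is a multiple of 16.
static bool is_block_io_enabled(const FakeTensor& in, const FakeTensor& out,
                                SoftmaxDim dim, size_t data_set_size) {
    return get_class_pitch(out, dim) == 1 &&
           get_class_pitch(in, dim) == 1 &&
           (data_set_size * out.element_size) % 16 == 0;
}

int main() {
    // Hypothetical fp16 tensor: X is innermost (pitch 1), FEATURE has pitch 256.
    FakeTensor t{/*x*/ 1, /*y*/ 256, /*z*/ 256, /*f*/ 256, /*b*/ 65536, /*elem*/ 2};

    std::cout << std::boolalpha
              << is_block_io_enabled(t, t, SoftmaxDim::X, 256) << '\n'          // true
              << is_block_io_enabled(t, t, SoftmaxDim::FEATURE, 256) << '\n';   // false: pitch != 1
    return 0;
}

The new {1, 256, 256} shape with axis 1 in the functional test presumably exercises exactly the second case: the data set size passes the 16-byte alignment check, but the softmax dimension is not innermost, so block IO must stay disabled.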