diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
index b64a924e3408a7..95d1951cbefaba 100644
--- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
+++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp
@@ -30,16 +30,16 @@ std::vector<layout> dynamic_quantize_inst::__calc_output_layouts(const layout &a
         act_layout.get<ShapeType>(),
     };
 
-    auto print_arr = [&](const std::vector<uint64_t>& vec, size_t max_len, std::string name) {
-        std::stringstream ss;
-        for (size_t i = 0; i < std::min(max_len, vec.size()); i++) {
-            ss << vec[i] << ", ";
-        }
-        std::cout << "Array " << name << " for calc_shape (len=" << vec.size() << ") content: " << ss.str() << "\n";
-    };
-
-    print_arr(scales_output_order, scales_output_order.size(), "scales_output_order");
-    print_arr(group_sizes, group_sizes.size(), "group_sizes");
+    // auto print_arr = [&](const std::vector<uint64_t>& vec, size_t max_len, std::string name) {
+    //     std::stringstream ss;
+    //     for (size_t i = 0; i < std::min(max_len, vec.size()); i++) {
+    //         ss << vec[i] << ", ";
+    //     }
+    //     std::cout << "Array " << name << " for calc_shape (len=" << vec.size() << ") content: " << ss.str() << "\n";
+    // };
+
+    // print_arr(scales_output_order, scales_output_order.size(), "scales_output_order");
+    // print_arr(group_sizes, group_sizes.size(), "group_sizes");
 
     auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes, scales_output_order);
     GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n";
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl
index 628bc69f3886df..073061789c597e 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl
@@ -16,7 +16,7 @@
 #define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)
 
 REQD_SUB_GROUP_SIZE(SIMD)
-KERNEL(dynamic_quantize_gpu_opt)(
+KERNEL(dynamic_quantize_gpu_opt_generic)(
     OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
     __global OUTPUT_TYPE* output,
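For context, ov::op::internal::DynamicQuantize::shape_infer above returns two shapes: the quantized data (same shape as the activation) and the per-group scales tensor. Below is a minimal sketch of how the scales shape falls out of group_sizes, assuming the convention used throughout this patch (UINT64_MAX marks a group spanning the whole dimension); it is an illustration only, not the actual shape_infer implementation.

// Hypothetical standalone sketch; scales_shape() is not a plugin function.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint64_t> scales_shape(const std::vector<uint64_t>& data_shape,
                                   const std::vector<uint64_t>& group_sizes) {
    std::vector<uint64_t> result(data_shape.size());
    for (size_t i = 0; i < data_shape.size(); ++i) {
        // UINT64_MAX: the group spans the whole dimension -> a single scale.
        // Otherwise one scale per group_sizes[i] consecutive elements.
        result[i] = (group_sizes[i] == UINT64_MAX) ? 1 : data_shape[i] / group_sizes[i];
    }
    return result;
}

int main() {
    // e.g. activation [b, f, y, x] = [1, 2, 4096, 4096], grouped along x only:
    for (uint64_t d : scales_shape({1, 2, 4096, 4096}, {1, 1, 1, UINT64_MAX}))
        std::cout << d << " ";  // prints: 1 2 4096 1
    std::cout << "\n";
}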
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl
new file mode 100644
index 00000000000000..e904b93f44a08d
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl
@@ -0,0 +1,78 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/fetch_data.cl"
+
+#if OUTPUT_DIMS != 4
+#error "dynamic_quantize_gpu_opt_generic.cl: Unsupported output dimension"
+#endif
+
+#define VLOAD_N CAT(vload, VEC_SIZE)
+#define VSTORE_N CAT(vstore, VEC_SIZE)
+#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)
+
+
+inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) {
+    return OUTPUT1_GET_INDEX(b, f, y, x);
+}
+
+inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) {
+#ifdef SCALES_OUTPUT_ORDER
+    return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER);
+#else
+    return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+#endif
+}
+
+__attribute__((reqd_work_group_size(1, LWS_SIZE, 1)))
+KERNEL(dynamic_quantize_gpu_opt)(
+    OPTIONAL_SHAPE_INFO_ARG
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output,
+    __global OUTPUT1_TYPE* output_scale)
+{
+    const uint batch_indexes = get_global_id(0);
+    const uint data_indexes = get_global_id(1);
+
+    DECLARE_BATCHED_DIMS_INDEXES(batch_indexes);
+
+    half max_value = 0.0001h;
+    half val[ITERATIONS_NUMBER];
+
+    for (uint i = 0; i < ITERATIONS_NUMBER; i++) {
+        const uint value_idx = data_indexes + i * LWS_SIZE;
+        DECLARE_GROUPED_DIMS_INDEXES(value_idx);
+
+        const uint data_offset = INPUT0_GET_INDEX(b, f, y, x);
+        val[i] = input[data_offset];
+        max_value = fmax(max_value, fabs(val[i]));
+    }
+
+    max_value = work_group_reduce_max(max_value);
+
+    half scale = 127.0h / max_value;
+
+    for (uint i = 0; i < ITERATIONS_NUMBER; i++) {
+        const uint value_idx = data_indexes + i * LWS_SIZE;
+        DECLARE_GROUPED_DIMS_INDEXES(value_idx);
+
+        const uint data_offset = INPUT0_GET_INDEX(b, f, y, x);
+
+        output[data_offset] = convert_char(val[i] * scale);
+    }
+
+    DECLARE_GROUPED_DIMS_INDEXES(data_indexes);
+
+#ifdef SCALES_OUTPUT_ORDER
+    const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+#else
+    const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x);
+#endif
+
+    if (data_indexes == 0)
+        output_scale[scale_idx] = 1.0h / scale;
+}
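The kernel above computes one scale per quantization group: each work-item accumulates a local absolute maximum over ITERATIONS_NUMBER strided values, work_group_reduce_max combines them across the work-group, and the values are rescaled so the largest magnitude maps to 127. The following scalar C++ model of the same math may be useful as a checking reference; quantize_group() is a helper written for this review, not plugin code, and it uses float where the kernel uses half, so small rounding differences are expected.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedGroup {
    std::vector<int8_t> data;
    float inv_scale;  // the kernel stores 1/scale into output_scale
};

QuantizedGroup quantize_group(const std::vector<float>& group) {
    // Same floor the kernel applies (0.0001h) so an all-zero group cannot
    // divide by zero.
    float max_value = 0.0001f;
    for (float v : group)
        max_value = std::max(max_value, std::fabs(v));

    const float scale = 127.0f / max_value;  // largest magnitude maps to +/-127

    QuantizedGroup out;
    out.data.reserve(group.size());
    for (float v : group)
        out.data.push_back(static_cast<int8_t>(v * scale));  // convert_char rounds toward zero
    out.inv_scale = 1.0f / scale;
    return out;
}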
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
new file mode 100644
index 00000000000000..49543ea1855025
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
@@ -0,0 +1,283 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "dynamic_quantize_kernel_opt_generic.h"
+#include "kernel_selector_utils.h"
+#include <string>
+
+
+static constexpr size_t simd = 16;
+
+namespace kernel_selector {
+static Tensor::NDims get_normalized_dims(const DataTensor& tensor) {
+    auto dims = tensor.GetDims();
+    std::reverse(dims.begin(), dims.end());
+
+    return dims;
+}
+
+static size_t get_elements_number_per_batch(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        }
+    }
+
+    return total_elements_number;
+}
+
+static size_t get_elements_number_per_group(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "-> Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        } else {
+            GPU_DEBUG_TRACE_DETAIL << "=> Multiply " << group_sizes[i] << "\n";
+            total_elements_number *= group_sizes[i];
+        }
+    }
+
+    return total_elements_number;
+}
+
+static std::string generate_dims_indexes_calculation(std::vector<std::pair<std::string, std::string>> dims) {
+    std::reverse(dims.begin(), dims.end());
+
+    auto generate_calc_function = [&](std::string data_type, std::string index_var, size_t dim_idx) {
+        std::string index_calc_str;
+        index_calc_str += "const " + data_type + " " + dims[dim_idx].first + " = ";
+        index_calc_str += "(" + index_var + " / ";
+        index_calc_str += "(1";
+        for (size_t i = 0; i < dim_idx; i++) {
+            index_calc_str += " * " + dims[i].second;
+        }
+        index_calc_str += ")) % " + dims[dim_idx].second + ";";
+
+        return index_calc_str;
+    };
+
+    std::stringstream indexes_calc_str;
+    for (size_t i = 0; i < dims.size(); i++) {
+        indexes_calc_str << generate_calc_function("uint", "data_idx", i);
+    }
+
+    return indexes_calc_str.str();
+}
+
+// static size_t get_innermost_group_size(const dynamic_quantize_params& params) {
+//     const auto& group_sizes = params.group_sizes;
+//     const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+//     for (size_t i = group_sizes.size(); i > 0; i--) {
+//         if (group_sizes[i - 1] == UINT64_MAX) {
+//             return input_dims[i - 1].v;
+//         } else if (group_sizes[i - 1] != 1) {
+//             return group_sizes[i - 1];
+//         }
+//     }
+
+//     return 1;
+// }
+
+// static size_t get_match_vector_size(const dynamic_quantize_params& params) {
+//     // const auto input_dt = BytesPerElement(params.inputs[0].GetDType());
+//     auto block_sizes = { 8, 4, 2 };
+
+//     for (auto block_size : block_sizes) {
+//         if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) {
+//             return block_size;
+//         }
+//     }
+
+//     return 1;
+// }
+
+static size_t get_per_iter_elements_number(const dynamic_quantize_params& params) {
+    const auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize;
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+
+    if (total_grouped_elements % maxWorkGroupSize == 0)
+        return maxWorkGroupSize;
+
+    if (total_grouped_elements < maxWorkGroupSize)
+        return total_grouped_elements;
+
+    return 0;
+}
+
+ParamsKey DynamicQuantizeKernelOptGeneric::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableDifferentTypes();
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDynamicShapesSupport();
+    return k;
+}
+
+JitConstants DynamicQuantizeKernelOptGeneric::GetJitConstants(const dynamic_quantize_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    const std::vector<std::pair<std::string, std::string>> default_dims = {{"b", "INPUT0_BATCH_NUM"},
+                                                                           {"f", "INPUT0_FEATURE_NUM"},
+                                                                           {"y", "INPUT0_SIZE_Y"},
+                                                                           {"x", "INPUT0_SIZE_X"}};
+
+    const auto& group_sizes = params.group_sizes;
+    std::vector<std::pair<std::string, std::string>> batch_dims, grouped_dims;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == 1)
+            batch_dims.push_back(default_dims[i]);
+        else
+            grouped_dims.push_back(default_dims[i]);
+    }
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto per_iter_elements_number = get_per_iter_elements_number(params);
+
+    jit.AddConstant(MakeJitConstant("DECLARE_BATCHED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(batch_dims)));
+    jit.AddConstant(MakeJitConstant("DECLARE_GROUPED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(grouped_dims)));
+    jit.AddConstant(MakeJitConstant("LWS_SIZE", per_iter_elements_number));
+
+    const auto iterations_number = total_grouped_elements / per_iter_elements_number;
+
+    jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number));
+
+    bool rearrange_scales_order = false;
+    const auto& scales_output_order = params.scales_output_order;
+    if (!scales_output_order.empty()) {
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            if (i != scales_output_order[i]) {
+                rearrange_scales_order = true;
+                break;
+            }
+        }
+    }
+
+    if (rearrange_scales_order) {
+        const std::array<char, 4> default_dim_order = {'b', 'f', 'y', 'x'};
+
+        std::stringstream ss;
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            ss << default_dim_order[scales_output_order[i]];
+
+            if (i + 1 != scales_output_order.size())
+                ss << ", ";
+        }
+
+        jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str()));
+        GPU_DEBUG_TRACE_DETAIL << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n";
+    }
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i]));
+    }
+
+    return jit;
+}
+
+CommonDispatchData DynamicQuantizeKernelOptGeneric::SetDefault(const dynamic_quantize_params& params) const {
+    CommonDispatchData dispatchData;
+
+    const auto total_batched_elements = get_elements_number_per_batch(params);
+    // const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto per_iter_elements_number = get_per_iter_elements_number(params);
+
+    dispatchData.gws = {total_batched_elements, per_iter_elements_number, 1};
+    dispatchData.lws = {1, per_iter_elements_number, 1};
+
+    return dispatchData;
+}
+
+void DynamicQuantizeKernelOptGeneric::GetUpdateDispatchDataFunc(KernelData& kd) const {
+    kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
+        const auto& prim_params = static_cast<const dynamic_quantize_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+        kd.kernels[0].skip_execution = false;
+
+        GPU_DEBUG_TRACE_DETAIL << "Update Dispatch data DynamicQuantizeKernelOptGeneric gws : " << dispatchData.gws[0] << ", "
+                               << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl;
+    };
+}
+
+KernelsData DynamicQuantizeKernelOptGeneric::GetKernelsData(const Params& params) const {
+    assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE);
+
+    if (!Validate(params))
+        return {};
+
+    const dynamic_quantize_params& prim_params = static_cast<const dynamic_quantize_params&>(params);
+    auto dispatchData = SetDefault(prim_params);
+
+    KernelData kd = KernelData::Default<dynamic_quantize_params>(params);
+
+    auto cldnn_jit = GetJitConstants(prim_params);
+    auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    GetUpdateDispatchDataFunc(kd);
+
+    auto& kernel = kd.kernels[0];
+    FillCLKernelData(kernel,
+                     dispatchData,
+                     params.engineInfo,
+                     kernelName,
+                     jit,
+                     entry_point,
+                     EXE_MODE_DEFAULT,
+                     false,
+                     false,
+                     1,
+                     GetFusedPrimitiveInputsCount(params),
+                     static_cast<int>(prim_params.outputs.size()),
+                     prim_params.is_shape_agnostic);
+
+    return {kd};
+}
+
+KernelsPriority DynamicQuantizeKernelOptGeneric::GetKernelsPriority(const Params& /*params*/) const {
+    return FORCE_PRIORITY_1;
+}
+
+bool DynamicQuantizeKernelOptGeneric::Validate(const Params& params) const {
+    if (!KernelBaseOpenCL::Validate(params))
+        return false;
+
+    const auto& dq_params = static_cast<const dynamic_quantize_params&>(params);
+
+    const auto& group_sizes = dq_params.group_sizes;
+    const auto& input_dims = get_normalized_dims(dq_params.inputs[0]);
+    const size_t non_compressed_dims_number = std::count(group_sizes.begin(), group_sizes.end(), 1);
+
+    if (non_compressed_dims_number == group_sizes.size())
+        return false;
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != 1 && input_dims[i].is_dynamic) {
+            return false;
+        }
+    }
+
+    if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0)
+        return false;
+
+    return true;
+}
+}  // namespace kernel_selector
+
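DECLARE_BATCHED_DIMS_INDEXES and DECLARE_GROUPED_DIMS_INDEXES above are JIT-generated de-linearization macros: for each kept dimension, the emitted expression divides the flat index by the product of the faster-varying dimensions and takes the remainder. The standalone replica below (written for this review, not plugin code) shows the exact string that ends up in the macro body for a representative input.

// Standalone replica of generate_dims_indexes_calculation() so the generated
// macro body can be inspected without building the plugin.
#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

static std::string generate(std::vector<std::pair<std::string, std::string>> dims) {
    std::reverse(dims.begin(), dims.end());  // innermost (fastest-varying) dimension first
    std::stringstream ss;
    for (size_t d = 0; d < dims.size(); ++d) {
        ss << "const uint " << dims[d].first << " = (data_idx / (1";
        for (size_t i = 0; i < d; ++i)
            ss << " * " << dims[i].second;
        ss << ")) % " << dims[d].second << ";";
    }
    return ss.str();
}

int main() {
    // Grouped dims {y, x}, i.e. what GetJitConstants passes when only y and x
    // have non-trivial group sizes:
    std::cout << generate({{"y", "INPUT0_SIZE_Y"}, {"x", "INPUT0_SIZE_X"}}) << "\n";
    // Prints:
    // const uint x = (data_idx / (1)) % INPUT0_SIZE_X;const uint y = (data_idx / (1 * INPUT0_SIZE_X)) % INPUT0_SIZE_Y;
}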
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
new file mode 100644
index 00000000000000..f5148ccc1d9a1f
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_base_opencl.h"
+#include "dynamic_quantize_kernel_ref.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// DynamicQuantizeKernelOptGeneric
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class DynamicQuantizeKernelOptGeneric : public KernelBaseOpenCL {
+public:
+    DynamicQuantizeKernelOptGeneric() : KernelBaseOpenCL("dynamic_quantize_gpu_opt_generic") {}
+    virtual ~DynamicQuantizeKernelOptGeneric() {}
+
+    virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const;
+    virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const;
+    KernelsData GetKernelsData(const Params& params) const override;
+    KernelsPriority GetKernelsPriority(const Params& params) const override;
+    Datatype GetAccumulatorType(const dynamic_quantize_params& params) const;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params&) const override;
+    void GetUpdateDispatchDataFunc(KernelData& kd) const override;
+};
+}  // namespace kernel_selector
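Taken together, SetDefault and GetJitConstants in the .cpp above define the launch geometry: one work-group per batched element, LWS_SIZE work-items per group, and ITERATIONS_NUMBER values per work-item. Note that get_per_iter_elements_number returns 0 when the group size neither fits into nor divides maxWorkGroupSize, which would make the ITERATIONS_NUMBER division in GetJitConstants divide by zero; Validate does not currently reject that case. A hedged sketch of the arithmetic follows; Dispatch and plan() are hypothetical names, not plugin code.

#include <cstddef>
#include <cstdio>

struct Dispatch {
    std::size_t gws0;        // work-groups, one per batched element
    std::size_t lws;         // work-items per group (LWS_SIZE)
    std::size_t iterations;  // values handled per work-item (ITERATIONS_NUMBER)
    bool valid;
};

static Dispatch plan(std::size_t batch_elements, std::size_t group_elements, std::size_t max_wg_size) {
    std::size_t lws = 0;
    if (group_elements % max_wg_size == 0)
        lws = max_wg_size;
    else if (group_elements < max_wg_size)
        lws = group_elements;

    // lws == 0 mirrors get_per_iter_elements_number() returning 0; dividing by
    // it (as GetJitConstants does for ITERATIONS_NUMBER) would be undefined,
    // so such shapes must be rejected before kernel creation.
    if (lws == 0)
        return {0, 0, 0, false};
    return {batch_elements, lws, group_elements / lws, true};
}

int main() {
    const Dispatch d = plan(/*batch*/ 8192, /*group*/ 4096, /*maxWorkGroupSize*/ 512);
    if (d.valid)
        std::printf("gws = {%zu, %zu, 1}, lws = {1, %zu, 1}, iterations = %zu\n",
                    d.gws0, d.lws, d.lws, d.iterations);
    // gws = {8192, 512, 1}, lws = {1, 512, 1}, iterations = 8
}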
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp
index aba81c1827c4d7..dd5ff4acaf06b1 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp
@@ -5,10 +5,23 @@
 #include "dynamic_quantize_kernel_selector.h"
 #include "dynamic_quantize_kernel_ref.h"
 #include "dynamic_quantize_kernel_opt.h"
+#include "dynamic_quantize_kernel_opt_generic.h"
+
+#include <cstdlib>
+#include <sstream>
 
 namespace kernel_selector {
 dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() {
     Attach<DynamicQuantizeKernelRef>();
+    int USE_REF_DQ = 0;
+    if (const auto env_var = std::getenv("USE_REF_DQ")) {
+        std::istringstream ss(env_var);
+        ss >> USE_REF_DQ;
+    }
+
+    if (!USE_REF_DQ) {
+        Attach<DynamicQuantizeKernelOptGeneric>();
+    }
     // Attach<DynamicQuantizeKernelOpt>();
 }
 
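The USE_REF_DQ environment variable is a debug escape hatch: any value that parses as a non-zero integer keeps only the reference kernel attached. A minimal standalone reproduction of the parsing semantics (use_ref_dq() is a helper written for this review, not plugin code):

#include <cstdlib>
#include <iostream>
#include <sstream>

static bool use_ref_dq() {
    int flag = 0;
    if (const char* env_var = std::getenv("USE_REF_DQ")) {
        std::istringstream ss(env_var);
        ss >> flag;  // non-numeric values leave flag at 0, i.e. the toggle stays off
    }
    return flag != 0;
}

int main() {
    std::cout << (use_ref_dq() ? "reference kernel only\n"
                               : "DynamicQuantizeKernelOptGeneric attached\n");
}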