diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 7b0dab3d16da3c..bdda589a7bcb7b 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -42,6 +42,7 @@ DEFINE_OPT(NPUW_PLAN, std::string, "", npuw::partitioning::plan, CompileTime);
 DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
 DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
 DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
+DEFINE_OPT(NPUW_DQ_FULL, bool, true, npuw::partitioning::dyn_quant_full, CompileTime);
 DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
 DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime);
 DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index 67dce9621bfb4e..648bcde0cdc913 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -183,6 +183,14 @@ static constexpr ov::Property<bool> cwai{"NPUW_CWAI"};
  */
 static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Apply the full DQ transformation pipeline in the plugin.
+ * Default value: true.
+ */
+static constexpr ov::Property<bool> dyn_quant_full{"NPUW_DQ_FULL"};
+
 /**
  * @brief
  * Type: string.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 6a519a0f754a32..84ac94d1f7c67b 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -27,6 +27,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_FOLD>();
     desc.add<NPUW_CWAI>();
     desc.add<NPUW_DQ>();
+    desc.add<NPUW_DQ_FULL>();
     desc.add<NPUW_PMM>();
     desc.add<NPUW_SLICE_OUT>();
     desc.add<NPUW_HOST_GATHER>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index b52dd40ea59364..4110307ec1623e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -923,6 +923,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::fold, NPUW_FOLD),
         BIND(npuw::partitioning::cwai, NPUW_CWAI),
         BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
+        BIND(npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL),
         BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
         BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
         BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
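The four hunks above wire the new `NPUW_DQ_FULL` knob through the option/property plumbing. Not part of the diff, but for illustration — a minimal sketch of how the flag could be flipped from user code, assuming an NPU device and a `model.xml` path are available (NPUW options are passed as plain key/value entries in the compile-time property map):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path

    // NPUW_DQ_FULL defaults to true ("YES"); setting it to "NO" asks the
    // plugin to stop short of the full dynamic-quantization pipeline and
    // leave the DQ subgraph in the compiler-friendly partial form instead.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},
                                        {"NPUW_DQ", "YES"},
                                        {"NPUW_DQ_FULL", "NO"}});
    return 0;
}
```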
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 653b8350cfdeda..04f1602f8f1650 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1956,9 +1956,10 @@ void Partitioner::optimize(const std::string& func_name) {
     // Run "dynamic quantization"
     ov::npuw::patterns::opt::Context ctx;
     ctx.is_spatial = f._spatial.has_value();
+    ctx.mm_dq_full = cfg.get<::intel_npu::NPUW_DQ_FULL>();
 
     ov::pass::GraphRewrite rewr;
-    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>(std::ref(ctx));
     rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQi>(std::ref(ctx));
     rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQ2i>(std::ref(ctx));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index da3962feba66f3..968039e88758a1 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -20,6 +20,14 @@ namespace opt {
 
 void Context::permute(PPtr orig_param, const Context::Axes& order) {
     closures_to_permute[orig_param] = order;
+
+    const auto& orig_shape = orig_param->get_shape();
+    ov::Shape tw_shape;
+    for (const auto& axis : order) {
+        tw_shape.push_back(orig_shape[axis]);
+    }
+    orig_param->set_partial_shape(tw_shape);
+    orig_param->validate_and_infer_types();
 }
 
 void Context::to_f16(PPtr orig_param) {
@@ -126,7 +134,7 @@ namespace uat = ov::npuw::util::at;
 //   Param/Const(S) -> (Reshape) -> (to(f32)) -> Reshape -->
 //
-DQMatMulCWi::DQMatMulCWi() {
+DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
     auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
     auto qcoeff = opp::any_input();
     auto reshapew = opp::optional<ov::op::v1::Reshape>({qweight, opp::any_input()});
@@ -161,6 +169,14 @@ DQMatMulCWi::DQMatMulCWi() {
         auto matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff);
         auto matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls);
 
+        if (!ctx.get().mm_dq_full) {
+            const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+            const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+            NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+            NPUW_ASSERT(matched_matmul->get_transpose_b());
+            return false;  // root hasn't changed
+        }
+
         // Reconnect MatMul to read from Convert(W) directly.
         // Note: ACT has to be converted too.
         auto cvt_prec = matched_node_cvtw->output(0).get_element_type();
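`DQMatMulCWi` now takes the shared `Context` like the other matchers, and its callback can bail out early with `return false`, which tells `GraphRewrite` that the pattern root was left intact. Below is a simplified, self-contained sketch of that idiom (hypothetical `PassA`, no OpenVINO types): several passes hold a `std::reference_wrapper` to one mutable context, and an early exit records a decision without rewriting anything:

```cpp
#include <functional>
#include <vector>

struct Context {
    bool mm_dq_full = true;
    std::vector<int> notes;
    using Ref = std::reference_wrapper<Context>;  // mirrors Context::Ref above
};

struct PassA {
    explicit PassA(Context::Ref ctx) : m_ctx(ctx) {}
    bool run() {
        if (!m_ctx.get().mm_dq_full) {
            m_ctx.get().notes.push_back(1);  // only annotate the shared context
            return false;                    // graph (root) not changed
        }
        return true;  // a real rewrite happened
    }
    Context::Ref m_ctx;
};

int main() {
    Context ctx;
    ctx.mm_dq_full = false;
    PassA a{std::ref(ctx)};
    PassA b{std::ref(ctx)};
    a.run();
    b.run();
    return ctx.notes.size() == 2 ? 0 : 1;  // both passes saw the same ctx
}
```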
@@ -238,7 +254,9 @@ DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -255,10 +273,36 @@ DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[1] == 1 &&  // single-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] &&
             !matched_matmul->get_transpose_a() && !matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {0, 2, 1});
+                ctx.get().permute(matched_qcoeff, {0, 2, 1});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+
+                // Change Reshape's shape
+                std::vector<std::size_t> transposed_shape = {qweight_shape[2], qweight_shape[0] * qweight_shape[1]};
+                auto transposed_shape_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, transposed_shape);
+                matched_node_qreshp->input(1).replace_source_output(transposed_shape_c);
+                matched_node_qreshp->validate_and_infer_types();
+
+                matched_matmul->set_transpose_b(true);
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {0, 2, 1});
 
             // Mark S closure to be lowered fo f16
@@ -353,7 +397,9 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
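The updated `Context::permute` both queues the closure permutation and immediately re-shapes the matched `Parameter`, replacing the hand-written `tw_shape`/`ts_shape` blocks that the hunks above and below delete. A standalone sketch of the shape rule it applies (`new_shape[i] = old_shape[order[i]]`), with illustrative dimensions:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// new_shape[i] = old_shape[order[i]], as in Context::permute above.
std::vector<std::size_t> permuted_shape(const std::vector<std::size_t>& shape,
                                        const std::vector<std::size_t>& order) {
    std::vector<std::size_t> out;
    for (auto axis : order) {
        out.push_back(shape[axis]);
    }
    return out;
}

int main() {
    // A group-quantized weight of {groups, rows, cols} permuted with {0, 2, 1}:
    auto s = permuted_shape({32, 128, 64}, {0, 2, 1});
    std::cout << s[0] << "x" << s[1] << "x" << s[2] << "\n";  // 32x64x128
    return 0;
}
```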
@@ -370,20 +416,33 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[0] == 1 && act_shape[1] == 1 &&
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 && qcoeff_shape[1] == qweight_shape[1] &&
             !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {1, 0, 2});
+                ctx.get().permute(matched_qcoeff, {1, 0, 2});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                NPUW_ASSERT(matched_matmul->get_transpose_b());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
             ctx.get().permute(matched_qweight, {1, 0, 2});
-            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
-
             // Also transpose S, but in a different way (see diagram above)
             ctx.get().permute(matched_qcoeff, {1, 2, 0});
-            ov::Shape ts_shape = {qcoeff_shape[1], qcoeff_shape[2], qcoeff_shape[0]};
-            matched_qcoeff->set_partial_shape(ts_shape);
-            matched_qcoeff->validate_and_infer_types();
-
             // Reshape the Act to group format
             const auto NSPLIT = qweight_shape[1];
             std::vector<std::size_t> rshp_act_v = {NSPLIT, 1, act_shape[2] / NSPLIT};
@@ -473,7 +532,9 @@ DQMatMulGQiP::DQMatMulGQiP(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -489,15 +550,39 @@ DQMatMulGQiP::DQMatMulGQiP(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[1] > 1 &&  // multi-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] &&
             !matched_matmul->get_transpose_a() && !matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {0, 2, 1});
+                ctx.get().permute(matched_qcoeff, {0, 2, 1});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+
+                // Change Reshape's shape
+                std::vector<std::size_t> transposed_shape = {qweight_shape[2], qweight_shape[0] * qweight_shape[1]};
+                auto transposed_shape_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, transposed_shape);
+                matched_node_qreshp->input(1).replace_source_output(transposed_shape_c);
+                matched_node_qreshp->validate_and_infer_types();
+
+                matched_matmul->set_transpose_b(true);
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {0, 2, 1});
 
             // Mark S closure to be lowered fo f16
-            matched_qcoeff->set_element_type(ov::element::f16);
-            matched_qcoeff->validate_and_infer_types();
             ctx.get().to_f16(matched_qcoeff);
 
             // Reshape the Act to group format
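Each `!mm_dq_full` branch splices a freshly built `Transpose` between the multiply and the existing `Reshape` via `Input::replace_source_output`, then revalidates. A standalone sketch of that rerouting on a toy graph (shapes here are illustrative, not the ones from the patterns):

```cpp
// Splice a Transpose in front of an existing Reshape, as the branches above do.
#include <memory>
#include <openvino/op/ops.hpp>

int main() {
    auto act = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2, 8});
    auto shape_c = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {2, 32});
    auto reshape = std::make_shared<ov::op::v1::Reshape>(act, shape_c, false);

    // Build a {1, 0, 2} Transpose and reroute the Reshape's data input to it.
    auto order_c = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{3}, {1, 0, 2});
    auto transpose = std::make_shared<ov::op::v1::Transpose>(act, order_c);
    reshape->input(0).replace_source_output(transpose);
    reshape->validate_and_infer_types();  // now reshapes the transposed {2, 4, 8} tensor

    return reshape->get_output_shape(0) == ov::Shape{2, 32} ? 0 : 1;
}
```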
@@ -586,7 +671,9 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -606,19 +693,33 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) {
             act_shape.size() == 3 && just_one(act_shape[0], act_shape[1]) &&  // multi-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == qweight_shape[1] && qcoeff_shape[2] == 1 &&
             !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {1, 0, 2});
+                ctx.get().permute(matched_qcoeff, {1, 0, 2});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                NPUW_ASSERT(matched_matmul->get_transpose_b());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {1, 0, 2});
 
             // Also transpose S, but in a different way (see diagram above)
             ctx.get().permute(matched_qcoeff, {1, 2, 0});
-            ov::Shape ts_shape = {qcoeff_shape[1], qcoeff_shape[2], qcoeff_shape[0]};
-            matched_qcoeff->set_partial_shape(ts_shape);
-            matched_qcoeff->validate_and_infer_types();
-
             // Select proper activation shape
             std::size_t act_dim = act_shape[0] > act_shape[1] ? 0 : 1;
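All four early exits end with the same sanity check before returning. Since the partial rewrite forces `transpose_b`, the weight input is `[N, K]` and the MatMul output ends in `N`, so `out.back() == in(1).front()` must hold. A worked check with hypothetical LLM-layer sizes:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    const std::size_t K = 4096, N = 11008;     // hypothetical layer dimensions
    std::vector<std::size_t> act = {1, 1, K};  // activation A
    std::vector<std::size_t> weight = {N, K};  // weight B, transpose_b == true
    std::vector<std::size_t> out = {1, 1, N};  // A x B^T
    assert(act.back() == weight.back());       // contraction dims (K) agree
    assert(out.back() == weight.front());      // the NPUW_ASSERT invariant above
    return 0;
}
```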
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
index 8bd4e173ff210a..904ce88039d2eb 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -19,14 +19,10 @@ namespace npuw {
 namespace patterns {
 namespace opt {
 
-class DQMatMulCWi : public ov::pass::MatcherPass {
-public:
-    DQMatMulCWi();
-};
-
 struct Context {
     std::string pmm_dims;
     bool is_spatial = false;
+    bool mm_dq_full = true;
 
     using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
     using NPtr = std::shared_ptr<ov::Node>;
@@ -66,6 +62,11 @@ struct Context {
     using Ref = std::reference_wrapper<Context>;
 };
 
+class DQMatMulCWi : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulCWi(Context::Ref ctx);
+};
+
 class DQMatMulGQi : public ov::pass::MatcherPass {
 public:
     explicit DQMatMulGQi(Context::Ref ctx);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
index a878b244bc41e9..e9cab91e60bdb0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
@@ -439,6 +439,14 @@ inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
     return hi4(*telem);
 }
 
+template <typename T>
+inline T tread(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
+    const T* tdata = static_cast<const T*>(t.data());
+    const T* trow = tdata + r * COLS;
+    const T* telem = trow + c;
+    return *telem;
+}
+
 inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
     uint8_t* tdata = static_cast<uint8_t*>(t.data());
     uint8_t* trow = tdata + r * COLS / 2;
@@ -450,10 +458,18 @@ inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
     }
 }
 
+template <typename T>
+inline void twrite(ov::Tensor& t, T value, std::size_t r, std::size_t c, std::size_t COLS) {
+    T* tdata = static_cast<T*>(t.data());
+    T* trow = tdata + r * COLS;
+    T* telem = trow + c;
+    *telem = value;
+}
+
 ov::Tensor ov::npuw::util::transpose(const ov::Tensor& t) {
     ov::Shape shape = t.get_shape();
     NPUW_ASSERT(shape.size() == 3);  // Yes, so far only transpose 3D tensors
-    NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f32);
 
     ov::Shape tshape = {shape[2], shape[0], shape[1]};
     ov::Tensor tnew(t.get_element_type(), tshape);
@@ -462,8 +478,16 @@ ov::Tensor ov::npuw::util::transpose(const ov::Tensor& t) {
     const auto IN_COLS = shape[2];
     for (std::size_t i = 0; i < IN_ROWS; i++) {
         for (std::size_t j = 0; j < IN_COLS; j++) {
-            uint8_t value = tread_4b(t, i, j, IN_COLS);
-            twrite_4b(tnew, value, j, i, IN_ROWS);
+            switch (t.get_element_type()) {
+            case ov::element::i4:
+                twrite_4b(tnew, tread_4b(t, i, j, IN_COLS), j, i, IN_ROWS);
+                break;
+            case ov::element::f32:
+                twrite<float>(tnew, tread<float>(t, i, j, IN_COLS), j, i, IN_ROWS);
+                break;
+            default:
+                NPUW_ASSERT(false && "Element type is not supported yet");
+            }
         }
     }
     return tnew;
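With `f32` now accepted, `ov::npuw::util::transpose` (a `{2, 0, 1}` permutation) works on plain float tensors as well. A usage sketch, assuming the header that declares it (the include path is hypothetical):

```cpp
#include <cstddef>
#include <openvino/runtime/tensor.hpp>
#include "util.hpp"  // hypothetical path to the ov::npuw::util declarations

int test_transpose_f32() {
    ov::Tensor t(ov::element::f32, ov::Shape{2, 3, 4});
    float* data = t.data<float>();
    for (std::size_t i = 0; i < t.get_size(); i++) {
        data[i] = static_cast<float>(i);  // fill with recognizable values
    }
    ov::Tensor tt = ov::npuw::util::transpose(t);  // {D0, D1, D2} -> {D2, D0, D1}
    return tt.get_shape() == ov::Shape{4, 2, 3} ? 0 : 1;
}
```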
@@ -498,21 +522,33 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
-        NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+        NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f16);
         ov::Shape tshape = {shape[0], shape[2], shape[1]};
         ov::Tensor tnew(t.get_element_type(), tshape);
 
         for (std::size_t p = 0; p < shape[0]; p++) {
             for (std::size_t r = 0; r < shape[1]; r++) {
                 for (std::size_t c = 0; c < shape[2]; c++) {
-                    uint8_t value = tread_4b(t, p * shape[1] + r, c, shape[2]);
-                    twrite_4b(tnew, value, p * shape[2] + c, r, shape[1]);
+                    switch (t.get_element_type()) {
+                    case ov::element::i4:
+                        twrite_4b(tnew, tread_4b(t, p * shape[1] + r, c, shape[2]), p * shape[2] + c, r, shape[1]);
+                        break;
+                    case ov::element::f16:
+                        twrite<uint16_t>(tnew,
+                                         tread<uint16_t>(t, p * shape[1] + r, c, shape[2]),
+                                         p * shape[2] + c,
+                                         r,
+                                         shape[1]);
+                        break;
+                    default:
+                        NPUW_ASSERT(false && "Element type is not supported yet");
+                    }
                 }
             }
         }
         return tnew;
     } else if (axes[0] == 1 && axes[1] == 0 && axes[2] == 2) {
-        NPUW_ASSERT(t.get_element_type() == ov::element::i4);  // 4bit only here too
+        NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f16);
 
         ov::Shape tshape = {shape[1], shape[0], shape[2]};
         ov::Tensor tnew(t.get_element_type(), tshape);
@@ -520,8 +556,24 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
         for (std::size_t p = 0; p < shape[0]; p++) {
             for (std::size_t r = 0; r < shape[1]; r++) {
                 for (std::size_t c = 0; c < shape[2]; c++) {
-                    uint8_t value = tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]);
-                    twrite_4b(tnew, value, p * tshape[1] + r, c, tshape[2]);
+                    switch (t.get_element_type()) {
+                    case ov::element::i4:
+                        twrite_4b(tnew,
+                                  tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]),
+                                  p * tshape[1] + r,
+                                  c,
+                                  tshape[2]);
+                        break;
+                    case ov::element::f16:
+                        twrite<uint16_t>(tnew,
+                                         tread<uint16_t>(t, r, p * shape[2] + c, shape[1] * shape[2]),
+                                         p * tshape[1] + r,
+                                         c,
+                                         tshape[2]);
+                        break;
+                    default:
+                        NPUW_ASSERT(false && "Element type is not supported yet");
+                    }
                 }
             }
         }
@@ -537,7 +589,7 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
             break;
         default:
-            NPUW_ASSERT("Element type is not supported yet");
+            NPUW_ASSERT(false && "Element type is not supported yet");
         }
         return tnew;
     } else {
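The reconstructed `f16` branches above are assumed to instantiate the typed accessors with `uint16_t`: a permutation only relocates elements, so copying each `ov::float16` by its raw 16-bit pattern is lossless and needs no half-float arithmetic. A minimal standalone demonstration of that idea:

```cpp
#include <cstdint>

int main() {
    const std::uint16_t half_one = 0x3C00;  // 1.0 in IEEE-754 binary16
    std::uint16_t src[2] = {half_one, 0x0000};
    std::uint16_t dst[2];
    // Moving elements never reinterprets their values, so plain integer
    // loads/stores are enough to permute f16 data bit-exactly.
    dst[0] = src[1];
    dst[1] = src[0];
    return dst[1] == half_one ? 0 : 1;
}
```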