diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 7b0dab3d16da3c..bdda589a7bcb7b 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -42,6 +42,7 @@ DEFINE_OPT(NPUW_PLAN, std::string, "", npuw::partitioning::plan, CompileTime);
 DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
 DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
 DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
+DEFINE_OPT(NPUW_DQ_FULL, bool, true, npuw::partitioning::dyn_quant_full, CompileTime);
 DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
 DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime);
 DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index 67dce9621bfb4e..648bcde0cdc913 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -183,6 +183,14 @@ static constexpr ov::Property<bool> cwai{"NPUW_CWAI"};
  */
 static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Apply the full DQ transformation pipeline in the plugin.
+ * Default value: true.
+ */
+static constexpr ov::Property<bool> dyn_quant_full{"NPUW_DQ_FULL"};
+
 /**
  * @brief
  * Type: string.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 6a519a0f754a32..84ac94d1f7c67b 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -27,6 +27,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_FOLD>();
     desc.add<NPUW_CWAI>();
     desc.add<NPUW_DQ>();
+    desc.add<NPUW_DQ_FULL>();
     desc.add<NPUW_PMM>();
     desc.add<NPUW_SLICE_OUT>();
     desc.add<NPUW_HOST_GATHER>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index b52dd40ea59364..4110307ec1623e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -923,6 +923,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::fold, NPUW_FOLD),
         BIND(npuw::partitioning::cwai, NPUW_CWAI),
         BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
+        BIND(npuw::partitioning::dyn_quant_full, NPUW_DQ_FULL),
         BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
         BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
         BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
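The four hunks above wire the new `NPUW_DQ_FULL` knob through the option/property plumbing. Not part of the diff, but for illustration — a minimal sketch of how the flag could be flipped from user code, assuming an NPU device and a `model.xml` path are available (NPUW options are passed as plain key/value entries in the compile-time property map):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path

    // NPUW_DQ_FULL defaults to true ("YES"); setting it to "NO" asks the
    // plugin to stop short of the full dynamic-quantization pipeline and
    // leave the DQ subgraph in the compiler-friendly partial form instead.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},
                                        {"NPUW_DQ", "YES"},
                                        {"NPUW_DQ_FULL", "NO"}});
    return 0;
}
```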
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 653b8350cfdeda..04f1602f8f1650 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1956,9 +1956,10 @@ void Partitioner::optimize(const std::string& func_name) {
     // Run "dynamic quantization"
     ov::npuw::patterns::opt::Context ctx;
     ctx.is_spatial = f._spatial.has_value();
+    ctx.mm_dq_full = cfg.get<::intel_npu::NPUW_DQ_FULL>();
 
     ov::pass::GraphRewrite rewr;
-    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>(std::ref(ctx));
     rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQi>(std::ref(ctx));
     rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQ2i>(std::ref(ctx));
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
index da3962feba66f3..968039e88758a1 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
@@ -20,6 +20,14 @@ namespace opt {
 
 void Context::permute(PPtr orig_param, const Context::Axes& order) {
     closures_to_permute[orig_param] = order;
+
+    const auto& orig_shape = orig_param->get_shape();
+    ov::Shape tw_shape;
+    for (const auto& axis : order) {
+        tw_shape.push_back(orig_shape[axis]);
+    }
+    orig_param->set_partial_shape(tw_shape);
+    orig_param->validate_and_infer_types();
 }
 
 void Context::to_f16(PPtr orig_param) {
@@ -126,7 +134,7 @@ namespace uat = ov::npuw::util::at;
 //   Param/Const(S) -> (Reshape) -> (to(f32)) -> Reshape -->
 //
-DQMatMulCWi::DQMatMulCWi() {
+DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) {
     auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
     auto qcoeff = opp::any_input();
     auto reshapew = opp::optional<ov::op::v1::Reshape>({qweight, opp::any_input()});
@@ -161,6 +169,14 @@ DQMatMulCWi::DQMatMulCWi() {
         auto matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff);
         auto matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls);
 
+        if (!ctx.get().mm_dq_full) {
+            const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+            const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+            NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+            NPUW_ASSERT(matched_matmul->get_transpose_b());
+            return false;  // root hasn't changed
+        }
+
         // Reconnect MatMul to read from Convert(W) directly.
         // Note: ACT has to be converted too.
         auto cvt_prec = matched_node_cvtw->output(0).get_element_type();
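`DQMatMulCWi` now takes the shared `Context` like the other matchers, and its callback can bail out early with `return false`, which tells `GraphRewrite` that the pattern root was left intact. Below is a simplified, self-contained sketch of that idiom (hypothetical `PassA`, no OpenVINO types): several passes hold a `std::reference_wrapper` to one mutable context, and an early exit records a decision without rewriting anything:

```cpp
#include <functional>
#include <vector>

struct Context {
    bool mm_dq_full = true;
    std::vector<int> notes;
    using Ref = std::reference_wrapper<Context>;  // mirrors Context::Ref above
};

struct PassA {
    explicit PassA(Context::Ref ctx) : m_ctx(ctx) {}
    bool run() {
        if (!m_ctx.get().mm_dq_full) {
            m_ctx.get().notes.push_back(1);  // only annotate the shared context
            return false;                    // graph (root) not changed
        }
        return true;  // a real rewrite happened
    }
    Context::Ref m_ctx;
};

int main() {
    Context ctx;
    ctx.mm_dq_full = false;
    PassA a{std::ref(ctx)};
    PassA b{std::ref(ctx)};
    a.run();
    b.run();
    return ctx.notes.size() == 2 ? 0 : 1;  // both passes saw the same ctx
}
```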
@@ -238,7 +254,9 @@ DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -255,10 +273,36 @@ DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[1] == 1 &&  // single-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] &&
             !matched_matmul->get_transpose_a() && !matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {0, 2, 1});
+                ctx.get().permute(matched_qcoeff, {0, 2, 1});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+
+                // Change Reshape's shape
+                std::vector<std::size_t> transposed_shape = {qweight_shape[2], qweight_shape[0] * qweight_shape[1]};
+                auto transposed_shape_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, transposed_shape);
+                matched_node_qreshp->input(1).replace_source_output(transposed_shape_c);
+                matched_node_qreshp->validate_and_infer_types();
+
+                matched_matmul->set_transpose_b(true);
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {0, 2, 1});
 
             // Mark S closure to be lowered fo f16
@@ -353,7 +397,9 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
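The updated `Context::permute` both queues the closure permutation and immediately re-shapes the matched `Parameter`, replacing the hand-written `tw_shape`/`ts_shape` blocks that the hunks above and below delete. A standalone sketch of the shape rule it applies (`new_shape[i] = old_shape[order[i]]`), with illustrative dimensions:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// new_shape[i] = old_shape[order[i]], as in Context::permute above.
std::vector<std::size_t> permuted_shape(const std::vector<std::size_t>& shape,
                                        const std::vector<std::size_t>& order) {
    std::vector<std::size_t> out;
    for (auto axis : order) {
        out.push_back(shape[axis]);
    }
    return out;
}

int main() {
    // A group-quantized weight of {groups, rows, cols} permuted with {0, 2, 1}:
    auto s = permuted_shape({32, 128, 64}, {0, 2, 1});
    std::cout << s[0] << "x" << s[1] << "x" << s[2] << "\n";  // 32x64x128
    return 0;
}
```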
@@ -370,20 +416,33 @@ DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[0] == 1 && act_shape[1] == 1 &&
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 && qcoeff_shape[1] == qweight_shape[1] &&
             !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {1, 0, 2});
+                ctx.get().permute(matched_qcoeff, {1, 0, 2});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                NPUW_ASSERT(matched_matmul->get_transpose_b());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
             ctx.get().permute(matched_qweight, {1, 0, 2});
-            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
-
             // Also transpose S, but in a different way (see diagram above)
             ctx.get().permute(matched_qcoeff, {1, 2, 0});
-            ov::Shape ts_shape = {qcoeff_shape[1], qcoeff_shape[2], qcoeff_shape[0]};
-            matched_qcoeff->set_partial_shape(ts_shape);
-            matched_qcoeff->validate_and_infer_types();
-
             // Reshape the Act to group format
             const auto NSPLIT = qweight_shape[1];
             std::vector<std::size_t> rshp_act_v = {NSPLIT, 1, act_shape[2] / NSPLIT};
@@ -473,7 +532,9 @@ DQMatMulGQiP::DQMatMulGQiP(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -489,15 +550,39 @@ DQMatMulGQiP::DQMatMulGQiP(Context::Ref ctx) {
             act_shape.size() == 3 && act_shape[1] > 1 &&  // multi-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] &&
             !matched_matmul->get_transpose_a() && !matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {0, 2, 1});
+                ctx.get().permute(matched_qcoeff, {0, 2, 1});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+
+                // Change Reshape's shape
+                std::vector<std::size_t> transposed_shape = {qweight_shape[2], qweight_shape[0] * qweight_shape[1]};
+                auto transposed_shape_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, transposed_shape);
+                matched_node_qreshp->input(1).replace_source_output(transposed_shape_c);
+                matched_node_qreshp->validate_and_infer_types();
+
+                matched_matmul->set_transpose_b(true);
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {0, 2, 1});
 
             // Mark S closure to be lowered fo f16
-            matched_qcoeff->set_element_type(ov::element::f16);
-            matched_qcoeff->validate_and_infer_types();
             ctx.get().to_f16(matched_qcoeff);
 
             // Reshape the Act to group format
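Each `!mm_dq_full` branch splices a freshly built `Transpose` between the multiply and the existing `Reshape` via `Input::replace_source_output`, then revalidates. A standalone sketch of that rerouting on a toy graph (shapes here are illustrative, not the ones from the patterns):

```cpp
// Splice a Transpose in front of an existing Reshape, as the branches above do.
#include <memory>
#include <openvino/op/ops.hpp>

int main() {
    auto act = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2, 8});
    auto shape_c = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{2}, {2, 32});
    auto reshape = std::make_shared<ov::op::v1::Reshape>(act, shape_c, false);

    // Build a {1, 0, 2} Transpose and reroute the Reshape's data input to it.
    auto order_c = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{3}, {1, 0, 2});
    auto transpose = std::make_shared<ov::op::v1::Transpose>(act, order_c);
    reshape->input(0).replace_source_output(transpose);
    reshape->validate_and_infer_types();  // now reshapes the transposed {2, 4, 8} tensor

    return reshape->get_output_shape(0) == ov::Shape{2, 32} ? 0 : 1;
}
```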
@@ -586,7 +671,9 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) {
 
         auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
         auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_qmuls = node_to_output.at(qmuls).get_node_shared_ptr();
         auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_node_qreshp = node_to_output.at(qreshp).get_node_shared_ptr();
         auto matched_out_mmi = node_to_output.at(qmmi);
 
         auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
@@ -606,19 +693,33 @@ DQMatMulGQ2iP::DQMatMulGQ2iP(Context::Ref ctx) {
             act_shape.size() == 3 && just_one(act_shape[0], act_shape[1]) &&  // multi-token case
             qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[1] == qweight_shape[1] && qcoeff_shape[2] == 1 &&
             !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            if (!ctx.get().mm_dq_full) {
+                // Transpose weight and coeff
+                ctx.get().permute(matched_qweight, {1, 0, 2});
+                ctx.get().permute(matched_qcoeff, {1, 0, 2});
+
+                // Add Transpose and insert it
+                std::vector<std::size_t> new_transpose_order = {1, 0, 2};
+                auto new_transpose_order_c =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, new_transpose_order);
+                auto new_transpose = std::make_shared<ov::op::v1::Transpose>(matched_node_qmuls, new_transpose_order_c);
+                matched_node_qreshp->input(0).replace_source_output(new_transpose);
+                matched_node_qreshp->validate_and_infer_types();
+                matched_matmul->validate_and_infer_types();
+
+                const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0);
+                const auto& matm_mul_in_shape = matched_matmul->get_input_shape(1);
+                NPUW_ASSERT(matm_mul_out_shape.back() == matm_mul_in_shape.front());
+                NPUW_ASSERT(matched_matmul->get_transpose_b());
+                return false;  // root hasn't changed
+            }
+
             // Mark W closure to transpose, and transpose the respective parameter
-            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
-            matched_qweight->set_partial_shape(tw_shape);
-            matched_qweight->validate_and_infer_types();
             ctx.get().permute(matched_qweight, {1, 0, 2});
 
             // Also transpose S, but in a different way (see diagram above)
             ctx.get().permute(matched_qcoeff, {1, 2, 0});
-            ov::Shape ts_shape = {qcoeff_shape[1], qcoeff_shape[2], qcoeff_shape[0]};
-            matched_qcoeff->set_partial_shape(ts_shape);
-            matched_qcoeff->validate_and_infer_types();
-
             // Select proper activation shape
             std::size_t act_dim = act_shape[0] > act_shape[1] ? 0 : 1;
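All four early exits end with the same sanity check before returning. Since the partial rewrite forces `transpose_b`, the weight input is `[N, K]` and the MatMul output ends in `N`, so `out.back() == in(1).front()` must hold. A worked check with hypothetical LLM-layer sizes:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    const std::size_t K = 4096, N = 11008;     // hypothetical layer dimensions
    std::vector<std::size_t> act = {1, 1, K};  // activation A
    std::vector<std::size_t> weight = {N, K};  // weight B, transpose_b == true
    std::vector<std::size_t> out = {1, 1, N};  // A x B^T
    assert(act.back() == weight.back());       // contraction dims (K) agree
    assert(out.back() == weight.front());      // the NPUW_ASSERT invariant above
    return 0;
}
```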
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
index 8bd4e173ff210a..904ce88039d2eb 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -19,14 +19,10 @@ namespace npuw {
 namespace patterns {
 namespace opt {
 
-class DQMatMulCWi : public ov::pass::MatcherPass {
-public:
-    DQMatMulCWi();
-};
-
 struct Context {
     std::string pmm_dims;
     bool is_spatial = false;
+    bool mm_dq_full = true;
 
     using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
     using NPtr = std::shared_ptr<ov::Node>;
@@ -66,6 +62,11 @@ struct Context {
     using Ref = std::reference_wrapper<Context>;
 };
 
+class DQMatMulCWi : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulCWi(Context::Ref ctx);
+};
+
 class DQMatMulGQi : public ov::pass::MatcherPass {
 public:
     explicit DQMatMulGQi(Context::Ref ctx);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
index a878b244bc41e9..e9cab91e60bdb0 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
@@ -439,6 +439,14 @@ inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
     return hi4(*telem);
 }
 
+template <typename T>
+inline T tread(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
+    const T* tdata = static_cast<const T*>(t.data());
+    const T* trow = tdata + r * COLS;
+    const T* telem = trow + c;
+    return *telem;
+}
+
 inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
     uint8_t* tdata = static_cast<uint8_t*>(t.data());
     uint8_t* trow = tdata + r * COLS / 2;
@@ -450,10 +458,18 @@ inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
     }
 }
 
+template <typename T>
+inline void twrite(ov::Tensor& t, T value, std::size_t r, std::size_t c, std::size_t COLS) {
+    T* tdata = static_cast<T*>(t.data());
+    T* trow = tdata + r * COLS;
+    T* telem = trow + c;
+    *telem = value;
+}
+
 ov::Tensor ov::npuw::util::transpose(const ov::Tensor& t) {
     ov::Shape shape = t.get_shape();
     NPUW_ASSERT(shape.size() == 3);  // Yes, so far only transpose 3D tensors
-    NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f32);
 
     ov::Shape tshape = {shape[2], shape[0], shape[1]};
     ov::Tensor tnew(t.get_element_type(), tshape);
@@ -462,8 +478,16 @@ ov::Tensor ov::npuw::util::transpose(const ov::Tensor& t) {
     const auto IN_COLS = shape[2];
     for (std::size_t i = 0; i < IN_ROWS; i++) {
         for (std::size_t j = 0; j < IN_COLS; j++) {
-            uint8_t value = tread_4b(t, i, j, IN_COLS);
-            twrite_4b(tnew, value, j, i, IN_ROWS);
+            switch (t.get_element_type()) {
+            case ov::element::i4:
+                twrite_4b(tnew, tread_4b(t, i, j, IN_COLS), j, i, IN_ROWS);
+                break;
+            case ov::element::f32:
+                twrite<float>(tnew, tread<float>(t, i, j, IN_COLS), j, i, IN_ROWS);
+                break;
+            default:
+                NPUW_ASSERT(false && "Element type is not supported yet");
+            }
         }
     }
     return tnew;
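With `f32` now accepted, `ov::npuw::util::transpose` (a `{2, 0, 1}` permutation) works on plain float tensors as well. A usage sketch, assuming the header that declares it (the include path is hypothetical):

```cpp
#include <cstddef>
#include <openvino/runtime/tensor.hpp>
#include "util.hpp"  // hypothetical path to the ov::npuw::util declarations

int test_transpose_f32() {
    ov::Tensor t(ov::element::f32, ov::Shape{2, 3, 4});
    float* data = t.data<float>();
    for (std::size_t i = 0; i < t.get_size(); i++) {
        data[i] = static_cast<float>(i);  // fill with recognizable values
    }
    ov::Tensor tt = ov::npuw::util::transpose(t);  // {D0, D1, D2} -> {D2, D0, D1}
    return tt.get_shape() == ov::Shape{4, 2, 3} ? 0 : 1;
}
```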
@@ -498,21 +522,33 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
-        NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+        NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f16);
         ov::Shape tshape = {shape[0], shape[2], shape[1]};
         ov::Tensor tnew(t.get_element_type(), tshape);
 
         for (std::size_t p = 0; p < shape[0]; p++) {
             for (std::size_t r = 0; r < shape[1]; r++) {
                 for (std::size_t c = 0; c < shape[2]; c++) {
-                    uint8_t value = tread_4b(t, p * shape[1] + r, c, shape[2]);
-                    twrite_4b(tnew, value, p * shape[2] + c, r, shape[1]);
+                    switch (t.get_element_type()) {
+                    case ov::element::i4:
+                        twrite_4b(tnew, tread_4b(t, p * shape[1] + r, c, shape[2]), p * shape[2] + c, r, shape[1]);
+                        break;
+                    case ov::element::f16:
+                        twrite<uint16_t>(tnew,
+                                         tread<uint16_t>(t, p * shape[1] + r, c, shape[2]),
+                                         p * shape[2] + c,
+                                         r,
+                                         shape[1]);
+                        break;
+                    default:
+                        NPUW_ASSERT(false && "Element type is not supported yet");
+                    }
                 }
             }
         }
         return tnew;
     } else if (axes[0] == 1 && axes[1] == 0 && axes[2] == 2) {
-        NPUW_ASSERT(t.get_element_type() == ov::element::i4);  // 4bit only here too
+        NPUW_ASSERT(t.get_element_type() == ov::element::i4 || t.get_element_type() == ov::element::f16);
 
         ov::Shape tshape = {shape[1], shape[0], shape[2]};
         ov::Tensor tnew(t.get_element_type(), tshape);
@@ -520,8 +556,24 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
         for (std::size_t p = 0; p < shape[0]; p++) {
             for (std::size_t r = 0; r < shape[1]; r++) {
                 for (std::size_t c = 0; c < shape[2]; c++) {
-                    uint8_t value = tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]);
-                    twrite_4b(tnew, value, p * tshape[1] + r, c, tshape[2]);
+                    switch (t.get_element_type()) {
+                    case ov::element::i4:
+                        twrite_4b(tnew,
+                                  tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]),
+                                  p * tshape[1] + r,
+                                  c,
+                                  tshape[2]);
+                        break;
+                    case ov::element::f16:
+                        twrite<uint16_t>(tnew,
+                                         tread<uint16_t>(t, r, p * shape[2] + c, shape[1] * shape[2]),
+                                         p * tshape[1] + r,
+                                         c,
+                                         tshape[2]);
+                        break;
+                    default:
+                        NPUW_ASSERT(false && "Element type is not supported yet");
+                    }
                 }
             }
         }
@@ -537,7 +589,7 @@ ov::Tensor ov::npuw::util::permute(const ov::Tensor& t, const std::vector<std::size_t>& axes) {
             break;
         default:
-            NPUW_ASSERT("Element type is not supported yet");
+            NPUW_ASSERT(false && "Element type is not supported yet");
         }
         return tnew;
     } else {
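The reconstructed `f16` branches above are assumed to instantiate the typed accessors with `uint16_t`: a permutation only relocates elements, so copying each `ov::float16` by its raw 16-bit pattern is lossless and needs no half-float arithmetic. A minimal standalone demonstration of that idea:

```cpp
#include <cstdint>

int main() {
    const std::uint16_t half_one = 0x3C00;  // 1.0 in IEEE-754 binary16
    std::uint16_t src[2] = {half_one, 0x0000};
    std::uint16_t dst[2];
    // Moving elements never reinterprets their values, so plain integer
    // loads/stores are enough to permute f16 data bit-exactly.
    dst[0] = src[1];
    dst[1] = src[0];
    return dst[1] == half_one ? 0 : 1;
}
```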