diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp index 76a3b23259f1f5..53a12fb58d333c 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp @@ -44,6 +44,8 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime); DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime); +DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime); +DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime); DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime); DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime); DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp index 4ccb29469e98ab..4de9ee5ab15080 100644 --- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp @@ -178,6 +178,22 @@ static constexpr ov::Property dyn_quant{"NPUW_DQ"}; */ static constexpr ov::Property par_matmul_merge_dims{"NPUW_PMM"}; +/** + * @brief + * Type: boolean. + * Enable spatial execution for selected subgraphs. Requires COMPUTE isolation. + * Default value: false + */ +static constexpr ov::Property spatial{"NPUW_SPATIAL"}; + +/** + * @brief + * Type: std::size_t. + * Submission size for the spatial execution. 
+ * Default value: 64 + */ +static constexpr ov::Property spatial_nway{"NPUW_SPATIAL_NWAY"}; + /** * @brief * Type: boolean diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 798b5344c4ea62..ac5a2623020c04 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -28,6 +28,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 58036d299b3c1b..3bbbbe6aca7c06 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -210,8 +210,14 @@ void ov::npuw::IBaseInferRequest::infer() { } void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { + const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); + if (!ov::npuw::util::is_set(idx, dump_ios_opt)) { + return; + } + auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model; + const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const auto& comp_submodel = comp_submodel_desc.compiled_model; // Note: keep using the absolute `idx` for identififaction and printing // Note: @@ -219,11 +225,14 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { // - _path is used for disk dump (will have leading 00s for indices) const auto comp_submodel_name = subgr_name(idx); const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx); + const auto num_inputs = comp_submodel->inputs().size(); - const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); - if (ov::npuw::util::is_set(idx, dump_ios_opt)) { + // There's different approaches to dumping normal and spatial subgraphs. 
+ if (!comp_submodel_desc.spatial) { + // In the normal, non-spatial mode, we just dump the current subgrequests + // pre-set tensors and that's it std::vector in_base_names; - for (std::size_t i = 0u, num_inputs = comp_submodel->inputs().size(); i < num_inputs; i++) { + for (std::size_t i = 0u; i < num_inputs; i++) { const auto& port = comp_submodel->inputs()[i]; const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs); @@ -231,12 +240,61 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { in_base_names.push_back(std::move(in_base_name)); } ov::npuw::dump_input_list(comp_submodel_path, in_base_names); + } else { + const auto& s = comp_submodel_desc.spatial.value(); + + std::set spatial_param_idx; + std::vector in_base_names_nonspat; + + // First, dump the non-spatial input tensors just once - and remember its names + for (auto&& p : s.params) { + spatial_param_idx.insert(p.idx); + } + for (std::size_t i = 0u; i < num_inputs; i++) { + if (spatial_param_idx.count(i)) { + continue; + } + const auto& port = comp_submodel->inputs()[i]; + const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); + std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs); + ov::npuw::dump_tensor(tnsr, in_base_name); + in_base_names_nonspat.push_back(std::move(in_base_name)); + } + + // Now iterate over the spatial range and dump the individual tiles + // For the spatial case, these tiles should've been taken from the special + // spatial_io tensors + for (std::size_t offset = 0u; offset < s.range; offset += s.nway) { + const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile + : (s.range - offset); // the last tile + // Copy the base file list to start with it + std::vector tile_ilist(in_base_names_nonspat); + for (auto&& p : s.params) { + std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(p.idx, num_inputs) + + "_d" + ov::npuw::util::fmt(p.dim, 10) + "_" + + ov::npuw::util::fmt(offset, s.range); + + const auto& tnsr = m_spatial_io[real_idx].inputs.at(p.idx); + const auto& view = ov::npuw::util::view(tnsr, p.dim, offset, this_len); + + ov::npuw::dump_tensor(view, in_base_name); + tile_ilist.push_back(std::move(in_base_name)); + } + // Dump ilist per tile + ov::npuw::dump_input_list(comp_submodel_path, tile_ilist); + } // for(offset) } } void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { + const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); + if (!ov::npuw::util::is_set(idx, dump_ios_opt)) { + return; + } + auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model; + const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const auto& comp_submodel = comp_submodel_desc.compiled_model; // Note: keep using the absolute `idx` for identififaction and printing // Note: @@ -245,11 +303,12 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { // FIXME: Duplication is evil const auto comp_submodel_name = subgr_name(idx); const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx); + const std::size_t num_outputs = comp_submodel->outputs().size(); - const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); - if 
(ov::npuw::util::is_set(idx, dump_ios_opt)) { + // Same approach as above. Spatial tensors require special handling + if (!comp_submodel_desc.spatial) { std::vector out_base_names; - for (std::size_t i = 0u, num_outputs = comp_submodel->outputs().size(); i < num_outputs; i++) { + for (std::size_t i = 0u; i < num_outputs; i++) { const auto& port = comp_submodel->outputs()[i]; const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs); @@ -257,6 +316,26 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { out_base_names.push_back(std::move(out_base_name)); } ov::npuw::dump_output_list(comp_submodel_path, out_base_names); + } else { + // All outputs are considered spatial now so it should be easier + const auto& s = comp_submodel_desc.spatial.value(); + for (std::size_t offset = 0u; offset < s.range; offset += s.nway) { + const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile + : (s.range - offset); // the last tile + std::vector tile_olist; + for (std::size_t i = 0u; i < num_outputs; i++) { + std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs) + + "_d" + ov::npuw::util::fmt(s.out_dim, 10) + "_" + + ov::npuw::util::fmt(offset, s.range); + const auto& tnsr = m_spatial_io[real_idx].outputs.at(i); + const auto& view = ov::npuw::util::view(tnsr, s.out_dim, offset, this_len); + + ov::npuw::dump_tensor(view, out_base_name); + tile_olist.push_back(std::move(out_base_name)); + } + // Dump olist per tile + ov::npuw::dump_output_list(comp_submodel_path, tile_olist); + } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 986ea78c378c32..6d4b4c71ef3cab 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -91,6 +91,22 @@ class IBaseInferRequest : public ov::ISyncInferRequest { // FROM(Every subrequests' output port) TO(Its output tensor) std::map, TensorStorage> m_port_to_tensor; + // FIXME: Currently initialized/managed by the subclass as well. + // Moved here for dumping purposes only. + // Another sparse vector. Represents populated spatial I/O parameters + // which should be read/written in parts over multiple submissions. + // An ugly structure, cries for refactoring + // See function_prologue for details. 
+ // Also it contains pre-allocated tensors for tails handling + struct SpatialIO { + std::vector> inputs; // # of elements - # of graph-side inputs + std::vector> outputs; // # of elements - # of subgraph outputs + + std::vector> input_tails; // temporary buffers for input tails + std::vector> output_tails; // temporary buffers for output tails + }; + std::vector m_spatial_io; + const std::size_t m_num_submodels; void dump_input_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 3213be04ec3a33..1d2217f1114d0c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -277,6 +277,22 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, compiledFunctions.insert({subgraph._funcall, id}); m_compiled_submodels[id].model = fcn_template._model; m_compiled_submodels[id].replaced_by = id; // FIXME: UGLY + + // Fill in the spatial information, if it is present + if (fcn_template._spatial) { + using S = CompiledModelDesc::Spatial; + S s; + s.range = fcn_template._spatial->_range; + s.nway = fcn_template._spatial->_slice; + s.out_dim = fcn_template._spatial->_out_dim; + s.nway_iters = s.range / s.nway; + s.tail_size = s.range % s.nway; + for (auto&& input : fcn_template._spatial->_inputs) { + std::size_t p_idx = fcn_template._model->get_parameter_index(input.param); + s.params.push_back(S::Param{p_idx, input.dim}); + } + m_compiled_submodels[id].spatial = std::move(s); + } LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall); } else { // ...and refer to it in other calls @@ -824,6 +840,8 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::cwai, NPUW_CWAI), BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), + BIND(npuw::partitioning::spatial, NPUW_SPATIAL), + BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY), BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER), BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL), BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 1ddaf3f543eaa8..ab517d6adc75ef 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -116,10 +116,24 @@ class CompiledModel : public ov::ICompiledModel { std::optional replaced_by; - // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure - // w.r.t. function calls Subgraph::Gather host_gather; + struct Spatial { + struct Param { + std::size_t idx; + std::size_t dim; + }; + std::vector params; + std::size_t range = 0u; + std::size_t nway = 0u; + std::size_t out_dim = 0u; + + std::size_t nway_iters = 0u; + std::size_t tail_size = 0u; + }; + std::optional spatial; + // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure + // w.r.t. 
function calls std::size_t param_base = 0; std::vector closure; std::vector scales; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 6638fbcbe12a57..9ad7016f3efb69 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -5,6 +5,7 @@ #include "just_sync_infer_request.hpp" #include +#include #include #include #include @@ -29,6 +30,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_compiled_submodels[real_idx].compiled_model; - for (size_t out_idx = 0; out_idx < proto_comp_model->outputs().size(); out_idx++) { + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + auto& proto_comp_model = proto_comp_model_desc.compiled_model; + + const auto num_outputs = proto_comp_model->outputs().size(); + + // Initialize the spatial IO placeholders, if required + if (proto_comp_model_desc.spatial) { + m_spatial_io[real_idx].inputs.resize(proto_comp_model_desc.param_base); + m_spatial_io[real_idx].input_tails.resize(proto_comp_model_desc.param_base); + m_spatial_io[real_idx].outputs.resize(num_outputs); + m_spatial_io[real_idx].output_tails.resize(num_outputs); + + if (proto_comp_model_desc.spatial->tail_size) { + // Preallocate extra buffers for tail processing + // Note: these buffers are allocated to the entire NWAY (> tail_size) + for (auto&& p : proto_comp_model_desc.spatial->params) { + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx]; + m_spatial_io[real_idx].input_tails[p.idx] = + ov::get_tensor_impl(ov::Tensor(iport.get_element_type(), iport.get_shape())); + } + const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); + for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; + m_spatial_io[real_idx].output_tails[out_idx] = + ov::get_tensor_impl(ov::Tensor(oport.get_element_type(), oport.get_shape())); + } + } + } // if(spatial) + + for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) { const auto& port = proto_comp_model->outputs()[out_idx]; + ov::Shape shape = port.get_shape(); + + // If the subgraph is spatial, promote the output size to the full vector size + if (proto_comp_model_desc.spatial) { + shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; + } m_funcall_result[LinkFrom{i, out_idx}] = - ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape())); + ov::get_tensor_impl(ov::Tensor(port.get_element_type(), shape)); } if (real_idx != i) { // If this function call is NOT the function body, do nothing here - the original @@ -147,7 +184,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_inputs_to_submodels_inputs.at(i); if (to_submodel != CompiledModel::NO_LINK) { std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + std::tie(sub_idx, in_idx) = to_submodel; m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; } } // for(inputs) @@ -336,6 +373,9 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { const bool do_copy = needs_copy(idx); const auto& iodesc = m_subrequests_gio.at(idx); + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + // a list of ports to 
copy tensors, if needed: FROM -> TO std::vector, ov::Output>> copy_list; @@ -356,21 +396,39 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { return m_subrequests[real_idx]; }(); + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + for (auto&& it : iodesc.global_params) { std::size_t param_idx{}, sub_in_idx{}; std::tie(param_idx, sub_in_idx) = it; LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + const auto& g_port = m_npuw_model->inputs()[param_idx]; const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; const auto& s_port = subr->get_inputs()[sub_in_idx]; LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); LOG_BLOCK(); - if (do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + subr->set_tensor(s_port, g_tnsr); + } } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; } } @@ -398,11 +456,10 @@ void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { LOG_BLOCK(); auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - if (real_idx != idx) { - // Don't do here - function call will take parameter - // itself. Note it may be implemented more efficently - // than now (and in some cases, parameter can be pre-set) + if (comp_model_desc.replaced_by) { + // Don't do here - function call will take the right tensor + // itself. Note it may be implemented more efficently than now + // (and in some cases, the tensor can be pre-set) LOG_DEBUG("Skipping this too now - function will do it for itself"); return; } @@ -429,6 +486,8 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { const auto real_idx = comp_model_desc.replaced_by.value(); auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = func_desc.spatial.has_value(); + // Function call prologue: // 1. Walk through function dependencies and set the respective tensors // as parameters @@ -446,11 +505,25 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { if (!m_npuw_model->m_compiled_submodels[prod_idx].replaced_by) { // Producer is a normal model -> take its tensor directly const auto& oport = m_npuw_model->m_compiled_submodels[prod_idx].compiled_model->outputs()[prod_port]; - m_subrequests[real_idx]->set_tensor(iport, m_subrequests[prod_idx]->get_tensor(oport)); + auto i_tensor = m_subrequests[prod_idx]->get_tensor(oport); + if (!is_spatial) { + // Non-spatial case - set immediately + m_subrequests[real_idx]->set_tensor(iport, i_tensor); + } else { + // Spatial case - defer + m_spatial_io[real_idx].inputs.at(i) = i_tensor; + } } else { // Producer is a function - maybe the same as we're calling now. 
// Take its tensor from the storage - m_subrequests[real_idx]->set_tensor(iport, m_funcall_result.at({prod_idx, prod_port})); + auto i_tensor = m_funcall_result.at({prod_idx, prod_port}); + if (!is_spatial) { + // Non-spatial case - again, set immediately + m_subrequests[real_idx]->set_tensor(iport, m_funcall_result.at({prod_idx, prod_port})); + } else { + // Spatial case - defer + m_spatial_io[real_idx].inputs.at(i) = i_tensor; + } } } } // for(param_base) @@ -472,7 +545,14 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { for (std::size_t i = 0; i < func_desc.compiled_model->outputs().size(); i++) { LOG_DEBUG("Binding result[" << i << "]..."); auto& oport = func_desc.compiled_model->outputs()[i]; - m_subrequests[real_idx]->set_tensor(oport, m_funcall_result.at({idx, i})); + auto o_tensor = m_funcall_result.at({idx, i}); + if (!is_spatial) { + // Non-spatial case - set immediately + m_subrequests[real_idx]->set_tensor(oport, o_tensor); + } else { + // Spatial case - defer + m_spatial_io[real_idx].outputs.at(i) = o_tensor; + } } LOG_DEBUG("Done"); } @@ -645,19 +725,133 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo } } -namespace { -template -void during(R&& r, F&& f) { - r->start_async(); - f(); // expect noexcept - r->wait(); +void ov::npuw::JustInferRequest::unsafe_during(std::size_t real_idx, const std::function& f) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!comp_model_desc.spatial) { + // Non-spatial execution: trigger request asynchronously, run `f` in this context + auto& r = m_subrequests[real_idx]; + r->start_async(); + f(); // expect noexcept + r->wait(); + } else { + // Spatial execution... Do the opposite - run f asynchronously, and meanwhile run the + // spatial inference + auto future = std::async(std::launch::async, f); + unsafe_infer(real_idx); + future.wait(); + } +} + +void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + auto& r = m_subrequests[real_idx]; + if (!comp_model_desc.spatial) { + // Run normally + r->infer(); + } else { + // Run over the specified range... Note: the full inputs/outputs + // must be prepared in the m_spatial_io at this point + const auto& spatial = comp_model_desc.spatial.value(); + const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + + // Create a sparse vector with full input sizes. + // For the access simplicity, its size is aligned with function's + // number of input parameters (activations) so some slots may be + // not used here. + // FIXME: All these preparations could be done statically (just once) + std::vector full_in_shapes(comp_model_desc.param_base); + for (auto&& param : spatial.params) { + full_in_shapes[param.idx] = m_spatial_io[real_idx].inputs.at(param.idx)->get_shape(); + } + + // Now handle the range, even if it is not a multiply of nway (slice): + // + // |<- - - - full range - - - ->| + // +------+------+------+------+-+ + // | nway | nway | nway | nway | | + // +------+------+------+------+-+ + // ^tail + // The block is always compiled to produce nway. If we need a smaller tensor + // on the last iteration, the sub-nway will be copied from the input range to + // a temporary tensor, and then the sub-nwway range will be copied from the + // request's output range. 
+ + std::size_t offset = 0u; + for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) { + // Collect spatial inputs for this offset + for (auto&& param : spatial.params) { + const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + r->set_tensor( + iport, + ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), param.dim, offset, spatial.nway)); + } // for(params) + + // Now set the spatial outputs + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + r->set_tensor(oport, + ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), + spatial.out_dim, + offset, + spatial.nway)); + } // for(outputs) + + // Now run the part + r->infer(); + } // for(full_nway_times) + + // Now process the tail, if required + if (spatial.tail_size) { + // Copy the sub-ranges to spatial inputs + // NOTE: tails buffers are read from/written to at 0th offset! + for (auto&& param : spatial.params) { + auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), + param.dim, + offset, + spatial.tail_size); + + const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].input_tails.at(param.idx), + param.dim, + 0, + spatial.tail_size); + + in_view->copy_to(out_view._ptr); + r->set_tensor(iport, m_spatial_io[real_idx].input_tails.at(param.idx)); + } // for(params) + + // Now set the tail tensors + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + r->set_tensor(oport, m_spatial_io[real_idx].output_tails.at(out_idx)); + } // for(outputs) + + // Now run the tail infer + r->infer(); + + // Now copy the views from the output full-nway tensor to the output tensors + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + auto spatial_tensor_shape = oport.get_shape(); + + auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].output_tails.at(out_idx), + spatial.out_dim, + 0, + spatial.tail_size); + + auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), + spatial.out_dim, + offset, + spatial.tail_size); + in_view->copy_to(out_view._ptr); + } // for(outputs) + } + } } -} // namespace void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); - auto& this_subr = m_subrequests[real_idx]; const std::size_t next_idx = next(idx + 1); if (comp_model_desc.replaced_by) { @@ -669,7 +863,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (m_use_function_pipelining) { // function pipelining is here! and the next rq is ours. NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx); - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << next_idx << "]..."); LOG_BLOCK(); // Note: do it here unconditionally - if this request fails, @@ -680,7 +874,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } else { // Function pipelining is not used. THIS infer request // is also the NEXT one. 
Nothing much to do here - this_subr->infer(); + unsafe_infer(real_idx); bind_global_parameters(next_idx); } } else { @@ -690,9 +884,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (next_idx == 0) { // Note: even if m_function_pipelining is ON, // SWAP won't happen here - see the below check for .next - this_subr->infer(); + unsafe_infer(real_idx); } else { - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; @@ -710,9 +904,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool // This is a regular subgraph. Start it async to prepare the next // parameters if (next_idx == 0) { - this_subr->infer(); + unsafe_infer(real_idx); } else { - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index e63f2f18b85ece..bb75eb69d0eb0a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -52,6 +52,8 @@ class JustInferRequest final : public IBaseInferRequest { void function_prologue(std::size_t idx); void unpack_closure(std::size_t idx, RqPtr request); + void unsafe_during(std::size_t real_idx, const std::function& f); + void unsafe_infer(std::size_t real_idx); void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared_p); void connect_subrequests(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/logging.cpp b/src/plugins/intel_npu/src/plugin/npuw/logging.cpp index 15f0e8cb504c21..3c591e3154d8fd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/logging.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/logging.cpp @@ -9,6 +9,8 @@ #include #include +#include "openvino/runtime/make_tensor.hpp" // get_tensor_impl + namespace { #ifdef NPU_PLUGIN_DEVELOPER_BUILD const char* get_env(const std::vector& list_to_try) { @@ -61,11 +63,17 @@ int ov::npuw::__logging_indent__::__level__() { return this_indent; } -void ov::npuw::dump_tensor(const ov::SoPtr& tensor, const std::string& base_path) { - if (!tensor->is_continuous()) { - LOG_ERROR("Failed to dump blob " << base_path << ": it is not continuous"); - return; +void ov::npuw::dump_tensor(const ov::SoPtr& input, const std::string& base_path) { + ov::SoPtr tensor; + + if (input->is_continuous()) { + tensor = input; + } else { + // Create temporary tensor and copy data in. 
Dumping is never fast, anyway + tensor = ov::get_tensor_impl(ov::Tensor(input->get_element_type(), input->get_shape())); + input->copy_to(tensor._ptr); } + NPUW_ASSERT(tensor); const auto bin_path = base_path + ".bin"; { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp index 6a9cf017fded81..4b8973b5bb94ae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp @@ -20,13 +20,23 @@ namespace npuw { namespace online { namespace detail { + +namespace { +static const std::map ISOL_PRESETS = {{"COMPUTE", + "P:DQMatMulGQu4/compute,P:DQMatMulCWu4/compute," + "P:DQMatMulGQi4/compute,P:DQMatMulCWi4/compute," + "P:VocabMatMul/compute," + "P:RMSNorm/compute"}}; +} + // For missing declaration warning +// FIXME: Instead, one should use namespace{} size_t getMinGraphSize(::intel_npu::Config& cfg); size_t getMinRepBlocks(::intel_npu::Config& cfg); size_t getMinRepBlockSize(::intel_npu::Config& cfg); std::vector getAvoids(::intel_npu::Config& cfg); std::vector getIsolates(::intel_npu::Config& cfg); -std::vector getIsolates(const std::string isolates_unparsed); +std::vector getIsolates(const std::string& isolates_unparsed); std::vector getNoFolds(::intel_npu::Config& cfg); std::vector getNoFolds(const std::string& nofolds_unparsed); // Set default predefined values for COMPUTE pipeline @@ -108,13 +118,18 @@ std::vector getIsolates(::intel_npu::Config& cfg) { return getIsolates(cfg.getString<::intel_npu::NPUW_ONLINE_ISOLATE>()); } -std::vector getIsolates(const std::string isolates_unparsed) { +std::vector getIsolates(const std::string& isolates_unparsed) { if (isolates_unparsed.empty()) { return {}; } std::vector isolates; - std::string s = std::move(isolates_unparsed); + std::string s = isolates_unparsed; + + auto preset_iter = ISOL_PRESETS.find(s); + if (preset_iter != ISOL_PRESETS.end()) { + s = preset_iter->second; + } size_t pos = 0; size_t start = 0; @@ -191,8 +206,7 @@ std::vector getNoFolds(const std::string& nofolds_unparsed) { void setComputeConfig(PassContext& ctx) { // FIXME: initialize via a dedicated function instead of parsing - ctx.isolates = detail::getIsolates("P:DQMatMulGQu4/compute,P:DQMatMulCWu4/compute,P:DQMatMulGQi4/" - "compute,P:DQMatMulCWi4/compute,P:RMSNorm/compute"); + ctx.isolates = detail::getIsolates(ISOL_PRESETS.at("COMPUTE")); ctx.nofolds = detail::getNoFolds("compute"); } @@ -219,6 +233,9 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) { if (!group.avoid_list.empty()) { gr.append_attribute("avoid") = group.avoid_list.data(); } + if (!group.tag.empty()) { + gr.append_attribute("tag") = group.tag.data(); + } // Note: Ensemble also add "id" attribute but it's not used by the plugin for (const auto& input : group.input_layers) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp index d09f0b8a7100d3..cfcce1725433db 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp @@ -112,6 +112,8 @@ ov::npuw::Group Group::toGroup() const { } } + g.tag = m_isol_tag; + return g; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp index 
c621dbafd3dadb..82856cece3de40 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp @@ -418,10 +418,11 @@ void Snapshot::earlyRegroup() { } else if (isolate.pattern == "DQMatMulGQi4") { rewr.add_matcher(shared_from_this(), isolate.tag); handle_patterns = true; + } else if (isolate.pattern == "VocabMatMul") { + rewr.add_matcher(shared_from_this(), isolate.tag); + handle_patterns = true; } else { - LOG_WARN("OPENVINO_NPUW_ISOLATE only supports RMSNorm, DQMatMulCWu4, DQMatMulGQu4, DQMatMulCWi4, " - "DQMatMulGQi4 " - << "as patterns. Isolate pattern " << isolate.pattern << " is skipped!"); + LOG_WARN("OPENVINO_NPUW_ISOLATE: unsupported pattern " << isolate.pattern << " is skipped!"); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 192d975509ce5e..4ebfcc1809219c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -140,6 +140,7 @@ ov::npuw::Ensemble load_groups(const std::shared_ptr& model, const st this_group.gflops = get_float_attr(group, "gflops"); this_group.repeated_id = get_str_attr(group, "repeated", ""); this_group.avoid_list = get_str_attr(group, "avoid", ""); + this_group.tag = get_str_attr(group, "tag", ""); FOREACH_CHILD(input, group, "input") { this_group.input_layers.push_back(get_str_attr(input, "name")); } @@ -226,6 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); + // NB(dm): This method should get a better place, it is here only because + // it is tied to the Function structure (but, in fact, not so much) + void identifySpatialRange(ov::npuw::Function& f); + template void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, const std::vector& protocol, @@ -308,6 +313,7 @@ class Partitioner { void matchResults(const std::string& func_name); void createFunction(const std::string& func_name); void matchRepeatedSubgraphs(const std::string& func_name); + void spatial(const std::string& func_name); void optimize(const std::string& func_name); void decompressionCutOff(const std::string& func_name); @@ -360,6 +366,7 @@ void Partitioner::identifySubgraphs() { P.total_ops += group.sg._ops; group.sg._avoid_list = group.avoid_list; + group.sg._tag = group.tag; // Note inputs and outputs are included in the above set, so if // we are here, those nodes should be present in the model. @@ -1455,6 +1462,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) { ov::npuw::Function function; function._model = func_ggg.mdls.front(); function._param_offset = body_sg._parameters.size(); + function._tag = body_sg._tag; std::size_t new_param_idx = function._param_offset; for (auto&& node_ptr : function._model->get_ordered_ops()) { @@ -1516,6 +1524,76 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) { LOG_VERB("Done: " << func_name); } +void Partitioner::identifySpatialRange(ov::npuw::Function& f) { + NPUW_ASSERT(f._tag == "compute"); + + // NB: The current logic must be changed. Here we assume we only + // apply this change to "compute" subgraphs which we identify + // based on well-known patterns. This won't work in the generic case. 
+ + // The current logic is the following: + // - Assume the function results are ALL SPATIAL (and this alone + // is a very strong assumption) + // - Identify their SPATIAL dimension (which is dim[1] because + // we know how COMPUTE subgraphs are organized) + // - Walk over the parameters (up to _param_offset), find + // spatial Parameters based on the dim we're looking at + // - Report the findings. + // Hence, the logic is not robust enough and should be generalized + // in the future. + + // First, check our assumption on the function results + const auto& f_results = f._model->get_results(); + NPUW_ASSERT(f_results.size() > 0); + + const auto& f_result_0 = f_results.front(); + const auto& f_result_0_shape = f_result_0->get_shape(); + + if (f_result_0_shape.size() != 3) { + return; // NB: this is the only case we enable now + } + + if (f_result_0_shape[1] <= 1) { + return; // NB: this is the only spatial dim we enable now + } + + for (auto&& f_result_i : f_results) { + // Yes, it will also compare r[0] vs r[0] + const auto& f_result_i_shape = f_result_i->get_shape(); + if (f_result_0_shape.size() != f_result_i_shape.size()) { + return; // Do nothing + } + + if (f_result_0_shape[1] != f_result_i_shape[1]) { + return; // Do nothing + } + } + + // Now, find the parameters with the same spatial dim + // NB: again, this is a very weak feature to look for + const auto& f_params = f._model->get_parameters(); + NPUW_ASSERT(f_params.size() > 0); + + using S = ov::npuw::Function::Spatial; + S spatial; + spatial._range = f_result_0_shape[1]; + spatial._out_dim = 1; // the only case we're looking into now + + for (std::size_t i = 0u; i < f._param_offset; i++) { + const auto& f_param = f_params[i]; + const auto& f_param_dims = f_param->get_shape(); + + auto spatial_dim_iter = std::find(f_param_dims.begin(), f_param_dims.end(), spatial._range); + if (spatial_dim_iter != f_param_dims.end()) { + std::size_t spatial_dim_idx = std::distance(f_param_dims.begin(), spatial_dim_iter); + spatial._inputs.push_back(S::Param{f_param, spatial_dim_idx}); + } + } + + // Apply the spatial change + f._spatial = std::move(spatial); +} + void Partitioner::createFunction(const std::string& func_name) { createFunction(all_functions.at(func_name)); } @@ -1594,6 +1672,50 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { LOG_VERB("Done"); } +void Partitioner::spatial(const std::string& func_name) { + ov::npuw::Function& f = P.functions.at(func_name); + + // Identify the spatial dimension for this function + // Works only for Compute case. + // FIXME: Replace this string identification with smt better + if (!cfg.get<::intel_npu::NPUW_SPATIAL>() || f._tag != "compute") { + LOG_VERB("No spatial optimizations will be done to " << func_name << " in model " << model->get_friendly_name() + << "..."); + return; + } + + LOG_VERB("Turn " << func_name << " into spatial execution in model " << model->get_friendly_name() << "..."); + LOG_BLOCK(); + + identifySpatialRange(f); + if (!f._spatial) { + LOG_WARN("No spatial ranges identified in the COMPUTE block, expect a higher compile time"); + return; + } + + LOG_VERB("Spatial range: " << f._spatial->_range); + + // Final check before transformations + f._spatial->_slice = cfg.get<::intel_npu::NPUW_SPATIAL_NWAY>(); + if (f._spatial->_slice == 0) { + LOG_WARN("NWAY is set to 0, disabling it (but better disable SPATIAL setting itself)"); + f._spatial.reset(); // Erase spatial information to avoid conflicts + return; + } + + // Apply transformation to the model. 
Note: only function body is modified + // Accumulate the reshape map + std::map, ov::PartialShape> new_shapes; + for (auto&& p : f._spatial->_inputs) { + ov::Shape shape = p.param->get_shape(); + shape[p.dim] = f._spatial->_slice; + new_shapes[p.param->output(0)] = shape; + } + f._model->reshape(new_shapes); + + LOG_VERB("Done"); +} + void Partitioner::optimize(const std::string& func_name) { ov::npuw::Function& f = P.functions.at(func_name); auto& func_group = all_functions.at(func_name); @@ -1622,6 +1744,7 @@ void Partitioner::optimize(const std::string& func_name) { // Regardless of DQ setting, run this first { ov::npuw::patterns::opt::Context ctx; + ctx.is_spatial = f._spatial.has_value(); ctx.pmm_dims = cfg.get<::intel_npu::NPUW_PMM>(); // Run Head/Tail passes @@ -1768,6 +1891,8 @@ void Partitioner::optimize(const std::string& func_name) { // Run "dynamic quantization" ov::npuw::patterns::opt::Context ctx; + ctx.is_spatial = f._spatial.has_value(); + ov::pass::GraphRewrite rewr; rewr.add_matcher(); rewr.add_matcher(std::ref(ctx)); @@ -2052,6 +2177,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr, std::size_t> _param_mapping; + + // Spatial information. So far assume spatial execution in 1 dimension only + struct Spatial { + using PPtr = std::shared_ptr; + struct Param { + PPtr param; + std::size_t dim; + }; + std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024 + std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128 + std::size_t _out_dim = 0u; // Assume it is the same dim for all Results + std::vector _inputs; + }; + using SpatialOpt = std::optional; + SpatialOpt _spatial; }; struct Group { @@ -71,6 +89,7 @@ struct Group { float gflops; std::string avoid_list; + std::string tag; ov::npuw::Subgraph sg; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp index e7f09b00cde2a2..b082d67037db7d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp @@ -9,6 +9,7 @@ #include "../online/snapshot.hpp" // online::Snapshot #include "openvino/op/ops.hpp" #include "openvino/pass/pattern/op/label.hpp" // any_input +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/util/common_util.hpp" @@ -106,8 +107,10 @@ DQMatMulCWu4::DQMatMulCWu4(const std::shared_ptr& sn auto matched_qzerop = std::static_pointer_cast(matched_node_qzerop); auto matched_qcoeff = std::static_pointer_cast(matched_node_qcoeff); - if (ov::element::u4 == matched_qweight->get_element_type() && - ov::element::u4 == matched_qzerop->get_element_type() && + if ((ov::element::u4 == matched_qweight->get_element_type() || + ov::element::u8 == matched_qweight->get_element_type()) && + (ov::element::u4 == matched_qzerop->get_element_type() || + ov::element::u8 == matched_qzerop->get_element_type()) && ov::element::f16 == matched_qcoeff->get_element_type()) { // Partitioning ignores Const->Convert nodes, so qcvtw and qcvtz are not used auto matched_qsubz = node_to_output.at(qsubz).get_node_shared_ptr(); @@ -135,7 +138,7 @@ DQMatMulGQi4::DQMatMulGQi4(const std::shared_ptr& sn auto qmuls = opp::wrap_type({qcvtw, qcoeff}); auto qreshp = opp::wrap_type({qmuls, opp::any_input()}); - auto qcvtr = opp::wrap_type({qreshp}); + auto qcvtr = opp::optional({qreshp->output(0)}); auto qmm = 
opp::wrap_type({opp::any_input(), qcvtr}); auto node_to_gptr = snapshot->getNodeToGroupMap(); @@ -155,17 +158,22 @@ DQMatMulGQi4::DQMatMulGQi4(const std::shared_ptr& sn if ((ov::element::i4 == matched_qweight->get_element_type() || ov::element::i8 == matched_qweight->get_element_type()) && - ov::element::f16 == matched_qcoeff->get_element_type()) { + (ov::element::f16 == matched_qcoeff->get_element_type() || + ov::element::f32 == matched_qcoeff->get_element_type())) { // Partitioning ignores Const->Convert nodes, so qcvtw is not used auto matched_qmuls = node_to_output.at(qmuls).get_node_shared_ptr(); auto matched_qreshp = node_to_output.at(qreshp).get_node_shared_ptr(); - auto matched_qcvtr = node_to_output.at(qcvtr).get_node_shared_ptr(); auto matched_qmm = node_to_output.at(qmm).get_node_shared_ptr(); node_to_gptr->at(matched_qmuls)->isolate(isol_tag); node_to_gptr->at(matched_qreshp)->isolate(isol_tag); - node_to_gptr->at(matched_qcvtr)->isolate(isol_tag); node_to_gptr->at(matched_qmm)->isolate(isol_tag); + + auto qcvtr_iter = node_to_output.find(qcvtr); + if (qcvtr_iter != node_to_output.end()) { + auto matched_qcvtr = qcvtr_iter->second.get_node_shared_ptr(); + node_to_gptr->at(matched_qcvtr)->isolate(isol_tag); + } } return false; // root hasn't changed @@ -218,6 +226,67 @@ DQMatMulCWi4::DQMatMulCWi4(const std::shared_ptr& sn register_matcher(std::make_shared(qmm, "TagDQMatMulCWi4"), std::move(callback)); } +// This is a case for Raw (f16/f32) MatMul connected directly to the Result. +// +// The following combinations are covered: +// +// act(f32) -> MatMul(f32) -> Result +// weight(f32) -> +// +// act(f16) -> MatMul(f16) -> to_f32 -> Result +// weight(f16) -> +// +// act(f32) -> to_f16 -> MatMul -> to_f32 -> Result +// weight(f16) -----------> +// +// act(f32) -----------> MatMul -> Result +// weight(f16) -- to_f32--> + +VocabMatMul::VocabMatMul(const std::shared_ptr& snapshot, const std::string& isol_tag) { + auto act_in = opp::any_input(); + auto weight = opp::wrap_type(); + + auto ocvta = opp::optional({act_in->output(0)}); + auto ocvtw = opp::optional({weight->output(0)}); + + auto mm = opp::wrap_type({ocvta, ocvtw}); + auto ocvtm = opp::optional({mm->output(0)}); + + auto res = opp::wrap_type({ocvtm}); + + auto node_to_gptr = snapshot->getNodeToGroupMap(); + + // Note: Use [=] to make sure the above objects stay alive in the callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + auto matched_out_a = node_to_output.at(act_in).get_node_shared_ptr(); + auto matched_out_w = node_to_output.at(weight).get_node_shared_ptr(); + + auto a_type = matched_out_a->get_element_type(); + auto w_type = matched_out_w->get_element_type(); + + if ((a_type == ov::element::f16 || a_type == ov::element::f32) && + (w_type == ov::element::f16 || w_type == ov::element::f32)) { + node_to_gptr->at(node_to_output.at(mm).get_node_shared_ptr())->isolate(isol_tag); + + auto isol_if = [=, &node_to_gptr, &node_to_output](std::shared_ptr n) { + auto iter = node_to_output.find(n); + if (iter != node_to_output.end()) { + auto group_iter = node_to_gptr->find(iter->second.get_node_shared_ptr()); + if (group_iter != node_to_gptr->end()) { + group_iter->second->isolate(isol_tag); + } + } + }; + isol_if(ocvta); + isol_if(ocvtw); + isol_if(ocvtm); + } + return false; + }; + register_matcher(std::make_shared(res, "TagVocabMatMul"), std::move(callback)); +} + // TODO: visualize RMSNorm::RMSNorm(const std::shared_ptr& snapshot, const std::string& isol_tag) { 
auto hadd = opp::wrap_type({opp::any_input(), opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp index 92e60cb95fbdbe..faa2fe3f0f9578 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp @@ -41,6 +41,11 @@ class DQMatMulCWi4 : public ov::pass::MatcherPass { DQMatMulCWi4(const std::shared_ptr& snapshot, const std::string& isol_tag); }; +class VocabMatMul : public ov::pass::MatcherPass { +public: + VocabMatMul(const std::shared_ptr& snapshot, const std::string& isol_tag); +}; + class RMSNorm : public ov::pass::MatcherPass { public: RMSNorm(const std::shared_ptr& snapshot, const std::string& isol_tag); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 4ec72e02260884..d987023d0040e8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -687,8 +687,13 @@ DQParMMGQ::DQParMMGQ(Context::Ref ctx) { auto qmmi_shape = node_to_output.at(qmm).get_shape(); - if (qmmi_shape.size() != 3 || qmmi_shape[0] != 1 || qmmi_shape[1] != 1) { - // Limit token to 1-token shapes only (prefill requires its own tranformation) + if (qmmi_shape.size() != 3 || qmmi_shape[0] != 1) { + // Not handling such cases + return false; + } + + if (qmmi_shape[1] != 1 && !ctx.get().is_spatial) { + // For non 1-token cases, do transformation if and only if and only if the block is spatial return false; } @@ -709,9 +714,12 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { continue; } ov::Output orig_multiply; + std::size_t axis_to_concat = -1; std::tie(orig_multiply, axis_to_concat) = mul_to_mms.first; + const ov::Shape orig_act_shape = orig_multiply.get_shape(); + if (!util::is_set(axis_to_concat, ctx.pmm_dims)) { LOG_VERB("Parallel MatMuls found, but fusion over dim " << axis_to_concat << " is not enabled"); continue; @@ -773,10 +781,10 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { auto this_orig_wshape = parallel_matmuls[i].w->get_shape(); auto this_slice_start = std::make_shared(ov::element::i32, ov::Shape{3}, S{0, 0, offset}); - auto this_slice_end = - std::make_shared(ov::element::i32, - ov::Shape{3}, - S{1, 1, offset + this_orig_wshape[axis_to_concat]}); + auto this_slice_end = std::make_shared( + ov::element::i32, + ov::Shape{3}, + S{1, orig_act_shape[1], offset + this_orig_wshape[axis_to_concat]}); auto this_slice_step = std::make_shared(ov::element::i32, ov::Shape{3}, S{1, 1, 1}); auto this_slice = std::make_shared(new_mm, this_slice_start, this_slice_end, this_slice_step); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index 530d0a52cc515f..b649f6a136c2e7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -26,6 +26,7 @@ class DQMatMulCWi : public ov::pass::MatcherPass { struct Context { std::string pmm_dims; + bool is_spatial = false; using PPtr = std::shared_ptr; using NPtr = std::shared_ptr; diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 
d83a521fb29496..ebbed29893583c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -17,6 +17,7 @@ #include "openvino/op/constant.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/util/op_types.hpp" +#include "openvino/runtime/make_tensor.hpp" // get_tensor_impl #ifdef UNPACK_PROFILING # include "tbb/concurrent_unordered_map.h" @@ -1562,6 +1563,45 @@ void ov::npuw::util::gather(const ov::SoPtr& src, } } +ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, + const ov::npuw::util::View& from, + const ov::npuw::util::View& to) { + const auto type = src->get_element_type(); + NPUW_ASSERT(from.size() == to.size()); + + // Sub-byte views are not supported here + NPUW_ASSERT(type != ov::element::u4 && type != ov::element::i4); + + const auto num_dims = from.size(); + ov::Shape view_shape; + for (auto d = 0u; d < num_dims; d++) { + view_shape.push_back(to[d] - from[d]); + } + + const auto strides = src->get_strides(); + uint8_t* ptr = static_cast(src->data()); + + // Shift PTR according to the strides + for (auto d = 0u; d < num_dims; d++) { + ptr += strides[d] * from[d]; + } + + ov::Tensor viewt(type, view_shape, ptr, strides); + return ov::get_tensor_impl(viewt); +} + +ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, + std::size_t dim, + std::size_t offset, + std::size_t len) { + const auto shape = src->get_shape(); + View view_start = View(shape.size(), 0u); + View view_end = shape; + view_start[dim] = offset; + view_end[dim] = offset + len; + return ov::npuw::util::view(src, view_start, view_end); +} + template void to_f32(const ov::Tensor& in, ov::Tensor& out) { NPUW_ASSERT(in.is_continuous()); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 6012ce0e587352..689bf8571ddb8d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -52,6 +52,11 @@ void unpack(const ov::SoPtr& from, void gather(const ov::SoPtr& src, const ov::SoPtr& idx, const ov::SoPtr& dst); +using View = std::vector; +ov::SoPtr view(const ov::SoPtr& src, const View& from, const View& to); + +ov::SoPtr view(const ov::SoPtr& src, std::size_t dim, std::size_t offset, std::size_t len); + void to_f32(const ov::Tensor& in, ov::Tensor& out); void to_f16(ov::Tensor& t); void transpose(ov::Tensor& t);
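
Note (reviewer illustration, not part of the patch): below is a minimal standalone sketch of the tiling arithmetic the spatial execution path above relies on — `range / nway` full submissions plus a `range % nway` tail, which the patch routes through the preallocated input/output tail buffers. The helper name `for_each_tile` and its callback are assumptions made purely for illustration.

#include <cstddef>
#include <functional>
#include <iostream>

// Walk a spatial range in fixed-size submissions, mirroring the
// nway_iters / tail_size split computed for CompiledModelDesc::Spatial:
// the range is covered by full NWAY tiles, and the remainder (if any)
// is handled as a separate, shorter "tail" submission.
static void for_each_tile(std::size_t range,
                          std::size_t nway,
                          const std::function<void(std::size_t, std::size_t)>& run) {
    const std::size_t nway_iters = range / nway;  // number of full tiles
    const std::size_t tail_size = range % nway;   // leftover elements

    std::size_t offset = 0u;
    for (std::size_t i = 0u; i < nway_iters; i++, offset += nway) {
        run(offset, nway);  // full NWAY submission
    }
    if (tail_size) {
        run(offset, tail_size);  // tail submission (padded to NWAY via the
                                 // preallocated tail buffers in the patch)
    }
}

int main() {
    // E.g. a spatial range of 1000 with NPUW_SPATIAL_NWAY=64 gives
    // 15 full tiles plus a 40-element tail.
    for_each_tile(1000, 64, [](std::size_t offset, std::size_t len) {
        std::cout << "submit [" << offset << ", " << offset + len << ")\n";
    });
    return 0;
}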