Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NPUW: Spatial execution #26880

Merged
merged 10 commits into from
Oct 9, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime);
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,22 @@ static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
*/
static constexpr ov::Property<std::string> par_matmul_merge_dims{"NPUW_PMM"};

/**
 * @brief Enable spatial execution for selected subgraphs.
 * Type: boolean.
 * Enable spatial execution for selected subgraphs. Requires COMPUTE isolation.
 * Default value: false
 */
static constexpr ov::Property<bool> spatial{"NPUW_SPATIAL"};

/**
 * @brief Submission (tile) size for spatial execution.
 * Type: std::size_t.
 * Submission size for the spatial execution: the number of elements
 * processed over the spatial dimension in a single submission (tile).
 * Only takes effect when NPUW_SPATIAL is set.
 * Default value: 64
 */
static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};

/**
* @brief
* Type: boolean
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_CWAI>();
desc.add<NPUW_DQ>();
desc.add<NPUW_PMM>();
desc.add<NPUW_SPATIAL>();
desc.add<NPUW_SPATIAL_NWAY>();
desc.add<NPUW_HOST_GATHER>();
desc.add<NPUW_DCOFF_TYPE>();
desc.add<NPUW_DCOFF_SCALE>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,33 +210,91 @@ void ov::npuw::IBaseInferRequest::infer() {
}

// Dump this subgraph's input tensors to disk, if requested via NPUW_DUMP_IO.
// For spatial subgraphs, spatial inputs are dumped tile-by-tile from the
// dedicated spatial_io storage; non-spatial inputs are dumped once.
void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
    // Bail out early if dumping wasn't requested for this subgraph
    const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
    if (!ov::npuw::util::is_set(idx, dump_ios_opt)) {
        return;
    }

    auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
    const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx];
    const auto& comp_submodel = comp_submodel_desc.compiled_model;

    // Note: keep using the absolute `idx` for identification and printing
    // Note:
    // - _name is used for the user option (no leading 00s for indices)
    // - _path is used for disk dump (will have leading 00s for indices)
    const auto comp_submodel_name = subgr_name(idx);
    const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx);
    const auto num_inputs = comp_submodel->inputs().size();

    // There's different approaches to dumping normal and spatial subgraphs.
    if (!comp_submodel_desc.spatial) {
        // In the normal, non-spatial mode, we just dump the current subrequest's
        // pre-set tensors and that's it
        std::vector<std::string> in_base_names;
        for (std::size_t i = 0u; i < num_inputs; i++) {
            const auto& port = comp_submodel->inputs()[i];
            const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
            std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs);
            ov::npuw::dump_tensor(tnsr, in_base_name);
            in_base_names.push_back(std::move(in_base_name));
        }
        ov::npuw::dump_input_list(comp_submodel_path, in_base_names);
    } else {
        const auto& s = comp_submodel_desc.spatial.value();

        std::set<std::size_t> spatial_param_idx;
        std::vector<std::string> in_base_names_nonspat;

        // First, dump the non-spatial input tensors just once - and remember their names
        for (auto&& p : s.params) {
            spatial_param_idx.insert(p.idx);
        }
        for (std::size_t i = 0u; i < num_inputs; i++) {
            if (spatial_param_idx.count(i)) {
                continue;
            }
            const auto& port = comp_submodel->inputs()[i];
            const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
            std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs);
            ov::npuw::dump_tensor(tnsr, in_base_name);
            in_base_names_nonspat.push_back(std::move(in_base_name));
        }

        // Now iterate over the spatial range and dump the individual tiles
        // For the spatial case, these tiles should've been taken from the special
        // spatial_io tensors
        for (std::size_t offset = 0u; offset < s.range; offset += s.nway) {
            const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway  // the full tile
                                                                     : (s.range - offset);  // the last tile
            // Copy the base file list to start with it
            std::vector<std::string> tile_ilist(in_base_names_nonspat);
            for (auto&& p : s.params) {
                std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(p.idx, num_inputs) +
                                           "_d" + ov::npuw::util::fmt(p.dim, 10) + "_" +
                                           ov::npuw::util::fmt(offset, s.range);

                const auto& tnsr = m_spatial_io[real_idx].inputs.at(p.idx);
                const auto& view = ov::npuw::util::view(tnsr, p.dim, offset, this_len);

                ov::npuw::dump_tensor(view, in_base_name);
                tile_ilist.push_back(std::move(in_base_name));
            }
            // Dump ilist per tile
            ov::npuw::dump_input_list(comp_submodel_path, tile_ilist);
        }  // for(offset)
    }
}

void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
if (!ov::npuw::util::is_set(idx, dump_ios_opt)) {
return;
}

auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model;
const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx];
const auto& comp_submodel = comp_submodel_desc.compiled_model;

// Note: keep using the absolute `idx` for identififaction and printing
// Note:
Expand All @@ -245,18 +303,39 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
// FIXME: Duplication is evil
const auto comp_submodel_name = subgr_name(idx);
const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx);
const std::size_t num_outputs = comp_submodel->outputs().size();

const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
if (ov::npuw::util::is_set(idx, dump_ios_opt)) {
// Same approach as in above. Spatial tensors require special handling
if (!comp_submodel_desc.spatial) {
std::vector<std::string> out_base_names;
for (std::size_t i = 0u, num_outputs = comp_submodel->outputs().size(); i < num_outputs; i++) {
for (std::size_t i = 0u; i < num_outputs; i++) {
const auto& port = comp_submodel->outputs()[i];
const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs);
ov::npuw::dump_tensor(tnsr, out_base_name);
out_base_names.push_back(std::move(out_base_name));
}
ov::npuw::dump_output_list(comp_submodel_path, out_base_names);
} else {
// All outputs are considered spatial now so it should be easier
const auto& s = comp_submodel_desc.spatial.value();
for (std::size_t offset = 0u; offset < s.range; offset += s.nway) {
const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile
: (s.range - offset); // the last tile
std::vector<std::string> tile_olist;
for (std::size_t i = 0u; i < num_outputs; i++) {
std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs) +
"_d" + ov::npuw::util::fmt(s.out_dim, 10) + "_" +
ov::npuw::util::fmt(offset, s.range);
const auto& tnsr = m_spatial_io[real_idx].outputs.at(i);
const auto& view = ov::npuw::util::view(tnsr, s.out_dim, offset, this_len);

ov::npuw::dump_tensor(view, out_base_name);
tile_olist.push_back(std::move(out_base_name));
}
// Dump olist per tile
ov::npuw::dump_output_list(comp_submodel_path, tile_olist);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
// FROM(Every subrequests' output port) TO(Its output tensor)
std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;

// FIXME: Currently is initialized/managed by subclass as well.
// Moved here for dumping purposes only.
// Another sparse vector. Represents populated spatial I/O parameters
// which should be read/written by parts in multiple submissions.
// An ugly structure, cries for refactoring.
// See function_prologue for details.
// Also it contains pre-allocated tensors for tails handling.
struct SpatialIO {
    // Note: `inputs` is sized by the graph-side inputs only (the connections
    // in the graph we care about), while `outputs` covers all subgraph
    // outputs - subgraph inputs formally include closures as well, which
    // are not tracked here.
    std::vector<ov::SoPtr<ov::ITensor>> inputs;   // # of elements - # of graph-side inputs
    std::vector<ov::SoPtr<ov::ITensor>> outputs;  // # of elements - # of subgraph outputs

    std::vector<ov::SoPtr<ov::ITensor>> input_tails;   // temporary buffers for input tails
    std::vector<ov::SoPtr<ov::ITensor>> output_tails;  // temporary buffers for output tails
};
std::vector<SpatialIO> m_spatial_io;  // sparse: populated only for spatial subgraphs

const std::size_t m_num_submodels;

void dump_input_tensors(std::size_t idx);
Expand Down
18 changes: 18 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,22 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
compiledFunctions.insert({subgraph._funcall, id});
m_compiled_submodels[id].model = fcn_template._model;
m_compiled_submodels[id].replaced_by = id; // FIXME: UGLY

// Fill in the spatial information, if it is present
if (fcn_template._spatial) {
using S = CompiledModelDesc::Spatial;
S s;
s.range = fcn_template._spatial->_range;
s.nway = fcn_template._spatial->_slice;
s.out_dim = fcn_template._spatial->_out_dim;
s.nway_iters = s.range / s.nway;
s.tail_size = s.range % s.nway;
for (auto&& input : fcn_template._spatial->_inputs) {
std::size_t p_idx = fcn_template._model->get_parameter_index(input.param);
s.params.push_back(S::Param{p_idx, input.dim});
}
m_compiled_submodels[id].spatial = std::move(s);
}
LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall);
} else {
// ...and refer to it in other calls
Expand Down Expand Up @@ -824,6 +840,8 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
// FIX: was bound to `spatial` (copy-paste bug), which broke property
// queries for NPUW_SPATIAL_NWAY and shadowed the NPUW_SPATIAL binding
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
Expand Down
18 changes: 16 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,24 @@ class CompiledModel : public ov::ICompiledModel {

std::optional<std::size_t> replaced_by;

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
Subgraph::Gather host_gather;
// Spatial execution parameters for a function body; populated only when
// the function template carries spatial information (see CompiledModel ctor).
struct Spatial {
    // A graph parameter processed by parts (tiles) over some dimension
    struct Param {
        std::size_t idx;  // parameter index in the function body's model
        std::size_t dim;  // the dimension this parameter is sliced over
    };
    std::vector<Param> params;  // inputs which are processed spatially
    std::size_t range = 0u;     // total extent of the spatial dimension
    std::size_t nway = 0u;      // tile (submission) size, from NPUW_SPATIAL_NWAY
    std::size_t out_dim = 0u;   // the spatial dimension in the outputs

    // Derived values, precomputed at compile time for convenience:
    std::size_t nway_iters = 0u;  // number of full tiles (range / nway)
    std::size_t tail_size = 0u;   // size of the last partial tile (range % nway)
};
std::optional<Spatial> spatial;

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
std::size_t param_base = 0;
std::vector<ov::Tensor> closure;
std::vector<ov::Tensor> scales;
Expand Down
Loading
Loading