diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 7b0dab3d16da3c..dc1b4a4e4244ca 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -59,6 +59,7 @@ DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime); DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime); DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime); DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime); +DEFINE_OPT(NPUW_ACC_DUMP_FAILS, bool, false, npuw::accuracy::dump_failures, RunTime); DEFINE_OPT(NPUW_DUMP_FULL, bool, false, npuw::dump::full, CompileTime); DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime); DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index 67dce9621bfb4e..52ac711e342dc5 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -307,6 +307,14 @@ static constexpr ov::Property threshold{"NPUW_ACC_THRESH"}; * Default value: empty. */ static constexpr ov::Property reference_device{"NPUW_ACC_DEVICE"}; + +/** + * @brief + * Type: bool. + * Enable dumps of materials for model(s), failing accuracy check. + * Default value: false. + */ +static constexpr ov::Property dump_failures{"NPUW_ACC_DUMP_FAILS"}; } // namespace accuracy namespace dump { diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 6a519a0f754a32..8a37449213d274 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -44,6 +44,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); #ifdef NPU_PLUGIN_DEVELOPER_BUILD desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp index 4440027c818969..13294ac521f122 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp @@ -13,30 +13,47 @@ ov::npuw::metrics::NRMSE::NRMSE(double threshold) : m_threshold(threshold) {} bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr& actual, - const ov::SoPtr& reference) const { - NPUW_ASSERT(actual->is_continuous()); - NPUW_ASSERT(reference->is_continuous()); + const ov::SoPtr& reference, + double* result) const { NPUW_ASSERT(actual->get_shape() == reference->get_shape()); // Check for alignment: NPUW_ASSERT(actual->get_byte_size() == reference->get_byte_size()); - // FIXME: Check for strides + + ov::Tensor in_actual(actual->get_element_type(), actual->get_shape()); + ov::Tensor in_reference(reference->get_element_type(), reference->get_shape()); + + if (!actual->is_continuous()) { + ov::make_tensor(actual).copy_to(in_actual); + } else { + in_actual = ov::make_tensor(actual); + } + if (!reference->is_continuous()) { + ov::make_tensor(reference).copy_to(in_reference); + } else { + in_reference = ov::make_tensor(reference); + } + + // TODO: it might be more 
correct to make the to_f32 function + // work with strided tensors + NPUW_ASSERT(in_actual.is_continuous()); + NPUW_ASSERT(in_reference.is_continuous()); ov::Tensor actual_f32; ov::Tensor reference_f32; - if (ov::element::Type_t::f32 == actual->get_element_type()) { - actual_f32 = ov::make_tensor(actual); + if (ov::element::f32 == in_actual.get_element_type()) { + actual_f32 = in_actual; } else { - ov::Tensor dst(ov::element::Type_t::f32, actual->get_shape()); - ov::npuw::util::to_f32(ov::make_tensor(actual), dst); + ov::Tensor dst(ov::element::Type_t::f32, in_actual.get_shape()); + ov::npuw::util::to_f32(in_actual, dst); actual_f32 = std::move(dst); } - if (ov::element::Type_t::f32 == reference->get_element_type()) { - reference_f32 = ov::make_tensor(reference); + if (ov::element::f32 == in_reference.get_element_type()) { + reference_f32 = in_reference; } else { - ov::Tensor dst(ov::element::Type_t::f32, reference->get_shape()); - ov::npuw::util::to_f32(ov::make_tensor(reference), dst); + ov::Tensor dst(ov::element::Type_t::f32, in_reference.get_shape()); + ov::npuw::util::to_f32(in_reference, dst); reference_f32 = dst; } @@ -51,13 +68,21 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr& actual, } if (squared_error <= std::numeric_limits::epsilon()) { - LOG_INFO("NRMSE loss: 0.0, threshold: " << m_threshold << "."); - LOG_INFO("PASS"); + if (result != nullptr) { + *result = 0.0; + } return true; } double rmse = sqrt(squared_error / size); - NPUW_ASSERT(rmse >= 0.0); + + if (rmse < 0.0) { + // The calculated RMSE metric is < 0.0, which is unexpected, so report the tensors as unequal. + if (result != nullptr) { + *result = rmse; + } + return false; + } auto actual_min_max = std::minmax_element(actual_data, actual_data + size); auto reference_min_max = std::minmax_element(reference_data, reference_data + size); @@ -66,9 +91,8 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr& actual, std::max(0.f, *actual_min_max.second) - std::min(0.f, *actual_min_max.first)}); double nrmse = rmse / den; - LOG_INFO("NRMSE loss: " << nrmse << ", threshold: " << m_threshold << "."); - - bool success = nrmse <= m_threshold; - LOG_INFO((success ? 
"PASS" : "FAIL")); - return success; + if (result != nullptr) { + *result = nrmse; + } + return nrmse <= m_threshold; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp index e77a38ced0edc2..1d0182582946c3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp @@ -15,8 +15,9 @@ namespace metrics { class NRMSE { public: explicit NRMSE(double threshold); - bool operator()(const ov::SoPtr& backup_tensor, const ov::SoPtr& original_tensor) const; - + bool operator()(const ov::SoPtr& backup_tensor, + const ov::SoPtr& original_tensor, + double* result = nullptr) const; private: double m_threshold{}; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 216b1a35b4315c..75796186095c7d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -78,7 +78,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer " "requests if 'nireq' is set to > 1!"); } - LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "..."); + LOG_INFO("Create reference subrequest for Subgraph[" << id << "] on " << m_npuw_model->m_ref_device << "..."); LOG_BLOCK(); if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) { auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model; @@ -88,66 +88,225 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re m_ref_subrequests.at(id) = std::move(ref_infer_request); LOG_INFO("Done"); } else { - LOG_INFO("Skip creation of reference subrequest for submodule[" - << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest [" - << id << "] has been already created on " - << "it ."); + LOG_INFO("Skip creation of reference subrequest for Subgraph[" + << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest [" + << id << "] has been already created on " + << "it ."); } } return rqs; } -void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) { +namespace { + void set_inputs(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < from_comp_model->inputs().size(); i++) { + const auto& itnsr = from->get_tensor(from_comp_model->inputs()[i]); + to->set_tensor(to_comp_model->inputs()[i], itnsr); + } + } + + void copy_results(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < to_comp_model->outputs().size(); i++) { + const auto& from_tnsr = from->get_tensor(from_comp_model->outputs()[i]); + const auto& to_tnsr = to->get_tensor(to_comp_model->outputs()[i]); + from_tnsr->copy_to(to_tnsr._ptr); + } + } + + std::stringstream create_launch_msg(std::size_t idx, std::size_t real_idx) { + std::stringstream log_msg_stream; + log_msg_stream << "Launching subrequest[" << idx << "]" << + ((real_idx == idx) ? 
std::string("...").c_str() : + std::string(std::string(", which is actually subrequest[") + + std::to_string(real_idx) + "]").c_str()); + return log_msg_stream; + } +} // anonymous namespace + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, std::size_t offset, + std::size_t len, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->infer(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + if (m_npuw_model->m_compiled_submodels[real_subidx].spatial && len != 0) { + log_msg_stream << ", on range : [" << offset << ", " << offset + len << ")"; + } + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->infer(); + copy_results(ref_subr, act_subr); + } else { + act_subr->infer(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, bool& accuracy_failover) { + try_accurate_subinfer(subidx, 0, 0, accuracy_failover); +} + +void ov::npuw::IBaseInferRequest::try_accurate_substart_async(std::size_t subidx) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->start_async(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->start_async(); + } else { + act_subr->start_async(); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subwait(std::size_t subidx, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->wait(); + return; + } + + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + ref_subr->wait(); + copy_results(ref_subr, act_subr); + } else { + act_subr->wait(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) { + if (!m_npuw_model->m_acc_check) { + return; + } + LOG_INFO("Check if subrequest[" << idx << "] is accurate..."); LOG_BLOCK(); - failover = false; - if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) { - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false); - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx); - - const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model(); - const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model(); - 
NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size()); - // Setting inputs: - for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) { - const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]); - m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor); - } - m_ref_subrequests.at(idx)->infer(); - LOG_INFO("Compare actual outputs against references:"); - bool tensors_converge = true; - for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + std::size_t real_idx = real(idx); + OPENVINO_ASSERT(m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref == false); + + if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) { + LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device."); + return; + } + + accuracy_failover = false; + auto& actual_subr = m_subrequests.at(real_idx); + auto& ref_subr = m_ref_subrequests.at(real_idx); + + // Setting inputs: + set_inputs(actual_subr, ref_subr); + + // Running inference: + ref_subr->infer(); + + // Comparing results of the actual and reference inferences: + LOG_INFO("Compare actual outputs against references:"); + bool tensors_converge = true; + const auto& actual_comp_model = actual_subr->get_compiled_model(); + const auto& ref_comp_model = ref_subr->get_compiled_model(); + std::vector<bool> converges(actual_comp_model->outputs().size()); + std::vector<double> metrics(actual_comp_model->outputs().size()); + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]); + const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]); + converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]); + tensors_converge &= converges[i]; + } + if (tensors_converge == false) { + if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) { + // At the Error log level the LOG_INFO header above is not printed, so repeat it here: + LOG_ERROR("Check if subrequest[" << idx << "] is accurate..."); + } + } + // Log comparison details: + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + if (converges[i]) { LOG_INFO(" - " << actual_comp_model->outputs()[i]); - const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]); - const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]); LOG_BLOCK(); - tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor); - } - LOG_INFO((tensors_converge ? "PASS" : "FAIL")); - - if (!tensors_converge) { - LOG_INFO("Subrequest is inaccurate, failover to reference."); - // FIXME: We need to copy reference tensors to actual only in single-model-inference mode - // or if our subgraph is last in the chain. 
- for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { - const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]); - const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]); - ref_tensor->copy_to(actual_tensor._ptr); - } - m_npuw_model->m_compiled_submodels.at(idx).compiled_model = - m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model; - m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true; - m_subrequests.at(idx) = m_ref_subrequests.at(idx); - update_subrequest_links(idx); - failover = true; + LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_INFO("PASS"); + } else { + LOG_ERROR(" - " << actual_comp_model->outputs()[i]); + LOG_BLOCK(); + LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_ERROR("FAIL"); } + } - LOG_INFO("Done"); + // If comparison fails, copy reference results to original tensors and mark subgraph as + // switched to reference: + if (tensors_converge) { + LOG_INFO("PASS"); } else { - LOG_INFO("Skipped, subrequest is launched on reference device."); + LOG_ERROR("FAIL"); + LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failing over to reference results."); + if (idx != real_idx) { + LOG_ERROR("As subrequest[" << idx << "] is actually subrequest[" << real_idx << + "], all subrequests corresponding to the latter will be further " << + "launched on " << m_npuw_model->m_ref_device << "."); + } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) { + LOG_ERROR("As subrequest[" << real_idx << "] is actually a function, all " << + "subrequests corresponding to it will be further launched on " << + m_npuw_model->m_ref_device << "."); + } + + if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) { + const auto model = m_npuw_model->m_compiled_submodels[real_idx].model; + const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml"; + ov::save_model(model, model_path); + dump_input_tensors(idx, true); + dump_output_tensors(idx, true); + } + + // Due to the complex memory management logic, it is safe to just copy + // results back to the already properly allocated and linked tensors: + copy_results(ref_subr, actual_subr); + m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref = true; + accuracy_failover = true; } + + LOG_INFO("Done"); } ov::SoPtr ov::npuw::IBaseInferRequest::get_tensor(const ov::Output& port) const { @@ -192,27 +351,23 @@ void ov::npuw::IBaseInferRequest::infer() { run_subrequest_for_success(idx, failover); failover_happened |= failover; complete_subrequest(idx); - if (m_npuw_model->m_acc_check) { - ensure_subrequest_is_accurate(idx, failover); - failover_happened |= failover; - } } // Increment counter regardless if dumps etc are enabled or not. 
m_run_iter++; if (failover_happened) { - LOG_INFO("Refined device distribution:"); + LOG_ERROR("Refined device distribution:"); LOG_BLOCK(); - m_npuw_model->log_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } m_now_idx.reset(); } -void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); - if (!ov::npuw::util::is_set(idx, dump_ios_opt, end_idx)) { + if (!ov::npuw::util::is_set(idx, dump_ios_opt, end_idx) && !forced) { return; } @@ -245,7 +400,7 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const auto& s = comp_submodel_desc.spatial.value(); std::set spatial_param_idx; - std::vector in_base_names_nonspat; + std::vector in_base_names(num_inputs); // First, dump the non-spatial input tensors just once - and remember its names for (auto&& p : s.params) { @@ -259,7 +414,7 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs); ov::npuw::dump_tensor(tnsr, in_base_name); - in_base_names_nonspat.push_back(std::move(in_base_name)); + in_base_names[i] = std::move(in_base_name); } // Now iterate over the spatial range and dump the individual tiles @@ -268,8 +423,11 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { for (std::size_t offset = 0u; offset < s.range; offset += s.nway) { const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile : (s.range - offset); // the last tile + if (m_spatial_selector != nullptr && !m_spatial_selector->need_submit(offset, this_len)) { + continue; + } + // Copy the base file list to start with it - std::vector tile_ilist(in_base_names_nonspat); for (auto&& p : s.params) { std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(p.idx, num_inputs) + "_d" + ov::npuw::util::fmt(p.dim, 10) + "_" + @@ -279,18 +437,20 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { const auto& view = ov::npuw::util::view(tnsr, p.dim, offset, this_len); ov::npuw::dump_tensor(view, in_base_name); - tile_ilist.push_back(std::move(in_base_name)); + in_base_names[p.idx] = std::move(in_base_name); } // Dump ilist per tile - ov::npuw::dump_input_list(comp_submodel_path, tile_ilist); + std::string tile_ilist_name = comp_submodel_path + "_" + + ov::npuw::util::fmt(offset, s.range); + ov::npuw::dump_input_list(tile_ilist_name, in_base_names); } // for(offset) } } -void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); - if (!ov::npuw::util::is_set(idx, dump_ios_opt, end_idx)) { + if (!ov::npuw::util::is_set(idx, dump_ios_opt, end_idx) && !forced) { return; } @@ -336,7 +496,9 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { tile_olist.push_back(std::move(out_base_name)); } // Dump olist per tile - ov::npuw::dump_output_list(comp_submodel_path, tile_olist); + std::string tile_olist_name = comp_submodel_path + "_" + + 
ov::npuw::util::fmt(offset, s.range); + ov::npuw::dump_output_list(tile_olist_name, tile_olist); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 6be64d676d6149..b054b98dd29b18 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -15,6 +15,7 @@ #include "openvino/runtime/isync_infer_request.hpp" #include "openvino/runtime/so_ptr.hpp" #include "perf.hpp" +#include "spatial.hpp" namespace ov { namespace npuw { @@ -62,8 +63,12 @@ class IBaseInferRequest : public ov::ISyncInferRequest { // their inference requests anymore - they must be stored // only once in the subrequests list RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1, bool* recompiled = nullptr); - void ensure_subrequest_is_accurate(std::size_t idx, bool& failover); - virtual void update_subrequest_links(std::size_t idx) = 0; + void try_accurate_subinfer(std::size_t idx, bool& accuracy_failover); + void try_accurate_subinfer(std::size_t idx, std::size_t offset, std::size_t len, + bool& accuracy_failover); + void try_accurate_substart_async(std::size_t idx); + void try_accurate_subwait(std::size_t idx, bool& accuracy_failover); + void ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover); std::shared_ptr m_npuw_model; std::vector m_completion_cbs; @@ -107,10 +112,15 @@ class IBaseInferRequest : public ov::ISyncInferRequest { }; std::vector m_spatial_io; + // FIXME: Currently this is initialized/managed by the subclass as well. + // Moved here for dumping purposes only. + // Represents spatial run-time info + runtime::spatial::Selector::Ptr m_spatial_selector; + const std::size_t m_num_submodels; - void dump_input_tensors(std::size_t idx); - void dump_output_tensors(std::size_t idx); + void dump_input_tensors(std::size_t idx, bool forced = false); + void dump_output_tensors(std::size_t idx, bool forced = false); // Quick-and-dirty profiling ov::npuw::perf::metric m_ms_unpack; @@ -131,11 +141,11 @@ class IBaseInferRequest : public ov::ISyncInferRequest { std::size_t next(std::size_t idx_base) const; std::size_t real(std::size_t idx) const; - RqPtrs m_ref_subrequests; - using now_t = std::optional; now_t now_idx() const; + RqPtrs m_ref_subrequests; + private: now_t m_now_idx; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index b52dd40ea59364..da7db76145d324 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -109,6 +109,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const double threshold_opt = m_cfg.get<::intel_npu::NPUW_ACC_THRESH>(); m_acc_check = metrics::NRMSE(threshold_opt); + m_acc_check_name = "NRMSE"; + m_acc_check_threshold = threshold_opt; m_ref_device = m_cfg.getString<::intel_npu::NPUW_ACC_DEVICE>(); LOG_INFO("Accuracy check is enabled."); } @@ -377,8 +379,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } } - m_compiled_submodels[id].device_it = - id != real_id ? 
m_compiled_submodels[real_id].device_it : m_dev_list.cbegin(); + m_compiled_submodels[id].device_it = m_dev_list.cbegin(); if (forced_sub_devices.count(id)) { std::string forced_device = forced_sub_devices[id]; @@ -776,7 +777,7 @@ std::string ov::npuw::CompiledModel::submodel_device(const std::size_t idx) cons return *comp_subm_desc.device_it; } -void ov::npuw::CompiledModel::log_device_dist() const { +void ov::npuw::CompiledModel::log_device_dist(ov::npuw::LogLevel log_lvl) const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -791,14 +792,32 @@ void ov::npuw::CompiledModel::log_device_dist() const { stat.ops += real_cm.stat.ops; } - auto print_stats = [this](const std::string& device, const execution_stats& stat) { + auto print_stats = [this, log_lvl](const std::string& device, const execution_stats& stat) { float flops_prcnt = 100.f; float ops_prcnt = 100.f; if (m_total_stat.gflops > 0 && m_total_stat.ops > 0) { flops_prcnt = stat.gflops / static_cast(m_total_stat.gflops) * 100; ops_prcnt = stat.ops / static_cast(m_total_stat.ops) * 100; } - LOG_INFO(device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"); + std::stringstream log_msg; + log_msg << device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"; + switch (log_lvl) { + case LogLevel::Error: + LOG_ERROR(log_msg.str()); + break; + case LogLevel::Warning: + LOG_WARN(log_msg.str()); + break; + case LogLevel::Info: + LOG_INFO(log_msg.str()); + break; + case LogLevel::Verbose: + LOG_VERB(log_msg.str()); + break; + case LogLevel::Debug: + LOG_DEBUG(log_msg.str()); + break; + } }; for (auto&& device_st : stats_for_devices) { LOG_BLOCK(); @@ -940,6 +959,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::accuracy::check, NPUW_ACC_CHECK), BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH), BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE), + BIND(npuw::accuracy::dump_failures, NPUW_ACC_DUMP_FAILS), #ifdef NPU_PLUGIN_DEVELOPER_BUILD BIND(npuw::dump::full, NPUW_DUMP_FULL), BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 6199ac66c0c64e..d629da2a1b42de 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -71,7 +71,7 @@ class CompiledModel : public ov::ICompiledModel { std::string submodel_device(const std::size_t idx) const; - void log_device_dist() const; + void log_device_dist(ov::npuw::LogLevel log_lvl = ov::npuw::LogLevel::Info) const; void implement_properties(); @@ -149,7 +149,9 @@ class CompiledModel : public ov::ICompiledModel { }; std::vector m_compiled_submodels; - std::function&, const ov::SoPtr&)> m_acc_check; + std::function&, const ov::SoPtr&, double*)> m_acc_check; + std::string m_acc_check_name; + double m_acc_check_threshold; std::string m_ref_device; execution_stats m_total_stat; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 0e0b96582a663c..4cde73b3ea541f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -129,7 +129,7 @@ void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) { // - Look for an output tensor to reuse // - If there's one, assign it to this allocation // - If there's none, allocate a new 
tensor - // - How a tensor to reuse is piced: + // - How a tensor to reuse is picked: // 1. It should exist // 2. It's "remaining reads" count should be 0 (all planned reads // happened at this point). @@ -265,6 +265,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrlog_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } // Identify connections for the funcall pipeline, if needed @@ -820,11 +821,11 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) } void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + std::size_t real_idx = real(idx); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const auto is_piped = is_pipelined(idx); - auto new_rqs = create_infer_requests(idx, is_piped ? 2 : 1); + const auto is_piped = is_pipelined(real_idx); + auto new_rqs = create_infer_requests(real_idx, is_piped ? 2 : 1); // NB: Regardless if this subrequest was a function call // or not, always use the real_idx here - for regular @@ -841,13 +842,13 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { // overkill - only affected subrequest(s) could be updated instead, // but it is a more complex thing and can be implemented separately connect_subrequests(); - m_subrequest_devices[idx] = *comp_model_desc.device_it; + m_subrequest_devices[real_idx] = *comp_model_desc.device_it; } void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, bool& failover) { failover = false; - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + bool accuracy_failover = false; + auto real_idx = real(idx); // Infer is also fail-safe... bool job_done = false; @@ -870,7 +871,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo // the subrequest' outputs to global Results, if relevant. bind_global_results(idx); - if (comp_model_desc.replaced_by) { + if (m_npuw_model->m_compiled_submodels[idx].replaced_by) { function_prologue(idx); } if (!dump_in) { @@ -881,7 +882,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo try { LOG_DEBUG("Trying to run subrequest[" << idx << "]..."); LOG_BLOCK(); - unsafe_run_this_prep_next(idx, next_prepared); + unsafe_run_this_prep_next(idx, next_prepared, accuracy_failover); job_done = true; LOG_DEBUG("Done: " << idx << "(exec subrequest)"); } catch (const std::exception& ex) { @@ -896,7 +897,8 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo LOG_INFO("- Trying next device..."); // Altering iterators here!! Contracts should be changed! - comp_model_desc.device_it++; + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + proto_comp_model_desc.device_it++; if (!m_npuw_model->compile_for_success(real_idx)) { OPENVINO_THROW("Failed to compile. 
No more devices are left!"); } @@ -912,36 +914,41 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo std::swap(m_subrequests[real_idx], m_funcall_pipeline[real_idx].subrequest); } } + + failover |= accuracy_failover; } -void ov::npuw::JustInferRequest::unsafe_during(std::size_t real_idx, const std::function& f) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_during(std::size_t idx, + const std::function& f, + bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Non-spatial execution: trigger request asynchronously, run `f` in this context - auto& r = m_subrequests[real_idx]; - r->start_async(); + try_accurate_substart_async(idx); f(); // expect noexcept - r->wait(); + try_accurate_subwait(idx, accuracy_failover); } else { // Spatial execution... Do the opposite - run f asynchronously, and meanwhile run the // spatial inference auto future = std::async(std::launch::async, f); - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); future.wait(); } } -void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - auto& r = m_subrequests[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_infer(std::size_t idx, bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Run normally - r->infer(); + try_accurate_subinfer(idx, accuracy_failover); } else { + auto& r = m_subrequests[real_idx]; // Run over the specified range... Note: the full inputs/outputs // must be prepared in the m_spatial_io at this point - const auto& spatial = comp_model_desc.spatial.value(); - const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + const auto& spatial = proto_comp_model_desc.spatial.value(); + const auto num_outputs = proto_comp_model_desc.compiled_model->outputs().size(); NPUW_ASSERT(m_spatial_selector); // Create a sparse vector with full input sizes. @@ -949,7 +956,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // number of input parameters (activations) so some slots may be // not used here. 
// FIXME: All these preparations could be done statically (just once) - std::vector full_in_shapes(comp_model_desc.param_base); + std::vector full_in_shapes(proto_comp_model_desc.param_base); for (auto&& param : spatial.params) { full_in_shapes[param.idx] = m_spatial_io[real_idx].inputs.at(param.idx)->get_shape(); } @@ -974,7 +981,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Collect spatial inputs for this offset for (auto&& param : spatial.params) { - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; r->set_tensor( iport, ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), param.dim, offset, spatial.nway)); @@ -982,7 +989,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the spatial outputs for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), spatial.out_dim, @@ -991,7 +998,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } // for(outputs) // Now run the part - r->infer(); + try_accurate_subinfer(idx, offset, spatial.nway, accuracy_failover); } // for(full_nway_times) // Now process the tail, if required @@ -1004,7 +1011,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { offset, spatial.tail_size); - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].input_tails.at(param.idx), param.dim, 0, @@ -1016,16 +1023,16 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the tail tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, m_spatial_io[real_idx].output_tails.at(out_idx)); } // for(outputs) // Now run the tail infer - r->infer(); + try_accurate_subinfer(idx, offset, spatial.tail_size, accuracy_failover); // Now copy the views from the output full-nway tensor to the output tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; auto spatial_tensor_shape = oport.get_shape(); auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].output_tails.at(out_idx), @@ -1043,7 +1050,8 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } } -void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared) { +void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, + bool& accuracy_failover) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); const std::size_t next_idx = next(idx + 1); @@ -1057,18 +1065,18 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (is_pipelined(real_idx)) { // function pipelining is here! and the next rq is ours. 
NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx); - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << next_idx << "]..."); LOG_BLOCK(); // Note: do it here unconditionally - if this request fails, // have to resubmit all the data to the recompiled pair anyway bind_global_parameters(next_idx); unpack_closure(next_idx, m_funcall_pipeline[real_idx].subrequest); - }); + }, accuracy_failover); } else { // Function pipelining is not used. THIS infer request // is also the NEXT one. Nothing much to do here - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); bind_global_parameters(next_idx); } } else { @@ -1078,9 +1086,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (next_idx == 0) { // Note: even if m_function_pipelining is ON, // SWAP won't happen here - see the below check for .next - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; @@ -1091,21 +1099,21 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool LOG_BLOCK(); unpack_closure(my_next_idx, m_funcall_pipeline[real_idx].subrequest); } - }); + }, accuracy_failover); } } } else { // This is a regular subgraph. Start it async to prepare the next // parameters if (next_idx == 0) { - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; } - }); + }, accuracy_failover); } } // if (replaced_by) } @@ -1148,10 +1156,6 @@ bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } -void ov::npuw::JustInferRequest::update_subrequest_links(std::size_t) { - connect_subrequests(); -} - bool ov::npuw::JustInferRequest::is_pipelined(std::size_t idx) const { const auto& desc = m_npuw_model->m_compiled_submodels[real(idx)]; return m_use_function_pipelining && desc.replaced_by && !desc.forced_to_fcall; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index d219f170a8e6bb..697925f4f1b652 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -94,8 +94,6 @@ class JustInferRequest final : public IBaseInferRequest { std::size_t total_subrequests() const override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; - //////////////////////////////////// // now own API @@ -108,9 +106,9 @@ class JustInferRequest final : public IBaseInferRequest { void function_prologue(std::size_t idx); void unpack_closure(std::size_t idx, RqPtr request); - void unsafe_during(std::size_t real_idx, const std::function& f); - void unsafe_infer(std::size_t real_idx); - void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared_p); + void unsafe_during(std::size_t idx, const std::function& f, bool& accuracy_failover); + void unsafe_infer(std::size_t idx, bool& accuracy_failover); + void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, bool& accuracy_failover); void connect_subrequests(); void recreate_subrequests(std::size_t idx); @@ -151,9 +149,6 @@ class JustInferRequest final : public IBaseInferRequest { 
std::unordered_set m_input_allocated; - // Represents spatial run-time info - runtime::spatial::Selector::Ptr m_spatial_selector; - // Cached check if we do FOLDing and need to update closures in the repeating blocks bool m_closure_update_required = false; };
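For reference, a minimal usage sketch of the accuracy-check options extended by this patch (not part of the diff itself). It assumes the private properties header is reachable at the include path below, that the properties live in the ov::intel_npu::npuw::accuracy namespace, and that the "NPU_USE_NPUW" key routes compilation through NPUW; adjust to the actual build.

#include <openvino/openvino.hpp>
#include "intel_npu/npuw_private_properties.hpp"  // private NPUW properties (assumed include path)

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // illustrative model path

    // Enable the NPUW accuracy check against a CPU reference and dump failing subgraphs.
    ov::AnyMap npuw_config = {
        {"NPU_USE_NPUW", "YES"},                                          // assumed NPUW routing key
        {ov::intel_npu::npuw::accuracy::check.name(), true},              // NPUW_ACC_CHECK
        {ov::intel_npu::npuw::accuracy::threshold.name(), 0.05},          // NPUW_ACC_THRESH
        {ov::intel_npu::npuw::accuracy::reference_device.name(), "CPU"},  // NPUW_ACC_DEVICE
        {ov::intel_npu::npuw::accuracy::dump_failures.name(), true}       // NPUW_ACC_DUMP_FAILS (added above)
    };
    auto compiled = core.compile_model(model, "NPU", npuw_config);

    auto request = compiled.create_infer_request();
    request.infer();  // subgraphs failing the NRMSE check fail over to the reference device
    return 0;
}

With NPUW_ACC_DUMP_FAILS enabled, any subgraph failing the accuracy check is additionally saved as inaccurate_<name>.xml together with its input and output tensor dumps, as implemented in ensure_subrequest_is_accurate() above.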