From 7060669bbc95cde22baaae4b8f55bd588a512d92 Mon Sep 17 00:00:00 2001 From: Ekaterina Shiryaeva Date: Wed, 16 Oct 2024 16:30:45 +0100 Subject: [PATCH] NPUW: Fix for dynamic dispatch for AVX2 code --- .../intel_npu/src/plugin/CMakeLists.txt | 2 +- .../plugin/npuw/just_sync_infer_request.cpp | 17 +- .../intel_npu/src/plugin/npuw/lazy_tensor.cpp | 6 +- .../intel_npu/src/plugin/npuw/lazy_tensor.hpp | 1 - .../src/plugin/npuw/unpack_kernel.hpp | 1578 ----------------- .../intel_npu/src/plugin/npuw/util.cpp | 134 ++ .../intel_npu/src/plugin/npuw/util.hpp | 28 +- .../intel_npu/src/plugin/npuw/util_xarch.cpp | 1424 ++++++++++++++- .../intel_npu/src/plugin/npuw/util_xarch.hpp | 70 +- 9 files changed, 1646 insertions(+), 1614 deletions(-) delete mode 100644 src/plugins/intel_npu/src/plugin/npuw/unpack_kernel.hpp diff --git a/src/plugins/intel_npu/src/plugin/CMakeLists.txt b/src/plugins/intel_npu/src/plugin/CMakeLists.txt index 50e1d7aecaf935..749819b457c82c 100644 --- a/src/plugins/intel_npu/src/plugin/CMakeLists.txt +++ b/src/plugins/intel_npu/src/plugin/CMakeLists.txt @@ -70,7 +70,7 @@ cross_compiled_file(${TARGET_NAME} ARCH AVX2 ANY npuw/util_xarch.cpp API npuw/util_xarch.hpp - NAME unpack unpack_scale unpack_scale_zp to_f16 + NAME unpack_i4i8 unpack_u4i8 unpack_i4f16 unpack_i4f16_scale unpack_i4f16_z unpack_u4f16 unpack_u4f16_scale_zp unpack_u4f16_asymm_zp unpack_u4f16_z unpack_u4f32 unpack_i8f16 unpack_i8f16_scale unpack_u8f16 to_f16 NAMESPACE ov::npuw::util::XARCH ) diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 6f79643b65e6e7..c4e2c3ee98b676 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -18,7 +18,6 @@ #include "openvino/runtime/iasync_infer_request.hpp" #include "plugin.hpp" #include "util.hpp" -#include "util_xarch.hpp" #include "weights_bank.hpp" ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr& compiled_model) { @@ -777,18 +776,18 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request) if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) { // Unpacking this weight requires scaling with zero points... 
- ov::npuw::util::XARCH::unpack_scale_zp(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.zerops[cidx]), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.zerops[cidx]), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); } else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) { // Unpacking this weight requires scaling - ov::npuw::util::XARCH::unpack_scale(ov::get_tensor_impl(closure), - ov::get_tensor_impl(comp_model_desc.scales[cidx]), - clparam); + ov::npuw::util::unpack(ov::get_tensor_impl(closure), + ov::get_tensor_impl(comp_model_desc.scales[cidx]), + clparam); } else { // Unpacking this weight doesn't require scaling - ov::npuw::util::XARCH::unpack(ov::get_tensor_impl(closure), clparam); + ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp index 7dd036b172dbd0..8a0317a9f714e8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp @@ -207,9 +207,9 @@ ov::Tensor LazyTensorImpl::eval() const { const auto& ts = cs.get_orig_tensor(); ov::Tensor dst(type, shape); if (tw && tz && ts) { - ov::npuw::util::XARCH::unpack_scale_zp(gti(tw), gti(tz), gti(ts), gti(dst)); + ov::npuw::util::unpack(gti(tw), gti(tz), gti(ts), gti(dst)); } else if (tw && ts) { - ov::npuw::util::XARCH::unpack_scale(gti(tw), gti(ts), gti(dst)); + ov::npuw::util::unpack(gti(tw), gti(ts), gti(dst)); } else { NPUW_ASSERT(false && "Unsupported combination"); } @@ -224,7 +224,7 @@ ov::Tensor LazyTensorImpl::eval() const { case TransformType::PERMUTE: return ov::npuw::util::permute(m_parent->eval(), std::get>(m_transform.second)); case TransformType::CONVERT: - return ov::npuw::util::XARCH::to_f16(m_parent->eval()); + return ov::npuw::util::to_f16(m_parent->eval()); default: NPUW_ASSERT(false); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp index 0d3c51580bc6bd..5cdeeba058e45f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp @@ -14,7 +14,6 @@ #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" #include "util.hpp" -#include "util_xarch.hpp" namespace ov { namespace npuw { diff --git a/src/plugins/intel_npu/src/plugin/npuw/unpack_kernel.hpp b/src/plugins/intel_npu/src/plugin/npuw/unpack_kernel.hpp deleted file mode 100644 index 4f89657560e4a2..00000000000000 --- a/src/plugins/intel_npu/src/plugin/npuw/unpack_kernel.hpp +++ /dev/null @@ -1,1578 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// -#pragma once - -#include - -#include "logging.hpp" -#include "openvino/runtime/itensor.hpp" -#include "openvino/runtime/so_ptr.hpp" - -#ifdef UNPACK_PROFILING -# include "tbb/concurrent_unordered_map.h" -#endif - -namespace ov { -namespace npuw { -namespace util { -namespace XARCH { - -struct UnpackOptions { - bool bUseOvParallelFor; - size_t nPartitions; // if 0 we use 64 elements step in parallel for, otherwise target workload is dynamically - // calculated - bool bStrictPartitioning; // cannot reduce partitions in favor of speed - explicit UnpackOptions(bool useParallelFor, size_t nPartitions, bool bStrictPartitioning) - : 
bUseOvParallelFor(useParallelFor), - nPartitions(nPartitions), - bStrictPartitioning(bStrictPartitioning) {} -}; - -namespace { -#if defined(HAVE_AVX2) -inline int8_t hi4(int8_t x) { - return ((x & (1 << 7)) >> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); -} - -inline int8_t lo4(int8_t x) { - return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); -} -#endif - -inline uint8_t hi4(uint8_t x) { - return x >> 4; -} - -inline uint8_t lo4(uint8_t x) { - return x & 0xF; -} - -#if defined(HAVE_AVX2) -inline int8_t upc(int8_t h) { - return h | (-((h & (1 << 3)) >> 3) & (-8)); -} - -// NOTE: This routine implements the NEW ORDER -# define avx2_i4toi8(vinput, vout0, vout1) \ - { \ - __m256i himask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0xF0)); \ - __m256i lomask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0x0F)); \ - __m256i vsgmask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 1 << 3)); \ - __m256i vzero = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0)); \ - __m256i vextend = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, (-8))); \ - \ - __m256i vht = _mm256_and_si256(vinput, himask); \ - __m256i vhi = _mm256_srli_epi16(vht, 4); \ - __m256i vlo = _mm256_and_si256(vinput, lomask); \ - \ - __m256i vsghi = _mm256_srli_epi16(_mm256_and_si256(vhi, vsgmask), 3); \ - __m256i vsglo = _mm256_srli_epi16(_mm256_and_si256(vlo, vsgmask), 3); \ - __m256i vsubhi = _mm256_sub_epi8(vzero, vsghi); \ - __m256i vsublo = _mm256_sub_epi8(vzero, vsglo); \ - __m256i vhires = _mm256_or_si256(vhi, _mm256_and_si256(vsubhi, vextend)); \ - __m256i vlores = _mm256_or_si256(vlo, _mm256_and_si256(vsublo, vextend)); \ - \ - __m256i vunlo = _mm256_unpacklo_epi8(vlores, vhires); \ - __m256i vunhi = _mm256_unpackhi_epi8(vlores, vhires); \ - *vout0 = _mm256_permute2x128_si256(vunlo, vunhi, 0x20); \ - *vout1 = _mm256_permute2x128_si256(vunlo, vunhi, 0x31); \ - } - -inline __m128i avx2_i8tof16(__m128i vi8) { - __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32vec, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_i8tof16(__m128i vi8, __m256 s) { - __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32vec, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_u8tof16_hi(__m128i vu8, __m256 z, __m256 s) { - __m256i u32vec = _mm256_cvtepu8_epi32(vu8); // extend: 8 x u8 -> 8 x i32 [256b of 256b] - __m256 f32vec = _mm256_cvtepi32_ps(u32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -inline __m128i avx2_u8tof16_lo(__m128i vu8, __m256 z, __m256 s) { - __m128i vu8h = _mm_bsrli_si128(vu8, 8); - return avx2_u8tof16_hi(vu8h, z, s); -} - -inline __m128i avx2_u8tof16(__m128i vi8, __m256 z, __m256 s) { - __m256i i32vec = _mm256_cvtepu8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] - __m256 f32vec = 
_mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] - __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] - __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] - return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] -} - -// NOTE: This routine implements the NEW ORDER -inline void avx2_u4tof16(__m256i vinput, __m128i vout[8], __m256 zvalVec, __m256 svalVec[8]) { - // vinput - 64 x u4 elements - 256 bits - // vout[] - 64 (8x8) x f16 elements - - // NOTE: This is largely a copy of unpack_u4f16() {{ - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // unpacking with interleaving - __m256i vht = _mm256_and_si256(vinput, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - Extracting High Nibbles - __m256i xmmUnpackedHi = _mm256_and_si256(vinput, lomask); // 32 x i8 - Extracting Low Nibbles - - // need 4 portions of 16 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - Lower 16 of High Nibbles - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - Higher 16 of High Nibbles - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - Lower 16 of Low Nibbles - __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - Higher 16 of Low Nibbles - - // Rearranging of scales - __m256i indices = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - // Extracting all 64 scales as per the indices specified above - __m256 scale_v_rearranged[] = {_mm256_permutevar8x32_ps(svalVec[0], indices), - _mm256_permutevar8x32_ps(svalVec[1], indices), - _mm256_permutevar8x32_ps(svalVec[2], indices), - _mm256_permutevar8x32_ps(svalVec[3], indices), - _mm256_permutevar8x32_ps(svalVec[4], indices), - _mm256_permutevar8x32_ps(svalVec[5], indices), - _mm256_permutevar8x32_ps(svalVec[6], indices), - _mm256_permutevar8x32_ps(svalVec[7], indices)}; - - // Scaling should happen like this: - // low_nibble[0]->scale[0], high_nibble[0]->scale[1]...low_nibble[31]->scale[60],high_nibble[31]->scale[61] - - // Extracting all the even-indexed scales for the low nibbles - __m256 scale_v_even[] = { - _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x20), - _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x20), - }; - - // Extracting all the odd-indexed scales for the high nibbles - __m256 scale_v_odd[] = { - _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x31), - _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x31), - }; - - // converting to 64 x f16 - // Higher 16 of High Nibbles - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, scale_v_odd[2]), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, scale_v_odd[3])}; - // Lower 16 of High Nibbles - __m128i f16LoHi[] = {avx2_u8tof16_hi(unpacked32LoHi, zvalVec, scale_v_odd[0]), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, scale_v_odd[1])}; - // Higher 16 of Low Nibbles - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, 
scale_v_even[2]), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, scale_v_even[3])}; - // Lower 16 of Low Nibbles - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, scale_v_even[0]), - avx2_u8tof16_lo(unpacked32HiHi, zvalVec, scale_v_even[1])}; - - // interleaving back: - // Interleaving lower 8 of low nibbles with lower 8 of high nibbles and so on - vout[0] = _mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]); - vout[1] = _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]); - vout[2] = _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]); - vout[3] = _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]); - vout[4] = _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]); - vout[5] = _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]); - vout[6] = _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]); - vout[7] = _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1]); -} - -inline __m256 avx2_load_scale(const int8_t* data, ov::element::Type type) { - if (type == ov::element::f32) { - return _mm256_set1_ps(*reinterpret_cast(data)); - } else { - NPUW_ASSERT(type == ov::element::f16); - float val{}; - _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); - return _mm256_set1_ps(val); - } -} - -inline float avx2_load_f32(const int8_t* data, ov::element::Type type) { - if (type == ov::element::f32) { - return *reinterpret_cast(data); - } else { - NPUW_ASSERT(type == ov::element::f16); - float val{}; - _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); - return val; - } -} -#endif - -#ifdef UNPACK_PROFILING -class UnpackStat { - tbb::concurrent_unordered_map> inferenceTimes; - -public: - UnpackStat() {} - void addRecord(size_t sz, size_t time) { - inferenceTimes[sz].first++; - inferenceTimes[sz].second += time; - } - ~UnpackStat() { - for (auto&& r : inferenceTimes) { - std::cout << "work: " << r.first //<< ", stride: " << stride - << " overall_time = " << r.second.second / 1000 << " [ms]" - << " avg_atime = " << r.second.second / r.second.first << " [µs]\n"; - } - } -}; - -static UnpackStat ustat; -# define UNPACK_START_TICK() std::chrono::steady_clock::time_point _begin_tick = std::chrono::steady_clock::now(); -# define UNPACK_SAVE_TICK() \ - std::chrono::steady_clock::time_point _end_tick = std::chrono::steady_clock::now(); \ - ustat.addRecord(total, std::chrono::duration_cast(_end_tick - _begin_tick).count()); -#else -# define UNPACK_START_TICK() -# define UNPACK_SAVE_TICK() -#endif - -inline void unpack_i4i8(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - -#if defined(HAVE_AVX2) - // with vectorization above, we: - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 512 bits (= 64 bytes, = 64 i8 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = from->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int8_t* pDst = static_cast(to->data()); // 1 x i8 element - size_t stride = 64; - - auto unpack_body = [pSrc, pDst](size_t index, size_t stride) { - size_t halfStride = stride >> 1; - int8_t const* pSrcLocal = pSrc + halfStride * index; - int8_t* pDstLocal = pDst + stride * index; - - for (size_t j = 0; j < stride; j += 64) { - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m256i* outv0 = reinterpret_cast<__m256i*>(pDstLocal); - __m256i* outv1 = reinterpret_cast<__m256i*>(pDstLocal + 32); - - __m256i vout0, vout1; - 
avx2_i4toi8(inv, &vout0, &vout1); - - _mm256_storeu_si256(outv0, vout0); - _mm256_storeu_si256(outv1, vout1); - - pSrcLocal += 32; - pDstLocal += 64; - } - }; - - // ov work index / 64 - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 elements at least, - // so in terms of stride, it should be 64 * 2048 - minPartitions = total / (64 * 2048); - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in elements - this stride give us nPartitions + 1 partitions - stride = static_cast(total / minPartitions); - - // stride has to be 64 elements aligned to avoid gaps between workloads - stride = (stride >> 6) << 6; - // if number of partitions to large comparing to workload, min supported stride still have to be clamped to 64 - stride = stride < 64 ? 64 : stride; - } - - UNPACK_START_TICK(); - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(total / stride, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < total / stride; index++) { - unpack_body(index, stride); - } - } - // handle tail - size_t tailOffset = (static_cast(total / stride) * stride); - pSrc = static_cast(from->data()) + (tailOffset >> 1); - pDst = static_cast(to->data()) + tailOffset; - - for (std::size_t index = 0; index < ((total % 64) >> 1); index++) { - *(pDst++) = upc(lo4(*(pSrc))); - *(pDst++) = upc(hi4(*(pSrc))); - pSrc++; - } - UNPACK_SAVE_TICK(); -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4i8(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements - int8_t* pDst = static_cast(to->data()); // 1 x i8 element - - const std::size_t total = from->get_size(); - for (std::size_t index = 0; index < total; index += 2) { - pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] -- since OpenVINO 24.0! - pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] -- since OpenVINO 24.0! - pSrc++; - pDst += 2; - } -} - -inline void unpack_i4f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - -#if defined(HAVE_AVX2) - // This conversion combines i4toi8 (above) and i8tof16 (below). 
Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - std::size_t total = to->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - // bool tailOnly = total < 64; - - auto unpack_body = [pSrc, pDst](size_t index) { - int8_t const* pSrcLocal = pSrc + 32 * index; - int16_t* pDstLocal = pDst + 64 * index; - - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - - __m256i vout0, vout1; - avx2_i4toi8(inv, &vout0, &vout1); - - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - _mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - - __m128i i8vecs[8] = { - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0]), - avx2_i8tof16(i8vecs[1]), - avx2_i8tof16(i8vecs[2]), - avx2_i8tof16(i8vecs[3]), - avx2_i8tof16(i8vecs[4]), - avx2_i8tof16(i8vecs[5]), - avx2_i8tof16(i8vecs[6]), - avx2_i8tof16(i8vecs[7])}; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - }; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(total / 64, [&unpack_body](size_t index) { - unpack_body(index); - }); - } else { - for (std::size_t index = 0; index < total / 64; index++) { - unpack_body(index); - } - } - - // handle tail that is < 64 elements - size_t tailOffset = ((total >> 6) << 6); - pSrc = static_cast(from->data()) + (tailOffset >> 1); - pDst = static_cast(to->data()) + tailOffset; - - constexpr std::size_t VECSIZE = 8; - - total = ((total % 64) >> 1); - int8_t unpackedToI8[VECSIZE] = {0}; - size_t unpackedIdx = 0; - for (std::size_t index = 0; index < total; index++) { - unpackedToI8[unpackedIdx++] = upc(lo4(*(pSrc))); - unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc))); - if (unpackedIdx == VECSIZE) { - __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); - __m128i f16vec = avx2_i8tof16(i8vec); - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), f16vec); - pDst += VECSIZE; - unpackedIdx = 0; - } - pSrc += 1; - } - - // handle tail that is < 8 - if (unpackedIdx != 0) { - int16_t tmp[VECSIZE]; - __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); - __m128i f16vec = avx2_i8tof16(i8vec); - 
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), f16vec); - for (size_t i = 0; i != unpackedIdx; i++) { - pDst[i] = tmp[i]; - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_i4f16(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 2-channel (Symmetric) and 3-channel (group-wise) - // scale factors are supported. The scale/value loop - // iteration is based on stotal, so should work for - // both cases. - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } else { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == 1); - } - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - -#if defined(HAVE_AVX2) - // This conversion combines i4toi8 (above) and i8tof16 (below). Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - // TODO: handle tails - NPUW_ASSERT(elementsPerScale % 64 == 0); - - const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements - const int8_t* const pScl = static_cast(scale->data()); // either f16 or f32 - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [pSrc, pDst, pScl, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - int8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex != jobFinish; sindex++) { - __m256 svec = avx2_load_scale(pSclLocal, scale_elem_type); - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - - __m256i vout0, vout1; - avx2_i4toi8(inv, &vout0, &vout1); - - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - 
_mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - - __m128i i8vecs[8] = { - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svec), - avx2_i8tof16(i8vecs[1], svec), - avx2_i8tof16(i8vecs[2], svec), - avx2_i8tof16(i8vecs[3], svec), - avx2_i8tof16(i8vecs[4], svec), - avx2_i8tof16(i8vecs[5], svec), - avx2_i8tof16(i8vecs[6], svec), - avx2_i8tof16(i8vecs[7], svec)}; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x i4 - pDstLocal += 64; // note pDst is int16_t - } - pSclLocal += scale_elem_type.size(); - } - }; - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_i4f16_z(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[2] == from_shape[2]); - NPUW_ASSERT(scale_shape[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32); - -#if defined(HAVE_AVX2) - // This conversion combines i4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const size_t C = from_shape[from_shape.size() - 3]; - const size_t H = from_shape[from_shape.size() - 2]; - const size_t W = from_shape[from_shape.size() - 1]; - - const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements - const float* const pScl = static_cast(scale->data()); // 1 x f32 element - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [&](size_t job_index, size_t stride) { - size_t start_c = job_index * stride; - size_t end_c = std::min(C, start_c + stride); - - for (size_t c = start_c; c < end_c; ++c) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; w += 64) { - const int8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; - __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); - __m256i vout0, vout1; - avx2_i4toi8(vinput, &vout0, &vout1); - int8_t tmp[64]; // FIXME: Avoid it - __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); - __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); - _mm256_storeu_si256(tmpv0, vout0); - _mm256_storeu_si256(tmpv1, vout1); - __m128i i8vecs[8] = { - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), - _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), - }; - - const float* pScl_iter = pScl + w + W * c; - __m256 svalVec[8]; - for (int i = 0; i < 8; ++i) { - svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); - } - - __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svalVec[0]), - avx2_i8tof16(i8vecs[1], svalVec[1]), - avx2_i8tof16(i8vecs[2], svalVec[2]), - avx2_i8tof16(i8vecs[3], svalVec[3]), - avx2_i8tof16(i8vecs[4], svalVec[4]), - avx2_i8tof16(i8vecs[5], svalVec[5]), - avx2_i8tof16(i8vecs[6], svalVec[6]), - avx2_i8tof16(i8vecs[7], svalVec[7])}; - - int16_t* pDst_iter = pDst + w + W * h + W * H * c; - for (int i = 0; i < 8; ++i) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), vresults[i]); - } - } - } - } - }; - - size_t stride = C; - size_t num_jobs = 1; - - if (unpack_options.nPartitions) { - if (unpack_options.bStrictPartitioning) { - stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; - num_jobs = unpack_options.nPartitions; - } else { - stride = std::max(1, C / unpack_options.nPartitions); - num_jobs = (C + stride - 1) / stride; - } - } - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(num_jobs, [&](size_t job_index) { - unpack_body(job_index, stride); - }); - } else { - for (size_t job_index = 0; job_index < num_jobs; ++job_index) { - unpack_body(job_index, stride); - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 64 == 0); - -#if defined(HAVE_AVX2) - // This conversion combines u4i8 and i8tof16 unpacks. 
Here we - // - read 256 bits (= 32 bytes, = 64 i4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - for (std::size_t index = 0; index < total; index += 64) { - __m128i* outv[8] = { - reinterpret_cast<__m128i*>(pDst), - reinterpret_cast<__m128i*>(pDst + 8), - reinterpret_cast<__m128i*>(pDst + 16), - reinterpret_cast<__m128i*>(pDst + 24), - reinterpret_cast<__m128i*>(pDst + 32), - reinterpret_cast<__m128i*>(pDst + 40), - reinterpret_cast<__m128i*>(pDst + 48), - reinterpret_cast<__m128i*>(pDst + 56), - }; - - int8_t tmp[64]; // FIXME: Avoid it - for (std::size_t ii = 0; ii < 32; ii++) { - tmp[ii * 2] = static_cast(lo4(pSrc[ii])); // LSB is [0] -- since OpenVINO 24.0! - tmp[ii * 2 + 1] = static_cast(hi4(pSrc[ii])); // MSB is [1] -- since OpenVINO 24.0! - } - - __m128i vresults[8] = { - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48))), - avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56))), - }; - - _mm_storeu_si128(outv[0], vresults[0]); - _mm_storeu_si128(outv[1], vresults[1]); - _mm_storeu_si128(outv[2], vresults[2]); - _mm_storeu_si128(outv[3], vresults[3]); - _mm_storeu_si128(outv[4], vresults[4]); - _mm_storeu_si128(outv[5], vresults[5]); - _mm_storeu_si128(outv[6], vresults[6]); - _mm_storeu_si128(outv[7], vresults[7]); - - pSrc += 32; // shift pSrc only by 32 since it is 64 x i4 - pDst += 64; // note pDst is int16_t - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4f16(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // Only single-size ZP is supported - NPUW_ASSERT(zerop->get_size() == 1); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 2-channel (Symmetric) and 3-channel (group-wise) - // scale factors are supported. The scale/value loop - // iteration is based on stotal, so should work for - // both cases. 
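// --- Editor's illustrative sketch (not part of the original header; the helper
// --- name below is hypothetical). Per element, the vectorized u4 -> f16 path in
// --- this function computes (value - zero_point) * scale and rounds the result
// --- to f16 on store; the single u4 zero point is broadcast, while the scale is
// --- taken from the current row (2D layout) or group (3D layout).
#include <cstdint>
static inline float npuw_dequant_u4_ref(uint8_t packed, bool high_nibble, float zero_point, float scale) {
    const uint8_t nibble = high_nibble ? static_cast<uint8_t>(packed >> 4)   // MSB is element [1]
                                       : static_cast<uint8_t>(packed & 0x0F);  // LSB is element [0]
    return (static_cast<float>(nibble) - zero_point) * scale;  // caller converts this to f16
}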
- const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } else { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == 1); - } - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u4); - NPUW_ASSERT(scale_elem_type == ov::element::f16); - -#if defined(HAVE_AVX2) - // This conversion combines u4tof32 and f32tof16. Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const uint8_t* const pZer = static_cast(zerop->data()); // 1 x u4 element - const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - const float zval = static_cast(lo4(*pZer)); // MSB - since OpenVINO 24.0! - - __m256 zvalVec = _mm256_set1_ps(zval); - - auto unpack_body = [pSrc, pDst, pScl, zvalVec, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex < jobFinish; sindex++) { - __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); - - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m128i* outv[] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // loading 256 bit u4 into unalligned memory , so 64 elements - // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests - __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); - - // unpacking with interleaving - __m256i vht = _mm256_and_si256(xmmData, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 - - // need 4 portions of 8 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - __m128i 
unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - - // converting to 32 x f16 - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; - - __m128i f16LoHi[] = { - avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), - }; - - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; - - // interleaving back - __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), - _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; - - // store the results - _mm_storeu_si128(outv[0], interleaved[0]); - _mm_storeu_si128(outv[1], interleaved[1]); - _mm_storeu_si128(outv[2], interleaved[2]); - _mm_storeu_si128(outv[3], interleaved[3]); - _mm_storeu_si128(outv[4], interleaved[4]); - _mm_storeu_si128(outv[5], interleaved[5]); - _mm_storeu_si128(outv[6], interleaved[6]); - _mm_storeu_si128(outv[7], interleaved[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 - pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements - } // for(index) - pSclLocal += scale_elem_type.size(); - } // for(sindex) - }; - - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4f16_asymm_zp(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - // 3-channel (group-wise) scale factors are - // supported. 
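// --- Editor's illustrative sketch (not part of the original header; the helper
// --- name below is hypothetical). With asymmetric zero points, each scale group
// --- carries its own u4 zero point, packed two per byte: the loop below reads
// --- the low nibble for even group indices and the high nibble for odd ones,
// --- then applies (value - zero_point) * scale per element, as in this scalar
// --- reference:
#include <cstddef>
#include <cstdint>
static inline float npuw_dequant_u4_asymm_ref(uint8_t value_nibble,
                                              const uint8_t* zp_packed,
                                              std::size_t group_idx,
                                              float group_scale) {
    const uint8_t zp = (group_idx % 2 == 0) ? static_cast<uint8_t>(zp_packed[group_idx / 2] & 0x0F)
                                            : static_cast<uint8_t>(zp_packed[group_idx / 2] >> 4);
    return (static_cast<float>(value_nibble) - static_cast<float>(zp)) * group_scale;
}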
- - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - if (scale_shape.size() == 3) { - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[1] == from_shape[1]); - NPUW_ASSERT(scale_shape[2] == 1); - } - - const auto& zerop_shape = zerop->get_shape(); - NPUW_ASSERT(zerop_shape.size() == 3); - if (zerop_shape.size() == 3) { - NPUW_ASSERT(zerop_shape[0] == from_shape[0]); - NPUW_ASSERT(zerop_shape[1] == from_shape[1]); - NPUW_ASSERT(zerop_shape[2] == 1); - } - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u4); - NPUW_ASSERT(scale_elem_type == ov::element::f16); - -#if defined(HAVE_AVX2) - // This conversion combines u4tof32 and f32tof16. Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const std::size_t total = to->get_size(); - const std::size_t stotal = scale->get_size(); - const std::size_t elementsPerScale = total / stotal; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const uint8_t* const pZer = static_cast(zerop->data()); // 2 x u4 element - const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element - const int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - auto unpack_body = [pSrc, pDst, pScl, pZer, elementsPerScale, scale_elem_type, zerop_elem_type, stotal]( - std::size_t sindex, - std::size_t stride) { - // number of vectorized operations per scale - size_t elementsPerScaleVectorized = elementsPerScale / 64; - - uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; - int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; - uint8_t const* pZerLocal = pZer + zerop_elem_type.size() * sindex * stride / 2; - int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; - - // if it is last iteration current stride can be smaller - lets check that - sindex *= stride; - const auto jobFinish = std::min(sindex + stride, stotal); - - for (; sindex < jobFinish; sindex++) { - __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); - __m256 zvalVec = _mm256_set1_ps(static_cast((sindex % 2 == 0) ? 
lo4(*pZerLocal) : hi4(*pZerLocal))); - - for (std::size_t index = 0; index < elementsPerScale; index += 64) { - __m128i* outv[] = { - reinterpret_cast<__m128i*>(pDstLocal), - reinterpret_cast<__m128i*>(pDstLocal + 8), - reinterpret_cast<__m128i*>(pDstLocal + 16), - reinterpret_cast<__m128i*>(pDstLocal + 24), - reinterpret_cast<__m128i*>(pDstLocal + 32), - reinterpret_cast<__m128i*>(pDstLocal + 40), - reinterpret_cast<__m128i*>(pDstLocal + 48), - reinterpret_cast<__m128i*>(pDstLocal + 56), - }; - __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); - __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); - - // loading 256 bit u4 into unalligned memory , so 64 elements - // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests - __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); - - // unpacking with interleaving - __m256i vht = _mm256_and_si256(xmmData, himask); - __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 - - // need 4 portions of 8 x i8 elements - __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - - __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - - // converting to 32 x f16 - __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; - - __m128i f16LoHi[] = { - avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), - }; - - __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; - __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), - avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; - - // interleaving back - __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), - _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), - _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), - _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), - _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; - - // store the results - _mm_storeu_si128(outv[0], interleaved[0]); - _mm_storeu_si128(outv[1], interleaved[1]); - _mm_storeu_si128(outv[2], interleaved[2]); - _mm_storeu_si128(outv[3], interleaved[3]); - _mm_storeu_si128(outv[4], interleaved[4]); - _mm_storeu_si128(outv[5], interleaved[5]); - _mm_storeu_si128(outv[6], interleaved[6]); - _mm_storeu_si128(outv[7], interleaved[7]); - - pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 - pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements - } // for(index) - pSclLocal += scale_elem_type.size(); - if (sindex % 2 == 1) { - pZerLocal += zerop_elem_type.size(); - } - } // for(sindex) - }; - - size_t stride{1}; - - // since scaling is always 64 elements aligned operations, lets partition only in scale shape - if (unpack_options.nPartitions) { - std::size_t minPartitions; - if (!unpack_options.bStrictPartitioning) { - // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, - // so in terms of stride, it should be nElementsPerscale/64 * 2048 - const auto nIntrinsicsPerScale = 
elementsPerScale / 64u; - auto minScaleStride = 2048u / nIntrinsicsPerScale; - minScaleStride = std::max(1u, minScaleStride); - minPartitions = stotal / minScaleStride; - minPartitions = std::max(1u, minPartitions); - minPartitions = std::min(minPartitions, unpack_options.nPartitions); - } else { - minPartitions = unpack_options.nPartitions; - } - - // calculating stride in scale elements space - stride = static_cast(stotal / minPartitions); - } - - const size_t numWork = (stotal + stride - 1) / stride; - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(numWork, [unpack_body, stride](size_t index) { - unpack_body(index, stride); - }); - } else { - for (std::size_t index = 0; index < numWork; index++) { - unpack_body(index, stride); - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4f16_z(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - // Only single-size ZP is supported - NPUW_ASSERT(zerop->get_size() == 1); - - const auto& from_shape = from->get_shape(); - NPUW_ASSERT(from_shape.back() % 64 == 0); - - const auto& scale_shape = scale->get_shape(); - NPUW_ASSERT(scale_shape.size() == 3); - NPUW_ASSERT(scale_shape[0] == from_shape[0]); - NPUW_ASSERT(scale_shape[2] == from_shape[2]); - NPUW_ASSERT(scale_shape[1] == 1); - - const auto zerop_elem_type = zerop->get_element_type(); - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::f32); - NPUW_ASSERT(scale_elem_type == ov::element::f32); - -#if defined(HAVE_AVX2) - // This conversion combines u4tof32 and f32tof16. 
Here we - // - read 256 bits (= 32 bytes, = 64 u4 elements) - // - write 1024 bits (= 128 bytes, = 64 f16 elements) - // per every iteration, what translates to (from->size() / 64) iterations - - const size_t C = from_shape[from_shape.size() - 3]; - const size_t H = from_shape[from_shape.size() - 2]; - const size_t W = from_shape[from_shape.size() - 1]; - - const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements - const float* const pScl = static_cast(scale->data()); // 1 x f32 element - int16_t* pDst = static_cast(to->data()); // 1 x f16 element - - const float zval = avx2_load_f32(reinterpret_cast(zerop->data()), zerop_elem_type); - __m256 zvalVec = _mm256_set1_ps(zval); - - auto unpack_body = [&](size_t job_index, size_t stride) { - size_t start_c = job_index * stride; - size_t end_c = std::min(C, start_c + stride); - - for (size_t c = start_c; c < end_c; ++c) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; w += 64) { - const uint8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; - __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); - const float* pScl_iter = pScl + w + W * c; - int16_t* pDst_iter = pDst + w + W * h + W * H * c; - - __m256 svalVec[8]; - for (int i = 0; i < 8; ++i) { - svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); - } - - // vectorized unpack u4 to f16 - __m128i htmp[8]; // 64 x f16 - avx2_u4tof16(vinput, htmp, zvalVec, svalVec); - - for (int i = 0; i < 8; ++i) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), htmp[i]); - } - } - } - } - }; - - size_t stride = C; - size_t num_jobs = 1; - - if (unpack_options.nPartitions) { - if (unpack_options.bStrictPartitioning) { - stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; - num_jobs = unpack_options.nPartitions; - } else { - stride = std::max(1, C / unpack_options.nPartitions); - num_jobs = (C + stride - 1) / stride; - } - } - - if (unpack_options.bUseOvParallelFor) { - ov::parallel_for(num_jobs, [&](size_t job_index) { - unpack_body(job_index, stride); - }); - } else { - for (size_t job_index = 0; job_index < num_jobs; ++job_index) { - unpack_body(job_index, stride); - } - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u4f32(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - - uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements - float* pDst = static_cast(to->data()); // 1 x f32 element - - const std::size_t total = from->get_size(); - for (std::size_t index = 0; index < total; index += 2) { - pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] - since OpenVINO 2024.0! - pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] - since OpenVINO 2024.0! 
- pSrc++; - pDst += 2; - } -} - -inline void unpack_i8f16(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - -#if defined(HAVE_AVX2) - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - int8_t const* pSrc = from->data(); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t index = 0; index < total; index += VECSIZE) { - const __m128i* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] - __m128i f16vec = avx2_i8tof16(i8vec); - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] - pSrc += 8; - pDst += 8; - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_i8f16(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(scale->get_shape()[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - -#if defined(HAVE_AVX2) - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - const std::size_t stotal = scale->get_size(); - int8_t const* pSrc = from->data(); - int8_t const* pScl = static_cast(scale->data()); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t sindex = 0u; sindex < stotal; sindex++) { - __m256 svec = avx2_load_scale(pScl, scale_elem_type); - for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { - __m128i const* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i i8vec = _mm_loadl_epi64(pSrcV); // load: 8 x i8 [ 64b of 128b] - __m128i f16vec = avx2_i8tof16(i8vec, svec); // convert & scale - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 [128b] - pSrc += 8; - pDst += 8; - } // index - pScl += scale_elem_type.size(); - } // sindex -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -inline void unpack_u8f16(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& _options) { - NPUW_ASSERT(from->is_continuous()); - NPUW_ASSERT(zerop->is_continuous()); - NPUW_ASSERT(scale->is_continuous()); - NPUW_ASSERT(to->is_continuous()); - NPUW_ASSERT(from->get_size() == to->get_size()); - NPUW_ASSERT(from->get_size() % 8 == 0); - NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(scale->get_shape()[1] == 1); - NPUW_ASSERT(zerop->get_shape()[0] == from->get_shape()[0]); - NPUW_ASSERT(zerop->get_shape()[1] == 1); - - const auto scale_elem_type = scale->get_element_type(); - NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); - - const auto zerop_elem_type = zerop->get_element_type(); - NPUW_ASSERT(zerop_elem_type == ov::element::u8); - -#if defined(HAVE_AVX2) - constexpr std::size_t VECSIZE = 8; - - const std::size_t total = from->get_size(); - const std::size_t stotal = 
scale->get_size(); - uint8_t const* pSrc = from->data(); - uint8_t const* pZrp = zerop->data(); - int8_t const* pScl = static_cast(scale->data()); - int16_t* pDst = static_cast(to->data()); - - for (std::size_t sindex = 0u; sindex < stotal; sindex++) { - __m256 svec = avx2_load_scale(pScl, scale_elem_type); - __m128i u8zp = _mm_set1_epi8(*pZrp); // bcast: 8 x u8 - __m256i u32zp = _mm256_cvtepu8_epi32(u8zp); // i32 zero point - __m256 f32zp = _mm256_cvtepi32_ps(u32zp); // f32 zero point - for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) { - __m128i const* pSrcV = reinterpret_cast(pSrc); - __m128i* pDstV = reinterpret_cast<__m128i*>(pDst); - __m128i u8in = _mm_loadl_epi64(pSrcV); // load: 8 x u8 - __m128i f16vec = avx2_u8tof16(u8in, f32zp, svec); // convert & scale - _mm_store_si128(pDstV, f16vec); // store: 8 x f16 - pSrc += VECSIZE; - pDst += VECSIZE; - } // index - pScl += scale_elem_type.size(); - pZrp++; - } // sindex -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif -} - -} // namespace - -inline void unpack_impl(const ov::SoPtr& from, - const ov::SoPtr& to, - const UnpackOptions& unpack_options = UnpackOptions{true, 16, false}) { - // This is in fact a weight decompression procedure - auto type_from = from->get_element_type(); - auto type_to = to->get_element_type(); - - namespace ove = ov::element; -#define CAST(x) static_cast((x).operator ove::Type_t()) -#define PAIR(f, t) (CAST(f) << 16 | CAST(t)) -#define HNDL(f, t) \ - case PAIR(ove::f, ove::t): \ - unpack_##f##t(from, to, unpack_options); \ - break; - switch (PAIR(type_from, type_to)) { - HNDL(i4, i8); - HNDL(i4, f16); - HNDL(u4, i8); - HNDL(u4, f16); - HNDL(u4, f32); - HNDL(i8, f16); - default: - OPENVINO_THROW("Unknown unpack combination ", type_from, " -> ", type_to); - } -#undef HNDL -#undef PAIR -#undef CAST -} - -inline void unpack_scale_impl(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options = UnpackOptions{true, 16, false}) { - // This is in fact a weight decompression procedure - const auto type_from = from->get_element_type(); - const auto type_to = to->get_element_type(); - NPUW_ASSERT(type_to == ov::element::f16); - - const auto& from_shape = from->get_shape(); - const auto& scale_shape = scale->get_shape(); - - if (type_from == ov::element::i4) { - if (from_shape.size() == 3) { - if (scale_shape[2] == from_shape[2]) { - unpack_i4f16_z(from, scale, to, unpack_options); - } else { - unpack_i4f16(from, scale, to, unpack_options); - } - } else { - NPUW_ASSERT(from_shape.size() == 2); - unpack_i4f16(from, scale, to, unpack_options); - } - } else if (type_from == ov::element::i8) { - unpack_i8f16(from, scale, to, unpack_options); - } else { - NPUW_ASSERT(false && "Unsupported combination"); - } -} - -inline void unpack_scale_zp_impl(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to, - const UnpackOptions& unpack_options = UnpackOptions{true, 16, false}) { - const auto type_from = from->get_element_type(); - const auto type_zerop = zerop->get_element_type(); - const auto type_scale = scale->get_element_type(); - const auto type_to = to->get_element_type(); - - if (type_from == ov::element::u4) { - NPUW_ASSERT(type_zerop == ov::element::u4 || type_zerop == ov::element::f16 || type_zerop == ov::element::f32); - NPUW_ASSERT(type_scale == ov::element::f16 || type_scale == ov::element::f32); - NPUW_ASSERT(type_to == ov::element::f16); - } else if (type_from == 
ov::element::u8) { - NPUW_ASSERT(type_zerop == ov::element::u8); - NPUW_ASSERT(type_scale == ov::element::f16); - NPUW_ASSERT(type_to == ov::element::f16); - } else { - NPUW_ASSERT(false && "Unsupported combination"); - } - - // This function determines the appropriate unpacking strategy for tensor multiplication - // based on the 'scale' shape and 'from' shape. - // Example tensors -> (scale.*from): - // unpack_u4f16: - // - [4096, 1].*[4096, 4096] - // - [11008, 1].*[11008, 4096] - // - [4096, 32, 1].*[4096, 32, 128] - // unpack_u4f16_z: - // - [32, 1, 4096].*[32, 128, 4096] - // - [32, 1, 11008].*[32, 128, 11008] - // - [86, 1, 4096].*[86, 128, 4096] - // unpack_u4f16_asymm_zp: - // - [256, 16, 1].*[256, 16, 128] - // - [2048, 16, 1].*[2048, 16, 128] - // - [5632, 16, 1].*[5632, 16, 128] - // Zero Point Shapes: [256, 16, 1], [2048, 16, 1], [5632, 16, 1] - // Unsupported Case for scale tensor: - // - [s1, 1, s2, 1, s3] - - const auto& from_shape = from->get_shape(); - const auto& scale_shape = scale->get_shape(); - - if (type_from == ov::element::u4) { - if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1 && - scale_shape[2] == from_shape[2]) { - unpack_u4f16_z(from, zerop, scale, to, unpack_options); - } else if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == from_shape[1] && - scale_shape[2] == 1) { - if (zerop->get_size() == 1) { - unpack_u4f16(from, zerop, scale, to, unpack_options); - } else { - unpack_u4f16_asymm_zp(from, zerop, scale, to, unpack_options); - } - } else if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { - unpack_u4f16(from, zerop, scale, to, unpack_options); - } else { - NPUW_ASSERT(false); - } - } else if (type_from == ov::element::u8) { - // Only support CW for now - if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { - unpack_u8f16(from, zerop, scale, to, unpack_options); - } else { - NPUW_ASSERT(false); - } - } -} - -inline ov::Tensor to_f16_impl(const ov::Tensor& t) { - ov::Shape shape = t.get_shape(); - NPUW_ASSERT(t.get_element_type() == ov::element::f32); - NPUW_ASSERT(t.get_size() % 8 == 0); - NPUW_ASSERT(t.is_continuous()); - - ov::Tensor tnew(ov::element::f16, shape); - -#if defined(HAVE_AVX2) - const float* psrc = t.data(); - uint8_t* pdst = static_cast(tnew.data()); - - for (std::size_t i = 0; i < t.get_size() / 8; i++) { - __m256 vsrc = _mm256_loadu_ps(psrc); - __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT); - __m128i* pout = reinterpret_cast<__m128i*>(pdst); - _mm_storeu_si128(pout, vout); - psrc += 8; // offset in sizeof(float) - pdst += (8 * 2); // offset in bytes - } -#else - OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); -#endif - return tnew; -} - -} // namespace XARCH -} // namespace util -} // namespace npuw -} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 5bea85aee468cd..1de8f4de4bdb4f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -15,6 +15,7 @@ #include "openvino/op/transpose.hpp" #include "openvino/op/util/op_types.hpp" #include "openvino/runtime/make_tensor.hpp" // get_tensor_impl +#include "util_xarch.hpp" bool ov::npuw::util::is_set(const std::size_t sub_idx, const std::string& opt) { if (opt.empty() || opt == "NO") { @@ -65,6 +66,135 @@ std::string ov::npuw::util::fmt(std::size_t number, std::size_t total) { 
return ss.str(); } +void ov::npuw::util::unpack(const ov::SoPtr& from, + const ov::SoPtr& to, + const UnpackOptions& unpack_options) { + // This is in fact a weight decompression procedure + auto type_from = from->get_element_type(); + auto type_to = to->get_element_type(); + + namespace ove = ov::element; +#define CAST(x) static_cast((x).operator ove::Type_t()) +#define PAIR(f, t) (CAST(f) << 16 | CAST(t)) +#define HNDL(f, t) \ + case PAIR(ove::f, ove::t): \ + ov::npuw::util::XARCH::unpack_##f##t(from, to, unpack_options); \ + break; + switch (PAIR(type_from, type_to)) { + HNDL(i4, i8); + HNDL(i4, f16); + HNDL(u4, i8); + HNDL(u4, f16); + HNDL(u4, f32); + HNDL(i8, f16); + default: + OPENVINO_THROW("Unknown unpack combination ", type_from, " -> ", type_to); + } +#undef HNDL +#undef PAIR +#undef CAST +} + +void ov::npuw::util::unpack(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const UnpackOptions& unpack_options) { + // This is in fact a weight decompression procedure + const auto type_from = from->get_element_type(); + const auto type_to = to->get_element_type(); + NPUW_ASSERT(type_to == ov::element::f16); + + const auto& from_shape = from->get_shape(); + const auto& scale_shape = scale->get_shape(); + + if (type_from == ov::element::i4) { + if (from_shape.size() == 3) { + if (scale_shape[2] == from_shape[2]) { + ov::npuw::util::XARCH::unpack_i4f16_z(from, scale, to, unpack_options); + } else { + ov::npuw::util::XARCH::unpack_i4f16_scale(from, scale, to, unpack_options); + } + } else { + NPUW_ASSERT(from_shape.size() == 2); + ov::npuw::util::XARCH::unpack_i4f16_scale(from, scale, to, unpack_options); + } + } else if (type_from == ov::element::i8) { + ov::npuw::util::XARCH::unpack_i8f16_scale(from, scale, to, unpack_options); + } else { + NPUW_ASSERT(false && "Unsupported combination"); + } +} + +void ov::npuw::util::unpack(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const UnpackOptions& unpack_options) { + const auto type_from = from->get_element_type(); + const auto type_zerop = zerop->get_element_type(); + const auto type_scale = scale->get_element_type(); + const auto type_to = to->get_element_type(); + + if (type_from == ov::element::u4) { + NPUW_ASSERT(type_zerop == ov::element::u4 || type_zerop == ov::element::f16 || type_zerop == ov::element::f32); + NPUW_ASSERT(type_scale == ov::element::f16 || type_scale == ov::element::f32); + NPUW_ASSERT(type_to == ov::element::f16); + } else if (type_from == ov::element::u8) { + NPUW_ASSERT(type_zerop == ov::element::u8); + NPUW_ASSERT(type_scale == ov::element::f16); + NPUW_ASSERT(type_to == ov::element::f16); + } else { + NPUW_ASSERT(false && "Unsupported combination"); + } + + // This function determines the appropriate unpacking strategy for tensor multiplication + // based on the 'scale' shape and 'from' shape. 
+ // Example tensors -> (scale.*from): + // unpack_u4f16: + // - [4096, 1].*[4096, 4096] + // - [11008, 1].*[11008, 4096] + // - [4096, 32, 1].*[4096, 32, 128] + // unpack_u4f16_z: + // - [32, 1, 4096].*[32, 128, 4096] + // - [32, 1, 11008].*[32, 128, 11008] + // - [86, 1, 4096].*[86, 128, 4096] + // unpack_u4f16_asymm_zp: + // - [256, 16, 1].*[256, 16, 128] + // - [2048, 16, 1].*[2048, 16, 128] + // - [5632, 16, 1].*[5632, 16, 128] + // Zero Point Shapes: [256, 16, 1], [2048, 16, 1], [5632, 16, 1] + // Unsupported Case for scale tensor: + // - [s1, 1, s2, 1, s3] + + const auto& from_shape = from->get_shape(); + const auto& scale_shape = scale->get_shape(); + + if (type_from == ov::element::u4) { + if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1 && + scale_shape[2] == from_shape[2]) { + ov::npuw::util::XARCH::unpack_u4f16_z(from, zerop, scale, to, unpack_options); + } else if (scale_shape.size() == 3 && scale_shape[0] == from_shape[0] && scale_shape[1] == from_shape[1] && + scale_shape[2] == 1) { + if (zerop->get_size() == 1) { + ov::npuw::util::XARCH::unpack_u4f16_scale_zp(from, zerop, scale, to, unpack_options); + } else { + ov::npuw::util::XARCH::unpack_u4f16_asymm_zp(from, zerop, scale, to, unpack_options); + } + } else if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { + ov::npuw::util::XARCH::unpack_u4f16_scale_zp(from, zerop, scale, to, unpack_options); + } else { + NPUW_ASSERT(false); + } + } else if (type_from == ov::element::u8) { + // Only support CW for now + if (scale_shape.size() == 2 && scale_shape[0] == from_shape[0] && scale_shape[1] == 1) { + ov::npuw::util::XARCH::unpack_u8f16(from, zerop, scale, to, unpack_options); + } else { + NPUW_ASSERT(false); + } + } +} + void ov::npuw::util::gather(const ov::SoPtr& src, const ov::SoPtr& idx, const ov::SoPtr& dst) { @@ -201,6 +331,10 @@ void ov::npuw::util::to_f32(const ov::Tensor& in, ov::Tensor& out) { } } +ov::Tensor ov::npuw::util::to_f16(const ov::Tensor& t) { + return ov::npuw::util::XARCH::to_f16(t); +} + inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) { const uint8_t* tdata = static_cast(t.data()); const uint8_t* trow = tdata + r * COLS / 2; diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 7fd8a1c5da5316..02d2c8c097811e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -25,6 +25,32 @@ bool starts_with(const std::string& str, const std::string& prefix); std::string fmt(std::size_t number, std::size_t total); +struct UnpackOptions { + bool bUseOvParallelFor; + size_t nPartitions; // if 0 we use 64 elements step in parallel for, otherwise target workload is dynamically + // calculated + bool bStrictPartitioning; // cannot reduce partitions in favor of speed + explicit UnpackOptions(bool useParallelFor, size_t nPartitions, bool bStrictPartitioning) + : bUseOvParallelFor(useParallelFor), + nPartitions(nPartitions), + bStrictPartitioning(bStrictPartitioning) {} +}; + +void unpack(const ov::SoPtr& from, + const ov::SoPtr& to, + const UnpackOptions& unpack_options = UnpackOptions{true, 16, false}); + +void unpack(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const UnpackOptions& unpack_options = UnpackOptions{true, 16, false}); + +void unpack(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const 
UnpackOptions& unpack_options = UnpackOptions{true, 16, false}); + void gather(const ov::SoPtr& src, const ov::SoPtr& idx, const ov::SoPtr& dst); using View = std::vector; @@ -33,7 +59,7 @@ ov::SoPtr view(const ov::SoPtr& src, const View& from, ov::SoPtr view(const ov::SoPtr& src, std::size_t dim, std::size_t offset, std::size_t len); void to_f32(const ov::Tensor& in, ov::Tensor& out); - +ov::Tensor to_f16(const ov::Tensor& t); ov::Tensor transpose(const ov::Tensor& t); ov::Tensor permute(const ov::Tensor& t, const std::vector& axes); ov::Tensor concat(const std::vector& tt, std::size_t axis); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp index e0a8ebf384e0e1..37c4770b9d9fa3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.cpp @@ -6,26 +6,1424 @@ # include #endif -#include "unpack_kernel.hpp" +#include + +#include "util.hpp" #include "util_xarch.hpp" -void ov::npuw::util::XARCH::unpack(const ov::SoPtr& from, const ov::SoPtr& to) { - unpack_impl(from, to); +#ifdef UNPACK_PROFILING +# include "tbb/concurrent_unordered_map.h" +#endif + +namespace { +#if defined(HAVE_AVX2) +inline int8_t hi4(int8_t x) { + return ((x & (1 << 7)) >> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); +} + +inline int8_t lo4(int8_t x) { + return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); } +#endif -void ov::npuw::util::XARCH::unpack_scale(const ov::SoPtr& from, - const ov::SoPtr& scale, - const ov::SoPtr& to) { - unpack_scale_impl(from, scale, to); +inline uint8_t hi4(uint8_t x) { + return x >> 4; +} + +inline uint8_t lo4(uint8_t x) { + return x & 0xF; +} + +#if defined(HAVE_AVX2) +inline int8_t upc(int8_t h) { + return h | (-((h & (1 << 3)) >> 3) & (-8)); +} + +// NOTE: This routine implements the NEW ORDER +# define avx2_i4toi8(vinput, vout0, vout1) \ + { \ + __m256i himask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0xF0)); \ + __m256i lomask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0x0F)); \ + __m256i vsgmask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 1 << 3)); \ + __m256i vzero = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0)); \ + __m256i vextend = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, (-8))); \ + \ + __m256i vht = _mm256_and_si256(vinput, himask); \ + __m256i vhi = _mm256_srli_epi16(vht, 4); \ + __m256i vlo = _mm256_and_si256(vinput, lomask); \ + \ + __m256i vsghi = _mm256_srli_epi16(_mm256_and_si256(vhi, vsgmask), 3); \ + __m256i vsglo = _mm256_srli_epi16(_mm256_and_si256(vlo, vsgmask), 3); \ + __m256i vsubhi = _mm256_sub_epi8(vzero, vsghi); \ + __m256i vsublo = _mm256_sub_epi8(vzero, vsglo); \ + __m256i vhires = _mm256_or_si256(vhi, _mm256_and_si256(vsubhi, vextend)); \ + __m256i vlores = _mm256_or_si256(vlo, _mm256_and_si256(vsublo, vextend)); \ + \ + __m256i vunlo = _mm256_unpacklo_epi8(vlores, vhires); \ + __m256i vunhi = _mm256_unpackhi_epi8(vlores, vhires); \ + *vout0 = _mm256_permute2x128_si256(vunlo, vunhi, 0x20); \ + *vout1 = _mm256_permute2x128_si256(vunlo, vunhi, 0x31); \ + } + +inline __m128i avx2_i8tof16(__m128i vi8) { + __m256i i32vec = _mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32vec, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_i8tof16(__m128i vi8, __m256 s) { + __m256i i32vec = 
_mm256_cvtepi8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32vec, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_u8tof16_hi(__m128i vu8, __m256 z, __m256 s) { + __m256i u32vec = _mm256_cvtepu8_epi32(vu8); // extend: 8 x u8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(u32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +inline __m128i avx2_u8tof16_lo(__m128i vu8, __m256 z, __m256 s) { + __m128i vu8h = _mm_bsrli_si128(vu8, 8); + return avx2_u8tof16_hi(vu8h, z, s); +} + +inline __m128i avx2_u8tof16(__m128i vi8, __m256 z, __m256 s) { + __m256i i32vec = _mm256_cvtepu8_epi32(vi8); // extend: 8 x i8 -> 8 x i32 [256b of 256b] + __m256 f32vec = _mm256_cvtepi32_ps(i32vec); // convert: 8 x i32 -> 8 x f32 [256b of 256b] + __m256 f32sub = _mm256_sub_ps(f32vec, z); // subtract: 8 x f32 -> 8 x f32 [256b of 256b] + __m256 f32scl = _mm256_mul_ps(f32sub, s); // scale: 8 x f32 -> 8 x f32 [256b of 256b] + return _mm256_cvtps_ph(f32scl, _MM_FROUND_TO_NEAREST_INT); // convert: 8 x f32 -> 8 x f16 [128b] +} + +// NOTE: This routine implements the NEW ORDER +inline void avx2_u4tof16(__m256i vinput, __m128i vout[8], __m256 zvalVec, __m256 svalVec[8]) { + // vinput - 64 x u4 elements - 256 bits + // vout[] - 64 (8x8) x f16 elements + + // NOTE: This is largely a copy of unpack_u4f16() {{ + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(vinput, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 - Extracting High Nibbles + __m256i xmmUnpackedHi = _mm256_and_si256(vinput, lomask); // 32 x i8 - Extracting Low Nibbles + + // need 4 portions of 16 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 - Lower 16 of High Nibbles + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 - Higher 16 of High Nibbles + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 - Lower 16 of Low Nibbles + __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 - Higher 16 of Low Nibbles + + // Rearranging of scales + __m256i indices = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + // Extracting all 64 scales as per the indices specified above + __m256 scale_v_rearranged[] = {_mm256_permutevar8x32_ps(svalVec[0], indices), + _mm256_permutevar8x32_ps(svalVec[1], indices), + _mm256_permutevar8x32_ps(svalVec[2], indices), + _mm256_permutevar8x32_ps(svalVec[3], indices), + _mm256_permutevar8x32_ps(svalVec[4], indices), + _mm256_permutevar8x32_ps(svalVec[5], indices), + _mm256_permutevar8x32_ps(svalVec[6], indices), + _mm256_permutevar8x32_ps(svalVec[7], indices)}; + + // Scaling should happen like this: + // low_nibble[0]->scale[0], high_nibble[0]->scale[1]...low_nibble[31]->scale[60],high_nibble[31]->scale[61] + + // Extracting all the even-indexed scales for the low nibbles + __m256 scale_v_even[] = { 
+ _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x20), + _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x20), + }; + + // Extracting all the odd-indexed scales for the high nibbles + __m256 scale_v_odd[] = { + _mm256_permute2f128_ps(scale_v_rearranged[0], scale_v_rearranged[1], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[2], scale_v_rearranged[3], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[4], scale_v_rearranged[5], 0x31), + _mm256_permute2f128_ps(scale_v_rearranged[6], scale_v_rearranged[7], 0x31), + }; + + // converting to 64 x f16 + // Higher 16 of High Nibbles + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, scale_v_odd[2]), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, scale_v_odd[3])}; + // Lower 16 of High Nibbles + __m128i f16LoHi[] = {avx2_u8tof16_hi(unpacked32LoHi, zvalVec, scale_v_odd[0]), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, scale_v_odd[1])}; + // Higher 16 of Low Nibbles + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, scale_v_even[2]), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, scale_v_even[3])}; + // Lower 16 of Low Nibbles + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, scale_v_even[0]), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, scale_v_even[1])}; + + // interleaving back: + // Interleaving lower 8 of low nibbles with lower 8 of high nibbles and so on + vout[0] = _mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]); + vout[1] = _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]); + vout[2] = _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]); + vout[3] = _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]); + vout[4] = _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]); + vout[5] = _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]); + vout[6] = _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]); + vout[7] = _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1]); +} + +inline __m256 avx2_load_scale(const int8_t* data, ov::element::Type type) { + if (type == ov::element::f32) { + return _mm256_set1_ps(*reinterpret_cast(data)); + } else { + NPUW_ASSERT(type == ov::element::f16); + float val{}; + _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); + return _mm256_set1_ps(val); + } +} + +inline float avx2_load_f32(const int8_t* data, ov::element::Type type) { + if (type == ov::element::f32) { + return *reinterpret_cast(data); + } else { + NPUW_ASSERT(type == ov::element::f16); + float val{}; + _mm_store_ss(&val, _mm_cvtph_ps(_mm_cvtsi32_si128(*reinterpret_cast(data)))); + return val; + } +} +#endif + +#ifdef UNPACK_PROFILING +class UnpackStat { + tbb::concurrent_unordered_map> inferenceTimes; + +public: + UnpackStat() {} + void addRecord(size_t sz, size_t time) { + inferenceTimes[sz].first++; + inferenceTimes[sz].second += time; + } + ~UnpackStat() { + for (auto&& r : inferenceTimes) { + std::cout << "work: " << r.first //<< ", stride: " << stride + << " overall_time = " << r.second.second / 1000 << " [ms]" + << " avg_atime = " << r.second.second / r.second.first << " [µs]\n"; + } + } +}; + +static UnpackStat ustat; +# define UNPACK_START_TICK() std::chrono::steady_clock::time_point _begin_tick = std::chrono::steady_clock::now(); +# define UNPACK_SAVE_TICK() \ + std::chrono::steady_clock::time_point _end_tick = std::chrono::steady_clock::now(); \ + ustat.addRecord(total, std::chrono::duration_cast(_end_tick - _begin_tick).count()); +#else +# define 
UNPACK_START_TICK()
+# define UNPACK_SAVE_TICK()
+#endif
+}  // namespace
+
+void ov::npuw::util::XARCH::unpack_i4i8(const ov::SoPtr<ov::ITensor>& from,
+                                        const ov::SoPtr<ov::ITensor>& to,
+                                        const ov::npuw::util::UnpackOptions& unpack_options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+
+#if defined(HAVE_AVX2)
+    // With the vectorization above, we:
+    // - read 256 bits (= 32 bytes, = 64 i4 elements)
+    // - write 512 bits (= 64 bytes, = 64 i8 elements)
+    // on every iteration, which translates to (from->size() / 64) iterations
+
+    const std::size_t total = from->get_size();
+    int8_t const* pSrc = static_cast<const int8_t*>(from->data());  // 2 x i4 elements
+    int8_t* pDst = static_cast<int8_t*>(to->data());                // 1 x i8 element
+    size_t stride = 64;
+
+    auto unpack_body = [pSrc, pDst](size_t index, size_t stride) {
+        size_t halfStride = stride >> 1;
+        int8_t const* pSrcLocal = pSrc + halfStride * index;
+        int8_t* pDstLocal = pDst + stride * index;
+
+        for (size_t j = 0; j < stride; j += 64) {
+            __m256i inv = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(pSrcLocal));
+            __m256i* outv0 = reinterpret_cast<__m256i*>(pDstLocal);
+            __m256i* outv1 = reinterpret_cast<__m256i*>(pDstLocal + 32);
+
+            __m256i vout0, vout1;
+            avx2_i4toi8(inv, &vout0, &vout1);
+
+            _mm256_storeu_si256(outv0, vout0);
+            _mm256_storeu_si256(outv1, vout1);
+
+            pSrcLocal += 32;
+            pDstLocal += 64;
+        }
+    };
+
+    // each work item of the parallel_for below covers `stride` elements (a multiple of 64)
+    if (unpack_options.nPartitions) {
+        std::size_t minPartitions;
+        if (!unpack_options.bStrictPartitioning) {
+            // heuristic: every TBB worker should get at least 2048 vector iterations of work,
+            // so in terms of stride this is 64 * 2048 elements
+            minPartitions = total / (64 * 2048);
+            minPartitions = std::max<std::size_t>(1u, minPartitions);
+            minPartitions = std::min(minPartitions, unpack_options.nPartitions);
+        } else {
+            minPartitions = unpack_options.nPartitions;
+        }
+
+        // calculate the stride in elements - this stride gives us up to nPartitions + 1 partitions
+        stride = static_cast<std::size_t>(total / minPartitions);
+
+        // the stride has to be 64-element aligned to avoid gaps between workloads
+        stride = (stride >> 6) << 6;
+        // if the number of partitions is too large compared to the workload, the minimum supported stride still has to be clamped to 64
+        stride = stride < 64 ?
64 : stride; + } + + UNPACK_START_TICK(); + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(total / stride, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < total / stride; index++) { + unpack_body(index, stride); + } + } + // handle tail + size_t tailOffset = (static_cast(total / stride) * stride); + pSrc = static_cast(from->data()) + (tailOffset >> 1); + pDst = static_cast(to->data()) + tailOffset; + + for (std::size_t index = 0; index < ((total % 64) >> 1); index++) { + *(pDst++) = upc(lo4(*(pSrc))); + *(pDst++) = upc(hi4(*(pSrc))); + pSrc++; + } + UNPACK_SAVE_TICK(); +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4i8(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + uint8_t const* pSrc = static_cast(from->data()); // 2 x u4 elements + int8_t* pDst = static_cast(to->data()); // 1 x i8 element + + const std::size_t total = from->get_size(); + for (std::size_t index = 0; index < total; index += 2) { + pDst[0] = static_cast(lo4(*pSrc)); // LSB is [0] -- since OpenVINO 24.0! + pDst[1] = static_cast(hi4(*pSrc)); // MSB is [1] -- since OpenVINO 24.0! + pSrc++; + pDst += 2; + } +} + +void ov::npuw::util::XARCH::unpack_i4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + +#if defined(HAVE_AVX2) + // This conversion combines i4toi8 (above) and i8tof16 (below). 
Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + std::size_t total = to->get_size(); + int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + // bool tailOnly = total < 64; + + auto unpack_body = [pSrc, pDst](size_t index) { + int8_t const* pSrcLocal = pSrc + 32 * index; + int16_t* pDstLocal = pDst + 64 * index; + + __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + + __m256i vout0, vout1; + avx2_i4toi8(inv, &vout0, &vout1); + + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0]), + avx2_i8tof16(i8vecs[1]), + avx2_i8tof16(i8vecs[2]), + avx2_i8tof16(i8vecs[3]), + avx2_i8tof16(i8vecs[4]), + avx2_i8tof16(i8vecs[5]), + avx2_i8tof16(i8vecs[6]), + avx2_i8tof16(i8vecs[7])}; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + }; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(total / 64, [&unpack_body](size_t index) { + unpack_body(index); + }); + } else { + for (std::size_t index = 0; index < total / 64; index++) { + unpack_body(index); + } + } + + // handle tail that is < 64 elements + size_t tailOffset = ((total >> 6) << 6); + pSrc = static_cast(from->data()) + (tailOffset >> 1); + pDst = static_cast(to->data()) + tailOffset; + + constexpr std::size_t VECSIZE = 8; + + total = ((total % 64) >> 1); + int8_t unpackedToI8[VECSIZE] = {0}; + size_t unpackedIdx = 0; + for (std::size_t index = 0; index < total; index++) { + unpackedToI8[unpackedIdx++] = upc(lo4(*(pSrc))); + unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc))); + if (unpackedIdx == VECSIZE) { + __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); + __m128i f16vec = avx2_i8tof16(i8vec); + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst), f16vec); + pDst += VECSIZE; + unpackedIdx = 0; + } + pSrc += 1; + } + + // handle tail that is < 8 + if (unpackedIdx != 0) { + int16_t tmp[VECSIZE]; + __m128i i8vec = _mm_loadl_epi64(reinterpret_cast<__m128i*>(unpackedToI8)); + __m128i f16vec = avx2_i8tof16(i8vec); + 
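An aside for readers of the scalar tail above: it relies on the nibble helpers lo4/hi4/upc declared earlier in this file. As a rough cross-check of what upc(lo4(x)) and upc(hi4(x)) produce, the following standalone sketch (decode_i4_pair is a name invented for this note, not part of the patch) extracts both signed 4-bit values from one packed byte with plain arithmetic; it should agree element-for-element with the helpers, assuming the "new order" (low nibble = element [0]).

#include <cassert>
#include <cstdint>

// Illustrative only: decode one byte that packs two signed 4-bit values,
// low nibble = element [0], high nibble = element [1] (the "new order").
static inline void decode_i4_pair(int8_t packed, int8_t out[2]) {
    const uint8_t u = static_cast<uint8_t>(packed);
    const uint8_t lo = u & 0x0F;  // element [0]
    const uint8_t hi = u >> 4;    // element [1]
    // Sign-extend a 4-bit two's-complement value into int8_t.
    out[0] = static_cast<int8_t>(lo >= 8 ? static_cast<int>(lo) - 16 : static_cast<int>(lo));
    out[1] = static_cast<int8_t>(hi >= 8 ? static_cast<int>(hi) - 16 : static_cast<int>(hi));
}

int main() {
    int8_t out[2];
    decode_i4_pair(static_cast<int8_t>(0xF7), out);
    assert(out[0] == 7);   // low nibble 0x7 -> +7
    assert(out[1] == -1);  // high nibble 0xF -> -1
    return 0;
}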
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), f16vec); + for (size_t i = 0; i != unpackedIdx; i++) { + pDst[i] = tmp[i]; + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_i4f16_scale(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 2-channel (Symmetric) and 3-channel (group-wise) + // scale factors are supported. The scale/value loop + // iteration is based on stotal, so should work for + // both cases. + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } else { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == 1); + } + + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines i4toi8 (above) and i8tof16 (below). Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + // TODO: handle tails + NPUW_ASSERT(elementsPerScale % 64 == 0); + + const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements + const int8_t* const pScl = static_cast(scale->data()); // either f16 or f32 + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [pSrc, pDst, pScl, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + int8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex != jobFinish; sindex++) { + __m256 svec = avx2_load_scale(pSclLocal, scale_elem_type); + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m256i inv = _mm256_lddqu_si256(reinterpret_cast(pSrcLocal)); + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + + __m256i vout0, vout1; + avx2_i4toi8(inv, &vout0, &vout1); + + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = 
reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svec), + avx2_i8tof16(i8vecs[1], svec), + avx2_i8tof16(i8vecs[2], svec), + avx2_i8tof16(i8vecs[3], svec), + avx2_i8tof16(i8vecs[4], svec), + avx2_i8tof16(i8vecs[5], svec), + avx2_i8tof16(i8vecs[6], svec), + avx2_i8tof16(i8vecs[7], svec)}; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x i4 + pDstLocal += 64; // note pDst is int16_t + } + pSclLocal += scale_elem_type.size(); + } + }; + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = elementsPerScale / 64u; + auto minScaleStride = 2048u / nIntrinsicsPerScale; + minScaleStride = std::max(1u, minScaleStride); + minPartitions = stotal / minScaleStride; + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in scale elements space + stride = static_cast(stotal / minPartitions); + } + + const size_t numWork = (stotal + stride - 1) / stride; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(numWork, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < numWork; index++) { + unpack_body(index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif } -void ov::npuw::util::XARCH::unpack_scale_zp(const ov::SoPtr& from, - const ov::SoPtr& zerop, - const ov::SoPtr& scale, - const ov::SoPtr& to) { - unpack_scale_zp_impl(from, zerop, scale, to); +void ov::npuw::util::XARCH::unpack_i4f16_z(const ov::SoPtr& from, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3); + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[2] == from_shape[2]); + NPUW_ASSERT(scale_shape[1] == 1); + + const auto scale_elem_type = 
scale->get_element_type(); + NPUW_ASSERT(scale_elem_type == ov::element::f32); + +#if defined(HAVE_AVX2) + // This conversion combines i4tof32 and f32tof16. Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const size_t C = from_shape[from_shape.size() - 3]; + const size_t H = from_shape[from_shape.size() - 2]; + const size_t W = from_shape[from_shape.size() - 1]; + + const int8_t* const pSrc = static_cast(from->data()); // 2 x i4 elements + const float* const pScl = static_cast(scale->data()); // 1 x f32 element + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [&](size_t job_index, size_t stride) { + size_t start_c = job_index * stride; + size_t end_c = std::min(C, start_c + stride); + + for (size_t c = start_c; c < end_c; ++c) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; w += 64) { + const int8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2; + __m256i vinput = _mm256_lddqu_si256(reinterpret_cast(pSrc_iter)); + __m256i vout0, vout1; + avx2_i4toi8(vinput, &vout0, &vout1); + int8_t tmp[64]; // FIXME: Avoid it + __m256i* tmpv0 = reinterpret_cast<__m256i*>(tmp); + __m256i* tmpv1 = reinterpret_cast<__m256i*>(tmp + 32); + _mm256_storeu_si256(tmpv0, vout0); + _mm256_storeu_si256(tmpv1, vout1); + __m128i i8vecs[8] = { + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48)), + _mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56)), + }; + + const float* pScl_iter = pScl + w + W * c; + __m256 svalVec[8]; + for (int i = 0; i < 8; ++i) { + svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8); + } + + __m128i vresults[8] = {avx2_i8tof16(i8vecs[0], svalVec[0]), + avx2_i8tof16(i8vecs[1], svalVec[1]), + avx2_i8tof16(i8vecs[2], svalVec[2]), + avx2_i8tof16(i8vecs[3], svalVec[3]), + avx2_i8tof16(i8vecs[4], svalVec[4]), + avx2_i8tof16(i8vecs[5], svalVec[5]), + avx2_i8tof16(i8vecs[6], svalVec[6]), + avx2_i8tof16(i8vecs[7], svalVec[7])}; + + int16_t* pDst_iter = pDst + w + W * h + W * H * c; + for (int i = 0; i < 8; ++i) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), vresults[i]); + } + } + } + } + }; + + size_t stride = C; + size_t num_jobs = 1; + + if (unpack_options.nPartitions) { + if (unpack_options.bStrictPartitioning) { + stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions; + num_jobs = unpack_options.nPartitions; + } else { + stride = std::max(1, C / unpack_options.nPartitions); + num_jobs = (C + stride - 1) / stride; + } + } + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(num_jobs, [&](size_t job_index) { + unpack_body(job_index, stride); + }); + } else { + for (size_t job_index = 0; job_index < num_jobs; ++job_index) { + unpack_body(job_index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16(const ov::SoPtr& from, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + 
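A note for readers tracing the "_z" addressing in unpack_i4f16_z above: the scale tensor has shape [C, 1, W] and is broadcast across H, so the output at (c, h, w) is scaled by the value at (c, 0, w). The scalar model below (all names are illustrative, and it produces f32 instead of f16 purely for brevity) sketches what that kernel computes per element under that assumption.

// Sketch only: scalar model of the [C, H, W] x [C, 1, W] broadcast used by unpack_i4f16_z.
// "load_i4" stands in for the packed-nibble read; it is not a function from this patch.
#include <cstddef>
#include <cstdint>

void unpack_i4_z_reference(const uint8_t* packed,  // C*H*W signed 4-bit values, two per byte
                           const float* scale,     // C*W f32 scale factors
                           float* dst,             // C*H*W outputs (f32 here for clarity)
                           std::size_t C, std::size_t H, std::size_t W) {
    auto load_i4 = [&](std::size_t flat) -> int {
        const uint8_t byte = packed[flat / 2];
        const uint8_t nib = (flat % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // LSB is [0]
        return nib >= 8 ? static_cast<int>(nib) - 16 : static_cast<int>(nib);
    };
    for (std::size_t c = 0; c < C; ++c)
        for (std::size_t h = 0; h < H; ++h)
            for (std::size_t w = 0; w < W; ++w) {
                const std::size_t flat = w + W * h + W * H * c;
                dst[flat] = static_cast<float>(load_i4(flat)) * scale[w + W * c];
            }
}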
NPUW_ASSERT(from->get_size() % 64 == 0); + +#if defined(HAVE_AVX2) + // This conversion combines u4i8 and i8tof16 unpacks. Here we + // - read 256 bits (= 32 bytes, = 64 i4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + int8_t const* pSrc = static_cast(from->data()); // 2 x i4 elements + int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + for (std::size_t index = 0; index < total; index += 64) { + __m128i* outv[8] = { + reinterpret_cast<__m128i*>(pDst), + reinterpret_cast<__m128i*>(pDst + 8), + reinterpret_cast<__m128i*>(pDst + 16), + reinterpret_cast<__m128i*>(pDst + 24), + reinterpret_cast<__m128i*>(pDst + 32), + reinterpret_cast<__m128i*>(pDst + 40), + reinterpret_cast<__m128i*>(pDst + 48), + reinterpret_cast<__m128i*>(pDst + 56), + }; + + int8_t tmp[64]; // FIXME: Avoid it + for (std::size_t ii = 0; ii < 32; ii++) { + tmp[ii * 2] = static_cast(lo4(pSrc[ii])); // LSB is [0] -- since OpenVINO 24.0! + tmp[ii * 2 + 1] = static_cast(hi4(pSrc[ii])); // MSB is [1] -- since OpenVINO 24.0! + } + + __m128i vresults[8] = { + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 8))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 16))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 24))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 32))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 40))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 48))), + avx2_i8tof16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(tmp + 56))), + }; + + _mm_storeu_si128(outv[0], vresults[0]); + _mm_storeu_si128(outv[1], vresults[1]); + _mm_storeu_si128(outv[2], vresults[2]); + _mm_storeu_si128(outv[3], vresults[3]); + _mm_storeu_si128(outv[4], vresults[4]); + _mm_storeu_si128(outv[5], vresults[5]); + _mm_storeu_si128(outv[6], vresults[6]); + _mm_storeu_si128(outv[7], vresults[7]); + + pSrc += 32; // shift pSrc only by 32 since it is 64 x i4 + pDst += 64; // note pDst is int16_t + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16_scale_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + // Only single-size ZP is supported + NPUW_ASSERT(zerop->get_size() == 1); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 2-channel (Symmetric) and 3-channel (group-wise) + // scale factors are supported. The scale/value loop + // iteration is based on stotal, so should work for + // both cases. 
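In other words, for unpack_u4f16_scale_zp the scale tensor is either [N, 1] or [N, G, 1]; in both cases the kernel walks the scales linearly and applies the same per-element formula with the single zero point. A scalar sketch of that formula (illustrative names, f32 output instead of f16 for readability) is given below; elements_per_scale mirrors the total / stotal ratio computed a few lines further down.

// Sketch: dst[i] = (u4(src[i]) - zero_point) * scale[i / elements_per_scale],
// where elements_per_scale = total / (number of scales).
#include <cstddef>
#include <cstdint>

void unpack_u4_scale_zp_reference(const uint8_t* packed, std::size_t total,
                                  const float* scales, std::size_t scale_count,
                                  float zero_point, float* dst) {
    const std::size_t elements_per_scale = total / scale_count;
    for (std::size_t i = 0; i < total; ++i) {
        const uint8_t byte = packed[i / 2];
        const float u4 = static_cast<float>((i % 2 == 0) ? (byte & 0x0F) : (byte >> 4));  // LSB is [0]
        dst[i] = (u4 - zero_point) * scales[i / elements_per_scale];
    }
}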
+ const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3 || scale_shape.size() == 2); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } else { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == 1); + } + + const auto zerop_elem_type = zerop->get_element_type(); + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::u4); + NPUW_ASSERT(scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines u4tof32 and f32tof16. Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements + const uint8_t* const pZer = static_cast(zerop->data()); // 1 x u4 element + const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + const float zval = static_cast(lo4(*pZer)); // MSB - since OpenVINO 24.0! + + __m256 zvalVec = _mm256_set1_ps(zval); + + auto unpack_body = [pSrc, pDst, pScl, zvalVec, elementsPerScale, scale_elem_type, stotal](std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex < jobFinish; sindex++) { + __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); + + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m128i* outv[] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // loading 256 bit u4 into unalligned memory , so 64 elements + // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests + __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(xmmData, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 + __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 + + // need 4 portions of 8 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 + __m128i 
unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 + + // converting to 32 x f16 + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; + + __m128i f16LoHi[] = { + avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), + }; + + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; + + // interleaving back + __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), + _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; + + // store the results + _mm_storeu_si128(outv[0], interleaved[0]); + _mm_storeu_si128(outv[1], interleaved[1]); + _mm_storeu_si128(outv[2], interleaved[2]); + _mm_storeu_si128(outv[3], interleaved[3]); + _mm_storeu_si128(outv[4], interleaved[4]); + _mm_storeu_si128(outv[5], interleaved[5]); + _mm_storeu_si128(outv[6], interleaved[6]); + _mm_storeu_si128(outv[7], interleaved[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 + pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements + } // for(index) + pSclLocal += scale_elem_type.size(); + } // for(sindex) + }; + + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = elementsPerScale / 64u; + auto minScaleStride = 2048u / nIntrinsicsPerScale; + minScaleStride = std::max(1u, minScaleStride); + minPartitions = stotal / minScaleStride; + minPartitions = std::max(1u, minPartitions); + minPartitions = std::min(minPartitions, unpack_options.nPartitions); + } else { + minPartitions = unpack_options.nPartitions; + } + + // calculating stride in scale elements space + stride = static_cast(stotal / minPartitions); + } + + const size_t numWork = (stotal + stride - 1) / stride; + + if (unpack_options.bUseOvParallelFor) { + ov::parallel_for(numWork, [unpack_body, stride](size_t index) { + unpack_body(index, stride); + }); + } else { + for (std::size_t index = 0; index < numWork; index++) { + unpack_body(index, stride); + } + } +#else + OPENVINO_THROW("AVX2 support is neccessary but it's not enabled!"); +#endif +} + +void ov::npuw::util::XARCH::unpack_u4f16_asymm_zp(const ov::SoPtr& from, + const ov::SoPtr& zerop, + const ov::SoPtr& scale, + const ov::SoPtr& to, + const ov::npuw::util::UnpackOptions& unpack_options) { + NPUW_ASSERT(from->is_continuous()); + NPUW_ASSERT(zerop->is_continuous()); + NPUW_ASSERT(scale->is_continuous()); + NPUW_ASSERT(to->is_continuous()); + NPUW_ASSERT(from->get_size() == to->get_size()); + + const auto& from_shape = from->get_shape(); + NPUW_ASSERT(from_shape.back() % 64 == 0); + + // 3-channel (group-wise) scale factors are + // supported. 
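Unlike the single-zero-point variant above, unpack_u4f16_asymm_zp carries one u4 zero point per scale group, and those zero points are themselves nibble-packed: group 2k lives in the low nibble and group 2k+1 in the high nibble of byte k. That is what the (sindex % 2) selection and the every-other-iteration pointer bump further down implement. A minimal sketch of that lookup (the helper name is hypothetical, not part of the patch):

// Sketch: fetch the u4 zero point for scale group `g` from a nibble-packed array.
// Mirrors the (sindex % 2 == 0) ? lo4(*pZerLocal) : hi4(*pZerLocal) selection below.
#include <cstddef>
#include <cstdint>

static inline float group_zero_point(const uint8_t* packed_zp, std::size_t g) {
    const uint8_t byte = packed_zp[g / 2];
    const uint8_t nib = (g % 2 == 0) ? (byte & 0x0F)  // even group -> low nibble
                                     : (byte >> 4);   // odd group  -> high nibble
    return static_cast<float>(nib);
}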
+ + const auto& scale_shape = scale->get_shape(); + NPUW_ASSERT(scale_shape.size() == 3); + if (scale_shape.size() == 3) { + NPUW_ASSERT(scale_shape[0] == from_shape[0]); + NPUW_ASSERT(scale_shape[1] == from_shape[1]); + NPUW_ASSERT(scale_shape[2] == 1); + } + + const auto& zerop_shape = zerop->get_shape(); + NPUW_ASSERT(zerop_shape.size() == 3); + if (zerop_shape.size() == 3) { + NPUW_ASSERT(zerop_shape[0] == from_shape[0]); + NPUW_ASSERT(zerop_shape[1] == from_shape[1]); + NPUW_ASSERT(zerop_shape[2] == 1); + } + + const auto zerop_elem_type = zerop->get_element_type(); + const auto scale_elem_type = scale->get_element_type(); + NPUW_ASSERT(zerop_elem_type == ov::element::u4); + NPUW_ASSERT(scale_elem_type == ov::element::f16); + +#if defined(HAVE_AVX2) + // This conversion combines u4tof32 and f32tof16. Here we + // - read 256 bits (= 32 bytes, = 64 u4 elements) + // - write 1024 bits (= 128 bytes, = 64 f16 elements) + // per every iteration, what translates to (from->size() / 64) iterations + + const std::size_t total = to->get_size(); + const std::size_t stotal = scale->get_size(); + const std::size_t elementsPerScale = total / stotal; + + const uint8_t* const pSrc = static_cast(from->data()); // 2 x u4 elements + const uint8_t* const pZer = static_cast(zerop->data()); // 2 x u4 element + const int8_t* const pScl = static_cast(scale->data()); // 1 x f16 element + const int16_t* pDst = static_cast(to->data()); // 1 x f16 element + + auto unpack_body = [pSrc, pDst, pScl, pZer, elementsPerScale, scale_elem_type, zerop_elem_type, stotal]( + std::size_t sindex, + std::size_t stride) { + // number of vectorized operations per scale + size_t elementsPerScaleVectorized = elementsPerScale / 64; + + uint8_t const* pSrcLocal = pSrc + 32 * elementsPerScaleVectorized * sindex * stride; + int8_t const* pSclLocal = pScl + scale_elem_type.size() * sindex * stride; + uint8_t const* pZerLocal = pZer + zerop_elem_type.size() * sindex * stride / 2; + int16_t* pDstLocal = const_cast(pDst) + 64 * elementsPerScaleVectorized * sindex * stride; + + // if it is last iteration current stride can be smaller - lets check that + sindex *= stride; + const auto jobFinish = std::min(sindex + stride, stotal); + + for (; sindex < jobFinish; sindex++) { + __m256 svalVec = avx2_load_scale(pSclLocal, scale_elem_type); + __m256 zvalVec = _mm256_set1_ps(static_cast((sindex % 2 == 0) ? 
lo4(*pZerLocal) : hi4(*pZerLocal))); + + for (std::size_t index = 0; index < elementsPerScale; index += 64) { + __m128i* outv[] = { + reinterpret_cast<__m128i*>(pDstLocal), + reinterpret_cast<__m128i*>(pDstLocal + 8), + reinterpret_cast<__m128i*>(pDstLocal + 16), + reinterpret_cast<__m128i*>(pDstLocal + 24), + reinterpret_cast<__m128i*>(pDstLocal + 32), + reinterpret_cast<__m128i*>(pDstLocal + 40), + reinterpret_cast<__m128i*>(pDstLocal + 48), + reinterpret_cast<__m128i*>(pDstLocal + 56), + }; + __m256i himask = _mm256_set1_epi8(static_cast(0xF0)); + __m256i lomask = _mm256_set1_epi8(static_cast(0x0F)); + + // loading 256 bit u4 into unalligned memory , so 64 elements + // cannot use aligned version here like _mm256_load_si256 - segfault even on unit tests + __m256i xmmData = _mm256_lddqu_si256(reinterpret_cast<__m256i const*>(pSrcLocal)); + + // unpacking with interleaving + __m256i vht = _mm256_and_si256(xmmData, himask); + __m256i xmmUnpackedLo = _mm256_srli_epi16(vht, 4); // 32 x i8 + __m256i xmmUnpackedHi = _mm256_and_si256(xmmData, lomask); // 32 x i8 + + // need 4 portions of 8 x i8 elements + __m128i unpacked32LoHi = _mm256_castsi256_si128(xmmUnpackedLo); // lower 16 x i8 + __m128i unpacked32LoLo = _mm256_extractf128_si256(xmmUnpackedLo, 1); // higher 16 x i8 + + __m128i unpacked32HiHi = _mm256_castsi256_si128(xmmUnpackedHi); // lower 16 x i8 + __m128i unpacked32HiLo = _mm256_extractf128_si256(xmmUnpackedHi, 1); // higher 16 x i8 + + // converting to 32 x f16 + __m128i f16LoLo[] = {avx2_u8tof16_hi(unpacked32LoLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoLo, zvalVec, svalVec)}; + + __m128i f16LoHi[] = { + avx2_u8tof16_hi(unpacked32LoHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32LoHi, zvalVec, svalVec), + }; + + __m128i f16HiLo[] = {avx2_u8tof16_hi(unpacked32HiLo, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiLo, zvalVec, svalVec)}; + __m128i f16HiHi[] = {avx2_u8tof16_hi(unpacked32HiHi, zvalVec, svalVec), + avx2_u8tof16_lo(unpacked32HiHi, zvalVec, svalVec)}; + + // interleaving back + __m128i interleaved[] = {_mm_unpacklo_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpackhi_epi16(f16HiHi[0], f16LoHi[0]), + _mm_unpacklo_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpackhi_epi16(f16HiHi[1], f16LoHi[1]), + _mm_unpacklo_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpackhi_epi16(f16HiLo[0], f16LoLo[0]), + _mm_unpacklo_epi16(f16HiLo[1], f16LoLo[1]), + _mm_unpackhi_epi16(f16HiLo[1], f16LoLo[1])}; + + // store the results + _mm_storeu_si128(outv[0], interleaved[0]); + _mm_storeu_si128(outv[1], interleaved[1]); + _mm_storeu_si128(outv[2], interleaved[2]); + _mm_storeu_si128(outv[3], interleaved[3]); + _mm_storeu_si128(outv[4], interleaved[4]); + _mm_storeu_si128(outv[5], interleaved[5]); + _mm_storeu_si128(outv[6], interleaved[6]); + _mm_storeu_si128(outv[7], interleaved[7]); + + pSrcLocal += 32; // shift pSrc only by 32 since it is 64 x u4 + pDstLocal += 64; // note pDst is int16_t, so 64 x f16 -> 64 elements + } // for(index) + pSclLocal += scale_elem_type.size(); + if (sindex % 2 == 1) { + pZerLocal += zerop_elem_type.size(); + } + } // for(sindex) + }; + + size_t stride{1}; + + // since scaling is always 64 elements aligned operations, lets partition only in scale shape + if (unpack_options.nPartitions) { + std::size_t minPartitions; + if (!unpack_options.bStrictPartitioning) { + // some heuristics that every tbb thread workload has to have 2048 x intrinsics operations at least, + // so in terms of stride, it should be nElementsPerscale/64 * 2048 + const auto nIntrinsicsPerScale = 
+            auto minScaleStride = 2048u / nIntrinsicsPerScale;
+            minScaleStride = std::max<std::size_t>(1u, minScaleStride);
+            minPartitions = stotal / minScaleStride;
+            minPartitions = std::max<std::size_t>(1u, minPartitions);
+            minPartitions = std::min(minPartitions, unpack_options.nPartitions);
+        } else {
+            minPartitions = unpack_options.nPartitions;
+        }
+
+        // calculating stride in scale elements space
+        stride = static_cast<std::size_t>(stotal / minPartitions);
+    }
+
+    const size_t numWork = (stotal + stride - 1) / stride;
+
+    if (unpack_options.bUseOvParallelFor) {
+        ov::parallel_for(numWork, [unpack_body, stride](size_t index) {
+            unpack_body(index, stride);
+        });
+    } else {
+        for (std::size_t index = 0; index < numWork; index++) {
+            unpack_body(index, stride);
+        }
+    }
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
+}
+
+void ov::npuw::util::XARCH::unpack_u4f16_z(const ov::SoPtr<ov::ITensor>& from,
+                                           const ov::SoPtr<ov::ITensor>& zerop,
+                                           const ov::SoPtr<ov::ITensor>& scale,
+                                           const ov::SoPtr<ov::ITensor>& to,
+                                           const ov::npuw::util::UnpackOptions& unpack_options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(zerop->is_continuous());
+    NPUW_ASSERT(scale->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+
+    // Only single-size ZP is supported
+    NPUW_ASSERT(zerop->get_size() == 1);
+
+    const auto& from_shape = from->get_shape();
+    NPUW_ASSERT(from_shape.back() % 64 == 0);
+
+    const auto& scale_shape = scale->get_shape();
+    NPUW_ASSERT(scale_shape.size() == 3);
+    NPUW_ASSERT(scale_shape[0] == from_shape[0]);
+    NPUW_ASSERT(scale_shape[2] == from_shape[2]);
+    NPUW_ASSERT(scale_shape[1] == 1);
+
+    const auto zerop_elem_type = zerop->get_element_type();
+    const auto scale_elem_type = scale->get_element_type();
+    NPUW_ASSERT(zerop_elem_type == ov::element::f32);
+    NPUW_ASSERT(scale_elem_type == ov::element::f32);
+
+#if defined(HAVE_AVX2)
+    // This conversion combines u4tof32 and f32tof16. Here we
+    // - read 256 bits (= 32 bytes, = 64 u4 elements)
+    // - write 1024 bits (= 128 bytes, = 64 f16 elements)
+    // per iteration, which translates to (from->size() / 64) iterations
+
+    const size_t C = from_shape[from_shape.size() - 3];
+    const size_t H = from_shape[from_shape.size() - 2];
+    const size_t W = from_shape[from_shape.size() - 1];
+
+    const uint8_t* const pSrc = static_cast<const uint8_t*>(from->data());  // 2 x u4 elements
+    const float* const pScl = static_cast<const float*>(scale->data());     // 1 x f32 element
+    int16_t* pDst = static_cast<int16_t*>(to->data());                      // 1 x f16 element
+
+    const float zval = avx2_load_f32(reinterpret_cast<const int8_t*>(zerop->data()), zerop_elem_type);
+    __m256 zvalVec = _mm256_set1_ps(zval);
+
+    auto unpack_body = [&](size_t job_index, size_t stride) {
+        size_t start_c = job_index * stride;
+        size_t end_c = std::min(C, start_c + stride);
+
+        for (size_t c = start_c; c < end_c; ++c) {
+            for (size_t h = 0; h < H; ++h) {
+                for (size_t w = 0; w < W; w += 64) {
+                    const uint8_t* pSrc_iter = pSrc + (w + W * h + W * H * c) / 2;
+                    __m256i vinput = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(pSrc_iter));
+                    const float* pScl_iter = pScl + w + W * c;
+                    int16_t* pDst_iter = pDst + w + W * h + W * H * c;
+
+                    __m256 svalVec[8];
+                    for (int i = 0; i < 8; ++i) {
+                        svalVec[i] = _mm256_loadu_ps(pScl_iter + i * 8);
+                    }
+
+                    // vectorized unpack u4 to f16
+                    __m128i htmp[8];  // 64 x f16
+                    avx2_u4tof16(vinput, htmp, zvalVec, svalVec);
+
+                    for (int i = 0; i < 8; ++i) {
+                        _mm_storeu_si128(reinterpret_cast<__m128i*>(pDst_iter + i * 8), htmp[i]);
+                    }
+                }
+            }
+        }
+    };
+
+    size_t stride = C;
+    size_t num_jobs = 1;
+
+    if (unpack_options.nPartitions) {
+        if (unpack_options.bStrictPartitioning) {
+            stride = (C + unpack_options.nPartitions - 1) / unpack_options.nPartitions;
+            num_jobs = unpack_options.nPartitions;
+        } else {
+            stride = std::max<size_t>(1, C / unpack_options.nPartitions);
+            num_jobs = (C + stride - 1) / stride;
+        }
+    }
+
+    if (unpack_options.bUseOvParallelFor) {
+        ov::parallel_for(num_jobs, [&](size_t job_index) {
+            unpack_body(job_index, stride);
+        });
+    } else {
+        for (size_t job_index = 0; job_index < num_jobs; ++job_index) {
+            unpack_body(job_index, stride);
+        }
+    }
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
+}
+
+void ov::npuw::util::XARCH::unpack_u4f32(const ov::SoPtr<ov::ITensor>& from,
+                                         const ov::SoPtr<ov::ITensor>& to,
+                                         const ov::npuw::util::UnpackOptions& unpack_options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+
+    uint8_t const* pSrc = static_cast<uint8_t const*>(from->data());  // 2 x u4 elements
+    float* pDst = static_cast<float*>(to->data());                    // 1 x f32 element
+
+    const std::size_t total = from->get_size();
+    for (std::size_t index = 0; index < total; index += 2) {
+        pDst[0] = static_cast<float>(lo4(*pSrc));  // LSB is [0] - since OpenVINO 2024.0!
+        pDst[1] = static_cast<float>(hi4(*pSrc));  // MSB is [1] - since OpenVINO 2024.0!
+        pSrc++;
+        pDst += 2;
+    }
+}
+
+void ov::npuw::util::XARCH::unpack_i8f16(const ov::SoPtr<ov::ITensor>& from,
+                                         const ov::SoPtr<ov::ITensor>& to,
+                                         const ov::npuw::util::UnpackOptions& unpack_options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+    NPUW_ASSERT(from->get_size() % 8 == 0);
+
+#if defined(HAVE_AVX2)
+    constexpr std::size_t VECSIZE = 8;
+
+    const std::size_t total = from->get_size();
+    int8_t const* pSrc = from->data<int8_t>();
+    int16_t* pDst = static_cast<int16_t*>(to->data());
+
+    for (std::size_t index = 0; index < total; index += VECSIZE) {
+        const __m128i* pSrcV = reinterpret_cast<const __m128i*>(pSrc);
+        __m128i* pDstV = reinterpret_cast<__m128i*>(pDst);
+        __m128i i8vec = _mm_loadl_epi64(pSrcV);  // load: 8 x i8 [ 64b of 128b]
+        __m128i f16vec = avx2_i8tof16(i8vec);
+        _mm_store_si128(pDstV, f16vec);  // store: 8 x f16 [128b]
+        pSrc += 8;
+        pDst += 8;
+    }
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
+}
+
+void ov::npuw::util::XARCH::unpack_i8f16_scale(const ov::SoPtr<ov::ITensor>& from,
+                                               const ov::SoPtr<ov::ITensor>& scale,
+                                               const ov::SoPtr<ov::ITensor>& to,
+                                               const ov::npuw::util::UnpackOptions& unpack_options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(scale->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+    NPUW_ASSERT(from->get_size() % 8 == 0);
+    NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]);
+    NPUW_ASSERT(scale->get_shape()[1] == 1);
+
+    const auto scale_elem_type = scale->get_element_type();
+    NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16);
+
+#if defined(HAVE_AVX2)
+    constexpr std::size_t VECSIZE = 8;
+
+    const std::size_t total = from->get_size();
+    const std::size_t stotal = scale->get_size();
+    int8_t const* pSrc = from->data<int8_t>();
+    int8_t const* pScl = static_cast<const int8_t*>(scale->data());
+    int16_t* pDst = static_cast<int16_t*>(to->data());
+
+    for (std::size_t sindex = 0u; sindex < stotal; sindex++) {
+        __m256 svec = avx2_load_scale(pScl, scale_elem_type);
+        for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) {
+            __m128i const* pSrcV = reinterpret_cast<__m128i const*>(pSrc);
+            __m128i* pDstV = reinterpret_cast<__m128i*>(pDst);
+            __m128i i8vec = _mm_loadl_epi64(pSrcV);      // load: 8 x i8 [ 64b of 128b]
+            __m128i f16vec = avx2_i8tof16(i8vec, svec);  // convert & scale
+            _mm_store_si128(pDstV, f16vec);              // store: 8 x f16 [128b]
+            pSrc += 8;
+            pDst += 8;
+        }  // index
+        pScl += scale_elem_type.size();
+    }  // sindex
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
+}
+
+void ov::npuw::util::XARCH::unpack_u8f16(const ov::SoPtr<ov::ITensor>& from,
+                                         const ov::SoPtr<ov::ITensor>& zerop,
+                                         const ov::SoPtr<ov::ITensor>& scale,
+                                         const ov::SoPtr<ov::ITensor>& to,
+                                         const ov::npuw::util::UnpackOptions& _options) {
+    NPUW_ASSERT(from->is_continuous());
+    NPUW_ASSERT(zerop->is_continuous());
+    NPUW_ASSERT(scale->is_continuous());
+    NPUW_ASSERT(to->is_continuous());
+    NPUW_ASSERT(from->get_size() == to->get_size());
+    NPUW_ASSERT(from->get_size() % 8 == 0);
+    NPUW_ASSERT(scale->get_shape()[0] == from->get_shape()[0]);
+    NPUW_ASSERT(scale->get_shape()[1] == 1);
+    NPUW_ASSERT(zerop->get_shape()[0] == from->get_shape()[0]);
+    NPUW_ASSERT(zerop->get_shape()[1] == 1);
+
+    const auto scale_elem_type = scale->get_element_type();
+    NPUW_ASSERT(scale_elem_type == ov::element::f32 || scale_elem_type == ov::element::f16);
+
+    const auto zerop_elem_type = zerop->get_element_type();
+    NPUW_ASSERT(zerop_elem_type == ov::element::u8);
+
+#if defined(HAVE_AVX2)
+    constexpr std::size_t VECSIZE = 8;
+
+    const std::size_t total = from->get_size();
+    const std::size_t stotal = scale->get_size();
+    uint8_t const* pSrc = from->data<uint8_t>();
+    uint8_t const* pZrp = zerop->data<uint8_t>();
+    int8_t const* pScl = static_cast<const int8_t*>(scale->data());
+    int16_t* pDst = static_cast<int16_t*>(to->data());
+
+    for (std::size_t sindex = 0u; sindex < stotal; sindex++) {
+        __m256 svec = avx2_load_scale(pScl, scale_elem_type);
+        __m128i u8zp = _mm_set1_epi8(*pZrp);         // bcast: 8 x u8
+        __m256i u32zp = _mm256_cvtepu8_epi32(u8zp);  // i32 zero point
+        __m256 f32zp = _mm256_cvtepi32_ps(u32zp);    // f32 zero point
+        for (std::size_t index = 0u; index < (total / stotal); index += VECSIZE) {
+            __m128i const* pSrcV = reinterpret_cast<__m128i const*>(pSrc);
+            __m128i* pDstV = reinterpret_cast<__m128i*>(pDst);
+            __m128i u8in = _mm_loadl_epi64(pSrcV);             // load: 8 x u8
+            __m128i f16vec = avx2_u8tof16(u8in, f32zp, svec);  // convert & scale
+            _mm_store_si128(pDstV, f16vec);                    // store: 8 x f16
+            pSrc += VECSIZE;
+            pDst += VECSIZE;
+        }  // index
+        pScl += scale_elem_type.size();
+        pZrp++;
+    }  // sindex
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
 }
 
 ov::Tensor ov::npuw::util::XARCH::to_f16(const ov::Tensor& t) {
-    return to_f16_impl(t);
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(t.get_element_type() == ov::element::f32);
+    NPUW_ASSERT(t.get_size() % 8 == 0);
+    NPUW_ASSERT(t.is_continuous());
+
+    ov::Tensor tnew(ov::element::f16, shape);
+
+#if defined(HAVE_AVX2)
+    const float* psrc = t.data<float>();
+    uint8_t* pdst = static_cast<uint8_t*>(tnew.data());
+
+    for (std::size_t i = 0; i < t.get_size() / 8; i++) {
+        __m256 vsrc = _mm256_loadu_ps(psrc);
+        __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT);
+        __m128i* pout = reinterpret_cast<__m128i*>(pdst);
+        _mm_storeu_si128(pout, vout);
+        psrc += 8;        // offset in sizeof(float)
+        pdst += (8 * 2);  // offset in bytes
+    }
+#else
+    OPENVINO_THROW("AVX2 support is necessary but it's not enabled!");
+#endif
+    return tnew;
 }
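
Note for reviewers: the scalar math implemented by the vectorized u4-to-f16 kernels above is simply (u4_value - zero_point) * scale, with the low nibble holding the even element (LSB is [0], as the comments in unpack_u4f32 state), and the result narrowed to f16. The sketch below is an illustrative reference only and is not part of the patch; the pointer layout is simplified to a single zero point and one scale per row, and ov::float16 is used just to keep the example self-contained.

    #include <cstddef>
    #include <cstdint>

    #include "openvino/core/type/float16.hpp"

    // Scalar reference: dst[r][c] = (u4(src, r, c) - zero_point) * scale[r].
    // 'src' packs two u4 values per byte, low nibble first; 'cols' must be even.
    inline void unpack_u4f16_reference(const uint8_t* src,
                                       const float* scale,
                                       float zero_point,
                                       ov::float16* dst,
                                       std::size_t rows,
                                       std::size_t cols) {
        for (std::size_t r = 0; r < rows; ++r) {
            for (std::size_t c = 0; c < cols; c += 2) {
                const uint8_t byte = src[(r * cols + c) / 2];
                const float lo = static_cast<float>(byte & 0x0F);         // element c   (low nibble)
                const float hi = static_cast<float>((byte >> 4) & 0x0F);  // element c+1 (high nibble)
                dst[r * cols + c] = ov::float16((lo - zero_point) * scale[r]);
                dst[r * cols + c + 1] = ov::float16((hi - zero_point) * scale[r]);
            }
        }
    }

The AVX2 kernels do the same arithmetic 64 elements at a time (one 256-bit load, eight 128-bit f16 stores) and differ only in where the scale and zero-point factors come from.
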
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
index f7ab82ecd48c3b..0f0d9912f3b221 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util_xarch.hpp
@@ -6,25 +6,79 @@
 
 #include 
 
+#include "logging.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/runtime/itensor.hpp"
 #include "openvino/runtime/so_ptr.hpp"
+#include "util.hpp"
 
 namespace ov {
 namespace npuw {
 namespace util {
 namespace XARCH {
 
-void unpack(const ov::SoPtr<ov::ITensor>& from, const ov::SoPtr<ov::ITensor>& to);
+void unpack_i4i8(const ov::SoPtr<ov::ITensor>& from,
+                 const ov::SoPtr<ov::ITensor>& to,
+                 const ov::npuw::util::UnpackOptions& unpack_options);
 
-void unpack_scale(const ov::SoPtr<ov::ITensor>& from,
-                  const ov::SoPtr<ov::ITensor>& scale,
-                  const ov::SoPtr<ov::ITensor>& to);
+void unpack_u4i8(const ov::SoPtr<ov::ITensor>& from,
+                 const ov::SoPtr<ov::ITensor>& to,
+                 const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_i4f16(const ov::SoPtr<ov::ITensor>& from,
+                  const ov::SoPtr<ov::ITensor>& to,
+                  const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_i4f16_scale(const ov::SoPtr<ov::ITensor>& from,
+                        const ov::SoPtr<ov::ITensor>& scale,
+                        const ov::SoPtr<ov::ITensor>& to,
+                        const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_i4f16_z(const ov::SoPtr<ov::ITensor>& from,
+                    const ov::SoPtr<ov::ITensor>& scale,
+                    const ov::SoPtr<ov::ITensor>& to,
+                    const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_u4f16(const ov::SoPtr<ov::ITensor>& from,
+                  const ov::SoPtr<ov::ITensor>& to,
+                  const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_u4f16_scale_zp(const ov::SoPtr<ov::ITensor>& from,
+                           const ov::SoPtr<ov::ITensor>& zerop,
+                           const ov::SoPtr<ov::ITensor>& scale,
+                           const ov::SoPtr<ov::ITensor>& to,
+                           const ov::npuw::util::UnpackOptions& unpack_options);
 
-void unpack_scale_zp(const ov::SoPtr<ov::ITensor>& from,
-                     const ov::SoPtr<ov::ITensor>& zerop,
-                     const ov::SoPtr<ov::ITensor>& scale,
-                     const ov::SoPtr<ov::ITensor>& to);
+void unpack_u4f16_asymm_zp(const ov::SoPtr<ov::ITensor>& from,
+                           const ov::SoPtr<ov::ITensor>& zerop,
+                           const ov::SoPtr<ov::ITensor>& scale,
+                           const ov::SoPtr<ov::ITensor>& to,
+                           const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_u4f16_z(const ov::SoPtr<ov::ITensor>& from,
+                    const ov::SoPtr<ov::ITensor>& zerop,
+                    const ov::SoPtr<ov::ITensor>& scale,
+                    const ov::SoPtr<ov::ITensor>& to,
+                    const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_u4f32(const ov::SoPtr<ov::ITensor>& from,
+                  const ov::SoPtr<ov::ITensor>& to,
+                  const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_i8f16(const ov::SoPtr<ov::ITensor>& from,
+                  const ov::SoPtr<ov::ITensor>& to,
+                  const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_i8f16_scale(const ov::SoPtr<ov::ITensor>& from,
+                        const ov::SoPtr<ov::ITensor>& scale,
+                        const ov::SoPtr<ov::ITensor>& to,
+                        const ov::npuw::util::UnpackOptions& unpack_options);
+
+void unpack_u8f16(const ov::SoPtr<ov::ITensor>& from,
+                  const ov::SoPtr<ov::ITensor>& zerop,
+                  const ov::SoPtr<ov::ITensor>& scale,
+                  const ov::SoPtr<ov::ITensor>& to,
+                  const ov::npuw::util::UnpackOptions& _options);
 
 ov::Tensor to_f16(const ov::Tensor& t);
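
Note for reviewers: these per-type entry points are not meant to be called directly from the infer request; a thin wrapper is expected to pick one of them based on the source and destination element types. The routing code itself is not shown in this excerpt, so the sketch below is hypothetical - the function name, the exact type pairs handled, and the error path are assumptions made only to illustrate the dispatch idea.

    #include "openvino/core/except.hpp"
    #include "util_xarch.hpp"

    namespace ov {
    namespace npuw {
    namespace util {

    // Hypothetical dispatcher: map the (from, to) element-type pair onto one of the
    // specialized XARCH kernels declared above. Scale/zero-point variants would be
    // selected the same way, with extra checks on the scale and zerop tensors.
    inline void unpack_dispatch_example(const ov::SoPtr<ov::ITensor>& from,
                                        const ov::SoPtr<ov::ITensor>& to,
                                        const UnpackOptions& opts) {
        const auto from_type = from->get_element_type();
        const auto to_type = to->get_element_type();

        if (from_type == ov::element::i4 && to_type == ov::element::i8) {
            XARCH::unpack_i4i8(from, to, opts);
        } else if (from_type == ov::element::u4 && to_type == ov::element::i8) {
            XARCH::unpack_u4i8(from, to, opts);
        } else if (from_type == ov::element::i4 && to_type == ov::element::f16) {
            XARCH::unpack_i4f16(from, to, opts);
        } else if (from_type == ov::element::u4 && to_type == ov::element::f32) {
            XARCH::unpack_u4f32(from, to, opts);
        } else if (from_type == ov::element::i8 && to_type == ov::element::f16) {
            XARCH::unpack_i8f16(from, to, opts);
        } else {
            OPENVINO_THROW("Unsupported element type combination for unpack");
        }
    }

    }  // namespace util
    }  // namespace npuw
    }  // namespace ov
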