diff --git a/.gitignore b/.gitignore index 70d62591ffc780..ad457fc0af83ad 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ bin/ doc/ docs/build_documentation/work_dir/ temp/ +fj/ .repo/ CMakeLists.txt.user docs/IE_PLUGIN_DG/html/ diff --git a/cmake/developer_package/compile_flags/os_flags.cmake b/cmake/developer_package/compile_flags/os_flags.cmake index fdfd7211c8e815..3f855039259fd9 100644 --- a/cmake/developer_package/compile_flags/os_flags.cmake +++ b/cmake/developer_package/compile_flags/os_flags.cmake @@ -208,6 +208,38 @@ macro(ov_arm_neon_fp16_optimization_flags flags) endif() endmacro() +# +# ov_arm_sve_optimization_flags() +# +macro(ov_arm_sve_optimization_flags flags) + if(OV_COMPILER_IS_INTEL_LLVM) + message(WARNING "Unsupported CXX compiler ${CMAKE_CXX_COMPILER_ID}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + # nothing to define; works out of box + elseif(ANDROID) + if(ANDROID_ABI STREQUAL "arm64-v8a") + set(${flags} -march=armv8-a+sve -Wno-unused-command-line-argument) + else() + message(WARNING "SVE is not supported on this Android ABI: ${ANDROID_ABI}") + endif() + else() + if(AARCH64) + set(${flags} -O2 -march=armv8-a+sve) + if(NOT CMAKE_CL_64) + list(APPEND ${flags} -ftree-vectorize) + endif() + # Check for SVE support + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(${flags} ${${flags}} -march=armv8-a+sve) + else() + message(WARNING "SVE is not supported on this architecture.") + endif() + elseif(ARM) + message(WARNING "SVE is not supported on 32-bit ARM architectures.") + endif() + endif() +endmacro() + # # ov_disable_all_warnings() # diff --git a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake index c33d64635eb10b..fd534f3e600bfe 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_disp_gen.cmake @@ -18,6 +18,7 @@ set(_CPU_CHECK_ANY "true") set(_CPU_CHECK_SSE42 "with_cpu_x86_sse42()") set(_CPU_CHECK_AVX "with_cpu_x86_avx()") set(_CPU_CHECK_NEON_FP16 "with_cpu_neon_fp16()") +set(_CPU_CHECK_SVE "with_cpu_sve()") set(_CPU_CHECK_AVX2 "with_cpu_x86_avx2()") set(_CPU_CHECK_AVX512F "with_cpu_x86_avx512f()") diff --git a/cmake/developer_package/cross_compile/cross_compiled_func.cmake b/cmake/developer_package/cross_compile/cross_compiled_func.cmake index 1e92fe3bfdaf8c..962aa5d373a4db 100644 --- a/cmake/developer_package/cross_compile/cross_compiled_func.cmake +++ b/cmake/developer_package/cross_compile/cross_compiled_func.cmake @@ -3,7 +3,7 @@ # ## list of available instruction sets -set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16) +set(_ARCH_LIST ANY SSE42 AVX AVX2 AVX512F NEON_FP16 SVE) set(_ACCEPTED_ARCHS_ANY "^(ANY)$") set(_ACCEPTED_ARCHS_SSE42 "^(ANY|SSE42)$") @@ -11,6 +11,7 @@ set(_ACCEPTED_ARCHS_AVX "^(ANY|SSE42|AVX)$") set(_ACCEPTED_ARCHS_AVX2 "^(ANY|SSE42|AVX|AVX2)$") set(_ACCEPTED_ARCHS_AVX512F "^(ANY|SSE42|AVX|AVX2|AVX512F)$") set(_ACCEPTED_ARCHS_NEON_FP16 "^(ANY|NEON_FP16)$") +set(_ACCEPTED_ARCHS_SVE "^(ANY|SVE)$") ## Arch specific definitions set(_DEFINE_ANY "") @@ -19,12 +20,14 @@ set(_DEFINE_AVX "HAVE_AVX" ${_DEFINE_SSE42}) set(_DEFINE_AVX2 "HAVE_AVX2" ${_DEFINE_AVX}) set(_DEFINE_AVX512F "HAVE_AVX512F" ${_DEFINE_AVX2}) set(_DEFINE_NEON_FP16 "HAVE_NEON_FP16" ${_DEFINE_ANY}) +set(_DEFINE_SVE "HAVE_SVE" ${_DEFINE_SVE}) ## Arch specific compile options ov_avx512_optimization_flags(_FLAGS_AVX512F) ov_avx2_optimization_flags (_FLAGS_AVX2) ov_sse42_optimization_flags (_FLAGS_SSE42) ov_arm_neon_fp16_optimization_flags(_FLAGS_NEON_FP16) +ov_arm_sve_optimization_flags(_FLAGS_SVE) set(_FLAGS_AVX "") ## TBD is not defined for OV project yet set(_FLAGS_ANY "") ## @@ -185,6 +188,8 @@ endfunction() function(_currently_requested_top_arch VAR) if(ENABLE_NEON_FP16) set(RES NEON_FP16) + elseif(ENABLE_SVE) + set(RES SVE) elseif(ENABLE_AVX512F) set(RES AVX512F) elseif(ENABLE_AVX2) diff --git a/cmake/developer_package/features.cmake b/cmake/developer_package/features.cmake index 8d1f3696c6759c..ae5313cea8a8b4 100644 --- a/cmake/developer_package/features.cmake +++ b/cmake/developer_package/features.cmake @@ -51,6 +51,8 @@ ov_dependent_option (ENABLE_AVX512F "Enable AVX512 optimizations" ON "X86_64 OR ov_dependent_option(ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF) +ov_dependent_option(ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF) + # Type of build, we add this as an explicit option to default it to ON get_property(BUILD_SHARED_LIBS_DEFAULT GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS) ov_option (BUILD_SHARED_LIBS "Build as a shared library" ${BUILD_SHARED_LIBS_DEFAULT}) diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 7ad77d0869c1ed..a85992b6b0978d 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -277,30 +277,6 @@ target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $) -# Check if SVE is available for AARCH64; compile with that if yes. -if (ARM OR AARCH64) - execute_process( - COMMAND lscpu - OUTPUT_VARIABLE CPUINFO - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - string(FIND "${CPUINFO}" "Flags:" FLAGS_INDEX) - string(SUBSTRING "${CPUINFO}" ${FLAGS_INDEX} -1 CPU_FLAGS) - string(FIND "${CPU_FLAGS}" "sve" ISA_FOUND) - - if (NOT ISA_FOUND EQUAL -1) - set(HAVE_SVE ON) - else() - set(HAVE_SVE OFF) - endif() - - if (HAVE_SVE) - message(STATUS "[AARCH64] ISA SVE detected") - target_compile_options(${TARGET_NAME} PRIVATE "-march=armv8.4-a+sve") - add_definitions(-DHAVE_SVE) - endif() -endif() - # Cross compiled function # TODO: The same for proposal, proposalONNX, topk cross_compiled_file(${TARGET_NAME} @@ -311,14 +287,14 @@ cross_compiled_file(${TARGET_NAME} NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH AVX512F AVX2 NEON_FP16 SVE ANY src/nodes/kernels/scaled_attn/softmax.cpp API src/nodes/kernels/scaled_attn/softmax.hpp NAME attn_softmax NAMESPACE ov::Extensions::Cpu::XARCH ) cross_compiled_file(${TARGET_NAME} - ARCH AVX512F AVX2 NEON_FP16 ANY + ARCH AVX512F AVX2 NEON_FP16 SVE ANY src/nodes/kernels/scaled_attn/mha_single_token.cpp API src/nodes/kernels/scaled_attn/mha_single_token.hpp NAME mha_single_token diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 8f011b66c362a7..ace2bd6706fc9b 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -13,7 +13,7 @@ #include "openvino/core/type/float16.hpp" #if defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) +#if defined(__ARM_FEATURE_SVE) #include "arm_sve.h" #endif #include "arm_neon.h" @@ -249,7 +249,7 @@ static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); #endif #ifdef OPENVINO_ARCH_ARM64 -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) inline svfloat32_t exp_ps_sve(svbool_t& pg, svfloat32_t& src) { // Constants const auto log2_e = svdup_n_f32(1.4426950409f); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 6ce3b6053e62e0..60851a81ef1ec2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -21,7 +21,7 @@ #include "softmax_kernel.hpp" #if defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) +#if defined(__ARM_FEATURE_SVE) # include #endif # include @@ -62,7 +62,7 @@ void cvt_copy(TA* dst, TB* src, size_t n) { mm256_uni_storeu_ps(dst + i, vb); } #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); auto _dst = reinterpret_cast(dst); size_t inc = vec_len_f32_sve; @@ -122,7 +122,7 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal mm256_uni_storeu_ps(out + i, v_out); } #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); auto _v = reinterpret_cast(v); svfloat32_t attn_w_vec_fp32 = svdup_n_f32(weight); @@ -403,7 +403,7 @@ static float sum_q_head(T* a, size_t n) { hsum(vsum0); sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); svfloat32_t sum0 = svdup_n_f32(0.0f); svfloat32_t sum1 = svdup_n_f32(0.0f); @@ -588,7 +588,7 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* sum = _mm256_cvtss_f32(vsum0); #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); svbool_t pg = svptrue_b32(); svfloat32_t sum0 = svdup_n_f32(0.0f); @@ -939,7 +939,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str mm256_uni_storeu_ps(dst + i, result_vec_fp32); } #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); auto _dst = reinterpret_cast(dst); size_t inc = vec_len_f32_sve; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index b2aafd5d42691f..b5cd77e64127cf 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -12,7 +12,7 @@ #include #if defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) +#if defined(__ARM_FEATURE_SVE) #include "arm_sve.h" #endif #include "arm_neon.h" @@ -659,7 +659,7 @@ inline void exp_reduce_sum(float* a, const float max, const size_t size, float& hsum(v_sum); sum = _mm256_cvtss_f32(v_sum); #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) svfloat32_t v_a; svfloat32_t v_max = svdup_n_f32(max); svfloat32_t v_sum = svdup_n_f32(0.0f); @@ -805,7 +805,7 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ i += (size - i); } #elif defined(OPENVINO_ARCH_ARM64) -#if defined(HAVE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) size_t vec_len_f32_sve = svcntw(); svfloat32_t v_scale = svdup_n_f32(val); size_t inc = vec_len_f32_sve;