From 42acd0d1f97ff347ca2125e9ea3c973067a163a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 21:47:59 +0200 Subject: [PATCH 01/15] Reimplement _mm_cvtepi32_ps without LLVM intrinsics --- crates/core_arch/src/x86/sse2.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 3fe81e0048..7937b4c2c5 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -952,7 +952,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { #[cfg_attr(test, assert_instr(cvtdq2ps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { - cvtdq2ps(a.as_i32x4()) + transmute(simd_cast::<_, f32x4>(a.as_i32x4())) } /// Converts packed single-precision (32-bit) floating-point elements in `a` @@ -2856,8 +2856,6 @@ extern "C" { fn psrld(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.sse2.psrl.q"] fn psrlq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse2.cvtdq2ps"] - fn cvtdq2ps(a: i32x4) -> __m128; #[link_name = "llvm.x86.sse2.cvtps2dq"] fn cvtps2dq(a: __m128) -> i32x4; #[link_name = "llvm.x86.sse2.maskmov.dqu"] From 816a4196ae11491d62ea775faa64072abf92e9bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 22:04:29 +0200 Subject: [PATCH 02/15] Reimplement _mm_cvtpd_ps and _mm_cvtps_pd without LLVM intrinsics --- crates/core_arch/src/x86/sse2.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 7937b4c2c5..8dbad60eed 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -2240,7 +2240,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { #[cfg_attr(test, assert_instr(cvtpd2ps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { - cvtpd2ps(a) + let r = simd_cast::<_, f32x2>(a.as_f64x2()); + let zero = f32x2::new(0.0, 0.0); + transmute::(simd_shuffle!(r, zero, [0, 1, 2, 3])) } /// Converts packed single-precision (32-bit) floating-point elements in `a` to @@ -2253,7 +2255,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { #[cfg_attr(test, assert_instr(cvtps2pd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { - cvtps2pd(a) + let a = a.as_f32x4(); + transmute(simd_cast::(simd_shuffle!(a, a, [0, 1]))) } /// Converts packed double-precision (64-bit) floating-point elements in `a` to @@ -2908,10 +2911,6 @@ extern "C" { fn ucomineqsd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.sse2.movmsk.pd"] fn movmskpd(a: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.cvtpd2ps"] - fn cvtpd2ps(a: __m128d) -> __m128; - #[link_name = "llvm.x86.sse2.cvtps2pd"] - fn cvtps2pd(a: __m128) -> __m128d; #[link_name = "llvm.x86.sse2.cvtpd2dq"] fn cvtpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvtsd2si"] From efb5a082245f39ed321206d78c02ae7928f9eb07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 22:48:44 +0200 Subject: [PATCH 03/15] Reimplement _mm_mul_epu32 and _mm256_mul_epu32 without LLVM intrinsics --- crates/core_arch/src/x86/avx2.rs | 7 ++++--- crates/core_arch/src/x86/sse2.rs | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index e23c795ee7..7ee5cee567 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2074,7 +2074,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3675,8 +3678,6 @@ extern "C" { fn pmulhw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmul.dq"] fn pmuldq(a: i32x8, b: i32x8) -> i64x4; - #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 8dbad60eed..46146aab54 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -303,7 +303,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuludq(a.as_u32x4(), b.as_u32x4())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Sum the absolute differences of packed unsigned 8-bit integers. @@ -2839,8 +2842,6 @@ extern "C" { fn pmulhw(a: i16x8, b: i16x8) -> i16x8; #[link_name = "llvm.x86.sse2.pmulhu.w"] fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse2.pmulu.dq"] - fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] From b6b9afc9b792ce83f2d0c242a42cbfc49afc9873 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 23:00:22 +0200 Subject: [PATCH 04/15] Reimplement _mm_mulhi_epi16, _mm_mulhi_epu16, _mm256_mulhi_epi16 and _mm256_mulhi_epu16 without LLVM intrinsics --- crates/core_arch/src/x86/avx2.rs | 14 ++++++++------ crates/core_arch/src/x86/sse2.rs | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 7ee5cee567..93cea66aa4 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2090,7 +2090,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) + let a = simd_cast::<_, i32x16>(a.as_i16x16()); + let b = simd_cast::<_, i32x16>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -2103,7 +2106,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3672,10 +3678,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulhu.w"] - fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulh.w"] - fn pmulhw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmul.dq"] fn pmuldq(a: i32x8, b: i32x8) -> i64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 46146aab54..107f0b0cf2 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -261,7 +261,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhw(a.as_i16x8(), b.as_i16x8())) + let a = simd_cast::<_, i32x8>(a.as_i16x8()); + let b = simd_cast::<_, i32x8>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. @@ -275,7 +278,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhuw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`. @@ -2838,10 +2844,6 @@ extern "C" { fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; - #[link_name = "llvm.x86.sse2.pmulh.w"] - fn pmulhw(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pmulhu.w"] - fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] From 2fe39cf9b12262da93d42fa6a49f140e2802be31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 23:14:09 +0200 Subject: [PATCH 05/15] Reimplement _mm_avg_epu8, _mm_avg_epu16, _mm256_avg_epu8 and _mm256_avg_epu16 without LLVM intrinsics --- crates/core_arch/src/x86/avx2.rs | 14 ++++++++------ crates/core_arch/src/x86/sse2.rs | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 93cea66aa4..80c59fe746 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgb(a.as_u8x32(), b.as_u8x32())) + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. @@ -3638,10 +3644,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a: u16x16, b: u16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pblendvb"] fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 107f0b0cf2..22fa62ed7d 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgb(a.as_u8x16(), b.as_u8x16())) + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) } /// Averages packed unsigned 16-bit integers in `a` and `b`. @@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) } /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. @@ -2838,10 +2844,6 @@ extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pavg.b"] - fn pavgb(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.sse2.pavg.w"] - fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; #[link_name = "llvm.x86.sse2.psad.bw"] From 5d054a6756a2ef15b6006289629a48a1b026b4bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Tue, 3 Oct 2023 23:38:47 +0200 Subject: [PATCH 06/15] Reimplement _mm_mul_epi32 and _mm256_mul_epi32 without LLVM intrinsics --- crates/core_arch/src/x86/avx2.rs | 6 +++--- crates/core_arch/src/x86/sse41.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 80c59fe746..7acfe33645 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2066,7 +2066,9 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __ #[cfg_attr(test, assert_instr(vpmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) + let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4())); + let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4())); + transmute(simd_mul(a, b)) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit @@ -3680,8 +3682,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmul.dq"] - fn pmuldq(a: i32x8, b: i32x8) -> i64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index a62beb6f68..444b599749 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -923,7 +923,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) } /// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate @@ -1154,8 +1156,6 @@ extern "C" { fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.phminposuw"] fn phminposuw(a: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse41.pmuldq"] - fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.mpsadbw"] fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; #[link_name = "llvm.x86.sse41.ptestz"] From fd3048416c21cd6a97374dbc2b6930915376b993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 19:14:56 +0200 Subject: [PATCH 07/15] Reimplement _mm_blendv_epi8 and _mm256_blendv_epi8 without LLVM intrinsics --- crates/core_arch/src/x86/avx2.rs | 5 ++--- crates/core_arch/src/x86/sse41.rs | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 7acfe33645..243a4cdab1 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -464,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m #[cfg_attr(test, assert_instr(vpblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0)); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -3646,8 +3647,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] fn phaddw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.phadd.d"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index 444b599749..aaa18703d9 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI #[cfg_attr(test, assert_instr(pblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { - transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) + let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0)); + transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16())) } /// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. @@ -1126,8 +1127,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.pblendvb"] - fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; #[link_name = "llvm.x86.sse41.blendvpd"] fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; #[link_name = "llvm.x86.sse41.blendvps"] From 6a58f7e83eb51e0bc3078456ec8598a5c4ca0dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 19:37:56 +0200 Subject: [PATCH 08/15] Reimplement _mm_blend_epi16 without LLVM intrinsics --- crates/core_arch/src/x86/sse41.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index aaa18703d9..00916ff8e8 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -75,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16) #[inline] #[target_feature(enable = "sse4.1")] -// Note: LLVM7 prefers the single-precision floating-point domain when possible -// see https://bugs.llvm.org/show_bug.cgi?id=38195 -// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))] -#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))] +#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))] #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8)) + transmute::(simd_shuffle!( + a.as_i16x8(), + b.as_i16x8(), + [ + [0, 8][IMM8 as usize & 1], + [1, 9][(IMM8 >> 1) as usize & 1], + [2, 10][(IMM8 >> 2) as usize & 1], + [3, 11][(IMM8 >> 3) as usize & 1], + [4, 12][(IMM8 >> 4) as usize & 1], + [5, 13][(IMM8 >> 5) as usize & 1], + [6, 14][(IMM8 >> 6) as usize & 1], + [7, 15][(IMM8 >> 7) as usize & 1], + ] + )) } /// Blend packed double-precision (64-bit) floating-point elements from `a` @@ -1135,8 +1145,6 @@ extern "C" { fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; #[link_name = "llvm.x86.sse41.blendps"] fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; - #[link_name = "llvm.x86.sse41.pblendw"] - fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.packusdw"] From 22ffb3ce6ca3a442a7ce32660ea970851d66eb98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 19:47:25 +0200 Subject: [PATCH 09/15] Reimplement _mm_blendv_pd and _mm256_blendv_pd without LLVM intrinsics --- crates/core_arch/src/x86/avx.rs | 5 ++--- crates/core_arch/src/x86/sse41.rs | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 00bcc1fa1b..88465b4312 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -511,7 +511,8 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vblendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - vblendvpd(a, b, c) + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0)); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) } /// Blends packed single-precision (32-bit) floating-point elements from @@ -2914,8 +2915,6 @@ extern "C" { fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.blendv.pd.256"] - fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; #[link_name = "llvm.x86.avx.blendv.ps.256"] fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index 00916ff8e8..15898eb7b3 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -105,7 +105,8 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128 #[cfg_attr(test, assert_instr(blendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { - blendvpd(a, b, mask) + let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0)); + transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2())) } /// Blend packed single-precision (32-bit) floating-point elements from `a` @@ -1137,8 +1138,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.blendvpd"] - fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; #[link_name = "llvm.x86.sse41.blendvps"] fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; #[link_name = "llvm.x86.sse41.blendpd"] From 1e87cb83b7b33d6b410549af9668033840875d9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 19:51:08 +0200 Subject: [PATCH 10/15] Reimplement _mm_blendv_ps and _mm256_blendv_ps without LLVM intrinsics --- crates/core_arch/src/x86/avx.rs | 5 ++--- crates/core_arch/src/x86/sse41.rs | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 88465b4312..28e1c6b35a 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -524,7 +524,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vblendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - vblendvps(a, b, c) + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0)); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) } /// Conditionally multiplies the packed single-precision (32-bit) floating-point @@ -2915,8 +2916,6 @@ extern "C" { fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.blendv.ps.256"] - fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256; #[link_name = "llvm.x86.avx.hadd.pd.256"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index 15898eb7b3..bad7ed2c68 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -118,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(blendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { - blendvps(a, b, mask) + let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0)); + transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4())) } /// Blend packed double-precision (64-bit) floating-point elements from `a` @@ -1138,8 +1139,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.blendvps"] - fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; #[link_name = "llvm.x86.sse41.blendpd"] fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; #[link_name = "llvm.x86.sse41.blendps"] From cbe563ff8fa74605ff60438c32798024d647d6ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 19:57:58 +0200 Subject: [PATCH 11/15] Reimplement _mm_blend_pd and _mm_blend_ps without LLVM intrinsics --- crates/core_arch/src/x86/sse41.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index bad7ed2c68..6351aa45ff 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -136,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM2, 2); - blendpd(a, b, IMM2 as u8) + transmute::(simd_shuffle!( + a.as_f64x2(), + b.as_f64x2(), + [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]] + )) } /// Blend packed single-precision (32-bit) floating-point elements from `a` @@ -150,7 +154,16 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM4, 4); - blendps(a, b, IMM4 as u8) + transmute::(simd_shuffle!( + a.as_f32x4(), + b.as_f32x4(), + [ + [0, 4][IMM4 as usize & 1], + [1, 5][(IMM4 >> 1) as usize & 1], + [2, 6][(IMM4 >> 2) as usize & 1], + [3, 7][(IMM4 >> 3) as usize & 1], + ] + )) } /// Extracts a single-precision (32-bit) floating-point element from `a`, @@ -1139,10 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.blendpd"] - fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; - #[link_name = "llvm.x86.sse41.blendps"] - fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.packusdw"] From 01c610f34d53f1cf973490be208959d85d4e5c1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 20:22:41 +0200 Subject: [PATCH 12/15] Reimplement _mm_addsub_ps and _mm_addsub_pd without LLVM intrinsics --- crates/core_arch/src/x86/sse3.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/crates/core_arch/src/x86/sse3.rs b/crates/core_arch/src/x86/sse3.rs index 092a8d9cd5..df0d78e5bf 100644 --- a/crates/core_arch/src/x86/sse3.rs +++ b/crates/core_arch/src/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use crate::{ - core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*}, + core_arch::{simd::*, simd_llvm::*, x86::*}, mem::transmute, }; @@ -17,7 +17,11 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(addsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { - addsubps(a, b) + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively add and subtract packed double-precision (64-bit) @@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(addsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { - addsubpd(a, b) + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [2, 1]) } /// Horizontally adds adjacent pairs of double-precision (64-bit) @@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse3.addsub.ps"] - fn addsubps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.addsub.pd"] - fn addsubpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.pd"] fn haddpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.ps"] From b45868dd0981107105fbca4336fee5b06ceb6905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 20:26:08 +0200 Subject: [PATCH 13/15] Reimplement _mm256_addsub_ps and _mm256_addsub_pd without LLVM intrinsics --- crates/core_arch/src/x86/avx.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 28e1c6b35a..55a90f70b5 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vaddsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { - addsubpd256(a, b) + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively adds and subtracts packed single-precision (32-bit) @@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vaddsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { - addsubps256(a, b) + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Subtracts packed double-precision (64-bit) floating-point elements in `b` @@ -2906,10 +2914,6 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { // LLVM intrinsics used in the above functions #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx.addsub.pd.256"] - fn addsubpd256(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.addsub.ps.256"] - fn addsubps256(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.round.pd.256"] fn roundpd256(a: __m256d, b: i32) -> __m256d; #[link_name = "llvm.x86.avx.round.ps.256"] From 07f63ec0c44a861089aa77f36286ed0a3c61fda1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 20:34:31 +0200 Subject: [PATCH 14/15] Reimplement _mm_movemask_ps and _mm_movemask_pd without LLVM intrinsics --- crates/core_arch/src/x86/sse.rs | 4 +--- crates/core_arch/src/x86/sse2.rs | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 5050cd770b..c325e3939e 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1081,7 +1081,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(movmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { - movmskps(a) + simd_bitmask::(transmute(a)).into() } /// Construct a `__m128` with the lowest element read from `p` and the other @@ -1885,8 +1885,6 @@ extern "C" { fn maxss(a: __m128, b: __m128) -> __m128; #[link_name = "llvm.x86.sse.max.ps"] fn maxps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.movmsk.ps"] - fn movmskps(a: __m128) -> i32; #[link_name = "llvm.x86.sse.cmp.ps"] fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.sse.comieq.ss"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 22fa62ed7d..0ef4e7dbc7 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -2450,7 +2450,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { #[cfg_attr(test, assert_instr(movmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { - movmskpd(a) + simd_bitmask::(transmute(a)).into() } /// Loads 128-bits (composed of 2 packed double-precision (64-bit) @@ -2914,8 +2914,6 @@ extern "C" { fn ucomigesd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.sse2.ucomineq.sd"] fn ucomineqsd(a: __m128d, b: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.movmsk.pd"] - fn movmskpd(a: __m128d) -> i32; #[link_name = "llvm.x86.sse2.cvtpd2dq"] fn cvtpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvtsd2si"] From 93163d599ce7ca330750c000f9f22c9b4338fa00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eduardo=20S=C3=A1nchez=20Mu=C3=B1oz?= Date: Wed, 4 Oct 2023 20:37:54 +0200 Subject: [PATCH 15/15] Reimplement _mm256_movemask_ps and _mm256_movemask_pd without LLVM intrinsics --- crates/core_arch/src/x86/avx.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 55a90f70b5..074bf744d5 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -2066,7 +2066,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { #[cfg_attr(test, assert_instr(vmovmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { - movmskpd256(a) + simd_bitmask::(transmute(a)).into() } /// Sets each bit of the returned mask based on the most significant bit of the @@ -2079,7 +2079,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { #[cfg_attr(test, assert_instr(vmovmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { - movmskps256(a) + simd_bitmask::(transmute(a)).into() } /// Returns vector of type __m256d with all elements set to zero. @@ -3028,10 +3028,6 @@ extern "C" { fn vtestcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.avx.movmsk.pd.256"] - fn movmskpd256(a: __m256d) -> i32; - #[link_name = "llvm.x86.avx.movmsk.ps.256"] - fn movmskps256(a: __m256) -> i32; #[link_name = "llvm.x86.avx.min.ps.256"] fn vminps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.max.ps.256"]