diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 00bcc1fa1b..074bf744d5 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -268,7 +268,11 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vaddsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { - addsubpd256(a, b) + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively adds and subtracts packed single-precision (32-bit) @@ -280,7 +284,11 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vaddsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { - addsubps256(a, b) + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) } /// Subtracts packed double-precision (64-bit) floating-point elements in `b` @@ -511,7 +519,8 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vblendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { - vblendvpd(a, b, c) + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::splat(0)); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) } /// Blends packed single-precision (32-bit) floating-point elements from @@ -523,7 +532,8 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vblendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - vblendvps(a, b, c) + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::splat(0)); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) } /// Conditionally multiplies the packed single-precision (32-bit) floating-point @@ -2056,7 +2066,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { #[cfg_attr(test, assert_instr(vmovmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { - movmskpd256(a) + simd_bitmask::(transmute(a)).into() } /// Sets each bit of the returned mask based on the most significant bit of the @@ -2069,7 +2079,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { #[cfg_attr(test, assert_instr(vmovmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { - movmskps256(a) + simd_bitmask::(transmute(a)).into() } /// Returns vector of type __m256d with all elements set to zero. @@ -2904,20 +2914,12 @@ pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { // LLVM intrinsics used in the above functions #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.avx.addsub.pd.256"] - fn addsubpd256(a: __m256d, b: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.addsub.ps.256"] - fn addsubps256(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.round.pd.256"] fn roundpd256(a: __m256d, b: i32) -> __m256d; #[link_name = "llvm.x86.avx.round.ps.256"] fn roundps256(a: __m256, b: i32) -> __m256; #[link_name = "llvm.x86.avx.sqrt.ps.256"] fn sqrtps256(a: __m256) -> __m256; - #[link_name = "llvm.x86.avx.blendv.pd.256"] - fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; - #[link_name = "llvm.x86.avx.blendv.ps.256"] - fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; #[link_name = "llvm.x86.avx.dp.ps.256"] fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256; #[link_name = "llvm.x86.avx.hadd.pd.256"] @@ -3026,10 +3028,6 @@ extern "C" { fn vtestcps(a: __m128, b: __m128) -> i32; #[link_name = "llvm.x86.avx.vtestnzc.ps"] fn vtestnzcps(a: __m128, b: __m128) -> i32; - #[link_name = "llvm.x86.avx.movmsk.pd.256"] - fn movmskpd256(a: __m256d) -> i32; - #[link_name = "llvm.x86.avx.movmsk.ps.256"] - fn movmskps256(a: __m256) -> i32; #[link_name = "llvm.x86.avx.min.ps.256"] fn vminps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.avx.max.ps.256"] diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index e23c795ee7..243a4cdab1 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -344,7 +344,10 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<_, u16x16>(r)) } /// Averages packed unsigned 8-bit integers in `a` and `b`. @@ -355,7 +358,10 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - transmute(pavgb(a.as_u8x32(), b.as_u8x32())) + let a = simd_cast::<_, u16x32>(a.as_u8x32()); + let b = simd_cast::<_, u16x32>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<_, u8x32>(r)) } /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. @@ -458,7 +464,8 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m #[cfg_attr(test, assert_instr(vpblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32())) + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::splat(0)); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) } /// Broadcasts the low packed 8-bit integer from `a` to all elements of @@ -2060,7 +2067,9 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __ #[cfg_attr(test, assert_instr(vpmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) + let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4())); + let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4())); + transmute(simd_mul(a, b)) } /// Multiplies the low unsigned 32-bit integers from each packed 64-bit @@ -2074,7 +2083,10 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -2087,7 +2099,10 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) + let a = simd_cast::<_, i32x16>(a.as_i16x16()); + let b = simd_cast::<_, i32x16>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing @@ -2100,7 +2115,10 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vpmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { - transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) + let a = simd_cast::<_, u32x16>(a.as_u16x16()); + let b = simd_cast::<_, u32x16>(b.as_u16x16()); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`, producing @@ -3629,12 +3647,6 @@ extern "C" { fn pabsw(a: i16x16) -> u16x16; #[link_name = "llvm.x86.avx2.pabs.d"] fn pabsd(a: i32x8) -> u32x8; - #[link_name = "llvm.x86.avx2.pavg.b"] - fn pavgb(a: u8x32, b: u8x32) -> u8x32; - #[link_name = "llvm.x86.avx2.pavg.w"] - fn pavgw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pblendvb"] - fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32; #[link_name = "llvm.x86.avx2.phadd.w"] fn phaddw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.phadd.d"] @@ -3669,14 +3681,6 @@ extern "C" { fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4); #[link_name = "llvm.x86.avx2.mpsadbw"] fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulhu.w"] - fn pmulhuw(a: u16x16, b: u16x16) -> u16x16; - #[link_name = "llvm.x86.avx2.pmulh.w"] - fn pmulhw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.pmul.dq"] - fn pmuldq(a: i32x8, b: i32x8) -> i64x4; - #[link_name = "llvm.x86.avx2.pmulu.dq"] - fn pmuludq(a: u32x8, b: u32x8) -> u64x4; #[link_name = "llvm.x86.avx2.pmul.hr.sw"] fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.packsswb"] diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 5050cd770b..c325e3939e 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1081,7 +1081,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(movmskps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { - movmskps(a) + simd_bitmask::(transmute(a)).into() } /// Construct a `__m128` with the lowest element read from `p` and the other @@ -1885,8 +1885,6 @@ extern "C" { fn maxss(a: __m128, b: __m128) -> __m128; #[link_name = "llvm.x86.sse.max.ps"] fn maxps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse.movmsk.ps"] - fn movmskps(a: __m128) -> i32; #[link_name = "llvm.x86.sse.cmp.ps"] fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.sse.comieq.ss"] diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index 3fe81e0048..0ef4e7dbc7 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -165,7 +165,10 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgb(a.as_u8x16(), b.as_u8x16())) + let a = simd_cast::<_, u16x16>(a.as_u8x16()); + let b = simd_cast::<_, u16x16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<_, u8x16>(r)) } /// Averages packed unsigned 16-bit integers in `a` and `b`. @@ -176,7 +179,10 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pavgw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pavgw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<_, u16x8>(r)) } /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. @@ -261,7 +267,10 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhw(a.as_i16x8(), b.as_i16x8())) + let a = simd_cast::<_, i32x8>(a.as_i16x8()); + let b = simd_cast::<_, i32x8>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. @@ -275,7 +284,10 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmulhuw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - transmute(pmulhuw(a.as_u16x8(), b.as_u16x8())) + let a = simd_cast::<_, u32x8>(a.as_u16x8()); + let b = simd_cast::<_, u32x8>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::(r)) } /// Multiplies the packed 16-bit integers in `a` and `b`. @@ -303,7 +315,10 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuludq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuludq(a.as_u32x4(), b.as_u32x4())) + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) } /// Sum the absolute differences of packed unsigned 8-bit integers. @@ -952,7 +967,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { #[cfg_attr(test, assert_instr(cvtdq2ps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { - cvtdq2ps(a.as_i32x4()) + transmute(simd_cast::<_, f32x4>(a.as_i32x4())) } /// Converts packed single-precision (32-bit) floating-point elements in `a` @@ -2240,7 +2255,9 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { #[cfg_attr(test, assert_instr(cvtpd2ps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { - cvtpd2ps(a) + let r = simd_cast::<_, f32x2>(a.as_f64x2()); + let zero = f32x2::new(0.0, 0.0); + transmute::(simd_shuffle!(r, zero, [0, 1, 2, 3])) } /// Converts packed single-precision (32-bit) floating-point elements in `a` to @@ -2253,7 +2270,8 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { #[cfg_attr(test, assert_instr(cvtps2pd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { - cvtps2pd(a) + let a = a.as_f32x4(); + transmute(simd_cast::(simd_shuffle!(a, a, [0, 1]))) } /// Converts packed double-precision (64-bit) floating-point elements in `a` to @@ -2432,7 +2450,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { #[cfg_attr(test, assert_instr(movmskpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { - movmskpd(a) + simd_bitmask::(transmute(a)).into() } /// Loads 128-bits (composed of 2 packed double-precision (64-bit) @@ -2826,18 +2844,8 @@ extern "C" { fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] fn mfence(); - #[link_name = "llvm.x86.sse2.pavg.b"] - fn pavgb(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.sse2.pavg.w"] - fn pavgw(a: u16x8, b: u16x8) -> u16x8; #[link_name = "llvm.x86.sse2.pmadd.wd"] fn pmaddwd(a: i16x8, b: i16x8) -> i32x4; - #[link_name = "llvm.x86.sse2.pmulh.w"] - fn pmulhw(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.sse2.pmulhu.w"] - fn pmulhuw(a: u16x8, b: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse2.pmulu.dq"] - fn pmuludq(a: u32x4, b: u32x4) -> u64x2; #[link_name = "llvm.x86.sse2.psad.bw"] fn psadbw(a: u8x16, b: u8x16) -> u64x2; #[link_name = "llvm.x86.sse2.psll.w"] @@ -2856,8 +2864,6 @@ extern "C" { fn psrld(a: i32x4, count: i32x4) -> i32x4; #[link_name = "llvm.x86.sse2.psrl.q"] fn psrlq(a: i64x2, count: i64x2) -> i64x2; - #[link_name = "llvm.x86.sse2.cvtdq2ps"] - fn cvtdq2ps(a: i32x4) -> __m128; #[link_name = "llvm.x86.sse2.cvtps2dq"] fn cvtps2dq(a: __m128) -> i32x4; #[link_name = "llvm.x86.sse2.maskmov.dqu"] @@ -2908,12 +2914,6 @@ extern "C" { fn ucomigesd(a: __m128d, b: __m128d) -> i32; #[link_name = "llvm.x86.sse2.ucomineq.sd"] fn ucomineqsd(a: __m128d, b: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.movmsk.pd"] - fn movmskpd(a: __m128d) -> i32; - #[link_name = "llvm.x86.sse2.cvtpd2ps"] - fn cvtpd2ps(a: __m128d) -> __m128; - #[link_name = "llvm.x86.sse2.cvtps2pd"] - fn cvtps2pd(a: __m128) -> __m128d; #[link_name = "llvm.x86.sse2.cvtpd2dq"] fn cvtpd2dq(a: __m128d) -> i32x4; #[link_name = "llvm.x86.sse2.cvtsd2si"] diff --git a/crates/core_arch/src/x86/sse3.rs b/crates/core_arch/src/x86/sse3.rs index 092a8d9cd5..df0d78e5bf 100644 --- a/crates/core_arch/src/x86/sse3.rs +++ b/crates/core_arch/src/x86/sse3.rs @@ -1,7 +1,7 @@ //! Streaming SIMD Extensions 3 (SSE3) use crate::{ - core_arch::{simd::*, simd_llvm::simd_shuffle, x86::*}, + core_arch::{simd::*, simd_llvm::*, x86::*}, mem::transmute, }; @@ -17,7 +17,11 @@ use stdarch_test::assert_instr; #[cfg_attr(test, assert_instr(addsubps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { - addsubps(a, b) + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [4, 1, 6, 3]) } /// Alternatively add and subtract packed double-precision (64-bit) @@ -29,7 +33,11 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(addsubpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { - addsubpd(a, b) + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let add = simd_add(a, b); + let sub = simd_sub(a, b); + simd_shuffle!(add, sub, [2, 1]) } /// Horizontally adds adjacent pairs of double-precision (64-bit) @@ -143,10 +151,6 @@ pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse3.addsub.ps"] - fn addsubps(a: __m128, b: __m128) -> __m128; - #[link_name = "llvm.x86.sse3.addsub.pd"] - fn addsubpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.pd"] fn haddpd(a: __m128d, b: __m128d) -> __m128d; #[link_name = "llvm.x86.sse3.hadd.ps"] diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index a62beb6f68..6351aa45ff 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -62,7 +62,8 @@ pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTI #[cfg_attr(test, assert_instr(pblendvb))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { - transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) + let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::splat(0)); + transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16())) } /// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`. @@ -74,15 +75,25 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16) #[inline] #[target_feature(enable = "sse4.1")] -// Note: LLVM7 prefers the single-precision floating-point domain when possible -// see https://bugs.llvm.org/show_bug.cgi?id=38195 -// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))] -#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))] +#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))] #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); - transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8)) + transmute::(simd_shuffle!( + a.as_i16x8(), + b.as_i16x8(), + [ + [0, 8][IMM8 as usize & 1], + [1, 9][(IMM8 >> 1) as usize & 1], + [2, 10][(IMM8 >> 2) as usize & 1], + [3, 11][(IMM8 >> 3) as usize & 1], + [4, 12][(IMM8 >> 4) as usize & 1], + [5, 13][(IMM8 >> 5) as usize & 1], + [6, 14][(IMM8 >> 6) as usize & 1], + [7, 15][(IMM8 >> 7) as usize & 1], + ] + )) } /// Blend packed double-precision (64-bit) floating-point elements from `a` @@ -94,7 +105,8 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128 #[cfg_attr(test, assert_instr(blendvpd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { - blendvpd(a, b, mask) + let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::splat(0)); + transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2())) } /// Blend packed single-precision (32-bit) floating-point elements from `a` @@ -106,7 +118,8 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(blendvps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { - blendvps(a, b, mask) + let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::splat(0)); + transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4())) } /// Blend packed double-precision (64-bit) floating-point elements from `a` @@ -123,7 +136,11 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM2, 2); - blendpd(a, b, IMM2 as u8) + transmute::(simd_shuffle!( + a.as_f64x2(), + b.as_f64x2(), + [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]] + )) } /// Blend packed single-precision (32-bit) floating-point elements from `a` @@ -137,7 +154,16 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM4, 4); - blendps(a, b, IMM4 as u8) + transmute::(simd_shuffle!( + a.as_f32x4(), + b.as_f32x4(), + [ + [0, 4][IMM4 as usize & 1], + [1, 5][(IMM4 >> 1) as usize & 1], + [2, 6][(IMM4 >> 2) as usize & 1], + [3, 7][(IMM4 >> 3) as usize & 1], + ] + )) } /// Extracts a single-precision (32-bit) floating-point element from `a`, @@ -923,7 +949,9 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(pmuldq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) + let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2())); + let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2())); + transmute(simd_mul(a, b)) } /// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate @@ -1124,18 +1152,6 @@ pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { #[allow(improper_ctypes)] extern "C" { - #[link_name = "llvm.x86.sse41.pblendvb"] - fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16; - #[link_name = "llvm.x86.sse41.blendvpd"] - fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; - #[link_name = "llvm.x86.sse41.blendvps"] - fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128; - #[link_name = "llvm.x86.sse41.blendpd"] - fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d; - #[link_name = "llvm.x86.sse41.blendps"] - fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128; - #[link_name = "llvm.x86.sse41.pblendw"] - fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8; #[link_name = "llvm.x86.sse41.insertps"] fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128; #[link_name = "llvm.x86.sse41.packusdw"] @@ -1154,8 +1170,6 @@ extern "C" { fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128; #[link_name = "llvm.x86.sse41.phminposuw"] fn phminposuw(a: u16x8) -> u16x8; - #[link_name = "llvm.x86.sse41.pmuldq"] - fn pmuldq(a: i32x4, b: i32x4) -> i64x2; #[link_name = "llvm.x86.sse41.mpsadbw"] fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8; #[link_name = "llvm.x86.sse41.ptestz"]