diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index a85d979a59..0d4c77a737 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -619,6 +619,22 @@ pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { read_unaligned(ptr.cast()) } +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t { + read_unaligned(ptr.cast()) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + read_unaligned(ptr.cast()) +} + /// Load multiple single-element structures to one, two, three, or four registers. #[inline] #[target_feature(enable = "neon")] @@ -651,6 +667,43 @@ pub unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t { read_unaligned(ptr.cast()) } +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t { + vld1_f64(ptr) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t { + let x = vld1q_lane_f64::<0>(ptr, transmute(f64x2::splat(0.))); + simd_shuffle2!(x, x, [0, 0]) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(ldr, LANE = 0))] +pub unsafe fn vld1_lane_f64(ptr: *const f64, src: float64x1_t) -> float64x1_t { + static_assert!(LANE : i32 where LANE == 0); + simd_insert(src, LANE as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(ldr, LANE = 1))] +pub unsafe fn vld1q_lane_f64(ptr: *const f64, src: float64x2_t) -> float64x2_t { + static_assert_imm1!(LANE); + simd_insert(src, LANE as u32, *ptr) +} + /// Store multiple single-element structures from one, two, three, or four registers. #[inline] #[target_feature(enable = "neon")] @@ -4700,6 +4753,56 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld1_f64() { + let a: [f64; 2] = [0., 1.]; + let e = f64x1::new(1.); + let r: f64x1 = transmute(vld1_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_f64() { + let a: [f64; 3] = [0., 1., 2.]; + let e = f64x2::new(1., 2.); + let r: f64x2 = transmute(vld1q_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_f64() { + let a: [f64; 2] = [1., 42.]; + let e = f64x1::new(42.); + let r: f64x1 = transmute(vld1_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_f64() { + let elem: f64 = 42.; + let e = f64x2::new(42., 42.); + let r: f64x2 = transmute(vld1q_dup_f64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_f64() { + let a = f64x1::new(0.); + let elem: f64 = 42.; + let e = f64x1::new(42.); + let r: f64x1 = transmute(vld1_lane_f64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_f64() { + let a = f64x2::new(0., 1.); + let elem: f64 = 42.; + let e = f64x2::new(0., 42.); + let r: f64x2 = transmute(vld1q_lane_f64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_p64() { let mut vals = [0_u64; 2]; diff --git a/crates/core_arch/src/arm/neon.rs b/crates/core_arch/src/arm/neon.rs index cf3b16f9a0..b38a4f9227 100644 --- a/crates/core_arch/src/arm/neon.rs +++ b/crates/core_arch/src/arm/neon.rs @@ -288,6 +288,22 @@ pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { transmute(vld1q_v8i16(ptr as *const i8, align_of::() as i32)) } +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(vldr))] +pub unsafe fn vld1_p64(ptr: *const p64) -> poly64x1_t { + transmute(vld1_v1i64(ptr as *const i8, align_of::() as i32)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr("vld1.64"))] +pub unsafe fn vld1q_p64(ptr: *const p64) -> poly64x2_t { + transmute(vld1q_v2i64(ptr as *const i8, align_of::() as i32)) +} + /// Load multiple single-element structures to one, two, three, or four registers. #[inline] #[target_feature(enable = "neon,v7")] diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index e8b76ae377..7eeaa91049 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -2087,7 +2087,7 @@ pub unsafe fn vcreate_p16(a: u64) -> poly16x4_t { /// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vcreate_p64(a: u64) -> poly64x1_t { @@ -6602,6 +6602,66 @@ pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t { transmute(vld1q_s16_x4(transmute(a))) } +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1_p64_x2(a: *const p64) -> poly64x1x2_t { + transmute(vld1_s64_x2(transmute(a))) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1_p64_x3(a: *const p64) -> poly64x1x3_t { + transmute(vld1_s64_x3(transmute(a))) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t { + transmute(vld1_s64_x4(transmute(a))) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t { + transmute(vld1q_s64_x2(transmute(a))) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { + transmute(vld1q_s64_x3(transmute(a))) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(ldr))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1))] +pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t { + transmute(vld1q_s64_x4(transmute(a))) +} + /// Load multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] @@ -8528,7 +8588,7 @@ vmull_p8_(a, b) #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmullh_n_s16(a: int16x4_t, b: i16) -> int32x4_t { +pub unsafe fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { vmull_s16(a, vdup_n_s16(b)) } @@ -8538,7 +8598,7 @@ pub unsafe fn vmullh_n_s16(a: int16x4_t, b: i16) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))] -pub unsafe fn vmulls_n_s32(a: int32x2_t, b: i32) -> int64x2_t { +pub unsafe fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { vmull_s32(a, vdup_n_s32(b)) } @@ -8548,7 +8608,7 @@ pub unsafe fn vmulls_n_s32(a: int32x2_t, b: i32) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmullh_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { +pub unsafe fn vmull_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { vmull_u16(a, vdup_n_u16(b)) } @@ -8558,7 +8618,7 @@ pub unsafe fn vmullh_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmull))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmulls_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t { +pub unsafe fn vmull_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t { vmull_u32(a, vdup_n_u32(b)) } @@ -10198,7 +10258,7 @@ pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t { +pub unsafe fn vqdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t { let b: int16x8_t = vdupq_n_s16(b); vqdmulhq_s16(a, b) } @@ -10209,7 +10269,7 @@ pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))] -pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t { +pub unsafe fn vqdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t { let b: int32x4_t = vdupq_n_s32(b); vqdmulhq_s32(a, b) } @@ -15277,7 +15337,7 @@ pub unsafe fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_ /// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -15409,7 +15469,7 @@ pub unsafe fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8 /// Insert vector element from another vector element #[inline] #[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "crypto,v8"))] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] #[rustc_legacy_const_generics(2)] @@ -21481,6 +21541,54 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p64_x2() { + let a: [u64; 3] = [0, 1, 2]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld1_p64_x2(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p64_x3() { + let a: [u64; 4] = [0, 1, 2, 3]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(3)]; + let r: [i64x1; 3] = transmute(vld1_p64_x3(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p64_x4() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(3), i64x1::new(4)]; + let r: [i64x1; 4] = transmute(vld1_p64_x4(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p64_x2() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(3, 4)]; + let r: [i64x2; 2] = transmute(vld1q_p64_x2(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p64_x3() { + let a: [u64; 7] = [0, 1, 2, 3, 4, 5, 6]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6)]; + let r: [i64x2; 3] = transmute(vld1q_p64_x3(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p64_x4() { + let a: [u64; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(3, 4), i64x2::new(5, 6), i64x2::new(7, 8)]; + let r: [i64x2; 4] = transmute(vld1q_p64_x4(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld1_f32_x2() { let a: [f32; 5] = [0., 1., 2., 3., 4.]; @@ -22601,38 +22709,38 @@ mod test { } #[simd_test(enable = "neon")] - unsafe fn test_vmullh_n_s16() { + unsafe fn test_vmull_n_s16() { let a: i16x4 = i16x4::new(1, 2, 3, 4); let b: i16 = 2; let e: i32x4 = i32x4::new(2, 4, 6, 8); - let r: i32x4 = transmute(vmullh_n_s16(transmute(a), transmute(b))); + let r: i32x4 = transmute(vmull_n_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmulls_n_s32() { + unsafe fn test_vmull_n_s32() { let a: i32x2 = i32x2::new(1, 2); let b: i32 = 2; let e: i64x2 = i64x2::new(2, 4); - let r: i64x2 = transmute(vmulls_n_s32(transmute(a), transmute(b))); + let r: i64x2 = transmute(vmull_n_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmullh_n_u16() { + unsafe fn test_vmull_n_u16() { let a: u16x4 = u16x4::new(1, 2, 3, 4); let b: u16 = 2; let e: u32x4 = u32x4::new(2, 4, 6, 8); - let r: u32x4 = transmute(vmullh_n_u16(transmute(a), transmute(b))); + let r: u32x4 = transmute(vmull_n_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmulls_n_u32() { + unsafe fn test_vmull_n_u32() { let a: u32x2 = u32x2::new(1, 2); let b: u32 = 2; let e: u64x2 = u64x2::new(2, 4); - let r: u64x2 = transmute(vmulls_n_u32(transmute(a), transmute(b))); + let r: u64x2 = transmute(vmull_n_u32(transmute(a), transmute(b))); assert_eq!(r, e); } @@ -23797,20 +23905,20 @@ mod test { } #[simd_test(enable = "neon")] - unsafe fn test_vqdmulhq_nq_s16() { + unsafe fn test_vqdmulhq_n_s16() { let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); let b: i16 = 2; let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let r: i16x8 = transmute(vqdmulhq_nq_s16(transmute(a), transmute(b))); + let r: i16x8 = transmute(vqdmulhq_n_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vqdmulhq_nq_s32() { + unsafe fn test_vqdmulhq_n_s32() { let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); let b: i32 = 2; let e: i32x4 = i32x4::new(1, 1, 1, 1); - let r: i32x4 = transmute(vqdmulhq_nq_s32(transmute(a), transmute(b))); + let r: i32x4 = transmute(vqdmulhq_n_s32(transmute(a), transmute(b))); assert_eq!(r, e); } diff --git a/crates/core_arch/src/arm_shared/neon/load_tests.rs b/crates/core_arch/src/arm_shared/neon/load_tests.rs index 82e2f74955..bbee29ae7a 100644 --- a/crates/core_arch/src/arm_shared/neon/load_tests.rs +++ b/crates/core_arch/src/arm_shared/neon/load_tests.rs @@ -173,6 +173,22 @@ unsafe fn test_vld1q_p16() { assert_eq!(r, e) } +#[simd_test(enable = "neon,aes")] +unsafe fn test_vld1_p64() { + let a: [p64; 2] = [0, 1]; + let e = u64x1::new(1); + let r: u64x1 = transmute(vld1_p64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon,aes")] +unsafe fn test_vld1q_p64() { + let a: [p64; 3] = [0, 1, 2]; + let e = u64x2::new(1, 2); + let r: u64x2 = transmute(vld1q_p64(a[1..].as_ptr())); + assert_eq!(r, e) +} + #[simd_test(enable = "neon")] unsafe fn test_vld1_f32() { let a: [f32; 3] = [0., 1., 2.]; @@ -188,21 +204,3 @@ unsafe fn test_vld1q_f32() { let r: f32x4 = transmute(vld1q_f32(a[1..].as_ptr())); assert_eq!(r, e) } - -#[cfg(target_arch = "aarch64")] -#[simd_test(enable = "neon")] -unsafe fn test_vld1_f64() { - let a: [f64; 2] = [0., 1.]; - let e = f64x1::new(1.); - let r: f64x1 = transmute(vld1_f64(a[1..].as_ptr())); - assert_eq!(r, e) -} - -#[cfg(target_arch = "aarch64")] -#[simd_test(enable = "neon")] -unsafe fn test_vld1q_f64() { - let a: [f64; 3] = [0., 1., 2.]; - let e = f64x2::new(1., 2.); - let r: f64x2 = transmute(vld1q_f64(a[1..].as_ptr())); - assert_eq!(r, e) -} diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index 369bf07e18..2885d250fd 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -362,6 +362,36 @@ pub struct uint64x2x4_t( pub uint64x2_t, ); +/// ARM-specific type containing four `poly64x1_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x1x2_t(pub poly64x1_t, pub poly64x1_t); +/// ARM-specific type containing four `poly64x1_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x1x3_t(pub poly64x1_t, pub poly64x1_t, pub poly64x1_t); +/// ARM-specific type containing four `poly64x1_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x1x4_t( + pub poly64x1_t, + pub poly64x1_t, + pub poly64x1_t, + pub poly64x1_t, +); + +/// ARM-specific type containing four `poly64x2_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x2x2_t(pub poly64x2_t, pub poly64x2_t); +/// ARM-specific type containing four `poly64x2_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x2x3_t(pub poly64x2_t, pub poly64x2_t, pub poly64x2_t); +/// ARM-specific type containing four `poly64x2_t` vectors. +#[derive(Copy, Clone)] +pub struct poly64x2x4_t( + pub poly64x2_t, + pub poly64x2_t, + pub poly64x2_t, + pub poly64x2_t, +); + #[allow(improper_ctypes)] extern "unadjusted" { // absolute value (64-bit) @@ -793,6 +823,30 @@ pub unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t) simd_insert(src, LANE as u32, *ptr) } +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, LANE = 0))] +pub unsafe fn vld1_lane_p64(ptr: *const p64, src: poly64x1_t) -> poly64x1_t { + static_assert!(LANE : i32 where LANE == 0); + simd_insert(src, LANE as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_legacy_const_generics(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, LANE = 1))] +pub unsafe fn vld1q_lane_p64(ptr: *const p64, src: poly64x2_t) -> poly64x2_t { + static_assert_imm1!(LANE); + simd_insert(src, LANE as u32, *ptr) +} + /// Load one single-element structure to one lane of one register. #[inline] #[target_feature(enable = "neon")] @@ -1060,6 +1114,34 @@ pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t { simd_shuffle2!(x, x, [0, 0]) } +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))] +pub unsafe fn vld1_dup_p64(ptr: *const p64) -> poly64x1_t { + #[cfg(target_arch = "aarch64")] + { + crate::core_arch::aarch64::vld1_p64(ptr) + } + #[cfg(target_arch = "arm")] + { + crate::core_arch::arm::vld1_p64(ptr) + } +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_p64(ptr: *const p64) -> poly64x2_t { + let x = vld1q_lane_p64::<0>(ptr, transmute(u64x2::splat(0))); + simd_shuffle2!(x, x, [0, 0]) +} + /// Load one single-element structure and Replicate to all lanes (of one register). #[inline] #[target_feature(enable = "neon")] @@ -4873,6 +4955,24 @@ mod tests { assert_eq!(r, e) } + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_lane_p64() { + let a = u64x1::new(0); + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_lane_p64::<0>(&elem, transmute(a))); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_lane_p64() { + let a = u64x2::new(0, 1); + let elem: u64 = 42; + let e = u64x2::new(0, 42); + let r: u64x2 = transmute(vld1q_lane_p64::<1>(&elem, transmute(a))); + assert_eq!(r, e) + } + #[simd_test(enable = "neon")] unsafe fn test_vld1_lane_f32() { let a = f32x2::new(0., 1.); @@ -5057,6 +5157,22 @@ mod tests { assert_eq!(r, e) } + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_dup_p64() { + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_dup_p64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_dup_p64() { + let elem: u64 = 42; + let e = u64x2::new(42, 42); + let r: u64x2 = transmute(vld1q_dup_p64(&elem)); + assert_eq!(r, e) + } + #[simd_test(enable = "neon")] unsafe fn test_vld1_dup_f32() { let elem: f32 = 42.; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 789a394885..c523822d22 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2080,6 +2080,11 @@ generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t +target = aes +generate *const p64:poly64x1x2_t +arm = ldr +generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t +generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t /// Load multiple single-element structures to one, two, three, or four registers name = vld1 @@ -2111,6 +2116,166 @@ link-aarch64 = ld1x4._EXT2_ link-arm = vld1x4._EXT2_ generate *const f32:float32x2x4_t, *const f32:float32x4x4_t +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +load_fn + +aarch64 = ld2 +link-aarch64 = ld2._EXTv2_ +arm = vld2 +link-arm = vld2._EXTpi82_ +//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)} +a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +load_fn + +aarch64 = ld2 +arm = vld2 +//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t +//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-nox +a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. +validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. +load_fn + +aarch64 = ld2 +link-aarch64 = ld2._EXTv2_ +//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t + +arm = vld2 +link-arm = vld2._EXTpi82_ +//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn + +arm = vld2dup +link-arm = vld2dup._EXTpi82_ +aarch64 = ld2r +link-aarch64 = ld2r._EXT2_ +//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)} +a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 +validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +load_fn + +arm = vld2dup +aarch64 = ld2r +//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t +//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t + +/// Load single 2-element structure and replicate to all lanes of two registers +name = vld2 +out-dup-nox +a = 0., 1., 1., 2., 3., 1., 4., 3., 5. +validate 1., 1., 1., 1., 1., 1., 1., 1. +load_fn + +aarch64 = ld2r +link-aarch64 = ld2r._EXT2_ +//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t + +arm = vld2dup +link-arm = vld2dup._EXTpi82_ +//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +load_fn +arm-aarch64-separate + +aarch64 = ld2lane +const-aarch64 = LANE +link-aarch64 = ld2lane._EXTpi82_ +//generate *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t + +arm = vld2lane +const-arm = LANE +link-arm = vld2lane._EXTpi82_ +//generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t +//generate *const i8:int8x16x2_t:int8x16x2_t, *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +multi_fn = transmute, {vld2-outsignedlanenox-::, transmute(a), transmute(b)} +constn = LANE +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 +b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +n = 0 +validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +load_fn +arm-aarch64-separate + +aarch64 = ld2lane +const-aarch64 = LANE + +target = aes +//generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t + +target = default +//generate *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t + +arm = vld2lane +const-arm = LANE +//generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t +//generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t +//generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t +//generate *const p8:poly8x16x2_t:poly8x16x2_t, *const p16:poly16x8x2_t:poly16x8x2_t + +/// Load multiple 2-element structures to two registers +name = vld2 +out-lane-nox +multi_fn = static_assert_imm-in_exp_len-LANE +constn = LANE +a = 0., 1., 2., 3., 4., 5., 6., 7., 8. +b = 0., 2., 2., 14., 2., 16., 17., 18. +n = 0 +validate 1., 2., 2., 14., 2., 16., 17., 18. +load_fn +arm-aarch64-separate + +aarch64 = ld2lane +const-aarch64 = LANE +link-aarch64 = ld2lane._EXTpi82_ +//generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t + +arm = vld2lane +const-arm = LANE +link-arm = vld2lane._EXTpi82_ +//generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t + /// Store multiple single-element structures from one, two, three, or four registers name = vst1 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 @@ -2121,17 +2286,17 @@ arm-aarch64-separate aarch64 = st1 link-aarch64 = st1x2._EXT3_ arm = vst1 -link-arm = vst1x2._EXT3_ +link-arm = vst1x2._EXTr3_ generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void link-aarch64 = st1x3._EXT3_ -link-arm = vst1x3._EXT3_ +link-arm = vst1x3._EXTr3_ generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void link-aarch64 = st1x4._EXT3_ -link-arm = vst1x4._EXT3_ +link-arm = vst1x4._EXTr3_ generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void @@ -2174,15 +2339,15 @@ generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void arm = vst1 link-aarch64 = st1x2._EXT3_ -link-arm = vst1x2._EXT3_ +link-arm = vst1x2._EXTr3_ generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void link-aarch64 = st1x3._EXT3_ -link-arm = vst1x3._EXT3_ +link-arm = vst1x3._EXTr3_ generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void link-aarch64 = st1x4._EXT3_ -link-arm = vst1x4._EXT3_ +link-arm = vst1x4._EXTr3_ generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Multiply @@ -2429,8 +2594,8 @@ aarch64 = pmull generate poly64x2_t:poly64x2_t:p128 /// Vector long multiply with scalar -name = vmull -n-suffix +name = vmull_n +no-q multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} a = 1, 2, 3, 4, 5, 6, 7, 8 b = 2 @@ -3568,7 +3733,7 @@ generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t /// Vector saturating doubling multiply high with scalar name = vqdmulhq_n -out-suffix +no-q multi_fn = vdupq_n-in_ntt-noext, b:out_t, b multi_fn = vqdmulh-out-noext, a, b a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 82149064d2..974d3f64cc 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -97,35 +97,14 @@ fn type_bits(t: &str) -> usize { } fn type_exp_len(t: &str) -> usize { - match t { - "int8x8_t" => 3, - "int8x16_t" => 4, - "int16x4_t" => 2, - "int16x8_t" => 3, - "int32x2_t" => 1, - "int32x4_t" => 2, - "int64x1_t" => 0, - "int64x2_t" => 1, - "uint8x8_t" => 3, - "uint8x16_t" => 4, - "uint16x4_t" => 2, - "uint16x8_t" => 3, - "uint32x2_t" => 1, - "uint32x4_t" => 2, - "uint64x1_t" => 0, - "uint64x2_t" => 1, - "float16x4_t" => 2, - "float16x8_t" => 3, - "float32x2_t" => 1, - "float32x4_t" => 2, - "float64x1_t" => 0, - "float64x2_t" => 1, - "poly8x8_t" => 3, - "poly8x16_t" => 4, - "poly16x4_t" => 2, - "poly16x8_t" => 3, - "poly64x1_t" => 0, - "poly64x2_t" => 1, + let t = type_to_sub_type(t); + let len = type_len(&t); + match len { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, _ => panic!("unknown type: {}", t), } } @@ -267,6 +246,18 @@ fn type_to_suffix(t: &str) -> &str { } } +fn type_to_dup_suffix(t: &str) -> String { + let s: Vec<_> = type_to_suffix(t).split('_').collect(); + assert_eq!(s.len(), 2); + format!("{}_dup_{}", s[0], s[1]) +} + +fn type_to_lane_suffix(t: &str) -> String { + let s: Vec<_> = type_to_suffix(t).split('_').collect(); + assert_eq!(s.len(), 2); + format!("{}_lane_{}", s[0], s[1]) +} + fn type_to_n_suffix(t: &str) -> &str { match t { "int8x8_t" => "_n_s8", @@ -435,6 +426,9 @@ enum Suffix { NoQNSuffix, OutSuffix, OutNSuffix, + OutNox, + OutDupNox, + OutLaneNox, Lane, In2, In2Lane, @@ -565,28 +559,6 @@ fn native_type_to_long_type(t: &str) -> &str { } } -fn type_to_ext(t: &str) -> String { - if !t.contains('x') { - return t.replace("u", "i"); - } - let native = type_to_native_type(t); - let sub_ext = match type_sub_len(t) { - 1 => String::new(), - _ => format!(".p0{}", native), - }; - let sub_type = match &native[0..1] { - "i" | "f" => native, - "u" => native.replace("u", "i"), - _ => panic!("unknown type: {}", t), - }; - format!( - "v{}{}{}", - &type_len(&type_to_sub_type(t)).to_string(), - sub_type, - sub_ext - ) -} - fn type_to_half(t: &str) -> &str { match t { "int8x16_t" => "int8x8_t", @@ -892,6 +864,53 @@ fn map_val<'v>(t: &str, v: &'v str) -> &'v str { } } +fn type_to_ext(t: &str, v: bool, r: bool, pi8: bool) -> String { + if !t.contains('x') { + return t.replace("u", "i"); + } + let native = type_to_native_type(t); + let sub_ext = match type_sub_len(t) { + 1 => String::new(), + _ if v => format!( + ".p0v{}{}", + &type_len(&type_to_sub_type(t)).to_string(), + native + ), + _ if pi8 => format!(".p0i8"), + _ => format!(".p0{}", native), + }; + let sub_type = match &native[0..1] { + "i" | "f" => native, + "u" => native.replace("u", "i"), + _ => panic!("unknown type: {}", t), + }; + let ext = format!( + "v{}{}{}", + &type_len(&type_to_sub_type(t)).to_string(), + sub_type, + sub_ext + ); + if r { + let ss: Vec<_> = ext.split('.').collect(); + if ss.len() != 2 { + ext + } else { + format!("{}.{}", ss[1], ss[0]) + } + } else { + ext + } +} + +fn ext(s: &str, in_t: &[&str; 3], out_t: &str) -> String { + s.replace("_EXT_", &type_to_ext(in_t[0], false, false, false)) + .replace("_EXT2_", &type_to_ext(out_t, false, false, false)) + .replace("_EXT3_", &type_to_ext(in_t[1], false, false, false)) + .replace("_EXTr3_", &type_to_ext(in_t[1], false, true, false)) + .replace("_EXTv2_", &type_to_ext(out_t, true, false, false)) + .replace("_EXTpi82_", &type_to_ext(out_t, false, false, true)) +} + #[allow(clippy::too_many_arguments)] fn gen_aarch64( current_comment: &str, @@ -939,6 +958,21 @@ fn gen_aarch64( NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)), + OutNox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(out_t)) + ), + OutDupNox => format!( + "{}{}", + current_name, + type_to_dup_suffix(&type_to_sub_type(out_t)) + ), + OutLaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(out_t)) + ), Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])), In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])), @@ -966,15 +1000,13 @@ fn gen_aarch64( String::new() }; let current_aarch64 = current_aarch64.clone().unwrap(); - let mut ext_c = String::new(); - let mut ext_c_const = String::new(); - let mut ext_c_store = String::new(); let mut link_t: Vec = vec![ in_t[0].to_string(), in_t[1].to_string(), in_t[2].to_string(), out_t.to_string(), ]; + let mut ext_c = String::new(); if let Some(mut link_aarch64) = link_aarch64.clone() { if link_aarch64.contains(":") { let links: Vec<_> = link_aarch64.split(':').map(|v| v.to_string()).collect(); @@ -987,101 +1019,81 @@ fn gen_aarch64( links[4].clone(), ]; } - let ext = type_to_ext(in_t[0]); - let ext2 = type_to_ext(out_t); - let ext3 = type_to_ext(in_t[1]); let link_aarch64 = if link_aarch64.starts_with("llvm") { - link_aarch64 - .replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3) + ext(&link_aarch64, in_t, out_t) } else { let mut link = String::from("llvm.aarch64.neon."); link.push_str(&link_aarch64); - link.replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3) + ext(&link, in_t, out_t) + }; + let (ext_inputs, ext_output) = { + if const_aarch64.is_some() { + if matches!(fn_type, Fntype::Load) { + let sub = type_to_sub_type(in_t[1]); + ( + match type_sub_len(in_t[1]) { + 1 => format!("a: {}, n: i64, ptr: *const i8", sub), + 2 => format!("a: {}, b: {}, n: i64, ptr: *const i8", sub, sub), + 3 => format!( + "a: {}, b: {}, c: {}, n: i64, ptr: *const i8", + sub, sub, sub + ), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}, n: i64, ptr: *const i8", + sub, sub, sub, sub + ), + _ => panic!("unsupported type: {}", in_t[1]), + }, + format!(" -> {}", out_t), + ) + } else { + ( + match para_num { + 1 => format!("a: {}, n: i32", in_t[0]), + 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if matches!(fn_type, Fntype::Store) { + let sub = type_to_sub_type(in_t[1]); + let native = type_to_native_type(in_t[1]); + ( + match type_sub_len(in_t[1]) { + 1 => format!("a: {}, ptr: *mut {}", sub, native), + 2 => format!("a: {}, b: {}, ptr: *mut {}", sub, sub, native), + 3 => format!("a: {}, b: {}, c: {}, ptr: *mut {}", sub, sub, sub, native), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}, ptr: *mut {}", + sub, sub, sub, sub, native + ), + _ => panic!("unsupported type: {}", in_t[1]), + }, + String::new(), + ) + } else { + ( + match para_num { + 1 => format!("a: {}", link_t[0]), + 2 => format!("a: {}, b: {}", link_t[0], link_t[1]), + 3 => format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_t[3]), + ) + } }; ext_c = format!( r#"#[allow(improper_ctypes)] extern "unadjusted" {{ #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}) -> {}; + fn {}({}){}; }} "#, - link_aarch64, - current_fn, - match para_num { - 1 => { - format!("a: {}", link_t[0]) - } - 2 => { - format!("a: {}, b: {}", link_t[0], link_t[1]) - } - 3 => { - format!("a: {}, b: {}, c: {}", link_t[0], link_t[1], link_t[2]) - } - _ => unimplemented!("unknown para_num"), - }, - link_t[3] + link_aarch64, current_fn, ext_inputs, ext_output, ); - if const_aarch64.is_some() { - ext_c_const = format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}) -> {}; - }} - "#, - link_aarch64, - current_fn, - match para_num { - 1 => { - format!("a: {}, n: i32", in_t[0]) - } - 2 => { - format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]) - } - 3 => { - format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]) - } - _ => unimplemented!("unknown para_num"), - }, - out_t - ); - } - if matches!(fn_type, Fntype::Store) { - let sub = type_to_sub_type(in_t[1]); - let native = type_to_native_type(in_t[1]); - ext_c_store = format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}); - }} - "#, - link_aarch64, - current_fn, - match type_sub_len(in_t[1]) { - 1 => { - format!("a: {}, ptr: *mut {}", sub, native,) - } - 2 => { - format!("a: {}, b: {}, ptr: *mut {}", sub, sub, native,) - } - 3 => { - format!("a: {}, b: {}, c: {}, ptr: *mut {}", sub, sub, sub, native,) - } - 4 => { - format!( - "a: {}, b: {}, c: {}, d: {}, ptr: *mut {}", - sub, sub, sub, sub, native, - ) - } - _ => panic!("unsupported type: {}", in_t[1]), - }, - ); - } }; let const_declare = if let Some(constn) = constn { if constn.contains(":") { @@ -1168,124 +1180,76 @@ fn gen_aarch64( name, const_declare, fn_inputs, fn_output ) }; - let call = if let Some(const_aarch64) = const_aarch64 { - match para_num { - 1 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ - {} - {}{}(a, {}) -}}"#, - name, - const_declare, - in_t[0], - out_t, - multi_calls, - ext_c_const, - current_fn, - const_aarch64 - ), - 2 => format!( - r#"pub unsafe fn {}{}(a: {}) -> {} {{ - {}{}{}(a, b, {}) -}}"#, - name, - const_declare, - in_t[0], - out_t, - multi_calls, - ext_c_const, - current_fn, - const_aarch64 - ), - _ => String::new(), - } - } else if matches!(fn_type, Fntype::Store) { - match type_sub_len(in_t[1]) { - 1 => format!( - r#"{}{{ - {}{}(b, a) -}}"#, - fn_decl, ext_c_store, current_fn, - ), - 2 => format!( - r#"{}{{ - {}{}(b.0, b.1, a) -}}"#, - fn_decl, ext_c_store, current_fn, - ), - 3 => format!( - r#"{}{{ - {}{}(b.0, b.1, b.2, a) -}}"#, - fn_decl, ext_c_store, current_fn, - ), - 4 => format!( - r#"{}{{ - {}{}(b.0, b.1, b.2, b.3, a) -}}"#, - fn_decl, ext_c_store, current_fn, - ), - _ => panic!("unsupported type: {}", in_t[1]), - } - } else { - let trans: [&str; 2] = if link_t[3] != out_t { - ["transmute(", ")"] - } else { - ["", ""] - }; - match (multi_calls.len(), para_num, fixed.len()) { - (0, 1, 0) => format!( - r#"{}{{ - {}{}{}(a){} -}}"#, - fn_decl, ext_c, trans[0], current_fn, trans[1] - ), - (0, 1, _) => { - let fixed: Vec = fixed.iter().take(type_len(in_t[0])).cloned().collect(); + let call_params = { + if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) { + if matches!(fn_type, Fntype::Load) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => panic!("unsupported type: {}", in_t[1]), + }; format!( - r#"{}{{ - let b{}; - {}{}{}(a, transmute(b)){} -}}"#, - fn_decl, - values(in_t[0], &fixed), + r#"{} + {}{}({}, {} as i64, a as *const i8)"#, + multi_calls, ext_c, - trans[0], current_fn, - trans[1], + subs, + constn.as_deref().unwrap() ) + } else { + match para_num { + 1 => format!( + r#"{} + {}{}(a, {})"#, + multi_calls, ext_c, current_fn, const_aarch64 + ), + 2 => format!( + r#"{} + {}{}(a, b, {})"#, + multi_calls, ext_c, current_fn, const_aarch64 + ), + _ => String::new(), + } + } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!(r#"{}{}(b, a)"#, ext_c, current_fn), + 2 => format!(r#"{}{}(b.0, b.1, a)"#, ext_c, current_fn), + 3 => format!(r#"{}{}(b.0, b.1, b.2, a)"#, ext_c, current_fn), + 4 => format!(r#"{}{}(b.0, b.1, b.2, b.3, a)"#, ext_c, current_fn), + _ => panic!("unsupported type: {}", in_t[1]), + } + } else { + let trans: [&str; 2] = if link_t[3] != out_t { + ["transmute(", ")"] + } else { + ["", ""] + }; + match (multi_calls.len(), para_num, fixed.len()) { + (0, 1, 0) => format!(r#"{}{}{}(a){}"#, ext_c, trans[0], current_fn, trans[1]), + (0, 1, _) => { + let fixed: Vec = + fixed.iter().take(type_len(in_t[0])).cloned().collect(); + format!( + r#"let b{}; + {}{}{}(a, transmute(b)){}"#, + values(in_t[0], &fixed), + ext_c, + trans[0], + current_fn, + trans[1], + ) + } + (0, 2, _) => format!(r#"{}{}{}(a, b){}"#, ext_c, trans[0], current_fn, trans[1],), + (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,), + (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, _, _) => String::new(), } - (0, 2, _) => format!( - r#"{}{{ - {}{}{}(a, b){} -}}"#, - fn_decl, ext_c, trans[0], current_fn, trans[1], - ), - (0, 3, _) => format!( - r#"{}{{ - {}{}(a, b, c) -}}"#, - fn_decl, ext_c, current_fn, - ), - (_, 1, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, 2, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, 3, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, _, _) => String::new(), } }; let function = format!( @@ -1294,9 +1258,17 @@ fn gen_aarch64( #[inline] #[target_feature(enable = "{}")] #[cfg_attr(test, assert_instr({}{}))]{} -{} +{}{{ + {} +}} "#, - current_comment, current_target, current_aarch64, const_assert, const_legacy, call + current_comment, + current_target, + current_aarch64, + const_assert, + const_legacy, + fn_decl, + call_params ); let test = match fn_type { Fntype::Normal => gen_test( @@ -1316,7 +1288,7 @@ fn gen_aarch64( fn gen_load_test( name: &str, - _in_t: &[&str; 3], + in_t: &[&str; 3], out_t: &str, current_tests: &[( Vec, @@ -1333,9 +1305,11 @@ fn gen_load_test( unsafe fn test_{}() {{"#, name, ); - for (a, _, _, _, e) in current_tests { + for (a, b, _, n, e) in current_tests { let a: Vec = a.iter().take(type_len + 1).cloned().collect(); let e: Vec = e.iter().take(type_len).cloned().collect(); + let has_b = b.len() > 0; + let has_n = n.is_some(); let mut input = String::from("["); for i in 0..type_len + 1 { if i != 0 { @@ -1344,43 +1318,65 @@ fn gen_load_test( input.push_str(&a[i]) } input.push_str("]"); - let mut output = String::from("["); - for i in 0..type_sub_len(out_t) { - if i != 0 { - output.push_str(", "); - } - let sub_len = type_len / type_sub_len(out_t); - if type_to_global_type(out_t) != "f64" { - let mut sub_output = format!("{}::new(", type_to_global_type(out_t)); - for j in 0..sub_len { - if j != 0 { - sub_output.push_str(", "); + let output = |v: &Vec| { + let mut output = String::from("["); + for i in 0..type_sub_len(out_t) { + if i != 0 { + output.push_str(", "); + } + let sub_len = type_len / type_sub_len(out_t); + if type_to_global_type(out_t) != "f64" { + let mut sub_output = format!("{}::new(", type_to_global_type(out_t)); + for j in 0..sub_len { + if j != 0 { + sub_output.push_str(", "); + } + sub_output.push_str(&v[i * sub_len + j]); } - sub_output.push_str(&e[i * sub_len + j]); + sub_output.push_str(")"); + output.push_str(&sub_output); + } else { + output.push_str(&v[i]); } - sub_output.push_str(")"); - output.push_str(&sub_output); - } else { - output.push_str(&e[i]); } - } - output.push_str("]"); + output.push_str("]"); + output + }; + let input_b = if has_b { + let b: Vec = b.iter().take(type_len).cloned().collect(); + format!( + r#" + let b: [{}; {}] = {};"#, + type_to_global_type(in_t[1]), + type_sub_len(in_t[1]), + output(&b), + ) + } else { + String::new() + }; let t = format!( r#" - let a: [{}; {}] = {}; + let a: [{}; {}] = {};{} let e: [{}; {}] = {}; - let r: [{}; {}] = transmute({}(a[1..].as_ptr())); + let r: [{}; {}] = transmute({}{}(a[1..].as_ptr(){})); assert_eq!(r, e); "#, type_to_native_type(out_t), type_len + 1, input, + input_b, type_to_global_type(out_t), type_sub_len(out_t), - output, + output(&e), type_to_global_type(out_t), type_sub_len(out_t), name, + if has_n { + format!("::<{}>", n.as_deref().unwrap()) + } else { + String::new() + }, + if has_b { ", transmute(b)" } else { "" }, ); test.push_str(&t); } @@ -1609,6 +1605,21 @@ fn gen_arm( NoQNSuffix => format!("{}{}", current_name, type_to_noq_n_suffix(in_t[1])), OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)), OutNSuffix => format!("{}{}", current_name, type_to_n_suffix(out_t)), + OutNox => format!( + "{}{}", + current_name, + type_to_suffix(&type_to_sub_type(out_t)) + ), + OutDupNox => format!( + "{}{}", + current_name, + type_to_dup_suffix(&type_to_sub_type(out_t)) + ), + OutLaneNox => format!( + "{}{}", + current_name, + type_to_lane_suffix(&type_to_sub_type(out_t)) + ), Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])), In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])), In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])), @@ -1626,9 +1637,8 @@ fn gen_arm( Default => "v7", ArmV7 => "v7", FPArmV8 => "fp-armv8,v8", - AES => "crypto,v8", // TODO: Replace with AES when the minimum LLVM version has b8baa2a9132498ea286dbb0d03f005760ecc6fdb + AES => "aes,v8", }; - let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() || link_arm.is_some() { panic!( @@ -1649,7 +1659,7 @@ fn gen_arm( String::new() }; let mut ext_c = String::new(); - let mut ext_c_arm = if multi_fn.is_empty() { + let mut ext_c_arm = if multi_fn.is_empty() || link_arm.is_none() { String::new() } else { String::from( @@ -1657,7 +1667,7 @@ fn gen_arm( "#, ) }; - let mut ext_c_aarch64 = if multi_fn.is_empty() { + let mut ext_c_aarch64 = if multi_fn.is_empty() || link_aarch64.is_none() { String::new() } else { String::from( @@ -1700,39 +1710,19 @@ fn gen_arm( links[4].clone(), ]; } - let ext = type_to_ext(in_t[0]); - let ext2 = type_to_ext(out_t); - let ext3 = type_to_ext(in_t[1]); - let ext3_arm = if matches!(fn_type, Fntype::Store) { - let s: Vec<_> = ext3.split('.').collect(); - assert_eq!(s.len(), 2); - format!("{}.{}", s[1], s[0]) - } else { - ext3.clone() - }; let link_arm = if link_arm.starts_with("llvm") { - link_arm - .replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3_arm) + ext(&link_arm, in_t, out_t) } else { let mut link = String::from("llvm.arm.neon."); link.push_str(&link_arm); - link.replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3_arm) + ext(&link, in_t, out_t) }; let link_aarch64 = if link_aarch64.starts_with("llvm") { - link_aarch64 - .replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3) + ext(&link_aarch64, in_t, out_t) } else { let mut link = String::from("llvm.aarch64.neon."); link.push_str(&link_aarch64); - link.replace("_EXT_", &ext) - .replace("_EXT2_", &ext2) - .replace("_EXT3_", &ext3) + ext(&link, in_t, out_t) }; if out_t == link_arm_t[3] && out_t == link_aarch64_t[3] { ext_c = format!( @@ -1747,98 +1737,72 @@ fn gen_arm( link_aarch64, current_fn, match para_num { - 1 => { - format!("a: {}", in_t[0]) - } - 2 => { - format!("a: {}, b: {}", in_t[0], in_t[1]) - } - 3 => { - format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]) - } + 1 => format!("a: {}", in_t[0]), + 2 => format!("a: {}, b: {}", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}", in_t[0], in_t[1], in_t[2]), _ => unimplemented!("unknown para_num"), }, out_t ); }; - if let Some(const_arm) = const_arm { - let (_, const_type) = if const_arm.contains(":") { - let consts: Vec<_> = const_arm.split(':').map(|v| v.trim().to_string()).collect(); - (consts[0].clone(), consts[1].clone()) - } else { - ( - const_arm.to_string(), - in_t[para_num as usize - 1].to_string(), - ) - }; - ext_c_arm.push_str(&format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "arm", link_name = "{}")] - fn {}({}) -> {}; - }} -"#, - link_arm, - current_fn, - match para_num { - 1 => { - format!("a: {}, n: {}", in_t[0], const_type) - } - 2 => { - format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], const_type) - } - 3 => { - format!( - "a: {}, b: {}, c: {}, n: {}", - in_t[0], in_t[1], in_t[2], const_type + let (arm_ext_inputs, arm_ext_output) = { + if let Some(const_arm) = const_arm { + if matches!(fn_type, Fntype::Load) { + let sub_type = type_to_sub_type(in_t[1]); + let inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + ( + format!("ptr: *const i8, {}, n: i32, size: i32", inputs), + String::new(), + ) + } else { + let (_, const_type) = if const_arm.contains(":") { + let consts: Vec<_> = + const_arm.split(':').map(|v| v.trim().to_string()).collect(); + (consts[0].clone(), consts[1].clone()) + } else { + ( + const_arm.to_string(), + in_t[para_num as usize - 1].to_string(), ) - } - _ => unimplemented!("unknown para_num"), - }, - out_t - )); - }; - if out_t != link_arm_t[3] { - ext_c_arm.push_str(&format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "arm", link_name = "{}")] - fn {}({}) -> {}; - }} -"#, - link_arm, - current_fn, - match para_num { - 1 => { - format!("a: {}", link_arm_t[0]) - } - 2 => { - format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1]) - } - 3 => { - format!( + }; + ( + match para_num { + 1 => format!("a: {}, n: {}", in_t[0], const_type), + 2 => format!("a: {}, b: {}, n: {}", in_t[0], in_t[1], const_type), + 3 => format!( + "a: {}, b: {}, c: {}, n: {}", + in_t[0], in_t[1], in_t[2], const_type + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if out_t != link_arm_t[3] { + ( + match para_num { + 1 => format!("a: {}", link_arm_t[0]), + 2 => format!("a: {}, b: {}", link_arm_t[0], link_arm_t[1]), + 3 => format!( "a: {}, b: {}, c: {}", link_arm_t[0], link_arm_t[1], link_arm_t[2] - ) - } - _ => unimplemented!("unknown para_num"), - }, - link_arm_t[3] - )); - } - if matches!(fn_type, Fntype::Store) { - let sub_type = type_to_sub_type(in_t[1]); - ext_c_arm.push_str(&format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "arm", link_name = "{}")] - fn {}(ptr: *mut {}, {}); - }} -"#, - link_arm, - current_fn, - type_to_native_type(in_t[0]), - match type_sub_len(in_t[1]) { + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_arm_t[3]), + ) + } else if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + let inputs = match type_sub_len(in_t[1]) { 1 => format!("a: {}", sub_type), 2 => format!("a: {}, b: {}", sub_type, sub_type,), 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), @@ -1847,75 +1811,68 @@ fn gen_arm( sub_type, sub_type, sub_type, sub_type, ), _ => panic!("unknown type: {}", in_t[1]), - }, - )); - } - if const_aarch64.is_some() { - ext_c_aarch64.push_str(&format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}) -> {}; - }} -"#, - link_aarch64, - current_fn, - match para_num { - 1 => { - format!("a: {}, n: i32", in_t[0]) - } - 2 => { - format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]) - } - 3 => { - format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]) - } - _ => unimplemented!("unknown para_num"), - }, - out_t - )); - } - if out_t != link_aarch64_t[3] { - ext_c_aarch64.push_str(&format!( - r#"#[allow(improper_ctypes)] + }; + ( + format!("ptr: *mut {}, {}", type_to_native_type(in_t[1]), inputs), + String::new(), + ) + } else { + (String::new(), String::new()) + } + }; + ext_c_arm.push_str(&format!( + r#"#[allow(improper_ctypes)] extern "unadjusted" {{ - #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}) -> {}; + #[cfg_attr(target_arch = "arm", link_name = "{}")] + fn {}({}){}; }} "#, - link_aarch64, - current_fn, - match para_num { - 1 => { - format!("a: {}", link_aarch64_t[0]) - } - 2 => { - format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1]) - } - 3 => { - format!( + link_arm, current_fn, arm_ext_inputs, arm_ext_output, + )); + let (aarch64_ext_inputs, aarch64_ext_output) = { + if const_aarch64.is_some() { + if matches!(fn_type, Fntype::Load) { + let sub_type = type_to_sub_type(in_t[1]); + let mut inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type,), + 2 => format!("a: {}, b: {}", sub_type, sub_type,), + 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), + 4 => format!( + "a: {}, b: {}, c: {}, d: {}", + sub_type, sub_type, sub_type, sub_type, + ), + _ => panic!("unknown type: {}", in_t[1]), + }; + inputs.push_str(&format!(", n: i64, ptr: *const i8")); + (inputs, format!(" -> {}", out_t)) + } else { + ( + match para_num { + 1 => format!("a: {}, n: i32", in_t[0]), + 2 => format!("a: {}, b: {}, n: i32", in_t[0], in_t[1]), + 3 => format!("a: {}, b: {}, c: {}, n: i32", in_t[0], in_t[1], in_t[2]), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", out_t), + ) + } + } else if out_t != link_aarch64_t[3] { + ( + match para_num { + 1 => format!("a: {}", link_aarch64_t[0]), + 2 => format!("a: {}, b: {}", link_aarch64_t[0], link_aarch64_t[1]), + 3 => format!( "a: {}, b: {}, c: {}", link_aarch64_t[0], link_aarch64_t[1], link_aarch64_t[2] - ) - } - _ => unimplemented!("unknown para_num"), - }, - link_aarch64_t[3] - )); - } - if matches!(fn_type, Fntype::Store) { - let sub_type = type_to_sub_type(in_t[1]); - ext_c_aarch64.push_str(&format!( - r#"#[allow(improper_ctypes)] - extern "unadjusted" {{ - #[cfg_attr(target_arch = "aarch64", link_name = "{}")] - fn {}({}, ptr: *mut {}); - }} -"#, - link_aarch64, - current_fn, - match type_sub_len(in_t[1]) { - 1 => format!("a: {}", sub_type), + ), + _ => unimplemented!("unknown para_num"), + }, + format!(" -> {}", link_aarch64_t[3]), + ) + } else if matches!(fn_type, Fntype::Store) { + let sub_type = type_to_sub_type(in_t[1]); + let mut inputs = match type_sub_len(in_t[1]) { + 1 => format!("a: {}", sub_type,), 2 => format!("a: {}, b: {}", sub_type, sub_type,), 3 => format!("a: {}, b: {}, c: {}", sub_type, sub_type, sub_type,), 4 => format!( @@ -1923,10 +1880,22 @@ fn gen_arm( sub_type, sub_type, sub_type, sub_type, ), _ => panic!("unknown type: {}", in_t[1]), - }, - type_to_native_type(in_t[0]), - )); - } + }; + inputs.push_str(&format!(", ptr: *mut {}", type_to_native_type(in_t[0]))); + (inputs, String::new()) + } else { + (String::new(), String::new()) + } + }; + ext_c_aarch64.push_str(&format!( + r#"#[allow(improper_ctypes)] + extern "unadjusted" {{ + #[cfg_attr(target_arch = "aarch64", link_name = "{}")] + fn {}({}){}; + }} +"#, + link_aarch64, current_fn, aarch64_ext_inputs, aarch64_ext_output, + )); }; let const_declare = if let Some(constn) = constn { format!(r#""#, constn) @@ -1984,202 +1953,120 @@ fn gen_arm( name, const_declare, fn_inputs, fn_output ) }; - let call = match (multi_calls.len(), para_num, fixed.len()) { - (0, 1, 0) => format!( - r#"{}{{ - {}{}(a) -}}"#, - fn_decl, ext_c, current_fn, - ), - (0, 1, _) => { - let fixed: Vec = fixed.iter().take(type_len(in_t[0])).cloned().collect(); + let function = if separate { + let call_arm = { + let arm_params = if let (Some(const_arm), Some(_)) = (const_arm, link_arm) { + if matches!(fn_type, Fntype::Load) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => "", + }; + format!( + "{}(a as *const i8, {}, {}, {})", + current_fn, + subs, + constn.as_deref().unwrap(), + type_bits(&type_to_sub_type(in_t[1])) / 8, + ) + } else { + let cnt = if const_arm.contains(':') { + let consts: Vec<_> = + const_arm.split(':').map(|v| v.trim().to_string()).collect(); + consts[0].clone() + } else { + let const_arm = const_arm.replace("ttn", &type_to_native_type(in_t[1])); + let mut cnt = String::from(in_t[1]); + cnt.push_str("("); + for i in 0..type_len(in_t[1]) { + if i != 0 { + cnt.push_str(", "); + } + cnt.push_str(&const_arm); + } + cnt.push_str(")"); + cnt + }; + match para_num { + 1 => format!("{}(a, {})", current_fn, cnt), + 2 => format!("{}(a, b, {})", current_fn, cnt), + _ => String::new(), + } + } + } else if out_t != link_arm_t[3] { + match para_num { + 1 => format!("transmute({}(a))", current_fn,), + 2 => format!("transmute({}(transmute(a), transmute(b)))", current_fn,), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!("{}(a, b)", current_fn), + 2 => format!("{}(a, b.0, b.1)", current_fn), + 3 => format!("{}(a, b.0, b.1, b.2)", current_fn), + 4 => format!("{}(a, b.0, b.1, b.2, b.3)", current_fn), + _ => String::new(), + } + } else { + String::new() + }; format!( r#"{}{{ - let b{}; - {}{}(a, transmute(b)) + {}{}{} }}"#, - fn_decl, - values(in_t[0], &fixed), - ext_c, - current_fn, + fn_decl, multi_calls, ext_c_arm, arm_params ) - } - (0, 2, _) => format!( - r#"{}{{ - {}{}(a, b) -}}"#, - fn_decl, ext_c, current_fn, - ), - (0, 3, _) => format!( - r#"{}{{ - {}{}(a, b, c) -}}"#, - fn_decl, ext_c, current_fn, - ), - (_, 1, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, 2, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, 3, _) => format!( - r#"{}{{ - {}{} -}}"#, - fn_decl, ext_c, multi_calls, - ), - (_, _, _) => String::new(), - }; - - let call_arm = if let Some(const_arm) = const_arm { - let cnt = if const_arm.contains(':') { - let consts: Vec<_> = const_arm.split(':').map(|v| v.trim().to_string()).collect(); - consts[0].clone() - } else { - let const_arm = const_arm.replace("ttn", &type_to_native_type(in_t[1])); - let mut cnt = String::from(in_t[1]); - cnt.push_str("("); - for i in 0..type_len(in_t[1]) { - if i != 0 { - cnt.push_str(", "); - } - cnt.push_str(&const_arm); - } - cnt.push_str(")"); - cnt }; - match para_num { - 1 => format!( - r#"{}{{ - {}{}{}(a, {}) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, cnt - ), - 2 => format!( - r#"{}{{ - {}{}{}(a, b, {}) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, cnt - ), - _ => String::new(), - } - } else if out_t != link_arm_t[3] { - match para_num { - 1 => format!( - r#"{}{{ - {}{}transmute({}(a)) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - 2 => format!( - r#"{}{{ - {}{}transmute({}(transmute(a), transmute(b))) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - _ => String::new(), - } - } else if matches!(fn_type, Fntype::Store) { - match type_sub_len(in_t[1]) { - 1 => format!( - r#"{}{{ - {}{}{}(a, b) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - 2 => format!( - r#"{}{{ - {}{}{}(a, b.0, b.1) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - 3 => format!( - r#"{}{{ - {}{}{}(a, b.0, b.1, b.2) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - 4 => format!( - r#"{}{{ - {}{}{}(a, b.0, b.1, b.2, b.3) -}}"#, - fn_decl, multi_calls, ext_c_arm, current_fn, - ), - _ => String::new(), - } - } else { - String::new() - }; - let call_aarch64 = if let Some(const_aarch64) = const_aarch64 { - match para_num { - 1 => format!( - r#"{}{{ - {}{}{}(a, {}) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, const_aarch64 - ), - 2 => format!( - r#"{}{{ - {}{}{}(a, b, {}) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, const_aarch64 - ), - _ => String::new(), - } - } else if out_t != link_aarch64_t[3] { - match para_num { - 1 => format!( - r#"{}{{ - {}{}transmute({}(a)) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - 2 => format!( - r#"{}{{ - {}{}transmute({}(a, b)) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - _ => String::new(), - } - } else if matches!(fn_type, Fntype::Store) { - match type_sub_len(in_t[1]) { - 1 => format!( - r#"{}{{ - {}{}{}(b, a) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - 2 => format!( - r#"{}{{ - {}{}{}(b.0, b.1, a) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - 3 => format!( - r#"{}{{ - {}{}{}(b.0, b.1, b.2, a) -}}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - 4 => format!( + let call_aarch64 = { + let aarch64_params = + if let (Some(const_aarch64), Some(_)) = (const_aarch64, link_aarch64) { + if matches!(fn_type, Fntype::Load) { + let subs = match type_sub_len(in_t[1]) { + 1 => "b", + 2 => "b.0, b.1", + 3 => "b.0, b.1, b.2", + 4 => "b.0, b.1, b.2, b.3", + _ => "", + }; + format!( + "{}({}, {} as i64, a as *const i8)", + current_fn, + subs, + constn.as_deref().unwrap() + ) + } else { + match para_num { + 1 => format!("{}(a, {})", current_fn, const_aarch64), + 2 => format!("{}(a, b, {})", current_fn, const_aarch64), + _ => String::new(), + } + } + } else if out_t != link_aarch64_t[3] { + match para_num { + 1 => format!("transmute({}(a))", current_fn,), + 2 => format!("transmute({}(a, b))", current_fn,), + _ => String::new(), + } + } else if matches!(fn_type, Fntype::Store) { + match type_sub_len(in_t[1]) { + 1 => format!("{}(b, a)", current_fn), + 2 => format!("{}(b.0, b.1, a)", current_fn), + 3 => format!("{}(b.0, b.1, b.2, a)", current_fn), + 4 => format!("{}(b.0, b.1, b.2, b.3, a)", current_fn), + _ => String::new(), + } + } else { + String::new() + }; + format!( r#"{}{{ - {}{}{}(b.0, b.1, b.2, b.3, a) + {}{}{} }}"#, - fn_decl, multi_calls, ext_c_aarch64, current_fn, - ), - _ => String::new(), - } - } else { - String::new() - }; - let function = if separate { + fn_decl, multi_calls, ext_c_aarch64, aarch64_params + ) + }; format!( r#" {} @@ -2210,6 +2097,38 @@ fn gen_arm( call_aarch64, ) } else { + let call = { + let stmts = match (multi_calls.len(), para_num, fixed.len()) { + (0, 1, 0) => format!(r#"{}{}(a)"#, ext_c, current_fn,), + (0, 1, _) => { + let fixed: Vec = + fixed.iter().take(type_len(in_t[0])).cloned().collect(); + format!( + r#"let b{}; + {}{}(a, transmute(b))"#, + values(in_t[0], &fixed), + ext_c, + current_fn, + ) + } + (0, 2, _) => format!(r#"{}{}(a, b)"#, ext_c, current_fn,), + (0, 3, _) => format!(r#"{}{}(a, b, c)"#, ext_c, current_fn,), + (_, 1, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 2, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, 3, _) => format!(r#"{}{}"#, ext_c, multi_calls,), + (_, _, _) => String::new(), + }; + if stmts != String::new() { + format!( + r#"{}{{ + {} +}}"#, + fn_decl, stmts + ) + } else { + String::new() + } + }; format!( r#" {} @@ -2662,6 +2581,18 @@ fn get_call( fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(in_t[1])))); } else if fn_format[1] == "outsigned" { fn_name.push_str(type_to_suffix(&type_to_signed(&String::from(out_t)))); + } else if fn_format[1] == "outsignednox" { + fn_name.push_str(&type_to_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); + } else if fn_format[1] == "outsigneddupnox" { + fn_name.push_str(&type_to_dup_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); + } else if fn_format[1] == "outsignedlanenox" { + fn_name.push_str(&type_to_lane_suffix(&type_to_sub_type(&type_to_signed( + &String::from(out_t), + )))); } else if fn_format[1] == "unsigned" { fn_name.push_str(type_to_suffix(type_to_unsigned(in_t[1]))); } else if fn_format[1] == "doubleself" { @@ -2880,6 +2811,12 @@ mod test { suffix = NoQNSuffix; } else if line.starts_with("out-suffix") { suffix = OutSuffix; + } else if line.starts_with("out-nox") { + suffix = OutNox; + } else if line.starts_with("out-dup-nox") { + suffix = OutDupNox; + } else if line.starts_with("out-lane-nox") { + suffix = OutLaneNox; } else if line.starts_with("lane-suffixes") { suffix = Lane; } else if line.starts_with("in2-suffix") { diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index 5836949122..22108d26a1 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -15,7 +15,14 @@ pub fn x86_functions(input: TokenStream) -> TokenStream { #[proc_macro] pub fn arm_functions(input: TokenStream) -> TokenStream { - functions(input, &["core_arch/src/arm", "core_arch/src/aarch64"]) + functions( + input, + &[ + "core_arch/src/arm", + "core_arch/src/aarch64", + "core_arch/src/arm_shared/neon", + ], + ) } #[proc_macro] @@ -219,12 +226,12 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "int16x2_t" => quote! { &I16X2 }, "int16x4_t" => quote! { &I16X4 }, "int16x4x2_t" => quote! { &I16X4X2 }, - "int16x4x3_t" => quote! { &I16x4x3 }, - "int16x4x4_t" => quote! { &I16x4x4 }, + "int16x4x3_t" => quote! { &I16X4X3 }, + "int16x4x4_t" => quote! { &I16X4X4 }, "int16x8_t" => quote! { &I16X8 }, "int16x8x2_t" => quote! { &I16X8X2 }, - "int16x8x3_t" => quote! { &I16x8x3 }, - "int16x8x4_t" => quote! { &I16x8x4 }, + "int16x8x3_t" => quote! { &I16X8X3 }, + "int16x8x4_t" => quote! { &I16X8X4 }, "int32x2_t" => quote! { &I32X2 }, "int32x2x2_t" => quote! { &I32X2X2 }, "int32x2x3_t" => quote! { &I32X2X3 }, @@ -252,12 +259,12 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "uint8x16_t" => quote! { &U8X16 }, "uint16x4_t" => quote! { &U16X4 }, "uint16x4x2_t" => quote! { &U16X4X2 }, - "uint16x4x3_t" => quote! { &U16x4x3 }, - "uint16x4x4_t" => quote! { &U16x4x4 }, + "uint16x4x3_t" => quote! { &U16X4X3 }, + "uint16x4x4_t" => quote! { &U16X4X4 }, "uint16x8_t" => quote! { &U16X8 }, "uint16x8x2_t" => quote! { &U16X8X2 }, - "uint16x8x3_t" => quote! { &U16x8x3 }, - "uint16x8x4_t" => quote! { &U16x8x4 }, + "uint16x8x3_t" => quote! { &U16X8X3 }, + "uint16x8x4_t" => quote! { &U16X8X4 }, "uint32x2_t" => quote! { &U32X2 }, "uint32x2x2_t" => quote! { &U32X2X2 }, "uint32x2x3_t" => quote! { &U32X2X3 }, @@ -302,13 +309,19 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "poly64x2_t" => quote! { &POLY64X2 }, "poly8x16_t" => quote! { &POLY8X16 }, "poly16x4_t" => quote! { &POLY16X4 }, - "poly16x4x2_t" => quote! { &POLY16X4X2 }, - "poly16x4x3_t" => quote! { &POLY16X4X3 }, - "poly16x4x4_t" => quote! { &POLY16X4X4 }, + "poly16x4x2_t" => quote! { &P16X4X2 }, + "poly16x4x3_t" => quote! { &P16X4X3 }, + "poly16x4x4_t" => quote! { &P16X4X4 }, "poly16x8_t" => quote! { &POLY16X8 }, - "poly16x8x2_t" => quote! { &POLY16X8X2 }, - "poly16x8x3_t" => quote! { &POLY16X8X3 }, - "poly16x8x4_t" => quote! { &POLY16X8X4 }, + "poly16x8x2_t" => quote! { &P16X8X2 }, + "poly16x8x3_t" => quote! { &P16X8X3 }, + "poly16x8x4_t" => quote! { &P16X8X4 }, + "poly64x1x2_t" => quote! { &P64X1X2 }, + "poly64x1x3_t" => quote! { &P64X1X3 }, + "poly64x1x4_t" => quote! { &P64X1X4 }, + "poly64x2x2_t" => quote! { &P64X2X2 }, + "poly64x2x3_t" => quote! { &P64X2X3 }, + "poly64x2x4_t" => quote! { &P64X2X4 }, "p128" => quote! { &P128 }, "v16i8" => quote! { &v16i8 }, diff --git a/crates/stdarch-verify/tests/arm.rs b/crates/stdarch-verify/tests/arm.rs index da17eb313c..9047a08ef7 100644 --- a/crates/stdarch-verify/tests/arm.rs +++ b/crates/stdarch-verify/tests/arm.rs @@ -411,65 +411,6 @@ fn verify_all_signatures() { "__smusdx", "__usad8", "__usada8", - "vld1_s8", - "vld1q_s8", - "vld1q_s8", - "vld1_s16", - "vld1q_s16", - "vld1_s32", - "vld1q_s32", - "vld1_s64", - "vld1q_s64", - "vld1_u8", - "vld1q_u8", - "vld1_u16", - "vld1q_u16", - "vld1_u32", - "vld1q_u32", - "vld1_u64", - "vld1q_u64", - "vld1_p8", - "vld1q_p8", - "vld1_p16", - "vld1q_p16", - "vld1_f32", - "vld1q_f32", - "vld1_f64", - "vld1q_f64", - "vst1_s8", - "vst1q_s8", - "vst1_s16", - "vst1q_s16", - "vst1_s32", - "vst1q_s32", - "vst1_s64", - "vst1q_s64", - "vst1_u8", - "vst1q_u8", - "vst1_u16", - "vst1q_u16", - "vst1_u32", - "vst1q_u32", - "vst1_u64", - "vst1q_u64", - "vst1_p8", - "vst1q_p8", - "vst1_p16", - "vst1q_p16", - "vst1_f32", - "vst1q_f32", - "vpadal_s8", - "vpadal_s16", - "vpadal_s32", - "vpadalq_s8", - "vpadalq_s16", - "vpadalq_s32", - "vpadal_u8", - "vpadal_u16", - "vpadal_u32", - "vpadalq_u8", - "vpadalq_u16", - "vpadalq_u32", "__ldrex", "__strex", "__ldrexb", @@ -515,12 +456,36 @@ fn verify_all_signatures() { "vqrdmlahh_laneq_s16", "vqrdmlahs_lane_s32", "vqrdmlahs_laneq_s32", + "vqrdmlah_s16", + "vqrdmlah_s32", + "vqrdmlahq_s16", + "vqrdmlahq_s32", + "vqrdmlah_lane_s16", + "vqrdmlah_laneq_s16", + "vqrdmlahq_lane_s16", + "vqrdmlahq_laneq_s16", + "vqrdmlah_lane_s32", + "vqrdmlah_laneq_s32", + "vqrdmlahq_lane_s32", + "vqrdmlahq_laneq_s32", "vqrdmlshh_s16", "vqrdmlshs_s32", "vqrdmlshh_lane_s16", "vqrdmlshh_laneq_s16", "vqrdmlshs_lane_s32", "vqrdmlshs_laneq_s32", + "vqrdmlsh_s16", + "vqrdmlshq_s16", + "vqrdmlsh_s32", + "vqrdmlshq_s32", + "vqrdmlsh_lane_s16", + "vqrdmlsh_laneq_s16", + "vqrdmlshq_lane_s16", + "vqrdmlshq_laneq_s16", + "vqrdmlsh_lane_s32", + "vqrdmlsh_laneq_s32", + "vqrdmlshq_lane_s32", + "vqrdmlshq_laneq_s32", "__dbg", ]; let arm = match map.get(rust.name) {