Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ jobs:
- name: Run miri tests
env:
TARGET: "aarch64-unknown-linux-gnu"
RUSTFLAGS: "-Ctarget-cpu=neoverse-v3"
run: |
# read filters and join them with a space.
FILTERS=$(cat aarch64-miri-tests.txt | tr '\n' ' ')
Expand Down
4 changes: 2 additions & 2 deletions crates/core_arch/src/aarch64/neon/generated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12189,7 +12189,7 @@ pub unsafe fn vld3q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x3_t) ->
#[rustc_legacy_const_generics(2)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub unsafe fn vld3q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x3_t) -> int8x16x3_t {
static_assert_uimm_bits!(LANE, 3);
static_assert_uimm_bits!(LANE, 4);
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
Expand Down Expand Up @@ -12571,7 +12571,7 @@ pub unsafe fn vld4q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x4_t) -
#[rustc_legacy_const_generics(2)]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
pub unsafe fn vld4q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x4_t) -> int8x16x4_t {
static_assert_uimm_bits!(LANE, 3);
static_assert_uimm_bits!(LANE, 4);
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
Expand Down
205 changes: 127 additions & 78 deletions crates/core_arch/src/aarch64/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,7 @@ mod tests {
macro_rules! wide_store_load_roundtrip_fp16 {
($( $name:ident $args:tt);* $(;)?) => {
$(
#[cfg_attr(miri, ignore)]
#[simd_test(enable = "neon,fp16")]
#[cfg(not(target_arch = "arm64ec"))]
unsafe fn $name() {
Expand All @@ -1055,13 +1056,13 @@ mod tests {
test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);

test_vld2_f16_x2(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
test_vld2_f16_x3(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
test_vld2_f16_x4(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);
test_vld2_f16(f16, 8, float16x4x2_t, vst2_f16, vld2_f16);
test_vld3_f16(f16, 12, float16x4x3_t, vst3_f16, vld3_f16);
test_vld4_f16(f16, 16, float16x4x4_t, vst4_f16, vld4_f16);

test_vld2q_f16_x2(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
test_vld3q_f16_x3(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
test_vld4q_f16_x4(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
test_vld2q_f16(f16, 16, float16x8x2_t, vst2q_f16, vld2q_f16);
test_vld3q_f16(f16, 24, float16x8x3_t, vst3q_f16, vld3q_f16);
test_vld4q_f16(f16, 32, float16x8x4_t, vst4q_f16, vld4q_f16);
}

macro_rules! wide_store_load_roundtrip_aes {
Expand Down Expand Up @@ -1195,101 +1196,149 @@ mod tests {
}

wide_store_load_roundtrip_neon! {
test_vld2_f32_x2(f32, 4, float32x2x2_t, vst2_f32, vld2_f32);
test_vld2_f32_x3(f32, 6, float32x2x3_t, vst3_f32, vld3_f32);
test_vld2_f32_x4(f32, 8, float32x2x4_t, vst4_f32, vld4_f32);
test_vld2_f32(f32, 4, float32x2x2_t, vst2_f32, vld2_f32);
test_vld3_f32(f32, 6, float32x2x3_t, vst3_f32, vld3_f32);
test_vld4_f32(f32, 8, float32x2x4_t, vst4_f32, vld4_f32);

test_vld2q_f32_x2(f32, 8, float32x4x2_t, vst2q_f32, vld2q_f32);
test_vld3q_f32_x3(f32, 12, float32x4x3_t, vst3q_f32, vld3q_f32);
test_vld4q_f32_x4(f32, 16, float32x4x4_t, vst4q_f32, vld4q_f32);
test_vld2q_f32(f32, 8, float32x4x2_t, vst2q_f32, vld2q_f32);
test_vld3q_f32(f32, 12, float32x4x3_t, vst3q_f32, vld3q_f32);
test_vld4q_f32(f32, 16, float32x4x4_t, vst4q_f32, vld4q_f32);

test_vld2_f64_x2(f64, 2, float64x1x2_t, vst2_f64, vld2_f64);
test_vld2_f64_x3(f64, 3, float64x1x3_t, vst3_f64, vld3_f64);
test_vld2_f64_x4(f64, 4, float64x1x4_t, vst4_f64, vld4_f64);
test_vld2_f64(f64, 2, float64x1x2_t, vst2_f64, vld2_f64);
test_vld3_f64(f64, 3, float64x1x3_t, vst3_f64, vld3_f64);
test_vld4_f64(f64, 4, float64x1x4_t, vst4_f64, vld4_f64);

test_vld2q_f64_x2(f64, 4, float64x2x2_t, vst2q_f64, vld2q_f64);
test_vld3q_f64_x3(f64, 6, float64x2x3_t, vst3q_f64, vld3q_f64);
test_vld4q_f64_x4(f64, 8, float64x2x4_t, vst4q_f64, vld4q_f64);
test_vld2q_f64(f64, 4, float64x2x2_t, vst2q_f64, vld2q_f64);
test_vld3q_f64(f64, 6, float64x2x3_t, vst3q_f64, vld3q_f64);
test_vld4q_f64(f64, 8, float64x2x4_t, vst4q_f64, vld4q_f64);

test_vld2_s8_x2(i8, 16, int8x8x2_t, vst2_s8, vld2_s8);
test_vld2_s8_x3(i8, 24, int8x8x3_t, vst3_s8, vld3_s8);
test_vld2_s8_x4(i8, 32, int8x8x4_t, vst4_s8, vld4_s8);
test_vld2_s8(i8, 16, int8x8x2_t, vst2_s8, vld2_s8);
test_vld3_s8(i8, 24, int8x8x3_t, vst3_s8, vld3_s8);
test_vld4_s8(i8, 32, int8x8x4_t, vst4_s8, vld4_s8);

test_vld2q_s8_x2(i8, 32, int8x16x2_t, vst2q_s8, vld2q_s8);
test_vld3q_s8_x3(i8, 48, int8x16x3_t, vst3q_s8, vld3q_s8);
test_vld4q_s8_x4(i8, 64, int8x16x4_t, vst4q_s8, vld4q_s8);
test_vld2q_s8(i8, 32, int8x16x2_t, vst2q_s8, vld2q_s8);
test_vld3q_s8(i8, 48, int8x16x3_t, vst3q_s8, vld3q_s8);
test_vld4q_s8(i8, 64, int8x16x4_t, vst4q_s8, vld4q_s8);

test_vld2_s16_x2(i16, 8, int16x4x2_t, vst2_s16, vld2_s16);
test_vld2_s16_x3(i16, 12, int16x4x3_t, vst3_s16, vld3_s16);
test_vld2_s16_x4(i16, 16, int16x4x4_t, vst4_s16, vld4_s16);
test_vld2_s16(i16, 8, int16x4x2_t, vst2_s16, vld2_s16);
test_vld3_s16(i16, 12, int16x4x3_t, vst3_s16, vld3_s16);
test_vld4_s16(i16, 16, int16x4x4_t, vst4_s16, vld4_s16);

test_vld2q_s16_x2(i16, 16, int16x8x2_t, vst2q_s16, vld2q_s16);
test_vld3q_s16_x3(i16, 24, int16x8x3_t, vst3q_s16, vld3q_s16);
test_vld4q_s16_x4(i16, 32, int16x8x4_t, vst4q_s16, vld4q_s16);
test_vld2q_s16(i16, 16, int16x8x2_t, vst2q_s16, vld2q_s16);
test_vld3q_s16(i16, 24, int16x8x3_t, vst3q_s16, vld3q_s16);
test_vld4q_s16(i16, 32, int16x8x4_t, vst4q_s16, vld4q_s16);

test_vld2_s32_x2(i32, 4, int32x2x2_t, vst2_s32, vld2_s32);
test_vld2_s32_x3(i32, 6, int32x2x3_t, vst3_s32, vld3_s32);
test_vld2_s32_x4(i32, 8, int32x2x4_t, vst4_s32, vld4_s32);
test_vld2_s32(i32, 4, int32x2x2_t, vst2_s32, vld2_s32);
test_vld3_s32(i32, 6, int32x2x3_t, vst3_s32, vld3_s32);
test_vld4_s32(i32, 8, int32x2x4_t, vst4_s32, vld4_s32);

test_vld2q_s32_x2(i32, 8, int32x4x2_t, vst2q_s32, vld2q_s32);
test_vld3q_s32_x3(i32, 12, int32x4x3_t, vst3q_s32, vld3q_s32);
test_vld4q_s32_x4(i32, 16, int32x4x4_t, vst4q_s32, vld4q_s32);
test_vld2q_s32(i32, 8, int32x4x2_t, vst2q_s32, vld2q_s32);
test_vld3q_s32(i32, 12, int32x4x3_t, vst3q_s32, vld3q_s32);
test_vld4q_s32(i32, 16, int32x4x4_t, vst4q_s32, vld4q_s32);

test_vld2_s64_x2(i64, 2, int64x1x2_t, vst2_s64, vld2_s64);
test_vld2_s64_x3(i64, 3, int64x1x3_t, vst3_s64, vld3_s64);
test_vld2_s64_x4(i64, 4, int64x1x4_t, vst4_s64, vld4_s64);
test_vld2_s64(i64, 2, int64x1x2_t, vst2_s64, vld2_s64);
test_vld3_s64(i64, 3, int64x1x3_t, vst3_s64, vld3_s64);
test_vld4_s64(i64, 4, int64x1x4_t, vst4_s64, vld4_s64);

test_vld2q_s64_x2(i64, 4, int64x2x2_t, vst2q_s64, vld2q_s64);
test_vld3q_s64_x3(i64, 6, int64x2x3_t, vst3q_s64, vld3q_s64);
test_vld4q_s64_x4(i64, 8, int64x2x4_t, vst4q_s64, vld4q_s64);
test_vld2q_s64(i64, 4, int64x2x2_t, vst2q_s64, vld2q_s64);
test_vld3q_s64(i64, 6, int64x2x3_t, vst3q_s64, vld3q_s64);
test_vld4q_s64(i64, 8, int64x2x4_t, vst4q_s64, vld4q_s64);

test_vld2_u8_x2(u8, 16, uint8x8x2_t, vst2_u8, vld2_u8);
test_vld2_u8_x3(u8, 24, uint8x8x3_t, vst3_u8, vld3_u8);
test_vld2_u8_x4(u8, 32, uint8x8x4_t, vst4_u8, vld4_u8);
test_vld2_u8(u8, 16, uint8x8x2_t, vst2_u8, vld2_u8);
test_vld3_u8(u8, 24, uint8x8x3_t, vst3_u8, vld3_u8);
test_vld4_u8(u8, 32, uint8x8x4_t, vst4_u8, vld4_u8);

test_vld2q_u8_x2(u8, 32, uint8x16x2_t, vst2q_u8, vld2q_u8);
test_vld3q_u8_x3(u8, 48, uint8x16x3_t, vst3q_u8, vld3q_u8);
test_vld4q_u8_x4(u8, 64, uint8x16x4_t, vst4q_u8, vld4q_u8);
test_vld2q_u8(u8, 32, uint8x16x2_t, vst2q_u8, vld2q_u8);
test_vld3q_u8(u8, 48, uint8x16x3_t, vst3q_u8, vld3q_u8);
test_vld4q_u8(u8, 64, uint8x16x4_t, vst4q_u8, vld4q_u8);

test_vld2_u16_x2(u16, 8, uint16x4x2_t, vst2_u16, vld2_u16);
test_vld2_u16_x3(u16, 12, uint16x4x3_t, vst3_u16, vld3_u16);
test_vld2_u16_x4(u16, 16, uint16x4x4_t, vst4_u16, vld4_u16);
test_vld2_u16(u16, 8, uint16x4x2_t, vst2_u16, vld2_u16);
test_vld3_u16(u16, 12, uint16x4x3_t, vst3_u16, vld3_u16);
test_vld4_u16(u16, 16, uint16x4x4_t, vst4_u16, vld4_u16);

test_vld2q_u16_x2(u16, 16, uint16x8x2_t, vst2q_u16, vld2q_u16);
test_vld3q_u16_x3(u16, 24, uint16x8x3_t, vst3q_u16, vld3q_u16);
test_vld4q_u16_x4(u16, 32, uint16x8x4_t, vst4q_u16, vld4q_u16);
test_vld2q_u16(u16, 16, uint16x8x2_t, vst2q_u16, vld2q_u16);
test_vld3q_u16(u16, 24, uint16x8x3_t, vst3q_u16, vld3q_u16);
test_vld4q_u16(u16, 32, uint16x8x4_t, vst4q_u16, vld4q_u16);

test_vld2_u32_x2(u32, 4, uint32x2x2_t, vst2_u32, vld2_u32);
test_vld2_u32_x3(u32, 6, uint32x2x3_t, vst3_u32, vld3_u32);
test_vld2_u32_x4(u32, 8, uint32x2x4_t, vst4_u32, vld4_u32);
test_vld2_u32(u32, 4, uint32x2x2_t, vst2_u32, vld2_u32);
test_vld3_u32(u32, 6, uint32x2x3_t, vst3_u32, vld3_u32);
test_vld4_u32(u32, 8, uint32x2x4_t, vst4_u32, vld4_u32);

test_vld2q_u32_x2(u32, 8, uint32x4x2_t, vst2q_u32, vld2q_u32);
test_vld3q_u32_x3(u32, 12, uint32x4x3_t, vst3q_u32, vld3q_u32);
test_vld4q_u32_x4(u32, 16, uint32x4x4_t, vst4q_u32, vld4q_u32);
test_vld2q_u32(u32, 8, uint32x4x2_t, vst2q_u32, vld2q_u32);
test_vld3q_u32(u32, 12, uint32x4x3_t, vst3q_u32, vld3q_u32);
test_vld4q_u32(u32, 16, uint32x4x4_t, vst4q_u32, vld4q_u32);

test_vld2_u64_x2(u64, 2, uint64x1x2_t, vst2_u64, vld2_u64);
test_vld2_u64_x3(u64, 3, uint64x1x3_t, vst3_u64, vld3_u64);
test_vld2_u64_x4(u64, 4, uint64x1x4_t, vst4_u64, vld4_u64);
test_vld2_u64(u64, 2, uint64x1x2_t, vst2_u64, vld2_u64);
test_vld3_u64(u64, 3, uint64x1x3_t, vst3_u64, vld3_u64);
test_vld4_u64(u64, 4, uint64x1x4_t, vst4_u64, vld4_u64);

test_vld2q_u64_x2(u64, 4, uint64x2x2_t, vst2q_u64, vld2q_u64);
test_vld3q_u64_x3(u64, 6, uint64x2x3_t, vst3q_u64, vld3q_u64);
test_vld4q_u64_x4(u64, 8, uint64x2x4_t, vst4q_u64, vld4q_u64);
test_vld2q_u64(u64, 4, uint64x2x2_t, vst2q_u64, vld2q_u64);
test_vld3q_u64(u64, 6, uint64x2x3_t, vst3q_u64, vld3q_u64);
test_vld4q_u64(u64, 8, uint64x2x4_t, vst4q_u64, vld4q_u64);

test_vld2_p8_x2(p8, 16, poly8x8x2_t, vst2_p8, vld2_p8);
test_vld2_p8_x3(p8, 24, poly8x8x3_t, vst3_p8, vld3_p8);
test_vld2_p8_x4(p8, 32, poly8x8x4_t, vst4_p8, vld4_p8);
test_vld2_p8(p8, 16, poly8x8x2_t, vst2_p8, vld2_p8);
test_vld3_p8(p8, 24, poly8x8x3_t, vst3_p8, vld3_p8);
test_vld4_p8(p8, 32, poly8x8x4_t, vst4_p8, vld4_p8);

test_vld2q_p8_x2(p8, 32, poly8x16x2_t, vst2q_p8, vld2q_p8);
test_vld3q_p8_x3(p8, 48, poly8x16x3_t, vst3q_p8, vld3q_p8);
test_vld4q_p8_x4(p8, 64, poly8x16x4_t, vst4q_p8, vld4q_p8);
test_vld2q_p8(p8, 32, poly8x16x2_t, vst2q_p8, vld2q_p8);
test_vld3q_p8(p8, 48, poly8x16x3_t, vst3q_p8, vld3q_p8);
test_vld4q_p8(p8, 64, poly8x16x4_t, vst4q_p8, vld4q_p8);

test_vld2_p16_x2(p16, 8, poly16x4x2_t, vst2_p16, vld2_p16);
test_vld2_p16_x3(p16, 12, poly16x4x3_t, vst3_p16, vld3_p16);
test_vld2_p16_x4(p16, 16, poly16x4x4_t, vst4_p16, vld4_p16);
test_vld2_p16(p16, 8, poly16x4x2_t, vst2_p16, vld2_p16);
test_vld3_p16(p16, 12, poly16x4x3_t, vst3_p16, vld3_p16);
test_vld4_p16(p16, 16, poly16x4x4_t, vst4_p16, vld4_p16);

test_vld2q_p16_x2(p16, 16, poly16x8x2_t, vst2q_p16, vld2q_p16);
test_vld3q_p16_x3(p16, 24, poly16x8x3_t, vst3q_p16, vld3q_p16);
test_vld4q_p16_x4(p16, 32, poly16x8x4_t, vst4q_p16, vld4q_p16);
test_vld2q_p16(p16, 16, poly16x8x2_t, vst2q_p16, vld2q_p16);
test_vld3q_p16(p16, 24, poly16x8x3_t, vst3q_p16, vld3q_p16);
test_vld4q_p16(p16, 32, poly16x8x4_t, vst4q_p16, vld4q_p16);
}

/// Round-trips a single lane through a `vstN_lane_*` / `vldN_lane_*` pair:
/// stores lane `$idx` of each vector in the tuple, reloads it into the same
/// lane, and asserts the reconstructed tuple equals the original.
macro_rules! lane_wide_store_load_roundtrip {
($elem_ty:ty, $len:expr, $idx:expr, $vec_ty:ty, $store:ident, $load:ident) => {
// Distinct per-element values so a misplaced lane is visible in the diff.
let expected: [$elem_ty; $len] = crate::array::from_fn(|i| i as $elem_ty);
let src: $vec_ty = transmute(expected);
// Scratch buffer: a lane store writes at most 4 elements (x2..x4 tuples).
let mut scratch = [0 as $elem_ty; 4];
$store::<$idx>(scratch.as_mut_ptr().cast(), src);
// Loading the stored lane back into `src` must reproduce it exactly.
let reloaded: $vec_ty = $load::<$idx>(scratch.as_ptr().cast(), src);
let actual: [$elem_ty; $len] = transmute(reloaded);
assert_eq!(actual, expected);
};
}

/// Expands each `name(args);` entry into a `#[simd_test(enable = "neon")]`
/// test fn whose body invokes `lane_wide_store_load_roundtrip!` on `args`.
macro_rules! lane_wide_store_load_roundtrip_neon {
// One entry per test: fn name followed by the round-trip argument tuple,
// `;`-separated with an optional trailing `;`.
($( $name:ident $args:tt);* $(;)?) => {
$(
// NOTE(review): presumably ignored under miri because the lane
// load/store intrinsics are not supported there — confirm.
#[cfg_attr(miri, ignore)]
#[simd_test(enable = "neon")]
unsafe fn $name() {
lane_wide_store_load_roundtrip! $args;
}
)*
};
}

// Lane store/load round-trip tests. Arguments per entry:
// (element type, total element count of the tuple, lane index, tuple type,
//  store intrinsic, load intrinsic). Lane 15 is the last lane of a 16-lane
// q-register i8/u8 vector; 64-bit-element vectors use lane 0 (d) or 1 (q).
lane_wide_store_load_roundtrip_neon! {
// 8-bit signed, q registers, highest lane.
test_vld2q_lane_s8(i8, 32, 15, int8x16x2_t, vst2q_lane_s8, vld2q_lane_s8);
test_vld3q_lane_s8(i8, 48, 15, int8x16x3_t, vst3q_lane_s8, vld3q_lane_s8);
test_vld4q_lane_s8(i8, 64, 15, int8x16x4_t, vst4q_lane_s8, vld4q_lane_s8);

// 8-bit unsigned, q registers, highest lane.
test_vld2q_lane_u8(u8, 32, 15, uint8x16x2_t, vst2q_lane_u8, vld2q_lane_u8);
test_vld3q_lane_u8(u8, 48, 15, uint8x16x3_t, vst3q_lane_u8, vld3q_lane_u8);
test_vld4q_lane_u8(u8, 64, 15, uint8x16x4_t, vst4q_lane_u8, vld4q_lane_u8);

// 64-bit signed: d registers have a single lane (0), q registers two (use 1).
test_vld2_lane_s64(i64, 2, 0, int64x1x2_t, vst2_lane_s64, vld2_lane_s64);
test_vld3_lane_s64(i64, 3, 0, int64x1x3_t, vst3_lane_s64, vld3_lane_s64);
test_vld4_lane_s64(i64, 4, 0, int64x1x4_t, vst4_lane_s64, vld4_lane_s64);
test_vld2q_lane_s64(i64, 4, 1, int64x2x2_t, vst2q_lane_s64, vld2q_lane_s64);
test_vld3q_lane_s64(i64, 6, 1, int64x2x3_t, vst3q_lane_s64, vld3q_lane_s64);
test_vld4q_lane_s64(i64, 8, 1, int64x2x4_t, vst4q_lane_s64, vld4q_lane_s64);

// 64-bit unsigned, same lane choices as the signed variants.
test_vld2_lane_u64(u64, 2, 0, uint64x1x2_t, vst2_lane_u64, vld2_lane_u64);
test_vld3_lane_u64(u64, 3, 0, uint64x1x3_t, vst3_lane_u64, vld3_lane_u64);
test_vld4_lane_u64(u64, 4, 0, uint64x1x4_t, vst4_lane_u64, vld4_lane_u64);
test_vld2q_lane_u64(u64, 4, 1, uint64x2x2_t, vst2q_lane_u64, vld2q_lane_u64);
test_vld3q_lane_u64(u64, 6, 1, uint64x2x3_t, vst3q_lane_u64, vld3q_lane_u64);
test_vld4q_lane_u64(u64, 8, 1, uint64x2x4_t, vst4q_lane_u64, vld4q_lane_u64);
}
}

Expand Down
Loading