diff --git a/example/std_example.rs b/example/std_example.rs index 9da701d4469..e3b3edd86af 100644 --- a/example/std_example.rs +++ b/example/std_example.rs @@ -65,6 +65,8 @@ unsafe fn test_simd() { assert_eq!(std::mem::transmute::<_, [u16; 8]>(cmp_lt), [0, 0, 0, 0, 0, 0, 0, 0]); test_mm_slli_si128(); + test_mm_movemask_epi8(); + test_mm256_movemask_epi8(); } #[target_feature(enable = "sse2")] @@ -109,6 +111,31 @@ unsafe fn test_mm_slli_si128() { assert_eq_m128i(r, _mm_set1_epi8(0)); } +#[target_feature(enable = "sse2")] +unsafe fn test_mm_movemask_epi8() { + use std::arch::x86_64::*; + + #[rustfmt::skip] + let a = _mm_setr_epi8( + 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01, + 0b0101, 0b1111_0000u8 as i8, 0, 0, + 0, 0, 0b1111_0000u8 as i8, 0b0101, + 0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, + ); + let r = _mm_movemask_epi8(a); + assert_eq!(r, 0b10100100_00100101); +} + +#[target_feature(enable = "avx2")] +unsafe fn test_mm256_movemask_epi8() { + use std::arch::x86_64::*; + + let a = _mm256_set1_epi8(-1); + let r = _mm256_movemask_epi8(a); + let e = -1; + assert_eq!(r, e); +} + fn assert_eq_m128i(x: std::arch::x86_64::__m128i, y: std::arch::x86_64::__m128i) { unsafe { assert_eq!(std::mem::transmute::<_, [u8; 16]>(x), std::mem::transmute::<_, [u8; 16]>(x)); diff --git a/src/llvm_intrinsics.rs b/src/llvm_intrinsics.rs index bb993298411..32aa8b5d3df 100644 --- a/src/llvm_intrinsics.rs +++ b/src/llvm_intrinsics.rs @@ -33,15 +33,15 @@ pub fn codegen_llvm_intrinsic_call<'a, 'tcx: 'a>( crate::trap::trap_unimplemented(fx, intrinsic); }; - // Used by _mm_movemask_epi8 - llvm.x86.sse2.pmovmskb.128, (c a) { + // Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8` + llvm.x86.sse2.pmovmskb.128 | llvm.x86.avx2.pmovmskb, (c a) { let (lane_layout, lane_count) = crate::intrinsics::lane_type_and_count(fx, a.layout(), intrinsic); assert_eq!(lane_layout.ty.sty, fx.tcx.types.i8.sty); - assert_eq!(lane_count, 16); + assert!(lane_count == 16 || lane_count == 32); let mut res = fx.bcx.ins().iconst(types::I32, 0); - for lane in 0..16 { + for lane in 0..lane_count { let a_lane = a.value_field(fx, mir::Field::new(lane.try_into().unwrap())).load_scalar(fx); let a_lane_sign = fx.bcx.ins().ushr_imm(a_lane, 7); // extract sign bit of 8bit int let a_lane_sign = fx.bcx.ins().uextend(types::I32, a_lane_sign); @@ -65,6 +65,5 @@ pub fn codegen_llvm_intrinsic_call<'a, 'tcx: 'a>( // llvm.x86.avx2.vperm2i128 // llvm.x86.ssse3.pshuf.b.128 // llvm.x86.avx2.pshuf.b -// llvm.x86.avx2.pmovmskb // llvm.x86.avx2.psrli.w // llvm.x86.sse2.psrli.w