Auto merge of #42556 - scottmcm:ctz-nz, r=BurntSushi

Get LLVM to stop generating dead assembly in next_power_of_two

It turns out that LLVM can turn `@llvm.ctlz.i64(_, true)` into `@llvm.ctlz.i64(_, false)` ([`ctlz`](http://llvm.org/docs/LangRef.html#llvm-ctlz-intrinsic)) where valuable, but never does the opposite.  That leads to some silly assembly getting generated in certain cases.

A contrived-but-clear example https://is.gd/VAIKuC:
```rust
fn foo(x:u64) -> u32 {
    if x == 0 { return !0; }
    x.leading_zeros()
}
```
Generates
```asm
	testq	%rdi, %rdi
	je	.LBB0_1
	je	.LBB0_3    ; <-- wha?
	bsrq	%rdi, %rax
	xorq	$63, %rax
	retq
.LBB0_1:
	movl	$-1, %eax
	retq
.LBB0_3:
	movl	$64, %eax  ; <-- dead
	retq
```

I noticed this in `next_power_of_two`, which without this PR generates the following:
```asm
	cmpq	$2, %rcx
	jae	.LBB1_2
	movl	$1, %eax
	retq
.LBB1_2:
	decq	%rcx
	je	.LBB1_3
	bsrq	%rcx, %rcx
	xorq	$63, %rcx
	jmp	.LBB1_5
.LBB1_3:
	movl	$64, %ecx  ; <-- dead
.LBB1_5:
	movq	$-1, %rax
	shrq	%cl, %rax
	incq	%rax
	retq
```

And with this PR becomes
```asm
	cmpq	$2, %rcx
	jae	.LBB0_2
	movl	$1, %eax
	retq
.LBB0_2:
	decq	%rcx
	bsrq	%rcx, %rcx
	xorl	$63, %ecx
	movq	$-1, %rax
	shrq	%cl, %rax
	incq	%rax
	retq
```
This commit is contained in:
bors 2017-06-10 09:11:36 +00:00
commit 995f741a0e
5 changed files with 96 additions and 3 deletions

View File

@ -1229,6 +1229,23 @@ extern "rust-intrinsic" {
/// ```
pub fn ctlz<T>(x: T) -> T;
/// Like `ctlz`, but extra-unsafe as it returns `undef` when
/// given an `x` with value `0`.
///
/// # Examples
///
/// ```
/// #![feature(core_intrinsics)]
///
/// use std::intrinsics::ctlz_nonzero;
///
/// let x = 0b0001_1100_u8;
/// let num_leading = unsafe { ctlz_nonzero(x) };
/// assert_eq!(num_leading, 3);
/// ```
#[cfg(not(stage0))]
pub fn ctlz_nonzero<T>(x: T) -> T;
/// Returns the number of trailing unset bits (zeroes) in an integer type `T`.
///
/// # Examples
@ -1256,6 +1273,23 @@ extern "rust-intrinsic" {
/// ```
pub fn cttz<T>(x: T) -> T;
/// Like `cttz`, but extra-unsafe as it returns `undef` when
/// given an `x` with value `0`.
///
/// # Examples
///
/// ```
/// #![feature(core_intrinsics)]
///
/// use std::intrinsics::cttz_nonzero;
///
/// let x = 0b0011_1000_u8;
/// let num_trailing = unsafe { cttz_nonzero(x) };
/// assert_eq!(num_trailing, 3);
/// ```
#[cfg(not(stage0))]
pub fn cttz_nonzero<T>(x: T) -> T;
/// Reverses the bytes in an integer type `T`.
pub fn bswap<T>(x: T) -> T;

View File

@ -1262,6 +1262,7 @@ macro_rules! uint_impl {
($SelfT:ty, $ActualT:ty, $BITS:expr,
$ctpop:path,
$ctlz:path,
$ctlz_nonzero:path,
$cttz:path,
$bswap:path,
$add_with_overflow:path,
@ -2184,6 +2185,7 @@ macro_rules! uint_impl {
// This method cannot overflow, as in the `next_power_of_two`
// overflow cases it instead ends up returning the maximum value
// of the type, and can return 0 for 0.
#[inline]
fn one_less_than_next_power_of_two(self) -> Self {
if self <= 1 { return 0; }
@ -2192,7 +2194,7 @@ macro_rules! uint_impl {
// (such as intel pre-haswell) have more efficient ctlz
// intrinsics when the argument is non-zero.
let p = self - 1;
let z = p.leading_zeros();
let z = unsafe { $ctlz_nonzero(p) };
<$SelfT>::max_value() >> z
}
@ -2236,11 +2238,17 @@ macro_rules! uint_impl {
}
}
#[cfg(stage0)]
unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz(x) }
#[cfg(not(stage0))]
unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz_nonzero(x) }
#[lang = "u8"]
impl u8 {
uint_impl! { u8, u8, 8,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2253,6 +2261,7 @@ impl u16 {
uint_impl! { u16, u16, 16,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2265,6 +2274,7 @@ impl u32 {
uint_impl! { u32, u32, 32,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2277,6 +2287,7 @@ impl u64 {
uint_impl! { u64, u64, 64,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2289,6 +2300,7 @@ impl u128 {
uint_impl! { u128, u128, 128,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2302,6 +2314,7 @@ impl usize {
uint_impl! { usize, u16, 16,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2314,6 +2327,7 @@ impl usize {
uint_impl! { usize, u32, 32,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,
@ -2327,6 +2341,7 @@ impl usize {
uint_impl! { usize, u64, 64,
intrinsics::ctpop,
intrinsics::ctlz,
ctlz_nonzero,
intrinsics::cttz,
intrinsics::bswap,
intrinsics::add_with_overflow,

View File

@ -267,7 +267,7 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
};
bcx.call(expect, &[llargs[0], C_i32(ccx, rw), llargs[1], C_i32(ccx, cache_type)], None)
},
"ctlz" | "cttz" | "ctpop" | "bswap" |
"ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "ctpop" | "bswap" |
"add_with_overflow" | "sub_with_overflow" | "mul_with_overflow" |
"overflowing_add" | "overflowing_sub" | "overflowing_mul" |
"unchecked_div" | "unchecked_rem" | "unchecked_shl" | "unchecked_shr" => {
@ -280,6 +280,12 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
let llfn = ccx.get_intrinsic(&format!("llvm.{}.i{}", name, width));
bcx.call(llfn, &[llargs[0], y], None)
}
"ctlz_nonzero" | "cttz_nonzero" => {
let y = C_bool(bcx.ccx, true);
let llvm_name = &format!("llvm.{}.i{}", &name[..4], width);
let llfn = ccx.get_intrinsic(llvm_name);
bcx.call(llfn, &[llargs[0], y], None)
}
"ctpop" => bcx.call(ccx.get_intrinsic(&format!("llvm.ctpop.i{}", width)),
&llargs, None),
"bswap" => {

View File

@ -272,7 +272,8 @@ pub fn check_intrinsic_type<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
"volatile_store" =>
(1, vec![ tcx.mk_mut_ptr(param(0)), param(0) ], tcx.mk_nil()),
"ctpop" | "ctlz" | "cttz" | "bswap" => (1, vec![param(0)], param(0)),
"ctpop" | "ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "bswap" =>
(1, vec![param(0)], param(0)),
"add_with_overflow" | "sub_with_overflow" | "mul_with_overflow" =>
(1, vec![param(0), param(0)],

View File

@ -14,7 +14,9 @@ mod rusti {
extern "rust-intrinsic" {
pub fn ctpop<T>(x: T) -> T;
pub fn ctlz<T>(x: T) -> T;
pub fn ctlz_nonzero<T>(x: T) -> T;
pub fn cttz<T>(x: T) -> T;
pub fn cttz_nonzero<T>(x: T) -> T;
pub fn bswap<T>(x: T) -> T;
}
}
@ -68,6 +70,21 @@ pub fn main() {
assert_eq!(ctlz(100u32), 25); assert_eq!(ctlz(100i32), 25);
assert_eq!(ctlz(100u64), 57); assert_eq!(ctlz(100i64), 57);
assert_eq!(ctlz_nonzero(1u8), 7); assert_eq!(ctlz_nonzero(1i8), 7);
assert_eq!(ctlz_nonzero(1u16), 15); assert_eq!(ctlz_nonzero(1i16), 15);
assert_eq!(ctlz_nonzero(1u32), 31); assert_eq!(ctlz_nonzero(1i32), 31);
assert_eq!(ctlz_nonzero(1u64), 63); assert_eq!(ctlz_nonzero(1i64), 63);
assert_eq!(ctlz_nonzero(10u8), 4); assert_eq!(ctlz_nonzero(10i8), 4);
assert_eq!(ctlz_nonzero(10u16), 12); assert_eq!(ctlz_nonzero(10i16), 12);
assert_eq!(ctlz_nonzero(10u32), 28); assert_eq!(ctlz_nonzero(10i32), 28);
assert_eq!(ctlz_nonzero(10u64), 60); assert_eq!(ctlz_nonzero(10i64), 60);
assert_eq!(ctlz_nonzero(100u8), 1); assert_eq!(ctlz_nonzero(100i8), 1);
assert_eq!(ctlz_nonzero(100u16), 9); assert_eq!(ctlz_nonzero(100i16), 9);
assert_eq!(ctlz_nonzero(100u32), 25); assert_eq!(ctlz_nonzero(100i32), 25);
assert_eq!(ctlz_nonzero(100u64), 57); assert_eq!(ctlz_nonzero(100i64), 57);
assert_eq!(cttz(-1i8 as u8), 0); assert_eq!(cttz(-1i8), 0);
assert_eq!(cttz(-1i16 as u16), 0); assert_eq!(cttz(-1i16), 0);
assert_eq!(cttz(-1i32 as u32), 0); assert_eq!(cttz(-1i32), 0);
@ -93,6 +110,26 @@ pub fn main() {
assert_eq!(cttz(100u32), 2); assert_eq!(cttz(100i32), 2);
assert_eq!(cttz(100u64), 2); assert_eq!(cttz(100i64), 2);
assert_eq!(cttz_nonzero(-1i8 as u8), 0); assert_eq!(cttz_nonzero(-1i8), 0);
assert_eq!(cttz_nonzero(-1i16 as u16), 0); assert_eq!(cttz_nonzero(-1i16), 0);
assert_eq!(cttz_nonzero(-1i32 as u32), 0); assert_eq!(cttz_nonzero(-1i32), 0);
assert_eq!(cttz_nonzero(-1i64 as u64), 0); assert_eq!(cttz_nonzero(-1i64), 0);
assert_eq!(cttz_nonzero(1u8), 0); assert_eq!(cttz_nonzero(1i8), 0);
assert_eq!(cttz_nonzero(1u16), 0); assert_eq!(cttz_nonzero(1i16), 0);
assert_eq!(cttz_nonzero(1u32), 0); assert_eq!(cttz_nonzero(1i32), 0);
assert_eq!(cttz_nonzero(1u64), 0); assert_eq!(cttz_nonzero(1i64), 0);
assert_eq!(cttz_nonzero(10u8), 1); assert_eq!(cttz_nonzero(10i8), 1);
assert_eq!(cttz_nonzero(10u16), 1); assert_eq!(cttz_nonzero(10i16), 1);
assert_eq!(cttz_nonzero(10u32), 1); assert_eq!(cttz_nonzero(10i32), 1);
assert_eq!(cttz_nonzero(10u64), 1); assert_eq!(cttz_nonzero(10i64), 1);
assert_eq!(cttz_nonzero(100u8), 2); assert_eq!(cttz_nonzero(100i8), 2);
assert_eq!(cttz_nonzero(100u16), 2); assert_eq!(cttz_nonzero(100i16), 2);
assert_eq!(cttz_nonzero(100u32), 2); assert_eq!(cttz_nonzero(100i32), 2);
assert_eq!(cttz_nonzero(100u64), 2); assert_eq!(cttz_nonzero(100i64), 2);
assert_eq!(bswap(0x0Au8), 0x0A); // no-op
assert_eq!(bswap(0x0Ai8), 0x0A); // no-op
assert_eq!(bswap(0x0A0Bu16), 0x0B0A);