From 13e24003472c13636d4350a530d6e6495775afeb Mon Sep 17 00:00:00 2001
From: Scott McMurray <scottmcm@users.noreply.github.com>
Date: Sat, 27 May 2017 20:56:25 -0700
Subject: [PATCH 1/2] Add ctlz_nonzero & cttz_nonzero intrinsics

LLVM currently doesn't remove the "bypass if argument is zero" assembly inside branches where the value is known to be non-zero, pessimizing code that uses uN::leading_zeros
---
 src/libcore/intrinsics.rs               | 34 +++++++++++++++++++++++
 src/librustc_trans/intrinsic.rs         |  8 +++++-
 src/librustc_typeck/check/intrinsic.rs  |  3 +-
 src/test/run-pass/intrinsics-integer.rs | 37 +++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/src/libcore/intrinsics.rs b/src/libcore/intrinsics.rs
index 3566bbdebc2..8188c15a282 100644
--- a/src/libcore/intrinsics.rs
+++ b/src/libcore/intrinsics.rs
@@ -1229,6 +1229,23 @@ extern "rust-intrinsic" {
     /// ```
     pub fn ctlz<T>(x: T) -> T;
 
+    /// Like `ctlz`, but extra-unsafe as it returns `undef` when
+    /// given an `x` with value `0`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(core_intrinsics)]
+    ///
+    /// use std::intrinsics::ctlz_nonzero;
+    ///
+    /// let x = 0b0001_1100_u8;
+    /// let num_leading = unsafe { ctlz_nonzero(x) };
+    /// assert_eq!(num_leading, 3);
+    /// ```
+    #[cfg(not(stage0))]
+    pub fn ctlz_nonzero<T>(x: T) -> T;
+
     /// Returns the number of trailing unset bits (zeroes) in an integer type `T`.
     ///
     /// # Examples
@@ -1256,6 +1273,23 @@ extern "rust-intrinsic" {
     /// ```
     pub fn cttz<T>(x: T) -> T;
 
+    /// Like `cttz`, but extra-unsafe as it returns `undef` when
+    /// given an `x` with value `0`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(core_intrinsics)]
+    ///
+    /// use std::intrinsics::cttz_nonzero;
+    ///
+    /// let x = 0b0011_1000_u8;
+    /// let num_trailing = unsafe { cttz_nonzero(x) };
+    /// assert_eq!(num_trailing, 3);
+    /// ```
+    #[cfg(not(stage0))]
+    pub fn cttz_nonzero<T>(x: T) -> T;
+
     /// Reverses the bytes in an integer type `T`.
     pub fn bswap<T>(x: T) -> T;
 
diff --git a/src/librustc_trans/intrinsic.rs b/src/librustc_trans/intrinsic.rs
index f2e6aa3ef00..de908bb24a7 100644
--- a/src/librustc_trans/intrinsic.rs
+++ b/src/librustc_trans/intrinsic.rs
@@ -267,7 +267,7 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
             };
             bcx.call(expect, &[llargs[0], C_i32(ccx, rw), llargs[1], C_i32(ccx, cache_type)], None)
         },
-        "ctlz" | "cttz" | "ctpop" | "bswap" |
+        "ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "ctpop" | "bswap" |
         "add_with_overflow" | "sub_with_overflow" | "mul_with_overflow" |
         "overflowing_add" | "overflowing_sub" | "overflowing_mul" |
         "unchecked_div" | "unchecked_rem" | "unchecked_shl" | "unchecked_shr" => {
@@ -280,6 +280,12 @@ pub fn trans_intrinsic_call<'a, 'tcx>(bcx: &Builder<'a, 'tcx>,
                             let llfn = ccx.get_intrinsic(&format!("llvm.{}.i{}", name, width));
                             bcx.call(llfn, &[llargs[0], y], None)
                         }
+                        "ctlz_nonzero" | "cttz_nonzero" => {
+                            let y = C_bool(bcx.ccx, true);
+                            let llvm_name = &format!("llvm.{}.i{}", &name[..4], width);
+                            let llfn = ccx.get_intrinsic(llvm_name);
+                            bcx.call(llfn, &[llargs[0], y], None)
+                        }
                         "ctpop" => bcx.call(ccx.get_intrinsic(&format!("llvm.ctpop.i{}", width)),
                                         &llargs, None),
                         "bswap" => {
diff --git a/src/librustc_typeck/check/intrinsic.rs b/src/librustc_typeck/check/intrinsic.rs
index daf202cd797..4d9f50b0fc0 100644
--- a/src/librustc_typeck/check/intrinsic.rs
+++ b/src/librustc_typeck/check/intrinsic.rs
@@ -272,7 +272,8 @@ pub fn check_intrinsic_type<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
             "volatile_store" =>
                 (1, vec![ tcx.mk_mut_ptr(param(0)), param(0) ], tcx.mk_nil()),
 
-            "ctpop" | "ctlz" | "cttz" | "bswap" => (1, vec![param(0)], param(0)),
+            "ctpop" | "ctlz" | "ctlz_nonzero" | "cttz" | "cttz_nonzero" | "bswap" =>
+                (1, vec![param(0)], param(0)),
 
             "add_with_overflow" | "sub_with_overflow"  | "mul_with_overflow" =>
                 (1, vec![param(0), param(0)],
diff --git a/src/test/run-pass/intrinsics-integer.rs b/src/test/run-pass/intrinsics-integer.rs
index 759dc515456..4896f02da20 100644
--- a/src/test/run-pass/intrinsics-integer.rs
+++ b/src/test/run-pass/intrinsics-integer.rs
@@ -14,7 +14,9 @@ mod rusti {
     extern "rust-intrinsic" {
         pub fn ctpop<T>(x: T) -> T;
         pub fn ctlz<T>(x: T) -> T;
+        pub fn ctlz_nonzero<T>(x: T) -> T;
         pub fn cttz<T>(x: T) -> T;
+        pub fn cttz_nonzero<T>(x: T) -> T;
         pub fn bswap<T>(x: T) -> T;
     }
 }
@@ -68,6 +70,21 @@ pub fn main() {
         assert_eq!(ctlz(100u32), 25); assert_eq!(ctlz(100i32), 25);
         assert_eq!(ctlz(100u64), 57); assert_eq!(ctlz(100i64), 57);
 
+        assert_eq!(ctlz_nonzero(1u8), 7); assert_eq!(ctlz_nonzero(1i8), 7);
+        assert_eq!(ctlz_nonzero(1u16), 15); assert_eq!(ctlz_nonzero(1i16), 15);
+        assert_eq!(ctlz_nonzero(1u32), 31); assert_eq!(ctlz_nonzero(1i32), 31);
+        assert_eq!(ctlz_nonzero(1u64), 63); assert_eq!(ctlz_nonzero(1i64), 63);
+
+        assert_eq!(ctlz_nonzero(10u8), 4); assert_eq!(ctlz_nonzero(10i8), 4);
+        assert_eq!(ctlz_nonzero(10u16), 12); assert_eq!(ctlz_nonzero(10i16), 12);
+        assert_eq!(ctlz_nonzero(10u32), 28); assert_eq!(ctlz_nonzero(10i32), 28);
+        assert_eq!(ctlz_nonzero(10u64), 60); assert_eq!(ctlz_nonzero(10i64), 60);
+
+        assert_eq!(ctlz_nonzero(100u8), 1); assert_eq!(ctlz_nonzero(100i8), 1);
+        assert_eq!(ctlz_nonzero(100u16), 9); assert_eq!(ctlz_nonzero(100i16), 9);
+        assert_eq!(ctlz_nonzero(100u32), 25); assert_eq!(ctlz_nonzero(100i32), 25);
+        assert_eq!(ctlz_nonzero(100u64), 57); assert_eq!(ctlz_nonzero(100i64), 57);
+
         assert_eq!(cttz(-1i8 as u8), 0); assert_eq!(cttz(-1i8), 0);
         assert_eq!(cttz(-1i16 as u16), 0); assert_eq!(cttz(-1i16), 0);
         assert_eq!(cttz(-1i32 as u32), 0); assert_eq!(cttz(-1i32), 0);
@@ -93,6 +110,26 @@ pub fn main() {
         assert_eq!(cttz(100u32), 2); assert_eq!(cttz(100i32), 2);
         assert_eq!(cttz(100u64), 2); assert_eq!(cttz(100i64), 2);
 
+        assert_eq!(cttz_nonzero(-1i8 as u8), 0); assert_eq!(cttz_nonzero(-1i8), 0);
+        assert_eq!(cttz_nonzero(-1i16 as u16), 0); assert_eq!(cttz_nonzero(-1i16), 0);
+        assert_eq!(cttz_nonzero(-1i32 as u32), 0); assert_eq!(cttz_nonzero(-1i32), 0);
+        assert_eq!(cttz_nonzero(-1i64 as u64), 0); assert_eq!(cttz_nonzero(-1i64), 0);
+
+        assert_eq!(cttz_nonzero(1u8), 0); assert_eq!(cttz_nonzero(1i8), 0);
+        assert_eq!(cttz_nonzero(1u16), 0); assert_eq!(cttz_nonzero(1i16), 0);
+        assert_eq!(cttz_nonzero(1u32), 0); assert_eq!(cttz_nonzero(1i32), 0);
+        assert_eq!(cttz_nonzero(1u64), 0); assert_eq!(cttz_nonzero(1i64), 0);
+
+        assert_eq!(cttz_nonzero(10u8), 1); assert_eq!(cttz_nonzero(10i8), 1);
+        assert_eq!(cttz_nonzero(10u16), 1); assert_eq!(cttz_nonzero(10i16), 1);
+        assert_eq!(cttz_nonzero(10u32), 1); assert_eq!(cttz_nonzero(10i32), 1);
+        assert_eq!(cttz_nonzero(10u64), 1); assert_eq!(cttz_nonzero(10i64), 1);
+
+        assert_eq!(cttz_nonzero(100u8), 2); assert_eq!(cttz_nonzero(100i8), 2);
+        assert_eq!(cttz_nonzero(100u16), 2); assert_eq!(cttz_nonzero(100i16), 2);
+        assert_eq!(cttz_nonzero(100u32), 2); assert_eq!(cttz_nonzero(100i32), 2);
+        assert_eq!(cttz_nonzero(100u64), 2); assert_eq!(cttz_nonzero(100i64), 2);
+
         assert_eq!(bswap(0x0Au8), 0x0A); // no-op
         assert_eq!(bswap(0x0Ai8), 0x0A); // no-op
         assert_eq!(bswap(0x0A0Bu16), 0x0B0A);

From 6d86f0c018b57fcb9ca12c801939130b7f8e441e Mon Sep 17 00:00:00 2001
From: Scott McMurray <scottmcm@users.noreply.github.com>
Date: Thu, 8 Jun 2017 21:29:45 -0700
Subject: [PATCH 2/2] Use ctlz_nonzero to improve ASM from next_power_of_two

---
 src/libcore/num/mod.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/libcore/num/mod.rs b/src/libcore/num/mod.rs
index b76eac9e51f..62d75445cc9 100644
--- a/src/libcore/num/mod.rs
+++ b/src/libcore/num/mod.rs
@@ -1262,6 +1262,7 @@ macro_rules! uint_impl {
     ($SelfT:ty, $ActualT:ty, $BITS:expr,
      $ctpop:path,
      $ctlz:path,
+     $ctlz_nonzero:path,
      $cttz:path,
      $bswap:path,
      $add_with_overflow:path,
@@ -2184,6 +2185,7 @@ macro_rules! uint_impl {
         // This method cannot overflow, as in the `next_power_of_two`
         // overflow cases it instead ends up returning the maximum value
         // of the type, and can return 0 for 0.
+        #[inline]
         fn one_less_than_next_power_of_two(self) -> Self {
             if self <= 1 { return 0; }
 
@@ -2192,7 +2194,7 @@ macro_rules! uint_impl {
             // (such as intel pre-haswell) have more efficient ctlz
             // intrinsics when the argument is non-zero.
             let p = self - 1;
-            let z = p.leading_zeros();
+            let z = unsafe { $ctlz_nonzero(p) };
             <$SelfT>::max_value() >> z
         }
 
@@ -2236,11 +2238,17 @@ macro_rules! uint_impl {
     }
 }
 
+#[cfg(stage0)]
+unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz(x) }
+#[cfg(not(stage0))]
+unsafe fn ctlz_nonzero<T>(x: T) -> T { intrinsics::ctlz_nonzero(x) }
+
 #[lang = "u8"]
 impl u8 {
     uint_impl! { u8, u8, 8,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2253,6 +2261,7 @@ impl u16 {
     uint_impl! { u16, u16, 16,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2265,6 +2274,7 @@ impl u32 {
     uint_impl! { u32, u32, 32,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2277,6 +2287,7 @@ impl u64 {
     uint_impl! { u64, u64, 64,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2289,6 +2300,7 @@ impl u128 {
     uint_impl! { u128, u128, 128,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2302,6 +2314,7 @@ impl usize {
     uint_impl! { usize, u16, 16,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2314,6 +2327,7 @@ impl usize {
     uint_impl! { usize, u32, 32,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,
@@ -2327,6 +2341,7 @@ impl usize {
     uint_impl! { usize, u64, 64,
         intrinsics::ctpop,
         intrinsics::ctlz,
+        ctlz_nonzero,
         intrinsics::cttz,
         intrinsics::bswap,
         intrinsics::add_with_overflow,