Implement a faster sort algorithm
This is a complete rewrite of the standard sort algorithm. The new algorithm is a simplified variant of TimSort. In summary, the changes are:

* Improved performance, especially on partially sorted inputs.
* Performs fewer comparisons on both random and partially sorted inputs.
* Decreased the size of temporary memory: the new sort allocates 4x less.
parent ff261d3a6b
commit c8d73ea68a
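For context, a small usage sketch (not part of the commit) exercising the stable sort API that the new algorithm sits behind; the input values are made up for illustration:

```rust
// Illustrative only: the public API is unchanged by this commit; this just exercises
// the stable sort that the new algorithm backs. The values are made up.
fn main() {
    // A partially sorted input: the new algorithm picks up the pre-existing runs.
    let mut v = vec![1, 2, 3, 7, 8, 9, 4, 5, 6];
    v.sort();
    assert_eq!(v, [1, 2, 3, 4, 5, 6, 7, 8, 9]);

    // Stability: elements with equal keys keep their original relative order.
    let mut pairs = vec![(1, 'b'), (0, 'a'), (1, 'a'), (0, 'b')];
    pairs.sort_by_key(|&(k, _)| k);
    assert_eq!(pairs, [(0, 'a'), (0, 'b'), (1, 'b'), (1, 'a')]);
}
```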
@@ -47,14 +47,14 @@
#![feature(placement_in)]
#![feature(placement_new_protocol)]
#![feature(shared)]
#![feature(slice_get_slice)]
#![feature(slice_patterns)]
#![feature(specialization)]
#![feature(staged_api)]
#![feature(step_by)]
#![feature(trusted_len)]
#![feature(unicode)]
#![feature(unique)]
#![feature(slice_get_slice)]
#![feature(untagged_unions)]
#![cfg_attr(test, feature(rand, test))]

#![no_std]

@@ -98,8 +98,7 @@
#![cfg_attr(test, allow(unused_imports, dead_code))]

use alloc::boxed::Box;
use core::cmp::Ordering::{self, Greater, Less};
use core::cmp;
use core::cmp::Ordering::{self, Greater};
use core::mem::size_of;
use core::mem;
use core::ptr;
@@ -1042,8 +1041,8 @@ impl<T> [T] {

/// This is equivalent to `self.sort_by(|a, b| a.cmp(b))`.
///
/// This sort is stable and `O(n log n)` worst-case but allocates
/// approximately `2 * n` where `n` is the length of `self`.
/// This sort is stable and `O(n log n)` worst-case, but allocates
/// temporary storage half the size of `self`.
///
/// # Examples
///
@@ -1064,8 +1063,8 @@ impl<T> [T] {
/// Sorts the slice, in place, using `f` to extract a key by which to
/// order the sort by.
///
/// This sort is stable and `O(n log n)` worst-case but allocates
/// approximately `2 * n`, where `n` is the length of `self`.
/// This sort is stable and `O(n log n)` worst-case, but allocates
/// temporary storage half the size of `self`.
///
/// # Examples
///
@@ -1086,8 +1085,8 @@ impl<T> [T] {
/// Sorts the slice, in place, using `compare` to compare
/// elements.
///
/// This sort is stable and `O(n log n)` worst-case but allocates
/// approximately `2 * n`, where `n` is the length of `self`.
/// This sort is stable and `O(n log n)` worst-case, but allocates
/// temporary storage half the size of `self`.
///
/// # Examples
///
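As an aside, a minimal illustration of the `sort_by` behaviour the doc text above describes; the concrete values here are my own, not taken from the commit:

```rust
// Illustrative only: `sort_by` with an explicit comparator, matching the doc text above.
fn main() {
    let mut v = [5, 4, 1, 3, 2];
    v.sort_by(|a, b| a.cmp(b)); // ascending
    assert_eq!(v, [1, 2, 3, 4, 5]);

    v.sort_by(|a, b| b.cmp(a)); // descending, still stable and O(n log n)
    assert_eq!(v, [5, 4, 3, 2, 1]);
}
```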
@@ -1305,213 +1304,332 @@ impl<T: Clone> ToOwned for [T] {
|
||||
// Sorting
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
fn insertion_sort<T, F>(v: &mut [T], mut compare: F)
|
||||
/// Inserts `v[0]` into the pre-sorted sequence `v[1..]` so that the whole `v[..]` becomes sorted.
|
||||
///
|
||||
/// This is the integral subroutine of insertion sort.
|
||||
fn insert_head<T, F>(v: &mut [T], compare: &mut F)
|
||||
where F: FnMut(&T, &T) -> Ordering
|
||||
{
|
||||
let len = v.len() as isize;
|
||||
let buf_v = v.as_mut_ptr();
|
||||
|
||||
// 1 <= i < len;
|
||||
for i in 1..len {
|
||||
// j satisfies: 0 <= j <= i;
|
||||
let mut j = i;
|
||||
if v.len() >= 2 && compare(&v[0], &v[1]) == Greater {
|
||||
unsafe {
|
||||
// `i` is in bounds.
|
||||
let read_ptr = buf_v.offset(i) as *const T;
|
||||
// There are three ways to implement insertion here:
|
||||
//
|
||||
// 1. Swap adjacent elements until the first one gets to its final destination.
|
||||
// However, this way we copy data around more than is necessary. If elements are big
|
||||
// structures (costly to copy), this method will be slow.
|
||||
//
|
||||
// 2. Iterate until the right place for the first element is found. Then shift the
|
||||
// elements succeeding it to make room for it and finally place it into the
|
||||
// remaining hole. This is a good method.
|
||||
//
|
||||
// 3. Copy the first element into a temporary variable. Iterate until the right place
|
||||
// for it is found. As we go along, copy every traversed element into the slot
|
||||
// preceding it. Finally, copy data from the temporary variable into the remaining
|
||||
// hole. This method is very good. Benchmarks demonstrated slightly better
|
||||
// performance than with the 2nd method.
|
||||
//
|
||||
// All methods were benchmarked, and the 3rd showed best results. So we chose that one.
|
||||
let mut tmp = NoDrop { value: ptr::read(&v[0]) };
|
||||
|
||||
// find where to insert, we need to do strict <,
|
||||
// rather than <=, to maintain stability.
|
||||
// Intermediate state of the insertion process is always tracked by `hole`, which
|
||||
// serves two purposes:
|
||||
// 1. Protects integrity of `v` from panics in `compare`.
|
||||
// 2. Fills the remaining hole in `v` in the end.
|
||||
//
|
||||
// Panic safety:
|
||||
//
|
||||
// If `compare` panics at any point during the process, `hole` will get dropped and
|
||||
// fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
|
||||
// initially held exactly once.
|
||||
let mut hole = InsertionHole {
|
||||
src: &mut tmp.value,
|
||||
dest: &mut v[1],
|
||||
};
|
||||
ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
|
||||
|
||||
// 0 <= j - 1 < len, so .offset(j - 1) is in bounds.
|
||||
while j > 0 && compare(&*read_ptr, &*buf_v.offset(j - 1)) == Less {
|
||||
j -= 1;
|
||||
for i in 2..v.len() {
|
||||
if compare(&tmp.value, &v[i]) != Greater {
|
||||
break;
|
||||
}
|
||||
ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
|
||||
hole.dest = &mut v[i];
|
||||
}
|
||||
// `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
|
||||
}
|
||||
}
|
||||
|
||||
// shift everything to the right, to make space to
|
||||
// insert this value.
|
||||
// Holds a value, but never drops it.
|
||||
#[allow(unions_with_drop_fields)]
|
||||
union NoDrop<T> {
|
||||
value: T
|
||||
}
|
||||
|
||||
// j + 1 could be `len` (for the last `i`), but in
|
||||
// that case, `i == j` so we don't copy. The
|
||||
// `.offset(j)` is always in bounds.
|
||||
// When dropped, copies from `src` into `dest`.
|
||||
struct InsertionHole<T> {
|
||||
src: *mut T,
|
||||
dest: *mut T,
|
||||
}
|
||||
|
||||
if i != j {
|
||||
let tmp = ptr::read(read_ptr);
|
||||
ptr::copy(&*buf_v.offset(j), buf_v.offset(j + 1), (i - j) as usize);
|
||||
ptr::copy_nonoverlapping(&tmp, buf_v.offset(j), 1);
|
||||
mem::forget(tmp);
|
||||
}
|
||||
impl<T> Drop for InsertionHole<T> {
|
||||
fn drop(&mut self) {
|
||||
unsafe { ptr::copy_nonoverlapping(self.src, self.dest, 1); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_sort<T, F>(v: &mut [T], mut compare: F)
|
||||
/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
|
||||
/// stores the result into `v[..]`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
|
||||
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
|
||||
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
|
||||
where F: FnMut(&T, &T) -> Ordering
|
||||
{
|
||||
// warning: this wildly uses unsafe.
|
||||
const BASE_INSERTION: usize = 32;
|
||||
const LARGE_INSERTION: usize = 16;
|
||||
|
||||
// FIXME #12092: smaller insertion runs seems to make sorting
|
||||
// vectors of large elements a little faster on some platforms,
|
||||
// but hasn't been tested/tuned extensively
|
||||
let insertion = if size_of::<T>() <= 16 {
|
||||
BASE_INSERTION
|
||||
} else {
|
||||
LARGE_INSERTION
|
||||
};
|
||||
|
||||
let len = v.len();
|
||||
let v = v.as_mut_ptr();
|
||||
let v_mid = v.offset(mid as isize);
|
||||
let v_end = v.offset(len as isize);
|
||||
|
||||
// short vectors get sorted in-place via insertion sort to avoid allocations
|
||||
if len <= insertion {
|
||||
insertion_sort(v, compare);
|
||||
return;
|
||||
}
|
||||
// The merge process first copies the shorter run into `buf`. Then it traces the newly copied
|
||||
// run and the longer run forwards (or backwards), comparing their next unconsumed elements and
|
||||
// copying the lesser (or greater) one into `v`.
|
||||
//
|
||||
// As soon as the shorter run is fully consumed, the process is done. If the longer run gets
|
||||
// consumed first, then we must copy whatever is left of the shorter run into the remaining
|
||||
// hole in `v`.
|
||||
//
|
||||
// Intermediate state of the process is always tracked by `hole`, which serves two purposes:
|
||||
// 1. Protects integrity of `v` from panics in `compare`.
|
||||
// 2. Fills the remaining hole in `v` if the longer run gets consumed first.
|
||||
//
|
||||
// Panic safety:
|
||||
//
|
||||
// If `compare` panics at any point during the process, `hole` will get dropped and fill the
|
||||
// hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
|
||||
// object it initially held exactly once.
|
||||
let mut hole;
|
||||
|
||||
// allocate some memory to use as scratch memory, we keep the
|
||||
// length 0 so we can keep shallow copies of the contents of `v`
|
||||
// without risking the dtors running on an object twice if
|
||||
// `compare` panics.
|
||||
let mut working_space = Vec::with_capacity(2 * len);
|
||||
// these both are buffers of length `len`.
|
||||
let mut buf_dat = working_space.as_mut_ptr();
|
||||
let mut buf_tmp = unsafe { buf_dat.offset(len as isize) };
|
||||
if mid <= len - mid {
|
||||
// The left run is shorter.
|
||||
ptr::copy_nonoverlapping(v, buf, mid);
|
||||
hole = MergeHole {
|
||||
start: buf,
|
||||
end: buf.offset(mid as isize),
|
||||
dest: v,
|
||||
};
|
||||
|
||||
// length `len`.
|
||||
let buf_v = v.as_ptr();
|
||||
// Initially, these pointers point to the beginnings of their arrays.
|
||||
let left = &mut hole.start;
|
||||
let mut right = v_mid;
|
||||
let out = &mut hole.dest;
|
||||
|
||||
// step 1. sort short runs with insertion sort. This takes the
|
||||
// values from `v` and sorts them into `buf_dat`, leaving that
|
||||
// with sorted runs of length INSERTION.
|
||||
while *left < hole.end && right < v_end {
|
||||
// Consume the lesser side.
|
||||
// If equal, prefer the left run to maintain stability.
|
||||
let to_copy = if compare(&**left, &*right) == Greater {
|
||||
get_and_increment(&mut right)
|
||||
} else {
|
||||
get_and_increment(left)
|
||||
};
|
||||
ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
|
||||
}
|
||||
} else {
|
||||
// The right run is shorter.
|
||||
ptr::copy_nonoverlapping(v_mid, buf, len - mid);
|
||||
hole = MergeHole {
|
||||
start: buf,
|
||||
end: buf.offset((len - mid) as isize),
|
||||
dest: v_mid,
|
||||
};
|
||||
|
||||
// We could hardcode the sorting comparisons here, and we could
|
||||
// manipulate/step the pointers themselves, rather than repeatedly
|
||||
// .offset-ing.
|
||||
for start in (0..len).step_by(insertion) {
|
||||
// start <= i < len;
|
||||
for i in start..cmp::min(start + insertion, len) {
|
||||
// j satisfies: start <= j <= i;
|
||||
let mut j = i as isize;
|
||||
unsafe {
|
||||
// `i` is in bounds.
|
||||
let read_ptr = buf_v.offset(i as isize);
|
||||
// Initially, these pointers point past the ends of their arrays.
|
||||
let left = &mut hole.dest;
|
||||
let right = &mut hole.end;
|
||||
let mut out = v_end;
|
||||
|
||||
// find where to insert, we need to do strict <,
|
||||
// rather than <=, to maintain stability.
|
||||
|
||||
// start <= j - 1 < len, so .offset(j - 1) is in
|
||||
// bounds.
|
||||
while j > start as isize && compare(&*read_ptr, &*buf_dat.offset(j - 1)) == Less {
|
||||
j -= 1;
|
||||
}
|
||||
|
||||
// shift everything to the right, to make space to
|
||||
// insert this value.
|
||||
|
||||
// j + 1 could be `len` (for the last `i`), but in
|
||||
// that case, `i == j` so we don't copy. The
|
||||
// `.offset(j)` is always in bounds.
|
||||
ptr::copy(&*buf_dat.offset(j), buf_dat.offset(j + 1), i - j as usize);
|
||||
ptr::copy_nonoverlapping(read_ptr, buf_dat.offset(j), 1);
|
||||
}
|
||||
while v < *left && buf < *right {
|
||||
// Consume the greater side.
|
||||
// If equal, prefer the right run to maintain stability.
|
||||
let to_copy = if compare(&*left.offset(-1), &*right.offset(-1)) == Greater {
|
||||
decrement_and_get(left)
|
||||
} else {
|
||||
decrement_and_get(right)
|
||||
};
|
||||
ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
|
||||
}
|
||||
}
|
||||
// Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
|
||||
// it will now be copied into the hole in `v`.
|
||||
|
||||
// step 2. merge the sorted runs.
|
||||
let mut width = insertion;
|
||||
while width < len {
|
||||
// merge the sorted runs of length `width` in `buf_dat` two at
|
||||
// a time, placing the result in `buf_tmp`.
|
||||
|
||||
// 0 <= start <= len.
|
||||
for start in (0..len).step_by(2 * width) {
|
||||
// manipulate pointers directly for speed (rather than
|
||||
// using a `for` loop with `range` and `.offset` inside
|
||||
// that loop).
|
||||
unsafe {
|
||||
// the end of the first run & start of the
|
||||
// second. Offset of `len` is defined, since this is
|
||||
// precisely one byte past the end of the object.
|
||||
let right_start = buf_dat.offset(cmp::min(start + width, len) as isize);
|
||||
// end of the second. Similar reasoning to the above re safety.
|
||||
let right_end_idx = cmp::min(start + 2 * width, len);
|
||||
let right_end = buf_dat.offset(right_end_idx as isize);
|
||||
|
||||
// the pointers to the elements under consideration
|
||||
// from the two runs.
|
||||
|
||||
// both of these are in bounds.
|
||||
let mut left = buf_dat.offset(start as isize);
|
||||
let mut right = right_start;
|
||||
|
||||
// where we're putting the results, it is a run of
|
||||
// length `2*width`, so we step it once for each step
|
||||
// of either `left` or `right`. `buf_tmp` has length
|
||||
// `len`, so these are in bounds.
|
||||
let mut out = buf_tmp.offset(start as isize);
|
||||
let out_end = buf_tmp.offset(right_end_idx as isize);
|
||||
|
||||
// If left[last] <= right[0], they are already in order:
|
||||
// fast-forward the left side (the right side is handled
|
||||
// in the loop).
|
||||
// If `right` is not empty then left is not empty, and
|
||||
// the offsets are in bounds.
|
||||
if right != right_end && compare(&*right.offset(-1), &*right) != Greater {
|
||||
let elems = (right_start as usize - left as usize) / mem::size_of::<T>();
|
||||
ptr::copy_nonoverlapping(&*left, out, elems);
|
||||
out = out.offset(elems as isize);
|
||||
left = right_start;
|
||||
}
|
||||
|
||||
while out < out_end {
|
||||
// Either the left or the right run are exhausted,
|
||||
// so just copy the remainder from the other run
|
||||
// and move on; this gives a huge speed-up (order
|
||||
// of 25%) for mostly sorted vectors (the best
|
||||
// case).
|
||||
if left == right_start {
|
||||
// the number remaining in this run.
|
||||
let elems = (right_end as usize - right as usize) / mem::size_of::<T>();
|
||||
ptr::copy_nonoverlapping(&*right, out, elems);
|
||||
break;
|
||||
} else if right == right_end {
|
||||
let elems = (right_start as usize - left as usize) / mem::size_of::<T>();
|
||||
ptr::copy_nonoverlapping(&*left, out, elems);
|
||||
break;
|
||||
}
|
||||
|
||||
// check which side is smaller, and that's the
|
||||
// next element for the new run.
|
||||
|
||||
// `left < right_start` and `right < right_end`,
|
||||
// so these are valid.
|
||||
let to_copy = if compare(&*left, &*right) == Greater {
|
||||
step(&mut right)
|
||||
} else {
|
||||
step(&mut left)
|
||||
};
|
||||
ptr::copy_nonoverlapping(&*to_copy, out, 1);
|
||||
step(&mut out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mem::swap(&mut buf_dat, &mut buf_tmp);
|
||||
|
||||
width *= 2;
|
||||
}
|
||||
|
||||
// write the result to `v` in one go, so that there are never two copies
|
||||
// of the same object in `v`.
|
||||
unsafe {
|
||||
ptr::copy_nonoverlapping(&*buf_dat, v.as_mut_ptr(), len);
|
||||
}
|
||||
|
||||
// increment the pointer, returning the old pointer.
|
||||
#[inline(always)]
|
||||
unsafe fn step<T>(ptr: &mut *mut T) -> *mut T {
|
||||
unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
|
||||
let old = *ptr;
|
||||
*ptr = ptr.offset(1);
|
||||
old
|
||||
}
|
||||
|
||||
unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
|
||||
*ptr = ptr.offset(-1);
|
||||
*ptr
|
||||
}
|
||||
|
||||
// When dropped, copies the range `start..end` into `dest..`.
|
||||
struct MergeHole<T> {
|
||||
start: *mut T,
|
||||
end: *mut T,
|
||||
dest: *mut T,
|
||||
}
|
||||
|
||||
impl<T> Drop for MergeHole<T> {
|
||||
fn drop(&mut self) {
|
||||
// `T` is not a zero-sized type, so it's okay to divide by its size.
|
||||
let len = (self.end as usize - self.start as usize) / mem::size_of::<T>();
|
||||
unsafe { ptr::copy_nonoverlapping(self.start, self.dest, len); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
/// [here](http://svn.python.org/projects/python/trunk/Objects/listsort.txt).
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied, for every `i` in `0 .. runs.len() - 2`:
///
/// 1. `runs[i].len > runs[i + 1].len`
/// 2. `runs[i].len > runs[i + 1].len + runs[i + 2].len`
///
/// The invariants ensure that the total running time is `O(n log n)` worst-case.
fn merge_sort<T, F>(v: &mut [T], mut compare: F)
|
||||
where F: FnMut(&T, &T) -> Ordering
|
||||
{
|
||||
// Sorting has no meaningful behavior on zero-sized types.
|
||||
if size_of::<T>() == 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
// FIXME #12092: These numbers are platform-specific and need more extensive testing/tuning.
|
||||
//
|
||||
// If `v` has length up to `max_insertion`, simply switch to insertion sort because it is going
|
||||
// to perform better than merge sort. For bigger types `T`, the threshold is smaller.
|
||||
//
|
||||
// Short runs are extended using insertion sort to span at least `min_run` elements, in order
|
||||
// to improve performance.
|
||||
let (max_insertion, min_run) = if size_of::<T>() <= 16 {
|
||||
(64, 32)
|
||||
} else {
|
||||
(32, 16)
|
||||
};
|
||||
|
||||
let len = v.len();
|
||||
|
||||
// Short arrays get sorted in-place via insertion sort to avoid allocations.
|
||||
if len <= max_insertion {
|
||||
if len >= 2 {
|
||||
for i in (0..len-1).rev() {
|
||||
insert_head(&mut v[i..], &mut compare);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
|
||||
// shallow copies of the contents of `v` without risking the dtors running on copies if
|
||||
// `compare` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
|
||||
// which will always have length at most `len / 2`.
|
||||
let mut buf = Vec::with_capacity(len / 2);
|
||||
|
||||
// In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
|
||||
// strange decision, but consider the fact that merges more often go in the opposite direction
|
||||
// (forwards). According to benchmarks, merging forwards is slightly faster than merging
|
||||
// backwards. To conclude, identifying runs by traversing backwards improves performance.
|
||||
let mut runs = vec![];
|
||||
let mut end = len;
|
||||
while end > 0 {
|
||||
// Find the next natural run, and reverse it if it's strictly descending.
|
||||
let mut start = end - 1;
|
||||
if start > 0 {
|
||||
start -= 1;
|
||||
if compare(&v[start], &v[start + 1]) == Greater {
|
||||
while start > 0 && compare(&v[start - 1], &v[start]) == Greater {
|
||||
start -= 1;
|
||||
}
|
||||
v[start..end].reverse();
|
||||
} else {
|
||||
while start > 0 && compare(&v[start - 1], &v[start]) != Greater {
|
||||
start -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert some more elements into the run if it's too short. Insertion sort is faster than
|
||||
// merge sort on short sequences, so this significantly improves performance.
|
||||
while start > 0 && end - start < min_run {
|
||||
start -= 1;
|
||||
insert_head(&mut v[start..end], &mut compare);
|
||||
}
|
||||
|
||||
// Push this run onto the stack.
|
||||
runs.push(Run {
|
||||
start: start,
|
||||
len: end - start,
|
||||
});
|
||||
end = start;
|
||||
|
||||
// Merge some pairs of adjacent runs to satisfy the invariants.
|
||||
while let Some(r) = collapse(&runs) {
|
||||
let left = runs[r + 1];
|
||||
let right = runs[r];
|
||||
unsafe {
|
||||
merge(&mut v[left.start .. right.start + right.len], left.len, buf.as_mut_ptr(),
|
||||
&mut compare);
|
||||
}
|
||||
runs[r] = Run {
|
||||
start: left.start,
|
||||
len: left.len + right.len,
|
||||
};
|
||||
runs.remove(r + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, exactly one run must remain in the stack.
|
||||
debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
|
||||
|
||||
// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
// algorithm should continue building a new run instead, `None` is returned.
//
// TimSort is infamous for its buggy implementations, as described here:
// http://envisage-project.eu/timsort-specification-and-verification/
//
// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
// Enforcing them on just the top three is not sufficient to ensure that the invariants will
// still hold for *all* runs in the stack.
//
// This function correctly checks invariants for the top four runs. Additionally, if the top
// run starts at index 0, it will always demand a merge operation until the stack is fully
// collapsed, in order to complete the sort.
fn collapse(runs: &[Run]) -> Option<usize> {
|
||||
let n = runs.len();
|
||||
if n >= 2 && (runs[n - 1].start == 0 ||
|
||||
runs[n - 2].len <= runs[n - 1].len ||
|
||||
(n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) ||
|
||||
(n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) {
|
||||
if n >= 3 && runs[n - 3].len < runs[n - 1].len {
|
||||
Some(n - 3)
|
||||
} else {
|
||||
Some(n - 2)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct Run {
|
||||
start: usize,
|
||||
len: usize,
|
||||
}
|
||||
}
|
||||
|
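To make the collapsing rule easier to follow outside the context of the diff, here is a hypothetical standalone sketch of the invariant check described in the comments above; the function name, the `lens` slice, the `top_starts_at_zero` flag, and the example run lengths are all my own, not part of the commit:

```rust
// Hypothetical standalone sketch of the run-collapsing rule documented above. `lens` holds
// the run lengths with the top of the stack last, and `top_starts_at_zero` mirrors the
// `runs[n - 1].start == 0` condition; the real code operates on the `Run` structs in the diff.
fn collapse_sketch(lens: &[usize], top_starts_at_zero: bool) -> Option<usize> {
    let n = lens.len();
    if n >= 2 && (top_starts_at_zero
        || lens[n - 2] <= lens[n - 1]
        || (n >= 3 && lens[n - 3] <= lens[n - 2] + lens[n - 1])
        || (n >= 4 && lens[n - 4] <= lens[n - 3] + lens[n - 2]))
    {
        // Prefer merging the pair whose left run is shorter, as in the code above.
        if n >= 3 && lens[n - 3] < lens[n - 1] {
            Some(n - 3)
        } else {
            Some(n - 2)
        }
    } else {
        None
    }
}

fn main() {
    // 30 <= 10 + 25 violates invariant 2, so the two topmost runs (10 and 25) are merged first.
    assert_eq!(collapse_sketch(&[100, 30, 10, 25], false), Some(2));
    // A stack that already satisfies both invariants needs no merging yet.
    assert_eq!(collapse_sketch(&[100, 40, 20], false), None);
}
```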
@@ -383,7 +383,7 @@ fn test_reverse() {
|
||||
|
||||
#[test]
|
||||
fn test_sort() {
|
||||
for len in 4..25 {
|
||||
for len in (2..25).chain(500..510) {
|
||||
for _ in 0..100 {
|
||||
let mut v: Vec<_> = thread_rng().gen_iter::<i32>().take(len).collect();
|
||||
let mut v1 = v.clone();
|
||||
@@ -410,7 +410,7 @@ fn test_sort() {
|
||||
|
||||
#[test]
|
||||
fn test_sort_stability() {
|
||||
for len in 4..25 {
|
||||
for len in (2..25).chain(500..510) {
|
||||
for _ in 0..10 {
|
||||
let mut counts = [0; 10];
|
||||
|
||||
@@ -441,6 +441,13 @@ fn test_sort_stability() {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sort_zero_sized_type() {
|
||||
// Should not panic.
|
||||
[(); 10].sort();
|
||||
[(); 100].sort();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_concat() {
|
||||
let v: [Vec<i32>; 0] = [];
|
||||
@@ -1338,89 +1345,104 @@ mod bench {
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_random_small(b: &mut Bencher) {
|
||||
fn gen_ascending(len: usize) -> Vec<u64> {
|
||||
(0..len as u64).collect()
|
||||
}
|
||||
|
||||
fn gen_descending(len: usize) -> Vec<u64> {
|
||||
(0..len as u64).rev().collect()
|
||||
}
|
||||
|
||||
fn gen_random(len: usize) -> Vec<u64> {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v: Vec<_> = rng.gen_iter::<u64>().take(5).collect();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 5 * mem::size_of::<u64>() as u64;
|
||||
rng.gen_iter::<u64>().take(len).collect()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_random_medium(b: &mut Bencher) {
|
||||
fn gen_mostly_ascending(len: usize) -> Vec<u64> {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v: Vec<_> = rng.gen_iter::<u64>().take(100).collect();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 100 * mem::size_of::<u64>() as u64;
|
||||
let mut v = gen_ascending(len);
|
||||
for _ in (0usize..).take_while(|x| x * x <= len) {
|
||||
let x = rng.gen::<usize>() % len;
|
||||
let y = rng.gen::<usize>() % len;
|
||||
v.swap(x, y);
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_random_large(b: &mut Bencher) {
|
||||
fn gen_mostly_descending(len: usize) -> Vec<u64> {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v: Vec<_> = rng.gen_iter::<u64>().take(10000).collect();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 10000 * mem::size_of::<u64>() as u64;
|
||||
let mut v = gen_descending(len);
|
||||
for _ in (0usize..).take_while(|x| x * x <= len) {
|
||||
let x = rng.gen::<usize>() % len;
|
||||
let y = rng.gen::<usize>() % len;
|
||||
v.swap(x, y);
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_sorted(b: &mut Bencher) {
|
||||
let mut v: Vec<_> = (0..10000).collect();
|
||||
b.iter(|| {
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
|
||||
}
|
||||
|
||||
type BigSortable = (u64, u64, u64, u64);
|
||||
|
||||
#[bench]
|
||||
fn sort_big_random_small(b: &mut Bencher) {
|
||||
fn gen_big_random(len: usize) -> Vec<[u64; 16]> {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v = rng.gen_iter::<BigSortable>()
|
||||
.take(5)
|
||||
.collect::<Vec<BigSortable>>();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 5 * mem::size_of::<BigSortable>() as u64;
|
||||
rng.gen_iter().map(|x| [x; 16]).take(len).collect()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_big_random_medium(b: &mut Bencher) {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v = rng.gen_iter::<BigSortable>()
|
||||
.take(100)
|
||||
.collect::<Vec<BigSortable>>();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 100 * mem::size_of::<BigSortable>() as u64;
|
||||
fn gen_big_ascending(len: usize) -> Vec<[u64; 16]> {
|
||||
(0..len as u64).map(|x| [x; 16]).take(len).collect()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn sort_big_random_large(b: &mut Bencher) {
|
||||
let mut rng = thread_rng();
|
||||
b.iter(|| {
|
||||
let mut v = rng.gen_iter::<BigSortable>()
|
||||
.take(10000)
|
||||
.collect::<Vec<BigSortable>>();
|
||||
v.sort();
|
||||
});
|
||||
b.bytes = 10000 * mem::size_of::<BigSortable>() as u64;
|
||||
fn gen_big_descending(len: usize) -> Vec<[u64; 16]> {
|
||||
(0..len as u64).rev().map(|x| [x; 16]).take(len).collect()
|
||||
}
|
||||
|
||||
macro_rules! sort_bench {
|
||||
($name:ident, $gen:expr, $len:expr) => {
|
||||
#[bench]
|
||||
fn $name(b: &mut Bencher) {
|
||||
b.iter(|| $gen($len).sort());
|
||||
b.bytes = $len * mem::size_of_val(&$gen(1)[0]) as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort_bench!(sort_small_random, gen_random, 10);
|
||||
sort_bench!(sort_small_ascending, gen_ascending, 10);
|
||||
sort_bench!(sort_small_descending, gen_descending, 10);
|
||||
|
||||
sort_bench!(sort_small_big_random, gen_big_random, 10);
|
||||
sort_bench!(sort_small_big_ascending, gen_big_ascending, 10);
|
||||
sort_bench!(sort_small_big_descending, gen_big_descending, 10);
|
||||
|
||||
sort_bench!(sort_medium_random, gen_random, 100);
|
||||
sort_bench!(sort_medium_ascending, gen_ascending, 100);
|
||||
sort_bench!(sort_medium_descending, gen_descending, 100);
|
||||
|
||||
sort_bench!(sort_large_random, gen_random, 10000);
|
||||
sort_bench!(sort_large_ascending, gen_ascending, 10000);
|
||||
sort_bench!(sort_large_descending, gen_descending, 10000);
|
||||
sort_bench!(sort_large_mostly_ascending, gen_mostly_ascending, 10000);
|
||||
sort_bench!(sort_large_mostly_descending, gen_mostly_descending, 10000);
|
||||
|
||||
sort_bench!(sort_large_big_random, gen_big_random, 10000);
|
||||
sort_bench!(sort_large_big_ascending, gen_big_ascending, 10000);
|
||||
sort_bench!(sort_large_big_descending, gen_big_descending, 10000);
|
||||
|
||||
#[bench]
|
||||
fn sort_big_sorted(b: &mut Bencher) {
|
||||
let mut v: Vec<BigSortable> = (0..10000).map(|i| (i, i, i, i)).collect();
|
||||
fn sort_large_random_expensive(b: &mut Bencher) {
|
||||
let len = 10000;
|
||||
b.iter(|| {
|
||||
v.sort();
|
||||
let mut count = 0;
|
||||
let cmp = move |a: &u64, b: &u64| {
|
||||
count += 1;
|
||||
if count % 1_000_000_000 == 0 {
|
||||
panic!("should not happen");
|
||||
}
|
||||
(*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
|
||||
};
|
||||
|
||||
let mut v = gen_random(len);
|
||||
v.sort_by(cmp);
|
||||
|
||||
black_box(count);
|
||||
});
|
||||
b.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
|
||||
b.bytes = len as u64 * mem::size_of::<u64>() as u64;
|
||||
}
|
||||
}
|
||||
|
@@ -17,86 +17,111 @@ use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::__rand::{thread_rng, Rng};
|
||||
use std::thread;
|
||||
|
||||
const REPEATS: usize = 5;
|
||||
const MAX_LEN: usize = 32;
|
||||
static drop_counts: [AtomicUsize; MAX_LEN] =
|
||||
// FIXME #5244: AtomicUsize is not Copy.
|
||||
[
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
];
|
||||
const MAX_LEN: usize = 80;
|
||||
|
||||
static creation_count: AtomicUsize = AtomicUsize::new(0);
|
||||
static DROP_COUNTS: [AtomicUsize; MAX_LEN] = [
|
||||
// FIXME #5244: AtomicUsize is not Copy.
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0), AtomicUsize::new(0),
|
||||
];
|
||||
|
||||
#[derive(Clone, PartialEq, PartialOrd, Eq, Ord)]
|
||||
struct DropCounter { x: u32, creation_id: usize }
|
||||
struct DropCounter {
|
||||
x: u32,
|
||||
id: usize,
|
||||
}
|
||||
|
||||
impl Drop for DropCounter {
|
||||
fn drop(&mut self) {
|
||||
drop_counts[self.creation_id].fetch_add(1, Ordering::Relaxed);
|
||||
DROP_COUNTS[self.id].fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
// len can't go above 64.
|
||||
for len in 2..MAX_LEN {
|
||||
for _ in 0..REPEATS {
|
||||
// reset the count for these new DropCounters, so their
|
||||
// IDs start from 0.
|
||||
creation_count.store(0, Ordering::Relaxed);
|
||||
fn test(input: &[DropCounter]) {
|
||||
let len = input.len();
|
||||
|
||||
let mut rng = thread_rng();
|
||||
let main = (0..len).map(|_| {
|
||||
DropCounter {
|
||||
x: rng.next_u32(),
|
||||
creation_id: creation_count.fetch_add(1, Ordering::Relaxed),
|
||||
// Work out the total number of comparisons required to sort
|
||||
// this array...
|
||||
let mut count = 0usize;
|
||||
input.to_owned().sort_by(|a, b| { count += 1; a.cmp(b) });
|
||||
|
||||
// ... and then panic on each and every single one.
|
||||
for panic_countdown in 0..count {
|
||||
// Refresh the counters.
|
||||
for i in 0..len {
|
||||
DROP_COUNTS[i].store(0, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
let v = input.to_owned();
|
||||
let _ = thread::spawn(move || {
|
||||
let mut v = v;
|
||||
let mut panic_countdown = panic_countdown;
|
||||
v.sort_by(|a, b| {
|
||||
if panic_countdown == 0 {
|
||||
panic!();
|
||||
}
|
||||
}).collect::<Vec<_>>();
|
||||
panic_countdown -= 1;
|
||||
a.cmp(b)
|
||||
})
|
||||
}).join();
|
||||
|
||||
// work out the total number of comparisons required to sort
|
||||
// this array...
|
||||
let mut count = 0_usize;
|
||||
main.clone().sort_by(|a, b| { count += 1; a.cmp(b) });
|
||||
|
||||
// ... and then panic on each and every single one.
|
||||
for panic_countdown in 0..count {
|
||||
// refresh the counters.
|
||||
for c in &drop_counts {
|
||||
c.store(0, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
let v = main.clone();
|
||||
|
||||
let _ = thread::spawn(move|| {
|
||||
let mut v = v;
|
||||
let mut panic_countdown = panic_countdown;
|
||||
v.sort_by(|a, b| {
|
||||
if panic_countdown == 0 {
|
||||
panic!()
|
||||
}
|
||||
panic_countdown -= 1;
|
||||
a.cmp(b)
|
||||
})
|
||||
}).join();
|
||||
|
||||
// check that the number of things dropped is exactly
|
||||
// what we expect (i.e. the contents of `v`).
|
||||
for (i, c) in drop_counts.iter().enumerate().take(len) {
|
||||
let count = c.load(Ordering::Relaxed);
|
||||
assert!(count == 1,
|
||||
"found drop count == {} for i == {}, len == {}",
|
||||
count, i, len);
|
||||
}
|
||||
}
|
||||
// Check that the number of things dropped is exactly
|
||||
// what we expect (i.e. the contents of `v`).
|
||||
for (i, c) in DROP_COUNTS.iter().enumerate().take(len) {
|
||||
let count = c.load(Ordering::Relaxed);
|
||||
assert!(count == 1,
|
||||
"found drop count == {} for i == {}, len == {}",
|
||||
count, i, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
for len in (1..20).chain(70..MAX_LEN) {
|
||||
// Test on a random array.
|
||||
let mut rng = thread_rng();
|
||||
let input = (0..len).map(|id| {
|
||||
DropCounter {
|
||||
x: rng.next_u32(),
|
||||
id: id,
|
||||
}
|
||||
}).collect::<Vec<_>>();
|
||||
test(&input);
|
||||
|
||||
// Test on a sorted array with two elements randomly swapped, creating several natural
|
||||
// runs of random lengths. Such arrays have very high chances of hitting all code paths in
|
||||
// the merge procedure.
|
||||
for _ in 0..5 {
|
||||
let mut input = (0..len).map(|i|
|
||||
DropCounter {
|
||||
x: i as u32,
|
||||
id: i,
|
||||
}
|
||||
).collect::<Vec<_>>();
|
||||
|
||||
let a = rng.gen::<usize>() % len;
|
||||
let b = rng.gen::<usize>() % len;
|
||||
input.swap(a, b);
|
||||
|
||||
test(&input);
|
||||
}
|
||||
}
|
||||
}
|
||||
|