Rollup merge of #39538 - stjepang:slightly-optimize-sort, r=alexcrichton

Slightly optimize slice::sort

First, get rid of some bound checks.

Second, instead of comparing by ternary `compare` function, use a binary function testing whether an element is less than some other element. This apparently makes it easier for the compiler to reason about the code. I've noticed the same effect with [pdqsort](https://github.com/stjepang/pdqsort) crate.

Benchmark:

```
name                                        before ns/iter        after ns/iter         diff ns/iter   diff %
slice::bench::sort_large_ascending          8,969 (8919 MB/s)     7,410 (10796 MB/s)          -1,559  -17.38%
slice::bench::sort_large_big_ascending      355,640 (3599 MB/s)   359,137 (3564 MB/s)          3,497    0.98%
slice::bench::sort_large_big_descending     427,112 (2996 MB/s)   424,721 (3013 MB/s)         -2,391   -0.56%
slice::bench::sort_large_big_random         2,207,799 (579 MB/s)  2,138,804 (598 MB/s)       -68,995   -3.13%
slice::bench::sort_large_descending         13,694 (5841 MB/s)    13,514 (5919 MB/s)            -180   -1.31%
slice::bench::sort_large_mostly_ascending   239,697 (333 MB/s)    203,542 (393 MB/s)         -36,155  -15.08%
slice::bench::sort_large_mostly_descending  270,102 (296 MB/s)    234,263 (341 MB/s)         -35,839  -13.27%
slice::bench::sort_large_random             513,406 (155 MB/s)    470,084 (170 MB/s)         -43,322   -8.44%
slice::bench::sort_large_random_expensive   23,650,321 (3 MB/s)   23,675,098 (3 MB/s)         24,777    0.10%
slice::bench::sort_medium_ascending         143 (5594 MB/s)       132 (6060 MB/s)                -11   -7.69%
slice::bench::sort_medium_descending        197 (4060 MB/s)       188 (4255 MB/s)                 -9   -4.57%
slice::bench::sort_medium_random            3,358 (238 MB/s)      3,271 (244 MB/s)               -87   -2.59%
slice::bench::sort_small_ascending          32 (2500 MB/s)        32 (2500 MB/s)                   0    0.00%
slice::bench::sort_small_big_ascending      97 (13195 MB/s)       97 (13195 MB/s)                  0    0.00%
slice::bench::sort_small_big_descending     247 (5182 MB/s)       249 (5140 MB/s)                  2    0.81%
slice::bench::sort_small_big_random         502 (2549 MB/s)       498 (2570 MB/s)                 -4   -0.80%
slice::bench::sort_small_descending         55 (1454 MB/s)        61 (1311 MB/s)                   6   10.91%
slice::bench::sort_small_random             358 (223 MB/s)        356 (224 MB/s)                  -2   -0.56%
```
This commit is contained in:
Corey Farwell 2017-02-05 12:45:15 -05:00 committed by GitHub
commit a419dd1c01
2 changed files with 39 additions and 38 deletions

View File

@ -98,7 +98,7 @@
#![cfg_attr(test, allow(unused_imports, dead_code))]
use alloc::boxed::Box;
use core::cmp::Ordering::{self, Greater};
use core::cmp::Ordering::{self, Less};
use core::mem::size_of;
use core::mem;
use core::ptr;
@ -1089,7 +1089,7 @@ pub fn binary_search_by_key<'a, B, F>(&'a self, b: &B, f: F) -> Result<usize, us
pub fn sort(&mut self)
where T: Ord
{
self.sort_by(|a, b| a.cmp(b))
merge_sort(self, |a, b| a.lt(b));
}
/// Sorts the slice using `f` to extract a key to compare elements by.
@ -1119,7 +1119,7 @@ pub fn sort(&mut self)
pub fn sort_by_key<B, F>(&mut self, mut f: F)
where F: FnMut(&T) -> B, B: Ord
{
self.sort_by(|a, b| f(a).cmp(&f(b)))
merge_sort(self, |a, b| f(a).lt(&f(b)));
}
/// Sorts the slice using `compare` to compare elements.
@ -1149,10 +1149,10 @@ pub fn sort_by_key<B, F>(&mut self, mut f: F)
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn sort_by<F>(&mut self, compare: F)
pub fn sort_by<F>(&mut self, mut compare: F)
where F: FnMut(&T, &T) -> Ordering
{
merge_sort(self, compare)
merge_sort(self, |a, b| compare(a, b) == Less);
}
/// Copies the elements from `src` into `self`.
@ -1355,10 +1355,10 @@ fn to_owned(&self) -> Vec<T> {
/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
fn insert_head<T, F>(v: &mut [T], compare: &mut F)
where F: FnMut(&T, &T) -> Ordering
fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
where F: FnMut(&T, &T) -> bool
{
if v.len() >= 2 && compare(&v[0], &v[1]) == Greater {
if v.len() >= 2 && is_less(&v[1], &v[0]) {
unsafe {
// There are three ways to implement insertion here:
//
@ -1381,12 +1381,12 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
// Intermediate state of the insertion process is always tracked by `hole`, which
// serves two purposes:
// 1. Protects integrity of `v` from panics in `compare`.
// 1. Protects integrity of `v` from panics in `is_less`.
// 2. Fills the remaining hole in `v` in the end.
//
// Panic safety:
//
// If `compare` panics at any point during the process, `hole` will get dropped and
// If `is_less` panics at any point during the process, `hole` will get dropped and
// fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
// initially held exactly once.
let mut hole = InsertionHole {
@ -1396,7 +1396,7 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
for i in 2..v.len() {
if compare(&tmp.value, &v[i]) != Greater {
if !is_less(&v[i], &tmp.value) {
break;
}
ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
@ -1432,8 +1432,8 @@ fn drop(&mut self) {
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
where F: FnMut(&T, &T) -> Ordering
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
where F: FnMut(&T, &T) -> bool
{
let len = v.len();
let v = v.as_mut_ptr();
@ -1449,12 +1449,12 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
// hole in `v`.
//
// Intermediate state of the process is always tracked by `hole`, which serves two purposes:
// 1. Protects integrity of `v` from panics in `compare`.
// 1. Protects integrity of `v` from panics in `is_less`.
// 2. Fills the remaining hole in `v` if the longer run gets consumed first.
//
// Panic safety:
//
// If `compare` panics at any point during the process, `hole` will get dropped and fill the
// If `is_less` panics at any point during the process, `hole` will get dropped and fill the
// hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
// object it initially held exactly once.
let mut hole;
@ -1476,7 +1476,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
while *left < hole.end && right < v_end {
// Consume the lesser side.
// If equal, prefer the left run to maintain stability.
let to_copy = if compare(&**left, &*right) == Greater {
let to_copy = if is_less(&*right, &**left) {
get_and_increment(&mut right)
} else {
get_and_increment(left)
@ -1500,7 +1500,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
while v < *left && buf < *right {
// Consume the greater side.
// If equal, prefer the right run to maintain stability.
let to_copy = if compare(&*left.offset(-1), &*right.offset(-1)) == Greater {
let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
decrement_and_get(left)
} else {
decrement_and_get(right)
@ -1550,8 +1550,8 @@ fn drop(&mut self) {
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is `O(n log n)` worst-case.
fn merge_sort<T, F>(v: &mut [T], mut compare: F)
where F: FnMut(&T, &T) -> Ordering
fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
where F: FnMut(&T, &T) -> bool
{
// Sorting has no meaningful behavior on zero-sized types.
if size_of::<T>() == 0 {
@ -1565,7 +1565,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
//
// Short runs are extended using insertion sort to span at least `min_run` elements, in order
// to improve performance.
let (max_insertion, min_run) = if size_of::<T>() <= 16 {
let (max_insertion, min_run) = if size_of::<T>() <= 2 * mem::size_of::<usize>() {
(64, 32)
} else {
(32, 16)
@ -1577,7 +1577,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
if len <= max_insertion {
if len >= 2 {
for i in (0..len-1).rev() {
insert_head(&mut v[i..], &mut compare);
insert_head(&mut v[i..], &mut is_less);
}
}
return;
@ -1585,7 +1585,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
// Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
// shallow copies of the contents of `v` without risking the dtors running on copies if
// `compare` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
// `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
// which will always have length at most `len / 2`.
let mut buf = Vec::with_capacity(len / 2);
@ -1600,14 +1600,18 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
let mut start = end - 1;
if start > 0 {
start -= 1;
if compare(&v[start], &v[start + 1]) == Greater {
while start > 0 && compare(&v[start - 1], &v[start]) == Greater {
start -= 1;
}
v[start..end].reverse();
} else {
while start > 0 && compare(&v[start - 1], &v[start]) != Greater {
start -= 1;
unsafe {
if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
while start > 0 && is_less(v.get_unchecked(start),
v.get_unchecked(start - 1)) {
start -= 1;
}
v[start..end].reverse();
} else {
while start > 0 && !is_less(v.get_unchecked(start),
v.get_unchecked(start - 1)) {
start -= 1;
}
}
}
}
@ -1616,7 +1620,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
// merge sort on short sequences, so this significantly improves performance.
while start > 0 && end - start < min_run {
start -= 1;
insert_head(&mut v[start..end], &mut compare);
insert_head(&mut v[start..end], &mut is_less);
}
// Push this run onto the stack.
@ -1632,7 +1636,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
let right = runs[r];
unsafe {
merge(&mut v[left.start .. right.start + right.len], left.len, buf.as_mut_ptr(),
&mut compare);
&mut is_less);
}
runs[r] = Run {
start: left.start,

View File

@ -1429,18 +1429,15 @@ fn $name(b: &mut Bencher) {
fn sort_large_random_expensive(b: &mut Bencher) {
let len = 10000;
b.iter(|| {
let mut v = gen_random(len);
let mut count = 0;
let cmp = move |a: &u64, b: &u64| {
v.sort_by(|a: &u64, b: &u64| {
count += 1;
if count % 1_000_000_000 == 0 {
panic!("should not happen");
}
(*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
};
let mut v = gen_random(len);
v.sort_by(cmp);
});
black_box(count);
});
b.bytes = len as u64 * mem::size_of::<u64>() as u64;