Reduced allocations in merge_sort for short vectors

Added a seperate in-place insertion sort for short vectors.
Increased threshold for insertion short for 8 to 32 elements
for small types and 16 for larger types. Added benchmarks
for sorting larger types.
This commit is contained in:
Zach Kamsler 2014-02-04 12:56:13 -05:00
parent ef53b7a97c
commit cebe5e8e6b

View File

@ -1812,12 +1812,70 @@ impl<T:Eq> OwnedEqVector<T> for ~[T] {
}
}
fn insertion_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
let len = v.len() as int;
let buf_v = v.as_mut_ptr();
// 1 <= i < len;
for i in range(1, len) {
// j satisfies: 0 <= j <= i;
let mut j = i;
unsafe {
// `i` is in bounds.
let read_ptr = buf_v.offset(i) as *T;
// find where to insert, we need to do strict <,
// rather than <=, to maintain stability.
// 0 <= j - 1 < len, so .offset(j - 1) is in bounds.
while j > 0 &&
compare(&*read_ptr, &*buf_v.offset(j - 1)) == Less {
j -= 1;
}
// shift everything to the right, to make space to
// insert this value.
// j + 1 could be `len` (for the last `i`), but in
// that case, `i == j` so we don't copy. The
// `.offset(j)` is always in bounds.
if i != j {
let tmp = ptr::read_ptr(read_ptr);
ptr::copy_memory(buf_v.offset(j + 1),
buf_v.offset(j),
(i - j) as uint);
ptr::copy_nonoverlapping_memory(buf_v.offset(j),
&tmp as *T,
1);
cast::forget(tmp);
}
}
}
}
fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
// warning: this wildly uses unsafe.
static INSERTION: uint = 8;
static BASE_INSERTION: uint = 32;
static LARGE_INSERTION: uint = 16;
// FIXME #12092: smaller insertion runs seems to make sorting
// vectors of large elements a little faster on some platforms,
// but hasn't been tested/tuned extensively
let insertion = if size_of::<T>() <= 16 {
BASE_INSERTION
} else {
LARGE_INSERTION
};
let len = v.len();
// short vectors get sorted in-place via insertion sort to avoid allocations
if len <= insertion {
insertion_sort(v, compare);
return;
}
// allocate some memory to use as scratch memory, we keep the
// length 0 so we can keep shallow copies of the contents of `v`
// without risking the dtors running on an object twice if
@ -1837,9 +1895,9 @@ fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
// We could hardcode the sorting comparisons here, and we could
// manipulate/step the pointers themselves, rather than repeatedly
// .offset-ing.
for start in range_step(0, len, INSERTION) {
// start <= i <= len;
for i in range(start, cmp::min(start + INSERTION, len)) {
for start in range_step(0, len, insertion) {
// start <= i < len;
for i in range(start, cmp::min(start + insertion, len)) {
// j satisfies: start <= j <= i;
let mut j = i as int;
unsafe {
@ -1871,7 +1929,7 @@ fn merge_sort<T>(v: &mut [T], compare: |&T, &T| -> Ordering) {
}
// step 2. merge the sorted runs.
let mut width = INSERTION;
let mut width = insertion;
while width < len {
// merge the sorted runs of length `width` in `buf_dat` two at
// a time, placing the result in `buf_tmp`.
@ -4505,4 +4563,45 @@ mod bench {
});
bh.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
}
type BigSortable = (u64,u64,u64,u64);
#[bench]
fn sort_big_random_small(bh: &mut BenchHarness) {
let mut rng = weak_rng();
bh.iter(|| {
let mut v: ~[BigSortable] = rng.gen_vec(5);
v.sort();
});
bh.bytes = 5 * mem::size_of::<BigSortable>() as u64;
}
#[bench]
fn sort_big_random_medium(bh: &mut BenchHarness) {
let mut rng = weak_rng();
bh.iter(|| {
let mut v: ~[BigSortable] = rng.gen_vec(100);
v.sort();
});
bh.bytes = 100 * mem::size_of::<BigSortable>() as u64;
}
#[bench]
fn sort_big_random_large(bh: &mut BenchHarness) {
let mut rng = weak_rng();
bh.iter(|| {
let mut v: ~[BigSortable] = rng.gen_vec(10000);
v.sort();
});
bh.bytes = 10000 * mem::size_of::<BigSortable>() as u64;
}
#[bench]
fn sort_big_sorted(bh: &mut BenchHarness) {
let mut v = vec::from_fn(10000u, |i| (i, i, i, i));
bh.iter(|| {
v.sort();
});
bh.bytes = (v.len() * mem::size_of_val(&v[0])) as u64;
}
}