Auto merge of #88788 - falk-hueffner:speedup-int-log10-branchless, r=joshtriplett

Speedup int log10 branchless

This is achieved with a branchless, bit-twiddling implementation of the case x < 100_000, which is then used as a building block for the larger types.
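To illustrate the trick on the smallest case: adding a suitably chosen constant to `val` forces the bits just above the low 8 bits into a pattern that depends only on which decade `val` lies in, and ANDing two such patterns yields the digit count without branching. Below is a standalone sketch of the new `u8` path using the same constants as the patch (the free-function name `log10_u8` is purely illustrative):

```
// Branch-free log10 for 1..=u8::MAX, mirroring the `unchecked::u8` path in the patch below.
const fn log10_u8(val: u8) -> u32 {
    let val = val as u32;
    // val + C1 has top bits 0b10 for val < 10, 0b11 for val >= 10.
    const C1: u32 = 0b11_00000000 - 10; // 758
    // val + C2 has top bits 0b01 for val < 100, 0b10 for val >= 100.
    const C2: u32 = 0b10_00000000 - 100; // 412
    ((val + C1) & (val + C2)) >> 8
}

fn main() {
    assert_eq!(log10_u8(7), 0);
    // Worked example: 57 + 758 = 0b11_00101111 and 57 + 412 = 0b01_11010101,
    // so the top bits are 11 & 01 = 01, i.e. log10(57) = 1.
    assert_eq!(log10_u8(57), 1);
    assert_eq!(log10_u8(200), 2);
}
```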

Benchmark on an Intel i7-8700K (Coffee Lake):

```
name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     165          169                     4    2.42%   x 0.98
num::int_log::u8_log10_random          438          423                   -15   -3.42%   x 1.04
num::int_log::u8_log10_random_small    438          423                   -15   -3.42%   x 1.04
num::int_log::u16_log10_predictable    633          417                  -216  -34.12%   x 1.52
num::int_log::u16_log10_random         908          471                  -437  -48.13%   x 1.93
num::int_log::u16_log10_random_small   945          471                  -474  -50.16%   x 2.01
num::int_log::u32_log10_predictable    1,496        1,340                -156  -10.43%   x 1.12
num::int_log::u32_log10_random         1,076        873                  -203  -18.87%   x 1.23
num::int_log::u32_log10_random_small   1,145        874                  -271  -23.67%   x 1.31
num::int_log::u64_log10_predictable    4,005        3,171                -834  -20.82%   x 1.26
num::int_log::u64_log10_random         1,247        1,021                -226  -18.12%   x 1.22
num::int_log::u64_log10_random_small   1,265        921                  -344  -27.19%   x 1.37
num::int_log::u128_log10_predictable   39,667       39,579                -88   -0.22%   x 1.00
num::int_log::u128_log10_random        6,456        6,696                 240    3.72%   x 0.96
num::int_log::u128_log10_random_small  4,108        3,903                -205   -4.99%   x 1.05
```

Benchmark on an M1 Mac Mini:

```
name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     143          130                   -13   -9.09%   x 1.10
num::int_log::u8_log10_random          375          325                   -50  -13.33%   x 1.15
num::int_log::u8_log10_random_small    376          325                   -51  -13.56%   x 1.16
num::int_log::u16_log10_predictable    500          322                  -178  -35.60%   x 1.55
num::int_log::u16_log10_random         794          405                  -389  -48.99%   x 1.96
num::int_log::u16_log10_random_small   1,035        405                  -630  -60.87%   x 2.56
num::int_log::u32_log10_predictable    1,144        894                  -250  -21.85%   x 1.28
num::int_log::u32_log10_random         832          786                   -46   -5.53%   x 1.06
num::int_log::u32_log10_random_small   832          787                   -45   -5.41%   x 1.06
num::int_log::u64_log10_predictable    2,681        2,057                -624  -23.27%   x 1.30
num::int_log::u64_log10_random         1,015        806                  -209  -20.59%   x 1.26
num::int_log::u64_log10_random_small   1,004        795                  -209  -20.82%   x 1.26
num::int_log::u128_log10_predictable   56,825       56,526               -299   -0.53%   x 1.01
num::int_log::u128_log10_random        9,056        8,861                -195   -2.15%   x 1.02
num::int_log::u128_log10_random_small  1,528        1,527                  -1   -0.07%   x 1.00
```

The 128-bit case remains ridiculously slow because LLVM fails to optimize division by a constant 128-bit value into multiplications. This could be worked around, but it seems preferable to fix it in LLVM.

From u32 up, a table lookup (as suggested [here](https://github.com/rust-lang/rust/issues/70887#issuecomment-881099813)) is still faster, but it requires a hardware `leading_zeros` to be viable and might clog up the cache.
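For reference, that lookup-table approach is commonly written along the lines of the sketch below (a generic formulation, not necessarily the exact code from the linked comment; `log10_u32_lookup` and `POWERS_OF_10` are illustrative names). It needs one fast `leading_zeros` plus one table read per call, which is where the cache-pressure concern comes from:

```
// Approximate log10 from the bit length, then correct with one table lookup.
// Assumes x > 0.
const POWERS_OF_10: [u32; 10] =
    [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000];

fn log10_u32_lookup(x: u32) -> u32 {
    let bit_length = 32 - x.leading_zeros(); // needs a cheap leading_zeros to be competitive
    let guess = (bit_length * 1233) >> 12;   // 1233 / 4096 ~= log10(2)
    guess - (x < POWERS_OF_10[guess as usize]) as u32
}

fn main() {
    assert_eq!(log10_u32_lookup(1), 0);
    assert_eq!(log10_u32_lookup(99_999), 4);
    assert_eq!(log10_u32_lookup(u32::MAX), 9);
}
```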
Merged by bors on 2021-10-12 03:18:54 +00:00 as commit ffdf18d144
5 changed files with 114 additions and 56 deletions


```
@@ -1,6 +1,7 @@
 // wasm32 does not support benches (no time).
 #![cfg(not(target_arch = "wasm32"))]
 #![feature(flt2dec)]
+#![feature(int_log)]
 #![feature(test)]
 
 extern crate test;
```


```
@@ -0,0 +1,58 @@ (new file)
use rand::Rng;
use test::{black_box, Bencher};

macro_rules! int_log_bench {
    ($t:ty, $predictable:ident, $random:ident, $random_small:ident) => {
        #[bench]
        fn $predictable(bench: &mut Bencher) {
            bench.iter(|| {
                for n in 0..(<$t>::BITS / 8) {
                    for i in 1..=(100 as $t) {
                        let x = black_box(i << (n * 8));
                        black_box(x.log10());
                    }
                }
            });
        }

        #[bench]
        fn $random(bench: &mut Bencher) {
            let mut rng = rand::thread_rng();
            /* Exponentially distributed random numbers from the whole range of the type. */
            let numbers: Vec<$t> = (0..256)
                .map(|_| {
                    let x = rng.gen::<$t>() >> rng.gen_range(0, <$t>::BITS);
                    if x != 0 { x } else { 1 }
                })
                .collect();
            bench.iter(|| {
                for x in &numbers {
                    black_box(black_box(x).log10());
                }
            });
        }

        #[bench]
        fn $random_small(bench: &mut Bencher) {
            let mut rng = rand::thread_rng();
            /* Exponentially distributed random numbers from the range 0..256. */
            let numbers: Vec<$t> = (0..256)
                .map(|_| {
                    let x = (rng.gen::<u8>() >> rng.gen_range(0, u8::BITS)) as $t;
                    if x != 0 { x } else { 1 }
                })
                .collect();
            bench.iter(|| {
                for x in &numbers {
                    black_box(black_box(x).log10());
                }
            });
        }
    };
}

int_log_bench! {u8, u8_log10_predictable, u8_log10_random, u8_log10_random_small}
int_log_bench! {u16, u16_log10_predictable, u16_log10_random, u16_log10_random_small}
int_log_bench! {u32, u32_log10_predictable, u32_log10_random, u32_log10_random_small}
int_log_bench! {u64, u64_log10_predictable, u64_log10_random, u64_log10_random_small}
int_log_bench! {u128, u128_log10_predictable, u128_log10_random, u128_log10_random_small}
```


```
@@ -1,5 +1,6 @@
 mod dec2flt;
 mod flt2dec;
+mod int_log;
 
 use std::str::FromStr;
 use test::Bencher;
```


```
@@ -1,76 +1,71 @@
 mod unchecked {
     // 0 < val <= u8::MAX
     pub const fn u8(val: u8) -> u32 {
-        if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
-    }
+        let val = val as u32;
+
+        // For better performance, avoid branches by assembling the solution
+        // in the bits above the low 8 bits.
+
+        // Adding c1 to val gives 10 in the top bits for val < 10, 11 for val >= 10
+        const C1: u32 = 0b11_00000000 - 10; // 758
+        // Adding c2 to val gives 01 in the top bits for val < 100, 10 for val >= 100
+        const C2: u32 = 0b10_00000000 - 100; // 412
+
+        // Value of top bits:
+        //            +c1  +c2  1&2
+        //     0..=9   10   01   00 = 0
+        //   10..=99   11   01   01 = 1
+        // 100..=255   11   10   10 = 2
+        ((val + C1) & (val + C2)) >> 8
+    }
+
+    // 0 < val < 100_000
+    const fn less_than_5(val: u32) -> u32 {
+        // Similar to u8, when adding one of these constants to val,
+        // we get two possible bit patterns above the low 17 bits,
+        // depending on whether val is below or above the threshold.
+        const C1: u32 = 0b011_00000000000000000 - 10; // 393206
+        const C2: u32 = 0b100_00000000000000000 - 100; // 524188
+        const C3: u32 = 0b111_00000000000000000 - 1000; // 916504
+        const C4: u32 = 0b100_00000000000000000 - 10000; // 514288
+
+        // Value of top bits:
+        //                +c1  +c2  1&2  +c3  +c4  3&4   ^
+        //         0..=9  010  011  010  110  011  010  000 = 0
+        //       10..=99  011  011  011  110  011  010  001 = 1
+        //     100..=999  011  100  000  110  011  010  010 = 2
+        //   1000..=9999  011  100  000  111  011  011  011 = 3
+        // 10000..=99999  011  100  000  111  100  100  100 = 4
+        (((val + C1) & (val + C2)) ^ ((val + C3) & (val + C4))) >> 17
+    }
 
     // 0 < val <= u16::MAX
     pub const fn u16(val: u16) -> u32 {
-        if val >= 10_000 {
-            4
-        } else if val >= 1000 {
-            3
-        } else if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
-    }
-
-    // 0 < val < 100_000_000
-    const fn less_than_8(mut val: u32) -> u32 {
-        let mut log = 0;
-        if val >= 10_000 {
-            val /= 10_000;
-            log += 4;
-        }
-        log + if val >= 1000 {
-            3
-        } else if val >= 100 {
-            2
-        } else if val >= 10 {
-            1
-        } else {
-            0
-        }
+        less_than_5(val as u32)
     }
 
     // 0 < val <= u32::MAX
     pub const fn u32(mut val: u32) -> u32 {
         let mut log = 0;
-        if val >= 100_000_000 {
-            val /= 100_000_000;
-            log += 8;
+        if val >= 100_000 {
+            val /= 100_000;
+            log += 5;
         }
-        log + less_than_8(val)
-    }
-
-    // 0 < val < 10_000_000_000_000_000
-    const fn less_than_16(mut val: u64) -> u32 {
-        let mut log = 0;
-        if val >= 100_000_000 {
-            val /= 100_000_000;
-            log += 8;
-        }
-        log + less_than_8(val as u32)
+        log + less_than_5(val)
     }
 
     // 0 < val <= u64::MAX
     pub const fn u64(mut val: u64) -> u32 {
         let mut log = 0;
-        if val >= 10_000_000_000_000_000 {
-            val /= 10_000_000_000_000_000;
-            log += 16;
+        if val >= 10_000_000_000 {
+            val /= 10_000_000_000;
+            log += 10;
         }
-        log + less_than_16(val)
+        if val >= 100_000 {
+            val /= 100_000;
+            log += 5;
+        }
+        log + less_than_5(val as u32)
     }
 
     // 0 < val <= u128::MAX
@@ -79,13 +74,13 @@ pub const fn u128(mut val: u128) -> u32 {
         if val >= 100_000_000_000_000_000_000_000_000_000_000 {
             val /= 100_000_000_000_000_000_000_000_000_000_000;
             log += 32;
-            return log + less_than_8(val as u32);
+            return log + u32(val as u32);
         }
         if val >= 10_000_000_000_000_000 {
             val /= 10_000_000_000_000_000;
             log += 16;
         }
-        log + less_than_16(val as u64)
+        log + u64(val as u64)
     }
 
     // 0 < val <= i8::MAX
```


```
@@ -96,6 +96,9 @@ fn checked_log10() {
     for i in 1..=u16::MAX {
         assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
     }
+    for i in 1..=100_000u32 {
+        assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
+    }
 }
 
 macro_rules! log10_loop {
```