Speed up integer log10.

This is achieved with a branchless bit-twiddling implementation of the
case x < 100_000, which is then used as a building block for the wider types.

Benchmark on an Intel i7-8700K (Coffee Lake):

name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     165          169                     4    2.42%   x 0.98
num::int_log::u8_log10_random          438          423                   -15   -3.42%   x 1.04
num::int_log::u8_log10_random_small    438          423                   -15   -3.42%   x 1.04
num::int_log::u16_log10_predictable    633          417                  -216  -34.12%   x 1.52
num::int_log::u16_log10_random         908          471                  -437  -48.13%   x 1.93
num::int_log::u16_log10_random_small   945          471                  -474  -50.16%   x 2.01
num::int_log::u32_log10_predictable    1,496        1,340                -156  -10.43%   x 1.12
num::int_log::u32_log10_random         1,076        873                  -203  -18.87%   x 1.23
num::int_log::u32_log10_random_small   1,145        874                  -271  -23.67%   x 1.31
num::int_log::u64_log10_predictable    4,005        3,171                -834  -20.82%   x 1.26
num::int_log::u64_log10_random         1,247        1,021                -226  -18.12%   x 1.22
num::int_log::u64_log10_random_small   1,265        921                  -344  -27.19%   x 1.37
num::int_log::u128_log10_predictable   39,667       39,579                -88   -0.22%   x 1.00
num::int_log::u128_log10_random        6,456        6,696                 240    3.72%   x 0.96
num::int_log::u128_log10_random_small  4,108        3,903                -205   -4.99%   x 1.05

Benchmark on an M1 Mac Mini:

name                                   old ns/iter  new ns/iter  diff ns/iter   diff %  speedup
num::int_log::u8_log10_predictable     143          130                   -13   -9.09%   x 1.10
num::int_log::u8_log10_random          375          325                   -50  -13.33%   x 1.15
num::int_log::u8_log10_random_small    376          325                   -51  -13.56%   x 1.16
num::int_log::u16_log10_predictable    500          322                  -178  -35.60%   x 1.55
num::int_log::u16_log10_random         794          405                  -389  -48.99%   x 1.96
num::int_log::u16_log10_random_small   1,035        405                  -630  -60.87%   x 2.56
num::int_log::u32_log10_predictable    1,144        894                  -250  -21.85%   x 1.28
num::int_log::u32_log10_random         832          786                   -46   -5.53%   x 1.06
num::int_log::u32_log10_random_small   832          787                   -45   -5.41%   x 1.06
num::int_log::u64_log10_predictable    2,681        2,057                -624  -23.27%   x 1.30
num::int_log::u64_log10_random         1,015        806                  -209  -20.59%   x 1.26
num::int_log::u64_log10_random_small   1,004        795                  -209  -20.82%   x 1.26
num::int_log::u128_log10_predictable   56,825       56,526               -299   -0.53%   x 1.01
num::int_log::u128_log10_random        9,056        8,861                -195   -2.15%   x 1.02
num::int_log::u128_log10_random_small  1,528        1,527                  -1   -0.07%   x 1.00

The 128-bit case remains ridiculously slow because LLVM fails to optimize division by
a constant 128-bit value into multiplications. This could be worked around, but it seems
preferable to fix this in LLVM.

From u32 up, a table lookup (as suggested in
https://github.com/rust-lang/rust/issues/70887#issuecomment-881099813) is still
faster, but it requires hardware support for `leading_zeros` to be viable, and it
might clog up the cache.
This commit is contained in:
Falk Hüffner 2021-09-05 22:55:29 +02:00
parent 0c26a3bc0c
commit d53c483502
2 changed files with 54 additions and 56 deletions

View File

@ -1,76 +1,71 @@
mod unchecked {
// 0 < val <= u8::MAX
// Returns the number of decimal digits of `val` minus one, i.e. 0 for
// 1..=9, 1 for 10..=99 and 2 for 100..=255.
// NOTE(review): this view is a diff whose +/- markers were stripped, so two
// bodies are interleaved below. The if/else chain looks like the removed
// branch-based body and everything from `let val` onward like the branchless
// replacement described in the commit message — confirm against the
// repository before reading this as a single function body.
pub const fn u8(val: u8) -> u32 {
// Comparison chain: test the largest threshold first.
if val >= 100 {
2
} else if val >= 10 {
1
} else {
0
}
// Widen to u32 so the biased sums below have room above the low 8 bits.
let val = val as u32;
// For better performance, avoid branches by assembling the solution
// in the bits above the low 8 bits.
// Adding c1 to val gives 10 in the top bits for val < 10, 11 for val >= 10
let c1 = 0b11_00000000 - 10; // 758
// Adding c2 to val gives 01 in the top bits for val < 100, 10 for val >= 100
let c2 = 0b10_00000000 - 100; // 412
// Value of top bits (the two bits above the low 8):
//             +c1  +c2  1&2
//   0..=9      10   01   00 = 0
//  10..=99     11   01   01 = 1
// 100..=255    11   10   10 = 2
// ANDing the two sums and shifting out the low 8 bits leaves only that
// two-bit result.
((val + c1) & (val + c2)) >> 8
}
// Precondition: 0 < val < 100_000.
// Branch-free count of decimal digits minus one for values of up to five
// digits. `val` occupies the low 17 bits (99_999 < 2^17); the answer is
// assembled in the three bits directly above them.
const fn less_than_5(val: u32) -> u32 {
    // Width of the field that holds `val` itself.
    const FIELD: u32 = 17;
    // Each bias is `(pattern << FIELD) - threshold`. Adding it to `val`
    // leaves `pattern` above the field while `val` is below the threshold
    // and `pattern + 1` once `val` reaches it.
    const BIAS_10: u32 = (0b011 << FIELD) - 10; // 393206
    const BIAS_100: u32 = (0b100 << FIELD) - 100; // 524188
    const BIAS_1000: u32 = (0b111 << FIELD) - 1000; // 916504
    const BIAS_10000: u32 = (0b100 << FIELD) - 10000; // 514288

    // Bits above the field for every input range:
    //
    //                 +10  +100  lo   +1000 +10000  hi   lo^hi
    //      0..=9      010  011   010   110   011    010  000 = 0
    //     10..=99     011  011   011   110   011    010  001 = 1
    //    100..=999    011  100   000   110   011    010  010 = 2
    //   1000..=9999   011  100   000   111   011    011  011 = 3
    //  10000..=99999  011  100   000   111   100    100  100 = 4
    let lo = (val + BIAS_10) & (val + BIAS_100);
    let hi = (val + BIAS_1000) & (val + BIAS_10000);

    // Discard the field holding `val`; what remains is the result.
    (lo ^ hi) >> FIELD
}
// Precondition: 0 < val <= u16::MAX.
// Returns the number of decimal digits of `val` minus one (0 through 4).
pub const fn u16(val: u16) -> u32 {
    // One arm per digit count; anything from 10_000 upward has five digits.
    match val {
        0..=9 => 0,
        10..=99 => 1,
        100..=999 => 2,
        1000..=9999 => 3,
        _ => 4,
    }
}
// 0 < val < 100_000_000
// Returns the number of decimal digits of `val` minus one for values of up
// to eight digits.
// NOTE(review): this view is a diff whose +/- markers were stripped. The
// final `less_than_5(val as u32)` line does not fit the logic above it and
// presumably belongs to a different function in the new revision — confirm
// against the repository.
const fn less_than_8(mut val: u32) -> u32 {
let mut log = 0;
// Strip four decimal digits at once; given the precondition, the quotient
// is below 10_000.
if val >= 10_000 {
val /= 10_000;
log += 4;
}
// At most four digits remain; add their count minus one to `log`.
log + if val >= 1000 {
3
} else if val >= 100 {
2
} else if val >= 10 {
1
} else {
0
}
less_than_5(val as u32)
}
// 0 < val <= u32::MAX
pub const fn u32(mut val: u32) -> u32 {
let mut log = 0;
if val >= 100_000_000 {
val /= 100_000_000;
log += 8;
if val >= 100_000 {
val /= 100_000;
log += 5;
}
log + less_than_8(val)
}
// 0 < val < 10_000_000_000_000_000
const fn less_than_16(mut val: u64) -> u32 {
let mut log = 0;
if val >= 100_000_000 {
val /= 100_000_000;
log += 8;
}
log + less_than_8(val as u32)
log + less_than_5(val)
}
// 0 < val <= u64::MAX
pub const fn u64(mut val: u64) -> u32 {
let mut log = 0;
if val >= 10_000_000_000_000_000 {
val /= 10_000_000_000_000_000;
log += 16;
if val >= 10_000_000_000 {
val /= 10_000_000_000;
log += 10;
}
log + less_than_16(val)
if val >= 100_000 {
val /= 100_000;
log += 5;
}
log + less_than_5(val as u32)
}
// 0 < val <= u128::MAX
@ -79,13 +74,13 @@ mod unchecked {
if val >= 100_000_000_000_000_000_000_000_000_000_000 {
val /= 100_000_000_000_000_000_000_000_000_000_000;
log += 32;
return log + less_than_8(val as u32);
return log + u32(val as u32);
}
if val >= 10_000_000_000_000_000 {
val /= 10_000_000_000_000_000;
log += 16;
}
log + less_than_16(val as u64)
log + u64(val as u64)
}
// 0 < val <= i8::MAX

View File

@ -96,6 +96,9 @@ fn checked_log10() {
for i in 1..=u16::MAX {
assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
}
for i in 1..=100_000u32 {
assert_eq!(i.checked_log10(), Some((i as f32).log10() as u32));
}
}
macro_rules! log10_loop {