Auto merge of #128204 - GuillaumeGomez:integers-opti, r=workingjubilee

Small optimization for integers Display implementation

This is a first pass to try to speed up a bit integers `Display` implementation. The idea behind this is to reduce the stack usage for the buffer storing the output (shouldn't be visible in bench normally) and some small specialization which benefits a lot to smaller integers like `u8` and `i8`.

Here are the results of the benchmarks:

| bench name | current std | with this PR |
|-|-|-|
| bench_std_fmt::bench_i16_0    | 16.45 ns/iter (+/- 0.25) | 16.50 ns/iter (+/- 0.15) |
| bench_std_fmt::bench_i16_max  | 17.83 ns/iter (+/- 0.66) | 17.58 ns/iter (+/- 0.10) |
| bench_std_fmt::bench_i16_min  | 20.97 ns/iter (+/- 0.49) | 20.50 ns/iter (+/- 0.28) |
| bench_std_fmt::bench_i32_0    | 16.63 ns/iter (+/- 0.06) | 16.62 ns/iter (+/- 0.07) |
| bench_std_fmt::bench_i32_max  | 19.79 ns/iter (+/- 0.43) | 19.55 ns/iter (+/- 0.14) |
| bench_std_fmt::bench_i32_min  | 22.97 ns/iter (+/- 0.50) | 22.08 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_i64_0    | 16.63 ns/iter (+/- 0.39) | 16.69 ns/iter (+/- 0.44) |
| bench_std_fmt::bench_i64_half | 19.60 ns/iter (+/- 0.05) | 19.10 ns/iter (+/- 0.05) |
| bench_std_fmt::bench_i64_max  | 25.22 ns/iter (+/- 0.34) | 24.43 ns/iter (+/- 0.02) |
| bench_std_fmt::bench_i8_0     | 16.27 ns/iter (+/- 0.32) | 15.80 ns/iter (+/- 0.17) |
| bench_std_fmt::bench_i8_max   | 16.71 ns/iter (+/- 0.09) | 16.25 ns/iter (+/- 0.01) |
| bench_std_fmt::bench_i8_min   | 20.07 ns/iter (+/- 0.22) | 19.80 ns/iter (+/- 0.30) |
| bench_std_fmt::bench_u128_0   | 21.37 ns/iter (+/- 0.24) | 21.35 ns/iter (+/- 0.35) |
| bench_std_fmt::bench_u128_max | 48.13 ns/iter (+/- 0.20) | 48.78 ns/iter (+/- 0.29) |
| bench_std_fmt::bench_u16_0    | 16.48 ns/iter (+/- 0.46) | 16.03 ns/iter (+/- 0.39) |
| bench_std_fmt::bench_u16_max  | 17.31 ns/iter (+/- 0.32) | 17.41 ns/iter (+/- 0.32) |
| bench_std_fmt::bench_u16_min  | 16.40 ns/iter (+/- 0.45) | 16.02 ns/iter (+/- 0.39) |
| bench_std_fmt::bench_u32_0    | 16.17 ns/iter (+/- 0.04) | 16.29 ns/iter (+/- 0.16) |
| bench_std_fmt::bench_u32_max  | 19.00 ns/iter (+/- 0.10) | 19.16 ns/iter (+/- 0.28) |
| bench_std_fmt::bench_u32_min  | 16.16 ns/iter (+/- 0.09) | 16.28 ns/iter (+/- 0.11) |
| bench_std_fmt::bench_u64_0    | 16.22 ns/iter (+/- 0.22) | 16.14 ns/iter (+/- 0.18) |
| bench_std_fmt::bench_u64_half | 19.25 ns/iter (+/- 0.07) | 18.95 ns/iter (+/- 0.05) |
| bench_std_fmt::bench_u64_max  | 24.31 ns/iter (+/- 0.08) | 24.18 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_u8_0     | 15.76 ns/iter (+/- 0.08) | 15.66 ns/iter (+/- 0.08) |
| bench_std_fmt::bench_u8_max   | 16.53 ns/iter (+/- 0.03) | 16.29 ns/iter (+/- 0.02) |
| bench_std_fmt::bench_u8_min   | 15.77 ns/iter (+/- 0.06) | 15.67 ns/iter (+/- 0.02) |

The source code is:

<details>
<summary>source code</summary>

```rust
#![feature(test)]
#![allow(non_snake_case)]
#![allow(clippy::cast_lossless)]

extern crate test;

macro_rules! benches {
    ($($name:ident($value:expr))*) => {
        mod bench_std_fmt {
            use std::io::Write;
            use test::{Bencher, black_box};

            $(
                #[bench]
                fn $name(b: &mut Bencher) {
                    let mut buf = Vec::with_capacity(40);

                    b.iter(|| {
                        buf.clear();
                        write!(&mut buf, "{}", black_box($value)).unwrap();
                        black_box(&buf);
                    });
                }
            )*
        }
    }
}

benches! {
    bench_u64_0(0u64)
    bench_u64_half(u32::max_value() as u64)
    bench_u64_max(u64::max_value())

    bench_i64_0(0i64)
    bench_i64_half(i32::max_value() as i64)
    bench_i64_max(i64::max_value())

    bench_u16_0(0u16)
    bench_u16_min(u16::min_value())
    bench_u16_max(u16::max_value())

    bench_i16_0(0i16)
    bench_i16_min(i16::min_value())
    bench_i16_max(i16::max_value())

    bench_u128_0(0u128)
    bench_u128_max(u128::max_value())

    bench_i8_0(0i8)
    bench_i8_min(i8::min_value())
    bench_i8_max(i8::max_value())

    bench_u8_0(0u8)
    bench_u8_min(u8::min_value())
    bench_u8_max(u8::max_value())

    bench_u32_0(0u32)
    bench_u32_min(u32::min_value())
    bench_u32_max(u32::max_value())

    bench_i32_0(0i32)
    bench_i32_min(i32::min_value())
    bench_i32_max(i32::max_value())
}
```

</details>

And then I ran the equivalent code (source code below) in callgrind with [callgrind_differ](https://github.com/Ethiraric/callgrind_differ) to generate a nice output and here's the result:

```
core::fmt::num:👿:<impl core::fmt::Display for i16>::fmt |   1300000 | -    70000 -  5.385%   1230000
core::fmt::num:👿:<impl core::fmt::Display for i32>::fmt |   1910000 | -   100000 -  5.236%   1810000
core::fmt::num:👿:<impl core::fmt::Display for i64>::fmt |   2430000 | -   110000 -  4.527%   2320000
core::fmt::num:👿:<impl core::fmt::Display for i8>::fmt  |   1080000 | -   170000 - 15.741%    910000
core::fmt::num:👿:<impl core::fmt::Display for u16>::fmt |    960000 | +    10000 +  1.042%    970000
core::fmt::num:👿:<impl core::fmt::Display for u32>::fmt |   1300000 | +    30000 +  2.308%   1330000
core::fmt::num:👿:<impl core::fmt::Display for u8>::fmt  |    820000 | -    30000 -  3.659%    790000
```

<details>
<summary>Source code</summary>

```rust
#![feature(test)]

extern crate test;

use std::io::{stdout, Write};
use std::io::StdoutLock;
use test::black_box;

macro_rules! benches {
    ($handle:ident, $buf:ident, $($name:ident($value:expr))*) => {
            $(
                fn $name(handle: &mut StdoutLock, buf: &mut Vec<u8>) {
                    for _ in 0..10000 {
                        buf.clear();
                        write!(buf, "{}", black_box($value)).unwrap();
                        handle.write_all(buf);
                    }
                }
                $name(&mut $handle, &mut $buf);
            )*
    }
}

fn main() {
    let mut handle = stdout().lock();
    let mut buf = Vec::with_capacity(40);

    benches! {
        handle, buf,

        bench_u64_0(0u64)
        bench_u64_half(u32::max_value() as u64)
        bench_u64_max(u64::max_value())

        bench_i64_0(0i64)
        bench_i64_half(i32::max_value() as i64)
        bench_i64_max(i64::max_value())

        bench_u16_0(0u16)
        bench_u16_min(u16::min_value())
        bench_u16_max(u16::max_value())

        bench_i16_0(0i16)
        bench_i16_min(i16::min_value())
        bench_i16_max(i16::max_value())

        bench_u128_0(0u128)
        bench_u128_max(u128::max_value())

        bench_i8_0(0i8)
        bench_i8_min(i8::min_value())
        bench_i8_max(i8::max_value())

        bench_u8_0(0u8)
        bench_u8_min(u8::min_value())
        bench_u8_max(u8::max_value())

        bench_i32_0(0i32)
        bench_i32_min(i32::min_value())
        bench_i32_max(i32::max_value())

        bench_u32_0(0u32)
        bench_u32_min(u32::min_value())
        bench_u32_max(u32::max_value())
    }
}
```

</details>

The next step would be to specialize the `ToString` implementation so it doesn't go through the `Display` trait. I'm not sure if it will improve anything but I think it's worth a try.

r? `@Amanieu`
This commit is contained in:
bors 2024-10-01 22:12:44 +00:00
commit bfe5e8cef6

View File

@ -208,75 +208,119 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8081828384858687888990919293949596979899";
macro_rules! impl_Display {
($($t:ident),* as $u:ident via $conv_fn:ident named $name:ident) => {
#[cfg(not(feature = "optimize_for_size"))]
fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// 2^128 is about 3*10^38, so 39 gives an extra byte of space
let mut buf = [MaybeUninit::<u8>::uninit(); 39];
let mut curr = buf.len();
let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
($($t:ident $(as $positive:ident)? named $name:ident,)* ; as $u:ident via $conv_fn:ident named $gen_name:ident) => {
// SAFETY: Since `d1` and `d2` are always less than or equal to `198`, we
// can copy from `lut_ptr[d1..d1 + 1]` and `lut_ptr[d2..d2 + 1]`. To show
// that it's OK to copy into `buf_ptr`, notice that at the beginning
// `curr == buf.len() == 39 > log(n)` since `n < 2^128 < 10^39`, and at
// each step this is kept the same as `n` is divided. Since `n` is always
// non-negative, this means that `curr > 0` so `buf_ptr[curr..curr + 1]`
// is safe to access.
unsafe {
// need at least 16 bits for the 4-characters-at-a-time to work.
assert!(crate::mem::size_of::<$u>() >= 2);
$(
#[stable(feature = "rust1", since = "1.0.0")]
impl fmt::Display for $t {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// If it's a signed integer.
$(
let is_nonnegative = *self >= 0;
// eagerly decode 4 characters at a time
while n >= 10000 {
let rem = (n % 10000) as usize;
n /= 10000;
let d1 = (rem / 100) << 1;
let d2 = (rem % 100) << 1;
curr -= 4;
// We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
// otherwise `curr < 0`. But then `n` was originally at least `10000^10`
// which is `10^40 > 2^128 > n`.
ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
ptr::copy_nonoverlapping(lut_ptr.add(d2), buf_ptr.add(curr + 2), 2);
#[cfg(not(feature = "optimize_for_size"))]
{
if !is_nonnegative {
// convert the negative num to positive by summing 1 to its 2s complement
return (!self as $positive).wrapping_add(1)._fmt(false, f);
}
}
#[cfg(feature = "optimize_for_size")]
{
if !is_nonnegative {
// convert the negative num to positive by summing 1 to its 2s complement
return $gen_name((!self.$conv_fn()).wrapping_add(1), false, f);
}
}
)?
// If it's a positive integer.
#[cfg(not(feature = "optimize_for_size"))]
{
self._fmt(true, f)
}
// if we reach here numbers are <= 9999, so at most 4 chars long
let mut n = n as usize; // possibly reduce 64bit math
// decode 2 more chars, if > 2 chars
if n >= 100 {
let d1 = (n % 100) << 1;
n /= 100;
curr -= 2;
ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
}
// decode last 1 or 2 chars
if n < 10 {
curr -= 1;
*buf_ptr.add(curr) = (n as u8) + b'0';
} else {
let d1 = n << 1;
curr -= 2;
ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
#[cfg(feature = "optimize_for_size")]
{
$gen_name(self.$conv_fn(), true, f)
}
}
// SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid
// UTF-8 since `DEC_DIGITS_LUT` is
let buf_slice = unsafe {
str::from_utf8_unchecked(
slice::from_raw_parts(buf_ptr.add(curr), buf.len() - curr))
};
f.pad_integral(is_nonnegative, "", buf_slice)
}
#[cfg(not(feature = "optimize_for_size"))]
impl $t {
fn _fmt(mut self: $t, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
const SIZE: usize = $t::MAX.ilog(10) as usize + 1;
let mut buf = [MaybeUninit::<u8>::uninit(); SIZE];
let mut curr = SIZE;
let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
// SAFETY: Since `d1` and `d2` are always less than or equal to `198`, we
// can copy from `lut_ptr[d1..d1 + 1]` and `lut_ptr[d2..d2 + 1]`. To show
// that it's OK to copy into `buf_ptr`, notice that at the beginning
// `curr == buf.len() == 39 > log(n)` since `n < 2^128 < 10^39`, and at
// each step this is kept the same as `n` is divided. Since `n` is always
// non-negative, this means that `curr > 0` so `buf_ptr[curr..curr + 1]`
// is safe to access.
unsafe {
// need at least 16 bits for the 4-characters-at-a-time to work.
#[allow(overflowing_literals)]
#[allow(unused_comparisons)]
// This block will be removed for smaller types at compile time and in the worst
// case, it will prevent to have the `10000` literal to overflow for `i8` and `u8`.
if core::mem::size_of::<$t>() >= 2 {
// eagerly decode 4 characters at a time
while self >= 10000 {
let rem = (self % 10000) as usize;
self /= 10000;
let d1 = (rem / 100) << 1;
let d2 = (rem % 100) << 1;
curr -= 4;
// We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
// otherwise `curr < 0`. But then `n` was originally at least `10000^10`
// which is `10^40 > 2^128 > n`.
ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(curr), 2);
ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(curr + 2), 2);
}
}
// if we reach here numbers are <= 9999, so at most 4 chars long
let mut n = self as usize; // possibly reduce 64bit math
// decode 2 more chars, if > 2 chars
if n >= 100 {
let d1 = (n % 100) << 1;
n /= 100;
curr -= 2;
ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
}
// if we reach here numbers are <= 100, so at most 2 chars long
// The biggest it can be is 99, and 99 << 1 == 198, so a `u8` is enough.
// decode last 1 or 2 chars
if n < 10 {
curr -= 1;
*buf_ptr.add(curr) = (n as u8) + b'0';
} else {
let d1 = n << 1;
curr -= 2;
ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
}
}
// SAFETY: `curr` > 0 (since we made `buf` large enough), and all the chars are valid
// UTF-8 since `DEC_DIGITS_LUT` is
let buf_slice = unsafe {
str::from_utf8_unchecked(
slice::from_raw_parts(buf_ptr.add(curr), buf.len() - curr))
};
f.pad_integral(is_nonnegative, "", buf_slice)
}
})*
#[cfg(feature = "optimize_for_size")]
fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn $gen_name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
// 2^128 is about 3*10^38, so 39 gives an extra byte of space
let mut buf = [MaybeUninit::<u8>::uninit(); 39];
let mut curr = buf.len();
@ -306,21 +350,6 @@ fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Re
};
f.pad_integral(is_nonnegative, "", buf_slice)
}
$(#[stable(feature = "rust1", since = "1.0.0")]
impl fmt::Display for $t {
#[allow(unused_comparisons)]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let is_nonnegative = *self >= 0;
let n = if is_nonnegative {
self.$conv_fn()
} else {
// convert the negative num to positive by summing 1 to it's 2 complement
(!self.$conv_fn()).wrapping_add(1)
};
$name(n, is_nonnegative, f)
}
})*
};
}
@ -374,7 +403,6 @@ fn $name(
(n, exponent, exponent, added_precision)
};
// 39 digits (worst case u128) + . = 40
// Since `curr` always decreases by the number of digits copied, this means
// that `curr >= 0`.
let mut buf = [MaybeUninit::<u8>::uninit(); 40];
@ -469,7 +497,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let n = if is_nonnegative {
self.$conv_fn()
} else {
// convert the negative num to positive by summing 1 to it's 2 complement
// convert the negative num to positive by summing 1 to its 2s complement
(!self.$conv_fn()).wrapping_add(1)
};
$name(n, is_nonnegative, false, f)
@ -484,7 +512,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let n = if is_nonnegative {
self.$conv_fn()
} else {
// convert the negative num to positive by summing 1 to it's 2 complement
// convert the negative num to positive by summing 1 to its 2s complement
(!self.$conv_fn()).wrapping_add(1)
};
$name(n, is_nonnegative, true, f)
@ -499,8 +527,17 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
mod imp {
use super::*;
impl_Display!(
i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
as u64 via to_u64 named fmt_u64
i8 as u8 named fmt_i8,
u8 named fmt_u8,
i16 as u16 named fmt_i16,
u16 named fmt_u16,
i32 as u32 named fmt_i32,
u32 named fmt_u32,
i64 as u64 named fmt_i64,
u64 named fmt_u64,
isize as usize named fmt_isize,
usize named fmt_usize,
; as u64 via to_u64 named fmt_u64
);
impl_Exp!(
i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
@ -511,8 +548,21 @@ mod imp {
#[cfg(not(any(target_pointer_width = "64", target_arch = "wasm32")))]
mod imp {
use super::*;
impl_Display!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named fmt_u32);
impl_Display!(i64, u64 as u64 via to_u64 named fmt_u64);
impl_Display!(
i8 as u8 named fmt_i8,
u8 named fmt_u8,
i16 as u16 named fmt_i16,
u16 named fmt_u16,
i32 as u32 named fmt_i32,
u32 named fmt_u32,
isize as usize named fmt_isize,
usize named fmt_usize,
; as u32 via to_u32 named fmt_u32);
impl_Display!(
i64 as u64 named fmt_i64,
u64 named fmt_u64,
; as u64 via to_u64 named fmt_u64);
impl_Exp!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named exp_u32);
impl_Exp!(i64, u64 as u64 via to_u64 named exp_u64);
}
@ -619,7 +669,7 @@ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let n = if is_nonnegative {
self.to_u128()
} else {
// convert the negative num to positive by summing 1 to it's 2 complement
// convert the negative num to positive by summing 1 to its 2s complement
(!self.to_u128()).wrapping_add(1)
};
fmt_u128(n, is_nonnegative, f)