auto merge of #11280 : c-a/rust/inline_byteswap, r=brson
After writing some benchmarks for ebml::reader::vuint_at() I noticed that LLVM doesn't seem to inline the from_be32 function, even though it only makes a call to the bswap32 intrinsic in the x86_64 case. Marking the functions with #[inline(always)] fixes that and seems to me a reasonable thing to do.

I got the following measurements in my vuint_at() benchmarks:

- Before
  test ebml::bench::vuint_at_A_aligned ... bench: 1075 ns/iter (+/- 58)
  test ebml::bench::vuint_at_A_unaligned ... bench: 1073 ns/iter (+/- 5)
  test ebml::bench::vuint_at_D_aligned ... bench: 1150 ns/iter (+/- 5)
  test ebml::bench::vuint_at_D_unaligned ... bench: 1151 ns/iter (+/- 6)

- Inline from_be32
  test ebml::bench::vuint_at_A_aligned ... bench: 769 ns/iter (+/- 9)
  test ebml::bench::vuint_at_A_unaligned ... bench: 795 ns/iter (+/- 6)
  test ebml::bench::vuint_at_D_aligned ... bench: 758 ns/iter (+/- 8)
  test ebml::bench::vuint_at_D_unaligned ... bench: 759 ns/iter (+/- 8)

- Using vuint_at_slow()
  test ebml::bench::vuint_at_A_aligned ... bench: 646 ns/iter (+/- 7)
  test ebml::bench::vuint_at_A_unaligned ... bench: 645 ns/iter (+/- 3)
  test ebml::bench::vuint_at_D_aligned ... bench: 907 ns/iter (+/- 4)
  test ebml::bench::vuint_at_D_unaligned ... bench: 1085 ns/iter (+/- 16)

As expected, inlining from_be32() gave a considerable speedup. I also checked how the "slow" version fared against the optimized version and noticed that it is actually a bit faster for small A class integers (using only two bytes) but slower for big D class integers (using four bytes).
This commit is contained in:
commit
0ff6c12ce9
@ -960,3 +960,87 @@ mod tests {
|
||||
test_v(Some(3));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod bench {
    use ebml::reader;
    use test::BenchHarness;

    // Every benchmark below builds a byte buffer once, outside the timed
    // closure, and then times a loop that calls `reader::vuint_at` at every
    // fourth offset and adds the decoded `.val` to a running total.
    //
    // NOTE(review): the running total is never read after `bh.iter` returns;
    // confirm the optimizer cannot discard the timed work.

    /// Times 100 `vuint_at` calls at offsets 0, 4, 8, ... of a 400-byte
    /// buffer whose even-indexed bytes are `0x80`.
    #[bench]
    pub fn vuint_at_A_aligned(bh: &mut BenchHarness) {
        use std::vec;

        // Even index -> 0x80 marker byte; odd index -> the index truncated
        // to a byte.
        let bytes = vec::from_fn(4*100, |idx| {
            if idx % 2 == 0 { 0x80u8 } else { idx as u8 }
        });

        let mut total = 0u;
        bh.iter(|| {
            let mut offset = 0;
            while offset < bytes.len() {
                total += reader::vuint_at(bytes, offset).val;
                offset += 4;
            }
        });
    }

    /// Same access pattern as `vuint_at_A_aligned`, shifted by one byte:
    /// a 401-byte buffer read at offsets 1, 5, 9, ...
    #[bench]
    pub fn vuint_at_A_unaligned(bh: &mut BenchHarness) {
        use std::vec;

        // Odd index -> 0x80 marker byte, so each read at an odd offset
        // starts on a marker; even index -> the index truncated to a byte.
        let bytes = vec::from_fn(4*100+1, |idx| {
            if idx % 2 == 1 { 0x80u8 } else { idx as u8 }
        });

        let mut total = 0u;
        bh.iter(|| {
            let mut offset = 1;
            while offset < bytes.len() {
                total += reader::vuint_at(bytes, offset).val;
                offset += 4;
            }
        });
    }

    /// Times 100 `vuint_at` calls at offsets 0, 4, 8, ... of a 400-byte
    /// buffer made of 4-byte groups `[0x10, 0, 0, <index as u8>]`.
    #[bench]
    pub fn vuint_at_D_aligned(bh: &mut BenchHarness) {
        use std::vec;

        // First byte of each group is the 0x10 marker, the last byte carries
        // the truncated index, and the two middle bytes are zero.
        let bytes = vec::from_fn(4*100, |idx| {
            match idx % 4 {
                0 => 0x10u8,
                3 => idx as u8,
                _ => 0u8,
            }
        });

        let mut total = 0u;
        bh.iter(|| {
            let mut offset = 0;
            while offset < bytes.len() {
                total += reader::vuint_at(bytes, offset).val;
                offset += 4;
            }
        });
    }

    /// Same access pattern as `vuint_at_D_aligned`, shifted by one byte:
    /// a 401-byte buffer read at offsets 1, 5, 9, ...
    #[bench]
    pub fn vuint_at_D_unaligned(bh: &mut BenchHarness) {
        use std::vec;

        // Index % 4 == 1 holds the 0x10 marker (where each read starts),
        // index % 4 == 0 holds the truncated index (the fourth byte of each
        // read), and everything else is zero.
        let bytes = vec::from_fn(4*100+1, |idx| {
            match idx % 4 {
                1 => 0x10u8,
                0 => idx as u8,
                _ => 0u8,
            }
        });

        let mut total = 0u;
        bh.iter(|| {
            let mut offset = 1;
            while offset < bytes.len() {
                total += reader::vuint_at(bytes, offset).val;
                offset += 4;
            }
        });
    }
}
|
||||
|
@ -486,33 +486,33 @@ extern "rust-intrinsic" {
|
||||
pub fn u64_mul_with_overflow(x: u64, y: u64) -> (u64, bool);
|
||||
}
|
||||
|
||||
// to_le16 / to_le32 / to_le64: convert a native-endian integer to
// little-endian byte order. On little-endian targets the argument is returned
// unchanged; on big-endian targets its bytes are swapped with the matching
// bswap16/bswap32/bswap64 intrinsic.
// NOTE(review): this run has no `#[inline]`, and the same definitions are
// repeated directly below with it; presumably these are the removed side of
// the diff hunk.
#[cfg(target_endian = "little")] pub fn to_le16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "big")] pub fn to_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "little")] pub fn to_le32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "big")] pub fn to_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "little")] pub fn to_le64(x: i64) -> i64 { x }
|
||||
#[cfg(target_endian = "big")] pub fn to_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
// to_le16 / to_le32 / to_le64, now marked `#[inline]`: convert a
// native-endian integer to little-endian byte order. Identity on
// little-endian targets; byte swap via bswap16/bswap32/bswap64 on big-endian
// targets.
// NOTE(review): the commit description mentions `#[inline(always)]`, but the
// attribute on these definitions is plain `#[inline]`.
#[cfg(target_endian = "little")] #[inline] pub fn to_le16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn to_le32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn to_le64(x: i64) -> i64 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
|
||||
// to_be16 / to_be32 / to_be64: convert a native-endian integer to big-endian
// byte order. On little-endian targets the bytes are swapped with the
// matching bswap16/bswap32/bswap64 intrinsic; on big-endian targets the
// argument is returned unchanged.
// NOTE(review): this run has no `#[inline]`, and the same definitions are
// repeated directly below with it; presumably these are the removed side of
// the diff hunk.
#[cfg(target_endian = "little")] pub fn to_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn to_be16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "little")] pub fn to_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn to_be32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "little")] pub fn to_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn to_be64(x: i64) -> i64 { x }
|
||||
// to_be16 / to_be32 / to_be64, now marked `#[inline]`: convert a
// native-endian integer to big-endian byte order. Byte swap via
// bswap16/bswap32/bswap64 on little-endian targets; identity on big-endian
// targets.
// NOTE(review): the commit description mentions `#[inline(always)]`, but the
// attribute on these definitions is plain `#[inline]`.
#[cfg(target_endian = "little")] #[inline] pub fn to_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_be16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn to_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_be32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn to_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn to_be64(x: i64) -> i64 { x }
|
||||
|
||||
// from_le16 / from_le32 / from_le64: convert a little-endian integer to the
// target's native byte order. On little-endian targets the argument is
// returned unchanged; on big-endian targets its bytes are swapped with the
// matching bswap16/bswap32/bswap64 intrinsic.
// NOTE(review): this run has no `#[inline]`, and the same definitions are
// repeated directly below with it; presumably these are the removed side of
// the diff hunk.
#[cfg(target_endian = "little")] pub fn from_le16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "big")] pub fn from_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "little")] pub fn from_le32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "big")] pub fn from_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "little")] pub fn from_le64(x: i64) -> i64 { x }
|
||||
#[cfg(target_endian = "big")] pub fn from_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
// from_le16 / from_le32 / from_le64, now marked `#[inline]`: convert a
// little-endian integer to the target's native byte order. Identity on
// little-endian targets; byte swap via bswap16/bswap32/bswap64 on big-endian
// targets.
// NOTE(review): the commit description mentions `#[inline(always)]`, but the
// attribute on these definitions is plain `#[inline]`.
#[cfg(target_endian = "little")] #[inline] pub fn from_le16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn from_le32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn from_le64(x: i64) -> i64 { x }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
|
||||
// from_be16 / from_be32 / from_be64: convert a big-endian integer to the
// target's native byte order. On little-endian targets the bytes are swapped
// with the matching bswap16/bswap32/bswap64 intrinsic; on big-endian targets
// the argument is returned unchanged.
// NOTE(review): this run has no `#[inline]`, and the same definitions are
// repeated directly below with it; presumably these are the removed side of
// the diff hunk.
#[cfg(target_endian = "little")] pub fn from_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn from_be16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "little")] pub fn from_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn from_be32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "little")] pub fn from_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
#[cfg(target_endian = "big")] pub fn from_be64(x: i64) -> i64 { x }
|
||||
// from_be16 / from_be32 / from_be64, now marked `#[inline]`: convert a
// big-endian integer to the target's native byte order. Byte swap via
// bswap16/bswap32/bswap64 on little-endian targets; identity on big-endian
// targets. The commit description names from_be32 as the call LLVM was not
// inlining in the vuint_at() benchmarks.
// NOTE(review): the commit description mentions `#[inline(always)]`, but the
// attribute on these definitions is plain `#[inline]`.
#[cfg(target_endian = "little")] #[inline] pub fn from_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_be16(x: i16) -> i16 { x }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn from_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_be32(x: i32) -> i32 { x }
|
||||
#[cfg(target_endian = "little")] #[inline] pub fn from_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
|
||||
#[cfg(target_endian = "big")] #[inline] pub fn from_be64(x: i64) -> i64 { x }
|
||||
|
||||
/// `TypeId` represents a globally unique identifier for a type
|
||||
#[lang="type_id"] // This needs to be kept in lockstep with the code in trans/intrinsic.rs and
|
||||
|
Loading…
x
Reference in New Issue
Block a user