Auto merge of #50398 - llogiq:memchr-nano-opt, r=nagisa
nano-optimization for memchr::repeat_byte

This replaces the multiple shifts and bitwise ORs in `repeat_byte` with a single multiplication. In my benchmarks this performs equally well or better, especially on 64-bit systems (it shaves a stable nanosecond on my Skylake). This may go against conventional wisdom, but the shifts and bitwise ORs cannot be pipelined because of hard data dependencies: each step consumes the result of the previous one. Whether or not it is worthwhile purely as an optimization, it also reduces code size, so there is essentially no downside.
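For illustration only (this sketch is not part of the PR), here is a minimal check that the two formulations agree; `repeat_byte_shifts` and `repeat_byte_mul` are hypothetical names, and the shift version is written for a 64-bit target, mirroring the removed 64-bit variant. The trick is that `usize::MAX / 255` is `0x0101_0101_0101_0101` on 64-bit, so the multiplication copies `b` into every byte lane at once (the PR itself spells the constant `::usize::MAX`, the 2015-edition path).

// Sketch (assumed, not the PR's code): verify the multiplication
// matches the shift/OR chain it replaces. Assumes a 64-bit target.

// Old formulation: broadcast `b` by doubling the filled width each
// step. Each shift/OR depends on the previous result, so the steps
// serialize in the pipeline.
fn repeat_byte_shifts(b: u8) -> usize {
    let mut rep = (b as usize) << 8 | b as usize;
    rep = rep << 16 | rep;
    rep = rep << 32 | rep;
    rep
}

// New formulation: usize::MAX / 255 == 0x0101_0101_0101_0101, so one
// multiplication copies `b` into every byte lane.
fn repeat_byte_mul(b: u8) -> usize {
    (b as usize) * (usize::MAX / 255)
}

fn main() {
    for b in 0..=255u8 {
        assert_eq!(repeat_byte_shifts(b), repeat_byte_mul(b));
    }
    println!("{:#018x}", repeat_byte_mul(0xAB)); // 0xabababababababab
}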
commit e78c51adc2
@@ -39,21 +39,10 @@ fn repeat_byte(b: u8) -> usize {
     (b as usize) << 8 | b as usize
 }
 
-#[cfg(target_pointer_width = "32")]
+#[cfg(not(target_pointer_width = "16"))]
 #[inline]
 fn repeat_byte(b: u8) -> usize {
-    let mut rep = (b as usize) << 8 | b as usize;
-    rep = rep << 16 | rep;
-    rep
-}
-
-#[cfg(target_pointer_width = "64")]
-#[inline]
-fn repeat_byte(b: u8) -> usize {
-    let mut rep = (b as usize) << 8 | b as usize;
-    rep = rep << 16 | rep;
-    rep = rep << 32 | rep;
-    rep
+    (b as usize) * (::usize::MAX / 255)
 }
 
 /// Return the first index matching the byte `x` in `text`.