From 82b4544ddc58fb905c4457900fa9faf196f79e87 Mon Sep 17 00:00:00 2001
From: The8472
Date: Fri, 5 Nov 2021 01:01:14 +0100
Subject: [PATCH 1/5] add benchmarks and tests for Hash and Eq impls on Path

The tests check for consistency between the Ord, Eq and Hash impls.
---
 library/std/src/path/tests.rs | 80 ++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 2 deletions(-)

diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs
index 0a16ff2a721..2bf499e1ab8 100644
--- a/library/std/src/path/tests.rs
+++ b/library/std/src/path/tests.rs
@@ -1,6 +1,8 @@
 use super::*;
-use crate::collections::BTreeSet;
+use crate::collections::hash_map::DefaultHasher;
+use crate::collections::{BTreeSet, HashSet};
+use crate::hash::Hasher;
 use crate::rc::Rc;
 use crate::sync::Arc;
 use core::hint::black_box;
 
@@ -1632,7 +1634,25 @@ fn into_rc() {
 fn test_ord() {
     macro_rules! ord(
         ($ord:ident, $left:expr, $right:expr) => ( {
-            assert_eq!(Path::new($left).cmp(&Path::new($right)), core::cmp::Ordering::$ord);
+            use core::cmp::Ordering;
+
+            let left = Path::new($left);
+            let right = Path::new($right);
+            assert_eq!(left.cmp(&right), Ordering::$ord);
+            if Ordering::$ord == Ordering::Equal {
+                assert_eq!(left, right);
+
+                let mut hasher = DefaultHasher::new();
+                left.hash(&mut hasher);
+                let left_hash = hasher.finish();
+                hasher = DefaultHasher::new();
+                right.hash(&mut hasher);
+                let right_hash = hasher.finish();
+
+                assert_eq!(left_hash, right_hash, "hashes for {:?} and {:?} must match", left, right);
+            } else {
+                assert_ne!(left, right);
+            }
         });
     );
 
@@ -1693,3 +1713,59 @@ fn bench_path_cmp_fast_path_short(b: &mut test::Bencher) {
         set.insert(paths[500].as_path());
     });
 }
+
+#[bench]
+fn bench_path_hashset(b: &mut test::Bencher) {
+    let prefix = "/my/home/is/my/castle/and/my/castle/has/a/rusty/workbench/";
+    let paths: Vec<_> =
+        (0..1000).map(|num| PathBuf::from(prefix).join(format!("file {}.rs", num))).collect();
+
+    let mut set = HashSet::new();
+
+    paths.iter().for_each(|p| {
+        set.insert(p.as_path());
+    });
+
+    b.iter(|| {
+        set.remove(paths[500].as_path());
+        set.insert(black_box(paths[500].as_path()))
+    });
+}
+
+#[bench]
+fn bench_path_hashset_miss(b: &mut test::Bencher) {
+    let prefix = "/my/home/is/my/castle/and/my/castle/has/a/rusty/workbench/";
+    let paths: Vec<_> =
+        (0..1000).map(|num| PathBuf::from(prefix).join(format!("file {}.rs", num))).collect();
+
+    let mut set = HashSet::new();
+
+    paths.iter().for_each(|p| {
+        set.insert(p.as_path());
+    });
+
+    let probe = PathBuf::from(prefix).join("other");
+
+    b.iter(|| set.remove(black_box(probe.as_path())));
+}
+
+#[bench]
+fn bench_hash_path_short(b: &mut test::Bencher) {
+    let mut hasher = DefaultHasher::new();
+    let path = Path::new("explorer.exe");
+
+    b.iter(|| black_box(path).hash(&mut hasher));
+
+    black_box(hasher.finish());
+}
+
+#[bench]
+fn bench_hash_path_long(b: &mut test::Bencher) {
+    let mut hasher = DefaultHasher::new();
+    let path =
+        Path::new("/aaaaa/aaaaaa/./../aaaaaaaa/bbbbbbbbbbbbb/ccccccccccc/ddddddddd/eeeeeee.fff");
+
+    b.iter(|| black_box(path).hash(&mut hasher));
+
+    black_box(hasher.finish());
+}
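Note: the invariant these tests pin down is the standard Hash/Eq contract that HashSet and HashMap rely on, and it constrains every shortcut taken in the following patches: paths that compare equal after Components normalization must produce identical hashes. A minimal standalone sketch of that property using only public std APIs (illustrative only, not part of the patch; `hash_of` is a hypothetical helper):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::path::Path;

// Hash a path to a u64 the same way a HashMap would, via Path's Hash impl.
fn hash_of(path: &Path) -> u64 {
    let mut hasher = DefaultHasher::new();
    path.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    // components() collapses repeated separators and drops interior `.`
    // entries, so all three paths compare equal...
    let a = Path::new("foo/bar");
    let b = Path::new("foo//bar");
    let c = Path::new("foo/./bar");
    assert_eq!(a, b);
    assert_eq!(a, c);
    // ...and therefore must hash identically.
    assert_eq!(hash_of(a), hash_of(b));
    assert_eq!(hash_of(a), hash_of(c));
}
```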
From a083dd653af0f7f46ba6058ab51e1f9d6a2aca7d Mon Sep 17 00:00:00 2001
From: The8472
Date: Fri, 5 Nov 2021 01:07:23 +0100
Subject: [PATCH 2/5] optimize Hash for Path

Hashing does not have to run the whole Components parsing machinery,
because it only needs to match the normalizations that Components
performs:

* stripping redundant separators -> skipping separators
* stripping redundant '.' directories -> skipping a '.' that directly
  follows a separator

That's all it takes. And instead of hashing an individual slice per
component, we feed the bytes directly into the hasher, which avoids
hashing each component's length in addition to its contents.
---
 library/std/src/path.rs | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/library/std/src/path.rs b/library/std/src/path.rs
index dc0c735a06c..aca3a42f20a 100644
--- a/library/std/src/path.rs
+++ b/library/std/src/path.rs
@@ -2873,9 +2873,35 @@ impl cmp::PartialEq for Path {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Hash for Path {
     fn hash<H: Hasher>(&self, h: &mut H) {
-        for component in self.components() {
-            component.hash(h);
+        let bytes = self.as_u8_slice();
+
+        let mut component_start = 0;
+        let mut bytes_hashed = 0;
+
+        for i in 0..bytes.len() {
+            if is_sep_byte(bytes[i]) {
+                if i > component_start {
+                    let to_hash = &bytes[component_start..i];
+                    h.write(to_hash);
+                    bytes_hashed += to_hash.len();
+                }
+
+                // skip over separator and optionally a following CurDir item
+                // since components() would normalize these away
+                component_start = i + match bytes[i..] {
+                    [_, b'.', b'/', ..] | [_, b'.'] => 2,
+                    _ => 1,
+                };
+            }
         }
+
+        if component_start < bytes.len() {
+            let to_hash = &bytes[component_start..];
+            h.write(to_hash);
+            bytes_hashed += to_hash.len();
+        }
+
+        h.write_usize(bytes_hashed);
     }
 }
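Note: the heart of this patch is the claim that the two skip rules reproduce exactly the normalizations Components performs. A standalone re-statement of the loop over a Unix-style byte slice, runnable outside std (illustrative; `normalized_bytes` is a hypothetical name, and prefix handling is deliberately omitted):

```rust
/// Collect the bytes the patched Hash impl would feed to the hasher for a
/// Unix-style path. `/` stands in for any separator byte.
fn normalized_bytes(path: &[u8]) -> Vec<u8> {
    let mut out = Vec::new();
    let mut component_start = 0;
    for i in 0..path.len() {
        if path[i] == b'/' {
            // Emit the component accumulated so far, if it is non-empty.
            if i > component_start {
                out.extend_from_slice(&path[component_start..i]);
            }
            // Skip the separator, plus a lone `.` right after it, since
            // components() normalizes both away.
            component_start = i + match path[i..] {
                [_, b'.', b'/', ..] | [_, b'.'] => 2,
                _ => 1,
            };
        }
    }
    if component_start < path.len() {
        out.extend_from_slice(&path[component_start..]);
    }
    out
}

fn main() {
    // All three spellings normalize to the same byte stream, so the
    // patched impl hashes them identically.
    assert_eq!(normalized_bytes(b"foo/bar"), b"foobar");
    assert_eq!(normalized_bytes(b"foo//bar"), b"foobar");
    assert_eq!(normalized_bytes(b"foo/./bar"), b"foobar");
}
```

One trade-off worth noting: because component boundaries are no longer hashed, distinct paths such as "ab/c" and "a/bc" now feed the hasher the same byte stream and collide. That is permitted by the Hash contract (collisions only cost an extra comparison); equality itself still goes through Components and stays correct.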
From 7f6e080120fe3c2fd067a5b8d5df04b45ef78b09 Mon Sep 17 00:00:00 2001
From: The8472
Date: Fri, 5 Nov 2021 01:08:19 +0100
Subject: [PATCH 3/5] add fast path on Path::eq for exact equality

---
 library/std/src/path.rs | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/library/std/src/path.rs b/library/std/src/path.rs
index aca3a42f20a..33225692d17 100644
--- a/library/std/src/path.rs
+++ b/library/std/src/path.rs
@@ -979,6 +979,25 @@ impl FusedIterator for Components<'_> {}
 impl<'a> cmp::PartialEq for Components<'a> {
     #[inline]
     fn eq(&self, other: &Components<'a>) -> bool {
+        let Components { path: _, front: _, back: _, has_physical_root: _, prefix: _ } = self;
+
+        // Fast path for exact matches, e.g. for hashmap lookups.
+        // Don't explicitly compare the prefix or has_physical_root fields since they'll
+        // either be covered by the `path` buffer or are only relevant for `prefix_verbatim()`.
+        if self.path.len() == other.path.len()
+            && self.front == other.front
+            && self.back == State::Body
+            && other.back == State::Body
+            && self.prefix_verbatim() == other.prefix_verbatim()
+        {
+            // possible future improvement: this could bail out earlier if there were a
+            // reverse memcmp/bcmp comparing back to front
+            if self.path == other.path {
+                return true;
+            }
+        }
+
+        // compare back to front since absolute paths often share long prefixes
         Iterator::eq(self.clone().rev(), other.clone().rev())
     }
 }
@@ -1013,7 +1032,7 @@ fn compare_components(mut left: Components<'_>, mut right: Components<'_>) -> cm
     // The fast path isn't taken for paths with a PrefixComponent to avoid backtracking into
     // the middle of one
     if left.prefix.is_none() && right.prefix.is_none() && left.front == right.front {
-        // this might benefit from a [u8]::first_mismatch simd implementation, if it existed
+        // possible future improvement: a [u8]::first_mismatch simd implementation
         let first_difference =
             match left.path.iter().zip(right.path.iter()).position(|(&a, &b)| a != b) {
                 None if left.path.len() == right.path.len() => return cmp::Ordering::Equal,

From a6e0aa20d90ee9173a6b901c641a8d48abbd82db Mon Sep 17 00:00:00 2001
From: The8472
Date: Mon, 8 Nov 2021 19:29:16 +0100
Subject: [PATCH 4/5] remove redundant .iter() call since zip() takes an
 IntoIterator argument

---
 library/std/src/path.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/library/std/src/path.rs b/library/std/src/path.rs
index 33225692d17..7e1135365cd 100644
--- a/library/std/src/path.rs
+++ b/library/std/src/path.rs
@@ -1033,12 +1033,11 @@ fn compare_components(mut left: Components<'_>, mut right: Components<'_>) -> cm
     // the middle of one
     if left.prefix.is_none() && right.prefix.is_none() && left.front == right.front {
         // possible future improvement: a [u8]::first_mismatch simd implementation
-        let first_difference =
-            match left.path.iter().zip(right.path.iter()).position(|(&a, &b)| a != b) {
-                None if left.path.len() == right.path.len() => return cmp::Ordering::Equal,
-                None => left.path.len().min(right.path.len()),
-                Some(diff) => diff,
-            };
+        let first_difference = match left.path.iter().zip(right.path).position(|(&a, &b)| a != b) {
+            None if left.path.len() == right.path.len() => return cmp::Ordering::Equal,
+            None => left.path.len().min(right.path.len()),
+            Some(diff) => diff,
+        };
 
         if let Some(previous_sep) =
             left.path[..first_difference].iter().rposition(|&b| left.is_sep_byte(b))
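Note: taken together, patches 2 and 3 cover the two ways a hashmap lookup can go. An exact byte-for-byte key match hits the new fast path in Components::eq (a length check plus a memcmp), while a semantically equal but byte-different key still succeeds because the new Hash impl mirrors the Components normalization and equality falls back to component-wise comparison. An illustrative usage sketch (std APIs only, not part of the patch; the file names are made up):

```rust
use std::collections::HashSet;
use std::path::{Path, PathBuf};

fn main() {
    let mut set = HashSet::new();
    set.insert(PathBuf::from("/usr/lib/librust.so"));

    // Exact match: identical byte buffers, so Components::eq can return
    // true without iterating components.
    assert!(set.contains(Path::new("/usr/lib/librust.so")));

    // Equal after normalization but not byte-identical: the hashes still
    // agree, and equality is decided component by component.
    assert!(set.contains(Path::new("/usr//lib/./librust.so")));
}
```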
---
 library/std/src/path.rs          | 8 ++++++++
 library/std/src/sys/unix/path.rs | 1 +
 2 files changed, 9 insertions(+)

diff --git a/library/std/src/path.rs b/library/std/src/path.rs
index 7e1135365cd..cf2cd5adc48 100644
--- a/library/std/src/path.rs
+++ b/library/std/src/path.rs
@@ -2892,6 +2892,14 @@ impl cmp::PartialEq for Path {
 impl Hash for Path {
     fn hash<H: Hasher>(&self, h: &mut H) {
         let bytes = self.as_u8_slice();
+        let prefix_len = match parse_prefix(&self.inner) {
+            Some(prefix) => {
+                prefix.hash(h);
+                prefix.len()
+            }
+            None => 0,
+        };
+        let bytes = &bytes[prefix_len..];
 
         let mut component_start = 0;
         let mut bytes_hashed = 0;
diff --git a/library/std/src/sys/unix/path.rs b/library/std/src/sys/unix/path.rs
index 840a7ae0426..717add9ec48 100644
--- a/library/std/src/sys/unix/path.rs
+++ b/library/std/src/sys/unix/path.rs
@@ -11,6 +11,7 @@ pub fn is_verbatim_sep(b: u8) -> bool {
     b == b'/'
 }
 
+#[inline]
 pub fn parse_prefix(_: &OsStr) -> Option<Prefix<'_>> {
     None
 }
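Note: the motivating case for patch 5 is easiest to see on Windows, where drive-letter prefixes compare case-insensitively, so hashing the prefix's raw bytes would break the Hash/Eq contract. A sketch of the invariant the change preserves (illustrative; the Windows-only assertions assume parse_prefix normalizes the drive letter, as the commit message implies):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::path::Path;

// Hash a path via Path's Hash impl, as a HashMap would.
fn hash_of(path: &Path) -> u64 {
    let mut hasher = DefaultHasher::new();
    path.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    #[cfg(windows)]
    {
        // `C:` and `c:` parse to the same Prefix, so the paths compare
        // equal and must hash identically; hashing the prefix's raw bytes
        // would have produced two different hashes for one key.
        let upper = Path::new(r"C:\Windows");
        let lower = Path::new(r"c:\Windows");
        assert_eq!(upper, lower);
        assert_eq!(hash_of(upper), hash_of(lower));
    }
    // On Unix, parse_prefix always returns None, so the prefix branch
    // adds no work to the hash loop.
    let _ = hash_of(Path::new("/tmp"));
}
```

Delegating to Prefix's own Hash impl keeps hashing in agreement with however Eq and Ord treat prefixes rather than duplicating the case rules, and the added #[inline] lets the always-None branch fold away entirely on Unix.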