Reduce size overhead of adaptive hashmap

Exposes a boolean flag in RawTable and uses it
instead of a bool field in HashMap.

Fixes: #40042
arthurprs 2017-03-03 20:31:54 +01:00
parent 1476105dd3
commit 3273003912
2 changed files with 74 additions and 24 deletions
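
The size win comes from moving the adaptive flag out of a dedicated long_probes: bool field on HashMap and into the otherwise unused low bit of RawTable's hash pointer. A minimal layout sketch of that effect, using mock structs rather than the real std definitions (assuming a 64-bit target):

use std::mem::size_of;

// Mock layouts; the field names mirror the patch, but these are not
// the actual std structs.
struct WithBoolFlag {
    hashes: *mut usize, // stand-in for Unique<HashUint>
    capacity: usize,
    size: usize,
    long_probes: bool,  // one bit of information costs a whole padded word
}

struct WithTaggedPtr {
    hashes: *mut usize, // the pointer's low bit now carries the flag
    capacity: usize,
    size: usize,
}

fn main() {
    println!("with bool field: {} bytes", size_of::<WithBoolFlag>()); // 32
    println!("with tagged ptr: {} bytes", size_of::<WithTaggedPtr>()); // 24
}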

src/libstd/collections/hash/map.rs

@@ -396,8 +396,6 @@ pub struct HashMap<K, V, S = RandomState> {
table: RawTable<K, V>,
resize_policy: DefaultResizePolicy,
- long_probes: bool,
}
/// Search for a pre-hashed key.
@@ -655,7 +653,6 @@ pub fn with_hasher(hash_builder: S) -> HashMap<K, V, S> {
hash_builder: hash_builder,
resize_policy: DefaultResizePolicy::new(),
table: RawTable::new(0),
- long_probes: false,
}
}
@@ -688,7 +685,6 @@ pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> HashMap<K,
hash_builder: hash_builder,
resize_policy: resize_policy,
table: RawTable::new(raw_cap),
- long_probes: false,
}
}
@@ -746,7 +742,7 @@ pub fn reserve(&mut self, additional: usize) {
let min_cap = self.len().checked_add(additional).expect("reserve overflow");
let raw_cap = self.resize_policy.raw_capacity(min_cap);
self.resize(raw_cap);
- } else if self.long_probes && remaining <= self.len() {
+ } else if self.table.tag() && remaining <= self.len() {
// Probe sequence is too long and table is half full,
// resize early to reduce probing length.
let new_capacity = self.table.capacity() * 2;
@@ -763,7 +759,6 @@ fn resize(&mut self, new_raw_cap: usize) {
assert!(self.table.size() <= new_raw_cap);
assert!(new_raw_cap.is_power_of_two() || new_raw_cap == 0);
- self.long_probes = false;
let mut old_table = replace(&mut self.table, RawTable::new(new_raw_cap));
let old_size = old_table.size();
@@ -844,8 +839,7 @@ pub fn shrink_to_fit(&mut self) {
/// If the key already exists, the hashtable will be returned untouched
/// and a reference to the existing element will be returned.
fn insert_hashed_nocheck(&mut self, hash: SafeHash, k: K, v: V) -> Option<V> {
- let entry = search_hashed(&mut self.table, hash, |key| *key == k)
- .into_entry(k, &mut self.long_probes);
+ let entry = search_hashed(&mut self.table, hash, |key| *key == k).into_entry(k);
match entry {
Some(Occupied(mut elem)) => Some(elem.insert(v)),
Some(Vacant(elem)) => {
@@ -1002,7 +996,7 @@ pub fn entry(&mut self, key: K) -> Entry<K, V> {
self.reserve(1);
let hash = self.make_hash(&key);
search_hashed(&mut self.table, hash, |q| q.eq(&key))
- .into_entry(key, &mut self.long_probes).expect("unreachable")
+ .into_entry(key).expect("unreachable")
}
/// Returns the number of elements in the map.
@@ -1456,7 +1450,7 @@ fn into_occupied_bucket(self) -> Option<FullBucket<K, V, M>> {
impl<'a, K, V> InternalEntry<K, V, &'a mut RawTable<K, V>> {
#[inline]
- fn into_entry(self, key: K, long_probes: &'a mut bool) -> Option<Entry<'a, K, V>> {
+ fn into_entry(self, key: K) -> Option<Entry<'a, K, V>> {
match self {
InternalEntry::Occupied { elem } => {
Some(Occupied(OccupiedEntry {
@@ -1469,7 +1463,6 @@ fn into_entry(self, key: K, long_probes: &'a mut bool) -> Option<Entry<'a, K, V>
hash: hash,
key: key,
elem: elem,
- long_probes: long_probes,
}))
}
InternalEntry::TableIsEmpty => None,
@@ -1542,7 +1535,6 @@ pub struct VacantEntry<'a, K: 'a, V: 'a> {
hash: SafeHash,
key: K,
elem: VacantEntryState<K, V, &'a mut RawTable<K, V>>,
- long_probes: &'a mut bool,
}
#[stable(feature = "debug_hash_map", since = "1.12.0")]
@@ -2117,15 +2109,15 @@ pub fn into_key(self) -> K {
#[stable(feature = "rust1", since = "1.0.0")]
pub fn insert(self, value: V) -> &'a mut V {
match self.elem {
- NeqElem(bucket, disp) => {
+ NeqElem(mut bucket, disp) => {
if disp >= DISPLACEMENT_THRESHOLD {
- *self.long_probes = true;
+ bucket.table_mut().set_tag(true);
}
robin_hood(bucket, disp, self.hash, self.key, value)
},
- NoElem(bucket, disp) => {
+ NoElem(mut bucket, disp) => {
if disp >= DISPLACEMENT_THRESHOLD {
- *self.long_probes = true;
+ bucket.table_mut().set_tag(true);
}
bucket.put(self.hash, self.key, value).into_mut_refs().1
},
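
Together the map-side changes preserve the adaptive behavior: VacantEntry::insert sets the table tag once an insertion's displacement reaches DISPLACEMENT_THRESHOLD, and reserve consults that tag to double a table that is already at least half full. A condensed sketch of that decision flow, with made-up free functions and an illustrative threshold value:

// Illustrative constant; the real DISPLACEMENT_THRESHOLD lives in std's
// hash map internals and may differ.
const DISPLACEMENT_THRESHOLD: usize = 128;

// Mirrors VacantEntry::insert above: mark the table when any probe
// sequence gets suspiciously long.
fn note_displacement(tag: &mut bool, displacement: usize) {
    if displacement >= DISPLACEMENT_THRESHOLD {
        *tag = true;
    }
}

// Mirrors reserve() above: long probes plus a table that is at least
// half full justify growing before the usual load factor is reached.
fn should_resize_early(tag: bool, remaining: usize, len: usize) -> bool {
    tag && remaining <= len
}

fn main() {
    let mut tag = false;
    note_displacement(&mut tag, 130); // a long probe sequence was observed
    assert!(should_resize_early(tag, 40, 60)); // half full, so grow early
}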

src/libstd/collections/hash/table.rs

@@ -34,6 +34,42 @@
const EMPTY_BUCKET: HashUint = 0;
+ /// Special `Unique<HashUint>` that uses the lower bit of the pointer
+ /// to expose a boolean tag.
+ /// Note: when the pointer is initialized to EMPTY `.ptr()` will return
+ /// null and the tag functions shouldn't be used.
+ struct TaggedHashUintPtr(Unique<HashUint>);
+ impl TaggedHashUintPtr {
+ #[inline]
+ unsafe fn new(ptr: *mut HashUint) -> Self {
+ debug_assert!(ptr as usize & 1 == 0 || ptr as usize == EMPTY as usize);
+ TaggedHashUintPtr(Unique::new(ptr))
+ }
+ #[inline]
+ fn set_tag(&mut self, value: bool) {
+ let usize_ptr = &*self.0 as *const *mut HashUint as *mut usize;
+ unsafe {
+ if value {
+ *usize_ptr |= 1;
+ } else {
+ *usize_ptr &= !1;
+ }
+ }
+ }
+ #[inline]
+ fn tag(&self) -> bool {
+ (*self.0 as usize) & 1 == 1
+ }
+ #[inline]
+ fn ptr(&self) -> *mut HashUint {
+ (*self.0 as usize & !1) as *mut HashUint
+ }
+ }
/// The raw hashtable, providing safe-ish access to the unzipped and highly
/// optimized arrays of hashes, and key-value pairs.
///
@@ -72,10 +108,14 @@
/// around just the "table" part of the hashtable. It enforces some
/// invariants at the type level and employs some performance trickery,
/// but in general is just a tricked out `Vec<Option<(u64, K, V)>>`.
+ ///
+ /// The hashtable also exposes a special boolean tag. The tag defaults to false
+ /// when the RawTable is created and is accessible with the `tag` and `set_tag`
+ /// functions.
pub struct RawTable<K, V> {
capacity: usize,
size: usize,
- hashes: Unique<HashUint>,
+ hashes: TaggedHashUintPtr,
// Because K/V do not appear directly in any of the types in the struct,
// inform rustc that in fact instances of K and V are reachable from here.
@@ -208,6 +248,10 @@ impl<K, V, M> FullBucket<K, V, M> {
pub fn table(&self) -> &M {
&self.table
}
+ /// Borrow a mutable reference to the table.
+ pub fn table_mut(&mut self) -> &mut M {
+ &mut self.table
+ }
/// Move out the reference to the table.
pub fn into_table(self) -> M {
self.table
@@ -227,6 +271,10 @@ impl<K, V, M> EmptyBucket<K, V, M> {
pub fn table(&self) -> &M {
&self.table
}
+ /// Borrow a mutable reference to the table.
+ pub fn table_mut(&mut self) -> &mut M {
+ &mut self.table
+ }
}
impl<K, V, M> Bucket<K, V, M> {
@@ -687,7 +735,7 @@ unsafe fn new_uninitialized(capacity: usize) -> RawTable<K, V> {
return RawTable {
size: 0,
capacity: 0,
- hashes: Unique::new(EMPTY as *mut HashUint),
+ hashes: TaggedHashUintPtr::new(EMPTY as *mut HashUint),
marker: marker::PhantomData,
};
}
@@ -728,7 +776,7 @@ unsafe fn new_uninitialized(capacity: usize) -> RawTable<K, V> {
RawTable {
capacity: capacity,
size: 0,
- hashes: Unique::new(hashes),
+ hashes: TaggedHashUintPtr::new(hashes),
marker: marker::PhantomData,
}
}
@@ -737,13 +785,13 @@ fn first_bucket_raw(&self) -> RawBucket<K, V> {
let hashes_size = self.capacity * size_of::<HashUint>();
let pairs_size = self.capacity * size_of::<(K, V)>();
- let buffer = *self.hashes as *mut u8;
+ let buffer = self.hashes.ptr() as *mut u8;
let (pairs_offset, _, oflo) =
calculate_offsets(hashes_size, pairs_size, align_of::<(K, V)>());
debug_assert!(!oflo, "capacity overflow");
unsafe {
RawBucket {
- hash: *self.hashes,
+ hash: self.hashes.ptr(),
pair: buffer.offset(pairs_offset as isize) as *const _,
_marker: marker::PhantomData,
}
@@ -755,7 +803,7 @@ fn first_bucket_raw(&self) -> RawBucket<K, V> {
pub fn new(capacity: usize) -> RawTable<K, V> {
unsafe {
let ret = RawTable::new_uninitialized(capacity);
- ptr::write_bytes(*ret.hashes, 0, capacity);
+ ptr::write_bytes(ret.hashes.ptr(), 0, capacity);
ret
}
}
@@ -774,7 +822,7 @@ pub fn size(&self) -> usize {
fn raw_buckets(&self) -> RawBuckets<K, V> {
RawBuckets {
raw: self.first_bucket_raw(),
- hashes_end: unsafe { self.hashes.offset(self.capacity as isize) },
+ hashes_end: unsafe { self.hashes.ptr().offset(self.capacity as isize) },
marker: marker::PhantomData,
}
}
@@ -832,6 +880,16 @@ unsafe fn rev_move_buckets(&mut self) -> RevMoveBuckets<K, V> {
marker: marker::PhantomData,
}
}
+ /// Set the table tag
+ pub fn set_tag(&mut self, value: bool) {
+ self.hashes.set_tag(value)
+ }
+ /// Get the table tag
+ pub fn tag(&self) -> bool {
+ self.hashes.tag()
+ }
}
/// A raw iterator. The basis for some other iterators in this module. Although
@@ -1156,7 +1214,7 @@ fn drop(&mut self) {
debug_assert!(!oflo, "should be impossible");
unsafe {
- deallocate(*self.hashes as *mut u8, size, align);
+ deallocate(self.hashes.ptr() as *mut u8, size, align);
// Remember how everything was allocated out of one buffer
// during initialization? We only need one call to free here.
}
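
The TaggedHashUintPtr helper works because the hashes allocation is always at least 2-byte aligned, leaving the pointer's low bit free to carry the flag. A standalone sketch of the same round-trip using a plain usize in place of Unique<HashUint> (illustrative names, not the std type):

struct TaggedPtr(usize); // stand-in for TaggedHashUintPtr

impl TaggedPtr {
    fn new(ptr: *mut u64) -> Self {
        // the low bit must start clear, as the debug_assert above checks
        assert!(ptr as usize & 1 == 0, "pointer must be 2-byte aligned");
        TaggedPtr(ptr as usize)
    }
    fn set_tag(&mut self, value: bool) {
        if value { self.0 |= 1 } else { self.0 &= !1 }
    }
    fn tag(&self) -> bool {
        self.0 & 1 == 1
    }
    fn ptr(&self) -> *mut u64 {
        (self.0 & !1) as *mut u64 // mask the tag off before dereferencing
    }
}

fn main() {
    let mut slot: u64 = 42;
    let mut tagged = TaggedPtr::new(&mut slot);
    assert!(!tagged.tag()); // defaults to false, like a fresh RawTable
    tagged.set_tag(true); // record that probing got long
    assert!(tagged.tag());
    unsafe { assert_eq!(*tagged.ptr(), 42) }; // the address still round-trips
}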