Implement an insertion-order preserving, efficient multi-map

2020-02-16 14:40:12 -08:00 · 2020-02-16 14:40:12 -08:00 · ea3c9d27cf
commit ea3c9d27cf
parent 7710ae0e26
3 changed files with 249 additions and 1 deletions
--- a/src/librustc_data_structures/sorted_map.rs
+++ b/src/librustc_data_structures/sorted_map.rs
@ -4,6 +4,10 @@ use std::iter::FromIterator;
 use std::mem;
 use std::ops::{Bound, Index, IndexMut, RangeBounds};

+mod index_map;
+
+pub use index_map::SortedIndexMultiMap;
+
 /// `SortedMap` is a data structure with similar characteristics as BTreeMap but
 /// slightly different trade-offs: lookup, insertion, and removal are O(log(N))
 /// and elements can be iterated in order cheaply.
--- a/src/librustc_data_structures/sorted_map/index_map.rs
+++ b/src/librustc_data_structures/sorted_map/index_map.rs
@ -0,0 +1,218 @@
+//! A variant of `SortedMap` that preserves insertion order.
+
+use std::borrow::Borrow;
+use std::hash::{Hash, Hasher};
+use std::iter::FromIterator;
+
+use crate::stable_hasher::{HashStable, StableHasher};
+use rustc_index::vec::{Idx, IndexVec};
+
+/// An indexed multi-map that preserves insertion order while permitting both `O(log n)` lookup of
+/// an item by key and `O(1)` lookup by index.
+///
+/// This data structure is a hybrid of an [`IndexVec`] and a [`SortedMap`]. Like `IndexVec`,
+/// `SortedIndexMultiMap` assigns a typed index to each item while preserving insertion order.
+/// Like `SortedMap`, `SortedIndexMultiMap` has efficient lookup of items by key. However, this
+/// is accomplished by sorting an array of item indices instead of the items themselves.
+///
+/// Unlike `SortedMap`, this data structure can hold multiple equivalent items at once, so the
+/// `get_by_key` method and its variants return an iterator instead of an `Option`. Equivalent
+/// items will be yielded in insertion order.
+///
+/// Unlike a general-purpose map like `BTreeSet` or `HashSet`, `SortedMap` and
+/// `SortedIndexMultiMap` require `O(n)` time to insert a single item. This is because we may need
+/// to insert into the middle of the sorted array. Users should avoid mutating this data structure
+/// in-place.
+///
+/// [`IndexVec`]: ../../rustc_index/vec/struct.IndexVec.html
+/// [`SortedMap`]: ../sorted_map/struct.SortedMap.html
+#[derive(Clone, Debug)]
+pub struct SortedIndexMultiMap<I: Idx, K, V> {
+    /// The elements of the map in insertion order.
+    items: IndexVec<I, (K, V)>,
+
+    /// Indices of the items in the set, sorted by the item's key.
+    idx_sorted_by_item_key: Vec<I>,
+}
+
+impl<I: Idx, K: Ord, V> SortedIndexMultiMap<I, K, V> {
+    pub fn new() -> Self {
+        SortedIndexMultiMap { items: IndexVec::new(), idx_sorted_by_item_key: Vec::new() }
+    }
+
+    pub fn len(&self) -> usize {
+        self.items.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.items.is_empty()
+    }
+
+    /// Returns an iterator over the items in the map in insertion order.
+    pub fn into_iter(self) -> impl DoubleEndedIterator<Item = (K, V)> {
+        self.items.into_iter()
+    }
+
+    /// Returns an iterator over the items in the map in insertion order along with their indices.
+    pub fn into_iter_enumerated(self) -> impl DoubleEndedIterator<Item = (I, (K, V))> {
+        self.items.into_iter_enumerated()
+    }
+
+    /// Returns an iterator over the items in the map in insertion order.
+    pub fn iter(&self) -> impl '_ + DoubleEndedIterator<Item = (&K, &V)> {
+        self.items.iter().map(|(ref k, ref v)| (k, v))
+    }
+
+    /// Returns an iterator over the items in the map in insertion order along with their indices.
+    pub fn iter_enumerated(&self) -> impl '_ + DoubleEndedIterator<Item = (I, (&K, &V))> {
+        self.items.iter_enumerated().map(|(i, (ref k, ref v))| (i, (k, v)))
+    }
+
+    /// Returns the item in the map with the given index.
+    pub fn get(&self, idx: I) -> Option<&(K, V)> {
+        self.items.get(idx)
+    }
+
+    /// Returns an iterator over the items in the map that are equal to `key`.
+    ///
+    /// If there are multiple items that are equivalent to `key`, they will be yielded in
+    /// insertion order.
+    pub fn get_by_key<Q: 'a>(&'a self, key: &Q) -> impl 'a + Iterator<Item = &'a V>
+    where
+        Q: Ord + ?Sized,
+        K: Borrow<Q>,
+    {
+        self.get_by_key_enumerated(key).map(|(_, v)| v)
+    }
+
+    /// Returns an iterator over the items in the map that are equal to `key` along with their
+    /// indices.
+    ///
+    /// If there are multiple items that are equivalent to `key`, they will be yielded in
+    /// insertion order.
+    pub fn get_by_key_enumerated<Q>(&self, key: &Q) -> impl '_ + Iterator<Item = (I, &V)>
+    where
+        Q: Ord + ?Sized,
+        K: Borrow<Q>,
+    {
+        // FIXME: This should be in the standard library as `equal_range`. See rust-lang/rfcs#2184.
+        match self.binary_search_idx(key) {
+            Err(_) => self.idxs_to_items_enumerated(&[]),
+
+            Ok(idx) => {
+                let start = self.find_lower_bound(key, idx);
+                let end = self.find_upper_bound(key, idx);
+                self.idxs_to_items_enumerated(&self.idx_sorted_by_item_key[start..end])
+            }
+        }
+    }
+
+    fn binary_search_idx<Q>(&self, key: &Q) -> Result<usize, usize>
+    where
+        Q: Ord + ?Sized,
+        K: Borrow<Q>,
+    {
+        self.idx_sorted_by_item_key.binary_search_by(|&idx| self.items[idx].0.borrow().cmp(key))
+    }
+
+    /// Returns the index into the `idx_sorted_by_item_key` array of the first item equal to
+    /// `key`.
+    ///
+    /// `initial` must be an index into that same array for an item that is equal to `key`.
+    fn find_lower_bound<Q>(&self, key: &Q, initial: usize) -> usize
+    where
+        Q: Ord + ?Sized,
+        K: Borrow<Q>,
+    {
+        debug_assert!(self.items[self.idx_sorted_by_item_key[initial]].0.borrow() == key);
+
+        // FIXME: At present, this uses linear search, meaning lookup is only `O(log n)` if duplicate
+        // entries are rare. It would be better to start with a linear search for the common case but
+        // fall back to an exponential search if many duplicates are found. This applies to
+        // `upper_bound` as well.
+        let mut start = initial;
+        while start != 0 && self.items[self.idx_sorted_by_item_key[start - 1]].0.borrow() == key {
+            start -= 1;
+        }
+
+        start
+    }
+
+    /// Returns the index into the `idx_sorted_by_item_key` array of the first item greater than
+    /// `key`, or `self.len()` if no such item exists.
+    ///
+    /// `initial` must be an index into that same array for an item that is equal to `key`.
+    fn find_upper_bound<Q>(&self, key: &Q, initial: usize) -> usize
+    where
+        Q: Ord + ?Sized,
+        K: Borrow<Q>,
+    {
+        debug_assert!(self.items[self.idx_sorted_by_item_key[initial]].0.borrow() == key);
+
+        // See the FIXME for `find_lower_bound`.
+        let mut end = initial + 1;
+        let len = self.items.len();
+        while end < len && self.items[self.idx_sorted_by_item_key[end]].0.borrow() == key {
+            end += 1;
+        }
+
+        end
+    }
+
+    fn idxs_to_items_enumerated(&'a self, idxs: &'a [I]) -> impl 'a + Iterator<Item = (I, &'a V)> {
+        idxs.iter().map(move |&idx| (idx, &self.items[idx].1))
+    }
+}
+
+impl<I: Idx, K: Eq, V: Eq> Eq for SortedIndexMultiMap<I, K, V> {}
+impl<I: Idx, K: PartialEq, V: PartialEq> PartialEq for SortedIndexMultiMap<I, K, V> {
+    fn eq(&self, other: &Self) -> bool {
+        // No need to compare the sorted index. If the items are the same, the index will be too.
+        self.items == other.items
+    }
+}
+
+impl<I: Idx, K, V> Hash for SortedIndexMultiMap<I, K, V>
+where
+    K: Hash,
+    V: Hash,
+{
+    fn hash<H: Hasher>(&self, hasher: &mut H) {
+        self.items.hash(hasher)
+    }
+}
+impl<I: Idx, K, V, C> HashStable<C> for SortedIndexMultiMap<I, K, V>
+where
+    K: HashStable<C>,
+    V: HashStable<C>,
+{
+    fn hash_stable(&self, ctx: &mut C, hasher: &mut StableHasher) {
+        self.items.hash_stable(ctx, hasher)
+    }
+}
+
+impl<I: Idx, K: Ord, V> FromIterator<(K, V)> for SortedIndexMultiMap<I, K, V> {
+    fn from_iter<J>(iter: J) -> Self
+    where
+        J: IntoIterator<Item = (K, V)>,
+    {
+        let items = IndexVec::from_iter(iter);
+        let mut idx_sorted_by_item_key: Vec<_> = items.indices().collect();
+
+        // `sort_by_key` is stable, so insertion order is preserved for duplicate items.
+        idx_sorted_by_item_key.sort_by_key(|&idx| &items[idx].0);
+
+        SortedIndexMultiMap { items, idx_sorted_by_item_key }
+    }
+}
+
+impl<I: Idx, K, V> std::ops::Index<I> for SortedIndexMultiMap<I, K, V> {
+    type Output = V;
+
+    fn index(&self, idx: I) -> &Self::Output {
+        &self.items[idx].1
+    }
+}
+
+#[cfg(tests)]
+mod tests;
--- a/src/librustc_data_structures/sorted_map/tests.rs
+++ b/src/librustc_data_structures/sorted_map/tests.rs
@ -1,4 +1,30 @@
-use super::SortedMap;
+use super::{SortedIndexMultiMap, SortedMap};
+
+#[test]
+fn test_sorted_index_multi_map() {
+    let entries: Vec<_> = vec![(2, 0), (1, 0), (2, 1), (3, 0), (2, 2)];
+    let set: SortedIndexMultiMap<usize, _, _> = entries.iter().copied().collect();
+
+    // Insertion order is preserved.
+    assert!(entries.iter().map(|(ref k, ref v)| (k, v)).eq(set.iter()));
+
+    // Indexing
+    for (i, expect) in entries.iter().enumerate() {
+        assert_eq!(set[i], expect.1);
+    }
+
+    // `get_by_key` works.
+    assert_eq!(set.get_by_key(&3).copied().collect::<Vec<_>>(), vec![0]);
+    assert!(set.get_by_key(&4).next().is_none());
+
+    // `get_by_key` returns items in insertion order.
+    let twos: Vec<_> = set.get_by_key_enumerated(&2).collect();
+    let idxs: Vec<usize> = twos.iter().map(|(i, _)| *i).collect();
+    let values: Vec<usize> = twos.iter().map(|(_, &v)| v).collect();
+
+    assert_eq!(idxs, vec![0, 2, 4]);
+    assert_eq!(values, vec![0, 1, 2]);
+}

 #[test]
 fn test_insert_and_iter() {