std: Remove implicit shrinking from hash_map.

Implements fn shrink_to_fit for HashMap.
Piotr Czarnecki 2014-11-17 14:23:21 +01:00
parent 29e928f2ba
commit 72c96badd2
2 changed files with 238 additions and 137 deletions

src/libstd/collections/hash/map.rs

@@ -23,7 +23,7 @@ use hash::{Hash, Hasher, RandomSipHasher};
 use iter::{mod, Iterator, IteratorExt, FromIterator, Extend};
 use kinds::Sized;
 use mem::{mod, replace};
-use num::UnsignedInt;
+use num::{Int, UnsignedInt};
 use ops::{Deref, Index, IndexMut};
 use option::{Some, None, Option};
 use result::{Result, Ok, Err};
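Review note: the widened import is presumably what brings the checked integer arithmetic used later in this diff into scope (`checked_next_power_of_two` below via `UnsignedInt`, plus the `Int` methods such as `checked_add`). A minimal sketch in the same pre-1.0 dialect, with a hypothetical helper name:

    use std::num::UnsignedInt;
    use std::uint;

    // Hypothetical helper mirroring the rounding in the new constructor.
    fn round_up_capacity(requested: uint) -> Option<uint> {
        // None when the next power of two no longer fits in a uint.
        requested.checked_next_power_of_two()
    }

    fn main() {
        assert_eq!(round_up_capacity(100), Some(128));
        assert_eq!(round_up_capacity(uint::MAX), None);
    }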
@@ -41,45 +41,53 @@ use super::table::{
     SafeHash
 };
-// FIXME(conventions): update capacity management to match other collections (no auto-shrink)
 
 const INITIAL_LOG2_CAP: uint = 5;
 pub const INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5
 
 /// The default behavior of HashMap implements a load factor of 90.9%.
-/// This behavior is characterized by the following conditions:
+/// This behavior is characterized by the following condition:
 ///
-/// - if size > 0.909 * capacity: grow
-/// - if size < 0.25 * capacity: shrink (if this won't bring capacity lower
-///   than the minimum)
+/// - if size > 0.909 * capacity: grow the map
 #[deriving(Clone)]
-struct DefaultResizePolicy {
-    /// Doubled minimal capacity. The capacity must never drop below
-    /// the minimum capacity. (The check happens before the capacity
-    /// is potentially halved.)
-    minimum_capacity2: uint
-}
+struct DefaultResizePolicy;
 
 impl DefaultResizePolicy {
-    fn new(new_capacity: uint) -> DefaultResizePolicy {
-        DefaultResizePolicy {
-            minimum_capacity2: new_capacity << 1
-        }
+    fn new() -> DefaultResizePolicy {
+        DefaultResizePolicy
     }
 
     #[inline]
-    fn capacity_range(&self, new_size: uint) -> (uint, uint) {
-        // Here, we are rephrasing the logic by specifying the ranges:
+    fn min_capacity(&self, usable_size: uint) -> uint {
+        // Here, we are rephrasing the logic by specifying the lower limit
+        // on capacity:
         //
-        // - if `size * 1.1 < cap < size * 4`: don't resize
-        // - if `cap < minimum_capacity * 2`: don't shrink
-        // - otherwise, resize accordingly
-        ((new_size * 11) / 10, max(new_size << 2, self.minimum_capacity2))
+        // - if `cap < size * 1.1`: grow the map
+        usable_size * 11 / 10
     }
 
+    /// An inverse of `min_capacity`, approximately.
     #[inline]
-    fn reserve(&mut self, new_capacity: uint) {
-        self.minimum_capacity2 = new_capacity << 1;
+    fn usable_capacity(&self, cap: uint) -> uint {
+        // As the number of entries approaches usable capacity,
+        // min_capacity(size) must be smaller than the internal capacity,
+        // so that the map is not resized:
+        // `min_capacity(usable_capacity(x)) <= x`.
+        // The left-hand side can only be smaller due to flooring by integer
+        // division.
+        //
+        // This doesn't have to be checked for overflow since allocation size
+        // in bytes will overflow earlier than multiplication by 10.
+        cap * 10 / 11
     }
 }
 
+#[test]
+fn test_resize_policy() {
+    use prelude::*;
+    let rp = DefaultResizePolicy;
+    for n in range(0u, 1000) {
+        assert!(rp.min_capacity(rp.usable_capacity(n)) <= n);
+        assert!(rp.usable_capacity(rp.min_capacity(n)) <= n);
+    }
+}
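Review note: the two policy methods are built as approximate inverses under integer flooring, which is exactly what the in-tree test above asserts. A standalone sketch of the arithmetic (free functions mirroring the methods):

    fn min_capacity(usable_size: uint) -> uint { usable_size * 11 / 10 }
    fn usable_capacity(cap: uint) -> uint { cap * 10 / 11 }

    fn main() {
        // A 10/11 load factor is roughly 90.9%: a table of 32 buckets
        // accepts 29 entries before it must grow.
        assert_eq!(usable_capacity(32), 29);
        // Those 29 entries in turn demand at least 31 buckets, so a
        // 32-bucket table is never resized by this round trip.
        assert_eq!(min_capacity(29), 31);
        // Flooring means a round trip can only lose slots, never gain them.
        for n in range(0u, 1000) {
            assert!(min_capacity(usable_capacity(n)) <= n);
            assert!(usable_capacity(min_capacity(n)) <= n);
        }
    }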
@@ -282,7 +290,6 @@ pub struct HashMap<K, V, H = RandomSipHasher> {
     table: RawTable<K, V>,
 
-    // We keep this at the end since it might as well have tail padding.
     resize_policy: DefaultResizePolicy,
 }
@@ -529,7 +536,7 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     pub fn with_hasher(hasher: H) -> HashMap<K, V, H> {
         HashMap {
             hasher: hasher,
-            resize_policy: DefaultResizePolicy::new(INITIAL_CAPACITY),
+            resize_policy: DefaultResizePolicy::new(),
             table: RawTable::new(0),
         }
     }
@@ -554,17 +561,32 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// ```
     #[inline]
     pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashMap<K, V, H> {
-        let cap = max(INITIAL_CAPACITY, capacity).next_power_of_two();
+        let resize_policy = DefaultResizePolicy::new();
+        let min_cap = max(INITIAL_CAPACITY, resize_policy.min_capacity(capacity));
+        let internal_cap = min_cap.checked_next_power_of_two().expect("capacity overflow");
+        assert!(internal_cap >= capacity, "capacity overflow");
         HashMap {
             hasher: hasher,
-            resize_policy: DefaultResizePolicy::new(cap),
-            table: RawTable::new(cap),
+            resize_policy: resize_policy,
+            table: RawTable::new(internal_cap),
         }
     }
 
-    /// The hashtable will never try to shrink below this size. You can use
-    /// this function to reduce reallocations if your hashtable frequently
-    /// grows and shrinks by large amounts.
+    /// Returns the number of elements the map can hold without reallocating.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashMap;
+    /// let map: HashMap<int, int> = HashMap::with_capacity(100);
+    /// assert!(map.capacity() >= 100);
+    /// ```
+    #[inline]
+    #[unstable = "matches collection reform specification, waiting for dust to settle"]
+    pub fn capacity(&self) -> uint {
+        self.resize_policy.usable_capacity(self.table.capacity())
+    }
     ///
     /// This function has no effect on the operational semantics of the
     /// hashtable, only on performance.
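Review note: tracing the new constructor arithmetic for the `with_capacity(100)` doc example (a sketch; the local names mirror the ones above, and since `INITIAL_CAPACITY` is 32 the `max` with it is a no-op here):

    use std::num::UnsignedInt;

    fn main() {
        let capacity = 100u;
        let min_cap = capacity * 11 / 10;                                // 110
        let internal_cap = min_cap.checked_next_power_of_two().unwrap(); // 128
        let usable = internal_cap * 10 / 11;                             // 116
        // 116 >= 100, which is why `assert!(map.capacity() >= 100)` holds.
        assert_eq!((min_cap, internal_cap, usable), (110u, 128u, 116u));
    }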
@@ -579,8 +601,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
-    pub fn reserve(&mut self, new_minimum_capacity: uint) {
-        let cap = max(INITIAL_CAPACITY, new_minimum_capacity).next_power_of_two();
-
-        self.resize_policy.reserve(cap);
-
-        if self.table.capacity() < cap {
-            self.resize(cap);
-        }
@@ -601,94 +621,106 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
             return;
         }
 
-        if new_capacity < old_table.capacity() {
-            // Shrink the table. Naive algorithm for resizing:
-            for (h, k, v) in old_table.into_iter() {
-                self.insert_hashed_nocheck(h, k, v);
-            }
-        } else {
-            // Grow the table.
-            // Specialization of the other branch.
-            let mut bucket = Bucket::first(&mut old_table);
-
-            // "So a few of the first shall be last: for many be called,
-            // but few chosen."
-            //
-            // We'll most likely encounter a few buckets at the beginning that
-            // have their initial buckets near the end of the table. They were
-            // placed at the beginning as the probe wrapped around the table
-            // during insertion. We must skip forward to a bucket that won't
-            // get reinserted too early and won't unfairly steal others' spots.
-            // This eliminates the need for robin hood.
-            loop {
-                bucket = match bucket.peek() {
-                    Full(full) => {
-                        if full.distance() == 0 {
-                            // This bucket occupies its ideal spot.
-                            // It indicates the start of another "cluster".
-                            bucket = full.into_bucket();
-                            break;
-                        }
-                        // Leaving this bucket in the last cluster for later.
-                        full.into_bucket()
-                    }
-                    Empty(b) => {
-                        // Encountered a hole between clusters.
-                        b.into_bucket()
-                    }
-                };
-                bucket.next();
-            }
-
-            // This is how the buckets might be laid out in memory:
-            // ($ marks an initialized bucket)
-            //  ________________
-            // |$$$_$$$$$$_$$$$$|
-            //
-            // But we've skipped the entire initial cluster of buckets
-            // and will continue iteration in this order:
-            //  ________________
-            //     |$$$$$$_$$$$$
-            //                  ^ wrap around once end is reached
-            //  ________________
-            //  $$$_____________|
-            //    ^ exit once table.size == 0
-            loop {
-                bucket = match bucket.peek() {
-                    Full(bucket) => {
-                        let h = bucket.hash();
-                        let (b, k, v) = bucket.take();
-                        self.insert_hashed_ordered(h, k, v);
-                        {
-                            let t = b.table(); // FIXME "lifetime too short".
-                            if t.size() == 0 { break }
-                        };
-                        b.into_bucket()
-                    }
-                    Empty(b) => b.into_bucket()
-                };
-                bucket.next();
-            }
-        }
+        // Grow the table.
+        // Specialization of the other branch.
+        let mut bucket = Bucket::first(&mut old_table);
+
+        // "So a few of the first shall be last: for many be called,
+        // but few chosen."
+        //
+        // We'll most likely encounter a few buckets at the beginning that
+        // have their initial buckets near the end of the table. They were
+        // placed at the beginning as the probe wrapped around the table
+        // during insertion. We must skip forward to a bucket that won't
+        // get reinserted too early and won't unfairly steal others' spots.
+        // This eliminates the need for robin hood.
+        loop {
+            bucket = match bucket.peek() {
+                Full(full) => {
+                    if full.distance() == 0 {
+                        // This bucket occupies its ideal spot.
+                        // It indicates the start of another "cluster".
+                        bucket = full.into_bucket();
+                        break;
+                    }
+                    // Leaving this bucket in the last cluster for later.
+                    full.into_bucket()
+                }
+                Empty(b) => {
+                    // Encountered a hole between clusters.
+                    b.into_bucket()
+                }
+            };
+            bucket.next();
+        }
+
+        // This is how the buckets might be laid out in memory:
+        // ($ marks an initialized bucket)
+        //  ________________
+        // |$$$_$$$$$$_$$$$$|
+        //
+        // But we've skipped the entire initial cluster of buckets
+        // and will continue iteration in this order:
+        //  ________________
+        //     |$$$$$$_$$$$$
+        //                  ^ wrap around once end is reached
+        //  ________________
+        //  $$$_____________|
+        //    ^ exit once table.size == 0
+        loop {
+            bucket = match bucket.peek() {
+                Full(bucket) => {
+                    let h = bucket.hash();
+                    let (b, k, v) = bucket.take();
+                    self.insert_hashed_ordered(h, k, v);
+                    {
+                        let t = b.table(); // FIXME "lifetime too short".
+                        if t.size() == 0 { break }
+                    };
+                    b.into_bucket()
+                }
+                Empty(b) => b.into_bucket()
+            };
+            bucket.next();
+        }
 
         assert_eq!(self.table.size(), old_size);
     }
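Review note: the cluster-skipping trick above in miniature. This is a toy model, not the real table code: `ideal[i]` stands for a full bucket's home slot (so `full.distance() == 0` corresponds to `home == i`), and the function picks the reinsertion start the same way before walking the table once with wrap-around:

    // Toy model: None is an empty bucket; Some(home) is a full bucket
    // whose ideal slot is `home`.
    fn reinsertion_order(ideal: &[Option<uint>]) -> Vec<uint> {
        let n = ideal.len();
        let mut start = 0;
        for i in range(0, n) {
            match ideal[i] {
                // Displacement zero: the head of a cluster.
                Some(home) if home == i => { start = i; break; }
                _ => {}
            }
        }
        // Visit every slot exactly once, wrapping at the end.
        range(0, n).map(|k| (start + k) % n).collect()
    }

    fn main() {
        // Slot 0 holds an entry whose home is slot 3: a cluster that
        // wrapped around the end of the table during insertion.
        let ideal = [Some(3u), None, Some(2), Some(3)];
        // Starting at slot 2 reinserts the wrapped entry last, after the
        // rest of its cluster, so no robin hood stealing is needed.
        assert_eq!(reinsertion_order(&ideal), vec![2u, 3, 0, 1]);
    }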
-    /// Performs any necessary resize operations, such that there's space for
-    /// new_size elements.
-    fn make_some_room(&mut self, new_size: uint) {
-        let (grow_at, shrink_at) = self.resize_policy.capacity_range(new_size);
-        let cap = self.table.capacity();
+    /// Shrinks the capacity of the map as much as possible. It will drop
+    /// down as much as possible while maintaining the internal rules
+    /// and possibly leaving some space in accordance with the resize policy.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashMap;
+    ///
+    /// let mut map: HashMap<int, int> = HashMap::with_capacity(100);
+    /// map.insert(1, 2);
+    /// map.insert(3, 4);
+    /// assert!(map.capacity() >= 100);
+    /// map.shrink_to_fit();
+    /// assert!(map.capacity() >= 2);
+    /// ```
+    #[unstable = "matches collection reform specification, waiting for dust to settle"]
+    pub fn shrink_to_fit(&mut self) {
+        let min_capacity = self.resize_policy.min_capacity(self.len());
+        let min_capacity = max(min_capacity.next_power_of_two(), INITIAL_CAPACITY);
 
         // An invalid value shouldn't make us run out of space.
-        debug_assert!(grow_at >= new_size);
+        debug_assert!(self.len() <= min_capacity);
 
-        if cap <= grow_at {
-            let new_capacity = max(cap << 1, INITIAL_CAPACITY);
-            self.resize(new_capacity);
-        } else if shrink_at <= cap {
-            let new_capacity = cap >> 1;
-            self.resize(new_capacity);
+        if self.table.capacity() != min_capacity {
+            let old_table = replace(&mut self.table, RawTable::new(min_capacity));
+            let old_size = old_table.size();
+
+            // Shrink the table. Naive algorithm for resizing:
+            for (h, k, v) in old_table.into_iter() {
+                self.insert_hashed_nocheck(h, k, v);
+            }
+
+            debug_assert_eq!(self.table.size(), old_size);
         }
     }
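Review note: taken together, this is the behavioral change of the commit: removals never resize, and memory only comes back on request. A usage sketch:

    use std::collections::HashMap;

    fn main() {
        let mut m: HashMap<uint, uint> = HashMap::new();
        for i in range(0u, 1000) { m.insert(i, i); }
        let grown = m.capacity();

        for i in range(0u, 1000) { m.remove(&i); }
        // No implicit shrinking: the table keeps its grown capacity.
        assert_eq!(m.capacity(), grown);

        // Only the explicit call releases the space.
        m.shrink_to_fit();
        assert!(m.capacity() < grown);
    }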
@@ -775,8 +807,7 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
             return None
         }
 
-        let potential_new_size = self.table.size() - 1;
-        self.make_some_room(potential_new_size);
+        self.reserve(1);
 
         match self.search_equiv_mut(k) {
             Some(bucket) => {
@@ -907,12 +938,8 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// Gets the given key's corresponding entry in the map for in-place manipulation
     pub fn entry<'a>(&'a mut self, key: K) -> Entry<'a, K, V> {
-        // Gotta resize now, and we don't know which direction, so try both?
-        let size = self.table.size();
-        self.make_some_room(size + 1);
-        if size > 0 {
-            self.make_some_room(size - 1);
-        }
+        // Gotta resize now.
+        self.reserve(1);
 
         let hash = self.make_hash(&key);
         search_entry_hashed(&mut self.table, hash, key)
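Review note: every mutating entry point now funnels through `reserve(1)`, so growth stays amortized: capacity roughly doubles whenever the 10/11 load factor would be exceeded, keeping the total reinsertion work linear over n inserts. A trace sketch:

    use std::collections::HashMap;

    fn main() {
        let mut m: HashMap<uint, uint> = HashMap::new();
        let mut last_cap = m.capacity();
        for i in range(0u, 200) {
            m.insert(i, i);
            if m.capacity() != last_cap {
                // Prints a short sequence of roughly doubling capacities.
                println!("len = {}, capacity = {}", m.len(), m.capacity());
                last_cap = m.capacity();
            }
        }
    }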
@@ -964,10 +991,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// ```
     #[unstable = "matches collection reform specification, waiting for dust to settle"]
     pub fn clear(&mut self) {
-        // Prevent reallocations from happening from now on. Makes it possible
-        // for the map to be reused but has a downside: reserves permanently.
-        self.resize_policy.reserve(self.table.size());
-
         let cap = self.table.capacity();
         let mut buckets = Bucket::first(&mut self.table);
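Review note: with the permanent-reserve hack gone, `clear` simply empties the buckets. Assuming it keeps the current allocation (as the retained `let cap = self.table.capacity();` suggests), the map can be reused without reallocating, and the space is no longer pinned:

    use std::collections::HashMap;

    fn main() {
        let mut m: HashMap<uint, uint> = HashMap::new();
        for i in range(0u, 1000) { m.insert(i, i); }
        let cap = m.capacity();

        m.clear();
        assert!(m.is_empty());
        // The allocation is kept for reuse after clearing...
        assert_eq!(m.capacity(), cap);

        // ...but can now be reclaimed on request.
        m.shrink_to_fit();
        assert!(m.capacity() < cap);
    }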
@@ -1100,8 +1123,7 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     #[unstable = "matches collection reform specification, waiting for dust to settle"]
     pub fn insert(&mut self, k: K, v: V) -> Option<V> {
         let hash = self.make_hash(&k);
-        let potential_new_size = self.table.size() + 1;
-        self.make_some_room(potential_new_size);
+        self.reserve(1);
 
         let mut retval = None;
         self.insert_or_replace_with(hash, k, v, |_, val_ref, val| {
@@ -1141,9 +1163,6 @@ impl<K: Eq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
             return None
         }
 
-        let potential_new_size = self.table.size() - 1;
-        self.make_some_room(potential_new_size);
-
         self.search_mut(k).map(|bucket| {
             let (_k, val) = pop_internal(bucket);
             val
@@ -1894,7 +1913,7 @@ mod test_map {
     }
 
     #[test]
-    fn test_resize_policy() {
+    fn test_behavior_resize_policy() {
         let mut m = HashMap::new();
         assert_eq!(m.len(), 0);
@@ -1905,7 +1924,7 @@ mod test_map {
         m.remove(&0);
         assert!(m.is_empty());
         let initial_cap = m.table.capacity();
-        m.reserve(initial_cap * 2);
+        m.reserve(initial_cap);
 
         let cap = m.table.capacity();
         assert_eq!(cap, initial_cap * 2);
@@ -1935,15 +1954,55 @@ mod test_map {
             assert_eq!(m.table.capacity(), new_cap);
         }
         // A little more than one quarter full.
-        // Shrinking starts as we remove more elements:
+        m.shrink_to_fit();
         assert_eq!(m.table.capacity(), cap);
+        // again, a little more than half full
         for _ in range(0, cap / 2 - 1) {
             i -= 1;
             m.remove(&i);
         }
+        m.shrink_to_fit();
 
         assert_eq!(m.len(), i);
         assert!(!m.is_empty());
-        assert_eq!(m.table.capacity(), cap);
+        assert_eq!(m.table.capacity(), initial_cap);
     }
 
+    #[test]
+    fn test_reserve_shrink_to_fit() {
+        let mut m = HashMap::new();
+        m.insert(0u, 0u);
+        m.remove(&0);
+        assert!(m.capacity() >= m.len());
+        for i in range(0, 128) {
+            m.insert(i, i);
+        }
+        m.reserve(256);
+
+        let usable_cap = m.capacity();
+        for i in range(128, 128+256) {
+            m.insert(i, i);
+            assert_eq!(m.capacity(), usable_cap);
+        }
+
+        for i in range(100, 128+256) {
+            assert_eq!(m.remove(&i), Some(i));
+        }
+        m.shrink_to_fit();
+
+        assert_eq!(m.len(), 100);
+        assert!(!m.is_empty());
+        assert!(m.capacity() >= m.len());
+
+        for i in range(0, 100) {
+            assert_eq!(m.remove(&i), Some(i));
+        }
+        m.shrink_to_fit();
+        m.insert(0, 0);
+
+        assert_eq!(m.len(), 1);
+        assert!(m.capacity() >= m.len());
+        assert_eq!(m.remove(&0), Some(0));
+    }
 
     #[test]

src/libstd/collections/hash/set.rs

@@ -25,7 +25,6 @@ use result::{Ok, Err};
 use super::map::{HashMap, Entries, MoveEntries, INITIAL_CAPACITY};
 
 // FIXME(conventions): implement BitOr, BitAnd, BitXor, and Sub
-// FIXME(conventions): update capacity management to match other collections (no auto-shrink)
 
 // Future Optimization (FIXME!)
@@ -172,7 +171,28 @@ impl<T: Eq + Hash<S>, S, H: Hasher<S>> HashSet<T, H> {
         HashSet { map: HashMap::with_capacity_and_hasher(capacity, hasher) }
     }
 
-    /// Reserve space for at least `n` elements in the hash table.
+    /// Returns the number of elements the set can hold without reallocating.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// let set: HashSet<int> = HashSet::with_capacity(100);
+    /// assert!(set.capacity() >= 100);
+    /// ```
+    #[inline]
+    #[unstable = "matches collection reform specification, waiting for dust to settle"]
+    pub fn capacity(&self) -> uint {
+        self.map.capacity()
+    }
+
+    /// Reserves capacity for at least `additional` more elements to be inserted
+    /// in the `HashSet`. The collection may reserve more space to avoid
+    /// frequent reallocations.
     ///
+    /// # Panics
+    ///
+    /// Panics if the new allocation size overflows `uint`.
+    ///
     /// # Example
     ///
@@ -181,8 +201,30 @@ impl<T: Eq + Hash<S>, S, H: Hasher<S>> HashSet<T, H> {
     /// let mut set: HashSet<int> = HashSet::new();
     /// set.reserve(10);
     /// ```
-    pub fn reserve(&mut self, n: uint) {
-        self.map.reserve(n)
+    #[unstable = "matches collection reform specification, waiting for dust to settle"]
+    pub fn reserve(&mut self, additional: uint) {
+        self.map.reserve(additional)
     }
+
+    /// Shrinks the capacity of the set as much as possible. It will drop
+    /// down as much as possible while maintaining the internal rules
+    /// and possibly leaving some space in accordance with the resize policy.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    ///
+    /// let mut set: HashSet<int> = HashSet::with_capacity(100);
+    /// set.insert(1);
+    /// set.insert(2);
+    /// assert!(set.capacity() >= 100);
+    /// set.shrink_to_fit();
+    /// assert!(set.capacity() >= 2);
+    /// ```
+    #[unstable = "matches collection reform specification, waiting for dust to settle"]
+    pub fn shrink_to_fit(&mut self) {
+        self.map.shrink_to_fit()
+    }
 
     /// Deprecated: use `contains` and `BorrowFrom`.