From 65d56612bb2e0427ef42ec366977318c10d41539 Mon Sep 17 00:00:00 2001 From: Clark Gaebel Date: Mon, 21 Apr 2014 19:07:59 -0400 Subject: [PATCH] Just some general cleanup in the HashMap module I went through the HashMap module, fixed spelling mistakes, minor inefficiencies, added tests, and other trivial changes. --- src/libcollections/hashmap.rs | 280 ++++++++++++++++++++-------------- 1 file changed, 168 insertions(+), 112 deletions(-) diff --git a/src/libcollections/hashmap.rs b/src/libcollections/hashmap.rs index 03b486e628c..5b65fe46089 100644 --- a/src/libcollections/hashmap.rs +++ b/src/libcollections/hashmap.rs @@ -43,7 +43,8 @@ mod table { use std::ptr; use std::ptr::RawPtr; use std::rt::global_heap; - use std::intrinsics::{size_of, min_align_of, transmute, move_val_init}; + use std::intrinsics::{size_of, min_align_of, transmute}; + use std::intrinsics::{move_val_init, set_memory}; use std::iter::{Iterator, range_step_inclusive}; static EMPTY_BUCKET: u64 = 0u64; @@ -52,15 +53,15 @@ mod table { /// optimized arrays of hashes, keys, and values. /// /// This design uses less memory and is a lot faster than the naive - /// `~[Option]`, because we don't pay for the overhead of an + /// `Vec>`, because we don't pay for the overhead of an /// option on every element, and we get a generally more cache-aware design. /// /// Key invariants of this structure: /// /// - if hashes[i] == EMPTY_BUCKET, then keys[i] and vals[i] have /// 'undefined' contents. Don't read from them. This invariant is - /// enforced outside this module with the [EmptyIndex], [FullIndex], - /// and [SafeHash] types/concepts. + /// enforced outside this module with the `EmptyIndex`, `FullIndex`, + /// and `SafeHash` types. /// /// - An `EmptyIndex` is only constructed for a bucket at an index with /// a hash of EMPTY_BUCKET. @@ -69,8 +70,9 @@ mod table { /// non-EMPTY_BUCKET hash. /// /// - A `SafeHash` is only constructed for non-`EMPTY_BUCKET` hash. We get - /// around hashes of zero by changing them to 0x800_0000, which will - /// likely hash to the same bucket, but not be represented as "empty". + /// around hashes of zero by changing them to 0x8000_0000_0000_0000, + /// which will likely map to the same bucket, while not being confused + /// with "empty". /// /// - All three "arrays represented by pointers" are the same length: /// `capacity`. This is set at creation and never changes. The arrays @@ -111,25 +113,27 @@ mod table { /// Represents an index into a `RawTable` with no key or value in it. pub struct EmptyIndex { - idx: int, + idx: int, nocopy: marker::NoCopy, } /// Represents an index into a `RawTable` with a key, value, and hash /// in it. pub struct FullIndex { - idx: int, - hash: SafeHash, + idx: int, + hash: SafeHash, nocopy: marker::NoCopy, } impl FullIndex { /// Since we get the hash for free whenever we check the bucket state, - /// this function is provided for fast access, letting us avoid making + /// this function is provided for fast access, letting us avoid /// redundant trips back to the hashtable. + #[inline(always)] pub fn hash(&self) -> SafeHash { self.hash } /// Same comment as with `hash`. + #[inline(always)] pub fn raw_index(&self) -> uint { self.idx as uint } } @@ -141,7 +145,8 @@ mod table { Full(FullIndex), } - /// A hash that is not zero, since we use that to represent empty buckets. + /// A hash that is not zero, since we use a hash of zero to represent empty + /// buckets. 
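A note on the invariant this hunk re-documents: `EMPTY_BUCKET` is the hash value 0, and `SafeHash` (defined just below) is what makes it impossible to store a real zero hash that collides with the sentinel. A minimal standalone sketch of the trick, written in modern Rust for clarity since the patch targets pre-1.0 Rust; `make_safe_hash` and `DefaultHasher` are illustrative stand-ins, not the patch's API:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Sentinel stored in the hash array for an empty bucket.
const EMPTY_BUCKET: u64 = 0;

/// A hash that is guaranteed non-zero, so it can never collide with
/// the `EMPTY_BUCKET` sentinel.
#[derive(Copy, Clone, PartialEq, Eq)]
struct SafeHash {
    hash: u64,
}

impl SafeHash {
    fn inspect(&self) -> u64 {
        self.hash
    }
}

/// If a key really hashes to 0, replace the hash with a fixed non-zero
/// value. Setting only the top bit (0x8000_0000_0000_0000) leaves the
/// low bits intact, and a power-of-two table indexes with exactly those
/// low bits, so the key still probes the same bucket sequence.
fn make_safe_hash<T: Hash>(t: &T) -> SafeHash {
    let mut state = DefaultHasher::new();
    t.hash(&mut state);
    match state.finish() {
        EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 },
        h => SafeHash { hash: h },
    }
}

fn main() {
    let h = make_safe_hash(&"example key");
    assert_ne!(h.inspect(), EMPTY_BUCKET);
}
```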
#[deriving(Eq)] pub struct SafeHash { hash: u64, @@ -149,6 +154,7 @@ mod table { impl SafeHash { /// Peek at the hash value, which is guaranteed to be non-zero. + #[inline(always)] pub fn inspect(&self) -> u64 { self.hash } } @@ -171,12 +177,16 @@ mod table { #[test] fn test_rounding() { - assert!(round_up_to_next(0, 4) == 0); - assert!(round_up_to_next(1, 4) == 4); - assert!(round_up_to_next(2, 4) == 4); - assert!(round_up_to_next(3, 4) == 4); - assert!(round_up_to_next(4, 4) == 4); - assert!(round_up_to_next(5, 4) == 8); + assert_eq!(round_up_to_next(0, 4), 0); + assert_eq!(round_up_to_next(1, 4), 4); + assert_eq!(round_up_to_next(2, 4), 4); + assert_eq!(round_up_to_next(3, 4), 4); + assert_eq!(round_up_to_next(4, 4), 4); + assert_eq!(round_up_to_next(5, 4), 8); + } + + fn has_alignment(n: uint, alignment: uint) -> bool { + round_up_to_next(n, alignment) == n } // Returns a tuple of (minimum required malloc alignment, hash_offset, @@ -200,6 +210,13 @@ mod table { (min_align, hash_offset, keys_offset, vals_offset, end_of_vals) } + #[test] + fn test_offset_calculation() { + assert_eq!(calculate_offsets(128, 8, 15, 1, 4, 4 ), (8, 0, 128, 144, 148)); + assert_eq!(calculate_offsets(3, 1, 2, 1, 1, 1 ), (1, 0, 3, 5, 6)); + assert_eq!(calculate_offsets(6, 2, 12, 4, 24, 8), (8, 0, 8, 24, 48)); + } + impl RawTable { /// Does not initialize the buckets. The caller should ensure they, @@ -213,9 +230,9 @@ mod table { capacity.checked_mul(&size_of::< V >()).expect("capacity overflow"); // Allocating hashmaps is a little tricky. We need to allocate three - // arrays here, but since we know their sizes and alignments up front, - // we could theoretically allocate only a single array, and then have - // the subarrays just point into it. + // arrays, but since we know their sizes and alignments up front, + // we just allocate a single array, and then have the subarrays + // point into it. // // This is great in theory, but in practice getting the alignment // right is a little subtle. Therefore, calculating offsets has been @@ -231,8 +248,7 @@ mod table { // FIXME #13094: If malloc was not at as aligned as we expected, // our offset calculations are just plain wrong. We could support // any alignment if we switched from `malloc` to `posix_memalign`. - assert!(round_up_to_next(buffer as uint, malloc_alignment) - == (buffer as uint)); + assert!(has_alignment(buffer as uint, malloc_alignment)); let hashes = buffer.offset(hash_offset as int) as *mut u64; let keys = buffer.offset(keys_offset as int) as *mut K; @@ -247,26 +263,20 @@ mod table { } } - - /// Creates a new raw table from a given capacity. All buckets are /// initially empty. pub fn new(capacity: uint) -> RawTable { unsafe { let ret = RawTable::new_uninitialized(capacity); - - for i in range(0, ret.capacity() as int) { - *ret.hashes.offset(i) = EMPTY_BUCKET; - } - + set_memory(ret.hashes, 0u8, capacity); ret } } /// Reads a bucket at a given index, returning an enum indicating whether /// there's anything there or not. You need to match on this enum to get - /// the appropriate types to pass on to most of the rest of the functions - /// in this module. + /// the appropriate types to pass on to most of the other functions in + /// this module. 
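The `has_alignment` helper and `test_offset_calculation` above pin down the layout math behind the single-allocation scheme. Here is a safe-Rust reconstruction that reproduces all three test vectors from the diff; the argument order and the `min_align` rule are inferred from those vectors, so treat it as an illustration rather than the exact libcollections code:

```rust
/// Round `n` up to the next multiple of `align` (the table code only
/// ever uses power-of-two alignments, but this works for any align > 0).
fn round_up_to_next(n: usize, align: usize) -> usize {
    (n + align - 1) / align * align
}

fn has_alignment(n: usize, align: usize) -> bool {
    round_up_to_next(n, align) == n
}

/// Lay out `|hashes|keys|vals|` inside one allocation: each array starts
/// at the next offset satisfying its own alignment. Returns
/// (min malloc alignment, hash offset, keys offset, vals offset, total size).
fn calculate_offsets(
    hash_size: usize, hash_align: usize,
    keys_size: usize, keys_align: usize,
    vals_size: usize, vals_align: usize,
) -> (usize, usize, usize, usize, usize) {
    let min_align = hash_align.max(keys_align).max(vals_align);
    let hash_offset = 0;
    let keys_offset = round_up_to_next(hash_offset + hash_size, keys_align);
    let vals_offset = round_up_to_next(keys_offset + keys_size, vals_align);
    let end_of_vals = vals_offset + vals_size;
    (min_align, hash_offset, keys_offset, vals_offset, end_of_vals)
}

fn main() {
    // The three vectors from `test_offset_calculation` in the diff.
    assert_eq!(calculate_offsets(128, 8, 15, 1, 4, 4), (8, 0, 128, 144, 148));
    assert_eq!(calculate_offsets(3, 1, 2, 1, 1, 1), (1, 0, 3, 5, 6));
    assert_eq!(calculate_offsets(6, 2, 12, 4, 24, 8), (8, 0, 8, 24, 48));
    assert!(has_alignment(144, 4));
}
```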
pub fn peek(&self, index: uint) -> BucketState { // FIXME #12049 if cfg!(test) { assert!(index < self.capacity) } @@ -279,13 +289,13 @@ mod table { match hash { EMPTY_BUCKET => Empty(EmptyIndex { - idx: idx, + idx: idx, nocopy: nocopy }), full_hash => Full(FullIndex { - idx: idx, - hash: SafeHash { hash: full_hash }, + idx: idx, + hash: SafeHash { hash: full_hash }, nocopy: nocopy, }) } @@ -321,13 +331,6 @@ mod table { -> (&'a mut SafeHash, &'a mut K, &'a mut V) { let idx = index.idx; - // I'm totally abusing the fact that a pointer to any u64 in the - // hashtable at a full index is a safe hash. Thanks to `SafeHash` - // just being a wrapper around u64, this is true. It's just really - // really really really unsafe. However, the exposed API is now - // impossible to get wrong. You cannot insert an empty hash into - // this slot now. - unsafe { // FIXME #12049 if cfg!(test) { assert!(*self.hashes.offset(idx) != EMPTY_BUCKET) } @@ -340,8 +343,8 @@ mod table { /// Puts a key and value pair, along with the key's hash, into a given /// index in the hashtable. Note how the `EmptyIndex` is 'moved' into this /// function, because that slot will no longer be empty when we return! - /// Because we know this, a FullIndex is returned for later use, pointing - /// to the newly-filled slot in the hashtable. + /// A FullIndex is returned for later use, pointing to the newly-filled + /// slot in the hashtable. /// /// Use `make_hash` to construct a `SafeHash` to pass to this function. pub fn put(&mut self, index: EmptyIndex, hash: SafeHash, k: K, v: V) -> FullIndex { @@ -349,7 +352,7 @@ mod table { unsafe { // FIXME #12049 - if cfg!(test) { assert!(*self.hashes.offset(idx) == EMPTY_BUCKET) } + if cfg!(test) { assert_eq!(*self.hashes.offset(idx), EMPTY_BUCKET) } *self.hashes.offset(idx) = hash.inspect(); move_val_init(&mut *self.keys.offset(idx), k); move_val_init(&mut *self.vals.offset(idx), v); @@ -371,9 +374,7 @@ mod table { // FIXME #12049 if cfg!(test) { assert!(*self.hashes.offset(idx) != EMPTY_BUCKET) } - let hash_ptr = self.hashes.offset(idx); - - *hash_ptr = EMPTY_BUCKET; + *self.hashes.offset(idx) = EMPTY_BUCKET; // Drop the mutable constraint. let keys = self.keys as *K; @@ -400,31 +401,48 @@ mod table { } pub fn iter<'a>(&'a self) -> Entries<'a, K, V> { - Entries { table: self, idx: 0 } + Entries { table: self, idx: 0, elems_seen: 0 } } pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> { - MutEntries { table: self, idx: 0 } + MutEntries { table: self, idx: 0, elems_seen: 0 } } pub fn move_iter(self) -> MoveEntries { - MoveEntries { table: self, idx: 0 } + MoveEntries { table: self, idx: 0, elems_seen: 0 } } } + // `read_all_mut` casts a `*u64` to a `*SafeHash`. Since we statically + // ensure that a `FullIndex` points to an index with a non-zero hash, + // and a `SafeHash` is just a `u64` with a different name, this is + // safe. + // + // This test ensures that a `SafeHash` really IS the same size as a + // `u64`. If you need to change the size of `SafeHash` (and + // consequently made this test fail), `read_all_mut` needs to be + // modified to no longer assume this. 
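About the `elems_seen` counter threaded through `Entries`, `MutEntries`, and `MoveEntries` above: the old `size_hint` subtracted `idx`, the number of buckets scanned, from the element count, which undercounts the remaining elements (and can underflow) as soon as empty buckets have been skipped. A self-contained sketch of the corrected accounting over a toy `Option` slice, in modern Rust; the real iterators walk the raw table, but the bookkeeping is the same:

```rust
/// Iterator over the occupied slots of a sparse table. As in the patch,
/// `size_hint` must subtract the number of *elements yielded*, not the
/// number of *buckets scanned*: the two drift apart at every empty bucket.
struct Occupied<'a, T> {
    slots: &'a [Option<T>],
    idx: usize,        // next bucket to scan
    elems_seen: usize, // elements yielded so far
    size: usize,       // total number of occupied slots
}

impl<'a, T> Iterator for Occupied<'a, T> {
    type Item = &'a T;

    fn next(&mut self) -> Option<&'a T> {
        let slots = self.slots;
        while self.idx < slots.len() {
            let i = self.idx;
            self.idx += 1;
            if let Some(v) = slots[i].as_ref() {
                self.elems_seen += 1;
                return Some(v);
            }
        }
        None
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.size - self.elems_seen;
        (remaining, Some(remaining))
    }
}

fn main() {
    let slots = [None, Some(1), None, Some(2), Some(3), None];
    let mut it = Occupied { slots: &slots, idx: 0, elems_seen: 0, size: 3 };
    let _ = it.next(); // yields 1 after scanning two buckets
    // Bucket-based accounting would report (1, Some(1)) here; counting
    // elements gives the exact remaining count.
    assert_eq!(it.size_hint(), (2, Some(2)));
}
```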
+ #[test] + fn can_alias_safehash_as_u64() { + unsafe { assert_eq!(size_of::(), size_of::()) }; + } + pub struct Entries<'a, K, V> { table: &'a RawTable, idx: uint, + elems_seen: uint, } pub struct MutEntries<'a, K, V> { table: &'a mut RawTable, idx: uint, + elems_seen: uint, } pub struct MoveEntries { table: RawTable, idx: uint, + elems_seen: uint, } impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> { @@ -435,7 +453,10 @@ mod table { match self.table.peek(i) { Empty(_) => {}, - Full(idx) => return Some(self.table.read(&idx)) + Full(idx) => { + self.elems_seen += 1; + return Some(self.table.read(&idx)); + } } } @@ -443,7 +464,7 @@ mod table { } fn size_hint(&self) -> (uint, Option) { - let size = self.table.size() - self.idx; + let size = self.table.size() - self.elems_seen; (size, Some(size)) } } @@ -460,7 +481,8 @@ mod table { // error: lifetime of `self` is too short to guarantee its contents // can be safely reborrowed Full(idx) => unsafe { - return Some(transmute(self.table.read_mut(&idx))) + self.elems_seen += 1; + return Some(transmute(self.table.read_mut(&idx))); } } } @@ -469,7 +491,7 @@ mod table { } fn size_hint(&self) -> (uint, Option) { - let size = self.table.size() - self.idx; + let size = self.table.size() - self.elems_seen; (size, Some(size)) } } @@ -526,18 +548,14 @@ mod table { } } - - #[unsafe_destructor] impl Drop for RawTable { fn drop(&mut self) { - // Ideally, this should be in reverse, since we're likely to have - // partially taken some elements out with `.move_iter()` from the - // front. + // This is in reverse because we're likely to have partially taken + // some elements out with `.move_iter()` from the front. for i in range_step_inclusive(self.capacity as int - 1, 0, -1) { // Check if the size is 0, so we don't do a useless scan when // dropping empty tables such as on resize. - if self.size == 0 { break } match self.peek(i as uint) { @@ -546,7 +564,7 @@ mod table { } } - assert!(self.size == 0); + assert_eq!(self.size, 0); unsafe { libc::free(self.hashes as *mut libc::c_void); @@ -637,19 +655,12 @@ static INITIAL_LOAD_FACTOR: Fraction = (9, 10); // This would definitely be an avenue worth exploring if people start complaining // about the size of rust executables. // -// There's also two optimizations that have been omitted regarding how the -// hashtable allocates. The first is that a hashtable which never has an element -// inserted should not allocate. I'm suspicious of this one, because supporting -// that internally gains no performance over just using an -// `Option>`, and is significantly more complicated. -// -// The second omitted allocation optimization is that right now we allocate three -// arrays to back the hashtable. This is wasteful. In theory, we only need one -// array, and each of the three original arrays can just be slices of it. This -// would reduce the pressure on the allocator, and will play much nicer with the -// rest of the system. An initial implementation is commented out in -// `table::RawTable::new`, but I'm not confident it works for all sane alignments, -// especially if a type needs more alignment than `malloc` provides. +// There's also an "optimization" that has been omitted regarding how the +// hashtable allocates. The vector type has set the expectation that a hashtable +// which never has an element inserted should not allocate. I'm suspicious of +// implementing this for hashtables, because supporting it has no performance +// benefit over using an `Option>`, and is significantly more +// complicated. 
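The rewritten `Drop` comment above now matches the code: the scan really does run back to front, since `.move_iter()` consumes from the front and any leftovers cluster at the back, and the `size == 0` check makes dropping an already-drained or empty table (such as the old table after a resize) exit immediately. A toy version of that loop in safe, modern Rust; `clear_in_reverse` is a hypothetical stand-in for the destructor body:

```rust
/// Sketch of the drop strategy: scan buckets in reverse and stop as soon
/// as the live-element count hits zero, so an empty table scans nothing.
struct Table<T> {
    slots: Vec<Option<T>>,
    size: usize, // number of occupied slots
}

impl<T> Table<T> {
    fn clear_in_reverse(&mut self) {
        for i in (0..self.slots.len()).rev() {
            if self.size == 0 {
                break; // nothing left: skip the remaining (empty) buckets
            }
            if self.slots[i].take().is_some() {
                self.size -= 1;
            }
        }
        assert_eq!(self.size, 0);
    }
}

fn main() {
    let mut t = Table { slots: vec![None, Some("a"), Some("b"), None], size: 2 };
    t.clear_in_reverse();
    assert!(t.slots.iter().all(|s| s.is_none()));
}
```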
 /// A hash map implementation which uses linear probing with Robin
 /// Hood bucket stealing.
@@ -745,7 +756,7 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     // This exploits the power-of-two size of the hashtable. As long as this
     // is always true, we can use a bitmask of cap-1 to do modular arithmetic.
     //
-    // Prefer to use this with increasing values of `idx` rather than repeatedly
+    // Prefer using this with increasing values of `idx` rather than repeatedly
     // calling `probe_next`. This reduces data-dependencies between loops, which
     // can help the optimizer, and certainly won't hurt it. `probe_next` is
     // simply for convenience, and is no more efficient than `probe`.
@@ -756,7 +767,7 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
         ((hash.inspect() as uint) + idx) & hash_mask
     }

-    // Generate the next probe in a sequence. Prefer to use 'probe' by itself,
+    // Generate the next probe in a sequence. Prefer using 'probe' by itself,
     // but this can sometimes be useful.
     fn probe_next(&self, probe: uint) -> uint {
         let hash_mask = self.table.capacity() - 1;
@@ -804,7 +815,7 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
             if self.bucket_distance(&idx) < num_probes { return None }

             // If the hash doesn't match, it can't be this one..
-            if hash != &idx.hash() { continue }
+            if *hash != idx.hash() { continue }

             let (k, _) = self.table.read(&idx);
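Two facts carry the `search_hashed` hunk above: on a power-of-two table, probing is just `(hash + i) & (cap - 1)`, and the Robin Hood invariant lets a lookup report a miss as soon as it reaches an element displaced less far than the lookup's own probe count. A toy, hash-only sketch in modern Rust (the real search also compares keys once the hashes match):

```rust
/// Linear probing on a power-of-two table: cheap modular arithmetic.
fn probe(hash: u64, idx: usize, capacity: usize) -> usize {
    debug_assert!(capacity.is_power_of_two());
    (hash as usize).wrapping_add(idx) & (capacity - 1)
}

/// How far bucket `b` is from the bucket `hash` would ideally land in.
fn bucket_distance(hash: u64, b: usize, capacity: usize) -> usize {
    b.wrapping_sub(hash as usize) & (capacity - 1)
}

/// Robin Hood invariant: no stored element sits further from its ideal
/// bucket than necessary. So if probe number `i` reaches an element that
/// is displaced less than `i`, the key we want cannot appear later in
/// the sequence, and the search can stop early.
fn search(table: &[Option<(u64, &str)>], hash: u64) -> Option<usize> {
    let cap = table.len();
    for i in 0..cap {
        let b = probe(hash, i, cap);
        match table[b] {
            None => return None, // empty bucket: the key is absent
            Some((stored_hash, _)) => {
                if bucket_distance(stored_hash, b, cap) < i {
                    return None; // Robin Hood early exit
                }
                if stored_hash == hash {
                    return Some(b);
                }
            }
        }
    }
    None
}

fn main() {
    // Capacity 4: an element with hash 5 lives in bucket 5 & 3 == 1.
    let table = [None, Some((5u64, "five")), None, None];
    assert_eq!(search(&table, 5), Some(1));
    assert_eq!(search(&table, 9), None); // probes bucket 1, then hits empty bucket 2
}
```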
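The hunks below rework the whole insert path around a hash-once pattern: `find_or_insert` and friends now call `make_hash` a single time and hand the result to both `search_hashed` and `insert_hashed`, so a miss no longer rehashes the key. A toy map in modern Rust showing just that shape; `ToyMap` and its methods are illustrative, not the real table:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// A deliberately naive association list keyed by (hash, key).
struct ToyMap<K, V> {
    entries: Vec<(u64, K, V)>,
}

impl<K: Hash + Eq, V> ToyMap<K, V> {
    fn make_hash(&self, k: &K) -> u64 {
        let mut h = DefaultHasher::new();
        k.hash(&mut h);
        h.finish()
    }

    fn position_hashed(&self, hash: u64, k: &K) -> Option<usize> {
        self.entries
            .iter()
            .position(|(h, key, _)| *h == hash && key == k)
    }

    fn insert_hashed(&mut self, hash: u64, k: K, v: V) -> &mut V {
        self.entries.push((hash, k, v));
        &mut self.entries.last_mut().unwrap().2
    }

    /// Mirrors the patched `find_or_insert`: one `make_hash` call,
    /// reused by both the search and the insert paths.
    fn find_or_insert(&mut self, k: K, v: V) -> &mut V {
        let hash = self.make_hash(&k);
        match self.position_hashed(hash, &k) {
            Some(i) => &mut self.entries[i].2,
            None => self.insert_hashed(hash, k, v),
        }
    }
}

fn main() {
    let mut m = ToyMap { entries: Vec::new() };
    *m.find_or_insert("counter", 0) += 1;
    *m.find_or_insert("counter", 0) += 1;
    assert_eq!(m.entries[0].2, 2);
}
```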
@@ -1087,7 +1098,7 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// 2) Ensure new_capacity is a power of two.
     fn resize(&mut self, new_capacity: uint) {
         assert!(self.table.size() <= new_capacity);
-        assert!((new_capacity - 1) & new_capacity == 0);
+        assert!(num::is_power_of_two(new_capacity));

         self.grow_at = grow_at(new_capacity, self.load_factor);
@@ -1095,7 +1106,7 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
         let old_size = old_table.size();

         for (h, k, v) in old_table.move_iter() {
-            self.manual_insert_hashed_nocheck(h, k, v);
+            self.insert_hashed_nocheck(h, k, v);
         }

         assert_eq!(self.table.size(), old_size);
@@ -1171,13 +1182,13 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
         }
     }

-    /// Manually insert a pre-hashed key-value pair, without first checking
+    /// Insert a pre-hashed key-value pair, without first checking
     /// that there's enough room in the buckets. Returns a reference to the
     /// newly inserted value.
     ///
     /// If the key already exists, the hashtable will be returned untouched
     /// and a reference to the existing element will be returned.
-    fn manual_insert_hashed_nocheck<'a>(
+    fn insert_hashed_nocheck<'a>(
         &'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V {

         for dib in range_inclusive(0u, self.table.size()) {
@@ -1226,28 +1237,25 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
         fail!("Internal HashMap error: Out of space.");
     }

-    fn manual_insert_hashed<'a>(&'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V {
+    /// Inserts an element which has already been hashed, returning a reference
+    /// to that element inside the hashtable. This is more efficient than using
+    /// `insert`, since the key will not be rehashed.
+    fn insert_hashed<'a>(&'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V {
         let potential_new_size = self.table.size() + 1;
         self.make_some_room(potential_new_size);
-        self.manual_insert_hashed_nocheck(hash, k, v)
-    }
-
-    /// Inserts an element, returning a reference to that element inside the
-    /// hashtable.
-    fn manual_insert<'a>(&'a mut self, k: K, v: V) -> &'a mut V {
-        let hash = self.make_hash(&k);
-        self.manual_insert_hashed(hash, k, v)
+        self.insert_hashed_nocheck(hash, k, v)
     }

     /// Return the value corresponding to the key in the map, or insert
     /// and return the value if it doesn't exist.
     pub fn find_or_insert<'a>(&'a mut self, k: K, v: V) -> &'a mut V {
-        match self.search(&k) {
+        let hash = self.make_hash(&k);
+        match self.search_hashed(&hash, &k) {
             Some(idx) => {
                 let (_, v_ref) = self.table.read_mut(&idx);
                 v_ref
             },
-            None => self.manual_insert(k, v)
+            None => self.insert_hashed(hash, k, v)
         }
     }
@@ -1255,14 +1263,15 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// insert, and return a new value if it doesn't exist.
     pub fn find_or_insert_with<'a>(&'a mut self, k: K, f: |&K| -> V) -> &'a mut V {
-        match self.search(&k) {
+        let hash = self.make_hash(&k);
+        match self.search_hashed(&hash, &k) {
             Some(idx) => {
                 let (_, v_ref) = self.table.read_mut(&idx);
                 v_ref
             },
-            None => {
+            None => {
                 let v = f(&k);
-                self.manual_insert(k, v)
+                self.insert_hashed(hash, k, v)
             }
         }
     }
@@ -1276,8 +1285,9 @@ impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S>> HashMap<K, V, H> {
                                        v: V,
                                        f: |&K, &mut V|)
                                        -> &'a mut V {
-        match self.search(&k) {
-            None => self.manual_insert(k, v),
+        let hash = self.make_hash(&k);
+        match self.search_hashed(&hash, &k) {
+            None => self.insert_hashed(hash, k, v),
             Some(idx) => {
                 let (_, v_ref) = self.table.read_mut(&idx);
                 f(&k, v_ref);
@@ -1369,7 +1379,8 @@ impl<K: TotalEq + Hash<S>, V: Eq, S, H: Hasher<S>> Eq for HashMap<K, V, H> {
     fn eq(&self, other: &HashMap<K, V, H>) -> bool {
         if self.len() != other.len() { return false; }

-        self.iter().all(|(key, value)| {
+        self.iter()
+          .all(|(key, value)| {
             match other.find(key) {
                 None => false,
                 Some(v) => *value == *v
@@ -1393,7 +1404,7 @@ impl<K: TotalEq + Hash<S> + Show, V: Show, S, H: Hasher<S>> Show for HashMap<K, V, H> {

 impl<K: TotalEq + Hash<S>, V, S, H: Hasher<S> + Default> Default for HashMap<K, V, H> {
     fn default() -> HashMap<K, V, H> {
-        HashMap::with_capacity_and_hasher(INITIAL_CAPACITY, Default::default())
+        HashMap::with_hasher(Default::default())
     }
 }
@@ -1449,13 +1460,10 @@ pub struct HashSet<T, H = sip::SipHasher> {
 }

 impl<T: TotalEq + Hash<S>, S, H: Hasher<S>> Eq for HashSet<T, H> {
-    // FIXME #11998: Since the value is a (), and `find` returns a Some(&()),
-    // we trigger #11998 when matching on it. I've fallen back to manual
-    // iteration until this is fixed.
fn eq(&self, other: &HashSet) -> bool { if self.len() != other.len() { return false; } - self.iter().all(|key| other.map.contains_key(key)) + self.iter().all(|key| other.contains(key)) } } @@ -1468,7 +1476,7 @@ impl, S, H: Hasher> Mutable for HashSet { } impl, S, H: Hasher> Set for HashSet { - fn contains(&self, value: &T) -> bool { self.map.search(value).is_some() } + fn contains(&self, value: &T) -> bool { self.map.contains_key(value) } fn is_disjoint(&self, other: &HashSet) -> bool { self.iter().all(|v| !other.contains(v)) @@ -1540,8 +1548,7 @@ impl, S, H: Hasher> HashSet { /// Visit the values representing the difference pub fn difference<'a>(&'a self, other: &'a HashSet) -> SetAlgebraItems<'a, T, H> { - Repeat::new(other) - .zip(self.iter()) + Repeat::new(other).zip(self.iter()) .filter_map(|(other, elt)| { if !other.contains(elt) { Some(elt) } else { None } }) @@ -1556,8 +1563,7 @@ impl, S, H: Hasher> HashSet { /// Visit the values representing the intersection pub fn intersection<'a>(&'a self, other: &'a HashSet) -> SetAlgebraItems<'a, T, H> { - Repeat::new(other) - .zip(self.iter()) + Repeat::new(other).zip(self.iter()) .filter_map(|(other, elt)| { if other.contains(elt) { Some(elt) } else { None } }) @@ -1568,7 +1574,6 @@ impl, S, H: Hasher> HashSet { -> Chain, SetAlgebraItems<'a, T, H>> { self.iter().chain(other.difference(self)) } - } impl + fmt::Show, S, H: Hasher> fmt::Show for HashSet { @@ -1953,7 +1958,7 @@ mod test_map { m.insert(1, 2); match m.find(&1) { None => fail!(), - Some(v) => assert!(*v == 2) + Some(v) => assert_eq!(*v, 2) } } @@ -2020,6 +2025,32 @@ mod test_map { assert_eq!(map.find(&k), Some(&v)); } } + + #[test] + fn test_size_hint() { + let xs = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)]; + + let map: HashMap = xs.iter().map(|&x| x).collect(); + + let mut iter = map.iter(); + + for _ in iter.by_ref().take(3) {} + + assert_eq!(iter.size_hint(), (3, Some(3))); + } + + #[test] + fn test_mut_size_hint() { + let xs = [(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)]; + + let mut map: HashMap = xs.iter().map(|&x| x).collect(); + + let mut iter = map.mut_iter(); + + for _ in iter.by_ref().take(3) {} + + assert_eq!(iter.size_hint(), (3, Some(3))); + } } #[cfg(test)] @@ -2270,6 +2301,27 @@ mod bench { use self::test::Bencher; use std::iter::{range_inclusive}; + #[bench] + fn new_drop(b : &mut Bencher) { + use super::HashMap; + + b.iter(|| { + let m : HashMap = HashMap::new(); + assert_eq!(m.len(), 0); + }) + } + + #[bench] + fn new_insert_drop(b : &mut Bencher) { + use super::HashMap; + + b.iter(|| { + let mut m = HashMap::new(); + m.insert(0, 0); + assert_eq!(m.len(), 1); + }) + } + #[bench] fn insert(b: &mut Bencher) { use super::HashMap; @@ -2299,7 +2351,9 @@ mod bench { } b.iter(|| { - m.contains_key(&412); + for i in range_inclusive(1, 1000) { + m.contains_key(&i); + } }); } @@ -2314,7 +2368,9 @@ mod bench { } b.iter(|| { - m.contains_key(&2048); + for i in range_inclusive(1001, 2000) { + m.contains_key(&i); + } }); }
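One more note on the bench changes above: timing a lone `contains_key(&412)` per iteration mostly measures a single hot, perfectly predicted probe, which is why `find_existing` and `find_nonexisting` now sweep a thousand distinct keys each iteration. The same idea as a standalone program, sketched with `Instant` and `black_box` since the `test::Bencher` used here was never stabilized:

```rust
use std::collections::HashMap;
use std::hint::black_box;
use std::time::Instant;

fn main() {
    let mut m = HashMap::new();
    for i in 1..=1000 {
        m.insert(i, i);
    }

    let start = Instant::now();
    for _ in 0..1000 {
        // Sweep existing keys (1..=1000); use 1001..=2000 instead to
        // measure the miss path, as the second bench does.
        for i in 1..=1000 {
            // black_box keeps the optimizer from deleting the lookup.
            black_box(m.contains_key(&i));
        }
    }
    println!("avg sweep: {:?}", start.elapsed() / 1000);
}
```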