From 5b0d3adf3debde4cd21e7adb1f434580386d7265 Mon Sep 17 00:00:00 2001 From: Piotr Czarnecki Date: Wed, 9 Jul 2014 18:31:58 +0100 Subject: [PATCH 1/6] std: branchless bucket distance for hashmap --- src/libstd/collections/hashmap.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/libstd/collections/hashmap.rs b/src/libstd/collections/hashmap.rs index 1985128c4e3..d949eeebea0 100644 --- a/src/libstd/collections/hashmap.rs +++ b/src/libstd/collections/hashmap.rs @@ -802,17 +802,9 @@ impl, V, S, H: Hasher> HashMap { fn bucket_distance(&self, index_of_elem: &table::FullIndex) -> uint { // where the hash of the element that happens to reside at // `index_of_elem` tried to place itself first. - let first_probe_index = self.probe(&index_of_elem.hash(), 0); - let raw_index = index_of_elem.raw_index(); - if first_probe_index <= raw_index { - // probe just went forward - raw_index - first_probe_index - } else { - // probe wrapped around the hashtable - raw_index + (self.table.capacity() - first_probe_index) - } + (raw_index - index_of_elem.hash() as uint) & (self.table.capacity() - 1) } /// Search for a pre-hashed key. From 9ddaaa4db02ec79f30e51c3e4f32baec8b0bb650 Mon Sep 17 00:00:00 2001 From: Piotr Czarnecki Date: Wed, 9 Jul 2014 21:21:46 +0100 Subject: [PATCH 2/6] std: RawTable exposes a safe interface for HashMap Introduced a new growth algorithm. --- src/libstd/collections/hashmap.rs | 1439 ++++++++++++++++++----------- 1 file changed, 897 insertions(+), 542 deletions(-) diff --git a/src/libstd/collections/hashmap.rs b/src/libstd/collections/hashmap.rs index d949eeebea0..bfe74fed077 100644 --- a/src/libstd/collections/hashmap.rs +++ b/src/libstd/collections/hashmap.rs @@ -19,29 +19,30 @@ use default::Default; use fmt::Show; use fmt; use hash::{Hash, Hasher, RandomSipHasher}; -use iter::{Iterator, FilterMap, Chain, Repeat, Zip, Extendable}; -use iter::{range, range_inclusive, FromIterator}; +use iter::{Iterator, FromIterator, FilterMap, Chain, Repeat, Zip, Extendable, range}; use iter; use mem::replace; use num; +use ops::Deref; use option::{Some, None, Option}; use result::{Ok, Err}; use ops::Index; +use self::table::{BucketWithTable, FullBucketImm, RawTable, FullBucket, FullBucketMut, Bucket}; + mod table { use clone::Clone; use cmp; use hash::{Hash, Hasher}; - use iter::range_step_inclusive; - use iter::{Iterator, range}; - use kinds::marker; + use iter::{Iterator, count}; use mem::{min_align_of, size_of}; - use mem::{overwrite, transmute}; + use mem; use num::{CheckedMul, is_power_of_two}; - use ops::Drop; + use ops::{Deref, Drop}; use option::{Some, None, Option}; use ptr::RawPtr; use ptr::set_memory; + use ptr::write; use ptr; use rt::heap::{allocate, deallocate}; @@ -105,43 +106,381 @@ mod table { pub struct RawTable { capacity: uint, size: uint, - hashes: *mut u64, - keys: *mut K, - vals: *mut V, + hashes: *mut u64 } - /// Represents an index into a `RawTable` with no key or value in it. - pub struct EmptyIndex { - idx: int, - nocopy: marker::NoCopy, + /// A bucket that holds a reference to the table + pub trait BucketWithTable { + /// A bucket that holds a reference to the table + fn table<'a>(&'a self) -> &'a M; + + /// Move out the reference to the table. + fn into_table(self) -> M; + + /// Get the raw index. + fn index(&self) -> uint; } - /// Represents an index into a `RawTable` with a key, value, and hash - /// in it. 
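Patch 1's branchless `bucket_distance` (and the `distance` method that replaces it in Patch 2) works because unsigned subtraction wraps modulo the word size, and masking with `capacity - 1` then reduces the result modulo the power-of-two capacity, covering the wrap-around case the removed branch handled explicitly. A minimal sketch of the equivalence, in modern Rust syntax (function names here are illustrative, not from the patch):

fn distance_branchy(first_probe: usize, raw_index: usize, capacity: usize) -> usize {
    if first_probe <= raw_index {
        // probe just went forward
        raw_index - first_probe
    } else {
        // probe wrapped around the hashtable
        raw_index + (capacity - first_probe)
    }
}

fn distance_branchless(first_probe: usize, raw_index: usize, capacity: usize) -> usize {
    // relies on capacity being a power of two
    raw_index.wrapping_sub(first_probe) & (capacity - 1)
}

fn main() {
    let cap = 8;
    for first_probe in 0..cap {
        for raw_index in 0..cap {
            assert_eq!(distance_branchy(first_probe, raw_index, cap),
                       distance_branchless(first_probe, raw_index, cap));
        }
    }
}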
- pub struct FullIndex { - idx: int, - hash: SafeHash, - nocopy: marker::NoCopy, + struct RawBucket { + hash: *mut u64, + key: *mut K, + val: *mut V } - impl FullIndex { - /// Since we get the hash for free whenever we check the bucket state, - /// this function is provided for fast access, letting us avoid - /// redundant trips back to the hashtable. - #[inline(always)] - pub fn hash(&self) -> SafeHash { self.hash } - - /// Same comment as with `hash`. - #[inline(always)] - pub fn raw_index(&self) -> uint { self.idx as uint } + pub struct Bucket { + raw: RawBucket, + idx: uint, + table: M } - /// Represents the state of a bucket: it can either have a key/value - /// pair (be full) or not (be empty). You cannot `take` empty buckets, - /// and you cannot `put` into full buckets. - pub enum BucketState { - Empty(EmptyIndex), - Full(FullIndex), + pub struct EmptyBucket { + raw: RawBucket, + idx: uint, + table: M + } + + pub struct FullBucket { + raw: RawBucket, + idx: uint, + table: M + } + + pub type EmptyBucketImm<'table,K,V> = EmptyBucket>; + pub type FullBucketImm<'table,K,V> = FullBucket>; + + pub type EmptyBucketMut<'table,K,V> = EmptyBucket>; + pub type FullBucketMut<'table,K,V> = FullBucket>; + + struct GapThenFull { + gap: EmptyBucket, + full: FullBucket + } + + impl>> GapThenFull { + pub fn full<'a>(&'a self) -> &'a FullBucket { + &self.full + } + + pub fn shift(mut self) -> Option> { + unsafe { + *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET); + mem::overwrite(self.gap.raw.key, ptr::read(self.full.raw.key as *const K)); + mem::overwrite(self.gap.raw.val, ptr::read(self.full.raw.val as *const V)); + } + + let FullBucket { raw, idx, .. } = self.full; + + match self.full.next().peek() { + Empty(_) => None, + Full(bucket) => { + self.gap.raw = raw; + self.gap.idx = idx; + + self.full = bucket; + self.full.idx &= self.full.table.capacity - 1; + + Some(self) + } + } + } + } + + impl RawPtr for RawBucket { + unsafe fn offset(self, count: int) -> RawBucket { + RawBucket { + hash: self.hash.offset(count), + key: self.key.offset(count), + val: self.val.offset(count), + } + } + + fn null() -> RawBucket { + RawBucket { + hash: RawPtr::null(), + key: RawPtr::null(), + val: RawPtr::null() + } + } + + fn is_null(&self) -> bool { + self.hash.is_null() + } + + fn to_uint(&self) -> uint { + self.hash.to_uint() + } + + unsafe fn to_option(&self) -> Option<&u64> { + self.hash.to_option() + } + } + + impl>> EmptyBucket { + pub fn next(self) -> Bucket { + let mut bucket = self.into_bucket(); + bucket.next(); + bucket + } + + pub fn into_bucket(self) -> Bucket { + Bucket { + raw: self.raw, + idx: self.idx, + table: self.table + } + } + + pub fn gap_peek(self) -> Option> { + let gap = EmptyBucket { + raw: self.raw, + idx: self.idx, + table: () + }; + + match self.next().peek() { + Empty(_) => None, + Full(bucket) => { + Some(GapThenFull { + gap: gap, + full: bucket + }) + } + } + } + } + + impl>> EmptyBucket { + pub fn put(mut self, hash: SafeHash, key: K, value: V) + -> FullBucket { + unsafe { + *self.raw.hash = hash.inspect(); + write(self.raw.key, key); + write(self.raw.val, value); + } + + self.table.size += 1; + + FullBucket { raw: self.raw, idx: self.idx, table: self.table } + } + } + + impl>> FullBucket { + pub fn next(self) -> Bucket { + let mut bucket = self.into_bucket(); + bucket.next(); + bucket + } + + pub fn into_bucket(self) -> Bucket { + Bucket { + raw: self.raw, + idx: self.idx, + table: self.table + } + } + + pub fn distance(&self) -> uint { + (self.idx - 
self.hash().inspect() as uint) & (self.table.capacity() - 1) + } + + pub fn hash(&self) -> SafeHash { + unsafe { + SafeHash { + hash: *self.raw.hash + } + } + } + + pub fn read<'a>(&'a self) -> (&'a K, &'a V) { + unsafe { + (&*self.raw.key, + &*self.raw.val) + } + } + + pub fn into_refs(self) -> (&K, &V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&*self.raw.key, + &*self.raw.val) + } + } + } + + impl>> FullBucket { + pub fn take(mut self) -> (EmptyBucket, K, V) { + let key = self.raw.key as *const K; + let val = self.raw.val as *const V; + + self.table.size -= 1; + + unsafe { + *self.raw.hash = EMPTY_BUCKET; + ( + EmptyBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }, + ptr::read(key), + ptr::read(val) + ) + } + } + + pub fn replace(&mut self, h: SafeHash, k: K, v: V) -> (SafeHash, K, V) { + unsafe { + let old_hash = ptr::replace(self.raw.hash as *mut SafeHash, h); + let old_key = ptr::replace(self.raw.key, k); + let old_val = ptr::replace(self.raw.val, v); + + (old_hash, old_key, old_val) + } + } + + pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&mut *self.raw.key, + &mut *self.raw.val) + } + } + + pub fn into_mut_refs(self) -> (&mut K, &mut V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&mut *self.raw.key, + &mut *self.raw.val) + } + } + } + + impl>> Bucket { + pub fn new(table: M, hash: &SafeHash) -> Bucket { + let ib_index = (hash.inspect() as uint) & (table.capacity() - 1); + Bucket { + raw: unsafe { + table.as_mut_ptrs().offset(ib_index as int) + }, + idx: ib_index, + table: table + } + } + + pub fn at_index(table: M, ib_index: uint) -> Bucket { + let ib_index = ib_index & (table.capacity() - 1); + Bucket { + raw: unsafe { + table.as_mut_ptrs().offset(ib_index as int) + }, + idx: ib_index, + table: table + } + } + + pub fn first(table: M) -> Bucket { + Bucket { + raw: table.as_mut_ptrs(), + idx: 0, + table: table + } + } + + pub fn peek(self) -> BucketState { + match unsafe { *self.raw.hash } { + EMPTY_BUCKET => + Empty(EmptyBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }), + _ => + Full(FullBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }) + } + } + + pub fn next(&mut self) { + self.idx += 1; + + let dist = if self.idx == self.table.capacity() { + -(self.table.capacity() as int - 1) + } else { + 1i + }; + + unsafe { + self.raw = self.raw.offset(dist); + } + } + } + + impl BucketWithTable for FullBucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } + } + + impl BucketWithTable for EmptyBucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } + } + + impl BucketWithTable for Bucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } + } + + impl<'table,K,V> Deref> for &'table RawTable { + fn deref<'a>(&'a self) -> &'a RawTable { + &**self + } + } + + impl<'table,K,V> Deref> for &'table mut RawTable { + fn deref<'a>(&'a self) -> &'a RawTable { + &**self + } + } + + impl<'table,K,V> DerefMut> for &'table mut RawTable { + fn deref_mut<'a>(&'a mut self) -> &'a mut RawTable { + &mut **self + } + } + + pub enum BucketState { + Empty(EmptyBucket), + Full(FullBucket), } /// A hash that is not zero, since we use a hash of 
zero to represent empty @@ -217,6 +556,13 @@ mod table { /// Does not initialize the buckets. The caller should ensure they, /// at the very least, set every hash to EMPTY_BUCKET. unsafe fn new_uninitialized(capacity: uint) -> RawTable { + if capacity == 0 { + return RawTable { + size: 0, + capacity: 0, + hashes: 0 as *mut u64, + }; + } let hashes_size = capacity.checked_mul(&size_of::()) .expect("capacity overflow"); let keys_size = capacity.checked_mul(&size_of::< K >()) @@ -232,7 +578,7 @@ mod table { // This is great in theory, but in practice getting the alignment // right is a little subtle. Therefore, calculating offsets has been // factored out into a different function. - let (malloc_alignment, hash_offset, keys_offset, vals_offset, size) = + let (malloc_alignment, hash_offset, _, _, size) = calculate_offsets( hashes_size, min_align_of::(), keys_size, min_align_of::< K >(), @@ -241,15 +587,31 @@ mod table { let buffer = allocate(size, malloc_alignment); let hashes = buffer.offset(hash_offset as int) as *mut u64; - let keys = buffer.offset(keys_offset as int) as *mut K; - let vals = buffer.offset(vals_offset as int) as *mut V; RawTable { capacity: capacity, size: 0, hashes: hashes, - keys: keys, - vals: vals, + } + } + + fn as_mut_ptrs(&self) -> RawBucket { + let hashes_size = self.capacity * size_of::(); + let keys_size = self.capacity * size_of::(); + + let keys_offset = (hashes_size + min_align_of::< K >() - 1) & !(min_align_of::< K >() - 1); + let end_of_keys = keys_offset + keys_size; + + let vals_offset = (end_of_keys + min_align_of::< V >() - 1) & !(min_align_of::< V >() - 1); + + let buffer = self.hashes as *mut u8; + + unsafe { + RawBucket { + hash: self.hashes, + key: buffer.offset(keys_offset as int) as *mut K, + val: buffer.offset(vals_offset as int) as *mut V + } } } @@ -264,113 +626,6 @@ mod table { } } - /// Reads a bucket at a given index, returning an enum indicating whether - /// there's anything there or not. You need to match on this enum to get - /// the appropriate types to pass on to most of the other functions in - /// this module. - pub fn peek(&self, index: uint) -> BucketState { - debug_assert!(index < self.capacity); - - let idx = index as int; - let hash = unsafe { *self.hashes.offset(idx) }; - - let nocopy = marker::NoCopy; - - match hash { - EMPTY_BUCKET => - Empty(EmptyIndex { - idx: idx, - nocopy: nocopy - }), - full_hash => - Full(FullIndex { - idx: idx, - hash: SafeHash { hash: full_hash }, - nocopy: nocopy, - }) - } - } - - /// Gets references to the key and value at a given index. - pub fn read<'a>(&'a self, index: &FullIndex) -> (&'a K, &'a V) { - let idx = index.idx; - - unsafe { - debug_assert!(*self.hashes.offset(idx) != EMPTY_BUCKET); - (&*self.keys.offset(idx), &*self.vals.offset(idx)) - } - } - - /// Gets references to the key and value at a given index, with the - /// value's reference being mutable. - pub fn read_mut<'a>(&'a mut self, index: &FullIndex) -> (&'a K, &'a mut V) { - let idx = index.idx; - - unsafe { - debug_assert!(*self.hashes.offset(idx) != EMPTY_BUCKET); - (&*self.keys.offset(idx), &mut *self.vals.offset(idx)) - } - } - - /// Read everything, mutably. 
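`new_uninitialized` and `as_mut_ptrs` both depend on the same layout rule: the hash, key, and value arrays share one allocation, with each array's start rounded up to its own alignment. A small sketch of that rounding arithmetic in modern Rust (the sizes below are an illustrative example, not from the patch):

fn round_up_to_next(unrounded: usize, align: usize) -> usize {
    // valid only for power-of-two alignments
    debug_assert!(align.is_power_of_two());
    (unrounded + align - 1) & !(align - 1)
}

fn main() {
    // e.g. 16 buckets: hashes are u64 (128 bytes), keys u32, vals u64
    let hashes_size = 16 * 8;
    let keys_offset = round_up_to_next(hashes_size, 4);
    let end_of_keys = keys_offset + 16 * 4;
    let vals_offset = round_up_to_next(end_of_keys, 8);
    assert_eq!((keys_offset, vals_offset), (128, 192));
}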
- pub fn read_all_mut<'a>(&'a mut self, index: &FullIndex) - -> (&'a mut SafeHash, &'a mut K, &'a mut V) { - let idx = index.idx; - - unsafe { - debug_assert!(*self.hashes.offset(idx) != EMPTY_BUCKET); - (transmute(self.hashes.offset(idx)), - &mut *self.keys.offset(idx), &mut *self.vals.offset(idx)) - } - } - - /// Puts a key and value pair, along with the key's hash, into a given - /// index in the hashtable. Note how the `EmptyIndex` is 'moved' into this - /// function, because that slot will no longer be empty when we return! - /// A FullIndex is returned for later use, pointing to the newly-filled - /// slot in the hashtable. - /// - /// Use `make_hash` to construct a `SafeHash` to pass to this function. - pub fn put(&mut self, index: EmptyIndex, hash: SafeHash, k: K, v: V) -> FullIndex { - let idx = index.idx; - - unsafe { - debug_assert_eq!(*self.hashes.offset(idx), EMPTY_BUCKET); - *self.hashes.offset(idx) = hash.inspect(); - overwrite(&mut *self.keys.offset(idx), k); - overwrite(&mut *self.vals.offset(idx), v); - } - - self.size += 1; - - FullIndex { idx: idx, hash: hash, nocopy: marker::NoCopy } - } - - /// Removes a key and value from the hashtable. - /// - /// This works similarly to `put`, building an `EmptyIndex` out of the - /// taken FullIndex. - pub fn take(&mut self, index: FullIndex) -> (EmptyIndex, K, V) { - let idx = index.idx; - - unsafe { - debug_assert!(*self.hashes.offset(idx) != EMPTY_BUCKET); - - *self.hashes.offset(idx) = EMPTY_BUCKET; - - // Drop the mutable constraint. - let keys = self.keys as *const K; - let vals = self.vals as *const V; - - let k = ptr::read(keys.offset(idx)); - let v = ptr::read(vals.offset(idx)); - - self.size -= 1; - - (EmptyIndex { idx: idx, nocopy: marker::NoCopy }, k, v) - } - } - /// The hashtable's capacity, similar to a vector's. 
pub fn capacity(&self) -> uint { self.capacity @@ -382,16 +637,95 @@ mod table { self.size } + fn ptrs<'a>(&'a self) -> RawBuckets<'a, K, V> { + RawBuckets { + raw: self.as_mut_ptrs(), + hashes_end: unsafe { + self.hashes.offset(self.capacity as int) + } + } + } + pub fn iter<'a>(&'a self) -> Entries<'a, K, V> { - Entries { table: self, idx: 0, elems_seen: 0 } + Entries { + iter: self.ptrs(), + elems_left: self.size(), + } } pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> { - MutEntries { table: self, idx: 0, elems_seen: 0 } + MutEntries { + iter: self.ptrs(), + elems_left: self.size(), + } } pub fn move_iter(self) -> MoveEntries { - MoveEntries { table: self, idx: 0 } + MoveEntries { + iter: self.ptrs(), + table: self, + } + } + + pub fn rev_move_buckets<'a>(&'a mut self) -> RevMoveBuckets<'a, K, V> { + let raw_bucket = self.as_mut_ptrs(); + unsafe { + RevMoveBuckets { + raw: raw_bucket.offset(self.capacity as int), + hashes_end: raw_bucket.hash, + elems_left: self.size + } + } + } + } + + pub struct RawBuckets<'a, K, V> { + raw: RawBucket, + hashes_end: *mut u64 + } + + impl<'a, K, V> Iterator> for RawBuckets<'a, K, V> { + fn next(&mut self) -> Option> { + while self.raw.hash != self.hashes_end { + unsafe { + let prev = ptr::replace(&mut self.raw, self.raw.offset(1)); + if *prev.hash != EMPTY_BUCKET { + return Some(prev); + } + } + } + + None + } + } + + pub struct RevMoveBuckets<'a, K, V> { + raw: RawBucket, + hashes_end: *mut u64, + elems_left: uint + } + + impl<'a, K, V> Iterator<(K, V)> for RevMoveBuckets<'a, K, V> { + fn next(&mut self) -> Option<(K, V)> { + if self.elems_left == 0 { + return None; + } + + loop { + debug_assert!(self.raw.hash != self.hashes_end); + + unsafe { + self.raw = self.raw.offset(-1); + + if *self.raw.hash != EMPTY_BUCKET { + self.elems_left -= 1; + return Some(( + ptr::read(self.raw.key as *const K), + ptr::read(self.raw.val as *const V) + )); + } + } + } } } @@ -426,77 +760,55 @@ mod table { /// Iterator over the entries in a table, consuming the table. 
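All three iterators above are driven by `RawBuckets`, whose rule is simply: walk the hash array and yield only slots whose hash is non-zero, since `EMPTY_BUCKET` (zero) marks an empty slot. A toy sketch of that filtering over a plain slice (illustrative helper, not the patch's types):

fn full_indices(hashes: &[u64]) -> Vec<usize> {
    hashes.iter()
          .enumerate()
          .filter(|&(_, &h)| h != 0)   // 0 == EMPTY_BUCKET
          .map(|(i, _)| i)
          .collect()
}

fn main() {
    let hashes = [0u64, 7, 0, 9, 3];
    assert_eq!(full_indices(&hashes), vec![1, 3, 4]);
}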
pub struct MoveEntries { table: RawTable, - idx: uint + iter: RawBuckets<'static, K, V> } impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> { fn next(&mut self) -> Option<(&'a K, &'a V)> { - while self.idx < self.table.capacity() { - let i = self.idx; - self.idx += 1; - - match self.table.peek(i) { - Empty(_) => {}, - Full(idx) => { - self.elems_seen += 1; - return Some(self.table.read(&idx)); - } + self.iter.next().map(|bucket| { + self.elems_left -= 1; + unsafe { + (&*bucket.key, + &*bucket.val) } - } - - None + }) } fn size_hint(&self) -> (uint, Option) { - let size = self.table.size() - self.elems_seen; - (size, Some(size)) + (self.elems_left, Some(self.elems_left)) } } impl<'a, K, V> Iterator<(&'a K, &'a mut V)> for MutEntries<'a, K, V> { fn next(&mut self) -> Option<(&'a K, &'a mut V)> { - while self.idx < self.table.capacity() { - let i = self.idx; - self.idx += 1; - - match self.table.peek(i) { - Empty(_) => {}, - // the transmute here fixes: - // error: lifetime of `self` is too short to guarantee its contents - // can be safely reborrowed - Full(idx) => unsafe { - self.elems_seen += 1; - return Some(transmute(self.table.read_mut(&idx))); - } + self.iter.next().map(|bucket| { + self.elems_left -= 1; + unsafe { + (&*bucket.key, + &mut *bucket.val) } - } - - None + }) } fn size_hint(&self) -> (uint, Option) { - let size = self.table.size() - self.elems_seen; - (size, Some(size)) + (self.elems_left, Some(self.elems_left)) } } impl Iterator<(SafeHash, K, V)> for MoveEntries { fn next(&mut self) -> Option<(SafeHash, K, V)> { - while self.idx < self.table.capacity() { - let i = self.idx; - self.idx += 1; - - match self.table.peek(i) { - Empty(_) => {}, - Full(idx) => { - let h = idx.hash(); - let (_, k, v) = self.table.take(idx); - return Some((h, k, v)); - } + self.iter.next().map(|bucket| { + self.table.size -= 1; + unsafe { + ( + SafeHash { + hash: *bucket.hash, + }, + ptr::read(bucket.key as *const K), + ptr::read(bucket.val as *const V) + ) } - } - - None + }) } fn size_hint(&self) -> (uint, Option) { @@ -510,18 +822,27 @@ mod table { unsafe { let mut new_ht = RawTable::new_uninitialized(self.capacity()); - for i in range(0, self.capacity()) { - match self.peek(i) { - Empty(_) => { - *new_ht.hashes.offset(i as int) = EMPTY_BUCKET; - }, - Full(idx) => { - let hash = idx.hash().inspect(); - let (k, v) = self.read(&idx); - *new_ht.hashes.offset(i as int) = hash; - overwrite(&mut *new_ht.keys.offset(i as int), (*k).clone()); - overwrite(&mut *new_ht.vals.offset(i as int), (*v).clone()); + { + let cap = self.capacity(); + let mut new_buckets = Bucket::first(&mut new_ht); + let mut buckets = Bucket::first(self); + while buckets.index() != cap { + match buckets.peek() { + Full(full) => { + let (h, k, v) = { + let (k, v) = full.read(); + (full.hash(), k.clone(), v.clone()) + }; + *new_buckets.raw.hash = h.inspect(); + mem::overwrite(new_buckets.raw.key, k); + mem::overwrite(new_buckets.raw.val, v); + } + _ => { + *new_buckets.raw.hash = EMPTY_BUCKET; + } } + new_buckets.next(); + buckets.next(); } } @@ -535,37 +856,30 @@ mod table { #[unsafe_destructor] impl Drop for RawTable { fn drop(&mut self) { + if self.hashes.is_null() { + return; + } // This is in reverse because we're likely to have partially taken // some elements out with `.move_iter()` from the front. - for i in range_step_inclusive(self.capacity as int - 1, 0, -1) { - // Check if the size is 0, so we don't do a useless scan when - // dropping empty tables such as on resize. 
- if self.size == 0 { break } + // Check if the size is 0, so we don't do a useless scan when + // dropping empty tables such as on resize. + // Avoid double free of elements already moved out. + for _ in self.rev_move_buckets() {} - match self.peek(i as uint) { - Empty(_) => {}, - Full(idx) => { self.take(idx); } - } + let hashes_size = self.capacity * size_of::(); + let keys_size = self.capacity * size_of::(); + let vals_size = self.capacity * size_of::(); + let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::(), + keys_size, min_align_of::(), + vals_size, min_align_of::()); + + unsafe { + deallocate(self.hashes as *mut u8, size, align); + // Remember how everything was allocated out of one buffer + // during initialization? We only need one call to free here. } - assert_eq!(self.size, 0); - - if self.hashes.is_not_null() { - let hashes_size = self.capacity * size_of::(); - let keys_size = self.capacity * size_of::(); - let vals_size = self.capacity * size_of::(); - let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::(), - keys_size, min_align_of::(), - vals_size, min_align_of::()); - - unsafe { - deallocate(self.hashes as *mut u8, size, align); - // Remember how everything was allocated out of one buffer - // during initialization? We only need one call to free here. - } - - self.hashes = RawPtr::null(); - } + self.hashes = RawPtr::null(); } } } @@ -605,7 +919,7 @@ impl DefaultResizePolicy { } // The main performance trick in this hashmap is called Robin Hood Hashing. -// It gains its excellent performance from one key invariant: +// It gains its excellent performance from one crucial operation: // // If an insertion collides with an existing element, and that elements // "probe distance" (how far away the element is from its ideal location) @@ -765,163 +1079,121 @@ pub struct HashMap { resize_policy: DefaultResizePolicy, } +/// Search for a pre-hashed key. +fn search_hashed_generic>>(table: M, hash: &table::SafeHash, is_match: |&K| -> bool) + -> Option> { + let size = table.size(); + let mut probe = Bucket::new(table, hash); + let ib = probe.index(); + + while probe.index() != ib + size { + let full = match probe.peek() { + table::Empty(_) => return None, // hit an empty bucket + table::Full(b) => b + }; + + if full.distance() + ib < full.index() { + return None; + } + + // If the hash doesn't match, it can't be this one.. + if *hash == full.hash() { + let matched = { + let (k, _) = full.read(); + is_match(k) + }; + + // If the key doesn't match, it can't be this one.. + if matched { + return Some(full); + } + } + + probe = full.next(); + } + + None +} + +fn search_hashed>>(table: M, hash: &table::SafeHash, k: &K) + -> Option> { + search_hashed_generic(table, hash, |k_| *k == *k_) +} + +fn pop_internal(starting_bucket: FullBucketMut) -> V { + let size = { + let table = starting_bucket.table(); + table.size() + }; + let (empty, _k, retval) = starting_bucket.take(); + let mut gap = match empty.gap_peek() { + Some(b) => b, + None => return retval + }; + // COMPILER error! wrong enum optimization. sets ptr to 0 + + for _ in range(0, size) { + if gap.full().distance() != 0 { + gap = match gap.shift() { + Some(b) => b, + None => return retval + }; + continue; + } + + break; + } + + // Now we're done all our shifting. Return the value we grabbed + // earlier. + return retval; +} + impl, V, S, H: Hasher> HashMap { - // Probe the `idx`th bucket for a given hash, returning the index of the - // target bucket. 
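`search_hashed_generic` above relies on the Robin Hood invariant for early termination: once we have probed further from the ideal bucket than the resident element's own displacement, the key cannot be present, because insertion would have displaced that resident. A toy model of the cut-off over raw hashes (hypothetical helper in modern Rust; `mask` is `capacity - 1`, and the real code additionally compares keys on a hash match):

fn lookup(table: &[Option<u64>], hash: u64, mask: usize) -> Option<usize> {
    let ideal = hash as usize & mask;
    for probes in 0..table.len() {
        let idx = (ideal + probes) & mask;
        match table[idx] {
            None => return None,                  // empty bucket: not present
            Some(h) => {
                // the resident's distance from its own ideal bucket
                let their_dist = idx.wrapping_sub(h as usize) & mask;
                if their_dist < probes {
                    return None;                  // resident is "richer": give up
                }
                if h == hash {
                    return Some(idx);
                }
            }
        }
    }
    None
}

fn main() {
    // slot 1 holds hash 4 (ideal bucket 0, displaced by one)
    let table = [Some(4u64), Some(4), Some(6), Some(7)];
    assert_eq!(lookup(&table, 4, 3), Some(0));
    assert_eq!(lookup(&table, 5, 3), None); // cut off at slot 2, whose resident is at home
}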
- // - // This exploits the power-of-two size of the hashtable. As long as this - // is always true, we can use a bitmask of cap-1 to do modular arithmetic. - // - // Prefer using this with increasing values of `idx` rather than repeatedly - // calling `probe_next`. This reduces data-dependencies between loops, which - // can help the optimizer, and certainly won't hurt it. `probe_next` is - // simply for convenience, and is no more efficient than `probe`. - fn probe(&self, hash: &table::SafeHash, idx: uint) -> uint { - let hash_mask = self.table.capacity() - 1; - - // So I heard a rumor that unsigned overflow is safe in rust.. - ((hash.inspect() as uint) + idx) & hash_mask - } - - // Generate the next probe in a sequence. Prefer using 'probe' by itself, - // but this can sometimes be useful. - fn probe_next(&self, probe: uint) -> uint { - let hash_mask = self.table.capacity() - 1; - (probe + 1) & hash_mask - } - fn make_hash>(&self, x: &X) -> table::SafeHash { table::make_hash(&self.hasher, x) } - /// Get the distance of the bucket at the given index that it lies - /// from its 'ideal' location. - /// - /// In the cited blog posts above, this is called the "distance to - /// initial bucket", or DIB. - fn bucket_distance(&self, index_of_elem: &table::FullIndex) -> uint { - // where the hash of the element that happens to reside at - // `index_of_elem` tried to place itself first. - let raw_index = index_of_elem.raw_index(); - - (raw_index - index_of_elem.hash() as uint) & (self.table.capacity() - 1) + fn search_equiv<'a, Q: Hash + Equiv>(&'a self, q: &Q) + -> Option> { + let hash = self.make_hash(q); + search_hashed_generic(&self.table, &hash, |k| q.equiv(k)) } - /// Search for a pre-hashed key. - fn search_hashed_generic(&self, hash: &table::SafeHash, is_match: |&K| -> bool) - -> Option { - for num_probes in range(0u, self.table.size()) { - let probe = self.probe(hash, num_probes); - - let idx = match self.table.peek(probe) { - table::Empty(_) => return None, // hit an empty bucket - table::Full(idx) => idx - }; - - // We can finish the search early if we hit any bucket - // with a lower distance to initial bucket than we've probed. - if self.bucket_distance(&idx) < num_probes { return None } - - // If the hash doesn't match, it can't be this one.. - if *hash != idx.hash() { continue } - - let (k, _) = self.table.read(&idx); - - // If the key doesn't match, it can't be this one.. - if !is_match(k) { continue } - - return Some(idx); - } - - return None - } - - fn search_hashed(&self, hash: &table::SafeHash, k: &K) -> Option { - self.search_hashed_generic(hash, |k_| *k == *k_) - } - - fn search_equiv + Equiv>(&self, q: &Q) -> Option { - self.search_hashed_generic(&self.make_hash(q), |k| q.equiv(k)) + fn search_equiv_mut<'a, Q: Hash + Equiv>(&'a mut self, q: &Q) + -> Option> { + let hash = self.make_hash(q); + search_hashed_generic(&mut self.table, &hash, |k| q.equiv(k)) } /// Search for a key, yielding the index if it's found in the hashtable. /// If you already have the hash for the key lying around, use /// search_hashed. 
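The rewritten `pop_internal` (together with `GapThenFull::shift`) implements backward-shift deletion, replacing the old forward scan removed below: after the bucket is taken, the following cluster slides back one slot at a time until an empty bucket or an element with displacement zero is reached. A toy sketch over raw hashes (illustrative helper, modern Rust; `mask` is `capacity - 1`):

fn backward_shift(table: &mut Vec<Option<u64>>, mut gap: usize, mask: usize) {
    loop {
        let next = (gap + 1) & mask;
        match table[next] {
            None => break,                        // end of the cluster
            Some(h) => {
                let dist = next.wrapping_sub(h as usize) & mask;
                if dist == 0 { break; }           // element already at home
                table[gap] = table[next].take();  // slide it back into the gap
                gap = next;
            }
        }
    }
}

fn main() {
    // slots 1 and 2 hold hashes 4 and 5 (ideal buckets 0 and 1), each displaced by one
    let mut t = vec![Some(4u64), Some(4), Some(5), None];
    t[0] = None;                     // remove the element at slot 0
    backward_shift(&mut t, 0, 3);
    assert_eq!(t, vec![Some(4), Some(5), None, None]);
}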
- fn search(&self, k: &K) -> Option { - self.search_hashed(&self.make_hash(k), k) + fn search<'a>(&'a self, k: &K) -> Option> { + let hash = self.make_hash(k); + search_hashed(&self.table, &hash, k) } - fn pop_internal(&mut self, starting_index: table::FullIndex) -> Option { - let starting_probe = starting_index.raw_index(); + fn search_mut<'a>(&'a mut self, k: &K) -> Option> { + let hash = self.make_hash(k); + search_hashed(&mut self.table, &hash, k) + } - let ending_probe = { - let mut probe = self.probe_next(starting_probe); - for _ in range(0u, self.table.size()) { - match self.table.peek(probe) { - table::Empty(_) => {}, // empty bucket. this is the end of our shifting. - table::Full(idx) => { - // Bucket that isn't us, which has a non-zero probe distance. - // This isn't the ending index, so keep searching. - if self.bucket_distance(&idx) != 0 { - probe = self.probe_next(probe); - continue; - } - - // if we do have a bucket_distance of zero, we're at the end - // of what we need to shift. - } + fn insert_hashed_ordered(&mut self, hash: table::SafeHash, k: K, v: V) { + let cap = self.table.capacity(); + let mut buckets = Bucket::new(&mut self.table, &hash); + let ib = buckets.index(); + while buckets.index() != ib + cap { + buckets = match buckets.peek() { + table::Empty(empty) => { + empty.put(hash, k, v); + return; } - break; - } - - probe - }; - - let (_, _, retval) = self.table.take(starting_index); - - let mut probe = starting_probe; - let mut next_probe = self.probe_next(probe); - - // backwards-shift all the elements after our newly-deleted one. - while next_probe != ending_probe { - match self.table.peek(next_probe) { - table::Empty(_) => { - // nothing to shift in. just empty it out. - match self.table.peek(probe) { - table::Empty(_) => {}, - table::Full(idx) => { self.table.take(idx); } - } - }, - table::Full(next_idx) => { - // something to shift. move it over! - let next_hash = next_idx.hash(); - let (_, next_key, next_val) = self.table.take(next_idx); - match self.table.peek(probe) { - table::Empty(idx) => { - self.table.put(idx, next_hash, next_key, next_val); - }, - table::Full(idx) => { - let (emptyidx, _, _) = self.table.take(idx); - self.table.put(emptyidx, next_hash, next_key, next_val); - } - } - } - } - - probe = next_probe; - next_probe = self.probe_next(next_probe); + table::Full(b) => b.into_bucket() + }; + buckets.next(); } - - // Done the backwards shift, but there's still an element left! - // Empty it out. - match self.table.peek(probe) { - table::Empty(_) => {}, - table::Full(idx) => { self.table.take(idx); } - } - - // Now we're done all our shifting. Return the value we grabbed - // earlier. - return Some(retval); + fail!("Internal HashMap error: Out of space."); } } @@ -938,19 +1210,25 @@ impl, V, S, H: Hasher> Mutable for HashMap { // for the map to be reused but has a downside: reserves permanently. 
self.resize_policy.reserve(self.table.size()); - for i in range(0, self.table.capacity()) { - match self.table.peek(i) { - table::Empty(_) => {}, - table::Full(idx) => { self.table.take(idx); } - } + let cap = self.table.capacity(); + let mut buckets = Bucket::first(&mut self.table); + + while buckets.index() != cap { + buckets = match buckets.peek() { + table::Empty(b) => b.next(), + table::Full(full) => { + let (b, _, _) = full.take(); + b.next() + } + }; } } } impl, V, S, H: Hasher> Map for HashMap { fn find<'a>(&'a self, k: &K) -> Option<&'a V> { - self.search(k).map(|idx| { - let (_, v) = self.table.read(&idx); + self.search(k).map(|bucket| { + let (_, v) = bucket.into_refs(); v }) } @@ -962,12 +1240,12 @@ impl, V, S, H: Hasher> Map for HashMap { impl, V, S, H: Hasher> MutableMap for HashMap { fn find_mut<'a>(&'a mut self, k: &K) -> Option<&'a mut V> { - match self.search(k) { - None => None, - Some(idx) => { - let (_, v) = self.table.read_mut(&idx); + match self.search_mut(k) { + Some(bucket) => { + let (_, v) = bucket.into_mut_refs(); Some(v) } + _ => None } } @@ -976,41 +1254,14 @@ impl, V, S, H: Hasher> MutableMap for HashMap let potential_new_size = self.table.size() + 1; self.make_some_room(potential_new_size); - for dib in range_inclusive(0u, self.table.size()) { - let probe = self.probe(&hash, dib); - - let idx = match self.table.peek(probe) { - table::Empty(idx) => { - // Found a hole! - self.table.put(idx, hash, k, v); - return None; - }, - table::Full(idx) => idx - }; - - if idx.hash() == hash { - let (bucket_k, bucket_v) = self.table.read_mut(&idx); - if k == *bucket_k { - // Found an existing value. - return Some(replace(bucket_v, v)); - } - } - - let probe_dib = self.bucket_distance(&idx); - - if probe_dib < dib { - // Found a luckier bucket. This implies that the key does not - // already exist in the hashtable. Just do a robin hood - // insertion, then. - self.robin_hood(idx, probe_dib, hash, k, v); - return None; - } - } - - // We really shouldn't be here. - fail!("Internal HashMap error: Out of space."); + let mut retval = None; + self.insert_or_replace_with(hash, k, v, |val_ref, val| { + retval = Some(replace(val_ref, val)); + }); + retval } + fn pop(&mut self, k: &K) -> Option { if self.table.size() == 0 { return None @@ -1019,14 +1270,10 @@ impl, V, S, H: Hasher> MutableMap for HashMap let potential_new_size = self.table.size() - 1; self.make_some_room(potential_new_size); - let starting_index = match self.search(k) { - Some(idx) => idx, - None => return None, - }; - - self.pop_internal(starting_index) + self.search_mut(k).map(|bucket| { + pop_internal(bucket) + }) } - } impl HashMap { @@ -1040,7 +1287,8 @@ impl HashMap { /// ``` #[inline] pub fn new() -> HashMap { - HashMap::with_capacity(INITIAL_CAPACITY) + let hasher = RandomSipHasher::new(); + HashMap::with_hasher(hasher) } /// Creates an empty hash map with the given initial capacity. 
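`new` now defers to `with_hasher`, which (in the next hunk) starts from a zero-capacity `RawTable`, with `new_uninitialized` special-casing `capacity == 0`; an empty map therefore allocates nothing until its first insert forces a resize. Today's standard-library `HashMap` keeps this behaviour, so the effect can be demonstrated directly (a sketch using the current std API, not the patch's module):

use std::collections::HashMap;

fn main() {
    let mut m: HashMap<i32, i32> = HashMap::new();
    assert_eq!(m.capacity(), 0);   // zero-capacity table: nothing allocated
    m.insert(1, 1);
    assert!(m.capacity() > 0);     // first insert grew the table
}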
@@ -1075,7 +1323,11 @@ impl, V, S, H: Hasher> HashMap { /// ``` #[inline] pub fn with_hasher(hasher: H) -> HashMap { - HashMap::with_capacity_and_hasher(INITIAL_CAPACITY, hasher) + HashMap { + hasher: hasher, + resize_policy: DefaultResizePolicy::new(INITIAL_CAPACITY), + table: table::RawTable::new(0), + } } /// Create an empty HashMap with space for at least `capacity` @@ -1137,11 +1389,52 @@ impl, V, S, H: Hasher> HashMap { assert!(self.table.size() <= new_capacity); assert!(num::is_power_of_two(new_capacity)); - let old_table = replace(&mut self.table, table::RawTable::new(new_capacity)); - let old_size = old_table.size(); + let mut old_table = replace(&mut self.table, table::RawTable::new(new_capacity)); + let old_size = old_table.size(); - for (h, k, v) in old_table.move_iter() { - self.insert_hashed_nocheck(h, k, v); + if old_table.capacity() == 0 { + return; + } + + if new_capacity < old_table.capacity() { + for (h, k, v) in old_table.move_iter() { + self.insert_hashed_nocheck(h, k, v); + } + } else { + let mut bucket = Bucket::first(&mut old_table); + + loop { + match bucket.peek() { + table::Full(full) => { + if full.distance() == 0 { + bucket = full.into_bucket(); + break; + } + bucket = full.next(); + } + table::Empty(b) => { + bucket = b.next(); + break; + } + }; + } + + loop { + bucket = match bucket.peek() { + table::Full(bucket) => { + { + let t = bucket.table(); + if t.size() == 0 { break } + } + let h = bucket.hash(); + let (b, k, v) = bucket.take(); + self.insert_hashed_ordered(h, k, v); + b.into_bucket() + } + table::Empty(b) => b.into_bucket() + }; + bucket.next(); + } } assert_eq!(self.table.size(), old_size); @@ -1157,7 +1450,7 @@ impl, V, S, H: Hasher> HashMap { debug_assert!(grow_at >= new_size); if cap <= grow_at { - let new_capacity = cap << 1; + let new_capacity = max(cap << 1, INITIAL_CAPACITY); self.resize(new_capacity); } else if shrink_at <= cap { let new_capacity = cap >> 1; @@ -1165,57 +1458,6 @@ impl, V, S, H: Hasher> HashMap { } } - /// Perform robin hood bucket stealing at the given 'index'. You must - /// also pass that probe's "distance to initial bucket" so we don't have - /// to recalculate it, as well as the total number of probes already done - /// so we have some sort of upper bound on the number of probes to do. - /// - /// 'hash', 'k', and 'v' are the elements to robin hood into the hashtable. - fn robin_hood(&mut self, mut index: table::FullIndex, mut dib_param: uint, - mut hash: table::SafeHash, mut k: K, mut v: V) { - 'outer: loop { - let (old_hash, old_key, old_val) = { - let (old_hash_ref, old_key_ref, old_val_ref) = - self.table.read_all_mut(&index); - - let old_hash = replace(old_hash_ref, hash); - let old_key = replace(old_key_ref, k); - let old_val = replace(old_val_ref, v); - - (old_hash, old_key, old_val) - }; - - let mut probe = self.probe_next(index.raw_index()); - - for dib in range(dib_param + 1, self.table.size()) { - let full_index = match self.table.peek(probe) { - table::Empty(idx) => { - // Finally. A hole! - self.table.put(idx, old_hash, old_key, old_val); - return; - }, - table::Full(idx) => idx - }; - - let probe_dib = self.bucket_distance(&full_index); - - // Robin hood! Steal the spot. 
- if probe_dib < dib { - index = full_index; - dib_param = probe_dib; - hash = old_hash; - k = old_key; - v = old_val; - continue 'outer; - } - - probe = self.probe_next(probe); - } - - fail!("HashMap fatal error: 100% load factor?"); - } - } - /// Insert a pre-hashed key-value pair, without first checking /// that there's enough room in the buckets. Returns a reference to the /// newly insert value. @@ -1224,51 +1466,87 @@ impl, V, S, H: Hasher> HashMap { /// and a reference to the existing element will be returned. fn insert_hashed_nocheck<'a>( &'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V { + self.insert_or_replace_with(hash, k, v, |_, _| ()) + } - for dib in range_inclusive(0u, self.table.size()) { - let probe = self.probe(&hash, dib); + fn insert_or_replace_with<'a>( + &'a mut self, hash: table::SafeHash, k: K, v: V, + found_existing: |&mut V, V| + ) -> &'a mut V { - let idx = match self.table.peek(probe) { - table::Empty(idx) => { + // Worst case, we'll find one empty bucket among `size + 1` buckets. + let size = self.table.size(); + let mut rbucket = Bucket::new(&mut self.table, &hash); + let ib = rbucket.index(); + + loop { + let mut bucket = match rbucket.peek() { + table::Empty(bucket) => { // Found a hole! - let fullidx = self.table.put(idx, hash, k, v); - let (_, val) = self.table.read_mut(&fullidx); + let bucket = bucket.put(hash, k, v); + let (_, val) = bucket.into_mut_refs(); return val; }, - table::Full(idx) => idx + table::Full(bucket) => bucket }; - if idx.hash() == hash { - let (bucket_k, bucket_v) = self.table.read_mut(&idx); + if bucket.hash() == hash { + let (bucket_k, bucket_v) = bucket.read_mut(); // FIXME #12147 the conditional return confuses // borrowck if we return bucket_v directly let bv: *mut V = bucket_v; if k == *bucket_k { // Key already exists. Get its reference. + found_existing(bucket_v, v); return unsafe {&mut *bv}; } } - let probe_dib = self.bucket_distance(&idx); + let robin_ib = bucket.index() as int - bucket.distance() as int; - if probe_dib < dib { + if (ib as int) < robin_ib { // Found a luckier bucket than me. Better steal his spot. - self.robin_hood(idx, probe_dib, hash, k, v); + let (mut hash, mut k, mut v) = bucket.replace(hash, k, v); + let robin_index = bucket.index(); + let mut robin_ib = robin_ib as uint; + let mut rbucket = bucket.next(); + loop { + let mut bucket = match rbucket.peek() { + table::Empty(bucket) => { + // Found a hole! + let b = bucket.put(hash, k, v); + // Now that it's stolen, just read the value's pointer + // right out of the table! + let (_, v) = match Bucket::at_index(b.into_table(), robin_index).peek() { + table::Full(b) => b.into_mut_refs(), + _ => fail!() + }; + return v; + }, + table::Full(bucket) => bucket + }; - // Now that it's stolen, just read the value's pointer - // right out of the table! - match self.table.peek(probe) { - table::Empty(_) => fail!("Just stole a spot, but now that spot's empty."), - table::Full(idx) => { - let (_, v) = self.table.read_mut(&idx); - return v; + let probe_ib = bucket.index() - bucket.distance(); + + // Robin hood! Steal the spot. 
+ if robin_ib < probe_ib { + robin_ib = probe_ib; + let (old_hash, old_key, old_val) = bucket.replace(hash, k, v); + hash = old_hash; + k = old_key; + v = old_val; + } + rbucket = bucket.next(); + if rbucket.index() == ib + size + 1 { + fail!("HashMap fatal error: 100% load factor?") } } } + rbucket = bucket.next(); + if rbucket.index() == ib + size + 1 { + fail!("Internal HashMap error: Out of space.") + } } - - // We really shouldn't be here. - fail!("Internal HashMap error: Out of space."); } /// Inserts an element which has already been hashed, returning a reference @@ -1396,17 +1674,19 @@ impl, V, S, H: Hasher> HashMap { not_found: |&K, A| -> V) -> &'a mut V { let hash = self.make_hash(&k); - match self.search_hashed(&hash, &k) { - None => { - let v = not_found(&k, a); - self.insert_hashed(hash, k, v) - }, - Some(idx) => { - let (_, v_ref) = self.table.read_mut(&idx); - found(&k, v_ref, a); - v_ref - } + { + match search_hashed(&mut self.table, &hash, &k) { + Some(bucket) => { + let (_, v_ref) = bucket.into_mut_refs(); + found(&k, v_ref, a); + return v_ref; + } + _ => { + } + }; } + let v = not_found(&k, a); + self.insert_hashed(hash, k, v) } /// Retrieves a value for the given key. @@ -1482,8 +1762,8 @@ impl, V, S, H: Hasher> HashMap { pub fn find_equiv<'a, Q: Hash + Equiv>(&'a self, k: &Q) -> Option<&'a V> { match self.search_equiv(k) { None => None, - Some(idx) => { - let (_, v_ref) = self.table.read(&idx); + Some(bucket) => { + let (_, v_ref) = bucket.into_refs(); Some(v_ref) } } @@ -1543,12 +1823,12 @@ impl, V, S, H: Hasher> HashMap { let potential_new_size = self.table.size() - 1; self.make_some_room(potential_new_size); - let starting_index = match self.search_equiv(k) { - Some(idx) => idx, - None => return None, - }; - - self.pop_internal(starting_index) + match self.search_equiv_mut(k) { + Some(bucket) => { + Some(pop_internal(bucket)) + } + _ => None + } } /// An iterator visiting all keys in arbitrary order. 
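The `insert_or_replace_with` loop above implements the displacement rule the Robin Hood comment describes: when the probing element is further from its ideal bucket than the resident ("poorer"), the two swap, and the displaced resident continues probing. A toy version over raw hashes (illustrative, modern Rust; the real loop also handles key equality and replacement):

fn rh_insert(table: &mut Vec<Option<u64>>, mut hash: u64, mask: usize) {
    let mut idx = hash as usize & mask;
    let mut dist = 0;
    loop {
        match table[idx] {
            None => { table[idx] = Some(hash); return; } // found a hole
            Some(resident) => {
                let their_dist = idx.wrapping_sub(resident as usize) & mask;
                if their_dist < dist {
                    // Robin Hood: steal the richer bucket and carry its
                    // resident onward, adopting the resident's distance
                    table[idx] = Some(hash);
                    hash = resident;
                    dist = their_dist;
                }
            }
        }
        idx = (idx + 1) & mask;
        dist += 1;
    }
}

fn main() {
    let mut t: Vec<Option<u64>> = vec![None; 4];
    for h in [8u64, 9, 8] { rh_insert(&mut t, h, 3); } // ideal buckets: 0, 1, 0
    // the second 8 steals slot 1 from 9 (distance 1 vs 0) and pushes 9 to slot 2
    assert_eq!(t, vec![Some(8), Some(8), Some(9), None]);
}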
@@ -2284,6 +2564,12 @@ mod test_map { } } + impl Clone for Dropable { + fn clone(&self) -> Dropable { + Dropable::new(self.k) + } + } + #[test] fn test_drops() { drop_vector.replace(Some(RefCell::new(Vec::from_elem(200, 0i)))); @@ -2338,6 +2624,66 @@ mod test_map { } } + #[test] + fn test_move_iter_drops() { + drop_vector.replace(Some(RefCell::new(Vec::from_elem(200, 0i)))); + + let hm = { + let mut hm = HashMap::new(); + + let v = drop_vector.get().unwrap(); + for i in range(0u, 200) { + assert_eq!(v.borrow().as_slice()[i], 0); + } + drop(v); + + for i in range(0u, 100) { + let d1 = Dropable::new(i); + let d2 = Dropable::new(i+100); + hm.insert(d1, d2); + } + + let v = drop_vector.get().unwrap(); + for i in range(0u, 200) { + assert_eq!(v.borrow().as_slice()[i], 1); + } + drop(v); + + hm + }; + + drop(hm.clone()); + + { + let mut half = hm.move_iter().take(50); + + let v = drop_vector.get().unwrap(); + for i in range(0u, 200) { + assert_eq!(v.borrow().as_slice()[i], 1); + } + drop(v); + + for _ in half {} + + let v = drop_vector.get().unwrap(); + let nk = range(0u, 100).filter(|&i| { + v.borrow().as_slice()[i] == 1 + }).count(); + + let nv = range(0u, 100).filter(|&i| { + v.borrow().as_slice()[i+100] == 1 + }).count(); + + assert_eq!(nk, 50); + assert_eq!(nv, 50); + }; + + let v = drop_vector.get().unwrap(); + for i in range(0u, 200) { + assert_eq!(v.borrow().as_slice()[i], 0); + } + } + #[test] fn test_empty_pop() { let mut m: HashMap = HashMap::new(); @@ -2491,21 +2837,6 @@ mod test_map { assert_eq!(m.swap(1i, 4i), Some(3)); } - #[test] - fn test_move_iter() { - let hm = { - let mut hm = HashMap::new(); - - hm.insert('a', 1i); - hm.insert('b', 2i); - - hm - }; - - let v = hm.move_iter().collect::>(); - assert!([('a', 1), ('b', 2)] == v.as_slice() || [('b', 2), ('a', 1)] == v.as_slice()); - } - #[test] fn test_iterate() { let mut m = HashMap::with_capacity(4); @@ -2556,6 +2887,26 @@ mod test_map { } } + #[test] + fn test_find_copy() { + let mut m = HashMap::new(); + assert!(m.find(&1i).is_none()); + + for i in range(1i, 10000) { + m.insert(i, i + 7); + match m.find_copy(&i) { + None => fail!(), + Some(v) => assert_eq!(v, i + 7) + } + for j in range(1i, i/100) { + match m.find_copy(&j) { + None => fail!(), + Some(v) => assert_eq!(v, j + 7) + } + } + } + } + #[test] fn test_eq() { let mut m1 = HashMap::new(); @@ -2611,8 +2962,12 @@ mod test_map { let mut m = HashMap::new(); assert_eq!(m.len(), 0); + assert_eq!(m.table.capacity(), 0); assert!(m.is_empty()); + m.insert(0, 0); + m.remove(&0); + assert!(m.is_empty()); let initial_cap = m.table.capacity(); m.reserve(initial_cap * 2); let cap = m.table.capacity(); @@ -2647,9 +3002,9 @@ mod test_map { m.remove(&i); } - assert_eq!(m.table.capacity(), cap); assert_eq!(m.len(), i); assert!(!m.is_empty()); + assert_eq!(m.table.capacity(), cap); } #[test] From fc636ae8f4c44f4594f2191e1fcc7c3cdf4948fd Mon Sep 17 00:00:00 2001 From: Piotr Czarnecki Date: Wed, 16 Jul 2014 00:39:32 +0100 Subject: [PATCH 3/6] std: Split hashmap.rs into modules --- src/libstd/collections/hashmap/bench.rs | 130 ++ .../{hashmap.rs => hashmap/map.rs} | 1673 +---------------- src/libstd/collections/hashmap/mod.rs | 27 + src/libstd/collections/hashmap/set.rs | 696 +++++++ src/libstd/collections/hashmap/table.rs | 877 +++++++++ 5 files changed, 1743 insertions(+), 1660 deletions(-) create mode 100644 src/libstd/collections/hashmap/bench.rs rename src/libstd/collections/{hashmap.rs => hashmap/map.rs} (52%) create mode 100644 src/libstd/collections/hashmap/mod.rs create mode 100644 
src/libstd/collections/hashmap/set.rs create mode 100644 src/libstd/collections/hashmap/table.rs diff --git a/src/libstd/collections/hashmap/bench.rs b/src/libstd/collections/hashmap/bench.rs new file mode 100644 index 00000000000..66d97ba0448 --- /dev/null +++ b/src/libstd/collections/hashmap/bench.rs @@ -0,0 +1,130 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![cfg(test)] + +extern crate test; +use prelude::*; + +use self::test::Bencher; +use iter::{range_inclusive}; + +#[bench] +fn new_drop(b : &mut Bencher) { + use super::HashMap; + + b.iter(|| { + let m : HashMap = HashMap::new(); + assert_eq!(m.len(), 0); + }) +} + +#[bench] +fn new_insert_drop(b : &mut Bencher) { + use super::HashMap; + + b.iter(|| { + let mut m = HashMap::new(); + m.insert(0i, 0i); + assert_eq!(m.len(), 1); + }) +} + +#[bench] +fn insert(b: &mut Bencher) { + use super::HashMap; + + let mut m = HashMap::new(); + + for i in range_inclusive(1i, 1000) { + m.insert(i, i); + } + + let mut k = 1001; + + b.iter(|| { + m.insert(k, k); + k += 1; + }); +} + +#[bench] +fn find_existing(b: &mut Bencher) { + use super::HashMap; + + let mut m = HashMap::new(); + + for i in range_inclusive(1i, 1000) { + m.insert(i, i); + } + + b.iter(|| { + for i in range_inclusive(1i, 1000) { + m.contains_key(&i); + } + }); +} + +#[bench] +fn find_nonexisting(b: &mut Bencher) { + use super::HashMap; + + let mut m = HashMap::new(); + + for i in range_inclusive(1i, 1000) { + m.insert(i, i); + } + + b.iter(|| { + for i in range_inclusive(1001i, 2000) { + m.contains_key(&i); + } + }); +} + +#[bench] +fn hashmap_as_queue(b: &mut Bencher) { + use super::HashMap; + + let mut m = HashMap::new(); + + for i in range_inclusive(1i, 1000) { + m.insert(i, i); + } + + let mut k = 1i; + + b.iter(|| { + m.pop(&k); + m.insert(k + 1000, k + 1000); + k += 1; + }); +} + +#[bench] +fn find_pop_insert(b: &mut Bencher) { + use super::HashMap; + + let mut m = HashMap::new(); + + for i in range_inclusive(1i, 1000) { + m.insert(i, i); + } + + let mut k = 1i; + + b.iter(|| { + m.find(&(k + 400)); + m.find(&(k + 2000)); + m.pop(&k); + m.insert(k + 1000, k + 1000); + k += 1; + }) +} diff --git a/src/libstd/collections/hashmap.rs b/src/libstd/collections/hashmap/map.rs similarity index 52% rename from src/libstd/collections/hashmap.rs rename to src/libstd/collections/hashmap/map.rs index bfe74fed077..7a3779a91a0 100644 --- a/src/libstd/collections/hashmap.rs +++ b/src/libstd/collections/hashmap/map.rs @@ -10,16 +10,15 @@ // // ignore-lexer-test FIXME #15883 -//! 
Unordered containers, implemented as hash-tables (`HashSet` and `HashMap` types) - use clone::Clone; use cmp::{max, Eq, Equiv, PartialEq}; -use collections::{Collection, Mutable, Set, MutableSet, Map, MutableMap}; +use collections::{Collection, Mutable, MutableSet, Map, MutableMap}; use default::Default; use fmt::Show; use fmt; -use hash::{Hash, Hasher, RandomSipHasher}; -use iter::{Iterator, FromIterator, FilterMap, Chain, Repeat, Zip, Extendable, range}; +use RandomSipHasher; +use hash::{Hash, Hasher}; +use iter::{Iterator, FromIterator, Extendable, range}; use iter; use mem::replace; use num; @@ -28,864 +27,11 @@ use option::{Some, None, Option}; use result::{Ok, Err}; use ops::Index; -use self::table::{BucketWithTable, FullBucketImm, RawTable, FullBucket, FullBucketMut, Bucket}; - -mod table { - use clone::Clone; - use cmp; - use hash::{Hash, Hasher}; - use iter::{Iterator, count}; - use mem::{min_align_of, size_of}; - use mem; - use num::{CheckedMul, is_power_of_two}; - use ops::{Deref, Drop}; - use option::{Some, None, Option}; - use ptr::RawPtr; - use ptr::set_memory; - use ptr::write; - use ptr; - use rt::heap::{allocate, deallocate}; - - static EMPTY_BUCKET: u64 = 0u64; - - /// The raw hashtable, providing safe-ish access to the unzipped and highly - /// optimized arrays of hashes, keys, and values. - /// - /// This design uses less memory and is a lot faster than the naive - /// `Vec>`, because we don't pay for the overhead of an - /// option on every element, and we get a generally more cache-aware design. - /// - /// Key invariants of this structure: - /// - /// - if hashes[i] == EMPTY_BUCKET, then keys[i] and vals[i] have - /// 'undefined' contents. Don't read from them. This invariant is - /// enforced outside this module with the `EmptyIndex`, `FullIndex`, - /// and `SafeHash` types. - /// - /// - An `EmptyIndex` is only constructed for a bucket at an index with - /// a hash of EMPTY_BUCKET. - /// - /// - A `FullIndex` is only constructed for a bucket at an index with a - /// non-EMPTY_BUCKET hash. - /// - /// - A `SafeHash` is only constructed for non-`EMPTY_BUCKET` hash. We get - /// around hashes of zero by changing them to 0x8000_0000_0000_0000, - /// which will likely map to the same bucket, while not being confused - /// with "empty". - /// - /// - All three "arrays represented by pointers" are the same length: - /// `capacity`. This is set at creation and never changes. The arrays - /// are unzipped to save space (we don't have to pay for the padding - /// between odd sized elements, such as in a map from u64 to u8), and - /// be more cache aware (scanning through 8 hashes brings in 2 cache - /// lines, since they're all right beside each other). - /// - /// You can kind of think of this module/data structure as a safe wrapper - /// around just the "table" part of the hashtable. It enforces some - /// invariants at the type level and employs some performance trickery, - /// but in general is just a tricked out `Vec>`. - /// - /// FIXME(cgaebel): - /// - /// Feb 11, 2014: This hashtable was just implemented, and, hard as I tried, - /// isn't yet totally safe. There's a "known exploit" that you can create - /// multiple FullIndexes for a bucket, `take` one, and then still `take` - /// the other causing undefined behavior. Currently, there's no story - /// for how to protect against this statically. Therefore, there are asserts - /// on `take`, `get`, `get_mut`, and `put` which check the bucket state. 
- /// With time, and when we're confident this works correctly, they should - /// be removed. Also, the bounds check in `peek` is especially painful, - /// as that's called in the innermost loops of the hashtable and has the - /// potential to be a major performance drain. Remove this too. - /// - /// Or, better than remove, only enable these checks for debug builds. - /// There's currently no "debug-only" asserts in rust, so if you're reading - /// this and going "what? of course there are debug-only asserts!", then - /// please make this use them! - #[unsafe_no_drop_flag] - pub struct RawTable { - capacity: uint, - size: uint, - hashes: *mut u64 - } - - /// A bucket that holds a reference to the table - pub trait BucketWithTable { - /// A bucket that holds a reference to the table - fn table<'a>(&'a self) -> &'a M; - - /// Move out the reference to the table. - fn into_table(self) -> M; - - /// Get the raw index. - fn index(&self) -> uint; - } - - struct RawBucket { - hash: *mut u64, - key: *mut K, - val: *mut V - } - - pub struct Bucket { - raw: RawBucket, - idx: uint, - table: M - } - - pub struct EmptyBucket { - raw: RawBucket, - idx: uint, - table: M - } - - pub struct FullBucket { - raw: RawBucket, - idx: uint, - table: M - } - - pub type EmptyBucketImm<'table,K,V> = EmptyBucket>; - pub type FullBucketImm<'table,K,V> = FullBucket>; - - pub type EmptyBucketMut<'table,K,V> = EmptyBucket>; - pub type FullBucketMut<'table,K,V> = FullBucket>; - - struct GapThenFull { - gap: EmptyBucket, - full: FullBucket - } - - impl>> GapThenFull { - pub fn full<'a>(&'a self) -> &'a FullBucket { - &self.full - } - - pub fn shift(mut self) -> Option> { - unsafe { - *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET); - mem::overwrite(self.gap.raw.key, ptr::read(self.full.raw.key as *const K)); - mem::overwrite(self.gap.raw.val, ptr::read(self.full.raw.val as *const V)); - } - - let FullBucket { raw, idx, .. 
} = self.full; - - match self.full.next().peek() { - Empty(_) => None, - Full(bucket) => { - self.gap.raw = raw; - self.gap.idx = idx; - - self.full = bucket; - self.full.idx &= self.full.table.capacity - 1; - - Some(self) - } - } - } - } - - impl RawPtr for RawBucket { - unsafe fn offset(self, count: int) -> RawBucket { - RawBucket { - hash: self.hash.offset(count), - key: self.key.offset(count), - val: self.val.offset(count), - } - } - - fn null() -> RawBucket { - RawBucket { - hash: RawPtr::null(), - key: RawPtr::null(), - val: RawPtr::null() - } - } - - fn is_null(&self) -> bool { - self.hash.is_null() - } - - fn to_uint(&self) -> uint { - self.hash.to_uint() - } - - unsafe fn to_option(&self) -> Option<&u64> { - self.hash.to_option() - } - } - - impl>> EmptyBucket { - pub fn next(self) -> Bucket { - let mut bucket = self.into_bucket(); - bucket.next(); - bucket - } - - pub fn into_bucket(self) -> Bucket { - Bucket { - raw: self.raw, - idx: self.idx, - table: self.table - } - } - - pub fn gap_peek(self) -> Option> { - let gap = EmptyBucket { - raw: self.raw, - idx: self.idx, - table: () - }; - - match self.next().peek() { - Empty(_) => None, - Full(bucket) => { - Some(GapThenFull { - gap: gap, - full: bucket - }) - } - } - } - } - - impl>> EmptyBucket { - pub fn put(mut self, hash: SafeHash, key: K, value: V) - -> FullBucket { - unsafe { - *self.raw.hash = hash.inspect(); - write(self.raw.key, key); - write(self.raw.val, value); - } - - self.table.size += 1; - - FullBucket { raw: self.raw, idx: self.idx, table: self.table } - } - } - - impl>> FullBucket { - pub fn next(self) -> Bucket { - let mut bucket = self.into_bucket(); - bucket.next(); - bucket - } - - pub fn into_bucket(self) -> Bucket { - Bucket { - raw: self.raw, - idx: self.idx, - table: self.table - } - } - - pub fn distance(&self) -> uint { - (self.idx - self.hash().inspect() as uint) & (self.table.capacity() - 1) - } - - pub fn hash(&self) -> SafeHash { - unsafe { - SafeHash { - hash: *self.raw.hash - } - } - } - - pub fn read<'a>(&'a self) -> (&'a K, &'a V) { - unsafe { - (&*self.raw.key, - &*self.raw.val) - } - } - - pub fn into_refs(self) -> (&K, &V) { - unsafe { - // debug_assert!(*self.raw.hash != EMPTY_BUCKET); - (&*self.raw.key, - &*self.raw.val) - } - } - } - - impl>> FullBucket { - pub fn take(mut self) -> (EmptyBucket, K, V) { - let key = self.raw.key as *const K; - let val = self.raw.val as *const V; - - self.table.size -= 1; - - unsafe { - *self.raw.hash = EMPTY_BUCKET; - ( - EmptyBucket { - raw: self.raw, - idx: self.idx, - table: self.table - }, - ptr::read(key), - ptr::read(val) - ) - } - } - - pub fn replace(&mut self, h: SafeHash, k: K, v: V) -> (SafeHash, K, V) { - unsafe { - let old_hash = ptr::replace(self.raw.hash as *mut SafeHash, h); - let old_key = ptr::replace(self.raw.key, k); - let old_val = ptr::replace(self.raw.val, v); - - (old_hash, old_key, old_val) - } - } - - pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) { - unsafe { - // debug_assert!(*self.raw.hash != EMPTY_BUCKET); - (&mut *self.raw.key, - &mut *self.raw.val) - } - } - - pub fn into_mut_refs(self) -> (&mut K, &mut V) { - unsafe { - // debug_assert!(*self.raw.hash != EMPTY_BUCKET); - (&mut *self.raw.key, - &mut *self.raw.val) - } - } - } - - impl>> Bucket { - pub fn new(table: M, hash: &SafeHash) -> Bucket { - let ib_index = (hash.inspect() as uint) & (table.capacity() - 1); - Bucket { - raw: unsafe { - table.as_mut_ptrs().offset(ib_index as int) - }, - idx: ib_index, - table: table - } - } - - pub fn at_index(table: M, 
ib_index: uint) -> Bucket { - let ib_index = ib_index & (table.capacity() - 1); - Bucket { - raw: unsafe { - table.as_mut_ptrs().offset(ib_index as int) - }, - idx: ib_index, - table: table - } - } - - pub fn first(table: M) -> Bucket { - Bucket { - raw: table.as_mut_ptrs(), - idx: 0, - table: table - } - } - - pub fn peek(self) -> BucketState { - match unsafe { *self.raw.hash } { - EMPTY_BUCKET => - Empty(EmptyBucket { - raw: self.raw, - idx: self.idx, - table: self.table - }), - _ => - Full(FullBucket { - raw: self.raw, - idx: self.idx, - table: self.table - }) - } - } - - pub fn next(&mut self) { - self.idx += 1; - - let dist = if self.idx == self.table.capacity() { - -(self.table.capacity() as int - 1) - } else { - 1i - }; - - unsafe { - self.raw = self.raw.offset(dist); - } - } - } - - impl BucketWithTable for FullBucket { - fn table<'a>(&'a self) -> &'a M { - &self.table - } - - fn into_table(self) -> M { - self.table - } - - fn index(&self) -> uint { - self.idx - } - } - - impl BucketWithTable for EmptyBucket { - fn table<'a>(&'a self) -> &'a M { - &self.table - } - - fn into_table(self) -> M { - self.table - } - - fn index(&self) -> uint { - self.idx - } - } - - impl BucketWithTable for Bucket { - fn table<'a>(&'a self) -> &'a M { - &self.table - } - - fn into_table(self) -> M { - self.table - } - - fn index(&self) -> uint { - self.idx - } - } - - impl<'table,K,V> Deref> for &'table RawTable { - fn deref<'a>(&'a self) -> &'a RawTable { - &**self - } - } - - impl<'table,K,V> Deref> for &'table mut RawTable { - fn deref<'a>(&'a self) -> &'a RawTable { - &**self - } - } - - impl<'table,K,V> DerefMut> for &'table mut RawTable { - fn deref_mut<'a>(&'a mut self) -> &'a mut RawTable { - &mut **self - } - } - - pub enum BucketState { - Empty(EmptyBucket), - Full(FullBucket), - } - - /// A hash that is not zero, since we use a hash of zero to represent empty - /// buckets. - #[deriving(PartialEq)] - pub struct SafeHash { - hash: u64, - } - - impl SafeHash { - /// Peek at the hash value, which is guaranteed to be non-zero. - #[inline(always)] - pub fn inspect(&self) -> u64 { self.hash } - } - - /// We need to remove hashes of 0. That's reserved for empty buckets. - /// This function wraps up `hash_keyed` to be the only way outside this - /// module to generate a SafeHash. - pub fn make_hash, S, H: Hasher>(hasher: &H, t: &T) -> SafeHash { - match hasher.hash(t) { - // This constant is exceedingly likely to hash to the same - // bucket, but it won't be counted as empty! - EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 }, - h => SafeHash { hash: h }, - } - } - - fn round_up_to_next(unrounded: uint, target_alignment: uint) -> uint { - assert!(is_power_of_two(target_alignment)); - (unrounded + target_alignment - 1) & !(target_alignment - 1) - } - - #[test] - fn test_rounding() { - assert_eq!(round_up_to_next(0, 4), 0); - assert_eq!(round_up_to_next(1, 4), 4); - assert_eq!(round_up_to_next(2, 4), 4); - assert_eq!(round_up_to_next(3, 4), 4); - assert_eq!(round_up_to_next(4, 4), 4); - assert_eq!(round_up_to_next(5, 4), 8); - } - - // Returns a tuple of (minimum required malloc alignment, hash_offset, - // key_offset, val_offset, array_size), from the start of a mallocated array. 
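One consequence of the zero hash being reserved: `make_hash` just above folds a raw hash of 0 onto the fixed value 0x8000_0000_0000_0000, so every `SafeHash` is non-zero and 0 can serve exclusively as the empty-bucket marker. A standalone sketch of that remapping (the `safe_hash` helper is illustrative, not std API):

```rust
// Illustrative remapping, mirroring the match in `make_hash`: fold a raw
// hash of 0 (the EMPTY_BUCKET marker) onto one fixed non-zero value.
fn safe_hash(raw: u64) -> u64 {
    match raw {
        0 => 0x8000_0000_0000_0000,
        h => h,
    }
}

fn main() {
    assert_eq!(safe_hash(0), 0x8000_0000_0000_0000);
    assert_eq!(safe_hash(42), 42);
    // The cost is that two raw hashes now collide, but for a 64-bit
    // SipHash output that extra collision is vanishingly unlikely.
}
```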
- fn calculate_offsets( - hash_size: uint, hash_align: uint, - keys_size: uint, keys_align: uint, - vals_size: uint, vals_align: uint) -> (uint, uint, uint, uint, uint) { - - let hash_offset = 0; - let end_of_hashes = hash_offset + hash_size; - - let keys_offset = round_up_to_next(end_of_hashes, keys_align); - let end_of_keys = keys_offset + keys_size; - - let vals_offset = round_up_to_next(end_of_keys, vals_align); - let end_of_vals = vals_offset + vals_size; - - let min_align = cmp::max(hash_align, cmp::max(keys_align, vals_align)); - - (min_align, hash_offset, keys_offset, vals_offset, end_of_vals) - } - - #[test] - fn test_offset_calculation() { - assert_eq!(calculate_offsets(128, 8, 15, 1, 4, 4 ), (8, 0, 128, 144, 148)); - assert_eq!(calculate_offsets(3, 1, 2, 1, 1, 1 ), (1, 0, 3, 5, 6)); - assert_eq!(calculate_offsets(6, 2, 12, 4, 24, 8), (8, 0, 8, 24, 48)); - } - - impl RawTable { - - /// Does not initialize the buckets. The caller should ensure they, - /// at the very least, set every hash to EMPTY_BUCKET. - unsafe fn new_uninitialized(capacity: uint) -> RawTable { - if capacity == 0 { - return RawTable { - size: 0, - capacity: 0, - hashes: 0 as *mut u64, - }; - } - let hashes_size = capacity.checked_mul(&size_of::()) - .expect("capacity overflow"); - let keys_size = capacity.checked_mul(&size_of::< K >()) - .expect("capacity overflow"); - let vals_size = capacity.checked_mul(&size_of::< V >()) - .expect("capacity overflow"); - - // Allocating hashmaps is a little tricky. We need to allocate three - // arrays, but since we know their sizes and alignments up front, - // we just allocate a single array, and then have the subarrays - // point into it. - // - // This is great in theory, but in practice getting the alignment - // right is a little subtle. Therefore, calculating offsets has been - // factored out into a different function. - let (malloc_alignment, hash_offset, _, _, size) = - calculate_offsets( - hashes_size, min_align_of::(), - keys_size, min_align_of::< K >(), - vals_size, min_align_of::< V >()); - - let buffer = allocate(size, malloc_alignment); - - let hashes = buffer.offset(hash_offset as int) as *mut u64; - - RawTable { - capacity: capacity, - size: 0, - hashes: hashes, - } - } - - fn as_mut_ptrs(&self) -> RawBucket { - let hashes_size = self.capacity * size_of::(); - let keys_size = self.capacity * size_of::(); - - let keys_offset = (hashes_size + min_align_of::< K >() - 1) & !(min_align_of::< K >() - 1); - let end_of_keys = keys_offset + keys_size; - - let vals_offset = (end_of_keys + min_align_of::< V >() - 1) & !(min_align_of::< V >() - 1); - - let buffer = self.hashes as *mut u8; - - unsafe { - RawBucket { - hash: self.hashes, - key: buffer.offset(keys_offset as int) as *mut K, - val: buffer.offset(vals_offset as int) as *mut V - } - } - } - - /// Creates a new raw table from a given capacity. All buckets are - /// initially empty. - #[allow(experimental)] - pub fn new(capacity: uint) -> RawTable { - unsafe { - let ret = RawTable::new_uninitialized(capacity); - set_memory(ret.hashes, 0u8, capacity); - ret - } - } - - /// The hashtable's capacity, similar to a vector's. - pub fn capacity(&self) -> uint { - self.capacity - } - - /// The number of elements ever `put` in the hashtable, minus the number - /// of elements ever `take`n. 
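The single-allocation layout that `calculate_offsets` and `as_mut_ptrs` must agree on is easiest to see with concrete numbers. A hedged worked example, assuming a 64-bit target with u64 keys and u8 values at capacity 4 (the sizes are assumptions for illustration, not taken from the patch):

```rust
// Round `x` up to the next multiple of the power-of-two `align`,
// exactly as `round_up_to_next` does.
fn round_up(x: uint, align: uint) -> uint {
    (x + align - 1) & !(align - 1)
}

fn main() {
    let capacity = 4u;
    let hashes_size = capacity * 8; // u64 hashes occupy bytes 0..32
    let keys_size = capacity * 8;   // u64 keys
    let vals_size = capacity * 1;   // u8 values

    let keys_offset = round_up(hashes_size, 8);             // 32
    let vals_offset = round_up(keys_offset + keys_size, 1); // 64
    let size = vals_offset + vals_size;                     // 68

    assert_eq!((keys_offset, vals_offset, size), (32u, 64u, 68u));
    // One 68-byte allocation holds all three arrays; a zipped
    // Vec<Option<(u64, u8)>> would pay alignment padding per element.
}
```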
- pub fn size(&self) -> uint { - self.size - } - - fn ptrs<'a>(&'a self) -> RawBuckets<'a, K, V> { - RawBuckets { - raw: self.as_mut_ptrs(), - hashes_end: unsafe { - self.hashes.offset(self.capacity as int) - } - } - } - - pub fn iter<'a>(&'a self) -> Entries<'a, K, V> { - Entries { - iter: self.ptrs(), - elems_left: self.size(), - } - } - - pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> { - MutEntries { - iter: self.ptrs(), - elems_left: self.size(), - } - } - - pub fn move_iter(self) -> MoveEntries { - MoveEntries { - iter: self.ptrs(), - table: self, - } - } - - pub fn rev_move_buckets<'a>(&'a mut self) -> RevMoveBuckets<'a, K, V> { - let raw_bucket = self.as_mut_ptrs(); - unsafe { - RevMoveBuckets { - raw: raw_bucket.offset(self.capacity as int), - hashes_end: raw_bucket.hash, - elems_left: self.size - } - } - } - } - - pub struct RawBuckets<'a, K, V> { - raw: RawBucket, - hashes_end: *mut u64 - } - - impl<'a, K, V> Iterator> for RawBuckets<'a, K, V> { - fn next(&mut self) -> Option> { - while self.raw.hash != self.hashes_end { - unsafe { - let prev = ptr::replace(&mut self.raw, self.raw.offset(1)); - if *prev.hash != EMPTY_BUCKET { - return Some(prev); - } - } - } - - None - } - } - - pub struct RevMoveBuckets<'a, K, V> { - raw: RawBucket, - hashes_end: *mut u64, - elems_left: uint - } - - impl<'a, K, V> Iterator<(K, V)> for RevMoveBuckets<'a, K, V> { - fn next(&mut self) -> Option<(K, V)> { - if self.elems_left == 0 { - return None; - } - - loop { - debug_assert!(self.raw.hash != self.hashes_end); - - unsafe { - self.raw = self.raw.offset(-1); - - if *self.raw.hash != EMPTY_BUCKET { - self.elems_left -= 1; - return Some(( - ptr::read(self.raw.key as *const K), - ptr::read(self.raw.val as *const V) - )); - } - } - } - } - } - - // `read_all_mut` casts a `*u64` to a `*SafeHash`. Since we statically - // ensure that a `FullIndex` points to an index with a non-zero hash, - // and a `SafeHash` is just a `u64` with a different name, this is - // safe. - // - // This test ensures that a `SafeHash` really IS the same size as a - // `u64`. If you need to change the size of `SafeHash` (and - // consequently made this test fail), `read_all_mut` needs to be - // modified to no longer assume this. - #[test] - fn can_alias_safehash_as_u64() { - assert_eq!(size_of::(), size_of::()) - } - - /// Iterator over shared references to entries in a table. - pub struct Entries<'a, K:'a, V:'a> { - table: &'a RawTable, - idx: uint, - elems_seen: uint, - } - - /// Iterator over mutable references to entries in a table. - pub struct MutEntries<'a, K:'a, V:'a> { - table: &'a mut RawTable, - idx: uint, - elems_seen: uint, - } - - /// Iterator over the entries in a table, consuming the table. 
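The backward direction of `rev_move_buckets` is what lets `Drop` cooperate with a partially consumed `move_iter`: the forward iterator drains from the front, so any survivors sit toward the back, and counting `elems_left` down to zero stops the scan before it revisits emptied slots. A self-contained model of that scan, using a plain array instead of raw pointers (all names illustrative):

```rust
fn main() {
    // 0 plays the role of EMPTY_BUCKET; the front of this table has
    // already been drained by a forward move iterator.
    let hashes = [0u64, 0, 7, 0, 9, 3];
    let mut elems_left = 3u;
    let mut visited = vec![];

    for i in range(0u, hashes.len()).rev() {
        if elems_left == 0 { break } // nothing left to drop
        if hashes[i] != 0 {
            elems_left -= 1;
            visited.push(i); // the real code ptr::reads the key and value
        }
    }

    // Only the three surviving buckets are touched, back to front.
    assert_eq!(visited, vec![5u, 4, 2]);
}
```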
- pub struct MoveEntries { - table: RawTable, - iter: RawBuckets<'static, K, V> - } - - impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> { - fn next(&mut self) -> Option<(&'a K, &'a V)> { - self.iter.next().map(|bucket| { - self.elems_left -= 1; - unsafe { - (&*bucket.key, - &*bucket.val) - } - }) - } - - fn size_hint(&self) -> (uint, Option) { - (self.elems_left, Some(self.elems_left)) - } - } - - impl<'a, K, V> Iterator<(&'a K, &'a mut V)> for MutEntries<'a, K, V> { - fn next(&mut self) -> Option<(&'a K, &'a mut V)> { - self.iter.next().map(|bucket| { - self.elems_left -= 1; - unsafe { - (&*bucket.key, - &mut *bucket.val) - } - }) - } - - fn size_hint(&self) -> (uint, Option) { - (self.elems_left, Some(self.elems_left)) - } - } - - impl Iterator<(SafeHash, K, V)> for MoveEntries { - fn next(&mut self) -> Option<(SafeHash, K, V)> { - self.iter.next().map(|bucket| { - self.table.size -= 1; - unsafe { - ( - SafeHash { - hash: *bucket.hash, - }, - ptr::read(bucket.key as *const K), - ptr::read(bucket.val as *const V) - ) - } - }) - } - - fn size_hint(&self) -> (uint, Option) { - let size = self.table.size(); - (size, Some(size)) - } - } - - impl Clone for RawTable { - fn clone(&self) -> RawTable { - unsafe { - let mut new_ht = RawTable::new_uninitialized(self.capacity()); - - { - let cap = self.capacity(); - let mut new_buckets = Bucket::first(&mut new_ht); - let mut buckets = Bucket::first(self); - while buckets.index() != cap { - match buckets.peek() { - Full(full) => { - let (h, k, v) = { - let (k, v) = full.read(); - (full.hash(), k.clone(), v.clone()) - }; - *new_buckets.raw.hash = h.inspect(); - mem::overwrite(new_buckets.raw.key, k); - mem::overwrite(new_buckets.raw.val, v); - } - _ => { - *new_buckets.raw.hash = EMPTY_BUCKET; - } - } - new_buckets.next(); - buckets.next(); - } - } - - new_ht.size = self.size(); - - new_ht - } - } - } - - #[unsafe_destructor] - impl Drop for RawTable { - fn drop(&mut self) { - if self.hashes.is_null() { - return; - } - // This is in reverse because we're likely to have partially taken - // some elements out with `.move_iter()` from the front. - // Check if the size is 0, so we don't do a useless scan when - // dropping empty tables such as on resize. - // Avoid double free of elements already moved out. - for _ in self.rev_move_buckets() {} - - let hashes_size = self.capacity * size_of::(); - let keys_size = self.capacity * size_of::(); - let vals_size = self.capacity * size_of::(); - let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::(), - keys_size, min_align_of::(), - vals_size, min_align_of::()); - - unsafe { - deallocate(self.hashes as *mut u8, size, align); - // Remember how everything was allocated out of one buffer - // during initialization? We only need one call to free here. - } - - self.hashes = RawPtr::null(); - } - } -} +use super::table::{BucketWithTable, FullBucketImm, RawTable, FullBucket, FullBucketMut, Bucket}; +use super::table; static INITIAL_LOG2_CAP: uint = 5; -static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5 +pub static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5 /// The default behavior of HashMap implements a load factor of 90.9%. 
 /// This behavior is characterized by the following conditions:
@@ -1283,7 +429,7 @@ impl<K: Hash + Eq, V> HashMap<K, V, RandomSipHasher> {
     ///
     /// ```
     /// use std::collections::HashMap;
-    /// let mut map: HashMap<&str, int> = HashMap::with_capacity(10);
+    /// let mut map: HashMap<&str, int> = HashMap::new();
     /// ```
     #[inline]
     pub fn new() -> HashMap<K, V, RandomSipHasher> {
@@ -1365,6 +511,8 @@ impl<K: Hash<S> + Eq, V, S, H: Hasher<S>> HashMap<K, V, H> {
     /// This function has no effect on the operational semantics of the
     /// hashtable, only on performance.
     ///
+    /// # Example
+    ///
     /// ```
     /// use std::collections::HashMap;
     /// let mut map: HashMap<&str, int> = HashMap::new();
@@ -1774,9 +922,9 @@ impl<K: Hash<S> + Eq, V, S, H: Hasher<S>> HashMap<K, V, H> {
     ///
     /// # Example
     ///
-    /// This is a slightly silly example where we define the number's parity as
-    /// the equivalence class. It is important that the values hash the same,
-    /// which is why we override `Hash`.
+    /// This is a slightly silly example where we define the number's
+    /// parity as the equivalence class. It is important that the
+    /// values hash the same, which is why we implement `Hash`.
     ///
     /// ```
     /// use std::collections::HashMap;
@@ -2064,435 +1212,6 @@ impl<K: Hash<S> + Eq, V, S, H: Hasher<S> + Default> Extendable<(K, V)> for HashM
     }
 }
 
-/// HashSet iterator
-pub type SetItems<'a, K> =
-    iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>;
-
-/// HashSet move iterator
-pub type SetMoveItems<K> =
-    iter::Map<'static, (K, ()), K, MoveEntries<K, ()>>;
-
-/// An implementation of a hash set using the underlying representation of a
-/// HashMap where the value is (). As with the `HashMap` type, a `HashSet`
-/// requires that the elements implement the `Eq` and `Hash` traits.
-///
-/// # Example
-///
-/// ```
-/// use std::collections::HashSet;
-///
-/// // Type inference lets us omit an explicit type signature (which
-/// // would be `HashSet<&str>` in this example).
-/// let mut books = HashSet::new();
-///
-/// // Add some books.
-/// books.insert("A Dance With Dragons");
-/// books.insert("To Kill a Mockingbird");
-/// books.insert("The Odyssey");
-/// books.insert("The Great Gatsby");
-///
-/// // Check for a specific one.
-/// if !books.contains(&("The Winds of Winter")) {
-///     println!("We have {} books, but The Winds of Winter ain't one.",
-///              books.len());
-/// }
-///
-/// // Remove a book.
-/// books.remove(&"The Odyssey");
-///
-/// // Iterate over everything.
-/// for book in books.iter() {
-///     println!("{}", *book);
-/// }
-/// ```
-///
-/// The easiest way to use `HashSet` with a custom type is to derive
-/// `Eq` and `Hash`. We must also derive `PartialEq`, this will in the
-/// future be implied by `Eq`.
-///
-/// ```rust
-/// use std::collections::HashSet;
-///
-/// #[deriving(Hash, Eq, PartialEq, Show)]
-/// struct Viking<'a> {
-///     name: &'a str,
-///     power: uint,
-/// }
-///
-/// let mut vikings = HashSet::new();
-///
-/// vikings.insert(Viking { name: "Einar", power: 9u });
-/// vikings.insert(Viking { name: "Einar", power: 9u });
-/// vikings.insert(Viking { name: "Olaf", power: 4u });
-/// vikings.insert(Viking { name: "Harald", power: 8u });
-///
-/// // Use derived implementation to print the vikings.
-/// for x in vikings.iter() {
-///     println!("{}", x);
-/// }
-/// ```
-#[deriving(Clone)]
-pub struct HashSet<T, H = RandomSipHasher> {
-    map: HashMap<T, (), H>
-}
-
-impl<T: Hash + Eq> HashSet<T, RandomSipHasher> {
-    /// Create an empty HashSet.
- /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let mut set: HashSet = HashSet::new(); - /// ``` - #[inline] - pub fn new() -> HashSet { - HashSet::with_capacity(INITIAL_CAPACITY) - } - - /// Create an empty HashSet with space for at least `n` elements in - /// the hash table. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let mut set: HashSet = HashSet::with_capacity(10); - /// ``` - #[inline] - pub fn with_capacity(capacity: uint) -> HashSet { - HashSet { map: HashMap::with_capacity(capacity) } - } -} - -impl, S, H: Hasher> HashSet { - /// Creates a new empty hash set which will use the given hasher to hash - /// keys. - /// - /// The hash set is also created with the default initial capacity. - /// - /// # Example - /// - /// ```rust - /// use std::collections::HashSet; - /// use std::hash::sip::SipHasher; - /// - /// let h = SipHasher::new(); - /// let mut set = HashSet::with_hasher(h); - /// set.insert(2u); - /// ``` - #[inline] - pub fn with_hasher(hasher: H) -> HashSet { - HashSet::with_capacity_and_hasher(INITIAL_CAPACITY, hasher) - } - - /// Create an empty HashSet with space for at least `capacity` - /// elements in the hash table, using `hasher` to hash the keys. - /// - /// Warning: `hasher` is normally randomly generated, and - /// is designed to allow `HashSet`s to be resistant to attacks that - /// cause many collisions and very poor performance. Setting it - /// manually using this function can expose a DoS attack vector. - /// - /// # Example - /// - /// ```rust - /// use std::collections::HashSet; - /// use std::hash::sip::SipHasher; - /// - /// let h = SipHasher::new(); - /// let mut set = HashSet::with_capacity_and_hasher(10u, h); - /// set.insert(1i); - /// ``` - #[inline] - pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashSet { - HashSet { map: HashMap::with_capacity_and_hasher(capacity, hasher) } - } - - /// Reserve space for at least `n` elements in the hash table. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let mut set: HashSet = HashSet::new(); - /// set.reserve(10); - /// ``` - pub fn reserve(&mut self, n: uint) { - self.map.reserve(n) - } - - /// Returns true if the hash set contains a value equivalent to the - /// given query value. - /// - /// # Example - /// - /// This is a slightly silly example where we define the number's - /// parity as the equivalence class. It is important that the - /// values hash the same, which is why we implement `Hash`. 
- /// - /// ```rust - /// use std::collections::HashSet; - /// use std::hash::Hash; - /// use std::hash::sip::SipState; - /// - /// #[deriving(Eq, PartialEq)] - /// struct EvenOrOdd { - /// num: uint - /// }; - /// - /// impl Hash for EvenOrOdd { - /// fn hash(&self, state: &mut SipState) { - /// let parity = self.num % 2; - /// parity.hash(state); - /// } - /// } - /// - /// impl Equiv for EvenOrOdd { - /// fn equiv(&self, other: &EvenOrOdd) -> bool { - /// self.num % 2 == other.num % 2 - /// } - /// } - /// - /// let mut set = HashSet::new(); - /// set.insert(EvenOrOdd { num: 3u }); - /// - /// assert!(set.contains_equiv(&EvenOrOdd { num: 3u })); - /// assert!(set.contains_equiv(&EvenOrOdd { num: 5u })); - /// assert!(!set.contains_equiv(&EvenOrOdd { num: 4u })); - /// assert!(!set.contains_equiv(&EvenOrOdd { num: 2u })); - /// - /// ``` - pub fn contains_equiv + Equiv>(&self, value: &Q) -> bool { - self.map.contains_key_equiv(value) - } - - /// An iterator visiting all elements in arbitrary order. - /// Iterator element type is &'a T. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let mut set = HashSet::new(); - /// set.insert("a"); - /// set.insert("b"); - /// - /// // Will print in an arbitrary order. - /// for x in set.iter() { - /// println!("{}", x); - /// } - /// ``` - pub fn iter<'a>(&'a self) -> SetItems<'a, T> { - self.map.keys() - } - - /// Creates a consuming iterator, that is, one that moves each value out - /// of the set in arbitrary order. The set cannot be used after calling - /// this. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let mut set = HashSet::new(); - /// set.insert("a".to_string()); - /// set.insert("b".to_string()); - /// - /// // Not possible to collect to a Vec with a regular `.iter()`. - /// let v: Vec = set.move_iter().collect(); - /// - /// // Will print in an arbitrary order. - /// for x in v.iter() { - /// println!("{}", x); - /// } - /// ``` - pub fn move_iter(self) -> SetMoveItems { - self.map.move_iter().map(|(k, _)| k) - } - - /// Visit the values representing the difference. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); - /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); - /// - /// // Can be seen as `a - b`. - /// for x in a.difference(&b) { - /// println!("{}", x); // Print 1 - /// } - /// - /// let diff: HashSet = a.difference(&b).map(|&x| x).collect(); - /// assert_eq!(diff, [1i].iter().map(|&x| x).collect()); - /// - /// // Note that difference is not symmetric, - /// // and `b - a` means something else: - /// let diff: HashSet = b.difference(&a).map(|&x| x).collect(); - /// assert_eq!(diff, [4i].iter().map(|&x| x).collect()); - /// ``` - pub fn difference<'a>(&'a self, other: &'a HashSet) -> SetAlgebraItems<'a, T, H> { - Repeat::new(other).zip(self.iter()) - .filter_map(|(other, elt)| { - if !other.contains(elt) { Some(elt) } else { None } - }) - } - - /// Visit the values representing the symmetric difference. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); - /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); - /// - /// // Print 1, 4 in arbitrary order. 
- /// for x in a.symmetric_difference(&b) { - /// println!("{}", x); - /// } - /// - /// let diff1: HashSet = a.symmetric_difference(&b).map(|&x| x).collect(); - /// let diff2: HashSet = b.symmetric_difference(&a).map(|&x| x).collect(); - /// - /// assert_eq!(diff1, diff2); - /// assert_eq!(diff1, [1i, 4].iter().map(|&x| x).collect()); - /// ``` - pub fn symmetric_difference<'a>(&'a self, other: &'a HashSet) - -> Chain, SetAlgebraItems<'a, T, H>> { - self.difference(other).chain(other.difference(self)) - } - - /// Visit the values representing the intersection. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); - /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); - /// - /// // Print 2, 3 in arbitrary order. - /// for x in a.intersection(&b) { - /// println!("{}", x); - /// } - /// - /// let diff: HashSet = a.intersection(&b).map(|&x| x).collect(); - /// assert_eq!(diff, [2i, 3].iter().map(|&x| x).collect()); - /// ``` - pub fn intersection<'a>(&'a self, other: &'a HashSet) - -> SetAlgebraItems<'a, T, H> { - Repeat::new(other).zip(self.iter()) - .filter_map(|(other, elt)| { - if other.contains(elt) { Some(elt) } else { None } - }) - } - - /// Visit the values representing the union. - /// - /// # Example - /// - /// ``` - /// use std::collections::HashSet; - /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); - /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); - /// - /// // Print 1, 2, 3, 4 in arbitrary order. - /// for x in a.union(&b) { - /// println!("{}", x); - /// } - /// - /// let diff: HashSet = a.union(&b).map(|&x| x).collect(); - /// assert_eq!(diff, [1i, 2, 3, 4].iter().map(|&x| x).collect()); - /// ``` - pub fn union<'a>(&'a self, other: &'a HashSet) - -> Chain, SetAlgebraItems<'a, T, H>> { - self.iter().chain(other.difference(self)) - } -} - -impl, S, H: Hasher> PartialEq for HashSet { - fn eq(&self, other: &HashSet) -> bool { - if self.len() != other.len() { return false; } - - self.iter().all(|key| other.contains(key)) - } -} - -impl, S, H: Hasher> Eq for HashSet {} - -impl, S, H: Hasher> Collection for HashSet { - fn len(&self) -> uint { self.map.len() } -} - -impl, S, H: Hasher> Mutable for HashSet { - fn clear(&mut self) { self.map.clear() } -} - -impl, S, H: Hasher> Set for HashSet { - fn contains(&self, value: &T) -> bool { self.map.contains_key(value) } - - fn is_disjoint(&self, other: &HashSet) -> bool { - self.iter().all(|v| !other.contains(v)) - } - - fn is_subset(&self, other: &HashSet) -> bool { - self.iter().all(|v| other.contains(v)) - } -} - -impl, S, H: Hasher> MutableSet for HashSet { - fn insert(&mut self, value: T) -> bool { self.map.insert(value, ()) } - - fn remove(&mut self, value: &T) -> bool { self.map.remove(value) } -} - - -impl + fmt::Show, S, H: Hasher> fmt::Show for HashSet { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - try!(write!(f, "{{")); - - for (i, x) in self.iter().enumerate() { - if i != 0 { try!(write!(f, ", ")); } - try!(write!(f, "{}", *x)); - } - - write!(f, "}}") - } -} - -impl, S, H: Hasher + Default> FromIterator for HashSet { - fn from_iter>(iter: I) -> HashSet { - let (lower, _) = iter.size_hint(); - let mut set = HashSet::with_capacity_and_hasher(lower, Default::default()); - set.extend(iter); - set - } -} - -impl, S, H: Hasher + Default> Extendable for HashSet { - fn extend>(&mut self, mut iter: I) { - for k in iter { - self.insert(k); - } - } -} - -impl, S, H: Hasher + Default> Default for 
HashSet { - fn default() -> HashSet { - HashSet::with_hasher(Default::default()) - } -} - -// `Repeat` is used to feed the filter closure an explicit capture -// of a reference to the other set -/// Set operations iterator -pub type SetAlgebraItems<'a, T, H> = - FilterMap<'static, (&'a HashSet, &'a T), &'a T, - Zip>, SetItems<'a, T>>>; - #[cfg(test)] mod test_map { use prelude::*; @@ -3084,369 +1803,3 @@ mod test_map { map[4]; } } - -#[cfg(test)] -mod test_set { - use prelude::*; - - use super::HashSet; - use slice::ImmutablePartialEqSlice; - use collections::Collection; - - #[test] - fn test_disjoint() { - let mut xs = HashSet::new(); - let mut ys = HashSet::new(); - assert!(xs.is_disjoint(&ys)); - assert!(ys.is_disjoint(&xs)); - assert!(xs.insert(5i)); - assert!(ys.insert(11i)); - assert!(xs.is_disjoint(&ys)); - assert!(ys.is_disjoint(&xs)); - assert!(xs.insert(7)); - assert!(xs.insert(19)); - assert!(xs.insert(4)); - assert!(ys.insert(2)); - assert!(ys.insert(-11)); - assert!(xs.is_disjoint(&ys)); - assert!(ys.is_disjoint(&xs)); - assert!(ys.insert(7)); - assert!(!xs.is_disjoint(&ys)); - assert!(!ys.is_disjoint(&xs)); - } - - #[test] - fn test_subset_and_superset() { - let mut a = HashSet::new(); - assert!(a.insert(0i)); - assert!(a.insert(5)); - assert!(a.insert(11)); - assert!(a.insert(7)); - - let mut b = HashSet::new(); - assert!(b.insert(0i)); - assert!(b.insert(7)); - assert!(b.insert(19)); - assert!(b.insert(250)); - assert!(b.insert(11)); - assert!(b.insert(200)); - - assert!(!a.is_subset(&b)); - assert!(!a.is_superset(&b)); - assert!(!b.is_subset(&a)); - assert!(!b.is_superset(&a)); - - assert!(b.insert(5)); - - assert!(a.is_subset(&b)); - assert!(!a.is_superset(&b)); - assert!(!b.is_subset(&a)); - assert!(b.is_superset(&a)); - } - - #[test] - fn test_iterate() { - let mut a = HashSet::new(); - for i in range(0u, 32) { - assert!(a.insert(i)); - } - let mut observed: u32 = 0; - for k in a.iter() { - observed |= 1 << *k; - } - assert_eq!(observed, 0xFFFF_FFFF); - } - - #[test] - fn test_intersection() { - let mut a = HashSet::new(); - let mut b = HashSet::new(); - - assert!(a.insert(11i)); - assert!(a.insert(1)); - assert!(a.insert(3)); - assert!(a.insert(77)); - assert!(a.insert(103)); - assert!(a.insert(5)); - assert!(a.insert(-5)); - - assert!(b.insert(2i)); - assert!(b.insert(11)); - assert!(b.insert(77)); - assert!(b.insert(-9)); - assert!(b.insert(-42)); - assert!(b.insert(5)); - assert!(b.insert(3)); - - let mut i = 0; - let expected = [3, 5, 11, 77]; - for x in a.intersection(&b) { - assert!(expected.contains(x)); - i += 1 - } - assert_eq!(i, expected.len()); - } - - #[test] - fn test_difference() { - let mut a = HashSet::new(); - let mut b = HashSet::new(); - - assert!(a.insert(1i)); - assert!(a.insert(3)); - assert!(a.insert(5)); - assert!(a.insert(9)); - assert!(a.insert(11)); - - assert!(b.insert(3i)); - assert!(b.insert(9)); - - let mut i = 0; - let expected = [1, 5, 11]; - for x in a.difference(&b) { - assert!(expected.contains(x)); - i += 1 - } - assert_eq!(i, expected.len()); - } - - #[test] - fn test_symmetric_difference() { - let mut a = HashSet::new(); - let mut b = HashSet::new(); - - assert!(a.insert(1i)); - assert!(a.insert(3)); - assert!(a.insert(5)); - assert!(a.insert(9)); - assert!(a.insert(11)); - - assert!(b.insert(-2i)); - assert!(b.insert(3)); - assert!(b.insert(9)); - assert!(b.insert(14)); - assert!(b.insert(22)); - - let mut i = 0; - let expected = [-2, 1, 5, 11, 14, 22]; - for x in a.symmetric_difference(&b) { - assert!(expected.contains(x)); - i 
+= 1 - } - assert_eq!(i, expected.len()); - } - - #[test] - fn test_union() { - let mut a = HashSet::new(); - let mut b = HashSet::new(); - - assert!(a.insert(1i)); - assert!(a.insert(3)); - assert!(a.insert(5)); - assert!(a.insert(9)); - assert!(a.insert(11)); - assert!(a.insert(16)); - assert!(a.insert(19)); - assert!(a.insert(24)); - - assert!(b.insert(-2i)); - assert!(b.insert(1)); - assert!(b.insert(5)); - assert!(b.insert(9)); - assert!(b.insert(13)); - assert!(b.insert(19)); - - let mut i = 0; - let expected = [-2, 1, 3, 5, 9, 11, 13, 16, 19, 24]; - for x in a.union(&b) { - assert!(expected.contains(x)); - i += 1 - } - assert_eq!(i, expected.len()); - } - - #[test] - fn test_from_iter() { - let xs = [1i, 2, 3, 4, 5, 6, 7, 8, 9]; - - let set: HashSet = xs.iter().map(|&x| x).collect(); - - for x in xs.iter() { - assert!(set.contains(x)); - } - } - - #[test] - fn test_move_iter() { - let hs = { - let mut hs = HashSet::new(); - - hs.insert('a'); - hs.insert('b'); - - hs - }; - - let v = hs.move_iter().collect::>(); - assert!(['a', 'b'] == v.as_slice() || ['b', 'a'] == v.as_slice()); - } - - #[test] - fn test_eq() { - // These constants once happened to expose a bug in insert(). - // I'm keeping them around to prevent a regression. - let mut s1 = HashSet::new(); - - s1.insert(1i); - s1.insert(2); - s1.insert(3); - - let mut s2 = HashSet::new(); - - s2.insert(1i); - s2.insert(2); - - assert!(s1 != s2); - - s2.insert(3); - - assert_eq!(s1, s2); - } - - #[test] - fn test_show() { - let mut set: HashSet = HashSet::new(); - let empty: HashSet = HashSet::new(); - - set.insert(1i); - set.insert(2); - - let set_str = format!("{}", set); - - assert!(set_str == "{1, 2}".to_string() || set_str == "{2, 1}".to_string()); - assert_eq!(format!("{}", empty), "{}".to_string()); - } -} - -#[cfg(test)] -mod bench { - extern crate test; - use prelude::*; - - use self::test::Bencher; - use iter::{range_inclusive}; - - #[bench] - fn new_drop(b : &mut Bencher) { - use super::HashMap; - - b.iter(|| { - let m : HashMap = HashMap::new(); - assert_eq!(m.len(), 0); - }) - } - - #[bench] - fn new_insert_drop(b : &mut Bencher) { - use super::HashMap; - - b.iter(|| { - let mut m = HashMap::new(); - m.insert(0i, 0i); - assert_eq!(m.len(), 1); - }) - } - - #[bench] - fn insert(b: &mut Bencher) { - use super::HashMap; - - let mut m = HashMap::new(); - - for i in range_inclusive(1i, 1000) { - m.insert(i, i); - } - - let mut k = 1001; - - b.iter(|| { - m.insert(k, k); - k += 1; - }); - } - - #[bench] - fn find_existing(b: &mut Bencher) { - use super::HashMap; - - let mut m = HashMap::new(); - - for i in range_inclusive(1i, 1000) { - m.insert(i, i); - } - - b.iter(|| { - for i in range_inclusive(1i, 1000) { - m.contains_key(&i); - } - }); - } - - #[bench] - fn find_nonexisting(b: &mut Bencher) { - use super::HashMap; - - let mut m = HashMap::new(); - - for i in range_inclusive(1i, 1000) { - m.insert(i, i); - } - - b.iter(|| { - for i in range_inclusive(1001i, 2000) { - m.contains_key(&i); - } - }); - } - - #[bench] - fn hashmap_as_queue(b: &mut Bencher) { - use super::HashMap; - - let mut m = HashMap::new(); - - for i in range_inclusive(1i, 1000) { - m.insert(i, i); - } - - let mut k = 1i; - - b.iter(|| { - m.pop(&k); - m.insert(k + 1000, k + 1000); - k += 1; - }); - } - - #[bench] - fn find_pop_insert(b: &mut Bencher) { - use super::HashMap; - - let mut m = HashMap::new(); - - for i in range_inclusive(1i, 1000) { - m.insert(i, i); - } - - let mut k = 1i; - - b.iter(|| { - m.find(&(k + 400)); - m.find(&(k + 2000)); - 
m.pop(&k);
-            m.insert(k + 1000, k + 1000);
-            k += 1;
-        })
-    }
-}
diff --git a/src/libstd/collections/hashmap/mod.rs b/src/libstd/collections/hashmap/mod.rs
new file mode 100644
index 00000000000..f493e844526
--- /dev/null
+++ b/src/libstd/collections/hashmap/mod.rs
@@ -0,0 +1,27 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Unordered containers, implemented as hash-tables
+
+pub use self::map::HashMap;
+pub use self::map::Entries;
+pub use self::map::MoveEntries;
+pub use self::map::Keys;
+pub use self::map::Values;
+pub use self::map::INITIAL_CAPACITY;
+pub use self::set::HashSet;
+pub use self::set::SetItems;
+pub use self::set::SetMoveItems;
+pub use self::set::SetAlgebraItems;
+
+mod bench;
+mod map;
+mod set;
+mod table;
diff --git a/src/libstd/collections/hashmap/set.rs b/src/libstd/collections/hashmap/set.rs
new file mode 100644
index 00000000000..a1f71e33303
--- /dev/null
+++ b/src/libstd/collections/hashmap/set.rs
@@ -0,0 +1,696 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// ignore-lexer-test FIXME #15883
+
+use clone::Clone;
+use cmp::{Eq, Equiv, PartialEq};
+use collections::{Collection, Mutable, Set, MutableSet, Map, MutableMap};
+use default::Default;
+use fmt::Show;
+use fmt;
+use hash::{Hash, Hasher, RandomSipHasher};
+use iter::{Iterator, FromIterator, FilterMap, Chain, Repeat, Zip, Extendable};
+use iter;
+use option::{Some, None};
+use result::{Ok, Err};
+
+use super::{HashMap, Entries, MoveEntries, INITIAL_CAPACITY};
+
+/// HashSet iterator
+pub type SetItems<'a, K> =
+    iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>;
+
+/// HashSet move iterator
+pub type SetMoveItems<K> =
+    iter::Map<'static, (K, ()), K, MoveEntries<K, ()>>;
+
+/// An implementation of a hash set using the underlying representation of a
+/// HashMap where the value is (). As with the `HashMap` type, a `HashSet`
+/// requires that the elements implement the `Eq` and `Hash` traits.
+///
+/// # Example
+///
+/// ```
+/// use std::collections::HashSet;
+/// // Type inference lets us omit an explicit type signature (which
+/// // would be `HashSet<&str>` in this example).
+/// let mut books = HashSet::new();
+///
+/// // Add some books.
+/// books.insert("A Dance With Dragons");
+/// books.insert("To Kill a Mockingbird");
+/// books.insert("The Odyssey");
+/// books.insert("The Great Gatsby");
+///
+/// // Check for a specific one.
+/// if !books.contains(&("The Winds of Winter")) {
+///     println!("We have {} books, but The Winds of Winter ain't one.",
+///              books.len());
+/// }
+///
+/// // Remove a book.
+/// books.remove(&"The Odyssey");
+///
+/// // Iterate over everything.
+/// for book in books.iter() {
+///     println!("{}", *book);
+/// }
+/// ```
+///
+/// The easiest way to use `HashSet` with a custom type is to derive
+/// `Eq` and `Hash`. We must also derive `PartialEq`, this will in the
+/// future be implied by `Eq`.
+///
+/// ```
+/// use std::collections::HashSet;
+/// #[deriving(Hash, Eq, PartialEq, Show)]
+/// struct Viking<'a> {
+///     name: &'a str,
+///     power: uint,
+/// }
+///
+/// let mut vikings = HashSet::new();
+///
+/// vikings.insert(Viking { name: "Einar", power: 9u });
+/// vikings.insert(Viking { name: "Einar", power: 9u });
+/// vikings.insert(Viking { name: "Olaf", power: 4u });
+/// vikings.insert(Viking { name: "Harald", power: 8u });
+///
+/// // Use derived implementation to print the vikings.
+/// for x in vikings.iter() {
+///     println!("{}", x);
+/// }
+/// ```
+#[deriving(Clone)]
+pub struct HashSet<T, H = RandomSipHasher> {
+    map: HashMap<T, (), H>
+}
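With the struct in view, the rest of `set.rs` is almost entirely delegation: every set operation forwards to the `HashMap<T, (), H>` inside. A minimal sketch of the pattern, under the assumption (true in this era) that a map `insert` returns `true` for a fresh key; `TinySet` is invented for illustration, not std API:

```rust
use std::collections::HashMap;

// Illustrative only: the delegation pattern set.rs applies to HashMap.
struct TinySet {
    map: HashMap<int, ()>,
}

impl TinySet {
    fn new() -> TinySet { TinySet { map: HashMap::new() } }
    // A map insert that answers "was the key new?" is already a set insert.
    fn insert(&mut self, value: int) -> bool { self.map.insert(value, ()) }
    fn contains(&self, value: &int) -> bool { self.map.contains_key(value) }
    fn len(&self) -> uint { self.map.len() }
}

fn main() {
    let mut s = TinySet::new();
    assert!(s.insert(1i));
    assert!(!s.insert(1i)); // duplicate, just like HashSet::insert
    assert!(s.contains(&1));
    assert_eq!(s.len(), 1);
}
```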
+
+impl<T: Hash + Eq> HashSet<T, RandomSipHasher> {
+    /// Create an empty HashSet.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// let mut set: HashSet<int> = HashSet::new();
+    /// ```
+    #[inline]
+    pub fn new() -> HashSet<T, RandomSipHasher> {
+        HashSet::with_capacity(INITIAL_CAPACITY)
+    }
+
+    /// Create an empty HashSet with space for at least `n` elements in
+    /// the hash table.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// let mut set: HashSet<int> = HashSet::with_capacity(10);
+    /// ```
+    #[inline]
+    pub fn with_capacity(capacity: uint) -> HashSet<T, RandomSipHasher> {
+        HashSet { map: HashMap::with_capacity(capacity) }
+    }
+}
+
+impl<T: Hash<S> + Eq, S, H: Hasher<S>> HashSet<T, H> {
+    /// Creates a new empty hash set which will use the given hasher to hash
+    /// keys.
+    ///
+    /// The hash set is also created with the default initial capacity.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// use std::hash::sip::SipHasher;
+    ///
+    /// let h = SipHasher::new();
+    /// let mut set = HashSet::with_hasher(h);
+    /// set.insert(2u);
+    /// ```
+    #[inline]
+    pub fn with_hasher(hasher: H) -> HashSet<T, H> {
+        HashSet::with_capacity_and_hasher(INITIAL_CAPACITY, hasher)
+    }
+
+    /// Create an empty HashSet with space for at least `capacity`
+    /// elements in the hash table, using `hasher` to hash the keys.
+    ///
+    /// Warning: `hasher` is normally randomly generated, and
+    /// is designed to allow `HashSet`s to be resistant to attacks that
+    /// cause many collisions and very poor performance. Setting it
+    /// manually using this function can expose a DoS attack vector.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// use std::hash::sip::SipHasher;
+    ///
+    /// let h = SipHasher::new();
+    /// let mut set = HashSet::with_capacity_and_hasher(10u, h);
+    /// set.insert(1i);
+    /// ```
+    #[inline]
+    pub fn with_capacity_and_hasher(capacity: uint, hasher: H) -> HashSet<T, H> {
+        HashSet { map: HashMap::with_capacity_and_hasher(capacity, hasher) }
+    }
+
+    /// Reserve space for at least `n` elements in the hash table.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use std::collections::HashSet;
+    /// let mut set: HashSet<int> = HashSet::new();
+    /// set.reserve(10);
+    /// ```
+    pub fn reserve(&mut self, n: uint) {
+        self.map.reserve(n)
+    }
+
+    /// Returns true if the hash set contains a value equivalent to the
+    /// given query value.
+    ///
+    /// # Example
+    ///
+    /// This is a slightly silly example where we define the number's
+    /// parity as the equivalence class. It is important that the
+    /// values hash the same, which is why we implement `Hash`.
+ /// + /// ``` + /// use std::collections::HashSet; + /// use std::hash::Hash; + /// use std::hash::sip::SipState; + /// + /// #[deriving(Eq, PartialEq)] + /// struct EvenOrOdd { + /// num: uint + /// }; + /// + /// impl Hash for EvenOrOdd { + /// fn hash(&self, state: &mut SipState) { + /// let parity = self.num % 2; + /// parity.hash(state); + /// } + /// } + /// + /// impl Equiv for EvenOrOdd { + /// fn equiv(&self, other: &EvenOrOdd) -> bool { + /// self.num % 2 == other.num % 2 + /// } + /// } + /// + /// let mut set = HashSet::new(); + /// set.insert(EvenOrOdd { num: 3u }); + /// + /// assert!(set.contains_equiv(&EvenOrOdd { num: 3u })); + /// assert!(set.contains_equiv(&EvenOrOdd { num: 5u })); + /// assert!(!set.contains_equiv(&EvenOrOdd { num: 4u })); + /// assert!(!set.contains_equiv(&EvenOrOdd { num: 2u })); + /// + /// ``` + pub fn contains_equiv + Equiv>(&self, value: &Q) -> bool { + self.map.contains_key_equiv(value) + } + + /// An iterator visiting all elements in arbitrary order. + /// Iterator element type is &'a T. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let mut set = HashSet::new(); + /// set.insert("a"); + /// set.insert("b"); + /// + /// // Will print in an arbitrary order. + /// for x in set.iter() { + /// println!("{}", x); + /// } + /// ``` + pub fn iter<'a>(&'a self) -> SetItems<'a, T> { + self.map.keys() + } + + /// Creates a consuming iterator, that is, one that moves each value out + /// of the set in arbitrary order. The set cannot be used after calling + /// this. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let mut set = HashSet::new(); + /// set.insert("a".to_string()); + /// set.insert("b".to_string()); + /// + /// // Not possible to collect to a Vec with a regular `.iter()`. + /// let v: Vec = set.move_iter().collect(); + /// + /// // Will print in an arbitrary order. + /// for x in v.iter() { + /// println!("{}", x); + /// } + /// ``` + pub fn move_iter(self) -> SetMoveItems { + self.map.move_iter().map(|(k, _)| k) + } + + /// Visit the values representing the difference. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); + /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); + /// + /// // Can be seen as `a - b`. + /// for x in a.difference(&b) { + /// println!("{}", x); // Print 1 + /// } + /// + /// let diff: HashSet = a.difference(&b).map(|&x| x).collect(); + /// assert_eq!(diff, [1i].iter().map(|&x| x).collect()); + /// + /// // Note that difference is not symmetric, + /// // and `b - a` means something else: + /// let diff: HashSet = b.difference(&a).map(|&x| x).collect(); + /// assert_eq!(diff, [4i].iter().map(|&x| x).collect()); + /// ``` + pub fn difference<'a>(&'a self, other: &'a HashSet) -> SetAlgebraItems<'a, T, H> { + Repeat::new(other).zip(self.iter()) + .filter_map(|(other, elt)| { + if !other.contains(elt) { Some(elt) } else { None } + }) + } + + /// Visit the values representing the symmetric difference. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); + /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); + /// + /// // Print 1, 4 in arbitrary order. 
+ /// for x in a.symmetric_difference(&b) { + /// println!("{}", x); + /// } + /// + /// let diff1: HashSet = a.symmetric_difference(&b).map(|&x| x).collect(); + /// let diff2: HashSet = b.symmetric_difference(&a).map(|&x| x).collect(); + /// + /// assert_eq!(diff1, diff2); + /// assert_eq!(diff1, [1i, 4].iter().map(|&x| x).collect()); + /// ``` + pub fn symmetric_difference<'a>(&'a self, other: &'a HashSet) + -> Chain, SetAlgebraItems<'a, T, H>> { + self.difference(other).chain(other.difference(self)) + } + + /// Visit the values representing the intersection. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); + /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); + /// + /// // Print 2, 3 in arbitrary order. + /// for x in a.intersection(&b) { + /// println!("{}", x); + /// } + /// + /// let diff: HashSet = a.intersection(&b).map(|&x| x).collect(); + /// assert_eq!(diff, [2i, 3].iter().map(|&x| x).collect()); + /// ``` + pub fn intersection<'a>(&'a self, other: &'a HashSet) + -> SetAlgebraItems<'a, T, H> { + Repeat::new(other).zip(self.iter()) + .filter_map(|(other, elt)| { + if other.contains(elt) { Some(elt) } else { None } + }) + } + + /// Visit the values representing the union. + /// + /// # Example + /// + /// ``` + /// use std::collections::HashSet; + /// let a: HashSet = [1i, 2, 3].iter().map(|&x| x).collect(); + /// let b: HashSet = [4i, 2, 3, 4].iter().map(|&x| x).collect(); + /// + /// // Print 1, 2, 3, 4 in arbitrary order. + /// for x in a.union(&b) { + /// println!("{}", x); + /// } + /// + /// let diff: HashSet = a.union(&b).map(|&x| x).collect(); + /// assert_eq!(diff, [1i, 2, 3, 4].iter().map(|&x| x).collect()); + /// ``` + pub fn union<'a>(&'a self, other: &'a HashSet) + -> Chain, SetAlgebraItems<'a, T, H>> { + self.iter().chain(other.difference(self)) + } +} + +impl, S, H: Hasher> PartialEq for HashSet { + fn eq(&self, other: &HashSet) -> bool { + if self.len() != other.len() { return false; } + + self.iter().all(|key| other.contains(key)) + } +} + +impl, S, H: Hasher> Eq for HashSet {} + +impl, S, H: Hasher> Collection for HashSet { + fn len(&self) -> uint { self.map.len() } +} + +impl, S, H: Hasher> Mutable for HashSet { + fn clear(&mut self) { self.map.clear() } +} + +impl, S, H: Hasher> Set for HashSet { + fn contains(&self, value: &T) -> bool { self.map.contains_key(value) } + + fn is_disjoint(&self, other: &HashSet) -> bool { + self.iter().all(|v| !other.contains(v)) + } + + fn is_subset(&self, other: &HashSet) -> bool { + self.iter().all(|v| other.contains(v)) + } +} + +impl, S, H: Hasher> MutableSet for HashSet { + fn insert(&mut self, value: T) -> bool { self.map.insert(value, ()) } + + fn remove(&mut self, value: &T) -> bool { self.map.remove(value) } +} + +impl + fmt::Show, S, H: Hasher> fmt::Show for HashSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + try!(write!(f, "{{")); + + for (i, x) in self.iter().enumerate() { + if i != 0 { try!(write!(f, ", ")); } + try!(write!(f, "{}", *x)); + } + + write!(f, "}}") + } +} + +impl, S, H: Hasher + Default> FromIterator for HashSet { + fn from_iter>(iter: I) -> HashSet { + let (lower, _) = iter.size_hint(); + let mut set = HashSet::with_capacity_and_hasher(lower, Default::default()); + set.extend(iter); + set + } +} + +impl, S, H: Hasher + Default> Extendable for HashSet { + fn extend>(&mut self, mut iter: I) { + for k in iter { + self.insert(k); + } + } +} + +impl, S, H: Hasher + Default> Default for 
HashSet { + fn default() -> HashSet { + HashSet::with_hasher(Default::default()) + } +} + +// `Repeat` is used to feed the filter closure an explicit capture +// of a reference to the other set +/// Set operations iterator +pub type SetAlgebraItems<'a, T, H> = + FilterMap<'static, (&'a HashSet, &'a T), &'a T, + Zip>, SetItems<'a, T>>>; + +#[cfg(test)] +mod test_set { + use prelude::*; + + use super::HashSet; + use slice::ImmutablePartialEqSlice; + use collections::Collection; + + #[test] + fn test_disjoint() { + let mut xs = HashSet::new(); + let mut ys = HashSet::new(); + assert!(xs.is_disjoint(&ys)); + assert!(ys.is_disjoint(&xs)); + assert!(xs.insert(5i)); + assert!(ys.insert(11i)); + assert!(xs.is_disjoint(&ys)); + assert!(ys.is_disjoint(&xs)); + assert!(xs.insert(7)); + assert!(xs.insert(19)); + assert!(xs.insert(4)); + assert!(ys.insert(2)); + assert!(ys.insert(-11)); + assert!(xs.is_disjoint(&ys)); + assert!(ys.is_disjoint(&xs)); + assert!(ys.insert(7)); + assert!(!xs.is_disjoint(&ys)); + assert!(!ys.is_disjoint(&xs)); + } + + #[test] + fn test_subset_and_superset() { + let mut a = HashSet::new(); + assert!(a.insert(0i)); + assert!(a.insert(5)); + assert!(a.insert(11)); + assert!(a.insert(7)); + + let mut b = HashSet::new(); + assert!(b.insert(0i)); + assert!(b.insert(7)); + assert!(b.insert(19)); + assert!(b.insert(250)); + assert!(b.insert(11)); + assert!(b.insert(200)); + + assert!(!a.is_subset(&b)); + assert!(!a.is_superset(&b)); + assert!(!b.is_subset(&a)); + assert!(!b.is_superset(&a)); + + assert!(b.insert(5)); + + assert!(a.is_subset(&b)); + assert!(!a.is_superset(&b)); + assert!(!b.is_subset(&a)); + assert!(b.is_superset(&a)); + } + + #[test] + fn test_iterate() { + let mut a = HashSet::new(); + for i in range(0u, 32) { + assert!(a.insert(i)); + } + let mut observed: u32 = 0; + for k in a.iter() { + observed |= 1 << *k; + } + assert_eq!(observed, 0xFFFF_FFFF); + } + + #[test] + fn test_intersection() { + let mut a = HashSet::new(); + let mut b = HashSet::new(); + + assert!(a.insert(11i)); + assert!(a.insert(1)); + assert!(a.insert(3)); + assert!(a.insert(77)); + assert!(a.insert(103)); + assert!(a.insert(5)); + assert!(a.insert(-5)); + + assert!(b.insert(2i)); + assert!(b.insert(11)); + assert!(b.insert(77)); + assert!(b.insert(-9)); + assert!(b.insert(-42)); + assert!(b.insert(5)); + assert!(b.insert(3)); + + let mut i = 0; + let expected = [3, 5, 11, 77]; + for x in a.intersection(&b) { + assert!(expected.contains(x)); + i += 1 + } + assert_eq!(i, expected.len()); + } + + #[test] + fn test_difference() { + let mut a = HashSet::new(); + let mut b = HashSet::new(); + + assert!(a.insert(1i)); + assert!(a.insert(3)); + assert!(a.insert(5)); + assert!(a.insert(9)); + assert!(a.insert(11)); + + assert!(b.insert(3i)); + assert!(b.insert(9)); + + let mut i = 0; + let expected = [1, 5, 11]; + for x in a.difference(&b) { + assert!(expected.contains(x)); + i += 1 + } + assert_eq!(i, expected.len()); + } + + #[test] + fn test_symmetric_difference() { + let mut a = HashSet::new(); + let mut b = HashSet::new(); + + assert!(a.insert(1i)); + assert!(a.insert(3)); + assert!(a.insert(5)); + assert!(a.insert(9)); + assert!(a.insert(11)); + + assert!(b.insert(-2i)); + assert!(b.insert(3)); + assert!(b.insert(9)); + assert!(b.insert(14)); + assert!(b.insert(22)); + + let mut i = 0; + let expected = [-2, 1, 5, 11, 14, 22]; + for x in a.symmetric_difference(&b) { + assert!(expected.contains(x)); + i += 1 + } + assert_eq!(i, expected.len()); + } + + #[test] + fn test_union() { + let mut a = 
HashSet::new(); + let mut b = HashSet::new(); + + assert!(a.insert(1i)); + assert!(a.insert(3)); + assert!(a.insert(5)); + assert!(a.insert(9)); + assert!(a.insert(11)); + assert!(a.insert(16)); + assert!(a.insert(19)); + assert!(a.insert(24)); + + assert!(b.insert(-2i)); + assert!(b.insert(1)); + assert!(b.insert(5)); + assert!(b.insert(9)); + assert!(b.insert(13)); + assert!(b.insert(19)); + + let mut i = 0; + let expected = [-2, 1, 3, 5, 9, 11, 13, 16, 19, 24]; + for x in a.union(&b) { + assert!(expected.contains(x)); + i += 1 + } + assert_eq!(i, expected.len()); + } + + #[test] + fn test_from_iter() { + let xs = [1i, 2, 3, 4, 5, 6, 7, 8, 9]; + + let set: HashSet = xs.iter().map(|&x| x).collect(); + + for x in xs.iter() { + assert!(set.contains(x)); + } + } + + #[test] + fn test_move_iter() { + let hs = { + let mut hs = HashSet::new(); + + hs.insert('a'); + hs.insert('b'); + + hs + }; + + let v = hs.move_iter().collect::>(); + assert!(['a', 'b'] == v.as_slice() || ['b', 'a'] == v.as_slice()); + } + + #[test] + fn test_eq() { + // These constants once happened to expose a bug in insert(). + // I'm keeping them around to prevent a regression. + let mut s1 = HashSet::new(); + + s1.insert(1i); + s1.insert(2); + s1.insert(3); + + let mut s2 = HashSet::new(); + + s2.insert(1i); + s2.insert(2); + + assert!(s1 != s2); + + s2.insert(3); + + assert_eq!(s1, s2); + } + + #[test] + fn test_show() { + let mut set: HashSet = HashSet::new(); + let empty: HashSet = HashSet::new(); + + set.insert(1i); + set.insert(2); + + let set_str = format!("{}", set); + + assert!(set_str == "{1, 2}".to_string() || set_str == "{2, 1}".to_string()); + assert_eq!(format!("{}", empty), "{}".to_string()); + } +} diff --git a/src/libstd/collections/hashmap/table.rs b/src/libstd/collections/hashmap/table.rs new file mode 100644 index 00000000000..96d1a9ba2fb --- /dev/null +++ b/src/libstd/collections/hashmap/table.rs @@ -0,0 +1,877 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// ignore-lexer-test FIXME #15883 + +use clone::Clone; +use cmp; +use hash::{Hash, Hasher}; +use iter::{Iterator, count}; +use mem::{min_align_of, size_of}; +use mem; +use num::{CheckedMul, is_power_of_two}; +use ops::{Deref, DerefMut, Drop}; +use option::{Some, None, Option}; +use ptr::RawPtr; +use ptr::set_memory; +use ptr::write; +use ptr; +use rt::heap::{allocate, deallocate}; + +static EMPTY_BUCKET: u64 = 0u64; + +/// The raw hashtable, providing safe-ish access to the unzipped and highly +/// optimized arrays of hashes, keys, and values. +/// +/// This design uses less memory and is a lot faster than the naive +/// `Vec>`, because we don't pay for the overhead of an +/// option on every element, and we get a generally more cache-aware design. +/// +/// Key invariants of this structure: +/// +/// - if hashes[i] == EMPTY_BUCKET, then keys[i] and vals[i] have +/// 'undefined' contents. Don't read from them. This invariant is +/// enforced outside this module with the `EmptyIndex`, `FullIndex`, +/// and `SafeHash` types. +/// +/// - An `EmptyIndex` is only constructed for a bucket at an index with +/// a hash of EMPTY_BUCKET. +/// +/// - A `FullIndex` is only constructed for a bucket at an index with a +/// non-EMPTY_BUCKET hash. 
+/// +/// - A `SafeHash` is only constructed for non-`EMPTY_BUCKET` hash. We get +/// around hashes of zero by changing them to 0x8000_0000_0000_0000, +/// which will likely map to the same bucket, while not being confused +/// with "empty". +/// +/// - All three "arrays represented by pointers" are the same length: +/// `capacity`. This is set at creation and never changes. The arrays +/// are unzipped to save space (we don't have to pay for the padding +/// between odd sized elements, such as in a map from u64 to u8), and +/// be more cache aware (scanning through 8 hashes brings in 2 cache +/// lines, since they're all right beside each other). +/// +/// You can kind of think of this module/data structure as a safe wrapper +/// around just the "table" part of the hashtable. It enforces some +/// invariants at the type level and employs some performance trickery, +/// but in general is just a tricked out `Vec>`. +/// +/// FIXME(cgaebel): +/// +/// Feb 11, 2014: This hashtable was just implemented, and, hard as I tried, +/// isn't yet totally safe. There's a "known exploit" that you can create +/// multiple FullIndexes for a bucket, `take` one, and then still `take` +/// the other causing undefined behavior. Currently, there's no story +/// for how to protect against this statically. Therefore, there are asserts +/// on `take`, `get`, `get_mut`, and `put` which check the bucket state. +/// With time, and when we're confident this works correctly, they should +/// be removed. Also, the bounds check in `peek` is especially painful, +/// as that's called in the innermost loops of the hashtable and has the +/// potential to be a major performance drain. Remove this too. +/// +/// Or, better than remove, only enable these checks for debug builds. +/// There's currently no "debug-only" asserts in rust, so if you're reading +/// this and going "what? of course there are debug-only asserts!", then +/// please make this use them! +#[unsafe_no_drop_flag] +pub struct RawTable { + capacity: uint, + size: uint, + hashes: *mut u64 +} + +/// A bucket that holds a reference to the table +pub trait BucketWithTable { + /// A bucket that holds a reference to the table + fn table<'a>(&'a self) -> &'a M; + + /// Move out the reference to the table. + fn into_table(self) -> M; + + /// Get the raw index. + fn index(&self) -> uint; +} + +struct RawBucket { + hash: *mut u64, + key: *mut K, + val: *mut V +} + +pub struct Bucket { + raw: RawBucket, + idx: uint, + table: M +} + +pub struct EmptyBucket { + raw: RawBucket, + idx: uint, + table: M +} + +pub struct FullBucket { + raw: RawBucket, + idx: uint, + table: M +} + +pub type EmptyBucketImm<'table,K,V> = EmptyBucket>; +pub type FullBucketImm<'table,K,V> = FullBucket>; + +pub type EmptyBucketMut<'table,K,V> = EmptyBucket>; +pub type FullBucketMut<'table,K,V> = FullBucket>; + +struct GapThenFull { + gap: EmptyBucket, + full: FullBucket +} + +impl>> GapThenFull { + pub fn full<'a>(&'a self) -> &'a FullBucket { + &self.full + } + + pub fn shift(mut self) -> Option> { + unsafe { + *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET); + mem::overwrite(self.gap.raw.key, ptr::read(self.full.raw.key as *const K)); + mem::overwrite(self.gap.raw.val, ptr::read(self.full.raw.val as *const V)); + } + + let FullBucket { raw, idx, .. 
} = self.full; + + match self.full.next().peek() { + Empty(_) => None, + Full(bucket) => { + self.gap.raw = raw; + self.gap.idx = idx; + + self.full = bucket; + self.full.idx &= self.full.table.capacity - 1; + + Some(self) + } + } + } +} + +impl RawPtr for RawBucket { + unsafe fn offset(self, count: int) -> RawBucket { + RawBucket { + hash: self.hash.offset(count), + key: self.key.offset(count), + val: self.val.offset(count), + } + } + + fn null() -> RawBucket { + RawBucket { + hash: RawPtr::null(), + key: RawPtr::null(), + val: RawPtr::null() + } + } + + fn is_null(&self) -> bool { + self.hash.is_null() + } + + fn to_uint(&self) -> uint { + self.hash.to_uint() + } + + unsafe fn to_option(&self) -> Option<&u64> { + self.hash.to_option() + } +} + +impl>> EmptyBucket { + pub fn next(self) -> Bucket { + let mut bucket = self.into_bucket(); + bucket.next(); + bucket + } + + pub fn into_bucket(self) -> Bucket { + Bucket { + raw: self.raw, + idx: self.idx, + table: self.table + } + } + + pub fn gap_peek(self) -> Option> { + let gap = EmptyBucket { + raw: self.raw, + idx: self.idx, + table: () + }; + + match self.next().peek() { + Empty(_) => None, + Full(bucket) => { + Some(GapThenFull { + gap: gap, + full: bucket + }) + } + } + } +} + +impl>> EmptyBucket { + pub fn put(mut self, hash: SafeHash, key: K, value: V) + -> FullBucket { + unsafe { + *self.raw.hash = hash.inspect(); + write(self.raw.key, key); + write(self.raw.val, value); + } + + self.table.size += 1; + + FullBucket { raw: self.raw, idx: self.idx, table: self.table } + } +} + +impl>> FullBucket { + pub fn next(self) -> Bucket { + let mut bucket = self.into_bucket(); + bucket.next(); + bucket + } + + pub fn into_bucket(self) -> Bucket { + Bucket { + raw: self.raw, + idx: self.idx, + table: self.table + } + } + + pub fn distance(&self) -> uint { + (self.idx - self.hash().inspect() as uint) & (self.table.capacity() - 1) + } + + pub fn hash(&self) -> SafeHash { + unsafe { + SafeHash { + hash: *self.raw.hash + } + } + } + + pub fn read<'a>(&'a self) -> (&'a K, &'a V) { + unsafe { + (&*self.raw.key, + &*self.raw.val) + } + } + + pub fn into_refs(self) -> (&K, &V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&*self.raw.key, + &*self.raw.val) + } + } +} + +impl>> FullBucket { + pub fn take(mut self) -> (EmptyBucket, K, V) { + let key = self.raw.key as *const K; + let val = self.raw.val as *const V; + + self.table.size -= 1; + + unsafe { + *self.raw.hash = EMPTY_BUCKET; + ( + EmptyBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }, + ptr::read(key), + ptr::read(val) + ) + } + } + + pub fn replace(&mut self, h: SafeHash, k: K, v: V) -> (SafeHash, K, V) { + unsafe { + let old_hash = ptr::replace(self.raw.hash as *mut SafeHash, h); + let old_key = ptr::replace(self.raw.key, k); + let old_val = ptr::replace(self.raw.val, v); + + (old_hash, old_key, old_val) + } + } + + pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&mut *self.raw.key, + &mut *self.raw.val) + } + } + + pub fn into_mut_refs(self) -> (&mut K, &mut V) { + unsafe { + // debug_assert!(*self.raw.hash != EMPTY_BUCKET); + (&mut *self.raw.key, + &mut *self.raw.val) + } + } +} + +impl>> Bucket { + pub fn new(table: M, hash: &SafeHash) -> Bucket { + let ib_index = (hash.inspect() as uint) & (table.capacity() - 1); + Bucket { + raw: unsafe { + table.as_mut_ptrs().offset(ib_index as int) + }, + idx: ib_index, + table: table + } + } + + pub fn at_index(table: M, ib_index: uint) -> 
Bucket { + let ib_index = ib_index & (table.capacity() - 1); + Bucket { + raw: unsafe { + table.as_mut_ptrs().offset(ib_index as int) + }, + idx: ib_index, + table: table + } + } + + pub fn first(table: M) -> Bucket { + Bucket { + raw: table.as_mut_ptrs(), + idx: 0, + table: table + } + } + + pub fn peek(self) -> BucketState { + match unsafe { *self.raw.hash } { + EMPTY_BUCKET => + Empty(EmptyBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }), + _ => + Full(FullBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }) + } + } + + pub fn next(&mut self) { + self.idx += 1; + + let dist = if self.idx == self.table.capacity() { + -(self.table.capacity() as int - 1) + } else { + 1i + }; + + unsafe { + self.raw = self.raw.offset(dist); + } + } +} + +impl BucketWithTable for FullBucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } +} + +impl BucketWithTable for EmptyBucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } +} + +impl BucketWithTable for Bucket { + fn table<'a>(&'a self) -> &'a M { + &self.table + } + + fn into_table(self) -> M { + self.table + } + + fn index(&self) -> uint { + self.idx + } +} + +impl<'table,K,V> Deref> for &'table RawTable { + fn deref<'a>(&'a self) -> &'a RawTable { + &**self + } +} + +impl<'table,K,V> Deref> for &'table mut RawTable { + fn deref<'a>(&'a self) -> &'a RawTable { + &**self + } +} + +impl<'table,K,V> DerefMut> for &'table mut RawTable { + fn deref_mut<'a>(&'a mut self) -> &'a mut RawTable { + &mut **self + } +} + +pub enum BucketState { + Empty(EmptyBucket), + Full(FullBucket), +} + +/// A hash that is not zero, since we use a hash of zero to represent empty +/// buckets. +#[deriving(PartialEq)] +pub struct SafeHash { + hash: u64, +} + +impl SafeHash { + /// Peek at the hash value, which is guaranteed to be non-zero. + #[inline(always)] + pub fn inspect(&self) -> u64 { self.hash } +} + +/// We need to remove hashes of 0. That's reserved for empty buckets. +/// This function wraps up `hash_keyed` to be the only way outside this +/// module to generate a SafeHash. +pub fn make_hash, S, H: Hasher>(hasher: &H, t: &T) -> SafeHash { + match hasher.hash(t) { + // This constant is exceedingly likely to hash to the same + // bucket, but it won't be counted as empty! + EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 }, + h => SafeHash { hash: h }, + } +} + +fn round_up_to_next(unrounded: uint, target_alignment: uint) -> uint { + assert!(is_power_of_two(target_alignment)); + (unrounded + target_alignment - 1) & !(target_alignment - 1) +} + +#[test] +fn test_rounding() { + assert_eq!(round_up_to_next(0, 4), 0); + assert_eq!(round_up_to_next(1, 4), 4); + assert_eq!(round_up_to_next(2, 4), 4); + assert_eq!(round_up_to_next(3, 4), 4); + assert_eq!(round_up_to_next(4, 4), 4); + assert_eq!(round_up_to_next(5, 4), 8); +} + +// Returns a tuple of (minimum required malloc alignment, hash_offset, +// key_offset, val_offset, array_size), from the start of a mallocated array. 
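+// For example: a table of capacity 2 with K = u8 and V = u64 stores hashes +// of size 16 (align 8), keys of size 2 (align 1), and vals of size 16 +// (align 8); calculate_offsets(16, 8, 2, 1, 16, 8) returns (8, 0, 16, 24, 40), +// padding the vals subarray from byte 18 up to byte 24 to satisfy its +// alignment.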
+fn calculate_offsets( + hash_size: uint, hash_align: uint, + keys_size: uint, keys_align: uint, + vals_size: uint, vals_align: uint) -> (uint, uint, uint, uint, uint) { + + let hash_offset = 0; + let end_of_hashes = hash_offset + hash_size; + + let keys_offset = round_up_to_next(end_of_hashes, keys_align); + let end_of_keys = keys_offset + keys_size; + + let vals_offset = round_up_to_next(end_of_keys, vals_align); + let end_of_vals = vals_offset + vals_size; + + let min_align = cmp::max(hash_align, cmp::max(keys_align, vals_align)); + + (min_align, hash_offset, keys_offset, vals_offset, end_of_vals) +} + +#[test] +fn test_offset_calculation() { + assert_eq!(calculate_offsets(128, 8, 15, 1, 4, 4 ), (8, 0, 128, 144, 148)); + assert_eq!(calculate_offsets(3, 1, 2, 1, 1, 1 ), (1, 0, 3, 5, 6)); + assert_eq!(calculate_offsets(6, 2, 12, 4, 24, 8), (8, 0, 8, 24, 48)); +} + +impl RawTable { + + /// Does not initialize the buckets. The caller should ensure they, + /// at the very least, set every hash to EMPTY_BUCKET. + unsafe fn new_uninitialized(capacity: uint) -> RawTable { + if capacity == 0 { + return RawTable { + size: 0, + capacity: 0, + hashes: 0 as *mut u64, + }; + } + let hashes_size = capacity.checked_mul(&size_of::()) + .expect("capacity overflow"); + let keys_size = capacity.checked_mul(&size_of::< K >()) + .expect("capacity overflow"); + let vals_size = capacity.checked_mul(&size_of::< V >()) + .expect("capacity overflow"); + + // Allocating hashmaps is a little tricky. We need to allocate three + // arrays, but since we know their sizes and alignments up front, + // we just allocate a single array, and then have the subarrays + // point into it. + // + // This is great in theory, but in practice getting the alignment + // right is a little subtle. Therefore, calculating offsets has been + // factored out into a different function. + let (malloc_alignment, hash_offset, _, _, size) = + calculate_offsets( + hashes_size, min_align_of::(), + keys_size, min_align_of::< K >(), + vals_size, min_align_of::< V >()); + + let buffer = allocate(size, malloc_alignment); + + let hashes = buffer.offset(hash_offset as int) as *mut u64; + + RawTable { + capacity: capacity, + size: 0, + hashes: hashes, + } + } + + fn as_mut_ptrs(&self) -> RawBucket { + let hashes_size = self.capacity * size_of::(); + let keys_size = self.capacity * size_of::(); + + let keys_offset = (hashes_size + min_align_of::< K >() - 1) & !(min_align_of::< K >() - 1); + let end_of_keys = keys_offset + keys_size; + + let vals_offset = (end_of_keys + min_align_of::< V >() - 1) & !(min_align_of::< V >() - 1); + + let buffer = self.hashes as *mut u8; + + unsafe { + RawBucket { + hash: self.hashes, + key: buffer.offset(keys_offset as int) as *mut K, + val: buffer.offset(vals_offset as int) as *mut V + } + } + } + + /// Creates a new raw table from a given capacity. All buckets are + /// initially empty. + #[allow(experimental)] + pub fn new(capacity: uint) -> RawTable { + unsafe { + let ret = RawTable::new_uninitialized(capacity); + set_memory(ret.hashes, 0u8, capacity); + ret + } + } + + /// The hashtable's capacity, similar to a vector's. + pub fn capacity(&self) -> uint { + self.capacity + } + + /// The number of elements ever `put` in the hashtable, minus the number + /// of elements ever `take`n. 
+ pub fn size(&self) -> uint { + self.size + } + + fn ptrs<'a>(&'a self) -> RawBuckets<'a, K, V> { + RawBuckets { + raw: self.as_mut_ptrs(), + hashes_end: unsafe { + self.hashes.offset(self.capacity as int) + } + } + } + + pub fn iter<'a>(&'a self) -> Entries<'a, K, V> { + Entries { + iter: self.ptrs(), + elems_left: self.size(), + } + } + + pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> { + MutEntries { + iter: self.ptrs(), + elems_left: self.size(), + } + } + + pub fn move_iter(self) -> MoveEntries { + MoveEntries { + iter: self.ptrs(), + table: self, + } + } + + pub fn rev_move_buckets<'a>(&'a mut self) -> RevMoveBuckets<'a, K, V> { + let raw_bucket = self.as_mut_ptrs(); + unsafe { + RevMoveBuckets { + raw: raw_bucket.offset(self.capacity as int), + hashes_end: raw_bucket.hash, + elems_left: self.size + } + } + } +} + +pub struct RawBuckets<'a, K, V> { + raw: RawBucket, + hashes_end: *mut u64 +} + +impl<'a, K, V> Iterator> for RawBuckets<'a, K, V> { + fn next(&mut self) -> Option> { + while self.raw.hash != self.hashes_end { + unsafe { + let prev = ptr::replace(&mut self.raw, self.raw.offset(1)); + if *prev.hash != EMPTY_BUCKET { + return Some(prev); + } + } + } + + None + } +} + +pub struct RevMoveBuckets<'a, K, V> { + raw: RawBucket, + hashes_end: *mut u64, + elems_left: uint +} + +impl<'a, K, V> Iterator<(K, V)> for RevMoveBuckets<'a, K, V> { + fn next(&mut self) -> Option<(K, V)> { + if self.elems_left == 0 { + return None; + } + + loop { + debug_assert!(self.raw.hash != self.hashes_end); + + unsafe { + self.raw = self.raw.offset(-1); + + if *self.raw.hash != EMPTY_BUCKET { + self.elems_left -= 1; + return Some(( + ptr::read(self.raw.key as *const K), + ptr::read(self.raw.val as *const V) + )); + } + } + } + } +} + +// `read_all_mut` casts a `*u64` to a `*SafeHash`. Since we statically +// ensure that a `FullIndex` points to an index with a non-zero hash, +// and a `SafeHash` is just a `u64` with a different name, this is +// safe. +// +// This test ensures that a `SafeHash` really IS the same size as a +// `u64`. If you need to change the size of `SafeHash` (and +// consequently made this test fail), `read_all_mut` needs to be +// modified to no longer assume this. +#[test] +fn can_alias_safehash_as_u64() { + assert_eq!(size_of::(), size_of::()) +} + +/// Note: stage0-specific version that lacks bound. +#[cfg(stage0)] +pub struct Entries<'a, K, V> { + iter: RawBuckets<'a, K, V>, + elems_left: uint, +} + +/// Iterator over shared references to entries in a table. +#[cfg(not(stage0))] +pub struct Entries<'a, K: 'a, V: 'a> { + iter: RawBuckets<'a, K, V>, + elems_left: uint, +} + +/// Note: stage0-specific version that lacks bound. +#[cfg(stage0)] +pub struct MutEntries<'a, K, V> { + iter: RawBuckets<'a, K, V>, + elems_left: uint, +} + +/// Iterator over mutable references to entries in a table. +#[cfg(not(stage0))] +pub struct MutEntries<'a, K: 'a, V: 'a> { + iter: RawBuckets<'a, K, V>, + elems_left: uint, +} + +/// Iterator over the entries in a table, consuming the table. 
+pub struct MoveEntries { + table: RawTable, + iter: RawBuckets<'static, K, V> +} + +impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> { + fn next(&mut self) -> Option<(&'a K, &'a V)> { + self.iter.next().map(|bucket| { + self.elems_left -= 1; + unsafe { + (&*bucket.key, + &*bucket.val) + } + }) + } + + fn size_hint(&self) -> (uint, Option) { + (self.elems_left, Some(self.elems_left)) + } +} + +impl<'a, K, V> Iterator<(&'a K, &'a mut V)> for MutEntries<'a, K, V> { + fn next(&mut self) -> Option<(&'a K, &'a mut V)> { + self.iter.next().map(|bucket| { + self.elems_left -= 1; + unsafe { + (&*bucket.key, + &mut *bucket.val) + } + }) + } + + fn size_hint(&self) -> (uint, Option) { + (self.elems_left, Some(self.elems_left)) + } +} + +impl Iterator<(SafeHash, K, V)> for MoveEntries { + fn next(&mut self) -> Option<(SafeHash, K, V)> { + self.iter.next().map(|bucket| { + self.table.size -= 1; + unsafe { + ( + SafeHash { + hash: *bucket.hash, + }, + ptr::read(bucket.key as *const K), + ptr::read(bucket.val as *const V) + ) + } + }) + } + + fn size_hint(&self) -> (uint, Option) { + let size = self.table.size(); + (size, Some(size)) + } +} + +impl Clone for RawTable { + fn clone(&self) -> RawTable { + unsafe { + let mut new_ht = RawTable::new_uninitialized(self.capacity()); + + { + let cap = self.capacity(); + let mut new_buckets = Bucket::first(&mut new_ht); + let mut buckets = Bucket::first(self); + while buckets.index() != cap { + match buckets.peek() { + Full(full) => { + let (h, k, v) = { + let (k, v) = full.read(); + (full.hash(), k.clone(), v.clone()) + }; + *new_buckets.raw.hash = h.inspect(); + mem::overwrite(new_buckets.raw.key, k); + mem::overwrite(new_buckets.raw.val, v); + } + _ => { + *new_buckets.raw.hash = EMPTY_BUCKET; + } + } + new_buckets.next(); + buckets.next(); + } + } + + new_ht.size = self.size(); + + new_ht + } + } +} + +#[unsafe_destructor] +impl Drop for RawTable { + fn drop(&mut self) { + if self.hashes.is_null() { + return; + } + // This is in reverse because we're likely to have partially taken + // some elements out with `.move_iter()` from the front. + // Check if the size is 0, so we don't do a useless scan when + // dropping empty tables such as on resize. + // Avoid double free of elements already moved out. + for _ in self.rev_move_buckets() {} + + let hashes_size = self.capacity * size_of::(); + let keys_size = self.capacity * size_of::(); + let vals_size = self.capacity * size_of::(); + let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::(), + keys_size, min_align_of::(), + vals_size, min_align_of::()); + + unsafe { + deallocate(self.hashes as *mut u8, size, align); + // Remember how everything was allocated out of one buffer + // during initialization? We only need one call to free here. 
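+ // The sizes recomputed above feed the same calculate_offsets call that + // new_uninitialized used, so `size` and `align` match the original + // allocation exactly.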
+ } + + self.hashes = RawPtr::null(); + } +} From ae7342a56a24eac539e3d4b13cd49c6719908426 Mon Sep 17 00:00:00 2001 From: Piotr Czarnecki Date: Tue, 15 Jul 2014 21:58:35 +0100 Subject: [PATCH 4/6] std: Refine and document HashMap's code * branchless `bucket.next()` * robin_hood is a free function * fixed the resize policy that was off by one * documented the growth algorithm * updated documentation after interface changes * removed old fixmes --- src/libstd/collections/hashmap/bench.rs | 2 +- src/libstd/collections/hashmap/map.rs | 562 ++++++++++++++------- src/libstd/collections/hashmap/mod.rs | 1 + src/libstd/collections/hashmap/set.rs | 23 +- src/libstd/collections/hashmap/table.rs | 624 ++++++++++++------------ 5 files changed, 717 insertions(+), 495 deletions(-) diff --git a/src/libstd/collections/hashmap/bench.rs b/src/libstd/collections/hashmap/bench.rs index 66d97ba0448..21bbb38f489 100644 --- a/src/libstd/collections/hashmap/bench.rs +++ b/src/libstd/collections/hashmap/bench.rs @@ -38,7 +38,7 @@ fn new_insert_drop(b : &mut Bencher) { } #[bench] -fn insert(b: &mut Bencher) { +fn grow_by_insertion(b: &mut Bencher) { use super::HashMap; let mut m = HashMap::new(); diff --git a/src/libstd/collections/hashmap/map.rs b/src/libstd/collections/hashmap/map.rs index 7a3779a91a0..a50c6a59f7e 100644 --- a/src/libstd/collections/hashmap/map.rs +++ b/src/libstd/collections/hashmap/map.rs @@ -16,19 +16,27 @@ use collections::{Collection, Mutable, MutableSet, Map, MutableMap}; use default::Default; use fmt::Show; use fmt; -use RandomSipHasher; -use hash::{Hash, Hasher}; -use iter::{Iterator, FromIterator, Extendable, range}; +use hash::{Hash, Hasher, RandomSipHasher}; +use iter::{Iterator, FromIterator, Extendable}; use iter; use mem::replace; use num; -use ops::Deref; +use ops::{Deref, DerefMut}; use option::{Some, None, Option}; use result::{Ok, Err}; use ops::Index; -use super::table::{BucketWithTable, FullBucketImm, RawTable, FullBucket, FullBucketMut, Bucket}; use super::table; +use super::table::{ + Bucket, + Empty, + Full, + FullBucket, + FullBucketImm, + FullBucketMut, + RawTable, + SafeHash +}; static INITIAL_LOG2_CAP: uint = 5; pub static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5 @@ -36,8 +44,9 @@ pub static INITIAL_CAPACITY: uint = 1 << INITIAL_LOG2_CAP; // 2^5 /// The default behavior of HashMap implements a load factor of 90.9%. /// This behavior is characterized by the following conditions: /// -/// - if `size * 1.1 < cap < size * 4` then shouldn't resize -/// - if `cap < minimum_capacity * 2` then shouldn't shrink +/// - if size > 0.909 * capacity: grow +/// - if size < 0.25 * capacity: shrink (if this won't bring capacity lower +/// than the minimum) #[deriving(Clone)] struct DefaultResizePolicy { /// Doubled minimal capacity. The capacity must never drop below @@ -55,7 +64,12 @@ impl DefaultResizePolicy { #[inline] fn capacity_range(&self, new_size: uint) -> (uint, uint) { - ((new_size * 11) / 10, max(new_size << 3, self.minimum_capacity2)) + // Here, we are rephrasing the logic by specifying the ranges: + // + // - if `size * 1.1 < cap < size * 4`: don't resize + // - if `cap < minimum_capacity * 2`: don't shrink + // - otherwise, resize accordingly + ((new_size * 11) / 10, max(new_size << 2, self.minimum_capacity2)) } #[inline] @@ -65,9 +79,9 @@ impl DefaultResizePolicy { } // The main performance trick in this hashmap is called Robin Hood Hashing. 
-// It gains its excellent performance from one crucial operation: +// It gains its excellent performance from one essential operation: // -// If an insertion collides with an existing element, and that elements +// If an insertion collides with an existing element, and that element's // "probe distance" (how far away the element is from its ideal location) // is higher than how far we've already probed, swap the elements. // @@ -94,6 +108,15 @@ impl DefaultResizePolicy { // α^3, etc. Therefore, the odds of colliding k times is α^k. The odds of NOT // colliding after k tries is 1-α^k. // +// The paper from 1986 cited below mentions an implementation which keeps track +// of the distance-to-initial-bucket histogram. This approach is not suitable +// for modern architectures because it requires maintaining an internal data +// structure. This allows very good first guesses, but we are most concerned +// with guessing entire cache lines, not individual indexes. Furthermore, array +// accesses would no longer be linear and in one direction, as they are now. +// The memory and cache pressure this approach would entail would also be very +// difficult to assess properly in a microbenchmark. +// // Future Improvements (FIXME!) // ============================ // @@ -106,15 +129,6 @@ impl DefaultResizePolicy { // Future Optimizations (FIXME!) // ============================= // -// The paper cited below mentions an implementation which keeps track of the -// distance-to-initial-bucket histogram. I'm suspicious of this approach because -// it requires maintaining an internal map. If this map were replaced with a -// hashmap, it would be faster, but now our data structure is self-referential -// and blows up. Also, this allows very good first guesses, but array accesses -// are no longer linear and in one direction, as we have now. There is also -// memory and cache pressure that this map would entail that would be very -// difficult to properly see in a microbenchmark. -// // Another possible design choice that I made without any real reason is // parameterizing the raw table over keys and values. Technically, all we need // is the size and alignment of keys and values, and the code should be just as // This would definitely be an avenue worth exploring if people start complaining // about the size of rust executables. // -// There's also an "optimization" that has been omitted regarding how the -// hashtable allocates. The vector type has set the expectation that a hashtable -// which never has an element inserted should not allocate. I'm suspicious of -// implementing this for hashtables, because supporting it has no performance -// benefit over using an `Option>`, and is significantly more -// complicated. +// Annotate exceedingly likely branches in `table::make_hash` +// and `search_hashed_generic` to reduce instruction cache pressure +// and mispredictions once it becomes possible (blocked on issue #11092). +// +// Shrinking the table could simply reallocate in place after moving buckets +// to the first half. +// +// The growth algorithm (fragment of the Proof of Correctness) +// -------------------- +// +// The growth algorithm is basically a fast path of the naive reinsertion- +// during-resize algorithm. Other paths should never be taken. +// +// Consider growing a robin hood hashtable of capacity n.
Normally, we do this +// by allocating a new table of capacity `2n`, and then individually reinsert +// each element in the old table into the new one. This guarantees that the +// new table is a valid robin hood hashtable with all the desired statistical +// properties. Remark that the order we reinsert the elements in should not +// matter. For simplicity and efficiency, we will consider only linear +// reinsertions, which consist of reinserting all elements in the old table +// into the new one by increasing order of index. However, we will not be +// starting our reinsertions from index 0 in general. If we start from index +// i, for the purpose of reinsertion we will consider all elements with real +// index j < i to have virtual index n + j. +// +// Our hash generation scheme consists of generating a 64-bit hash and +// truncating the most significant bits. When moving to the new table, we +// simply introduce a new bit to the front of the hash. Therefore, if an +// element has ideal index i in the old table, it can have one of two ideal +// locations in the new table. If the new bit is 0, then the new ideal index +// is i. If the new bit is 1, then the new ideal index is n + i. Intuitively, +// we are producing two independent tables of size n, and for each element we +// independently choose which table to insert it into with equal probability. +// However, rather than wrapping around themselves on overflowing their +// indexes, the first table overflows into the second, and the second into +// the first. Visually, our new table will look something like: +// +// [yy_xxx_xxxx_xxx|xx_yyy_yyyy_yyy] +// +// Where x's are elements inserted into the first table, y's are elements +// inserted into the second, and _'s are empty sections. We now define a few +// key concepts that we will use later. Note that this is a very abstract +// perspective of the table. A real resized table would be at least half +// empty. +// +// Theorem: A linear robin hood reinsertion from the first ideal element +// produces identical results to a linear naive reinsertion from the same +// element. +// +// FIXME(Gankro, pczarn): review the proof and put it all in a separate doc.rs /// A hash map implementation which uses linear probing with Robin /// Hood bucket stealing. @@ -219,27 +277,31 @@ pub struct HashMap { // All hashes are keyed on these values, to prevent hash collision attacks. hasher: H, - table: table::RawTable, + table: RawTable, // We keep this at the end since it might as well have tail padding. resize_policy: DefaultResizePolicy, } /// Search for a pre-hashed key. -fn search_hashed_generic>>(table: M, hash: &table::SafeHash, is_match: |&K| -> bool) - -> Option> { +fn search_hashed_generic>>(table: M, + hash: &SafeHash, + is_match: |&K| -> bool) + -> SearchResult { let size = table.size(); let mut probe = Bucket::new(table, hash); let ib = probe.index(); while probe.index() != ib + size { let full = match probe.peek() { - table::Empty(_) => return None, // hit an empty bucket - table::Full(b) => b + Empty(b) => return TableRef(b.into_table()), // hit an empty bucket + Full(b) => b }; if full.distance() + ib < full.index() { - return None; + // We can finish the search early if we hit any bucket + // with a lower distance to initial bucket than we've probed. + return TableRef(full.into_table()); } // If the hash doesn't match, it can't be this one.. @@ -251,65 +313,149 @@ fn search_hashed_generic>>(table: M, hash: &table: // If the key doesn't match, it can't be this one..
if matched { - return Some(full); + return FoundExisting(full); } } probe = full.next(); } - None + TableRef(probe.into_table()) } -fn search_hashed>>(table: M, hash: &table::SafeHash, k: &K) - -> Option> { +fn search_hashed>>(table: M, hash: &SafeHash, k: &K) + -> SearchResult { search_hashed_generic(table, hash, |k_| *k == *k_) } fn pop_internal(starting_bucket: FullBucketMut) -> V { - let size = { - let table = starting_bucket.table(); - table.size() - }; let (empty, _k, retval) = starting_bucket.take(); let mut gap = match empty.gap_peek() { Some(b) => b, None => return retval }; - // COMPILER error! wrong enum optimization. sets ptr to 0 - for _ in range(0, size) { - if gap.full().distance() != 0 { - gap = match gap.shift() { - Some(b) => b, - None => return retval - }; - continue; - } - - break; + while gap.full().distance() != 0 { + gap = match gap.shift() { + Some(b) => b, + None => break + }; } - // Now we're done all our shifting. Return the value we grabbed - // earlier. + // Now we've done all our shifting. Return the value we grabbed earlier. return retval; } +/// Perform robin hood bucket stealing at the given `bucket`. You must +/// also pass the position of that bucket's initial bucket so we don't have +/// to recalculate it. +/// +/// `hash`, `k`, and `v` are the elements to "robin hood" into the hashtable. +fn robin_hood<'a, K: 'a, V: 'a>(mut bucket: FullBucketMut<'a, K, V>, + mut ib: uint, + mut hash: SafeHash, + mut k: K, + mut v: V) + -> &'a mut V { + let starting_index = bucket.index(); + let size = { + let table = bucket.table(); // FIXME "lifetime too short". + table.size() + }; + // There can be at most `size - dib` buckets to displace, because + // in the worst case, there are `size` elements and we already are + // `distance` buckets away from the initial one. + let idx_end = starting_index + size - bucket.distance(); + + loop { + let (old_hash, old_key, old_val) = bucket.replace(hash, k, v); + loop { + let probe = bucket.next(); + assert!(probe.index() != idx_end); + + let full_bucket = match probe.peek() { + table::Empty(bucket) => { + // Found a hole! + let b = bucket.put(old_hash, old_key, old_val); + // Now that it's stolen, just read the value's pointer + // right out of the table! + let (_, v) = Bucket::at_index(b.into_table(), starting_index).peek() + .expect_full() + .into_mut_refs(); + return v; + }, + table::Full(bucket) => bucket + }; + + let probe_ib = full_bucket.index() - full_bucket.distance(); + + bucket = full_bucket; + + // Robin hood! Steal the spot. + if ib < probe_ib { + ib = probe_ib; + hash = old_hash; + k = old_key; + v = old_val; + break; + } + } + } +} + +/// A result that works like Option> but preserves +/// the reference that grants us access to the table in any case. +enum SearchResult { + // This is an entry that holds the given key: + FoundExisting(FullBucket), + + // There was no such entry. The reference is given back: + TableRef(M) +} + +impl SearchResult { + fn into_option(self) -> Option> { + match self { + FoundExisting(bucket) => Some(bucket), + TableRef(_) => None + } + } +} + +/// A newtyped mutable reference to the hashmap that allows e.g. Deref to be +/// implemented without making changes to the visible interface of HashMap. +/// Used internally because it's accepted by the search functions above. 
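+/// (The search functions accept any `M` that derefs to a `RawTable`, so this +/// wrapper lets a mutable borrow of the whole map flow through a search and +/// come back out via `TableRef` for the insertion path.)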
+struct MapMutRef<'a, K: 'a, V: 'a, H: 'a> { + map_ref: &'a mut HashMap +} + +impl<'a, K, V, H> Deref> for MapMutRef<'a, K, V, H> { + fn deref(&self) -> &RawTable { + &self.map_ref.table + } +} + +impl<'a, K, V, H> DerefMut> for MapMutRef<'a, K, V, H> { + fn deref_mut(&mut self) -> &mut RawTable { + &mut self.map_ref.table + } +} + impl, V, S, H: Hasher> HashMap { - fn make_hash>(&self, x: &X) -> table::SafeHash { + fn make_hash>(&self, x: &X) -> SafeHash { table::make_hash(&self.hasher, x) } fn search_equiv<'a, Q: Hash + Equiv>(&'a self, q: &Q) -> Option> { let hash = self.make_hash(q); - search_hashed_generic(&self.table, &hash, |k| q.equiv(k)) + search_hashed_generic(&self.table, &hash, |k| q.equiv(k)).into_option() } fn search_equiv_mut<'a, Q: Hash + Equiv>(&'a mut self, q: &Q) -> Option> { let hash = self.make_hash(q); - search_hashed_generic(&mut self.table, &hash, |k| q.equiv(k)) + search_hashed_generic(&mut self.table, &hash, |k| q.equiv(k)).into_option() } /// Search for a key, yielding the index if it's found in the hashtable. @@ -317,25 +463,29 @@ impl, V, S, H: Hasher> HashMap { /// search_hashed. fn search<'a>(&'a self, k: &K) -> Option> { let hash = self.make_hash(k); - search_hashed(&self.table, &hash, k) + search_hashed(&self.table, &hash, k).into_option() } fn search_mut<'a>(&'a mut self, k: &K) -> Option> { let hash = self.make_hash(k); - search_hashed(&mut self.table, &hash, k) + search_hashed(&mut self.table, &hash, k).into_option() } - fn insert_hashed_ordered(&mut self, hash: table::SafeHash, k: K, v: V) { + // The caller should ensure that the invariants required by Robin Hood + // hashing hold. + fn insert_hashed_ordered(&mut self, hash: SafeHash, k: K, v: V) { let cap = self.table.capacity(); let mut buckets = Bucket::new(&mut self.table, &hash); let ib = buckets.index(); + while buckets.index() != ib + cap { + // We don't need to compare hashes for value swap. + // Not even DIBs for Robin Hood.
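+ // During growth, elements are reinserted in increasing order of ideal + // index, so the first empty bucket found from the ideal one onwards is + // this element's final spot; nothing ever needs to be displaced.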
buckets = match buckets.peek() { - table::Empty(empty) => { + Empty(empty) => { empty.put(hash, k, v); return; } - table::Full(b) => b.into_bucket() + Full(b) => b.into_bucket() }; buckets.next(); } @@ -361,8 +511,8 @@ impl, V, S, H: Hasher> Mutable for HashMap { while buckets.index() != cap { buckets = match buckets.peek() { - table::Empty(b) => b.next(), - table::Full(full) => { + Empty(b) => b.next(), + Full(full) => { let (b, _, _) = full.take(); b.next() } @@ -401,7 +551,7 @@ impl, V, S, H: Hasher> MutableMap for HashMap self.make_some_room(potential_new_size); let mut retval = None; - self.insert_or_replace_with(hash, k, v, |val_ref, val| { + self.insert_or_replace_with(hash, k, v, |_, val_ref, val| { retval = Some(replace(val_ref, val)); }); retval @@ -472,7 +622,7 @@ impl, V, S, H: Hasher> HashMap { HashMap { hasher: hasher, resize_policy: DefaultResizePolicy::new(INITIAL_CAPACITY), - table: table::RawTable::new(0), + table: RawTable::new(0), } } @@ -500,7 +650,7 @@ impl, V, S, H: Hasher> HashMap { HashMap { hasher: hasher, resize_policy: DefaultResizePolicy::new(cap), - table: table::RawTable::new(cap), + table: RawTable::new(cap), } } @@ -537,49 +687,78 @@ impl, V, S, H: Hasher> HashMap { assert!(self.table.size() <= new_capacity); assert!(num::is_power_of_two(new_capacity)); - let mut old_table = replace(&mut self.table, table::RawTable::new(new_capacity)); + let mut old_table = replace(&mut self.table, RawTable::new(new_capacity)); let old_size = old_table.size(); - if old_table.capacity() == 0 { + if old_table.capacity() == 0 || old_table.size() == 0 { return; } if new_capacity < old_table.capacity() { + // Shrink the table. Naive algorithm for resizing: for (h, k, v) in old_table.move_iter() { self.insert_hashed_nocheck(h, k, v); } } else { + // Grow the table. + // Specialization of the other branch. let mut bucket = Bucket::first(&mut old_table); + // "So a few of the first shall be last: for many be called, + // but few chosen." + // + // We'll most likely encounter a few buckets at the beginning that + // have their initial buckets near the end of the table. They were + // placed at the beginning as the probe wrapped around the table + // during insertion. We must skip forward to a bucket that won't + // get reinserted too early and won't unfairly steal others' spots. + // This eliminates the need for robin hood. loop { - match bucket.peek() { - table::Full(full) => { + bucket = match bucket.peek() { + Full(full) => { if full.distance() == 0 { + // This bucket occupies its ideal spot. + // It indicates the start of another "cluster". bucket = full.into_bucket(); break; } - bucket = full.next(); + // Leaving this bucket in the last cluster for later. + full.into_bucket() } - table::Empty(b) => { - bucket = b.next(); - break; + Empty(b) => { + // Encountered a hole between clusters.
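+ // The first full bucket after a hole is always at its ideal spot + // (anything displaced would have been moved into the hole), so the + // `Full` arm above will end the skipping on the next pass.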
+ b.into_bucket() } }; + bucket.next(); } + // This is how the buckets might be laid out in memory: + // ($ marks an initialized bucket) + // ________________ + // |$$$_$$$$$$_$$$$$| + // + // But we've skipped the entire initial cluster of buckets + // and will continue iteration in this order: + // ________________ + // |$$$$$$_$$$$$ + // ^ wrap around once end is reached + // ________________ + // $$$_____________| + // ^ exit once table.size == 0 loop { bucket = match bucket.peek() { - table::Full(bucket) => { - { - let t = bucket.table(); - if t.size() == 0 { break } - } + Full(bucket) => { let h = bucket.hash(); let (b, k, v) = bucket.take(); self.insert_hashed_ordered(h, k, v); + { + let t = b.table(); // FIXME "lifetime too short". + if t.size() == 0 { break } + }; b.into_bucket() } - table::Empty(b) => b.into_bucket() + Empty(b) => b.into_bucket() }; bucket.next(); } @@ -612,41 +791,43 @@ impl, V, S, H: Hasher> HashMap { /// /// If the key already exists, the hashtable will be returned untouched /// and a reference to the existing element will be returned. - fn insert_hashed_nocheck<'a>( - &'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V { - self.insert_or_replace_with(hash, k, v, |_, _| ()) + fn insert_hashed_nocheck(&mut self, hash: SafeHash, k: K, v: V) -> &mut V { + self.insert_or_replace_with(hash, k, v, |_, _, _| ()) } - fn insert_or_replace_with<'a>( - &'a mut self, hash: table::SafeHash, k: K, v: V, - found_existing: |&mut V, V| - ) -> &'a mut V { - + fn insert_or_replace_with<'a>(&'a mut self, + hash: SafeHash, + k: K, + v: V, + found_existing: |&mut K, &mut V, V|) + -> &'a mut V { // Worst case, we'll find one empty bucket among `size + 1` buckets. let size = self.table.size(); - let mut rbucket = Bucket::new(&mut self.table, &hash); - let ib = rbucket.index(); + let mut probe = Bucket::new(&mut self.table, &hash); + let ib = probe.index(); loop { - let mut bucket = match rbucket.peek() { - table::Empty(bucket) => { + let mut bucket = match probe.peek() { + Empty(bucket) => { // Found a hole! let bucket = bucket.put(hash, k, v); let (_, val) = bucket.into_mut_refs(); return val; }, - table::Full(bucket) => bucket + Full(bucket) => bucket }; if bucket.hash() == hash { - let (bucket_k, bucket_v) = bucket.read_mut(); - // FIXME #12147 the conditional return confuses - // borrowck if we return bucket_v directly - let bv: *mut V = bucket_v; - if k == *bucket_k { + let found_match = { + let (bucket_k, _) = bucket.read_mut(); + k == *bucket_k + }; + if found_match { + let (bucket_k, bucket_v) = bucket.into_mut_refs(); + debug_assert!(k == *bucket_k); // Key already exists. Get its reference. - found_existing(bucket_v, v); - return unsafe {&mut *bv}; + found_existing(bucket_k, bucket_v, v); + return bucket_v; } } @@ -654,53 +835,18 @@ impl, V, S, H: Hasher> HashMap { if (ib as int) < robin_ib { // Found a luckier bucket than me. Better steal his spot. - let (mut hash, mut k, mut v) = bucket.replace(hash, k, v); - let robin_index = bucket.index(); - let mut robin_ib = robin_ib as uint; - let mut rbucket = bucket.next(); - loop { - let mut bucket = match rbucket.peek() { - table::Empty(bucket) => { - // Found a hole! - let b = bucket.put(hash, k, v); - // Now that it's stolen, just read the value's pointer - // right out of the table! 
- let (_, v) = match Bucket::at_index(b.into_table(), robin_index).peek() { - table::Full(b) => b.into_mut_refs(), - _ => fail!() - }; - return v; - }, - table::Full(bucket) => bucket - }; - - let probe_ib = bucket.index() - bucket.distance(); - - // Robin hood! Steal the spot. - if robin_ib < probe_ib { - robin_ib = probe_ib; - let (old_hash, old_key, old_val) = bucket.replace(hash, k, v); - hash = old_hash; - k = old_key; - v = old_val; - } - rbucket = bucket.next(); - if rbucket.index() == ib + size + 1 { - fail!("HashMap fatal error: 100% load factor?") - } - } - } - rbucket = bucket.next(); - if rbucket.index() == ib + size + 1 { - fail!("Internal HashMap error: Out of space.") + return robin_hood(bucket, robin_ib as uint, hash, k, v); } + + probe = bucket.next(); + assert!(probe.index() != ib + size + 1); } } /// Inserts an element which has already been hashed, returning a reference /// to that element inside the hashtable. This is more efficient than using /// `insert`, since the key will not be rehashed. - fn insert_hashed<'a>(&'a mut self, hash: table::SafeHash, k: K, v: V) -> &'a mut V { + fn insert_hashed(&mut self, hash: SafeHash, k: K, v: V) -> &mut V { let potential_new_size = self.table.size() + 1; self.make_some_room(potential_new_size); self.insert_hashed_nocheck(hash, k, v) } @@ -721,7 +867,7 @@ impl, V, S, H: Hasher> HashMap { /// // Find the existing key /// assert_eq!(*map.find_or_insert("a", -2), 1); /// ``` - pub fn find_or_insert<'a>(&'a mut self, k: K, v: V) -> &'a mut V { + pub fn find_or_insert(&mut self, k: K, v: V) -> &mut V { self.find_with_or_insert_with(k, v, |_k, _v, _a| (), |_k, a| a) } @@ -768,7 +914,11 @@ impl, V, S, H: Hasher> HashMap { v: V, f: |&K, &mut V|) -> &'a mut V { - self.find_with_or_insert_with(k, v, |k, v, _a| f(k, v), |_k, a| a) + let potential_new_size = self.table.size() + 1; + self.make_some_room(potential_new_size); + + let hash = self.make_hash(&k); + self.insert_or_replace_with(hash, k, v, |kref, vref, _v| f(kref, vref)) } /// Modify and return the value corresponding to the key in the map, or @@ -820,21 +970,22 @@ impl, V, S, H: Hasher> HashMap { a: A, found: |&K, &mut V, A|, not_found: |&K, A| -> V) - -> &'a mut V { + -> &'a mut V + { let hash = self.make_hash(&k); - { - match search_hashed(&mut self.table, &hash, &k) { - Some(bucket) => { - let (_, v_ref) = bucket.into_mut_refs(); - found(&k, v_ref, a); - return v_ref; - } - _ => { - } - }; + let this = MapMutRef { map_ref: self }; + + match search_hashed(this, &hash, &k) { + FoundExisting(bucket) => { + let (_, v_ref) = bucket.into_mut_refs(); + found(&k, v_ref, a); + v_ref + } + TableRef(this) => { + let v = not_found(&k, a); + this.map_ref.insert_hashed(hash, k, v) + } } - let v = not_found(&k, a); - self.insert_hashed(hash, k, v) } /// Retrieves a value for the given key.
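A short usage sketch of the entry points refactored above, using only methods shown in this patch (the word-counting example itself is illustrative):

    use std::collections::HashMap;

    fn main() {
        let mut counts: HashMap<&str, uint> = HashMap::new();
        for word in ["a", "b", "a"].iter() {
            // First sighting inserts 1; later sightings bump the count.
            counts.insert_or_update_with(*word, 1u, |_key, count| *count += 1);
        }
        // find_or_insert returns the existing value untouched...
        assert_eq!(*counts.find_or_insert("a", 0u), 2);
        // ...or inserts the given default for a missing key.
        assert_eq!(*counts.find_or_insert("c", 7u), 7);
    }

Note that insert_or_update_with now reserves room up front and calls insert_or_replace_with directly, instead of round-tripping through find_with_or_insert_with as before.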
@@ -996,7 +1147,7 @@ impl, V, S, H: Hasher> HashMap { /// println!("{}", key); /// } /// ``` - pub fn keys<'a>(&'a self) -> Keys<'a, K, V> { + pub fn keys(&self) -> Keys { self.iter().map(|(k, _v)| k) } @@ -1017,7 +1168,7 @@ impl, V, S, H: Hasher> HashMap { /// println!("{}", key); /// } /// ``` - pub fn values<'a>(&'a self) -> Values<'a, K, V> { + pub fn values(&self) -> Values { self.iter().map(|(_k, v)| v) } @@ -1038,8 +1189,8 @@ impl, V, S, H: Hasher> HashMap { /// println!("key: {} val: {}", key, val); /// } /// ``` - pub fn iter<'a>(&'a self) -> Entries<'a, K, V> { - self.table.iter() + pub fn iter(&self) -> Entries { + Entries { inner: self.table.iter() } } /// An iterator visiting all key-value pairs in arbitrary order, @@ -1065,8 +1216,8 @@ impl, V, S, H: Hasher> HashMap { /// println!("key: {} val: {}", key, val); /// } /// ``` - pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> { - self.table.mut_iter() + pub fn mut_iter(&mut self) -> MutEntries { + MutEntries { inner: self.table.mut_iter() } } /// Creates a consuming iterator, that is, one that moves each key-value @@ -1087,7 +1238,9 @@ impl, V, S, H: Hasher> HashMap { /// let vec: Vec<(&str, int)> = map.move_iter().collect(); /// ``` pub fn move_iter(self) -> MoveEntries { - self.table.move_iter().map(|(_, k, v)| (k, v)) + MoveEntries { + inner: self.table.move_iter().map(|(_, k, v)| (k, v)) + } } } @@ -1131,13 +1284,9 @@ impl, V: PartialEq, S, H: Hasher> PartialEq for HashMap) -> bool { if self.len() != other.len() { return false; } - self.iter() - .all(|(key, value)| { - match other.find(key) { - None => false, - Some(v) => *value == *v - } - }) + self.iter().all(|(key, value)| + other.find(key).map_or(false, |v| *value == *v) + ) } } @@ -1178,14 +1327,52 @@ impl, V, S, H: Hasher> Index for HashMap { }*/ /// HashMap iterator -pub type Entries<'a, K, V> = table::Entries<'a, K, V>; +pub struct Entries<'a, K: 'a, V: 'a> { + inner: table::Entries<'a, K, V> +} /// HashMap mutable values iterator -pub type MutEntries<'a, K, V> = table::MutEntries<'a, K, V>; +pub struct MutEntries<'a, K: 'a, V: 'a> { + inner: table::MutEntries<'a, K, V> +} /// HashMap move iterator -pub type MoveEntries = - iter::Map<'static, (table::SafeHash, K, V), (K, V), table::MoveEntries>; +pub struct MoveEntries { + inner: iter::Map<'static, (SafeHash, K, V), (K, V), table::MoveEntries> +} + +impl<'a, K, V> Iterator<(&'a K, &'a V)> for Entries<'a, K, V> { + #[inline] + fn next(&mut self) -> Option<(&'a K, &'a V)> { + self.inner.next() + } + #[inline] + fn size_hint(&self) -> (uint, Option) { + self.inner.size_hint() + } +} + +impl<'a, K, V> Iterator<(&'a K, &'a mut V)> for MutEntries<'a, K, V> { + #[inline] + fn next(&mut self) -> Option<(&'a K, &'a mut V)> { + self.inner.next() + } + #[inline] + fn size_hint(&self) -> (uint, Option) { + self.inner.size_hint() + } +} + +impl Iterator<(K, V)> for MoveEntries { + #[inline] + fn next(&mut self) -> Option<(K, V)> { + self.inner.next() + } + #[inline] + fn size_hint(&self) -> (uint, Option) { + self.inner.size_hint() + } +} /// HashMap keys iterator pub type Keys<'a, K, V> = @@ -1266,7 +1453,6 @@ mod test_map { k: uint } - impl Dropable { fn new(k: uint) -> Dropable { let v = drop_vector.get().unwrap(); @@ -1371,6 +1557,7 @@ mod test_map { hm }; + // By the way, ensure that cloning doesn't screw up the dropping. 
drop(hm.clone()); { @@ -1505,6 +1692,28 @@ mod test_map { assert_eq!(*m.find(&1).unwrap(), 2); } + #[test] + fn test_update_with() { + let mut m = HashMap::with_capacity(4); + assert!(m.insert(1i, 2i)); + + for i in range(1i, 1000) { + assert_eq!( + i + 2, + *m.insert_or_update_with(i + 1, i + 2, |_k, _v| { + fail!("Key not yet present"); + }) + ); + assert_eq!( + i + 1, + *m.insert_or_update_with(i, i + 3, |k, v| { + assert_eq!(*k, i); + assert_eq!(*v, i + 1); + }) + ); + } + } + #[test] fn test_conflict_remove() { let mut m = HashMap::with_capacity(4); @@ -1698,6 +1907,7 @@ mod test_map { m.insert(i, i); i += 1; } + // three quarters full assert_eq!(m.len(), i); assert_eq!(m.table.capacity(), cap); @@ -1706,16 +1916,18 @@ mod test_map { m.insert(i, i); i += 1; } + // half full let new_cap = m.table.capacity(); assert_eq!(new_cap, cap * 2); - for _ in range(0, cap / 2) { + for _ in range(0, cap / 2 - 1) { i -= 1; m.remove(&i); assert_eq!(m.table.capacity(), new_cap); } - + // A little more than one quarter full. + // Shrinking starts as we remove more elements: for _ in range(0, cap / 2 - 1) { i -= 1; m.remove(&i); diff --git a/src/libstd/collections/hashmap/mod.rs b/src/libstd/collections/hashmap/mod.rs index f493e844526..b5612ce0f07 100644 --- a/src/libstd/collections/hashmap/mod.rs +++ b/src/libstd/collections/hashmap/mod.rs @@ -12,6 +12,7 @@ pub use self::map::HashMap; pub use self::map::Entries; +pub use self::map::MutEntries; pub use self::map::MoveEntries; pub use self::map::Keys; pub use self::map::Values; diff --git a/src/libstd/collections/hashmap/set.rs b/src/libstd/collections/hashmap/set.rs index a1f71e33303..4a2a04cbc9f 100644 --- a/src/libstd/collections/hashmap/set.rs +++ b/src/libstd/collections/hashmap/set.rs @@ -16,8 +16,7 @@ use collections::{Collection, Mutable, Set, MutableSet, Map, MutableMap}; use default::Default; use fmt::Show; use fmt; -use RandomSipHasher; -use hash::{Hash, Hasher}; +use hash::{Hash, Hasher, RandomSipHasher}; use iter::{Iterator, FromIterator, FilterMap, Chain, Repeat, Zip, Extendable}; use iter; use option::{Some, None}; @@ -25,13 +24,13 @@ use result::{Ok, Err}; use super::{HashMap, Entries, MoveEntries, INITIAL_CAPACITY}; -/// HashSet iterator -pub type SetItems<'a, K> = - iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>; -/// HashSet move iterator -pub type SetMoveItems = - iter::Map<'static, (K, ()), K, MoveEntries>; +// Future Optimization (FIXME!) +// ============================= +// +// Iteration over zero sized values is a noop. There is no need +// for `bucket.val` in the case of HashSet. I suppose we would need HKT +// to get rid of it properly. /// An implementation of a hash set using the underlying representation of a /// HashMap where the value is (). 
As with the `HashMap` type, a `HashSet` @@ -444,6 +443,14 @@ impl, S, H: Hasher + Default> Default for HashSet { } } +/// HashSet iterator +pub type SetItems<'a, K> = + iter::Map<'static, (&'a K, &'a ()), &'a K, Entries<'a, K, ()>>; + +/// HashSet move iterator +pub type SetMoveItems = + iter::Map<'static, (K, ()), K, MoveEntries>; + // `Repeat` is used to feed the filter closure an explicit capture // of a reference to the other set /// Set operations iterator diff --git a/src/libstd/collections/hashmap/table.rs b/src/libstd/collections/hashmap/table.rs index 96d1a9ba2fb..54469baaef5 100644 --- a/src/libstd/collections/hashmap/table.rs +++ b/src/libstd/collections/hashmap/table.rs @@ -14,14 +14,13 @@ use clone::Clone; use cmp; use hash::{Hash, Hasher}; use iter::{Iterator, count}; +use kinds::marker; use mem::{min_align_of, size_of}; use mem; -use num::{CheckedMul, is_power_of_two}; +use num::{CheckedAdd, CheckedMul, is_power_of_two}; use ops::{Deref, DerefMut, Drop}; use option::{Some, None, Option}; -use ptr::RawPtr; -use ptr::set_memory; -use ptr::write; +use ptr::{RawPtr, copy_nonoverlapping_memory, zero_memory}; use ptr; use rt::heap::{allocate, deallocate}; @@ -34,17 +33,17 @@ static EMPTY_BUCKET: u64 = 0u64; /// `Vec>`, because we don't pay for the overhead of an /// option on every element, and we get a generally more cache-aware design. /// -/// Key invariants of this structure: +/// Essential invariants of this structure: /// -/// - if hashes[i] == EMPTY_BUCKET, then keys[i] and vals[i] have -/// 'undefined' contents. Don't read from them. This invariant is -/// enforced outside this module with the `EmptyIndex`, `FullIndex`, +/// - if t.hashes[i] == EMPTY_BUCKET, then `Bucket::at_index(&t, i).raw` +/// points to 'undefined' contents. Don't read from it. This invariant is +/// enforced outside this module with the `EmptyBucket`, `FullBucket`, /// and `SafeHash` types. /// -/// - An `EmptyIndex` is only constructed for a bucket at an index with +/// - An `EmptyBucket` is only constructed at an index with /// a hash of EMPTY_BUCKET. /// -/// - A `FullIndex` is only constructed for a bucket at an index with a +/// - A `FullBucket` is only constructed at an index with a /// non-EMPTY_BUCKET hash. /// /// - A `SafeHash` is only constructed for non-`EMPTY_BUCKET` hash. We get @@ -56,48 +55,21 @@ static EMPTY_BUCKET: u64 = 0u64; /// `capacity`. This is set at creation and never changes. The arrays /// are unzipped to save space (we don't have to pay for the padding /// between odd sized elements, such as in a map from u64 to u8), and -/// be more cache aware (scanning through 8 hashes brings in 2 cache -/// lines, since they're all right beside each other). +/// be more cache aware (scanning through 8 hashes brings in at most +/// 2 cache lines, since they're all right beside each other). /// /// You can kind of think of this module/data structure as a safe wrapper /// around just the "table" part of the hashtable. It enforces some /// invariants at the type level and employs some performance trickery, /// but in general is just a tricked out `Vec>`. -/// -/// FIXME(cgaebel): -/// -/// Feb 11, 2014: This hashtable was just implemented, and, hard as I tried, -/// isn't yet totally safe. There's a "known exploit" that you can create -/// multiple FullIndexes for a bucket, `take` one, and then still `take` -/// the other causing undefined behavior. Currently, there's no story -/// for how to protect against this statically. 
Therefore, there are asserts -/// on `take`, `get`, `get_mut`, and `put` which check the bucket state. -/// With time, and when we're confident this works correctly, they should -/// be removed. Also, the bounds check in `peek` is especially painful, -/// as that's called in the innermost loops of the hashtable and has the -/// potential to be a major performance drain. Remove this too. -/// -/// Or, better than remove, only enable these checks for debug builds. -/// There's currently no "debug-only" asserts in rust, so if you're reading -/// this and going "what? of course there are debug-only asserts!", then -/// please make this use them! #[unsafe_no_drop_flag] pub struct RawTable { capacity: uint, size: uint, - hashes: *mut u64 -} - -/// A bucket that holds a reference to the table -pub trait BucketWithTable { - /// A bucket that holds a reference to the table - fn table<'a>(&'a self) -> &'a M; - - /// Move out the reference to the table. - fn into_table(self) -> M; - - /// Get the raw index. - fn index(&self) -> uint; + hashes: *mut u64, + // Because K/V do not appear directly in any of the types in the struct, + // inform rustc that in fact instances of K and V are reachable from here. + marker: marker::CovariantType<(K,V)>, } struct RawBucket { @@ -124,47 +96,66 @@ pub struct FullBucket { table: M } -pub type EmptyBucketImm<'table,K,V> = EmptyBucket>; -pub type FullBucketImm<'table,K,V> = FullBucket>; +pub type EmptyBucketImm<'table, K, V> = EmptyBucket>; +pub type FullBucketImm<'table, K, V> = FullBucket>; -pub type EmptyBucketMut<'table,K,V> = EmptyBucket>; -pub type FullBucketMut<'table,K,V> = FullBucket>; +pub type EmptyBucketMut<'table, K, V> = EmptyBucket>; +pub type FullBucketMut<'table, K, V> = FullBucket>; +pub enum BucketState { + Empty(EmptyBucket), + Full(FullBucket), +} + +// A GapThenFull encapsulates the state of two consecutive buckets at once. +// The first bucket, called the gap, is known to be empty. +// The second bucket is full. struct GapThenFull { gap: EmptyBucket, - full: FullBucket + full: FullBucket, } -impl>> GapThenFull { - pub fn full<'a>(&'a self) -> &'a FullBucket { - &self.full - } +/// A hash that is not zero, since we use a hash of zero to represent empty +/// buckets. +#[deriving(PartialEq)] +pub struct SafeHash { + hash: u64, +} - pub fn shift(mut self) -> Option> { - unsafe { - *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET); - mem::overwrite(self.gap.raw.key, ptr::read(self.full.raw.key as *const K)); - mem::overwrite(self.gap.raw.val, ptr::read(self.full.raw.val as *const V)); - } +impl SafeHash { + /// Peek at the hash value, which is guaranteed to be non-zero. + #[inline(always)] + pub fn inspect(&self) -> u64 { self.hash } +} - let FullBucket { raw, idx, .. } = self.full; - - match self.full.next().peek() { - Empty(_) => None, - Full(bucket) => { - self.gap.raw = raw; - self.gap.idx = idx; - - self.full = bucket; - self.full.idx &= self.full.table.capacity - 1; - - Some(self) - } - } +/// We need to remove hashes of 0. That's reserved for empty buckets. +/// This function wraps up `hash_keyed` to be the only way outside this +/// module to generate a SafeHash. +pub fn make_hash, S, H: Hasher>(hasher: &H, t: &T) -> SafeHash { + match hasher.hash(t) { + // This constant is exceedingly likely to hash to the same + // bucket, but it won't be counted as empty! Just so we can maintain + // our precious uniform distribution of initial indexes. 
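+ // (Only the topmost bit of the substitute constant is set, and the + // initial probe index is taken as `hash & (capacity - 1)`, so a + // remapped key still starts probing at the very bucket a zero hash + // would have used.)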
+ EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 }, + h => SafeHash { hash: h }, } } -impl RawPtr for RawBucket { +// `replace` casts a `*u64` to a `*SafeHash`. Since we statically +// ensure that a `FullBucket` points to an index with a non-zero hash, +// and a `SafeHash` is just a `u64` with a different name, this is +// safe. +// +// This test ensures that a `SafeHash` really IS the same size as a +// `u64`. If you need to change the size of `SafeHash` (and +// consequently made this test fail), `replace` needs to be +// modified to no longer assume this. +#[test] +fn can_alias_safehash_as_u64() { + assert_eq!(size_of::(), size_of::()) +} + +impl RawBucket { unsafe fn offset(self, count: int) -> RawBucket { RawBucket { hash: self.hash.offset(count), @@ -172,35 +163,143 @@ impl RawPtr for RawBucket { val: self.val.offset(count), } } +} - fn null() -> RawBucket { - RawBucket { - hash: RawPtr::null(), - key: RawPtr::null(), - val: RawPtr::null() - } - } - - fn is_null(&self) -> bool { - self.hash.is_null() - } - - fn to_uint(&self) -> uint { - self.hash.to_uint() - } - - unsafe fn to_option(&self) -> Option<&u64> { - self.hash.to_option() +// For parameterizing over mutability. +impl<'t, K, V> Deref> for &'t RawTable { + fn deref(&self) -> &RawTable { + &**self } } -impl>> EmptyBucket { +impl<'t, K, V> Deref> for &'t mut RawTable { + fn deref(&self) -> &RawTable { + &**self + } +} + +impl<'t, K, V> DerefMut> for &'t mut RawTable { + fn deref_mut(&mut self) -> &mut RawTable { + &mut **self + } +} + +// Buckets hold references to the table. +impl FullBucket { + /// Borrow a reference to the table. + pub fn table(&self) -> &M { + &self.table + } + /// Move out the reference to the table. + pub fn into_table(self) -> M { + self.table + } + /// Get the raw index. + pub fn index(&self) -> uint { + self.idx + } +} + +impl EmptyBucket { + /// Borrow a reference to the table. + pub fn table(&self) -> &M { + &self.table + } + /// Move out the reference to the table. + pub fn into_table(self) -> M { + self.table + } +} + +impl Bucket { + /// Move out the reference to the table. + pub fn into_table(self) -> M { + self.table + } + /// Get the raw index. + pub fn index(&self) -> uint { + self.idx + } +} + +impl>> Bucket { + pub fn new(table: M, hash: &SafeHash) -> Bucket { + Bucket::at_index(table, hash.inspect() as uint) + } + + pub fn at_index(table: M, ib_index: uint) -> Bucket { + let ib_index = ib_index & (table.capacity() - 1); + Bucket { + raw: unsafe { + table.first_bucket_raw().offset(ib_index as int) + }, + idx: ib_index, + table: table + } + } + + pub fn first(table: M) -> Bucket { + Bucket { + raw: table.first_bucket_raw(), + idx: 0, + table: table + } + } + + /// Reads a bucket at a given index, returning an enum indicating whether + /// it's initialized or not. You need to match on this enum to get + /// the appropriate types to call most of the other functions in + /// this module. + pub fn peek(self) -> BucketState { + match unsafe { *self.raw.hash } { + EMPTY_BUCKET => + Empty(EmptyBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }), + _ => + Full(FullBucket { + raw: self.raw, + idx: self.idx, + table: self.table + }) + } + } + + /// Modifies the bucket pointer in place to make it point to the next slot. + pub fn next(&mut self) { + // Branchless bucket iteration step. + // As we reach the end of the table... 
+ // We take the current idx: 0111111b + // Xor it by its increment: ^ 1000000b + // ------------ + // 1111111b + // Then AND with the capacity: & 1000000b + // ------------ + // to get the backwards offset: 1000000b + // ... and it's zero at all other times. + let maybe_wraparound_dist = (self.idx ^ (self.idx + 1)) & self.table.capacity(); + // Finally, we obtain the offset 1 or the offset -cap + 1. + let dist = 1i - (maybe_wraparound_dist as int); + + self.idx += 1; + + unsafe { + self.raw = self.raw.offset(dist); + } + } +} + +impl>> EmptyBucket { + #[inline] pub fn next(self) -> Bucket { let mut bucket = self.into_bucket(); bucket.next(); bucket } + #[inline] pub fn into_bucket(self) -> Bucket { Bucket { raw: self.raw, @@ -217,24 +316,31 @@ impl>> EmptyBucket { }; match self.next().peek() { - Empty(_) => None, Full(bucket) => { Some(GapThenFull { gap: gap, full: bucket }) } + Empty(..) => None } } } -impl>> EmptyBucket { +impl>> EmptyBucket { + /// Puts given key and value pair, along with the key's hash, + /// into this bucket in the hashtable. Note how `self` is 'moved' into + /// this function, because this slot will no longer be empty when + /// we return! A `FullBucket` is returned for later use, pointing to + /// the newly-filled slot in the hashtable. + /// + /// Use `make_hash` to construct a `SafeHash` to pass to this function. pub fn put(mut self, hash: SafeHash, key: K, value: V) -> FullBucket { unsafe { *self.raw.hash = hash.inspect(); - write(self.raw.key, key); - write(self.raw.val, value); + ptr::write(self.raw.key, key); + ptr::write(self.raw.val, value); } self.table.size += 1; @@ -243,13 +349,15 @@ impl>> EmptyBucket { } } -impl>> FullBucket { +impl>> FullBucket { + #[inline] pub fn next(self) -> Bucket { let mut bucket = self.into_bucket(); bucket.next(); bucket } + #[inline] pub fn into_bucket(self) -> Bucket { Bucket { raw: self.raw, @@ -258,10 +366,19 @@ impl>> FullBucket { } } + /// Get the distance between this bucket and the 'ideal' location + /// as determined by the key's hash stored in it. + /// + /// In the cited blog posts above, this is called the "distance to + /// initial bucket", or DIB. Also known as "probe count". pub fn distance(&self) -> uint { + // Calculates the distance one has to travel when going from + // `hash mod capacity` onwards to `idx mod capacity`, wrapping around + // if the destination is not reached before the end of the table. (self.idx - self.hash().inspect() as uint) & (self.table.capacity() - 1) } + #[inline] pub fn hash(&self) -> SafeHash { unsafe { SafeHash { @@ -270,23 +387,20 @@ impl>> FullBucket { } } - pub fn read<'a>(&'a self) -> (&'a K, &'a V) { + /// Gets references to the key and value at a given index. + pub fn read(&self) -> (&K, &V) { unsafe { (&*self.raw.key, &*self.raw.val) } } - - pub fn into_refs(self) -> (&K, &V) { - unsafe { - // debug_assert!(*self.raw.hash != EMPTY_BUCKET); - (&*self.raw.key, - &*self.raw.val) - } - } } -impl>> FullBucket { +impl>> FullBucket { + /// Removes this bucket's key and value from the hashtable. + /// + /// This works similarly to `put`, building an `EmptyBucket` out of the + /// taken bucket. pub fn take(mut self) -> (EmptyBucket, K, V) { let key = self.raw.key as *const K; let val = self.raw.val as *const V; @@ -317,176 +431,86 @@ impl>> FullBucket { } } - pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) { + /// Gets mutable references to the key and value at a given index. 
-    pub fn read_mut<'a>(&'a self) -> (&'a mut K, &'a mut V) {
+    /// Gets mutable references to the key and value at a given index.
+    pub fn read_mut(&mut self) -> (&mut K, &mut V) {
         unsafe {
-            // debug_assert!(*self.raw.hash != EMPTY_BUCKET);
-            (&mut *self.raw.key,
-             &mut *self.raw.val)
-        }
-    }
-
-    pub fn into_mut_refs(self) -> (&mut K, &mut V) {
-        unsafe {
-            // debug_assert!(*self.raw.hash != EMPTY_BUCKET);
             (&mut *self.raw.key,
              &mut *self.raw.val)
         }
     }
 }
 
-impl<K, V, M: Deref<RawTable<K, V>>> Bucket<K, V, M> {
-    pub fn new(table: M, hash: &SafeHash) -> Bucket<K, V, M> {
-        let ib_index = (hash.inspect() as uint) & (table.capacity() - 1);
-        Bucket {
-            raw: unsafe {
-                table.as_mut_ptrs().offset(ib_index as int)
-            },
-            idx: ib_index,
-            table: table
-        }
-    }
-
-    pub fn at_index(table: M, ib_index: uint) -> Bucket<K, V, M> {
-        let ib_index = ib_index & (table.capacity() - 1);
-        Bucket {
-            raw: unsafe {
-                table.as_mut_ptrs().offset(ib_index as int)
-            },
-            idx: ib_index,
-            table: table
-        }
-    }
-
-    pub fn first(table: M) -> Bucket<K, V, M> {
-        Bucket {
-            raw: table.as_mut_ptrs(),
-            idx: 0,
-            table: table
-        }
-    }
-
-    pub fn peek(self) -> BucketState<K, V, M> {
-        match unsafe { *self.raw.hash } {
-            EMPTY_BUCKET =>
-                Empty(EmptyBucket {
-                    raw: self.raw,
-                    idx: self.idx,
-                    table: self.table
-                }),
-            _ =>
-                Full(FullBucket {
-                    raw: self.raw,
-                    idx: self.idx,
-                    table: self.table
-                })
-        }
-    }
-
-    pub fn next(&mut self) {
-        self.idx += 1;
-
-        let dist = if self.idx == self.table.capacity() {
-            -(self.table.capacity() as int - 1)
-        } else {
-            1i
-        };
-
+impl<'t, K, V, M: Deref<RawTable<K, V>> + 't> FullBucket<K, V, M> {
+    /// Exchange a bucket state for immutable references into the table.
+    /// Because the underlying reference to the table is also consumed,
+    /// no further changes to the structure of the table are possible;
+    /// in exchange for this, the returned references have a longer lifetime
+    /// than the references returned by `read()`.
+    pub fn into_refs(self) -> (&'t K, &'t V) {
         unsafe {
-            self.raw = self.raw.offset(dist);
+            (&*self.raw.key,
+             &*self.raw.val)
         }
     }
 }
 
-impl<K, V, M> BucketWithTable<M> for FullBucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
-    }
-
-    fn into_table(self) -> M {
-        self.table
-    }
-
-    fn index(&self) -> uint {
-        self.idx
+impl<'t, K, V, M: DerefMut<RawTable<K, V>> + 't> FullBucket<K, V, M> {
+    /// This works similarly to `into_refs`, exchanging a bucket state
+    /// for mutable references into the table.
+    pub fn into_mut_refs(self) -> (&'t mut K, &'t mut V) {
+        unsafe {
+            (&mut *self.raw.key,
+             &mut *self.raw.val)
+        }
     }
 }
 
-impl<K, V, M> BucketWithTable<M> for EmptyBucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
-    }
-
-    fn into_table(self) -> M {
-        self.table
-    }
-
-    fn index(&self) -> uint {
-        self.idx
+impl<K, V, M> BucketState<K, V, M> {
+    // For convenience.
+    pub fn expect_full(self) -> FullBucket<K, V, M> {
+        match self {
+            Full(full) => full,
+            Empty(..) => fail!("Expected full bucket")
+        }
     }
 }
 
-impl<K, V, M> BucketWithTable<M> for Bucket<K, V, M> {
-    fn table<'a>(&'a self) -> &'a M {
-        &self.table
+impl<K, V, M: Deref<RawTable<K, V>>> GapThenFull<K, V, M> {
+    #[inline]
+    pub fn full(&self) -> &FullBucket<K, V, M> {
+        &self.full
     }
 
-    fn into_table(self) -> M {
-        self.table
-    }
+    pub fn shift(mut self) -> Option<GapThenFull<K, V, M>> {
+        unsafe {
+            *self.gap.raw.hash = mem::replace(&mut *self.full.raw.hash, EMPTY_BUCKET);
+            copy_nonoverlapping_memory(self.gap.raw.key, self.full.raw.key as *const K, 1);
+            copy_nonoverlapping_memory(self.gap.raw.val, self.full.raw.val as *const V, 1);
+        }
 
-    fn index(&self) -> uint {
-        self.idx
+        let FullBucket { raw: prev_raw, idx: prev_idx, .. } = self.full;
+
+        match self.full.next().peek() {
+            Full(bucket) => {
+                self.gap.raw = prev_raw;
+                self.gap.idx = prev_idx;
+
+                self.full = bucket;
+
+                Some(self)
+            }
+            Empty(..) => None
+        }
     }
 }
 
-impl<'table, K, V> Deref<RawTable<K, V>> for &'table RawTable<K, V> {
-    fn deref<'a>(&'a self) -> &'a RawTable<K, V> {
-        &**self
-    }
-}
-
-impl<'table, K, V> Deref<RawTable<K, V>> for &'table mut RawTable<K, V> {
-    fn deref<'a>(&'a self) -> &'a RawTable<K, V> {
-        &**self
-    }
-}
-
-impl<'table, K, V> DerefMut<RawTable<K, V>> for &'table mut RawTable<K, V> {
-    fn deref_mut<'a>(&'a mut self) -> &'a mut RawTable<K, V> {
-        &mut **self
-    }
-}
-
-pub enum BucketState<K, V, M> {
-    Empty(EmptyBucket<K, V, M>),
-    Full(FullBucket<K, V, M>),
-}
-
-/// A hash that is not zero, since we use a hash of zero to represent empty
-/// buckets.
-#[deriving(PartialEq)]
-pub struct SafeHash {
-    hash: u64,
-}
-
-impl SafeHash {
-    /// Peek at the hash value, which is guaranteed to be non-zero.
-    #[inline(always)]
-    pub fn inspect(&self) -> u64 { self.hash }
-}
-
-/// We need to remove hashes of 0. That's reserved for empty buckets.
-/// This function wraps up `hash_keyed` to be the only way outside this
-/// module to generate a SafeHash.
-pub fn make_hash<T: Hash<S>, S, H: Hasher<S>>(hasher: &H, t: &T) -> SafeHash {
-    match hasher.hash(t) {
-        // This constant is exceedingly likely to hash to the same
-        // bucket, but it won't be counted as empty!
-        EMPTY_BUCKET => SafeHash { hash: 0x8000_0000_0000_0000 },
-        h => SafeHash { hash: h },
-    }
-}
 
+/// Rounds up to a multiple of a power of two. Returns the closest multiple
+/// of `target_alignment` that is greater than or equal to `unrounded`.
+///
+/// # Failure
+///
+/// Fails if `target_alignment` is not a power of two.
 fn round_up_to_next(unrounded: uint, target_alignment: uint) -> uint {
     assert!(is_power_of_two(target_alignment));
     (unrounded + target_alignment - 1) & !(target_alignment - 1)
@@ -531,7 +555,6 @@ fn test_offset_calculation() {
 }
 
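[Editorial aside, not part of the patch: the rounding trick documented above, as a self-contained sketch in today's Rust. For a power-of-two `align`, adding `align - 1` and masking off the low bits rounds up without a branch or a division.]

    fn round_up_to_next(unrounded: usize, align: usize) -> usize {
        assert!(align.is_power_of_two());
        (unrounded + align - 1) & !(align - 1)
    }

    fn main() {
        assert_eq!(round_up_to_next(0, 4), 0);
        assert_eq!(round_up_to_next(1, 4), 4);
        assert_eq!(round_up_to_next(5, 4), 8); // the same case as `test_rounding`
    }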
 impl<K, V> RawTable<K, V> {
-
     /// Does not initialize the buckets. The caller should ensure they,
     /// at the very least, set every hash to EMPTY_BUCKET.
     unsafe fn new_uninitialized(capacity: uint) -> RawTable<K, V> {
@@ -540,6 +563,7 @@ impl<K, V> RawTable<K, V> {
                 size: 0,
                 capacity: 0,
                 hashes: 0 as *mut u64,
+                marker: marker::CovariantType,
             };
         }
         let hashes_size = capacity.checked_mul(&size_of::<u64>())
@@ -571,17 +595,18 @@ impl<K, V> RawTable<K, V> {
             capacity: capacity,
             size: 0,
             hashes: hashes,
+            marker: marker::CovariantType,
         }
     }
 
-    fn as_mut_ptrs(&self) -> RawBucket<K, V> {
+    fn first_bucket_raw(&self) -> RawBucket<K, V> {
         let hashes_size = self.capacity * size_of::<u64>();
         let keys_size = self.capacity * size_of::<K>();
 
-        let keys_offset = (hashes_size + min_align_of::< K >() - 1) & !(min_align_of::< K >() - 1);
+        let keys_offset = (hashes_size + min_align_of::<K>() - 1) & !(min_align_of::<K>() - 1);
         let end_of_keys = keys_offset + keys_size;
 
-        let vals_offset = (end_of_keys + min_align_of::< V >() - 1) & !(min_align_of::< V >() - 1);
+        let vals_offset = (end_of_keys + min_align_of::<V>() - 1) & !(min_align_of::<V>() - 1);
 
         let buffer = self.hashes as *mut u8;
 
@@ -600,7 +625,7 @@ impl<K, V> RawTable<K, V> {
     pub fn new(capacity: uint) -> RawTable<K, V> {
         unsafe {
             let ret = RawTable::new_uninitialized(capacity);
-            set_memory(ret.hashes, 0u8, capacity);
+            zero_memory(ret.hashes, capacity);
             ret
         }
     }
@@ -616,49 +641,51 @@ impl<K, V> RawTable<K, V> {
         self.size
     }
 
-    fn ptrs<'a>(&'a self) -> RawBuckets<'a, K, V> {
+    fn raw_buckets(&self) -> RawBuckets<K, V> {
         RawBuckets {
-            raw: self.as_mut_ptrs(),
+            raw: self.first_bucket_raw(),
             hashes_end: unsafe {
                 self.hashes.offset(self.capacity as int)
            }
         }
     }
 
-    pub fn iter<'a>(&'a self) -> Entries<'a, K, V> {
+    pub fn iter(&self) -> Entries<K, V> {
         Entries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
             elems_left: self.size(),
         }
     }
 
-    pub fn mut_iter<'a>(&'a mut self) -> MutEntries<'a, K, V> {
+    pub fn mut_iter(&mut self) -> MutEntries<K, V> {
         MutEntries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
             elems_left: self.size(),
         }
     }
 
     pub fn move_iter(self) -> MoveEntries<K, V> {
         MoveEntries {
-            iter: self.ptrs(),
+            iter: self.raw_buckets(),
             table: self,
         }
     }
 
-    pub fn rev_move_buckets<'a>(&'a mut self) -> RevMoveBuckets<'a, K, V> {
-        let raw_bucket = self.as_mut_ptrs();
-        unsafe {
-            RevMoveBuckets {
-                raw: raw_bucket.offset(self.capacity as int),
-                hashes_end: raw_bucket.hash,
-                elems_left: self.size
-            }
+    /// Returns an iterator that copies out each entry. Used while the table
+    /// is being dropped.
+    unsafe fn rev_move_buckets(&mut self) -> RevMoveBuckets<K, V> {
+        let raw_bucket = self.first_bucket_raw();
+        RevMoveBuckets {
+            raw: raw_bucket.offset(self.capacity as int),
+            hashes_end: raw_bucket.hash,
+            elems_left: self.size
         }
     }
 }
 
-pub struct RawBuckets<'a, K, V> {
+/// A raw iterator. The basis for some other iterators in this module. Although
+/// this interface is safe, it's not used outside this module.
+struct RawBuckets<'a, K, V> {
     raw: RawBucket<K, V>,
     hashes_end: *mut u64
 }
 
@@ -667,6 +694,8 @@ impl<'a, K, V> Iterator<RawBucket<K, V>> for RawBuckets<'a, K, V> {
     fn next(&mut self) -> Option<RawBucket<K, V>> {
         while self.raw.hash != self.hashes_end {
             unsafe {
+                // We are swapping out the pointer to a bucket and replacing
+                // it with the pointer to the next one.
                 let prev = ptr::replace(&mut self.raw, self.raw.offset(1));
                 if *prev.hash != EMPTY_BUCKET {
                     return Some(prev);
@@ -678,7 +707,10 @@ impl<'a, K, V> Iterator<RawBucket<K, V>> for RawBuckets<'a, K, V> {
         }
     }
 }
 
-pub struct RevMoveBuckets<'a, K, V> {
+/// An iterator that moves out buckets in reverse order. It leaves the table
+/// in an inconsistent state and should only be used for dropping
+/// the table's remaining entries. It's used in the implementation of Drop.
+struct RevMoveBuckets<'a, K, V> {
     raw: RawBucket<K, V>,
     hashes_end: *mut u64,
     elems_left: uint
@@ -708,43 +740,13 @@ impl<'a, K, V> Iterator<(K, V)> for RevMoveBuckets<'a, K, V> {
     }
 }
 
-// `read_all_mut` casts a `*u64` to a `*SafeHash`. Since we statically
-// ensure that a `FullIndex` points to an index with a non-zero hash,
-// and a `SafeHash` is just a `u64` with a different name, this is
-// safe.
-//
-// This test ensures that a `SafeHash` really IS the same size as a
-// `u64`. If you need to change the size of `SafeHash` (and
-// consequently made this test fail), `read_all_mut` needs to be
-// modified to no longer assume this.
-#[test]
-fn can_alias_safehash_as_u64() {
-    assert_eq!(size_of::<SafeHash>(), size_of::<u64>())
-}
-
-/// Note: stage0-specific version that lacks bound.
-#[cfg(stage0)]
-pub struct Entries<'a, K, V> {
-    iter: RawBuckets<'a, K, V>,
-    elems_left: uint,
-}
-
 /// Iterator over shared references to entries in a table.
-#[cfg(not(stage0))]
 pub struct Entries<'a, K: 'a, V: 'a> {
     iter: RawBuckets<'a, K, V>,
     elems_left: uint,
 }
 
-/// Note: stage0-specific version that lacks bound.
-#[cfg(stage0)]
-pub struct MutEntries<'a, K, V> {
-    iter: RawBuckets<'a, K, V>,
-    elems_left: uint,
-}
-
 /// Iterator over mutable references to entries in a table.
-#[cfg(not(stage0))]
 pub struct MutEntries<'a, K: 'a, V: 'a> {
     iter: RawBuckets<'a, K, V>,
     elems_left: uint,
@@ -830,14 +832,14 @@ impl<K: Clone, V: Clone> Clone for RawTable<K, V> {
                         mem::overwrite(new_buckets.raw.key, k);
                         mem::overwrite(new_buckets.raw.val, v);
                     }
-                    _ => {
+                    Empty(..) => {
                         *new_buckets.raw.hash = EMPTY_BUCKET;
                     }
                 }
                 new_buckets.next();
                 buckets.next();
             }
-        }
+        };
 
         new_ht.size = self.size();
 
@@ -852,12 +854,14 @@ impl<K, V> Drop for RawTable<K, V> {
         if self.hashes.is_null() {
             return;
         }
-        // This is in reverse because we're likely to have partially taken
+        // This is done in reverse because we've likely partially taken
         // some elements out with `.move_iter()` from the front.
         // Check if the size is 0, so we don't do a useless scan when
         // dropping empty tables such as on resize.
-        // Avoid double free of elements already moved out.
-        for _ in self.rev_move_buckets() {}
+        // Also avoid double drop of elements that have already been moved out.
+        unsafe {
+            for _ in self.rev_move_buckets() {}
+        }
 
         let hashes_size = self.capacity * size_of::<u64>();
         let keys_size = self.capacity * size_of::<K>();
@@ -871,7 +875,5 @@ impl<K, V> Drop for RawTable<K, V> {
             // Remember how everything was allocated out of one buffer
             // during initialization? We only need one call to free here.
         }
-
-        self.hashes = RawPtr::null();
     }
 }
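[Editorial aside, not part of the patch: `GapThenFull::shift` above implements the Robin Hood backward shift with raw pointer copies. A safe-Rust sketch of the same idea, on a hypothetical `Option`-based table rather than the library's real layout:]

    // After a removal leaves a gap, later entries slide back one slot
    // until an empty slot, or an entry the caller wants left in place,
    // ends the scan. `keep_shifting` is an illustrative stand-in for
    // the `distance() != 0` check done by the map code.
    fn backward_shift<T>(table: &mut [Option<T>],
                         mut gap: usize,
                         keep_shifting: impl Fn(&T, usize) -> bool) {
        let cap = table.len(); // assumed to be a power of two
        loop {
            let next = (gap + 1) & (cap - 1);
            match table[next].take() {
                Some(entry) if keep_shifting(&entry, next) => {
                    table[gap] = Some(entry); // slide it back into the gap
                    gap = next;
                }
                entry => {
                    table[next] = entry; // undo the `take`; the scan is over
                    return;
                }
            }
        }
    }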
From 27f87c611fa57b0320f72f483c60e7b4d70ddc2a Mon Sep 17 00:00:00 2001
From: Piotr Czarnecki
Date: Sat, 26 Jul 2014 04:45:09 +0100
Subject: [PATCH 5/6] std: Fix overflow of HashMap's capacity

---
 src/libstd/collections/hashmap/table.rs           | 85 +++++++++++--------
 .../run-fail/hashmap-capacity-overflow.rs         | 21 +++++
 2 files changed, 72 insertions(+), 34 deletions(-)
 create mode 100644 src/test/run-fail/hashmap-capacity-overflow.rs

diff --git a/src/libstd/collections/hashmap/table.rs b/src/libstd/collections/hashmap/table.rs
index 54469baaef5..2edb8cd092e 100644
--- a/src/libstd/collections/hashmap/table.rs
+++ b/src/libstd/collections/hashmap/table.rs
@@ -526,32 +526,45 @@ fn test_rounding() {
     assert_eq!(round_up_to_next(5, 4), 8);
 }
 
+// Returns a tuple of (key_offset, val_offset),
+// from the start of a mallocated array.
+fn calculate_offsets(hashes_size: uint,
+                     keys_size: uint, keys_align: uint,
+                     vals_align: uint)
+                     -> (uint, uint) {
+    let keys_offset = round_up_to_next(hashes_size, keys_align);
+    let end_of_keys = keys_offset + keys_size;
+
+    let vals_offset = round_up_to_next(end_of_keys, vals_align);
+
+    (keys_offset, vals_offset)
+}
+
 // Returns a tuple of (minimum required malloc alignment, hash_offset,
-// key_offset, val_offset, array_size), from the start of a mallocated array.
-fn calculate_offsets(
-    hash_size: uint, hash_align: uint,
-    keys_size: uint, keys_align: uint,
-    vals_size: uint, vals_align: uint) -> (uint, uint, uint, uint, uint) {
-
-    let hash_offset = 0;
-    let end_of_hashes = hash_offset + hash_size;
-
-    let keys_offset = round_up_to_next(end_of_hashes, keys_align);
-    let end_of_keys = keys_offset + keys_size;
-
-    let vals_offset = round_up_to_next(end_of_keys, vals_align);
-    let end_of_vals = vals_offset + vals_size;
+// array_size), from the start of a mallocated array.
+fn calculate_allocation(hash_size: uint, hash_align: uint,
+                        keys_size: uint, keys_align: uint,
+                        vals_size: uint, vals_align: uint)
+                        -> (uint, uint, uint) {
+    let hash_offset = 0;
+    let (_, vals_offset) = calculate_offsets(hash_size,
+                                             keys_size, keys_align,
+                                             vals_align);
+    let end_of_vals = vals_offset + vals_size;
 
     let min_align = cmp::max(hash_align, cmp::max(keys_align, vals_align));
 
-    (min_align, hash_offset, keys_offset, vals_offset, end_of_vals)
+    (min_align, hash_offset, end_of_vals)
 }
 
 #[test]
 fn test_offset_calculation() {
-    assert_eq!(calculate_offsets(128, 8, 15, 1, 4, 4 ), (8, 0, 128, 144, 148));
-    assert_eq!(calculate_offsets(3,   1, 2,  1, 1, 1 ), (1, 0, 3,   5,   6));
-    assert_eq!(calculate_offsets(6,   2, 12, 4, 24, 8), (8, 0, 8,   24,  48));
+    assert_eq!(calculate_allocation(128, 8, 15, 1, 4,  4), (8, 0, 148));
+    assert_eq!(calculate_allocation(3,   1, 2,  1, 1,  1), (1, 0, 6));
+    assert_eq!(calculate_allocation(6,   2, 12, 4, 24, 8), (8, 0, 48));
+    assert_eq!(calculate_offsets(128, 15, 1, 4), (128, 144));
+    assert_eq!(calculate_offsets(3,   2,  1, 1), (3,   5));
+    assert_eq!(calculate_offsets(6,   12, 4, 8), (8,   24));
 }
 
 impl<K, V> RawTable<K, V> {
@@ -566,12 +579,11 @@ impl<K, V> RawTable<K, V> {
             marker: marker::CovariantType,
         };
         }
-        let hashes_size = capacity.checked_mul(&size_of::<u64>())
-                                  .expect("capacity overflow");
-        let keys_size = capacity.checked_mul(&size_of::< K >())
-                                .expect("capacity overflow");
-        let vals_size = capacity.checked_mul(&size_of::< V >())
-                                .expect("capacity overflow");
+        // No need for `checked_mul` before a more restrictive check performed
+        // later in this method.
+        let hashes_size = capacity * size_of::<u64>();
+        let keys_size = capacity * size_of::< K >();
+        let vals_size = capacity * size_of::< V >();
 
         // Allocating hashmaps is a little tricky. We need to allocate three
         // arrays, but since we know their sizes and alignments up front,
@@ -581,12 +593,19 @@ impl<K, V> RawTable<K, V> {
         // This is great in theory, but in practice getting the alignment
         // right is a little subtle. Therefore, calculating offsets has been
         // factored out into a different function.
-        let (malloc_alignment, hash_offset, _, _, size) =
-            calculate_offsets(
-                hashes_size, min_align_of::<u64>(),
-                keys_size,   min_align_of::< K >(),
-                vals_size,   min_align_of::< V >());
+        let (malloc_alignment, hash_offset, size) =
+            calculate_allocation(
+                hashes_size, min_align_of::<u64>(),
+                keys_size,   min_align_of::< K >(),
+                vals_size,   min_align_of::< V >());
 
+        // One check for overflow that covers calculation and rounding of size.
+        let size_of_bucket = size_of::<u64>().checked_add(&size_of::<K>()).unwrap()
+                                             .checked_add(&size_of::<V>()).unwrap();
+        assert!(size >= capacity.checked_mul(&size_of_bucket)
+                                .expect("capacity overflow"),
+                "capacity overflow");
+
         let buffer = allocate(size, malloc_alignment);
 
         let hashes = buffer.offset(hash_offset as int) as *mut u64;
@@ -603,12 +622,10 @@ impl<K, V> RawTable<K, V> {
         let hashes_size = self.capacity * size_of::<u64>();
         let keys_size = self.capacity * size_of::<K>();
 
-        let keys_offset = (hashes_size + min_align_of::<K>() - 1) & !(min_align_of::<K>() - 1);
-        let end_of_keys = keys_offset + keys_size;
-
-        let vals_offset = (end_of_keys + min_align_of::<V>() - 1) & !(min_align_of::<V>() - 1);
-
         let buffer = self.hashes as *mut u8;
+        let (keys_offset, vals_offset) = calculate_offsets(hashes_size,
+                                                           keys_size, min_align_of::<K>(),
+                                                           min_align_of::<V>());
 
         unsafe {
             RawBucket {
@@ -866,9 +883,9 @@ impl<K, V> Drop for RawTable<K, V> {
         let hashes_size = self.capacity * size_of::<u64>();
         let keys_size = self.capacity * size_of::<K>();
         let vals_size = self.capacity * size_of::<V>();
-        let (align, _, _, _, size) = calculate_offsets(hashes_size, min_align_of::<u64>(),
-                                                       keys_size, min_align_of::<K>(),
-                                                       vals_size, min_align_of::<V>());
+        let (align, _, size) = calculate_allocation(hashes_size, min_align_of::<u64>(),
+                                                    keys_size, min_align_of::<K>(),
+                                                    vals_size, min_align_of::<V>());
 
         unsafe {
             deallocate(self.hashes as *mut u8, size, align);
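[Editorial aside, not part of the patch: the assertion above is the whole overflow story for patch 5. `size` is computed with unchecked arithmetic, but since alignment padding can only make the allocation larger than the packed `capacity * size_of_bucket`, any wraparound makes `size` smaller and trips the check. A minimal sketch of that logic in today's Rust:]

    use std::mem::size_of;

    // Illustrative only; mirrors the shape of the check, not its exact code.
    fn assert_no_overflow<K, V>(capacity: usize, size: usize) {
        let size_of_bucket = size_of::<u64>()        // hash
            .checked_add(size_of::<K>()).unwrap()    // key
            .checked_add(size_of::<V>()).unwrap();   // value
        assert!(size >= capacity.checked_mul(size_of_bucket)
                                .expect("capacity overflow"),
                "capacity overflow");
    }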
diff --git a/src/test/run-fail/hashmap-capacity-overflow.rs b/src/test/run-fail/hashmap-capacity-overflow.rs
new file mode 100644
index 00000000000..f68b511d0aa
--- /dev/null
+++ b/src/test/run-fail/hashmap-capacity-overflow.rs
@@ -0,0 +1,21 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// error-pattern:capacity overflow
+
+use std::collections::hashmap::HashMap;
+use std::uint;
+use std::mem::size_of;
+
+fn main() {
+    let threshold = uint::MAX / size_of::<(u64, u64, u64)>();
+    let mut h = HashMap::<u64, u64>::with_capacity(threshold + 100);
+    h.insert(0, 0);
+}

From 0ad4644ae1474b5443e34d273e5553612c6d9364 Mon Sep 17 00:00:00 2001
From: Piotr Czarnecki
Date: Fri, 5 Sep 2014 01:24:04 +0100
Subject: [PATCH 6/6] Work around inability to link lifetime of ref bindings
 (#16994)

---
 src/librustc/lint/context.rs  | 6 ++++--
 src/librustdoc/html/render.rs | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/librustc/lint/context.rs b/src/librustc/lint/context.rs
index b40916dcc30..18e44cbac37 100644
--- a/src/librustc/lint/context.rs
+++ b/src/librustc/lint/context.rs
@@ -103,7 +103,9 @@ impl LintStore {
     }
 
     pub fn get_lint_groups<'t>(&'t self) -> Vec<(&'static str, Vec<LintId>, bool)> {
-        self.lint_groups.iter().map(|(k, &(ref v, b))| (*k, v.clone(), b)).collect()
+        self.lint_groups.iter().map(|(k, v)| (*k,
+                                              v.ref0().clone(),
+                                              *v.ref1())).collect()
     }
 
     pub fn register_pass(&mut self, sess: Option<&Session>,
@@ -210,7 +212,7 @@ impl LintStore {
         match self.by_name.find_equiv(&lint_name.as_slice()) {
             Some(&lint_id) => self.set_level(lint_id, (level, CommandLine)),
             None => {
-                match self.lint_groups.iter().map(|(&x, &(ref y, _))| (x, y.clone()))
+                match self.lint_groups.iter().map(|(&x, pair)| (x, pair.ref0().clone()))
                                              .collect::<HashMap<&'static str, Vec<LintId>>>()
                                              .find_equiv(&lint_name.as_slice()) {
                     Some(v) => {
diff --git a/src/librustdoc/html/render.rs b/src/librustdoc/html/render.rs
index fc8fd0d086b..f68971ee8d2 100644
--- a/src/librustdoc/html/render.rs
+++ b/src/librustdoc/html/render.rs
@@ -312,7 +312,7 @@ pub fn run(mut krate: clean::Crate, external_html: &ExternalHtml, dst: Path) ->
     }).unwrap_or(HashMap::new());
     let mut cache = Cache {
         impls: HashMap::new(),
-        external_paths: paths.iter().map(|(&k, &(ref v, _))| (k, v.clone()))
+        external_paths: paths.iter().map(|(&k, v)| (k, v.ref0().clone()))
                              .collect(),
         paths: paths,
         implementors: HashMap::new(),
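[Editorial aside, not part of the patch: `ref0()`/`ref1()` are the tuple-field accessor methods of that era. The workaround lets the closures borrow through the map reference instead of taking `ref` bindings inside the pattern, whose lifetime issue #16994 made impossible to express. In today's Rust the same shape is written with plain field access; a sketch, with `u32` standing in for `LintId`:]

    use std::collections::HashMap;

    fn lint_groups(groups: &HashMap<&'static str, (Vec<u32>, bool)>)
                   -> Vec<(&'static str, Vec<u32>, bool)> {
        groups.iter()
              .map(|(k, v)| (*k, v.0.clone(), v.1)) // borrow the pair whole, read fields through it
              .collect()
    }

    fn main() {
        let mut groups = HashMap::new();
        groups.insert("style", (vec![1, 2], true));
        assert_eq!(lint_groups(&groups), vec![("style", vec![1, 2], true)]);
    }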