// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. #ifndef _SMALLHASHTABLE_H_ #define _SMALLHASHTABLE_H_ // Since compiler depends on valuenum which depends on smallhash, forward declare // a wrapper for comp->compGetMem here (implemented in compiler.hpp) that can be used below. class Compiler; void* compGetMem(Compiler* comp, size_t sz); // genLog2 is defined in compiler.hpp unsigned genLog2(unsigned value); //------------------------------------------------------------------------ // HashTableInfo: a concept that provides equality and hashing methods for // a particular key type. Used by HashTableBase and its // subclasses. template struct HashTableInfo { // static bool Equals(const TKey& x, const TKey& y); // static unsigned GetHashCode(const TKey& key); }; //------------------------------------------------------------------------ // HashTableInfo: specialized version of HashTableInfo for pointer- // typed keys. template struct HashTableInfo { static bool Equals(const TKey* x, const TKey* y) { return x == y; } static unsigned GetHashCode(const TKey* key) { // Shift off bits that are not likely to be significant size_t keyval = reinterpret_cast(key) >> ConstLog2<__alignof(TKey)>::value; // Truncate and return the result return static_cast(keyval); } }; //------------------------------------------------------------------------ // HashTableInfo: specialized version of HashTableInfo for int- // typed keys. template <> struct HashTableInfo { static bool Equals(int x, int y) { return x == y; } static unsigned GetHashCode(int key) { // Cast and return the key return static_cast(key); } }; //------------------------------------------------------------------------ // HashTableInfo: specialized version of HashTableInfo for unsigned- // typed keys. template <> struct HashTableInfo { static bool Equals(unsigned x, unsigned y) { return x == y; } static unsigned GetHashCode(unsigned key) { // Return the key itself return key; } }; //------------------------------------------------------------------------ // HashTableBase: base type for HashTable and SmallHashTable. This class // provides the vast majority of the implementation. The // subclasses differ in the storage they use at the time of // construction: HashTable allocates the initial bucket // array on the heap; SmallHashTable contains a small inline // array. // // This implementation is based on the ideas presented in Herlihy, Shavit, // and Tzafrir '08 (http://mcg.cs.tau.ac.il/papers/disc2008-hopscotch.pdf), // though it does not currently implement the hopscotch algorithm. // // The approach taken is intended to perform well in both space and speed. // This approach is a hybrid of separate chaining and open addressing with // linear probing: collisions are resolved using a bucket chain, but that // chain is stored in the bucket array itself. // // Resolving collisions using a bucket chain avoids the primary clustering // issue common in linearly-probed open addressed hash tables, while using // buckets as chain nodes avoids the allocaiton traffic typical of chained // tables. Applying the hopscotch algorithm in the aforementioned paper // could further improve performance by optimizing access patterns for // better cache usage. // // Template parameters: // TKey - The type of the table's keys. // TValue - The type of the table's values. // TKeyInfo - A type that conforms to the HashTableInfo concept. template > class HashTableBase { friend class KeyValuePair; friend class Iterator; enum : unsigned { InitialNumBuckets = 8 }; protected: //------------------------------------------------------------------------ // HashTableBase::Bucket: provides storage for the key-value pairs that // make up the contents of the table. // // The "home" bucket for a particular key is the bucket indexed by the // key's hash code modulo the size of the bucket array (the "home index"). // // The home bucket is always considered to be part of the chain that it // roots, even if it is also part of the chain rooted at a different // bucket. `m_firstOffset` indicates the offset of the first non-home // bucket in the home bucket's chain. If the `m_firstOffset` of a bucket // is 0, the chain rooted at that bucket is empty. // // The index of the next bucket in a chain is calculated by adding the // value in `m_nextOffset` to the index of the current bucket. If // `m_nextOffset` is 0, the current bucket is the end of its chain. Each // bucket in a chain must be occupied (i.e. `m_isFull` will be true). struct Bucket { bool m_isFull; // True if the bucket is occupied; false otherwise. unsigned m_firstOffset; // The offset to the first node in the chain for this bucket index. unsigned m_nextOffset; // The offset to the next node in the chain for this bucket index. unsigned m_hash; // The hash code for the element stored in this bucket. TKey m_key; // The key for the element stored in this bucket. TValue m_value; // The value for the element stored in this bucket. }; private: Compiler* m_compiler; // The compiler context to use for allocations. Bucket* m_buckets; // The bucket array. unsigned m_numBuckets; // The number of buckets in the bucket array. unsigned m_numFullBuckets; // The number of occupied buckets. //------------------------------------------------------------------------ // HashTableBase::Insert: inserts a key-value pair into a bucket array. // // Arguments: // buckets - The bucket array in which to insert the key-value pair. // numBuckets - The number of buckets in the bucket array. // hash - The hash code of the key to insert. // key - The key to insert. // value - The value to insert. // // Returns: // True if the key-value pair was successfully inserted; false // otherwise. static bool Insert(Bucket* buckets, unsigned numBuckets, unsigned hash, const TKey& key, const TValue& value) { const unsigned mask = numBuckets - 1; unsigned homeIndex = hash & mask; Bucket* home = &buckets[homeIndex]; if (!home->m_isFull) { // The home bucket is empty; use it. // // Note that `m_firstOffset` does not need to be updated: whether or not it is non-zero, // it is already correct, since we're inserting at the head of the list. `m_nextOffset` // must be 0, however, since this node should not be part of a list. assert(home->m_nextOffset == 0); home->m_isFull = true; home->m_hash = hash; home->m_key = key; home->m_value = value; return true; } // If the home bucket is full, probe to find the next empty bucket. unsigned precedingIndexInChain = homeIndex; unsigned nextIndexInChain = (homeIndex + home->m_firstOffset) & mask; for (unsigned j = 1; j < numBuckets; j++) { unsigned bucketIndex = (homeIndex + j) & mask; Bucket* bucket = &buckets[bucketIndex]; if (bucketIndex == nextIndexInChain) { assert(bucket->m_isFull); precedingIndexInChain = bucketIndex; nextIndexInChain = (bucketIndex + bucket->m_nextOffset) & mask; } else if (!bucket->m_isFull) { bucket->m_isFull = true; if (precedingIndexInChain == nextIndexInChain) { bucket->m_nextOffset = 0; } else { assert(((nextIndexInChain - bucketIndex) & mask) > 0); bucket->m_nextOffset = (nextIndexInChain - bucketIndex) & mask; } unsigned offset = (bucketIndex - precedingIndexInChain) & mask; assert(offset != 0); if (precedingIndexInChain == homeIndex) { buckets[precedingIndexInChain].m_firstOffset = offset; } else { buckets[precedingIndexInChain].m_nextOffset = offset; } bucket->m_hash = hash; bucket->m_key = key; bucket->m_value = value; return true; } } // No more free buckets. return false; } //------------------------------------------------------------------------ // HashTableBase::TryGetBucket: attempts to get the bucket that holds a // particular key. // // Arguments: // hash - The hash code of the key to find. // key - The key to find. // precedingIndex - An output parameter that will hold the index of the // preceding bucket in the chain for the key. May be // equal to `bucketIndex` if the key is stored in its // home bucket. // bucketIndex - An output parameter that will hold the index of the // bucket that stores the key. // // Returns: // True if the key was successfully found; false otherwise. bool TryGetBucket(unsigned hash, const TKey& key, unsigned* precedingIndex, unsigned* bucketIndex) const { if (m_numBuckets == 0) { return false; } const unsigned mask = m_numBuckets - 1; unsigned index = hash & mask; Bucket* bucket = &m_buckets[index]; if (bucket->m_isFull && bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key)) { *precedingIndex = index; *bucketIndex = index; return true; } for (unsigned offset = bucket->m_firstOffset; offset != 0; offset = bucket->m_nextOffset) { unsigned precedingIndexInChain = index; index = (index + offset) & mask; bucket = &m_buckets[index]; assert(bucket->m_isFull); if (bucket->m_hash == hash && TKeyInfo::Equals(bucket->m_key, key)) { *precedingIndex = precedingIndexInChain; *bucketIndex = index; return true; } } return false; } //------------------------------------------------------------------------ // HashTableBase::Resize: allocates a new bucket array twice the size of // the current array and copies the key-value pairs // from the current bucket array into the new array. void Resize() { Bucket* currentBuckets = m_buckets; unsigned newNumBuckets = m_numBuckets == 0 ? InitialNumBuckets : m_numBuckets * 2; size_t allocSize = sizeof(Bucket) * newNumBuckets; assert((sizeof(Bucket) * m_numBuckets) < allocSize); auto* newBuckets = reinterpret_cast(compGetMem(m_compiler, allocSize)); memset(newBuckets, 0, allocSize); for (unsigned currentIndex = 0; currentIndex < m_numBuckets; currentIndex++) { Bucket* currentBucket = ¤tBuckets[currentIndex]; if (!currentBucket->m_isFull) { continue; } bool inserted = Insert(newBuckets, newNumBuckets, currentBucket->m_hash, currentBucket->m_key, currentBucket->m_value); (assert(inserted), (void)inserted); } m_numBuckets = newNumBuckets; m_buckets = newBuckets; } protected: HashTableBase(Compiler* compiler, Bucket* buckets, unsigned numBuckets) : m_compiler(compiler), m_buckets(buckets), m_numBuckets(numBuckets), m_numFullBuckets(0) { assert(compiler != nullptr); if (numBuckets > 0) { assert((numBuckets & (numBuckets - 1)) == 0); // Size must be a power of 2 assert(m_buckets != nullptr); memset(m_buckets, 0, sizeof(Bucket) * numBuckets); } } public: #ifdef DEBUG class Iterator; class KeyValuePair final { friend class HashTableBase::Iterator; Bucket* m_bucket; KeyValuePair(Bucket* bucket) : m_bucket(bucket) { assert(m_bucket != nullptr); } public: KeyValuePair() : m_bucket(nullptr) { } inline TKey& Key() { return m_bucket->m_key; } inline TValue& Value() { return m_bucket->m_value; } }; // NOTE: HashTableBase only provides iterators in debug builds because the order in which // the iterator type produces values is undefined (e.g. it is not related to the order in // which key-value pairs were inserted). class Iterator final { friend class HashTableBase; Bucket* m_buckets; unsigned m_numBuckets; unsigned m_index; Iterator(Bucket* buckets, unsigned numBuckets, unsigned index) : m_buckets(buckets), m_numBuckets(numBuckets), m_index(index) { assert((buckets != nullptr) || (numBuckets == 0)); assert(index <= numBuckets); // Advance to the first occupied bucket while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull) { m_index++; } } public: Iterator() : m_buckets(nullptr), m_numBuckets(0), m_index(0) { } KeyValuePair operator*() const { if (m_index >= m_numBuckets) { return KeyValuePair(); } Bucket* bucket = &m_buckets[m_index]; assert(bucket->m_isFull); return KeyValuePair(bucket); } KeyValuePair operator->() const { return this->operator*(); } bool operator==(const Iterator& other) const { return (m_buckets == other.m_buckets) && (m_index == other.m_index); } bool operator!=(const Iterator& other) const { return (m_buckets != other.m_buckets) || (m_index != other.m_index); } Iterator& operator++() { do { m_index++; } while (m_index != m_numBuckets && !m_buckets[m_index].m_isFull); return *this; } }; Iterator begin() const { return Iterator(m_buckets, m_numBuckets, 0); } Iterator end() const { return Iterator(m_buckets, m_numBuckets, m_numBuckets); } #endif // DEBUG unsigned Count() const { return m_numFullBuckets; } void Clear() { if (m_numBuckets > 0) { memset(m_buckets, 0, sizeof(Bucket) * m_numBuckets); m_numFullBuckets = 0; } } //------------------------------------------------------------------------ // HashTableBase::AddOrUpdate: adds a key-value pair to the hash table if // the key does not already exist in the // table, or updates the value if the key // already exists. // // Arguments: // key - The key for which to add or update a value. // value - The value. // // Returns: // True if the value was added; false if it was updated. bool AddOrUpdate(const TKey& key, const TValue& value) { unsigned hash = TKeyInfo::GetHashCode(key); unsigned unused, index; if (TryGetBucket(hash, key, &unused, &index)) { m_buckets[index].m_value = value; return false; } // If the load is greater than 0.8, resize the table before inserting. if ((m_numFullBuckets * 5) >= (m_numBuckets * 4)) { Resize(); } bool inserted = Insert(m_buckets, m_numBuckets, hash, key, value); (assert(inserted), (void)inserted); m_numFullBuckets++; return true; } //------------------------------------------------------------------------ // HashTableBase::TryRemove: removes a key from the hash table and returns // its value if the key exists in the table. // // Arguments: // key - The key to remove from the table. // value - An output parameter that will hold the value for the removed // key. // // Returns: // True if the key was removed from the table; false otherwise. bool TryRemove(const TKey& key, TValue* value) { unsigned hash = TKeyInfo::GetHashCode(key); unsigned precedingIndexInChain, bucketIndex; if (!TryGetBucket(hash, key, &precedingIndexInChain, &bucketIndex)) { return false; } Bucket* bucket = &m_buckets[bucketIndex]; if (precedingIndexInChain != bucketIndex) { const unsigned mask = m_numBuckets - 1; unsigned homeIndex = hash & mask; unsigned nextOffset; if (bucket->m_nextOffset == 0) { nextOffset = 0; } else { unsigned nextIndexInChain = (bucketIndex + bucket->m_nextOffset) & mask; nextOffset = (nextIndexInChain - precedingIndexInChain) & mask; } if (precedingIndexInChain == homeIndex) { m_buckets[precedingIndexInChain].m_firstOffset = nextOffset; } else { m_buckets[precedingIndexInChain].m_nextOffset = nextOffset; } } bucket->m_isFull = false; bucket->m_nextOffset = 0; m_numFullBuckets--; *value = bucket->m_value; return true; } //------------------------------------------------------------------------ // HashTableBase::TryGetValue: retrieves the value for a key if the key // exists in the table. // // Arguments: // key - The key to find from the table. // value - An output parameter that will hold the value for the key. // // Returns: // True if the key was found in the table; false otherwise. bool TryGetValue(const TKey& key, TValue* value) const { unsigned unused, index; if (!TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index)) { return false; } *value = m_buckets[index].m_value; return true; } //------------------------------------------------------------------------ // HashTableBase::Contains: returns true if a key exists in the table and // false otherwise. // // Arguments: // key - The key to find from the table. // // Returns: // True if the key was found in the table; false otherwise. bool Contains(const TKey& key) const { unsigned unused, index; return TryGetBucket(TKeyInfo::GetHashCode(key), key, &unused, &index); } }; //------------------------------------------------------------------------ // HashTable: a simple subclass of `HashTableBase` that always uses heap // storage for its bucket array. template > class HashTable final : public HashTableBase { typedef HashTableBase TBase; static unsigned RoundUp(unsigned initialSize) { return 1 << genLog2(initialSize); } public: HashTable(Compiler* compiler) : TBase(compiler, nullptr, 0) { } HashTable(Compiler* compiler, unsigned initialSize) : TBase(compiler, reinterpret_cast( compGetMem(compiler, RoundUp(initialSize) * sizeof(typename TBase::Bucket))), RoundUp(initialSize)) { } }; //------------------------------------------------------------------------ // SmallHashTable: an alternative to `HashTable` that stores the initial // bucket array inline. Most useful for situations where // the number of key-value pairs that will be stored in // the map at any given time falls below a certain // threshold. Switches to heap storage once the initial // inline storage is exhausted. template > class SmallHashTable final : public HashTableBase { typedef HashTableBase TBase; enum : unsigned { RoundedNumInlineBuckets = 1 << ConstLog2::value }; typename TBase::Bucket m_inlineBuckets[RoundedNumInlineBuckets]; public: SmallHashTable(Compiler* compiler) : TBase(compiler, m_inlineBuckets, RoundedNumInlineBuckets) { } }; #endif // _SMALLHASHTABLE_H_