My roommate learned the hash table only on the last day of 2021

Posted by daucoin on Fri, 31 Dec 2021 13:16:42 +0100

Hash concept

A one-to-one mapping relationship can be established between the storage location of an element and its key through a function (hashFunc), so the element can be found quickly during search

The conversion function used in the hash method is called hash (hash) function, and the structure constructed is called hash table (or Hash list)

Mapping mode

① Direct addressing method

The index relationship is established by the relative mapping or absolute position between the array and the data. At this time, the time complexity of adding, deleting, checking and modifying is O (1)
Defects:
1. If the data range is large, the direct customization method will waste a lot of space
2. Cannot process string, floating point number and other data, and cannot be used as the index of the array

Applicable to: integers and data sets

② Division residue method

Solve the problem of large data range

However, there is hash conflict in the divide and remainder method (different keywords calculate the same hash address through the same hash hash number, which is called hash conflict or hash collision)

Resolve hash conflicts

1. Closed hash (open addressing method)

① Linear detection

Linear detection can lead to a stampede effect (continuous position conflicts)

② Secondary detection

Secondary detection is more dispersed than linear detection data, which can effectively alleviate the stampede effect

Load factor (load factor) = number of valid data stored / space size
The larger the load factor, the higher the probability of conflict, the lower the efficiency of adding, deleting, searching and modifying, and the higher the space utilization
The smaller the load factor, the lower the probability of conflict, the higher the efficiency of adding, deleting, searching and modifying, and the lower the space utilization

Therefore, an important problem of hash table is to control the size of load factor

③ Node design

In order to determine whether a position in the array is empty, we use State to identify the State of the data in the array

	//Three states of data are listed
	enum State
	{
		EMPTY,//empty
		EXITS,//existence
		DELETE,//When deleting, you only need to change the status to DELETE, so as to realize "false" deletion
	};
    
	template<class K,class V>
	struct HashData
	{   
	    //Each data node is stored in the vector
	    //kv structure
		pair<K, V> _kv;
		
		State _state = EMPTY;
	};

④ Find operation

When the search is empty, it is not found. We control the size of the load factor to be less than 0.8, so there must be an empty location

		HashData<K,V>* Find(const K& key)
		{
			//Judge whether it is empty
			if (_table.size() == 0)
			{
				return nullptr;
			}
			HashFunc hf;
			size_t start = hf(key) % _table.size();
			size_t index = start;
			size_t i = 1;
			while (_table[index].state != EMPTY)
			{
				//eureka
				if (_table[index]._state==EXITS && _table[index]._kv.first == key)
				{
					return &_table[index];
				}
				//Look back
				//Linear detection secondary detection only needs to be changed to index=start+i*i;
				index =start+ i;
				index %= _table.size();
				++i;
			}
			return nullptr;
		}

⑤ Insert operation

The insertion operation needs to pay attention to the problem of capacity expansion. After capacity expansion, the data must be remapped

		bool Insert(const pair<K, V>& kv)
		{
			//Judge whether it exists
			HashData<K,V>* ret = Find(kv.first);
			if (ret)
			{
				return false;
			}
			//size==0
			if (_table.size() == 0)
			{
				_table.resize(10);
			}
			//Calculate load factor 
			else if ((double)_n/(double)_table.size()>0.8)
			{
				//After capacity expansion, all data must be remapped into the new array
				//Recreate the HashTable object and reuse its insert function
				HashTable<K, V,HashFunc> newHashTable;
				newHashTable.resize(2 * _table.size());
				//Traverse the data of the original Table
				for (auto& e : _table)
				{
					if (e._state == EXITS)
					{
						//Reuse Insert
						newHashTable.Insert(e._kv);
					}
				}
				_table.swap(newHashTable._table);
			}
			HashFunc hf;
			//If accessing data larger than size < capacity 
			size_t start = hf(kv.first) % _table.size();
			size_t index = start;
			size_t i = 1;
			//Linear detection or secondary detection
			while (_table[index]._state == EXITS)
			{
				index += start+i;
				index %= _table.size();
				++i;
			}
			_table[index]._kv = kv;
			_table[index]._state = EXITS;
			++_n;
		}

⑥ Delete operation

Once found, you only need to modify the status

	bool Erase(const K& key)
	{
		    //lookup
			HashData<K, V>* ret = Find(key);
			if (ret == nullptr)
			{
				return false;
			}
			else
			{
				//Modify node status
				ret->_state = DELETE;
				--_n;
				return true;
			}
	}

Complete code

#pragma once
#include<vector>
#include<iostream>
using namespace std;
//closeHash
namespace tzc
{
	enum State
	{
		EMPTY,
		EXITS,
		DELETE,
	};

	template<class K,class V>
	struct HashData
	{
		pair<K, V> _kv;
		State _state = EMPTY;
	};
	
	template<class K>
	struct HashFunc
	{
		int operator()(int i)
		{
			return i;
		}
	};
	template<>
	struct HashFunc<string>
	{
		size_t operator()(const string& s)
		{
			//return s[0];
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}
			return value;
		}
	};
	struct pairHashFunc
	{
		size_t operator()(const pair<string, string>& kv)
		{
			size_t value = 0;
			//Take KEY as the standard
			for (auto& ch : kv.first)
			{
				value += ch;
				value *= 131;
			}
			return value;
		}
	};

	template<class K, class V,class HashFunc>
	class HashTable
	{
	public:

		bool Insert(const pair<K, V>& kv)
		{
			//Judge whether it exists
			HashData<K,V>* ret = Find(kv.first);
			if (ret)
			{
				return false;
			}
			//size==0
			if (_table.size() == 0)
			{
				_table.resize(10);
			}
			//Calculate load factor 
			else if ((double)_n/(double)_table.size()>0.8)
			{
				//After capacity expansion, all data must be remapped into the new array

				//Recreate the HashTable object and reuse its insert function
				HashTable<K, V,HashFunc> newHashTable;
				newHashTable.resize(2 * _table.size());
				for (auto& e : _table)
				{
					if (e._state == EXITS)
					{
						//Multiplex insertion
						newHashTable.Insert(e._kv);
					}
				}
				_table.swap(newHashTable._table);
			}
			HashFunc hf;
			size_t start = hf(kv.first) % _table.size();//table[]
			size_t index = start;
			size_t i = 1;
			//Linear detection or secondary detection
			while (_table[index]._state == EXITS)
			{
				index += start+i;
				index %= _table.size();
				++i;
			}
			_table[index]._kv = kv;
			_table[index]._state = EXITS;
			++_n;
		}
		HashData<K,V>* Find(const K& key)
		{
			//Judge whether it is empty
			if (_table.size() == 0)
			{
				return nullptr;
			}
			HashFunc hf;
			size_t start = hf(key) % _table.size();
			size_t index = start;
			size_t i = 1;
			while (_table[index].state != EMPTY)
			{
				//eureka
				if (_table[index]._state==EXITS && _table[index]._kv.first == key)
				{
					return &_table[index];
				}
				//Look back
				//Linear detection secondary detection only needs to be changed to index=start+i*i;
				index =start+ i;
				index %= _table.size();
				++i;
			}
			return nullptr;
		}
		bool Erase(const K& key)
		{
			HashData<K, V>* ret = Find(key);
			if (ret == nullptr)
			{
				return false;
			}
			else
			{
				ret->_state = DELETE;
				--_n;
				return true;
			}
		}
	private:
		vector<HashData<K,V>> _table;
		size_t _n;//The number of data stored in the array
	};
}

Problems with hash

For the following statements:

size_t start = hf(kv.first) % _table.size();

When the key stores string, user-defined type and other data, it cannot be converted into a mapping relationship and stored in the hash table

At this time, you need to implement the corresponding imitation function to convert the key module

 template<class K>
	struct HashFunc
	{
		int operator()(int i)
		{
			return i;
		}
	};
	
	//template specialization 
	template<>
	struct HashFunc<string>
	{
		size_t operator()(const string& s)
		{
			//return s[0];
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}
			return value;
    }

It can be seen that if a type is used as a map/set key, it needs to be able to support size comparison

Do unordered_ map/unordered_ The key of set needs to be able to be converted into shaping (supporting modulo) and can be compared to equality

2. Hash

Hashing – hash bucket / zipper method
A single linked table pointer that stores a structure in an array

① Node design

Node stored in vector

template<class K,class V>
	struct HashNode
	{
		pair<K, V> _kv;
		HashNode<K, V>* _kv;
		HashNode(const pair<K, V>& kv)
			:_next(__nullptr)
			,_kv(kv)
		{
			
		}
	};

Private members contained in HashTable

		vector<Node*> _table;
		size_t _n = 0;//Number of valid data

② Insert operation

Capacity increase: it is agreed that when the load factor is equal to 1, it is necessary to increase capacity and reorganize, traverse the nodes in the old table and insert them into the new table
Hash table reorganization method 1: cleverly reuse the insert method

	bool Insert(const pair<K.V>& kv)
		{
			if (Find(kv.first))
			{
				return false;
			}
			if (_n == _table.size())
			{
				//Open up a new vector
				vector<Node*> newtable;
				size_t newSize = _table.size() == 0 ? 11: _table.size() * 2;
				newtable.resize(GetNextPrime(_table.size()), nullptr);

				//Traverse old tables
				for (size_t i = 0; i < _table.size(); ++i)
				{
					if (_table[i])
					{
						Node* cur = _table[i];
						while (cur)
						{
							Node* next = cur->_next;
							size_t index = cur->_kv.first % newtable.size();
							//Head insertion
							cur->_next = _table[index];
							_table[index] = cur;
							cur = next;
						}
						_table[i] = nullptr;
					}
				}
				//Old and new exchange
				_table.swap(newtable);
			}
			HashFunc fc;
			size_t index = fc(kv.first) % _table.size();
			Node* newnode = new Node(kv);
			//Head insert
			newnode->_next = _table[index];
			_table[index] = newnode;
			++_n;
			return true;
		}

Hash table reorganization method 2: the reorganization is realized by continuously inserting the head node of the old bucket node into the new corresponding vector

bool Insert(const pair<K.V>& kv)
	{
		if (Find(kv.first))
		{
			return false;
		}
		if (_n == _table.size())
		{
			vector<Node*> newtable(sizeof(_table.size()),(Node*)0);
			//Traverse old nodes
			for(size_t i=0;i<_table.size();i++)
			{
				Node* cur=_table[i];
				while(cur)
				{
					//Find out which bucket the node is in
					HashFunc fc;        
					size_t cur_bucket= fc(cur.first) % _table.size();
					//Four step delicate operation
					//1. Take the next node of cur to the head of the bucket
					_table[i]=cur->next;
					//2,3: insert the cur node into the new vector by head interpolation
					cur->next=newtable[cur_bucket];
					newtable[cur_bucket]=cur;
					//4.cur becomes the head of the old barrel again
					cur=_table[i];					
				}
			}
			_table.swap(newtable);
		}
		HashFunc fc;
		size_t index = fc(kv.first) % _table.size();
		Node* newnode = new Node(kv);
		//Head insert
		newnode->_next = _table[index];
		_table[index] = newnode;
		++_n;
		return true;
	}

③ Find operation

	Node* Find(const K& key)
		{
		
			if (_table.size() == 0)
			{
				return false;
			}
			HashFunc fc;
			size_t index = fc(key) % _table.size();
			Node* cur = _table[index];
			while(cur)
			{
				if (cur->_kv.first == key)
				{
					return cur;
				}
				else
				{
					cur = cur->_next;
				}
			}
			return nullptr;
		}

④ Delete operation

		bool Erase(const K& key)
		{
			HashFunc fc;
			size_t index = fc(key) % _table.size();
			Node* prev = nullptr;
			Node* cur = _table[index];
			while (cur)
			{
				if (cur->_kv.first == key)
				{   
					//The header node is deleted
					if (_table[index] == cur)
					{
						_table[index] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}
					--_n;
					delete cur;
					return true;
				}
				prev = cur;
				cur = cur->_next;
			}
		}

Advantages of hash bucket:
1. High space utilization
2. Extreme situations (all conflicts) and Solutions

Resolution of hash bucket conflict

If the data is concentrated on several buckets at this time, the time complexity will be very large when searching and deleting, which may be equal to O (N)

1. Therefore, the load factor can be controlled
The load factor of open hash may be greater than 1, so the control load factor is generally [0,1]

2. Secondly, when there are few data, the load factor is very low, but most of the data conflict
The bucket with more data can be used to change its data structure into a red black tree

Design of bucket size:
Try to make the size of the table prime, so it is less likely to conflict

const int PRIMECOUNT = 28;

const size_t primeList[PRIMECOUNT] ={ 
 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, 3079ul, 6151ul, 12289ul,
 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, 3145739ul, 
 6291469ul,12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, 3221225473ul, 4294967291ul};
 
   size_t GetNextPrime(size_t prime)
	{
	 size_t i = 0; 
	 for(; i < PRIMECOUNT; ++i) 
	 { 
		 if(primeList[i] > primeList[i]) 
		 return primeList[i]; 
	 } 
   return primeList[i];
   }

encapsulation

① Iterator

In addition to passing the current node pointer, the iterator should also pass in the hash table
Because + + cannot find the position of the next bucket when the iterator reaches the end node of a bucket
At this time, the hash table is passed in and the position of the next bucket is recalculated

	// forward declaration  					// Default parameters are only given when defining
	template<class K, class T, class KeyOfT, class HashFunc>
	class HashTable;

	// iterator 
	template<class K, class T, class KeyOfT, class HashFunc = Hash<K>>
	struct __HTIterator
	{
		typedef HashNode<T> Node;
		typedef __HTIterator<K, T, KeyOfT, HashFunc> Self;
		
		typedef HashTable<K, T, KeyOfT, HashFunc> HT;
		Node* _node;
		HT* _pht;
			//Hash table passed to iterator
		__HTIterator(Node* node, HT* pht)
			:_node(node)
			, _pht(pht)
		{}

		Self& operator++()
		{
			// 1. Traverse the nodes of each bucket one by one
			if (_node->_next)
			{
				_node = _node->_next;
			}
			//To the next bucket
			else
			{
				//size_t index = HashFunc()(KeyOfT()(_node->_data)) % _pht->_table.size();
				KeyOfT kot;
				HashFunc hf;
				size_t index = hf(kot(_node->_data)) % _pht->_table.size();

				++index;
				while (index < _pht->_table.size())
				{
					if (_pht->_table[index])
					{
						//_ You need Shen Mingyuan for pht to access private members
						_node = _pht->_table[index];
						
						return *this;
					}
					//Empty bucket
					else
					{
						++index;
					}
				}

				_node = nullptr;
			}

			return *this;
		}
		//The underlying implementation uses a one-way linked list without overloading--
		T& operator*()
		{
			return _node->_data;
		}

		T* operator->()
		{
			return &_node->_data;
		}

		bool operator != (const Self& s) const
		{
			return _node != s._node;
		}

		bool operator == (const Self& s) const
		{
			return _node == s.node;
		}
	};

②hash functions

hash fuctions is an imitation function to solve the problem of modulo of stored values
① char,int, long, etc. are integer types and return values directly
② Strings and built-in types can only be returned after special operations

And the returned value should consider the hash conflict

//Convert Key to realize modular operation
	template<class K>
	struct Hash
	{
		size_t operator()(const K& key)
		{
			return key;
		}
	};
	// Specialization string
	template<>
	struct Hash < string >
	{
		size_t operator()(const string& s)
		{
			// BKDR Hash
			size_t value = 0;
			for (auto ch : s)
			{
				value += ch;
				value *= 131;
			}
			return value;
		}
	};

③ Hash table

	
	//Node storage T
	template<class T>
	struct HashNode
	{
		HashNode<T>* _next;
		T _data;

		HashNode(const T& data)
			:_next(nullptr)
			, _data(data)
		{}
	};
  template<class K, class T, class KeyOfT, class HashFunc = Hash<K>>
	class HashTable
	{
		typedef HashNode<T> Node;

		template<class K, class T, class KeyOfT, class HashFunc>
		friend struct __HTIterator;//Class template parameter friend
	public:
		typedef __HTIterator<K, T, KeyOfT, HashFunc> iterator;

		HashTable() = default; // Displays the default construction for the specified build
	   //Deep copy
		HashTable(const HashTable& ht)
		{
			_n = ht._n;
			_table.resize(ht._table.size());
			for (size_t i = 0; i < ht._table.size(); i++)
			{
				Node* cur = ht._table[i];
				while (cur)
				{
					Node* copy = new Node(cur->_data);
					// Traversal insertion
					copy->_next = _table[i];
					_table[i] = copy;

					cur = cur->_next;
				}
			}
		}
		
		HashTable& operator=(HashTable ht)
		{
			_table.swap(ht._table);
			swap(_n, ht._n);

			return *this;
		}

		~HashTable()
		{
			for (size_t i = 0; i < _table.size(); ++i)
			{
				Node* cur = _table[i];
				while (cur)
				{
					Node* next = cur->_next;
					delete cur;
					cur = next;
				}
				_table[i] = nullptr;
			}
		}
		//Calculate the number of barrels in the table
		size_t bucket_count()const
		{
			return  _table.size();
		}
		
		iterator begin()
		{
			size_t i = 0;
			while (i < _table.size())
			{
				if (_table[i])
				{
					return iterator(_table[i], this);
				}

				++i;
			}

			return end();
		}

		iterator end()
		{
			return iterator(nullptr, this);
		}

		size_t GetNextPrime(size_t prime)
		{
			const int PRIMECOUNT = 28;
			static const size_t primeList[PRIMECOUNT] =
			{
				53ul, 97ul, 193ul, 389ul, 769ul,
				1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
				49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
				1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
				50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
				1610612741ul, 3221225473ul, 4294967291ul
			};
			size_t i = 0;
			for (; i < PRIMECOUNT; ++i)
			{
				if (primeList[i] > prime)
					return primeList[i];
			}

			return primeList[i];
		}
		//Calculate the maximum number of barrels
		size_t maxbucket_count()const
		{
			return primeList[PRIMECOUNT-1];
		}
		
		//Hash bucket implementation
		pair<iterator, bool> Insert(const T& data)
		{
			KeyOfT kot;
			// eureka
			auto ret = Find(kot(data));
			if (ret != end())
				return make_pair(ret, false);

			HashFunc hf;
			// When the load factor reaches 1, increase the capacity
			if (_n == _table.size())
			{
				vector<Node*> newtable;
				//size_t newSize = _table.size() == 0 ? 8 : _table.size() * 2;
				//newtable.resize(newSize, nullptr);
				newtable.resize(GetNextPrime(_table.size()));

				// Traverse the nodes in the old table and map them to the new table
				for (size_t i = 0; i < _table.size(); ++i)
				{
					if (_table[i])
					{
						Node* cur = _table[i];
						while (cur)
						{
							Node* next = cur->_next;
							size_t index = hf(kot(cur->_data)) % newtable.size();
							// Head insert
							cur->_next = newtable[index];
							newtable[index] = cur;

							cur = next;
						}
						_table[i] = nullptr;
					}
				}

				_table.swap(newtable);
			}

			size_t index = hf(kot(data)) % _table.size();
			Node* newnode = new Node(data);

			// Head insert
			newnode->_next = _table[index];
			_table[index] = newnode;
			++_n;

			return make_pair(iterator(newnode, this), true);
		}

		iterator Find(const K& key)
		{
			if (_table.size() == 0)
			{
				return end();
			}

			KeyOfT kot;//Get key
			HashFunc hf;//Transform key module
			size_t index = hf(key) % _table.size();
			Node* cur = _table[index];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					return iterator(cur, this);
				}
				else
				{
					cur = cur->_next;
				}
			}

			return end();
		}

		bool Erase(const K& key)
		{
			size_t index = hf(key) % _table.size();
			Node* prev = nullptr;
			Node* cur = _table[index];
			while (cur)
			{
				if (kot(cur->_data) == key)
				{
					if (_table[index] == cur)
					{
						_table[index] = cur->_next;
					}
					else
					{
						prev->_next = cur->_next;
					}

					--_n;
					delete cur;
					return true;
				}

				prev = cur;
				cur = cur->_next;
			}

			return false;
		}

	private:
		vector<Node*> _table;
		size_t _n = 0;         // Number of valid data
	};

UnorderedSet

Using set is convenient to find elements quickly
The set with red black tree at the bottom has the function of automatic sorting (medium order traversal), while the unordered set with hash table at the bottom does not

namespace tzc
{
	template<class K>
	class unordered_set
	{
		//Get key
		struct SetKeyOfT
		{
			const K& operator()(const K& k)
			{
				return k;
			}
		};
	public:
		typedef typename OpenHash::HashTable<K, K, SetKeyOfT >::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}

		iterator end()
		{
			return _ht.end();
		}

		pair<iterator, bool> insert(const K k)
		{
			return _ht.Insert(k);
		}

	private:
		OpenHash::HashTable<K, K, SetKeyOfT> _ht;//Hash table implemented by open hash
	};
}

UnorderedMap

namespace tzc
{
	template<class K, class V>
	class unordered_map
	{
		//Get Key
		struct MapKeyOfT
		{
			const K& operator()(const pair<K, V>& kv)
			{
				return kv.first;
			}
		};
	public:
		typedef typename OpenHash::HashTable<K, pair<K, V>, MapKeyOfT>::iterator iterator;
		iterator begin()
		{
			return _ht.begin();
		}

		iterator end()
		{
			return _ht.end();
		}

		pair<iterator, bool> insert(const pair<K, V>& kv)
		{
			return _ht.Insert(kv);
		}

		V& operator[](const K& key)//Overload []
		{
			pair<iterator, bool> ret = _ht.Insert(make_pair(key, V()));
			return ret.first->second;
		}

	private:
		OpenHash::HashTable<K, pair<K, V>, MapKeyOfT> _ht;
	};
}

Topics: C++ data structure