Hash table and bloom filter
Hash operation: a mapping from a high-dimensional space to a low-dimensional space (the mapping rule is chosen by the implementer). In a hash table, the low-dimensional value is the array subscript.
Hash conflicts cannot be avoided, so more emphasis is placed on processing methods
Processing method of hash table:
(1) Open addressing method. (Linear probing: if the computed subscript is already occupied, try the following subscript.) (Quadratic probing, which tries the offsets +1^2, +2^2, +3^2, ..., is used more often.)
(2) Rehashing method. (You need to provide additional hash functions; it is usually combined with other conflict-handling methods.)
(3) Establish a public overflow area. (establish a public overflow buffer (red black tree O(logn) and array O(n) can be used for storage))
(4) Zipper (linked list, also called separate chaining) method. (Treat each position of the hash table as the head node of a linked list.)
Create hash table (method for handling conflicts: open address method)
#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<string>
#include<map>
#include<set>
#include<utility>
#include<vector>
using namespace std;

// A set of strings stored in a hash table that resolves conflicts by open
// addressing (quadratic probing). Elements can be inserted and looked up but
// never removed, which is what lets a lookup stop at the first empty slot.
class HashTable{
public:
    // n is the initial number of slots; non-positive values are clamped to 1
    // so that the modulo in insert()/find() never divides by zero.
    HashTable(int n=100):cnt(0),data(n>0?n:1),flag(n>0?n:1){}

    // Stores s if it is not already present and grows the table once the
    // filling factor exceeds 0.75.
    void insert(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);//Conflict handling: ind now names s's slot or a free slot
        if(flag[ind]) return;//s is already stored
        data[ind] = s;
        flag[ind] = true;
        cnt++;
        if(cnt*100 > data.size()*75){
            expand();//Capacity expansion operation
        }
    }

    // Returns true when s has been inserted before.
    bool find(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);
        return flag[ind];
    }

private:
    size_t cnt;          //Number of stored elements
    vector<string> data; //Slot contents
    vector<bool> flag;   //flag[i] is true when data[i] holds an element; same size as data

    // Rebuilds the table with twice as many slots. O(n) in the number of slots.
    void expand(){
        HashTable h(static_cast<int>(data.size()*2));
        for(size_t i=0;i<data.size();i++){
            if(flag[i]) h.insert(data[i]);
        }
        *this = std::move(h);//Take over the storage of the bigger table
    }

    // BKDRHash: converts a string to a non-negative integer. The arithmetic
    // is unsigned so that wrap-around is well defined.
    int hash_func(const string &s){
        unsigned int seed = 131,hash = 0;
        for(size_t i=0;i<s.size();i++){
            hash = hash*seed+static_cast<unsigned char>(s[i]);
        }
        return static_cast<int>(hash & 0x7fffffff);
    }

    // Advances ind until it names either the slot holding s or a free slot.
    // The first data.size() attempts use square steps (+1^2, +2^2, ...).
    // After that the probe degrades to single steps, because square steps
    // alone are not guaranteed to reach every slot and could loop forever.
    void recalc_ind(int &ind,const string &s){
        const long long n = static_cast<long long>(data.size());
        for(long long t=1; flag[ind] && data[ind]!=s; ++t){
            const long long step = (t<=n) ? t*t : 1;
            ind = static_cast<int>((ind+step)%n);
        }
    }
};

// Reads "op s" pairs from standard input: op 1 inserts s, op 2 prints
// whether s is present.
int main(){
    HashTable h;
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}
Filling factor:
Filling factor = number of stored elements / total capacity of the hash table. The threshold used here is 0.75: once the filling factor exceeds 0.75 the capacity needs to be expanded. The code uses expand() to expand the capacity, and its time complexity is O(n).
Handle conflicts by creating a public buffer. Create a hash table:
#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<string>
#include<map>
#include<set>
#include<utility>
#include<vector>
using namespace std;

// A set of strings stored in a hash table that resolves conflicts with a
// public overflow area: the first key hashing to a slot lives in the slot,
// every later key hashing to the same slot goes into a shared ordered set.
// Elements can be inserted and looked up but never removed.
class HashTable{
public:
    // n is the initial number of slots; non-positive values are clamped to 1
    // so that the modulo in insert()/find() never divides by zero.
    HashTable(int n=100):cnt(0),data(n>0?n:1),flag(n>0?n:1){}

    // Stores s if it is not already present.
    void insert(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);
        if(!flag[ind]){
            data[ind] = s;
            flag[ind] = true;
            cnt++;
            if(cnt*100 > data.size()*75){
                expand();//Capacity expansion operation
            }
        }else if(data[ind] != s){
            //The slot belongs to a different key: use the public overflow area.
            //A key equal to the slot's own content is already stored, so it
            //must not be duplicated into the overflow area.
            buff.insert(s);
        }
    }

    // Returns true when s has been inserted before.
    bool find(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);
        if(!flag[ind]) return false;//Empty slot: nothing with this hash was ever stored
        if(data[ind] == s) return true;
        return buff.find(s) != buff.end();
    }

private:
    size_t cnt;          //Number of elements stored in the slots (overflow not counted)
    vector<string> data; //Slot contents
    vector<bool> flag;   //flag[i] is true when data[i] holds an element; same size as data
    set<string> buff;    //Public overflow area (ordered set)

    // Rebuilds the table with twice as many slots and re-inserts the slot
    // contents as well as the overflow area.
    void expand(){
        HashTable h(static_cast<int>(data.size()*2));
        for(size_t i=0;i<data.size();i++){
            if(flag[i]) h.insert(data[i]);
        }
        for(const string &x : buff){
            h.insert(x);
        }
        *this = std::move(h);//Take over the storage of the bigger table
    }

    // BKDRHash: converts a string to a non-negative integer. The arithmetic
    // is unsigned so that wrap-around is well defined.
    int hash_func(const string &s){
        unsigned int seed = 131,hash = 0;
        for(size_t i=0;i<s.size();i++){
            hash = hash*seed+static_cast<unsigned char>(s[i]);
        }
        return static_cast<int>(hash & 0x7fffffff);
    }

    // No probing with this strategy: the home slot is always used as is.
    void recalc_ind(int &ind,const string &s){
        (void)ind;
        (void)s;
    }
};

// Reads "op s" pairs from standard input: op 1 inserts s, op 2 prints
// whether s is present.
int main(){
    HashTable h;
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}
Using zipper method to deal with hash conflict and establish hash table
#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<memory>
#include<string>
#include<map>
#include<set>
#include<utility>
#include<vector>
using namespace std;

// Singly linked list node. A default-constructed node serves as the head
// node of one hash-table position and stores no element itself. Each node
// owns its successor, so destroying a head node releases its whole chain.
class Node{
public:
    explicit Node(string data = ""):data(std::move(data)){}
    string data;
    unique_ptr<Node> next;

    // Links node in directly after this node.
    void insert(unique_ptr<Node> node){
        node->next = std::move(next);
        next = std::move(node);
    }
};

// A set of strings stored in a hash table that resolves conflicts with the
// zipper (linked list) method: every position is the head of a chain.
class HashTable{
public:
    // n is the initial number of chains; non-positive values are clamped to 1
    // so that the modulo in insert()/find() never divides by zero.
    HashTable(int n=100):cnt(0),data(n>0?n:1){}

    // Stores s if it is not already present.
    void insert(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);
        //Walk to the node whose successor holds s, or to the last node
        Node *p = &data[ind];
        while(p->next && p->next->data!=s) p = p->next.get();
        if(p->next) return;//s is already in the chain
        p->insert(unique_ptr<Node>(new Node(s)));
        cnt += 1;
        //A filling factor of 3 means: expand once the chains hold three
        //elements on average.
        if(cnt > data.size()*3) expand();
    }

    // Returns true when s has been inserted before.
    bool find(const string &s){
        int ind = static_cast<int>(hash_func(s)%data.size());
        recalc_ind(ind,s);
        //The head node stores no data, so the search starts at its successor
        const Node *p = data[ind].next.get();
        while(p && p->data!=s) p = p->next.get();
        return p != nullptr;
    }

private:
    size_t cnt;        //Number of stored elements; the filling factor may exceed 1 here
    vector<Node> data; //Each position is the head node of a linked list

    // Rebuilds the table with twice as many chains.
    void expand(){
        HashTable h(static_cast<int>(data.size()*2));
        for(size_t i=0;i<data.size();i++){
            for(const Node *p = data[i].next.get(); p; p = p->next.get()){
                h.insert(p->data);
            }
        }
        *this = std::move(h);//The old chains are released by the move assignment
    }

    // BKDRHash: converts a string to a non-negative integer. The arithmetic
    // is unsigned so that wrap-around is well defined.
    int hash_func(const string &s){
        unsigned int seed = 131,hash = 0;
        for(size_t i=0;i<s.size();i++){
            hash = hash*seed+static_cast<unsigned char>(s[i]);
        }
        return static_cast<int>(hash & 0x7fffffff);
    }

    // No probing with this strategy: conflicts are absorbed by the chain.
    void recalc_ind(int &ind,const string &s){
        (void)ind;
        (void)s;
    }
};

// Reads "op s" pairs from standard input: op 1 inserts s, op 2 prints
// whether s is present.
int main(){
    HashTable h;
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}
In traditional hash tables, the storage space is related to the number of elements
Bloom filter, the storage space is independent of the number of elements
The traditional hash table is not feasible when a very large amount of data is involved, for example when a web crawler has to remember every visited URL in order to detect duplicates.
The bloom filter has a data storage area that holds binary marks, together with a set of hash functions. Each hash function maps an incoming element to one position of the storage area. When querying, if any of the mapped positions holds 0, the element definitely does not exist; if all of the mapped positions hold 1, the element only probably exists. The bloom filter is therefore used to judge whether an element exists, with a certain misjudgment (false positive) rate, and it is mostly used in scenarios with big data and information security requirements.
leetcode topic training
705. Design hash set
// Singly linked list node holding one key. A default-constructed node is
// used as the head node of a bucket and stores no element itself.
class Node{
public:
    Node(int key=0,Node *next = nullptr):key(key),next(next){}
    int key;
    Node *next;

    // Links node in directly after this node.
    void insert_after(Node *node){
        node->next = this->next;
        this->next = node;
    }

    // Unlinks and frees the node directly after this one, if there is one.
    void remove_after(){
        if(this->next == nullptr) return;
        Node *victim = this->next;
        this->next = victim->next;
        delete victim;
    }
};

// Integer hash set using the zipper method with 100 buckets.
class MyHashSet {
public:
    std::vector<Node> data;//Each element is the head node of one bucket

    /** Initialize your data structure here. */
    MyHashSet():data(100) {}

    // The chain nodes are allocated with new, so they must be released here.
    ~MyHashSet(){
        for(Node &head : data){
            while(head.next != nullptr) head.remove_after();
        }
    }

    // A copy would share chain nodes with the original and free them twice.
    MyHashSet(const MyHashSet&) = delete;
    MyHashSet& operator=(const MyHashSet&) = delete;

    // Clears the sign bit so that the bucket index is never negative.
    int hash_func(int key){return key & 0x7fffffff;}

    void add(int key) {
        if(contains(key)) return;//Already present: nothing to do
        const int ind = static_cast<int>(hash_func(key)%data.size());
        data[ind].insert_after(new Node(key));
    }

    void remove(int key) {
        const int ind = static_cast<int>(hash_func(key)%data.size());
        //Stop at the node in front of the one to delete (or at the last node)
        Node *p = &data[ind];
        while(p->next && p->next->key!=key) p = p->next;
        p->remove_after();//No-op when the key was not found
    }

    /** Returns true if this set contains the specified element */
    bool contains(int key) {
        const int ind = static_cast<int>(hash_func(key)%data.size());
        const Node *p = data[ind].next;
        while(p && p->key!=key) p = p->next;
        return p != nullptr;
    }
};
- Design hash mapping
// Singly linked list node holding one key/value pair. A default-constructed
// node is used as the head node of a bucket and stores no pair itself.
class Node{
public:
    Node(int key=0,int value = 0,Node *next=nullptr):key(key),value(value),next(next){}
    int key,value;
    Node *next;

    // Links node in directly after this node.
    void insert_after(Node *node){
        node->next = this->next;
        this->next = node;
    }

    // Unlinks and frees the node directly after this one, if there is one.
    void remove_after(){
        if(this->next == nullptr) return;
        Node *victim = this->next;
        this->next = victim->next;
        delete victim;
    }
};

// Integer-to-integer hash map using the zipper method with 100 buckets.
class MyHashMap {
public:
    std::vector<Node> data;//Each element is the head node of one bucket

    /** Initialize your data structure here. */
    MyHashMap():data(100) {}

    // The chain nodes are allocated with new, so they must be released here.
    ~MyHashMap(){
        for(Node &head : data){
            while(head.next != nullptr) head.remove_after();
        }
    }

    // A copy would share chain nodes with the original and free them twice.
    MyHashMap(const MyHashMap&) = delete;
    MyHashMap& operator=(const MyHashMap&) = delete;

    // Clears the sign bit so that the bucket index is never negative.
    int hash_func(int key){return key&0x7fffffff;}

    /** value will always be non-negative. */
    void put(int key, int value) {
        const int ind = static_cast<int>(hash_func(key)%data.size());
        Node *p = &data[ind];
        while(p->next && p->next->key!=key) p = p->next;
        if(p->next){
            p->next->value = value;//Key exists: overwrite its value
        }else{
            p->insert_after(new Node(key,value));//Key is new: append a node
        }
    }

    /** Returns the value to which the specified key is mapped, or -1 if this map contains no mapping for the key */
    int get(int key) {
        const int ind = static_cast<int>(hash_func(key)%data.size());
        const Node *p = data[ind].next;
        while(p && p->key!=key) p = p->next;
        return p ? p->value : -1;
    }

    /** Removes the mapping of the specified value key if this map contains a mapping for the key */
    void remove(int key) {
        const int ind = static_cast<int>(hash_func(key)%data.size());
        //Stop at the node in front of the one to delete (or at the last node)
        Node *p = &data[ind];
        while(p->next && p->next->key!=key) p = p->next;
        p->remove_after();//No-op when the key was not found
    }
};
Interview question 16.25 LRU cache
// Doubly linked list node holding one key/value pair.
class Node{
public:
    Node(int key =0,int value=0,Node *prev = nullptr,Node *next = nullptr)
    :key(key),value(value),next(next),prev(prev){}
    int key,value;
    Node *next,*prev;//Successor and predecessor in the list

    // Unlinks this node from its neighbours and returns it.
    Node *remove_this(){
        if(this->prev) this->prev->next = this->next;
        if(this->next) this->next->prev = this->prev;
        this->next = this->prev = nullptr;
        return this;
    }

    // Links node in directly in front of this node.
    void insert_prev(Node *node){
        node->next = this;
        node->prev = this->prev;
        if(this->prev) this->prev->next = node;
        this->prev = node;
    }
};

// Hash linked list: a queue ordered from least to most recently used, plus a
// hash table from key to list node so that any node can be found in O(1).
class HashList{
public:
    int capacity;//Maximum number of stored elements
    Node head,tail;//Virtual head and virtual tail simplify insertion and deletion
    std::unordered_map<int,Node *> data;//Maps a key to its list node

    HashList(int capacity):capacity(capacity){
        head.next = &tail;
        tail.prev = &head;
    }

    // Every stored node is allocated with new and registered in data.
    ~HashList(){
        for(auto &entry : data) delete entry.second;
    }

    // head and tail point at each other, so a copy would point into the
    // original object and share its nodes.
    HashList(const HashList&) = delete;
    HashList& operator=(const HashList&) = delete;

    // Inserts or updates key, marks it as most recently used and evicts the
    // least recently used element when the capacity is exceeded.
    void put(int key,int value){
        Node *node = nullptr;
        auto it = data.find(key);
        if(it != data.end()){
            //Existing key: update the value and take the node out of the list
            node = it->second;
            node->value = value;
            node->remove_this();
        }else{
            node = new Node(key,value);
            data[key] = node;
        }
        tail.insert_prev(node);//The end of the list is the most recently used position
        if(static_cast<long long>(data.size()) > capacity){
            //The node after the virtual head is the least recently used one
            Node *oldest = head.next;
            data.erase(oldest->key);
            delete oldest->remove_this();
        }
    }

    // Returns the value stored for key and marks it as most recently used,
    // or returns -1 when key is not stored.
    int get(int key){
        auto it = data.find(key);
        if(it == data.end()) return -1;
        Node *node = it->second;
        node->remove_this();
        tail.insert_prev(node);
        return node->value;
    }
};

class LRUCache {
public:
    HashList h;
    LRUCache(int capacity):h(capacity) {}

    int get(int key) {
        return h.get(key);
    }

    void put(int key, int value) {
        h.put(key,value);
    }
};
- Encryption and decryption of TinyURL
// TinyURL codec: maps a randomly generated 5-character code to the long URL.
class Solution {
public:
    Solution(){std::srand(static_cast<unsigned>(std::time(nullptr)));}

    // Maps x (taken modulo 62) to a character:
    // 0-25 = 'a'-'z'; 26-51 = 'A'-'Z'; 52-61 = '0'-'9'.
    char ch(int x){
        x %= 62;
        if(x < 0) x += 62;//Keep the index in range for negative input
        if(x<26) return static_cast<char>(x+'a');
        if(x<52) return static_cast<char>(x-26+'A');
        return static_cast<char>(x-52+'0');
    }

    // Returns a random string of length n built from ch().
    std::string rand_string(int n){
        std::string s;
        for(int i=0;i<n;i++){
            s += ch(std::rand());
        }
        return s;
    }

    std::unordered_map<std::string,std::string> h;//Short URL -> long URL

    // Encodes a URL to a shortened URL.
    std::string encode(std::string longUrl) {
        std::string s;
        do{
            s = rand_string(5);
        }while(h.find(s)!=h.end());//Retry while the short URL is already taken
        h[s] = std::move(longUrl);
        return s;
    }

    // Decodes a shortened URL to its original URL. An unknown short URL
    // yields an empty string and is not recorded in the table.
    std::string decode(std::string shortUrl) {
        auto it = h.find(shortUrl);
        if(it == h.end()) return std::string();
        return it->second;
    }
};
- Repetitive DNA sequence
class Solution {
public:
    // Returns every 10-character substring that occurs more than once in s.
    // Each such substring is reported once, at the moment its second
    // occurrence is seen.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr std::size_t kLen = 10;
        std::vector<std::string> ret;
        //Fewer than two windows fit: nothing can repeat. This also keeps the
        //loop bound below free of unsigned underflow.
        if(s.size() <= kLen) return ret;
        std::unordered_map<std::string,int> seen;//Substring -> number of occurrences so far
        for(std::size_t i=0; i+kLen<=s.size(); i++){
            std::string window = s.substr(i,kLen);
            if(++seen[window] == 2){
                ret.push_back(std::move(window));
            }
        }
        return ret;
    }
};
- Maximum word length product
class Solution {
public:
    // Returns the largest product of the lengths of two words that share no
    // letter, or 0 when no such pair exists. Each word is summarised as a
    // bit mask with one bit per letter, counted from 'a'.
    int maxProduct(std::vector<std::string>& words) {
        std::vector<int> masks;
        masks.reserve(words.size());
        for(const std::string &word : words){
            int bits = 0;
            for(char letter : word){
                bits |= (1<<(letter-'a'));
            }
            masks.push_back(bits);
        }

        int best = 0;
        for(std::size_t a=0; a<words.size(); ++a){
            for(std::size_t b=a+1; b<words.size(); ++b){
                //Two words share a letter exactly when their masks overlap
                if((masks[a]&masks[b]) == 0){
                    best = std::max(best,int(words[a].size()*words[b].size()));
                }
            }
        }
        return best;
    }
};
- Search two-dimensional matrix II (two positions: lower left and upper right are the boundaries)
class Solution {
public:
    // Searches a matrix whose rows and columns are both sorted in ascending
    // order. The scan starts at the upper right corner: a value smaller than
    // target rules out the rest of its row, a larger one rules out the rest
    // of its column. Returns false for a matrix with no rows or no columns.
    bool searchMatrix(std::vector<std::vector<int>>& matrix, int target) {
        if(matrix.empty() || matrix[0].empty()) return false;
        std::size_t row = 0;
        std::size_t colEnd = matrix[0].size();//One past the current column
        while(row<matrix.size() && colEnd>0){
            const int value = matrix[row][colEnd-1];
            if(value == target) return true;
            if(value < target) row += 1;
            else colEnd -= 1;
        }
        return false;
    }
};
- Allocate coins in a binary tree
class Solution { public: //Total cost of coin movement int getResult(TreeNode *root,int &n,int &m){//Number of nodes n, number of coins m n = m =0; if(root == nullptr) return 0; n = 1,m = root->val; int ans=0,n1,m1; ans += getResult(root->left,n1,m1); ans += abs(n1-m1); n += n1,m += m1; ans += getResult(root->right,n1,m1); ans += abs(n1-m1); n += n1,m += m1; return ans; } int distributeCoins(TreeNode* root) { int n,m; return getResult(root,n,m); } };
- Flat multilevel bidirectional linked list
class Solution {
public:
    // Flattens a multilevel doubly linked list in place and returns head.
    // Whenever a node has a child list, that list is flattened recursively
    // and spliced in between the node and its former successor.
    Node* flatten(Node* head) {
        for(Node *cur = head; cur != nullptr; cur = cur->next){
            if(cur->child == nullptr) continue;
            Node *after = cur->next;           //Former successor of cur
            Node *sub = flatten(cur->child);   //First node of the flattened child list
            cur->child = nullptr;              //The child pointer must be cleared after flattening
            cur->next = sub;
            sub->prev = cur;
            //Walk to the last node of the spliced-in list
            while(cur->next != nullptr) cur = cur->next;
            cur->next = after;
            if(after != nullptr) after->prev = cur;
        }
        return head;
    }
};
- All nodes with distance K in binary tree
/** * Definition for a binary tree node. * struct TreeNode { * int val; * TreeNode *left; * TreeNode *right; * TreeNode(int x) : val(x), left(NULL), right(NULL) {} * }; */ class Solution { public: void dfs(TreeNode *root,int c,int k,vector<int> &ret){//c is the current layer number, k and the target layer number if(k<0) return ; if(root == nullptr) return ; if(c == k) {//When the target layer is found, the node value corresponding to the layer is transferred to the ret array ret.push_back(root->val); return ; } dfs(root->left,c+1,k,ret); dfs(root->right,c+1,k,ret); return ; } TreeNode *getResult(TreeNode *root ,TreeNode *target,int &k,vector<int> &ret){//Node with distance k if(root == nullptr) return nullptr; if(root== target){ dfs(root,0,k,ret);//The layer where root is located is layer 0 return root; } if(getResult(root->left,target,k,ret)){ k-=1; if(k==0) ret.push_back(root->val); dfs(root->right,0,k-1,ret);//Because the current node is in the left subtree, we need to deal with the number of nodes in the right subtree return target; }else if(getResult(root->right,target,k,ret)){ k-=1;//In the process of backtracking, k needs to be reduced by 1 first if(k==0) ret.push_back(root->val); dfs(root->left,0,k-1,ret); return target; } return nullptr; } vector<int> distanceK(TreeNode* root, TreeNode* target, int k) { vector<int> ret; getResult(root,target,k,ret); return ret; } };