Hash table and bloom filter

Posted by Serpent7 on Sun, 30 Jan 2022 03:22:00 +0100

Hash table and bloom filter

Hash operation: a mapping from a high-dimensional space to a low-dimensional space (the mapping rule is chosen by the designer). In a hash table, the low-dimensional value is the array index.
Hash collisions cannot be avoided, so the emphasis is on how to handle them.
Collision-handling methods for hash tables:
(1) Open addressing. (Linear probing: if the computed index is already occupied, try the next index. Quadratic probing, with offsets +1^2, +2^2, +3^2, ..., is used more often.)
(2) Rehashing. (Additional hash functions have to be provided, and the method is used together with other collision-handling methods.)
(3) Public overflow area. (Colliding elements are placed in a shared overflow buffer, which can be stored in a red-black tree, O(log n), or in an array, O(n).)
(4) Chaining (linked-list or "zipper") method. (Each slot of the hash table is treated as the head node of a linked list.)

Create hash table (method for handling conflicts: open address method)

#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<string>
#include<map>
#include<set>
#include<vector>
using namespace std;

// String hash set that resolves collisions with open addressing.
//
// Keys live directly in `data`; `flag[i]` tells whether slot i is occupied.
// Elements are never removed, so a probe path that once led to a key keeps
// leading to it until the next expand(), which rebuilds the whole table.
class HashTable{
    public:
        // Creates a table with n slots. A non-positive n is raised to 1 so
        // that the modulo in insert()/find() never divides by zero.
        HashTable(int n=100):cnt(0),data(n>0?n:1),flag(n>0?n:1){}

        // Stores s in the table; inserting a key that is already present is a no-op.
        void insert(const std::string &s){
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);//Conflict handling
            if(flag[ind]) return;//recalc_ind stopped on s itself: already stored
            data[ind] = s;
            flag[ind] = true;
            cnt++;
            //Grow once the load factor exceeds 0.75
            if(static_cast<std::size_t>(cnt)*100>data.size()*75){
                expand();
            }
        }

        // Returns true when s has been inserted before.
        bool find(const std::string &s) const{
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);
            //The probe ends either on an empty slot (absent) or on s (present)
            return flag[ind];
        }

    private:
        int cnt;//Number of occupied slots
        std::vector<std::string> data;
        std::vector<bool> flag;//flag[i] is true when data[i] holds a key; same size as data

        // Doubles the number of slots and re-inserts every stored key.
        void expand(){
            HashTable bigger(static_cast<int>(data.size()*2));
            for(std::size_t i=0;i<data.size();i++){
                if(flag[i]) bigger.insert(data[i]);
            }
            //Take over the rebuilt storage instead of copying it
            cnt = bigger.cnt;
            data.swap(bigger.data);
            flag.swap(bigger.flag);
        }

        // BKDRHash: maps a string to a non-negative int.
        int hash_func(const std::string &s) const{
            const unsigned int seed = 131;
            unsigned int hash = 0;//unsigned arithmetic wraps around instead of overflowing
            for(unsigned char c : s){
                hash = hash*seed+c;
            }
            return static_cast<int>(hash & 0x7fffffff);
        }

        // Moves ind along the probe sequence until it names either an empty
        // slot or the slot that already holds s.
        void recalc_ind(int &ind,const std::string &s) const{
            const unsigned long long n = data.size();
            unsigned long long pos = static_cast<unsigned long long>(ind);
            unsigned long long t = 1;//Number of the current probe
            while(flag[pos] && data[pos]!=s){
                if(t<=n){
                    pos = (pos + t*t) % n;//Quadratic step
                }else{
                    //The quadratic steps may keep revisiting the same slots.
                    //insert() keeps at least one slot free, so stepping by one
                    //from here on is certain to reach a free slot.
                    pos = (pos + 1) % n;
                }
                t+=1;
            }
            ind = static_cast<int>(pos);
        }
    };

int main(){
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}

Load factor:
Load factor = number of stored elements / total capacity of the hash table. When it exceeds 0.75 the table has to grow; insert() calls expand() to do this, and the expansion takes O(n) time.

Handle conflicts by creating a public buffer. Create a hash table:

#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<string>
#include<map>
#include<set>
#include<vector>
using namespace std;

// String hash set that resolves collisions with a public overflow area.
//
// Each key is tried at its hashed slot only. When that slot already holds a
// different key, the new key goes into `buff`, a red-black-tree set shared by
// the whole table.
class HashTable{
    public:
        // Creates a table with n slots. A non-positive n is raised to 1 so
        // that the modulo in insert()/find() never divides by zero.
        HashTable(int n=100):cnt(0),data(n>0?n:1),flag(n>0?n:1){}

        // Stores s; inserting a key that is already present is a no-op.
        void insert(const std::string &s){
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);//Conflict handling (none in this variant)
            if(flag[ind] == false){
                data[ind] = s;
                flag[ind]= true;
                cnt++;
                //Grow once more than 75% of the slots are occupied
                if(static_cast<std::size_t>(cnt)*100>data.size()*75){
                    expand();
                }
            }else if(data[ind] != s){
                //The slot belongs to another key: keep s in the overflow area.
                //A key that already sits in its own slot is not duplicated there.
                buff.insert(s);
            }
        }

        // Returns true when s has been inserted before.
        bool find(const std::string &s) const{
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);
            if(flag[ind]==false) return false;//Nothing with this slot was ever stored
            if(data[ind] == s) return true;
            return buff.find(s) != buff.end();
        }

    private:
        int cnt;//Number of occupied slots (overflow entries are not counted)
        std::vector<std::string> data;
        std::vector<bool> flag;//flag[i] is true when data[i] holds a key; same size as data
        std::set<std::string> buff;//Public overflow area (red-black tree)

        // Doubles the number of slots and re-inserts every key, including the
        // ones that currently live in the overflow area.
        void expand(){
            HashTable bigger(static_cast<int>(data.size()*2));
            for(std::size_t i=0;i<data.size();i++){
                if(flag[i]) bigger.insert(data[i]);
            }
            for(const std::string &x : buff){
                bigger.insert(x);
            }
            //Take over the rebuilt storage instead of copying it
            cnt = bigger.cnt;
            data.swap(bigger.data);
            flag.swap(bigger.flag);
            buff.swap(bigger.buff);
        }

        // BKDRHash: maps a string to a non-negative int.
        int hash_func(const std::string &s) const{
            const unsigned int seed = 131;
            unsigned int hash = 0;//unsigned arithmetic wraps around instead of overflowing
            for(unsigned char c : s){
                hash = hash*seed+c;
            }
            return static_cast<int>(hash & 0x7fffffff);
        }

        // No probing in this variant: the hashed slot is used as it is.
        void recalc_ind(int &,const std::string &) const{
            return ;
        }
    };

int main(){
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}

Using the chaining (zipper) method to handle hash collisions and build a hash table

#include<iostream>
#include<cstdio>
#include<queue>
#include<stack>
#include<algorithm>
#include<string>
#include<map>
#include<set>
#include<vector>
using namespace std;

// Node of a singly linked bucket chain; the bucket head is a Node whose
// `data` is unused.
class Node{
    public:
        Node(const std::string &data = "",Node *next = nullptr):data(data),next(next){}
        std::string data;
        Node *next;
        // Links `node` in directly behind this node.
        void insert(Node *node){
            node->next = this->next;
            this->next = node;
            return ;
        }
};

// String hash set that resolves collisions by chaining: every slot of `data`
// is the head node of a linked list holding the keys that hash to that slot.
// The table owns the chained nodes and frees them in its destructor, so
// copying is disabled.
class HashTable{
    public:
        // Creates a table with n buckets. A non-positive n is raised to 1 so
        // that the modulo in insert()/find() never divides by zero.
        HashTable(int n=100):cnt(0),data(n>0?n:1){}
        HashTable(const HashTable &) = delete;
        HashTable &operator=(const HashTable &) = delete;
        ~HashTable(){
            for(Node &head : data){
                Node *p = head.next;
                while(p){
                    Node *following = p->next;
                    delete p;
                    p = following;
                }
                head.next = nullptr;
            }
        }

        // Stores s; inserting a key that is already present is a no-op.
        void insert(const std::string &s){
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);//Conflict handling (none in this variant)
            //Walk the chain until its tail or the node in front of s
            Node *p =&data[ind];
            while(p->next && p->next->data!=s)p=p->next;
            if(p->next == nullptr){
                //s is not in the chain yet: append it behind the tail node
                p->insert(new Node(s));
                cnt+=1;
                //Grow once there are more than three keys per bucket on average
                if(static_cast<std::size_t>(cnt) >data.size()*3)
                    expand();
            }
            return ;
        }

        // Returns true when s has been inserted before.
        bool find(const std::string &s) const{
            int ind = static_cast<int>(hash_func(s)%data.size());
            recalc_ind(ind,s);
            //The head node stores no key, so the search starts behind it
            const Node *p = data[ind].next;
            while(p&&p->data!=s) p=p->next;
            return p!=nullptr;//Non-null means the loop stopped on s
        }

    private:
        int cnt;//Number of stored keys; may exceed the number of buckets
        std::vector<Node> data;//Each slot is the head node of one chain

        // Doubles the number of buckets and moves every existing node into
        // the bucket it hashes to in the larger table. Nodes are relinked,
        // not reallocated, so nothing is leaked or copied.
        void expand(){
            std::vector<Node> bigger(data.size()*2);
            for(Node &head : data){
                Node *p = head.next;
                while(p){
                    Node *following = p->next;//insert() overwrites p->next
                    bigger[hash_func(p->data)%bigger.size()].insert(p);
                    p = following;
                }
                head.next = nullptr;
            }
            data.swap(bigger);
            return ;
        }

        // BKDRHash: maps a string to a non-negative int.
        int hash_func(const std::string &s) const{
            const unsigned int seed = 131;
            unsigned int hash = 0;//unsigned arithmetic wraps around instead of overflowing
            for(unsigned char c : s){
                hash = hash*seed+c;
            }
            return static_cast<int>(hash & 0x7fffffff);
        }

        // No probing in this variant: the hashed slot is used as it is.
        void recalc_ind(int &,const std::string &) const{
            return ;
        }
    };

int main(){
    int op;
    string s;
    while(cin>>op>>s){
        switch(op){
            case 1:h.insert(s);break;
            case 2:cout<<"find"<<s<<":"<<h.find(s)<<endl;break;
        }
    }
    return 0;
}

In a traditional hash table, the storage space grows with the number of elements.
In a Bloom filter, the storage space is independent of the number of elements.
A traditional hash table is not feasible when a very large amount of data is involved, for example when a web crawler has to deduplicate the URLs it has already visited.
A Bloom filter has a storage area of binary flags and a set of hash functions. Every hash function maps an incoming element to one bit position. If any of those bits is 0, the element definitely does not exist; if all of them are 1, the element only probably exists. A Bloom filter is therefore used to test whether an element exists, with a certain false-positive rate, and is mostly used in scenarios with very large data volumes or information-security requirements.

leetcode topic training
705. Design hash set

// Node of a singly linked bucket chain that stores one integer key.
class Node{
    public:
        Node(int key=0,Node *next = nullptr):key(key),next(next){}
        int key;
        Node *next;

        // Links `node` in directly behind this node.
        void insert_after(Node *node){
            node->next = next;
            next = node;
        }

        // Unlinks and frees the node directly behind this one.
        // Does nothing when this node is the tail.
        void remove_after(){
            Node *victim = next;
            if(victim != nullptr){
                next = victim->next;
                delete victim;
            }
        }
};

class MyHashSet {
public:
//Zipper method

    /** Initialize your data structure here. */
    vector<Node> data;

    MyHashSet():data(100) {}
    int hash_func(int key){return key & 0x7fffffff;}
    void add(int key) {
        if(contains(key))return ;//If the current element exists, return it directly
        int ind = hash_func(key)%data.size();//Otherwise, calculate the hash value corresponding to the key through the hash function, and then take the remainder of the array size
        data[ind].insert_after(new Node(key));//The process of inserting elements into a hash table
        return ;
    }
    void remove(int key) {
        int ind = hash_func(key) % data.size();//Find the subscript of the removed element
        Node *p = &data[ind];
        while(p->next &&p->next->key!=key) p=p->next;
        //At this time, the previous node of the node to be deleted is found
        p->remove_after();
        return ;
    }
    
    /** Returns true if this set contains the specified element */
    bool contains(int key) {
        int ind = hash_func(key)%data.size();
        Node *p = data[ind].next;
        while(p&&p->key!=key) p=p->next;
        return p!=nullptr;
    }
};

Design hash mapping

 // Node of a singly linked bucket chain that stores one key/value pair.
 class Node{
    public:
        Node(int key=0,int value = 0,Node *next=nullptr):value(value),key(key),next(next){}
        int key,value;
        Node *next;

        // Links `node` in directly behind this node.
        void insert_after(Node *node){
            node->next = next;
            next = node;
        }

        // Unlinks and frees the node directly behind this one.
        // Does nothing when this node is the tail.
        void remove_after(){
            Node *victim = next;
            if(victim != nullptr){
                next = victim->next;
                delete victim;
            }
        }
};
class MyHashMap {
public:
//Designing hash function with zipper method
vector<Node>data;
    /** Initialize your data structure here. */
    MyHashMap():data(100) {}
    int hash_func(int key){return key&0x7fffffff;}
    /** value will always be non-negative. */
    void put(int key, int value) {
        int ind = hash_func(key)%data.size();
        Node *p = &data[ind];
        while(p->next && p->next->key!=key) p=p->next;
        //Currently, there are two cases where the key value of P - > next is empty / P - > next
        if(p->next){//1. If P - > next is not empty
            p->next->value = value;
        }else{//2. If P's next is empty, add a new node storage key value pair after the current node
            p->insert_after(new Node(key,value));
        }
        return ;
    }
    
    /** Returns the value to which the specified key is mapped, or -1 if this map contains no mapping for the key */
    int get(int key) {
        int ind = hash_func(key)%data.size();
        Node *p = data[ind].next;
        while(p && p->key !=key) p=p->next;
        if(p == nullptr) return -1;
        return p->value;
    }
    
    /** Removes the mapping of the specified value key if this map contains a mapping for the key */
    void remove(int key) {
        int ind = hash_func(key)%data.size();
        Node *p = &data[ind];
        while(p->next&&p->next->key!=key) p=p->next;
        p->remove_after();
        return ;
    }
};

Interview question 16.25 LRU cache

// Node of a doubly linked list that stores one key/value pair.
class Node{
    public:
        Node(int key =0,int value=0,Node *prev = nullptr,Node *next = nullptr)
        :key(key),value(value),prev(prev),next(next){}
        int key,value;
        Node *next,*prev;//Successor and predecessor in the list

        // Detaches this node: its neighbours are linked to each other and both
        // of its own pointers are reset. Returns this node.
        Node *remove_this(){
            if(prev != nullptr) prev->next = next;
            if(next != nullptr) next->prev = prev;
            next = nullptr;
            prev = nullptr;
            return this;
        }

        // Links `node` in directly in front of this node.
        void insert_prev(Node *node){
            Node *before = prev;
            node->next = this;
            node->prev = before;
            if(before != nullptr) before->next = node;
            prev = node;
        }
};
class HashList{//Simulate a queue using hash linked list = > know the head and tail of the queue
    public:
        int capacity;//Maximum number of storage elements
        Node head,tail;//Virtual head and virtual tail are easy to insert and delete
        unordered_map<int,Node *>data;//Hash table implementation unordered_map (mapping from int type to linked list node address)
        HashList(int capacity):capacity(capacity){
            head.next =&tail;
            tail.prev =&head;
        }
        void put(int key,int value){//Function: insert a new node
            if(data.find(key)!=data.end()){//Note: if the find() function is not found, it returns the value of end
                //Note that the newly inserted value already exists. First, find the current node, and then take the existing value from the list to the end of the list
                data[key]->value = value;//Modify the value first
                data[key]->remove_this();//Then move the node out of the whole linked list
            }else{
                data[key] = new Node(key,value);
            }
            tail.insert_prev(data[key]);//Finally, put the node at the end of the whole linked list
            if(data.size()>capacity){//Detects whether the storage space of the hash table has been exceeded
                data.erase(data.find(head.next->key));//If the capacity is exceeded, the next node of the head will be deleted (deleted in the hash table)
                delete head.next->remove_this();//Then delete in the linked list and recycle the space of the current node through delete
            }
            return ;
        }
        int get(int key){//Get node value from hash linked list
            if(data.find(key) == data.end())return -1;
            data[key]->remove_this();
            tail.insert_prev(data[key]);
            return data[key]->value;
        }
        
};
// LRU cache facade: every operation is forwarded to the HashList member.
class LRUCache {
public:
    HashList h;//Holds the entries and their recency order

    LRUCache(int capacity):h(capacity) {}

    // Forwards to HashList::get and returns its result.
    int get(int key) {
        const int found = h.get(key);
        return found;
    }

    // Forwards to HashList::put.
    void put(int key, int value) {
        h.put(key,value);
        return ;
    }
};

Encryption and decryption of TinyURL

// TinyURL codec: every long URL gets a random 5-character code, and the
// code -> URL mapping is kept in a hash map.
class Solution {
public:
    Solution(){srand(time(0));}

    // Maps x to one of 62 characters: 0-25 -> 'a'-'z', 26-51 -> 'A'-'Z',
    // 52-61 -> '0'-'9'. Any other x is first reduced into 0-61.
    char ch(int x){
        x %= 62;
        if(x<0) x += 62;//% keeps the sign of a negative operand
        if(x<26) return static_cast<char>(x+'a');
        if(x<52) return static_cast<char>(x-26+'A');
        return static_cast<char>(x-52+'0');
    }

    // Returns a random string of length n built from ch().
    std::string rand_string(int n){
        std::string s="";
        for(int i=0;i<n;i++){
            s+=ch(rand());
        }
        return s;
    }

    std::unordered_map<std::string,std::string> h;//short code -> long URL

    // Encodes a URL to a shortened URL.
    std::string encode(std::string longUrl) {
        std::string s;
        do{
            s=rand_string(5);
        }while(h.find(s)!=h.end());//Draw again while the code is already taken
        h[s] = longUrl;
        return s;
    }

    // Decodes a shortened URL to its original URL.
    // Returns an empty string for a code that was never issued.
    std::string decode(std::string shortUrl) {
        //find() instead of operator[]: looking up an unknown code must not
        //add an empty entry to the map
        auto it = h.find(shortUrl);
        if(it == h.end()) return "";
        return it->second;
    }
};

Repetitive DNA sequence

class Solution {
public:
    // Returns every 10-letter substring that occurs more than once in s.
    // The order of the result follows the iteration order of the hash map.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::size_t kLen = 10;
        std::unordered_map<std::string,int> h;//substring -> number of occurrences
        //"i+kLen<=size" avoids the unsigned underflow that "size-9" produces
        //for strings shorter than nine characters
        for(std::size_t i=0;i+kLen<=s.size();i++){
            h[s.substr(i,kLen)] +=1 ;
        }
        std::vector<std::string> ret;
        //Collect the keys that were counted more than once
        for(const auto &x:h){
            if(x.second > 1) ret.push_back(x.first);
        }
        return ret;
    }
};

Maximum word length product

class Solution {
public:
    // Returns the largest product of the lengths of two words that share no
    // letter, or 0 when no such pair exists.
    int maxProduct(std::vector<std::string>& words) {
        //letters[i] has bit (c-'a') set for every character c of words[i]
        std::vector<int> letters;
        letters.reserve(words.size());
        for(const std::string &word : words){
            int bits = 0;
            for(char c : word){
                bits |= (1<<(c-'a'));
            }
            letters.push_back(bits);
        }
        int best = 0;
        for(std::size_t a=0;a<words.size();++a){
            for(std::size_t b=a+1;b<words.size();++b){
                if((letters[a]&letters[b]) == 0){
                    const int product = int(words[a].size() *words[b].size());
                    if(product > best) best = product;
                }
            }
        }
        return best;
    }
};

Search two-dimensional matrix II (two positions: lower left and upper right are the boundaries)

class Solution {
public:
    // Returns true when target occurs in matrix. The walk starts in the
    // upper-right corner and steps down on smaller values and left on larger
    // ones, which presumes rows and columns sorted in ascending order.
    bool searchMatrix(std::vector<std::vector<int>>& matrix, int target) {
        //Nothing to search; this also keeps matrix[0] from being read on an empty matrix
        if(matrix.empty() || matrix[0].empty()) return false;
        std::size_t i=0;
        int j=static_cast<int>(matrix[0].size())-1;
        while(i<matrix.size() && j>=0){
            if(matrix[i][j] == target) return true;
            if(matrix[i][j] < target)i+=1;//Everything left of here in this row is smaller still
            else j-=1;//Everything below here in this column is larger still
        }
        return false;
    }
};

Allocate coins in a binary tree

class Solution {
public:
    //Total cost of coin movement
    int getResult(TreeNode *root,int &n,int &m){//Number of nodes n, number of coins m
        n = m =0;        
        if(root == nullptr) return 0;
        n = 1,m = root->val;
        int ans=0,n1,m1;
        ans += getResult(root->left,n1,m1);
        ans += abs(n1-m1);
        n += n1,m += m1;
        ans += getResult(root->right,n1,m1);
        ans += abs(n1-m1);
        n += n1,m += m1;
        return ans; 
    }
    int distributeCoins(TreeNode* root) {
        int n,m;
        return getResult(root,n,m);
    }
};

Flatten a multilevel doubly linked list

class Solution {
public:
    // Flattens the list in place: every child list is spliced in between its
    // parent node and the parent's former successor. Returns head.
    Node* flatten(Node* head) {
        for(Node *cur = head;cur != nullptr;cur = cur->next){
            if(cur->child == nullptr) continue;
            Node *rest = cur->next;//Remainder of the current level
            Node *sub = flatten(cur->child);//First node of the flattened child list
            cur->child = nullptr;//The child link must be cleared after flattening
            cur->next = sub;
            sub->prev = cur;
            //Walk to the last node of the spliced-in child list
            while(cur->next != nullptr) cur = cur->next;
            cur->next = rest;
            if(rest != nullptr) rest->prev = cur;
        }
        return head;
    }
};

All nodes with distance K in binary tree

/**
 * Definition for a binary tree node.
 * struct TreeNode {
 *     int val;
 *     TreeNode *left;
 *     TreeNode *right;
 *     TreeNode(int x) : val(x), left(NULL), right(NULL) {}
 * };
 */
class Solution {
public:
    // Collects into ret the values of all nodes that lie exactly k levels
    // below the level numbered c, where root is on level c.
    void dfs(TreeNode *root,int c,int k,std::vector<int> &ret){
        if(k<0 || root == nullptr) return ;
        if(c == k) {
            //Target level reached: record the value and do not descend further
            ret.push_back(root->val);
            return ;
        }
        dfs(root->left,c+1,k,ret);
        dfs(root->right,c+1,k,ret);
    }

    // Searches for target below root. Returns a non-null pointer when target
    // is inside this subtree and nullptr otherwise. k is shared by reference:
    // it is decremented once per ancestor on the way back up, so at each
    // ancestor it holds the distance still to be covered from there.
    TreeNode *getResult(TreeNode *root ,TreeNode *target,int &k,std::vector<int> &ret){
        if(root == nullptr) return nullptr;
        if(root== target){
            dfs(root,0,k,ret);//The target itself is level 0
            return root;
        }
        //Find the side that contains the target; the other side is searched below
        TreeNode *other_side = nullptr;
        if(getResult(root->left,target,k,ret)){
            other_side = root->right;
        }else if(getResult(root->right,target,k,ret)){
            other_side = root->left;
        }else{
            return nullptr;
        }
        k-=1;//One step further away from the target
        if(k==0) ret.push_back(root->val);
        dfs(other_side,0,k-1,ret);//One more step is spent entering the other side
        return target;
    }

    // Returns the values of all nodes at distance k from target.
    std::vector<int> distanceK(TreeNode* root, TreeNode* target, int k) {
        std::vector<int> ret;
        getResult(root,target,k,ret);
        return ret;
    }
};

Programmer Think

Hash table and bloom filter

Hash table and bloom filter

Hot Topics