String matching: the Rabin-Karp algorithm

Posted by defect on Thu, 10 Feb 2022 06:11:37 +0100

Title:

Judge whether the string s contains the string p.

The input takes up two lines: String s and string p.

The output contains the starting index of string p in string s. If there are multiple matches, all of them are output, one per line.

Case:

Input:

ABABABA
ABA

Output:

0
2
4

A first look at the idea behind the Rabin-Karp algorithm:

First compute the hash value of the string p. Then traverse s and, for every contiguous substring of s that has the same length as p (there are about n of them), compute its hash value (which costs O(m) each) and compare it with the hash value of p. If the two are equal, the match succeeds and the starting index is output. The time complexity of this idea is O(n*m) (n and m are the lengths of string s and string p respectively), and the code is as follows:

#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
void match(char *p,char *s);
ll hash(char *s);
const int seed = 31;
const int eps = 1e7+10;
char s[1005];
char p[1005];
// Entry point: reads the text s and the pattern p (one line each) from
// standard input and lets match() print every index at which p occurs in s.
// If either line is missing there is nothing to match and nothing is printed.
int main() {
	// fgets never writes past the end of the buffer; gets cannot be bounded
	// at all and was removed from the language in C++14.
	if (fgets(s, sizeof(s), stdin) == NULL) return 0;
	if (fgets(p, sizeof(p), stdin) == NULL) return 0;
	// Unlike gets, fgets keeps the trailing newline, so cut it off here.
	s[strcspn(s, "\n")] = '\0';
	p[strcspn(p, "\n")] = '\0';
	match(p,s);
	return 0;
}

void match(char *p,char *s) {
	ll p_hash = hash(p);
	int p_len = strlen(p);
	int s_len = strlen(s);
	//Traverses a continuous string in string s with a length equal to string p 
	for(int i=0;i+p_len<=s_len;i++) { //Pay attention to the boundary
		char t[p_len];
		int k = 0;
		for(int j=i;j<i+p_len;j++,k++)//Take out the string to be verified 
			t[k] = s[j];
		t[k] = '\0';
		ll i_hash = hash(t);
		if(i_hash==p_hash) {
			printf("%d\n",i);//Output when hash values are equal 
		}
	}
}

// Polynomial hash of the NUL-terminated string s: the characters are read as
// digits of a base-`seed` number and the value is reduced modulo `eps`.
// The result is always in [0, eps).
//
// The running value is reduced after every character. Reducing only once at
// the end lets the accumulator overflow long long (undefined behaviour) as
// soon as the string is longer than about a dozen characters.
ll hash(char *s) {
	ll s_hash = 0;
	for (const char *c = s; *c != '\0'; ++c) {
		s_hash = (s_hash*seed + static_cast<int>(*c)) % eps;
	}
	// `%` keeps the sign of the dividend, so a negative char value can leave
	// a negative remainder; shift it back into [0, eps).
	if (s_hash < 0) s_hash += eps;
	return s_hash;
}

It is not difficult to see that this idea has the same time complexity as direct brute-force matching, so we consider further optimization. The improvement is a kind of preprocessing: build a hash array that stores the hash values of all substrings of s whose length equals that of p, so that the matching itself takes linear time. The way the hash array is built also needs to change: each hash is derived from the previous one by adding the contribution of the next character and removing the contribution of the first character of the previous substring. Note that this adding and removing is not plain addition and subtraction. It works like a positional number system: the previous hash is multiplied by the base before the new character is added, and the character being removed must first be multiplied by its "weight" (this step is the key). The base is a predefined "seed", and the weight is the seed raised to the m-th power (m is the length of string p), i.e. $H_{i+1} = H_i \cdot seed + s[i+m] - s[i] \cdot seed^{m}$, taken modulo eps. This method is vividly called "rolling hash". The code is as follows:

#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
void match(char *p,char *s);
void match(char *p,char *s,int n,int s_len);
ll hash(char *s);
const int seed = 31;
const int eps = 1e5+7;
char s[1005];
char p[1005];
ll res[1005];
int main() {
	gets(s);
	gets(p);
	int p_len = strlen(p);
	int s_len = strlen(s);
	ll p_hash = hash(p);
	match(p,s,p_len,s_len);
	int n = s_len-p_len+1;
	for(int i=0;i<n;i++)
		if(res[i]==p_hash) printf("%lld\n",i);
	return 0;
} 

void match(char *p,char *s,int n,int s_len) {
	char t[n];
	int i;
	for(i=0;i<n;i++)
		t[i] = s[i];
	t[i] = '\0';
	res[0] = hash(t);//First calculate the first substring with the same size as the string p 
	res[0] %= eps;
	for(int i=n;i<s_len;i++) {
		res[i-n+1] = res[i-n]*seed + (int)s[i] - (int)s[i-n]*pow(seed,n);//The hash array is constructed by adding the hash value of the next element and deleting the hash value of the first element of the substring 
		res[i-n+1] %= eps;
	}
}

// Polynomial hash of the NUL-terminated string s: the characters are read as
// digits of a base-`seed` number and the value is reduced modulo `eps`.
// The result is always in [0, eps).
//
// The running value is reduced after every character. Without a reduction
// the accumulator overflows long long (undefined behaviour) as soon as the
// string is longer than about a dozen characters, and the rolling hashes this
// value is compared with are reduced modulo eps as well.
ll hash(char *s) {
	ll s_hash = 0;
	for (const char *c = s; *c != '\0'; ++c) {
		s_hash = (s_hash*seed + static_cast<int>(*c)) % eps;
	}
	// `%` keeps the sign of the dividend, so a negative char value can leave
	// a negative remainder; shift it back into [0, eps).
	if (s_hash < 0) s_hash += eps;
	return s_hash;
}

The time complexity is O(n), and the algorithm ends here.

Next is a problem I encountered during debugging. Record it here.

In the optimized code above, the hash array res is a global variable. While writing it, however, I first defined res as a local variable and gave the match function a return type of ll*, trying to return the res array. The storage of an array that is local to match is released when match returns, so the returned pointer no longer refers to valid data and the result is naturally wrong. (In fact, I ran into the same error a long time ago, and now I have fallen into the same pit again... because I took it for granted that working with an array always operates directly on what is stored at its address.) There are three solutions. The first is to define res as a global variable (commonly used in competitions, because the length can be fixed in advance).

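The second is dynamic memory allocation. (Note: the listing below is identical to the global-array version above; a heap-based version would allocate res inside match and return the pointer.) The code is as follows: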

#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
void match(char *p,char *s);
void match(char *p,char *s,int n,int s_len);
ll hash(char *s);
const int seed = 31;
const int eps = 1e5+7;
char s[1005];
char p[1005];
ll res[1005];
int main() {
	gets(s);
	gets(p);
	int p_len = strlen(p);
	int s_len = strlen(s);
	ll p_hash = hash(p);
	match(p,s,p_len,s_len);
	int n = s_len-p_len+1;
	for(int i=0;i<n;i++)
		if(res[i]==p_hash) printf("%lld\n",i);
	return 0;
} 

void match(char *p,char *s,int n,int s_len) {
	char t[n];
	int i;
	for(i=0;i<n;i++)
		t[i] = s[i];
	t[i] = '\0';
	res[0] = hash(t);//First calculate the first substring with the same size as the string p 
	res[0] %= eps;
	for(int i=n;i<s_len;i++) {
		res[i-n+1] = res[i-n]*seed + (int)s[i] - (int)s[i-n]*pow(seed,n);//The hash array is constructed by adding the hash value of the next element and deleting the hash value of the first element of the substring 
		res[i-n+1] %= eps;
	}
}

// Polynomial hash of the NUL-terminated string s: the characters are read as
// digits of a base-`seed` number and the value is reduced modulo `eps`.
// The result is always in [0, eps).
//
// The running value is reduced after every character. Without a reduction
// the accumulator overflows long long (undefined behaviour) as soon as the
// string is longer than about a dozen characters, and the rolling hashes this
// value is compared with are reduced modulo eps as well.
ll hash(char *s) {
	ll s_hash = 0;
	for (const char *c = s; *c != '\0'; ++c) {
		s_hash = (s_hash*seed + static_cast<int>(*c)) % eps;
	}
	// `%` keeps the sign of the dividend, so a negative char value can leave
	// a negative remainder; shift it back into [0, eps).
	if (s_hash < 0) s_hash += eps;
	return s_hash;
}

With this method you must remember to free the memory afterwards. The third method is to pass the res array to the match function as a parameter. The code is as follows:

#include <bits/stdc++.h>
using namespace std;
typedef long long ll;
void match(char *p,char *s);
void match(char *p,char *s,int n,int s_len,ll* res);
ll hash(char *s);
const int seed = 31;
const int eps = 1e5+7;
char s[1005];
char p[1005];
int main() {
	gets(s);
	gets(p);
	int p_len = strlen(p);
	int s_len = strlen(s);
	ll p_hash = hash(p);
	int n = s_len-p_len+1;
	ll res[n] = {0};
	match(p,s,p_len,s_len,res);
	for(int i=0;i<n;i++)
		if(res[i]==p_hash) printf("%d\n",i);
	return 0;
} 

void match(char *p,char *s,int n,int s_len,ll* res) {
	char t[n];
	int i;
	for(i=0;i<n;i++)
		t[i] = s[i];
	t[i] = '\0';
	res[0] = hash(t);
	res[0] %= eps;
	for(int i=n;i<s_len;i++) {
		res[i-n+1] = res[i-n]*seed + (int)s[i] - (int)s[i-n]*pow(seed,n);
		res[i-n+1] %= eps;
	}
}

// Polynomial hash of the NUL-terminated string s: the characters are read as
// digits of a base-`seed` number and the value is reduced modulo `eps`.
// The result is always in [0, eps).
//
// The running value is reduced after every character. Without a reduction
// the accumulator overflows long long (undefined behaviour) as soon as the
// string is longer than about a dozen characters, and the rolling hashes this
// value is compared with are reduced modulo eps as well.
ll hash(char *s) {
	ll s_hash = 0;
	for (const char *c = s; *c != '\0'; ++c) {
		s_hash = (s_hash*seed + static_cast<int>(*c)) % eps;
	}
	// `%` keeps the sign of the dividend, so a negative char value can leave
	// a negative remainder; shift it back into [0, eps).
	if (s_hash < 0) s_hash += eps;
	return s_hash;
}

Above~

Topics: C++ Algorithm leetcode