Java BM string matching algorithm

Posted by pfdesigns on Sat, 08 Feb 2020 18:13:24 +0100

The Boyer-Moore (BM) string matching algorithm follows one condition and two rules:

1. Conditions:

The pattern string is compared with the main string starting from the end of the pattern string, that is, from back to front.

2. Rules:

1. Bad character rule: while the pattern string is compared with the main string from back to front, the first character of the main string that fails to match is called the bad character. The index in the pattern string at which the mismatch occurs is recorded as Si; the index of the last occurrence of the bad character in the pattern string is Xi, or -1 if it does not occur there. The difference x = Si - Xi is the sliding distance of the pattern string. The bad character rule cannot be relied on alone, because x is negative in some cases.

2. Good suffix rule: the part of the main string that has already matched the end of the pattern string is called the good suffix. If the good suffix also occurs earlier in the pattern string, the sliding distance is the starting position of the good suffix at the end of the pattern string minus the starting position of that earlier occurrence. If the good suffix does not occur anywhere else in the pattern string, check whether some suffix substring of the good suffix is also a prefix substring of the pattern string; if so, the sliding distance is the length of the pattern string minus the length of that prefix substring. If neither case applies, the pattern string slides by its whole length.

3. Take the larger of the bad character and good suffix distances as the sliding distance of the pattern string.

The code is as follows:


/**
 * Boyer-Moore (BM) string matching.
 *
 * <p>The pattern is compared with the text from its last character towards its first.
 * On a mismatch two shift candidates are computed and the larger one is applied:
 * <ul>
 *   <li>Bad character rule: with Si the pattern index at which the mismatch occurred and
 *       Xi the last index of the mismatching text character in the pattern (-1 if it is
 *       absent), the shift is {@code Si - Xi}. It may be zero or negative.</li>
 *   <li>Good suffix rule: {@code suffix[k]} is the start index of another occurrence of the
 *       pattern's length-k suffix inside the pattern (-1 if none), and {@code prefix[k]}
 *       tells whether the length-k suffix is also a prefix of the pattern.</li>
 * </ul>
 */
public class BmMatch {

    // Number of entries in the bad-character table allocated by bm(); one per char code 0..255.
    private static final int SIZE = 256;

    /**
     * Fills the bad-character table: {@code bc[c]} becomes the last index at which character
     * code {@code c} occurs in the pattern, or -1 if it does not occur.
     *
     * <p>Pattern characters whose code does not fit in {@code bc} are skipped instead of
     * causing an out-of-bounds write.
     *
     * @param b  pattern characters
     * @param m  number of pattern characters to use
     * @param bc table to fill; every entry is overwritten
     */
    public static void generateBC(char[] b, int m, int[] bc) {
        for (int i = 0; i < bc.length; i++) {
            bc[i] = -1;
        }

        for (int i = 0; i < m; i++) {
            char c = b[i];
            if (c < bc.length) {
                // Later positions overwrite earlier ones, so the last occurrence wins.
                bc[c] = i;
            }
        }
    }

    /**
     * Fills the good-suffix tables for the pattern.
     *
     * <p>For each k in 1..m-1, {@code suffix[k]} is the largest start index of a substring
     * that ends before the last pattern character and equals the pattern's length-k suffix,
     * or -1 if there is none. {@code prefix[k]} is true when the length-k suffix equals the
     * length-k prefix.
     *
     * @param b      pattern characters
     * @param m      number of pattern characters to use
     * @param suffix output array with at least {@code m} entries
     * @param prefix output array with at least {@code m} entries
     */
    public static void generateGS(char[] b, int m, int[] suffix, boolean[] prefix) {
        for (int i = 0; i < m; i++) {
            suffix[i] = -1;
            prefix[i] = false;
        }

        // For every proper prefix b[0..i], find the longest common suffix with b[0..m-1].
        for (int i = 0; i < m - 1; i++) {
            int j = i;
            int k = 0; // length of the common suffix found so far
            // The moving index j must be compared, not the fixed end index i.
            while (j >= 0 && b[j] == b[m - 1 - k]) {
                j--;
                k++;
                suffix[k] = j + 1; // larger i overwrites, keeping the rightmost occurrence
            }
            if (j == -1) {
                // The common suffix reaches index 0, so it is also a prefix of the pattern.
                prefix[k] = true;
            }
        }
    }

    /**
     * Returns the shift given by the good suffix rule.
     *
     * @param j      pattern index of the mismatching character
     * @param m      pattern length
     * @param suffix table produced by {@link #generateGS}
     * @param prefix table produced by {@link #generateGS}
     * @return the number of positions to slide the pattern
     */
    public static int moveByGS(int j, int m, int[] suffix, boolean[] prefix) {
        int k = m - 1 - j; // length of the good suffix
        if (suffix[k] != -1) {
            // Align the other occurrence of the good suffix with the matched text.
            return j - suffix[k] + 1;
        }
        // Otherwise look for the longest shorter suffix b[r..m-1] that is also a prefix.
        for (int r = j + 2; r <= m - 1; r++) {
            if (prefix[m - r]) {
                return r;
            }
        }
        return m;
    }

    /**
     * Finds the first occurrence of the pattern in the text.
     *
     * @param a text characters
     * @param n number of text characters to search
     * @param b pattern characters
     * @param m number of pattern characters to use
     * @return index in {@code a} of the first match, or -1 if there is none
     */
    public static int bm(char[] a, int n, char[] b, int m) {
        int[] bc = new int[SIZE];
        generateBC(b, m, bc);
        int[] suffix = new int[m];
        boolean[] prefix = new boolean[m];
        generateGS(b, m, suffix, prefix);

        int i = 0; // current alignment of the pattern's first character in the text
        while (i <= n - m) {
            int j;
            for (j = m - 1; j >= 0; j--) {
                if (a[i + j] != b[j]) {
                    break;
                }
            }
            if (j == -1) {
                return i;
            }

            char bad = a[i + j];
            // Characters outside the table cannot be looked up; a shift of 1 is always safe.
            int x = bad < bc.length ? j - bc[bad] : 1;
            int y = 0;
            if (j < m - 1) {
                y = moveByGS(j, m, suffix, prefix);
            }
            i = i + Math.max(x, y);
        }
        return -1;
    }
}

Topics: Programming ascii