Match Rating Algorithm (Phonetic Match)

It's an algorithm for indexing of words by their pronunciation.

Algotithm

Encoding rules

1. Delete all vowels unless the vowel begins the word

2. Remove the second consonant of any double consonants present

3. Reduce codex to 6 letters by joining the first 3 and last 3 letters only

Comparison rules

1. If the length difference between the encoded strings is 3 or greater, then no similarity comparison is done.

2. Obtain the minimum rating value by calculating the length sum of the encoded strings and using below given Minimum Rating Table

3. Process the encoded strings from left to right and remove any identical characters found from both strings respectively.

4. Process the unmatched characters from right to left and remove any identical characters found from both names respectively.

5. Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.

6. If the similarity rating equal to or greater than the minimum rating then the match is considered good.

Minimum Rating Table

Sum of Lengths of encoded strings
Minimum Rating
≤ 4
5
4 < sum ≤ 7
4
7 < sum ≤ 11
3
= 12
2

Match Rating Approach examples

Name1
Name2
Minimum Rating
Similarity Comparison Rating
Byrne
Boern
4
5
Smith
Smyth
3
5
Catherine
Kathryn
3
4

Java Code

package match;
public class Phonetic
{
    private static String CollapseRepeatingConsonants(String name) {
        StringBuilder sb = new StringBuilder();
        char prev = ' ';
        boolean first = true;
        char[] nameArray = name.toCharArray();
        for(char c : nameArray) {
            if (c != prev || first || c=='A' || c=='E' || c=='I' || c=='O' || c=='U')
            {
                sb.append(c);
                first = false;
            }
            prev = c;
        }
        return sb.toString();
    }

    public static String encode(String name)
    {
        name = name.toUpperCase();
        name = name.replaceAll("[^A-Z]", "");

        //Delete all vowels unless the vowel begins the word
        if(name.length() > 1) {
            name = name.substring(0,1) + name.substring(1).replace("A","").replace("E","").replace("I","").replace("O","").replace("U","");
        }

        //Remove the second consonant of any double consonants present
        name = CollapseRepeatingConsonants(name);

        //Reduce codex to 6 letters by joining the first 3 and last 3 letters only
        int length = name.length();
        if (length > 6) {
            name = name.substring(0, 3) + name.substring(length - 3, length);
        }
        return name;
    }

    private static int MinimumRating(int sum)
    {
        if(sum <= 4)
            return 5;
        if(sum <= 7)
            return 4;
        if(sum <= 11)
            return 3;
        if(sum == 12)
            return 2;

        return 0;
    }

    public static int MatchRatingCompute(String name1, String name2)
    {
        //0 is an impossible rating, it will mean unrated
        if(name1 == null || name1.trim().length() == 0 || name2 == null || name2.trim().length() == 0) {
            return 0;
        }

        String large;
        String small;

        if (name1.length() >= name2.length()) {
            large = encode(name1.toUpperCase());
            small = encode(name2.toUpperCase());
        }
        else {
            large = encode(name2.toUpperCase());
            small = encode(name1.toUpperCase());
        }

        int x = large.length();
        int y = small.length();

        //If the length difference between the encoded strings is 3 or greater, then no similarity comparison is done.
        if ((x - y) >= 3) {
            return 0;
        }

        //Obtain the minimum rating value by calculating the length sum of the encoded strings and using table A
        int minRating = MinimumRating(x + y);

        //Process the encoded strings from left to right and remove any identical characters found from both strings respectively.
        for(int i = 0; i < small.length(); )
        {
            boolean found = false;
            for(int j = 0; j < large.length(); j++) {
                if(small.charAt(i) == large.charAt(j)) {
                    small = small.substring(0,i) + small.substring(i+1, small.length());
                    large = large.substring(0,j) + large.substring(j+1, large.length());
                    found = true;
                    break;
                }
            }
            if(!found) {
                i++;
            }
        }

        large = new StringBuffer(large).reverse().toString();
        small = new StringBuffer(small).reverse().toString();

        //Process the unmatched characters from right to left and remove any identical characters found from both names respectively.
        for(int i = 0; i < small.length(); ) {
            boolean found = false;
            for(int j = 0; j < large.length(); j++) {
                if(small.charAt(i) == large.charAt(j)) {
                    small = small.substring(0,i) + small.substring(i+1, small.length());
                    large = large.substring(0,j) + large.substring(j+1, large.length());
                    found = true;
                }
            }
            if (!found)
            {
                i++;
            }
        }

        int rating = 0;

        //Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.
        if(large.length() > small.length()) {
            rating = 6 - large.length();
        }
        else {
            rating = 6 - small.length();
        }

        //If the similarity rating equal to or greater than the minimum rating then the match is considered good.
        if (rating >= minRating) {
            return rating;
        }

        return 0;
    }
}

Comments

Popular posts from this blog

Index MySQL datadase table in Solr

Hibernate Sharding Example

Shallow Copy