Match Rating Algorithm (Phonetic Match)
It's an algorithm for indexing of words by their pronunciation.
Algotithm
Encoding rules
1. Delete all vowels unless the vowel begins the word
2. Remove the second consonant of any double consonants present
3. Reduce codex to 6 letters by joining the first 3 and last 3 letters only
2. Remove the second consonant of any double consonants present
3. Reduce codex to 6 letters by joining the first 3 and last 3 letters only
Comparison rules
1. If the length difference between the encoded strings is 3 or greater, then no similarity comparison is done.
2. Obtain the minimum rating value by calculating the length sum of the encoded strings and using below given Minimum Rating Table
3. Process the encoded strings from left to right and remove any identical characters found from both strings respectively.
4. Process the unmatched characters from right to left and remove any identical characters found from both names respectively.
5. Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.
6. If the similarity rating equal to or greater than the minimum rating then the match is considered good.
2. Obtain the minimum rating value by calculating the length sum of the encoded strings and using below given Minimum Rating Table
3. Process the encoded strings from left to right and remove any identical characters found from both strings respectively.
4. Process the unmatched characters from right to left and remove any identical characters found from both names respectively.
5. Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.
6. If the similarity rating equal to or greater than the minimum rating then the match is considered good.
Minimum Rating Table
Sum of Lengths of encoded strings | Minimum Rating |
Match Rating Approach examples
Name1 | Name2 | Minimum Rating | Similarity Comparison Rating |
Java Code
package match;
public class Phonetic
{
private static String CollapseRepeatingConsonants(String name) {
StringBuilder sb = new StringBuilder();
char prev = ' ';
boolean first = true;
char[] nameArray = name.toCharArray();
for(char c : nameArray) {
if (c != prev || first || c=='A' || c=='E' || c=='I' || c=='O' || c=='U')
{
sb.append(c);
first = false;
}
prev = c;
}
return sb.toString();
}
public static String encode(String name)
{
name = name.toUpperCase();
name = name.replaceAll("[^A-Z]", "");
//Delete all vowels unless the vowel begins the word
if(name.length() > 1) {
name = name.substring(0,1) + name.substring(1).replace("A","").replace("E","").replace("I","").replace("O","").replace("U","");
}
//Remove the second consonant of any double consonants present
name = CollapseRepeatingConsonants(name);
//Reduce codex to 6 letters by joining the first 3 and last 3 letters only
int length = name.length();
if (length > 6) {
name = name.substring(0, 3) + name.substring(length - 3, length);
}
return name;
}
private static int MinimumRating(int sum)
{
if(sum <= 4)
return 5;
if(sum <= 7)
return 4;
if(sum <= 11)
return 3;
if(sum == 12)
return 2;
return 0;
}
public static int MatchRatingCompute(String name1, String name2)
{
//0 is an impossible rating, it will mean unrated
if(name1 == null || name1.trim().length() == 0 || name2 == null || name2.trim().length() == 0) {
return 0;
}
String large;
String small;
if (name1.length() >= name2.length()) {
large = encode(name1.toUpperCase());
small = encode(name2.toUpperCase());
}
else {
large = encode(name2.toUpperCase());
small = encode(name1.toUpperCase());
}
int x = large.length();
int y = small.length();
//If the length difference between the encoded strings is 3 or greater, then no similarity comparison is done.
if ((x - y) >= 3) {
return 0;
}
//Obtain the minimum rating value by calculating the length sum of the encoded strings and using table A
int minRating = MinimumRating(x + y);
//Process the encoded strings from left to right and remove any identical characters found from both strings respectively.
for(int i = 0; i < small.length(); )
{
boolean found = false;
for(int j = 0; j < large.length(); j++) {
if(small.charAt(i) == large.charAt(j)) {
small = small.substring(0,i) + small.substring(i+1, small.length());
large = large.substring(0,j) + large.substring(j+1, large.length());
found = true;
break;
}
}
if(!found) {
i++;
}
}
large = new StringBuffer(large).reverse().toString();
small = new StringBuffer(small).reverse().toString();
//Process the unmatched characters from right to left and remove any identical characters found from both names respectively.
for(int i = 0; i < small.length(); ) {
boolean found = false;
for(int j = 0; j < large.length(); j++) {
if(small.charAt(i) == large.charAt(j)) {
small = small.substring(0,i) + small.substring(i+1, small.length());
large = large.substring(0,j) + large.substring(j+1, large.length());
found = true;
}
}
if (!found)
{
i++;
}
}
int rating = 0;
//Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.
if(large.length() > small.length()) {
rating = 6 - large.length();
}
else {
rating = 6 - small.length();
}
//If the similarity rating equal to or greater than the minimum rating then the match is considered good.
if (rating >= minRating) {
return rating;
}
return 0;
}
}
{
private static String CollapseRepeatingConsonants(String name) {
StringBuilder sb = new StringBuilder();
char prev = ' ';
boolean first = true;
char[] nameArray = name.toCharArray();
for(char c : nameArray) {
if (c != prev || first || c=='A' || c=='E' || c=='I' || c=='O' || c=='U')
{
sb.append(c);
first = false;
}
prev = c;
}
return sb.toString();
}
public static String encode(String name)
{
name = name.toUpperCase();
name = name.replaceAll("[^A-Z]", "");
//Delete all vowels unless the vowel begins the word
if(name.length() > 1) {
name = name.substring(0,1) + name.substring(1).replace("A","").replace("E","").replace("I","").replace("O","").replace("U","");
}
//Remove the second consonant of any double consonants present
name = CollapseRepeatingConsonants(name);
//Reduce codex to 6 letters by joining the first 3 and last 3 letters only
int length = name.length();
if (length > 6) {
name = name.substring(0, 3) + name.substring(length - 3, length);
}
return name;
}
private static int MinimumRating(int sum)
{
if(sum <= 4)
return 5;
if(sum <= 7)
return 4;
if(sum <= 11)
return 3;
if(sum == 12)
return 2;
return 0;
}
public static int MatchRatingCompute(String name1, String name2)
{
//0 is an impossible rating, it will mean unrated
if(name1 == null || name1.trim().length() == 0 || name2 == null || name2.trim().length() == 0) {
return 0;
}
String large;
String small;
if (name1.length() >= name2.length()) {
large = encode(name1.toUpperCase());
small = encode(name2.toUpperCase());
}
else {
large = encode(name2.toUpperCase());
small = encode(name1.toUpperCase());
}
int x = large.length();
int y = small.length();
//If the length difference between the encoded strings is 3 or greater, then no similarity comparison is done.
if ((x - y) >= 3) {
return 0;
}
//Obtain the minimum rating value by calculating the length sum of the encoded strings and using table A
int minRating = MinimumRating(x + y);
//Process the encoded strings from left to right and remove any identical characters found from both strings respectively.
for(int i = 0; i < small.length(); )
{
boolean found = false;
for(int j = 0; j < large.length(); j++) {
if(small.charAt(i) == large.charAt(j)) {
small = small.substring(0,i) + small.substring(i+1, small.length());
large = large.substring(0,j) + large.substring(j+1, large.length());
found = true;
break;
}
}
if(!found) {
i++;
}
}
large = new StringBuffer(large).reverse().toString();
small = new StringBuffer(small).reverse().toString();
//Process the unmatched characters from right to left and remove any identical characters found from both names respectively.
for(int i = 0; i < small.length(); ) {
boolean found = false;
for(int j = 0; j < large.length(); j++) {
if(small.charAt(i) == large.charAt(j)) {
small = small.substring(0,i) + small.substring(i+1, small.length());
large = large.substring(0,j) + large.substring(j+1, large.length());
found = true;
}
}
if (!found)
{
i++;
}
}
int rating = 0;
//Subtract the number of unmatched characters from 6 in the longer string. This is the similarity rating.
if(large.length() > small.length()) {
rating = 6 - large.length();
}
else {
rating = 6 - small.length();
}
//If the similarity rating equal to or greater than the minimum rating then the match is considered good.
if (rating >= minRating) {
return rating;
}
return 0;
}
}
Comments
Post a Comment