# Fuzzy string matching

Recently we noticed that similar cell names and street names stored in the database are not written in a standardized way (abbreviations, typos, and so on), so the irregular entries need to be corrected. During this correction, an edit-distance calculation is used to match each entry accurately against a comparison table.

# Edit distance

1. Levenshtein distance is a string metric that measures the difference between two strings. It can be thought of as the minimum number of single-character edits (modifications, insertions, deletions) needed to turn one string into the other.
2. Jaro distance
3. Jaro-Winkler distance

Note: similarity = 1 - distance

Jaro distance uses a local matching window: even when the two strings share a substring, characters that lie further apart than the window are not counted as matches. Most of our business data carries a long prefix, which hurts the accuracy of the final match, so the window is enlarged to the length of the longer string. The relevant part of the Python package's source code is modified as follows:

``````python
def count_matches(s1, s2, len1, len2):
    """Greedily pair up equal characters of ``s1`` and ``s2``.

    Each character of ``s1`` is paired with the first equal character of
    ``s2`` inside the search window that has not been used yet.  Textbook
    Jaro limits the window to ``max(len2 // 2 - 1, 0)``; here it is widened
    to the full length of ``s2`` so that a shared substring is matched no
    matter how far apart its two occurrences are.

    Args:
        s1: The shorter string (must be non-empty).
        s2: The longer (or equally long) string.
        len1: Length of ``s1``.
        len2: Length of ``s2``.

    Returns:
        A tuple ``(num_matches, flags1, flags2)``.  ``flags1`` has ``len1``
        entries and ``flags2`` has ``len2`` entries; an entry is 1 when the
        character at that index took part in a match and 0 otherwise.

    Raises:
        AssertionError: If ``len1`` is zero or ``len1 > len2``.
    """
    assert len1 and len1 <= len2
    # Standard Jaro would use max(len2 // 2 - 1, 0); widened on purpose.
    search_range = len2
    num_matches = 0

    flags1 = [0] * len1
    flags2 = [0] * len2

    for i, char in enumerate(s1):
        lolim = max(i - search_range, 0)
        hilim = min(i + search_range, len2 - 1)

        for j in range(lolim, hilim + 1):
            if not flags2[j] and char == s2[j]:
                flags1[i] = flags2[j] = 1
                num_matches += 1
                break  # each character of s1 is matched at most once
    return num_matches, flags1, flags2

def count_half_transpositions(s1, s2, flags1, flags2):
    """Count matched characters that are out of order between the strings.

    The matched characters of ``s1`` and ``s2`` are walked in parallel, each
    in its own left-to-right order; every position where the two sequences
    disagree counts as one half transposition.

    Args:
        s1: First string.
        s2: Second string.
        flags1: Per-character match flags for ``s1`` (truthy = matched).
        flags2: Per-character match flags for ``s2`` (truthy = matched).
            Must contain at least as many truthy entries as ``flags1``,
            otherwise the scan runs past the end of ``flags2``.

    Returns:
        The number of half transpositions.
    """
    half_transposes = 0
    k = 0  # cursor into s2 / flags2

    for i, flag in enumerate(flags1):
        if not flag:
            continue
        # Skip forward to the next matched character of s2.
        while not flags2[k]:
            k += 1
        if s1[i] != s2[k]:
            half_transposes += 1
        k += 1  # this matched character of s2 is consumed either way
    return half_transposes

def count_typos(s1, s2, flags1, flags2, typo_table):
    """Score unmatched characters that the typo table lists as similar.

    For every unmatched character of ``s1`` that has a row in
    ``typo_table``, the first still-unflagged character of ``s2`` found in
    that row is paired with it and the corresponding score is added.

    Args:
        s1: First string.
        s2: Second string.
        flags1: Match flags for ``s1``; must contain at least one 0.
        flags2: Match flags for ``s2``.  Modified in place: entries paired
            here are set to 2.
        typo_table: Mapping ``char -> {similar_char: score}``.

    Returns:
        A tuple ``(typo_score, flags2)`` where ``flags2`` is the same list
        object that was passed in.

    Raises:
        AssertionError: If ``flags1`` contains no 0 entry.
    """
    assert 0 in flags1

    typo_score = 0
    for i, flag1 in enumerate(flags1):
        if flag1:
            continue  # only unmatched characters of s1 are considered
        row = s1[i]
        if row not in typo_table:
            # No similarity mapping for this character.
            continue
        typo_row = typo_table[row]

        for j, flag2 in enumerate(flags2):
            if flag2:
                continue
            col = s2[j]
            if col not in typo_row:
                continue

            typo_score += typo_row[col]
            flags2[j] = 2  # mark as used so it is not paired again
            break
    return typo_score, flags2

def fn_jaro(len1, len2, num_matches, half_transposes, typo_score, typo_scale):
    """Combine match statistics into a Jaro similarity weight.

    Args:
        len1: Length of the first string.
        len2: Length of the second string.
        num_matches: Number of matched characters.
        half_transposes: Number of half transpositions; it is floor-divided
            by two before being subtracted from ``num_matches``.
        typo_score: Accumulated score for similar-but-unmatched characters.
        typo_scale: Divisor applied to ``typo_score``.

    Returns:
        1.0 when both lengths are zero, 0.0 when only ``len1`` is zero or
        when there are no matches, otherwise the mean of the three Jaro
        ratios.
    """
    if not len1:
        if not len2:
            return 1.0
        return 0.0
    if not num_matches:
        return 0.0

    similar = (typo_score / typo_scale) + num_matches
    weight = (similar / len1
              + similar / len2
              + (num_matches - half_transposes // 2) / num_matches)

    return weight / 3

def string_metrics(s1, s2, typo_table=None, typo_scale=1, boost_threshold=None,
                   pre_len=0, pre_scale=0, longer_prob=False):
    """Collect the raw counts from which the Jaro-family scores are computed.

    The two strings are swapped when necessary so that ``s1`` is the shorter
    one before any counting takes place.

    Args:
        s1: First string to compare.
        s2: Second string to compare.
        typo_table: Optional mapping ``char -> {similar_char: score}``; when
            given, unmatched characters can earn a typo score.
        typo_scale: Divisor applied to the typo score inside ``fn_jaro``.
        boost_threshold: When falsy, the prefix and long-string analysis is
            skipped and the function returns right after the typo step.
        pre_len: Maximum number of leading characters examined for a common
            alphabetic prefix.
        pre_scale: Not used by this function; kept in the signature.
        longer_prob: Whether to evaluate the long-string adjustment flag.

    Returns:
        A 7-tuple ``(len1, len2, num_matches, half_transposes, typo_score,
        pre_matches, adjust_long)``; the lengths are those of the shorter
        and the longer string respectively.
    """
    len1 = len(s1)
    len2 = len(s2)

    if len2 < len1:
        s1, s2 = s2, s1
        len1, len2 = len2, len1
    assert len1 <= len2

    if not (len1 and len2):
        return len1, len2, 0, 0, 0, 0, False

    num_matches, flags1, flags2 = count_matches(s1, s2, len1, len2)

    # If no characters in common - return
    if not num_matches:
        return len1, len2, 0, 0, 0, 0, False

    half_transposes = count_half_transpositions(s1, s2, flags1, flags2)

    # Adjust for similarities in non-matched characters.
    typo_score = 0
    if typo_table and len1 > num_matches:
        typo_score, flags2 = count_typos(s1, s2, flags1, flags2, typo_table)

    if not boost_threshold:
        # False (== 0) keeps the last slot consistent with the other returns.
        return len1, len2, num_matches, half_transposes, typo_score, 0, False

    pre_matches = 0
    # NOTE(review): the published listing was cut off after ``if cond:`` and
    # never initialised the seventh return value.  The flag and the final
    # return below are reconstructed from the 7-tuple shape of the early
    # returns - confirm against the upstream package.
    adjust_long = False
    weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
                          typo_score, typo_scale)

    # Continue to boost the weight if the strings are similar
    if weight_typo > boost_threshold:
        # Adjust for having up to first 'pre_len' chars (not digits) in common
        limit = min(len1, pre_len)
        while pre_matches < limit:
            char1 = s1[pre_matches]
            if not (char1.isalpha() and char1 == s2[pre_matches]):
                break
            pre_matches += 1

        if longer_prob:
            cond = len1 > pre_len
            cond = cond and num_matches > pre_matches + 1
            cond = cond and 2 * num_matches >= len1 + pre_matches
            # NOTE(review): the listing read ``s1.isalpha()``; the ``[0]``
            # index is assumed to have been lost in publishing (Winkler's
            # adjustment tests only the first character) - confirm.
            cond = cond and s1[0].isalpha()
            if cond:
                adjust_long = True

    return (len1, len2, num_matches, half_transposes,
            typo_score, pre_matches, adjust_long)

def metric_jaro(string1, string2):
    """Return the basic Jaro similarity of two strings.

    No typo table, prefix boost or long-string adjustment is applied.
    Because ``count_matches`` in this file searches the whole of the longer
    string rather than the textbook half-length window, the result can
    differ from a standard Jaro implementation.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The similarity weight computed by ``fn_jaro``.

    Raises:
        AssertionError: If ``string_metrics`` reports a typo score, prefix
            match or long-string adjustment, which plain Jaro never uses.
    """
    ans = string_metrics(string1, string2)
    len1, len2, num_matches, half_transposes = ans[:4]
    # Sanity check: none of the optional adjustments may be active here.
    assert ans[4:] == (0, 0, False)
    return fn_jaro(len1, len2, num_matches, half_transposes, 0, 1)

def metric_jaro_score(s1, s2):
    """Return the Jaro similarity of ``s1`` and ``s2`` via ``metric_jaro``."""
    return metric_jaro(s1, s2)

# Demo: print the similarity of two sample strings.
print(metric_jaro_score("45 century pearl of saiding line", "45 century Pearl"))

``````

Tags: Python, Database

Posted on Wed, 04 Dec 2019 00:24:00 -0500 by esconsult1