mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-08-14 18:47:09 -07:00
Update vendored beets to 1.6.0
Updates colorama to 0.4.6
Adds confuse version 1.7.0
Updates jellyfish to 0.9.0
Adds mediafile 0.10.1
Updates munkres to 1.1.4
Updates musicbrainzngs to 0.7.1
Updates mutagen to 1.46.0
Updates pyyaml to 6.0
Updates unidecode to 1.3.6
This commit is contained in:
parent
5073ec0c6f
commit
56c6773c6b
385 changed files with 25143 additions and 18080 deletions
|
@ -1,6 +1,28 @@
|
|||
import warnings
|
||||
|
||||
try:
|
||||
from .cjellyfish import * # noqa
|
||||
from .cjellyfish import * # noqa
|
||||
|
||||
library = "C"
|
||||
except ImportError:
|
||||
from ._jellyfish import * # noqa
|
||||
from ._jellyfish import * # noqa
|
||||
|
||||
library = "Python"
|
||||
|
||||
|
||||
def jaro_winkler(s1, s2, long_tolerance=False):
|
||||
warnings.warn(
|
||||
"the name 'jaro_winkler' is deprecated and will be removed in jellyfish 1.0, "
|
||||
"for the same functionality please use jaro_winkler_similarity",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return jaro_winkler_similarity(s1, s2, long_tolerance) # noqa
|
||||
|
||||
|
||||
def jaro_distance(s1, s2):
|
||||
warnings.warn(
|
||||
"the jaro_distance function incorrectly returns the jaro similarity, "
|
||||
"replace your usage with jaro_similarity before 1.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
return jaro_similarity(s1, s2) # noqa
|
||||
|
|
11
libs/common/jellyfish/__init__.pyi
Normal file
11
libs/common/jellyfish/__init__.pyi
Normal file
|
@ -0,0 +1,11 @@
|
|||
def levenshtein_distance(s1: str, s2: str) -> int: ...
|
||||
def jaro_similarity(s1: str, s2: str) -> float: ...
|
||||
def jaro_winkler_similarity(s1: str, s2: str, long_tolerance: bool = ...) -> float: ...
|
||||
def damerau_levenshtein_distance(s1: str, s2: str) -> int: ...
|
||||
def soundex(s: str) -> str: ...
|
||||
def hamming_distance(s1: str, s2: str) -> int: ...
|
||||
def nysiis(s: str) -> str: ...
|
||||
def match_rating_codex(s: str) -> str: ...
|
||||
def match_rating_comparison(s1: str, s2: str) -> bool: ...
|
||||
def metaphone(s: str) -> str: ...
|
||||
def porter_stem(s: str) -> str: ...
|
|
@ -1,18 +1,16 @@
|
|||
import unicodedata
|
||||
from collections import defaultdict
|
||||
from .compat import _range, _zip_longest, IS_PY3
|
||||
from itertools import zip_longest
|
||||
from .porter import Stemmer
|
||||
|
||||
|
||||
def _normalize(s):
|
||||
return unicodedata.normalize('NFKD', s)
|
||||
return unicodedata.normalize("NFKD", s)
|
||||
|
||||
|
||||
def _check_type(s):
|
||||
if IS_PY3 and not isinstance(s, str):
|
||||
raise TypeError('expected str or unicode, got %s' % type(s).__name__)
|
||||
elif not IS_PY3 and not isinstance(s, unicode):
|
||||
raise TypeError('expected unicode, got %s' % type(s).__name__)
|
||||
if not isinstance(s, str):
|
||||
raise TypeError("expected str or unicode, got %s" % type(s).__name__)
|
||||
|
||||
|
||||
def levenshtein_distance(s1, s2):
|
||||
|
@ -21,53 +19,54 @@ def levenshtein_distance(s1, s2):
|
|||
|
||||
if s1 == s2:
|
||||
return 0
|
||||
rows = len(s1)+1
|
||||
cols = len(s2)+1
|
||||
rows = len(s1) + 1
|
||||
cols = len(s2) + 1
|
||||
|
||||
if not s1:
|
||||
return cols-1
|
||||
return cols - 1
|
||||
if not s2:
|
||||
return rows-1
|
||||
return rows - 1
|
||||
|
||||
prev = None
|
||||
cur = range(cols)
|
||||
for r in _range(1, rows):
|
||||
prev, cur = cur, [r] + [0]*(cols-1)
|
||||
for c in _range(1, cols):
|
||||
for r in range(1, rows):
|
||||
prev, cur = cur, [r] + [0] * (cols - 1)
|
||||
for c in range(1, cols):
|
||||
deletion = prev[c] + 1
|
||||
insertion = cur[c-1] + 1
|
||||
edit = prev[c-1] + (0 if s1[r-1] == s2[c-1] else 1)
|
||||
insertion = cur[c - 1] + 1
|
||||
edit = prev[c - 1] + (0 if s1[r - 1] == s2[c - 1] else 1)
|
||||
cur[c] = min(edit, deletion, insertion)
|
||||
|
||||
return cur[-1]
|
||||
|
||||
|
||||
def _jaro_winkler(ying, yang, long_tolerance, winklerize):
|
||||
_check_type(ying)
|
||||
_check_type(yang)
|
||||
def _jaro_winkler(s1, s2, long_tolerance, winklerize):
|
||||
_check_type(s1)
|
||||
_check_type(s2)
|
||||
|
||||
ying_len = len(ying)
|
||||
yang_len = len(yang)
|
||||
s1_len = len(s1)
|
||||
s2_len = len(s2)
|
||||
|
||||
if not ying_len or not yang_len:
|
||||
if not s1_len or not s2_len:
|
||||
return 0.0
|
||||
|
||||
min_len = max(ying_len, yang_len)
|
||||
search_range = (min_len // 2) - 1
|
||||
min_len = min(s1_len, s2_len)
|
||||
search_range = max(s1_len, s2_len)
|
||||
search_range = (search_range // 2) - 1
|
||||
if search_range < 0:
|
||||
search_range = 0
|
||||
|
||||
ying_flags = [False]*ying_len
|
||||
yang_flags = [False]*yang_len
|
||||
s1_flags = [False] * s1_len
|
||||
s2_flags = [False] * s2_len
|
||||
|
||||
# looking only within search range, count & flag matched pairs
|
||||
common_chars = 0
|
||||
for i, ying_ch in enumerate(ying):
|
||||
low = i - search_range if i > search_range else 0
|
||||
hi = i + search_range if i + search_range < yang_len else yang_len - 1
|
||||
for j in _range(low, hi+1):
|
||||
if not yang_flags[j] and yang[j] == ying_ch:
|
||||
ying_flags[i] = yang_flags[j] = True
|
||||
for i, s1_ch in enumerate(s1):
|
||||
low = max(0, i - search_range)
|
||||
hi = min(i + search_range, s2_len - 1)
|
||||
for j in range(low, hi + 1):
|
||||
if not s2_flags[j] and s2[j] == s1_ch:
|
||||
s1_flags[i] = s2_flags[j] = True
|
||||
common_chars += 1
|
||||
break
|
||||
|
||||
|
@ -77,27 +76,32 @@ def _jaro_winkler(ying, yang, long_tolerance, winklerize):
|
|||
|
||||
# count transpositions
|
||||
k = trans_count = 0
|
||||
for i, ying_f in enumerate(ying_flags):
|
||||
if ying_f:
|
||||
for j in _range(k, yang_len):
|
||||
if yang_flags[j]:
|
||||
for i, s1_f in enumerate(s1_flags):
|
||||
if s1_f:
|
||||
for j in range(k, s2_len):
|
||||
if s2_flags[j]:
|
||||
k = j + 1
|
||||
break
|
||||
if ying[i] != yang[j]:
|
||||
if s1[i] != s2[j]:
|
||||
trans_count += 1
|
||||
trans_count /= 2
|
||||
trans_count //= 2
|
||||
|
||||
# adjust for similarities in nonmatched characters
|
||||
common_chars = float(common_chars)
|
||||
weight = ((common_chars/ying_len + common_chars/yang_len +
|
||||
(common_chars-trans_count) / common_chars)) / 3
|
||||
weight = (
|
||||
(
|
||||
common_chars / s1_len
|
||||
+ common_chars / s2_len
|
||||
+ (common_chars - trans_count) / common_chars
|
||||
)
|
||||
) / 3
|
||||
|
||||
# winkler modification: continue to boost if strings are similar
|
||||
if winklerize and weight > 0.7 and ying_len > 3 and yang_len > 3:
|
||||
if winklerize and weight > 0.7:
|
||||
# adjust for up to first 4 chars in common
|
||||
j = min(min_len, 4)
|
||||
i = 0
|
||||
while i < j and ying[i] == yang[i] and ying[i]:
|
||||
while i < j and s1[i] == s2[i]:
|
||||
i += 1
|
||||
if i:
|
||||
weight += i * 0.1 * (1.0 - weight)
|
||||
|
@ -105,13 +109,27 @@ def _jaro_winkler(ying, yang, long_tolerance, winklerize):
|
|||
# optionally adjust for long strings
|
||||
# after agreeing beginning chars, at least two or more must agree and
|
||||
# agreed characters must be > half of remaining characters
|
||||
if (long_tolerance and min_len > 4 and common_chars > i+1 and
|
||||
2 * common_chars >= min_len + i):
|
||||
weight += ((1.0 - weight) * (float(common_chars-i-1) / float(ying_len+yang_len-i*2+2)))
|
||||
if (
|
||||
long_tolerance
|
||||
and min_len > 4
|
||||
and common_chars > i + 1
|
||||
and 2 * common_chars >= min_len + i
|
||||
):
|
||||
weight += (1.0 - weight) * (
|
||||
float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2)
|
||||
)
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
def jaro_similarity(s1, s2):
|
||||
return _jaro_winkler(s1, s2, False, False) # noqa
|
||||
|
||||
|
||||
def jaro_winkler_similarity(s1, s2, long_tolerance=False):
|
||||
return _jaro_winkler(s1, s2, long_tolerance, True) # noqa
|
||||
|
||||
|
||||
def damerau_levenshtein_distance(s1, s2):
|
||||
_check_type(s1)
|
||||
_check_type(s2)
|
||||
|
@ -124,41 +142,35 @@ def damerau_levenshtein_distance(s1, s2):
|
|||
da = defaultdict(int)
|
||||
|
||||
# distance matrix
|
||||
score = [[0]*(len2+2) for x in _range(len1+2)]
|
||||
score = [[0] * (len2 + 2) for x in range(len1 + 2)]
|
||||
|
||||
score[0][0] = infinite
|
||||
for i in _range(0, len1+1):
|
||||
score[i+1][0] = infinite
|
||||
score[i+1][1] = i
|
||||
for i in _range(0, len2+1):
|
||||
score[0][i+1] = infinite
|
||||
score[1][i+1] = i
|
||||
for i in range(0, len1 + 1):
|
||||
score[i + 1][0] = infinite
|
||||
score[i + 1][1] = i
|
||||
for i in range(0, len2 + 1):
|
||||
score[0][i + 1] = infinite
|
||||
score[1][i + 1] = i
|
||||
|
||||
for i in _range(1, len1+1):
|
||||
for i in range(1, len1 + 1):
|
||||
db = 0
|
||||
for j in _range(1, len2+1):
|
||||
i1 = da[s2[j-1]]
|
||||
for j in range(1, len2 + 1):
|
||||
i1 = da[s2[j - 1]]
|
||||
j1 = db
|
||||
cost = 1
|
||||
if s1[i-1] == s2[j-1]:
|
||||
if s1[i - 1] == s2[j - 1]:
|
||||
cost = 0
|
||||
db = j
|
||||
|
||||
score[i+1][j+1] = min(score[i][j] + cost,
|
||||
score[i+1][j] + 1,
|
||||
score[i][j+1] + 1,
|
||||
score[i1][j1] + (i-i1-1) + 1 + (j-j1-1))
|
||||
da[s1[i-1]] = i
|
||||
score[i + 1][j + 1] = min(
|
||||
score[i][j] + cost,
|
||||
score[i + 1][j] + 1,
|
||||
score[i][j + 1] + 1,
|
||||
score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1),
|
||||
)
|
||||
da[s1[i - 1]] = i
|
||||
|
||||
return score[len1+1][len2+1]
|
||||
|
||||
|
||||
def jaro_distance(s1, s2):
|
||||
return _jaro_winkler(s1, s2, False, False)
|
||||
|
||||
|
||||
def jaro_winkler(s1, s2, long_tolerance=False):
|
||||
return _jaro_winkler(s1, s2, long_tolerance, True)
|
||||
return score[len1 + 1][len2 + 1]
|
||||
|
||||
|
||||
def soundex(s):
|
||||
|
@ -166,21 +178,23 @@ def soundex(s):
|
|||
_check_type(s)
|
||||
|
||||
if not s:
|
||||
return ''
|
||||
return ""
|
||||
|
||||
s = _normalize(s)
|
||||
s = s.upper()
|
||||
|
||||
replacements = (('BFPV', '1'),
|
||||
('CGJKQSXZ', '2'),
|
||||
('DT', '3'),
|
||||
('L', '4'),
|
||||
('MN', '5'),
|
||||
('R', '6'))
|
||||
replacements = (
|
||||
("BFPV", "1"),
|
||||
("CGJKQSXZ", "2"),
|
||||
("DT", "3"),
|
||||
("L", "4"),
|
||||
("MN", "5"),
|
||||
("R", "6"),
|
||||
)
|
||||
result = [s[0]]
|
||||
count = 1
|
||||
|
||||
# find would-be replacment for first character
|
||||
# find would-be replacement for first character
|
||||
for lset, sub in replacements:
|
||||
if s[0] in lset:
|
||||
last = sub
|
||||
|
@ -197,12 +211,14 @@ def soundex(s):
|
|||
last = sub
|
||||
break
|
||||
else:
|
||||
last = None
|
||||
if letter != "H" and letter != "W":
|
||||
# leave last alone if middle letter is H or W
|
||||
last = None
|
||||
if count == 4:
|
||||
break
|
||||
|
||||
result += '0'*(4-count)
|
||||
return ''.join(result)
|
||||
result += "0" * (4 - count)
|
||||
return "".join(result)
|
||||
|
||||
|
||||
def hamming_distance(s1, s2):
|
||||
|
@ -227,28 +243,28 @@ def nysiis(s):
|
|||
_check_type(s)
|
||||
|
||||
if not s:
|
||||
return ''
|
||||
return ""
|
||||
|
||||
s = s.upper()
|
||||
key = []
|
||||
|
||||
# step 1 - prefixes
|
||||
if s.startswith('MAC'):
|
||||
s = 'MCC' + s[3:]
|
||||
elif s.startswith('KN'):
|
||||
if s.startswith("MAC"):
|
||||
s = "MCC" + s[3:]
|
||||
elif s.startswith("KN"):
|
||||
s = s[1:]
|
||||
elif s.startswith('K'):
|
||||
s = 'C' + s[1:]
|
||||
elif s.startswith(('PH', 'PF')):
|
||||
s = 'FF' + s[2:]
|
||||
elif s.startswith('SCH'):
|
||||
s = 'SSS' + s[3:]
|
||||
elif s.startswith("K"):
|
||||
s = "C" + s[1:]
|
||||
elif s.startswith(("PH", "PF")):
|
||||
s = "FF" + s[2:]
|
||||
elif s.startswith("SCH"):
|
||||
s = "SSS" + s[3:]
|
||||
|
||||
# step 2 - suffixes
|
||||
if s.endswith(('IE', 'EE')):
|
||||
s = s[:-2] + 'Y'
|
||||
elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
|
||||
s = s[:-2] + 'D'
|
||||
if s.endswith(("IE", "EE")):
|
||||
s = s[:-2] + "Y"
|
||||
elif s.endswith(("DT", "RT", "RD", "NT", "ND")):
|
||||
s = s[:-2] + "D"
|
||||
|
||||
# step 3 - first character of key comes from name
|
||||
key.append(s[0])
|
||||
|
@ -258,53 +274,57 @@ def nysiis(s):
|
|||
len_s = len(s)
|
||||
while i < len_s:
|
||||
ch = s[i]
|
||||
if ch == 'E' and i+1 < len_s and s[i+1] == 'V':
|
||||
ch = 'AF'
|
||||
if ch == "E" and i + 1 < len_s and s[i + 1] == "V":
|
||||
ch = "AF"
|
||||
i += 1
|
||||
elif ch in 'AEIOU':
|
||||
ch = 'A'
|
||||
elif ch == 'Q':
|
||||
ch = 'G'
|
||||
elif ch == 'Z':
|
||||
ch = 'S'
|
||||
elif ch == 'M':
|
||||
ch = 'N'
|
||||
elif ch == 'K':
|
||||
if i+1 < len(s) and s[i+1] == 'N':
|
||||
ch = 'N'
|
||||
elif ch in "AEIOU":
|
||||
ch = "A"
|
||||
elif ch == "Q":
|
||||
ch = "G"
|
||||
elif ch == "Z":
|
||||
ch = "S"
|
||||
elif ch == "M":
|
||||
ch = "N"
|
||||
elif ch == "K":
|
||||
if i + 1 < len(s) and s[i + 1] == "N":
|
||||
ch = "N"
|
||||
else:
|
||||
ch = 'C'
|
||||
elif ch == 'S' and s[i+1:i+3] == 'CH':
|
||||
ch = 'SS'
|
||||
ch = "C"
|
||||
elif ch == "S" and s[i + 1 : i + 3] == "CH":
|
||||
ch = "SS"
|
||||
i += 2
|
||||
elif ch == 'P' and i+1 < len(s) and s[i+1] == 'H':
|
||||
ch = 'F'
|
||||
elif ch == "P" and i + 1 < len(s) and s[i + 1] == "H":
|
||||
ch = "F"
|
||||
i += 1
|
||||
elif ch == 'H' and (s[i-1] not in 'AEIOU' or (i+1 < len(s) and s[i+1] not in 'AEIOU')):
|
||||
if s[i-1] in 'AEIOU':
|
||||
ch = 'A'
|
||||
elif ch == "H" and (
|
||||
s[i - 1] not in "AEIOU"
|
||||
or (i + 1 < len(s) and s[i + 1] not in "AEIOU")
|
||||
or (i + 1 == len(s))
|
||||
):
|
||||
if s[i - 1] in "AEIOU":
|
||||
ch = "A"
|
||||
else:
|
||||
ch = s[i-1]
|
||||
elif ch == 'W' and s[i-1] in 'AEIOU':
|
||||
ch = s[i-1]
|
||||
ch = s[i - 1]
|
||||
elif ch == "W" and s[i - 1] in "AEIOU":
|
||||
ch = s[i - 1]
|
||||
|
||||
if ch[-1] != key[-1][-1]:
|
||||
key.append(ch)
|
||||
|
||||
i += 1
|
||||
|
||||
key = ''.join(key)
|
||||
key = "".join(key)
|
||||
|
||||
# step 5 - remove trailing S
|
||||
if key.endswith('S') and key != 'S':
|
||||
if key.endswith("S") and key != "S":
|
||||
key = key[:-1]
|
||||
|
||||
# step 6 - replace AY w/ Y
|
||||
if key.endswith('AY'):
|
||||
key = key[:-2] + 'Y'
|
||||
if key.endswith("AY"):
|
||||
key = key[:-2] + "Y"
|
||||
|
||||
# step 7 - remove trailing A
|
||||
if key.endswith('A') and key != 'A':
|
||||
if key.endswith("A") and key != "A":
|
||||
key = key[:-1]
|
||||
|
||||
# step 8 was already done
|
||||
|
@ -315,24 +335,26 @@ def nysiis(s):
|
|||
def match_rating_codex(s):
|
||||
_check_type(s)
|
||||
|
||||
s = s.upper()
|
||||
# we ignore spaces
|
||||
s = s.upper().replace(" ", "")
|
||||
codex = []
|
||||
|
||||
prev = None
|
||||
for i, c in enumerate(s):
|
||||
# not a space OR
|
||||
# starting character & vowel
|
||||
first = True
|
||||
for c in s:
|
||||
# starting character
|
||||
# or consonant not preceded by same consonant
|
||||
if (c != ' ' and (i == 0 and c in 'AEIOU') or (c not in 'AEIOU' and c != prev)):
|
||||
if first or (c not in "AEIOU" and c != prev):
|
||||
codex.append(c)
|
||||
|
||||
prev = c
|
||||
first = False
|
||||
|
||||
# just use first/last 3
|
||||
if len(codex) > 6:
|
||||
return ''.join(codex[:3]+codex[-3:])
|
||||
return "".join(codex[:3] + codex[-3:])
|
||||
else:
|
||||
return ''.join(codex)
|
||||
return "".join(codex)
|
||||
|
||||
|
||||
def match_rating_comparison(s1, s2):
|
||||
|
@ -344,7 +366,7 @@ def match_rating_comparison(s1, s2):
|
|||
res2 = []
|
||||
|
||||
# length differs by 3 or more, no result
|
||||
if abs(len1-len2) >= 3:
|
||||
if abs(len1 - len2) >= 3:
|
||||
return None
|
||||
|
||||
# get minimum rating based on sums of codexes
|
||||
|
@ -359,7 +381,7 @@ def match_rating_comparison(s1, s2):
|
|||
min_rating = 2
|
||||
|
||||
# strip off common prefixes
|
||||
for c1, c2 in _zip_longest(codex1, codex2):
|
||||
for c1, c2 in zip_longest(codex1, codex2):
|
||||
if c1 != c2:
|
||||
if c1:
|
||||
res1.append(c1)
|
||||
|
@ -367,7 +389,7 @@ def match_rating_comparison(s1, s2):
|
|||
res2.append(c2)
|
||||
|
||||
unmatched_count1 = unmatched_count2 = 0
|
||||
for c1, c2 in _zip_longest(reversed(res1), reversed(res2)):
|
||||
for c1, c2 in zip_longest(reversed(res1), reversed(res2)):
|
||||
if c1 != c2:
|
||||
if c1:
|
||||
unmatched_count1 += 1
|
||||
|
@ -385,112 +407,113 @@ def metaphone(s):
|
|||
s = _normalize(s.lower())
|
||||
|
||||
# skip first character if s starts with these
|
||||
if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
|
||||
if s.startswith(("kn", "gn", "pn", "wr", "ae")):
|
||||
s = s[1:]
|
||||
|
||||
i = 0
|
||||
|
||||
while i < len(s):
|
||||
c = s[i]
|
||||
next = s[i+1] if i < len(s)-1 else '*****'
|
||||
nextnext = s[i+2] if i < len(s)-2 else '*****'
|
||||
next = s[i + 1] if i < len(s) - 1 else "*****"
|
||||
nextnext = s[i + 2] if i < len(s) - 2 else "*****"
|
||||
|
||||
# skip doubles except for cc
|
||||
if c == next and c != 'c':
|
||||
if c == next and c != "c":
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if c in 'aeiou':
|
||||
if i == 0 or s[i-1] == ' ':
|
||||
if c in "aeiou":
|
||||
if i == 0 or s[i - 1] == " ":
|
||||
result.append(c)
|
||||
elif c == 'b':
|
||||
if (not (i != 0 and s[i-1] == 'm')) or next:
|
||||
result.append('b')
|
||||
elif c == 'c':
|
||||
if next == 'i' and nextnext == 'a' or next == 'h':
|
||||
result.append('x')
|
||||
elif c == "b":
|
||||
if (not (i != 0 and s[i - 1] == "m")) or next:
|
||||
result.append("b")
|
||||
elif c == "c":
|
||||
if next == "i" and nextnext == "a" or next == "h":
|
||||
result.append("x")
|
||||
i += 1
|
||||
elif next in 'iey':
|
||||
result.append('s')
|
||||
elif next in "iey":
|
||||
result.append("s")
|
||||
i += 1
|
||||
else:
|
||||
result.append('k')
|
||||
elif c == 'd':
|
||||
if next == 'g' and nextnext in 'iey':
|
||||
result.append('j')
|
||||
result.append("k")
|
||||
elif c == "d":
|
||||
if next == "g" and nextnext in "iey":
|
||||
result.append("j")
|
||||
i += 2
|
||||
else:
|
||||
result.append('t')
|
||||
elif c in 'fjlmnr':
|
||||
result.append("t")
|
||||
elif c in "fjlmnr":
|
||||
result.append(c)
|
||||
elif c == 'g':
|
||||
if next in 'iey':
|
||||
result.append('j')
|
||||
elif next not in 'hn':
|
||||
result.append('k')
|
||||
elif next == 'h' and nextnext and nextnext not in 'aeiou':
|
||||
elif c == "g":
|
||||
if next in "iey":
|
||||
result.append("j")
|
||||
elif next == "h" and nextnext and nextnext not in "aeiou":
|
||||
i += 1
|
||||
elif c == 'h':
|
||||
if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
|
||||
result.append('h')
|
||||
elif c == 'k':
|
||||
if i == 0 or s[i-1] != 'c':
|
||||
result.append('k')
|
||||
elif c == 'p':
|
||||
if next == 'h':
|
||||
result.append('f')
|
||||
elif next == "n" and not nextnext:
|
||||
i += 1
|
||||
else:
|
||||
result.append('p')
|
||||
elif c == 'q':
|
||||
result.append('k')
|
||||
elif c == 's':
|
||||
if next == 'h':
|
||||
result.append('x')
|
||||
result.append("k")
|
||||
elif c == "h":
|
||||
if i == 0 or next in "aeiou" or s[i - 1] not in "aeiou":
|
||||
result.append("h")
|
||||
elif c == "k":
|
||||
if i == 0 or s[i - 1] != "c":
|
||||
result.append("k")
|
||||
elif c == "p":
|
||||
if next == "h":
|
||||
result.append("f")
|
||||
i += 1
|
||||
elif next == 'i' and nextnext in 'oa':
|
||||
result.append('x')
|
||||
else:
|
||||
result.append("p")
|
||||
elif c == "q":
|
||||
result.append("k")
|
||||
elif c == "s":
|
||||
if next == "h":
|
||||
result.append("x")
|
||||
i += 1
|
||||
elif next == "i" and nextnext in "oa":
|
||||
result.append("x")
|
||||
i += 2
|
||||
else:
|
||||
result.append('s')
|
||||
elif c == 't':
|
||||
if next == 'i' and nextnext in 'oa':
|
||||
result.append('x')
|
||||
elif next == 'h':
|
||||
result.append('0')
|
||||
result.append("s")
|
||||
elif c == "t":
|
||||
if next == "i" and nextnext in "oa":
|
||||
result.append("x")
|
||||
elif next == "h":
|
||||
result.append("0")
|
||||
i += 1
|
||||
elif next != 'c' or nextnext != 'h':
|
||||
result.append('t')
|
||||
elif c == 'v':
|
||||
result.append('f')
|
||||
elif c == 'w':
|
||||
if i == 0 and next == 'h':
|
||||
elif next != "c" or nextnext != "h":
|
||||
result.append("t")
|
||||
elif c == "v":
|
||||
result.append("f")
|
||||
elif c == "w":
|
||||
if i == 0 and next == "h":
|
||||
i += 1
|
||||
if nextnext in 'aeiou' or nextnext == '*****':
|
||||
result.append('w')
|
||||
elif next in 'aeiou' or next == '*****':
|
||||
result.append('w')
|
||||
elif c == 'x':
|
||||
result.append("w")
|
||||
elif next in "aeiou":
|
||||
result.append("w")
|
||||
elif c == "x":
|
||||
if i == 0:
|
||||
if next == 'h' or (next == 'i' and nextnext in 'oa'):
|
||||
result.append('x')
|
||||
if next == "h" or (next == "i" and nextnext in "oa"):
|
||||
result.append("x")
|
||||
else:
|
||||
result.append('s')
|
||||
result.append("s")
|
||||
else:
|
||||
result.append('k')
|
||||
result.append('s')
|
||||
elif c == 'y':
|
||||
if next in 'aeiou':
|
||||
result.append('y')
|
||||
elif c == 'z':
|
||||
result.append('s')
|
||||
elif c == ' ':
|
||||
if len(result) > 0 and result[-1] != ' ':
|
||||
result.append(' ')
|
||||
result.append("k")
|
||||
result.append("s")
|
||||
elif c == "y":
|
||||
if next in "aeiou":
|
||||
result.append("y")
|
||||
elif c == "z":
|
||||
result.append("s")
|
||||
elif c == " ":
|
||||
if len(result) > 0 and result[-1] != " ":
|
||||
result.append(" ")
|
||||
|
||||
i += 1
|
||||
|
||||
return ''.join(result).upper()
|
||||
return "".join(result).upper()
|
||||
|
||||
|
||||
def porter_stem(s):
|
||||
|
|
BIN
libs/common/jellyfish/cjellyfish.cp37-win_amd64.pyd
Normal file
BIN
libs/common/jellyfish/cjellyfish.cp37-win_amd64.pyd
Normal file
Binary file not shown.
|
@ -1,11 +0,0 @@
|
|||
import sys
|
||||
import itertools
|
||||
|
||||
IS_PY3 = sys.version_info[0] == 3
|
||||
|
||||
if IS_PY3:
|
||||
_range = range
|
||||
_zip_longest = itertools.zip_longest
|
||||
else:
|
||||
_range = xrange
|
||||
_zip_longest = itertools.izip_longest
|
|
@ -1,69 +1,84 @@
|
|||
from .compat import _range
|
||||
|
||||
_s2_options = {
|
||||
'a': ((['a', 't', 'i', 'o', 'n', 'a', 'l'], ['a', 't', 'e']),
|
||||
(['t', 'i', 'o', 'n', 'a', 'l'], ['t', 'i', 'o', 'n'])),
|
||||
'c': ((['e', 'n', 'c', 'i'], ['e', 'n', 'c', 'e']),
|
||||
(['a', 'n', 'c', 'i'], ['a', 'n', 'c', 'e']),),
|
||||
'e': ((['i', 'z', 'e', 'r'], ['i', 'z', 'e']),),
|
||||
'l': ((['b', 'l', 'i'], ['b', 'l', 'e']),
|
||||
(['a', 'l', 'l', 'i'], ['a', 'l']),
|
||||
(['e', 'n', 't', 'l', 'i'], ['e', 'n', 't']),
|
||||
(['e', 'l', 'i'], ['e']),
|
||||
(['o', 'u', 's', 'l', 'i'], ['o', 'u', 's']),),
|
||||
'o': ((['i', 'z', 'a', 't', 'i', 'o', 'n'], ['i', 'z', 'e']),
|
||||
(['a', 't', 'i', 'o', 'n'], ['a', 't', 'e']),
|
||||
(['a', 't', 'o', 'r'], ['a', 't', 'e']),),
|
||||
's': ((['a', 'l', 'i', 's', 'm'], ['a', 'l']),
|
||||
(['i', 'v', 'e', 'n', 'e', 's', 's'], ['i', 'v', 'e']),
|
||||
(['f', 'u', 'l', 'n', 'e', 's', 's'], ['f', 'u', 'l']),
|
||||
(['o', 'u', 's', 'n', 'e', 's', 's'], ['o', 'u', 's']),),
|
||||
't': ((['a', 'l', 'i', 't', 'i'], ['a', 'l']),
|
||||
(['i', 'v', 'i', 't', 'i'], ['i', 'v', 'e']),
|
||||
(['b', 'i', 'l', 'i', 't', 'i'], ['b', 'l', 'e']),),
|
||||
'g': ((['l', 'o', 'g', 'i'], ['l', 'o', 'g']),),
|
||||
"a": (
|
||||
(["a", "t", "i", "o", "n", "a", "l"], ["a", "t", "e"]),
|
||||
(["t", "i", "o", "n", "a", "l"], ["t", "i", "o", "n"]),
|
||||
),
|
||||
"c": (
|
||||
(["e", "n", "c", "i"], ["e", "n", "c", "e"]),
|
||||
(["a", "n", "c", "i"], ["a", "n", "c", "e"]),
|
||||
),
|
||||
"e": ((["i", "z", "e", "r"], ["i", "z", "e"]),),
|
||||
"l": (
|
||||
(["b", "l", "i"], ["b", "l", "e"]),
|
||||
(["a", "l", "l", "i"], ["a", "l"]),
|
||||
(["e", "n", "t", "l", "i"], ["e", "n", "t"]),
|
||||
(["e", "l", "i"], ["e"]),
|
||||
(["o", "u", "s", "l", "i"], ["o", "u", "s"]),
|
||||
),
|
||||
"o": (
|
||||
(["i", "z", "a", "t", "i", "o", "n"], ["i", "z", "e"]),
|
||||
(["a", "t", "i", "o", "n"], ["a", "t", "e"]),
|
||||
(["a", "t", "o", "r"], ["a", "t", "e"]),
|
||||
),
|
||||
"s": (
|
||||
(["a", "l", "i", "s", "m"], ["a", "l"]),
|
||||
(["i", "v", "e", "n", "e", "s", "s"], ["i", "v", "e"]),
|
||||
(["f", "u", "l", "n", "e", "s", "s"], ["f", "u", "l"]),
|
||||
(["o", "u", "s", "n", "e", "s", "s"], ["o", "u", "s"]),
|
||||
),
|
||||
"t": (
|
||||
(["a", "l", "i", "t", "i"], ["a", "l"]),
|
||||
(["i", "v", "i", "t", "i"], ["i", "v", "e"]),
|
||||
(["b", "i", "l", "i", "t", "i"], ["b", "l", "e"]),
|
||||
),
|
||||
"g": ((["l", "o", "g", "i"], ["l", "o", "g"]),),
|
||||
}
|
||||
|
||||
|
||||
_s3_options = {
|
||||
'e': ((['i', 'c', 'a', 't', 'e'], ['i', 'c']),
|
||||
(['a', 't', 'i', 'v', 'e'], []),
|
||||
(['a', 'l', 'i', 'z', 'e'], ['a', 'l']),),
|
||||
'i': ((['i', 'c', 'i', 't', 'i'], ['i', 'c']),),
|
||||
'l': ((['i', 'c', 'a', 'l'], ['i', 'c']),
|
||||
(['f', 'u', 'l'], []),),
|
||||
's': ((['n', 'e', 's', 's'], []),),
|
||||
"e": (
|
||||
(["i", "c", "a", "t", "e"], ["i", "c"]),
|
||||
(["a", "t", "i", "v", "e"], []),
|
||||
(["a", "l", "i", "z", "e"], ["a", "l"]),
|
||||
),
|
||||
"i": ((["i", "c", "i", "t", "i"], ["i", "c"]),),
|
||||
"l": ((["i", "c", "a", "l"], ["i", "c"]), (["f", "u", "l"], [])),
|
||||
"s": ((["n", "e", "s", "s"], []),),
|
||||
}
|
||||
|
||||
_s4_endings = {
|
||||
'a': (['a', 'l'],),
|
||||
'c': (['a', 'n', 'c', 'e'], ['e', 'n', 'c', 'e']),
|
||||
'e': (['e', 'r'],),
|
||||
'i': (['i', 'c'],),
|
||||
'l': (['a', 'b', 'l', 'e'], ['i', 'b', 'l', 'e']),
|
||||
'n': (['a', 'n', 't'], ['e', 'm', 'e', 'n', 't'], ['m', 'e', 'n', 't'],
|
||||
['e', 'n', 't']),
|
||||
"a": (["a", "l"],),
|
||||
"c": (["a", "n", "c", "e"], ["e", "n", "c", "e"]),
|
||||
"e": (["e", "r"],),
|
||||
"i": (["i", "c"],),
|
||||
"l": (["a", "b", "l", "e"], ["i", "b", "l", "e"]),
|
||||
"n": (
|
||||
["a", "n", "t"],
|
||||
["e", "m", "e", "n", "t"],
|
||||
["m", "e", "n", "t"],
|
||||
["e", "n", "t"],
|
||||
),
|
||||
# handle 'o' separately
|
||||
's': (['i', 's', 'm'],),
|
||||
't': (['a', 't', 'e'], ['i', 't', 'i']),
|
||||
'u': (['o', 'u', 's'],),
|
||||
'v': (['i', 'v', 'e'],),
|
||||
'z': (['i', 'z', 'e'],),
|
||||
"s": (["i", "s", "m"],),
|
||||
"t": (["a", "t", "e"], ["i", "t", "i"]),
|
||||
"u": (["o", "u", "s"],),
|
||||
"v": (["i", "v", "e"],),
|
||||
"z": (["i", "z", "e"],),
|
||||
}
|
||||
|
||||
|
||||
class Stemmer(object):
|
||||
def __init__(self, b):
|
||||
self.b = list(b)
|
||||
self.k = len(b)-1
|
||||
self.k = len(b) - 1
|
||||
self.j = 0
|
||||
|
||||
def cons(self, i):
|
||||
""" True iff b[i] is a consonant """
|
||||
if self.b[i] in 'aeiou':
|
||||
if self.b[i] in "aeiou":
|
||||
return False
|
||||
elif self.b[i] == 'y':
|
||||
return True if i == 0 else not self.cons(i-1)
|
||||
elif self.b[i] == "y":
|
||||
return True if i == 0 else not self.cons(i - 1)
|
||||
return True
|
||||
|
||||
def m(self):
|
||||
|
@ -96,31 +111,36 @@ class Stemmer(object):
|
|||
|
||||
def vowel_in_stem(self):
|
||||
""" True iff 0...j contains vowel """
|
||||
for i in _range(0, self.j+1):
|
||||
for i in range(0, self.j + 1):
|
||||
if not self.cons(i):
|
||||
return True
|
||||
return False
|
||||
|
||||
def doublec(self, j):
|
||||
""" True iff j, j-1 contains double consonant """
|
||||
if j < 1 or self.b[j] != self.b[j-1]:
|
||||
if j < 1 or self.b[j] != self.b[j - 1]:
|
||||
return False
|
||||
return self.cons(j)
|
||||
|
||||
def cvc(self, i):
|
||||
""" True iff i-2,i-1,i is consonent-vowel consonant
|
||||
""" True iff i-2,i-1,i is consonant-vowel consonant
|
||||
and if second c isn't w,x, or y.
|
||||
used to restore e at end of short words like cave, love, hope, crime
|
||||
"""
|
||||
if (i < 2 or not self.cons(i) or self.cons(i-1) or not self.cons(i-2) or
|
||||
self.b[i] in 'wxy'):
|
||||
if (
|
||||
i < 2
|
||||
or not self.cons(i)
|
||||
or self.cons(i - 1)
|
||||
or not self.cons(i - 2)
|
||||
or self.b[i] in "wxy"
|
||||
):
|
||||
return False
|
||||
return True
|
||||
|
||||
def ends(self, s):
|
||||
length = len(s)
|
||||
""" True iff 0...k ends with string s """
|
||||
res = (self.b[self.k-length+1:self.k+1] == s)
|
||||
res = self.b[self.k - length + 1 : self.k + 1] == s
|
||||
if res:
|
||||
self.j = self.k - length
|
||||
return res
|
||||
|
@ -128,7 +148,7 @@ class Stemmer(object):
|
|||
def setto(self, s):
|
||||
""" set j+1...k to string s, readjusting k """
|
||||
length = len(s)
|
||||
self.b[self.j+1:self.j+1+length] = s
|
||||
self.b[self.j + 1 : self.j + 1 + length] = s
|
||||
self.k = self.j + length
|
||||
|
||||
def r(self, s):
|
||||
|
@ -136,39 +156,40 @@ class Stemmer(object):
|
|||
self.setto(s)
|
||||
|
||||
def step1ab(self):
|
||||
if self.b[self.k] == 's':
|
||||
if self.ends(['s', 's', 'e', 's']):
|
||||
if self.b[self.k] == "s":
|
||||
if self.ends(["s", "s", "e", "s"]):
|
||||
self.k -= 2
|
||||
elif self.ends(['i', 'e', 's']):
|
||||
self.setto(['i'])
|
||||
elif self.b[self.k-1] != 's':
|
||||
elif self.ends(["i", "e", "s"]):
|
||||
self.setto(["i"])
|
||||
elif self.b[self.k - 1] != "s":
|
||||
self.k -= 1
|
||||
if self.ends(['e', 'e', 'd']):
|
||||
if self.ends(["e", "e", "d"]):
|
||||
if self.m() > 0:
|
||||
self.k -= 1
|
||||
elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g'])) and
|
||||
self.vowel_in_stem()):
|
||||
elif (
|
||||
self.ends(["e", "d"]) or self.ends(["i", "n", "g"])
|
||||
) and self.vowel_in_stem():
|
||||
self.k = self.j
|
||||
if self.ends(['a', 't']):
|
||||
self.setto(['a', 't', 'e'])
|
||||
elif self.ends(['b', 'l']):
|
||||
self.setto(['b', 'l', 'e'])
|
||||
elif self.ends(['i', 'z']):
|
||||
self.setto(['i', 'z', 'e'])
|
||||
if self.ends(["a", "t"]):
|
||||
self.setto(["a", "t", "e"])
|
||||
elif self.ends(["b", "l"]):
|
||||
self.setto(["b", "l", "e"])
|
||||
elif self.ends(["i", "z"]):
|
||||
self.setto(["i", "z", "e"])
|
||||
elif self.doublec(self.k):
|
||||
self.k -= 1
|
||||
if self.b[self.k] in 'lsz':
|
||||
if self.b[self.k] in "lsz":
|
||||
self.k += 1
|
||||
elif self.m() == 1 and self.cvc(self.k):
|
||||
self.setto(['e'])
|
||||
self.setto(["e"])
|
||||
|
||||
def step1c(self):
|
||||
""" turn terminal y into i if there's a vowel in stem """
|
||||
if self.ends(['y']) and self.vowel_in_stem():
|
||||
self.b[self.k] = 'i'
|
||||
if self.ends(["y"]) and self.vowel_in_stem():
|
||||
self.b[self.k] = "i"
|
||||
|
||||
def step2and3(self):
|
||||
for end, repl in _s2_options.get(self.b[self.k-1], []):
|
||||
for end, repl in _s2_options.get(self.b[self.k - 1], []):
|
||||
if self.ends(end):
|
||||
self.r(repl)
|
||||
break
|
||||
|
@ -179,11 +200,13 @@ class Stemmer(object):
|
|||
break
|
||||
|
||||
def step4(self):
|
||||
ch = self.b[self.k-1]
|
||||
ch = self.b[self.k - 1]
|
||||
|
||||
if ch == 'o':
|
||||
if not ((self.ends(['i', 'o', 'n']) and self.b[self.j] in 'st') or
|
||||
self.ends(['o', 'u'])):
|
||||
if ch == "o":
|
||||
if not (
|
||||
(self.ends(["i", "o", "n"]) and self.b[self.j] in "st")
|
||||
or self.ends(["o", "u"])
|
||||
):
|
||||
return
|
||||
else:
|
||||
endings = _s4_endings.get(ch, [])
|
||||
|
@ -198,15 +221,15 @@ class Stemmer(object):
|
|||
|
||||
def step5(self):
|
||||
self.j = self.k
|
||||
if self.b[self.k] == 'e':
|
||||
if self.b[self.k] == "e":
|
||||
a = self.m()
|
||||
if a > 1 or a == 1 and not self.cvc(self.k-1):
|
||||
if a > 1 or a == 1 and not self.cvc(self.k - 1):
|
||||
self.k -= 1
|
||||
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
|
||||
if self.b[self.k] == "l" and self.doublec(self.k) and self.m() > 1:
|
||||
self.k -= 1
|
||||
|
||||
def result(self):
|
||||
return ''.join(self.b[:self.k+1])
|
||||
return "".join(self.b[: self.k + 1])
|
||||
|
||||
def stem(self):
|
||||
if self.k > 1:
|
||||
|
|
0
libs/common/jellyfish/py.typed
Normal file
0
libs/common/jellyfish/py.typed
Normal file
|
@ -1,28 +1,24 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import sys
|
||||
if sys.version_info[0] < 3:
|
||||
import unicodecsv as csv
|
||||
open_kwargs = {}
|
||||
else:
|
||||
import csv
|
||||
open_kwargs = {'encoding': 'utf8'}
|
||||
import csv
|
||||
import platform
|
||||
import pytest
|
||||
|
||||
open_kwargs = {"encoding": "utf8"}
|
||||
|
||||
|
||||
def assertAlmostEqual(a, b, places=3):
|
||||
assert abs(a - b) < (0.1**places)
|
||||
assert abs(a - b) < (0.1 ** places)
|
||||
|
||||
|
||||
if platform.python_implementation() == 'CPython':
|
||||
implementations = ['python', 'c']
|
||||
if platform.python_implementation() == "CPython":
|
||||
implementations = ["python", "c"]
|
||||
else:
|
||||
implementations = ['python']
|
||||
implementations = ["python"]
|
||||
|
||||
|
||||
@pytest.fixture(params=implementations)
|
||||
def jf(request):
|
||||
if request.param == 'python':
|
||||
if request.param == "python":
|
||||
from jellyfish import _jellyfish as jf
|
||||
else:
|
||||
from jellyfish import cjellyfish as jf
|
||||
|
@ -30,64 +26,86 @@ def jf(request):
|
|||
|
||||
|
||||
def _load_data(name):
|
||||
with open('testdata/{}.csv'.format(name), **open_kwargs) as f:
|
||||
with open("testdata/{}.csv".format(name), **open_kwargs) as f:
|
||||
for data in csv.reader(f):
|
||||
yield data
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_winkler'), ids=str)
|
||||
def test_jaro_winkler(jf, s1, s2, value):
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler"), ids=str)
|
||||
def test_jaro_winkler_similarity(jf, s1, s2, value):
|
||||
value = float(value)
|
||||
assertAlmostEqual(jf.jaro_winkler(s1, s2), value, places=3)
|
||||
assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2), value, places=3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_distance'), ids=str)
|
||||
def test_jaro_distance(jf, s1, s2, value):
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler_longtol"), ids=str)
|
||||
def test_jaro_winkler_similarity_longtol(jf, s1, s2, value):
|
||||
value = float(value)
|
||||
assertAlmostEqual(jf.jaro_distance(s1, s2), value, places=3)
|
||||
assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2, True), value, places=3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('hamming'), ids=str)
|
||||
def test_jaro_winkler_deprecation(jf):
|
||||
# backwards compatibility function
|
||||
from jellyfish import jaro_winkler
|
||||
|
||||
with pytest.deprecated_call():
|
||||
assert jaro_winkler("a", "a") == 1
|
||||
|
||||
|
||||
def test_jaro_distance_deprecation():
|
||||
# backwards compatibility function
|
||||
from jellyfish import jaro_distance
|
||||
|
||||
with pytest.deprecated_call():
|
||||
assert jaro_distance("a", "a") == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_distance"), ids=str)
|
||||
def test_jaro_similarity(jf, s1, s2, value):
|
||||
value = float(value)
|
||||
assertAlmostEqual(jf.jaro_similarity(s1, s2), value, places=3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("hamming"), ids=str)
|
||||
def test_hamming_distance(jf, s1, s2, value):
|
||||
value = int(value)
|
||||
assert jf.hamming_distance(s1, s2) == value
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('levenshtein'), ids=str)
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("levenshtein"), ids=str)
|
||||
def test_levenshtein_distance(jf, s1, s2, value):
|
||||
value = int(value)
|
||||
assert jf.levenshtein_distance(s1, s2) == value
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('damerau_levenshtein'), ids=str)
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("damerau_levenshtein"), ids=str)
|
||||
def test_damerau_levenshtein_distance(jf, s1, s2, value):
|
||||
value = int(value)
|
||||
assert jf.damerau_levenshtein_distance(s1, s2) == value
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,code", _load_data('soundex'), ids=str)
|
||||
@pytest.mark.parametrize("s1,code", _load_data("soundex"), ids=str)
|
||||
def test_soundex(jf, s1, code):
|
||||
assert jf.soundex(s1) == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,code", _load_data('metaphone'), ids=str)
|
||||
@pytest.mark.parametrize("s1,code", _load_data("metaphone"), ids=str)
|
||||
def test_metaphone(jf, s1, code):
|
||||
assert jf.metaphone(s1) == code
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2", _load_data('nysiis'), ids=str)
|
||||
@pytest.mark.parametrize("s1,s2", _load_data("nysiis"), ids=str)
|
||||
def test_nysiis(jf, s1, s2):
|
||||
assert jf.nysiis(s1) == s2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2", _load_data('match_rating_codex'), ids=str)
|
||||
@pytest.mark.parametrize("s1,s2", _load_data("match_rating_codex"), ids=str)
|
||||
def test_match_rating_codex(jf, s1, s2):
|
||||
assert jf.match_rating_codex(s1) == s2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data('match_rating_comparison'), ids=str)
|
||||
@pytest.mark.parametrize("s1,s2,value", _load_data("match_rating_comparison"), ids=str)
|
||||
def test_match_rating_comparison(jf, s1, s2, value):
|
||||
value = {'True': True, 'False': False, 'None': None}[value]
|
||||
value = {"True": True, "False": False, "None": None}[value]
|
||||
assert jf.match_rating_comparison(s1, s2) is value
|
||||
|
||||
|
||||
|
@ -96,117 +114,125 @@ def test_match_rating_comparison(jf, s1, s2, value):
|
|||
# def test_porter_stem(jf, a, b):
|
||||
# assert jf.porter_stem(a) == b
|
||||
|
||||
|
||||
def test_porter_stem(jf):
|
||||
with open('testdata/porter.csv', **open_kwargs) as f:
|
||||
with open("testdata/porter.csv", **open_kwargs) as f:
|
||||
reader = csv.reader(f)
|
||||
for (a, b) in reader:
|
||||
assert jf.porter_stem(a) == b
|
||||
|
||||
|
||||
if platform.python_implementation() == 'CPython':
|
||||
if platform.python_implementation() == "CPython":
|
||||
|
||||
def test_match_rating_comparison_segfault():
|
||||
import hashlib
|
||||
from jellyfish import cjellyfish as jf
|
||||
sha1s = [u'{}'.format(hashlib.sha1(str(v).encode('ascii')).hexdigest())
|
||||
for v in range(100)]
|
||||
|
||||
sha1s = [
|
||||
u"{}".format(hashlib.sha1(str(v).encode("ascii")).hexdigest())
|
||||
for v in range(100)
|
||||
]
|
||||
# this segfaulted on 0.1.2
|
||||
assert [[jf.match_rating_comparison(h1, h2) for h1 in sha1s] for h2 in sha1s]
|
||||
|
||||
def test_damerau_levenshtein_unicode_segfault():
|
||||
# unfortunate difference in behavior between Py & C versions
|
||||
# test that unicode works in C & Python versions now
|
||||
from jellyfish.cjellyfish import damerau_levenshtein_distance as c_dl
|
||||
from jellyfish._jellyfish import damerau_levenshtein_distance as py_dl
|
||||
s1 = u'mylifeoutdoors'
|
||||
s2 = u'нахлыст'
|
||||
with pytest.raises(ValueError):
|
||||
c_dl(s1, s2)
|
||||
with pytest.raises(ValueError):
|
||||
c_dl(s2, s1)
|
||||
|
||||
s1 = u"mylifeoutdoors"
|
||||
s2 = u"нахлыст"
|
||||
assert c_dl(s1, s2) == 14
|
||||
assert c_dl(s2, s1) == 14
|
||||
|
||||
assert py_dl(s1, s2) == 14
|
||||
assert py_dl(s2, s1) == 14
|
||||
|
||||
|
||||
def test_jaro_winkler_long_tolerance(jf):
|
||||
no_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=False)
|
||||
with_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=True)
|
||||
no_lt = jf.jaro_winkler_similarity(
|
||||
u"two long strings", u"two long stringz", long_tolerance=False
|
||||
)
|
||||
with_lt = jf.jaro_winkler_similarity(
|
||||
u"two long strings", u"two long stringz", long_tolerance=True
|
||||
)
|
||||
# make sure long_tolerance does something
|
||||
assertAlmostEqual(no_lt, 0.975)
|
||||
assertAlmostEqual(with_lt, 0.984)
|
||||
|
||||
|
||||
def test_damerau_levenshtein_distance_type(jf):
|
||||
jf.damerau_levenshtein_distance(u'abc', u'abc')
|
||||
jf.damerau_levenshtein_distance(u"abc", u"abc")
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.damerau_levenshtein_distance(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.damerau_levenshtein_distance(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_levenshtein_distance_type(jf):
|
||||
assert jf.levenshtein_distance(u'abc', u'abc') == 0
|
||||
assert jf.levenshtein_distance(u"abc", u"abc") == 0
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.levenshtein_distance(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.levenshtein_distance(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_jaro_distance_type(jf):
|
||||
assert jf.jaro_distance(u'abc', u'abc') == 1
|
||||
def test_jaro_similarity_type(jf):
|
||||
assert jf.jaro_similarity(u"abc", u"abc") == 1
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.jaro_distance(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.jaro_similarity(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_jaro_winkler_type(jf):
|
||||
assert jf.jaro_winkler(u'abc', u'abc') == 1
|
||||
assert jf.jaro_winkler_similarity(u"abc", u"abc") == 1
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.jaro_winkler(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.jaro_winkler_similarity(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_mra_comparison_type(jf):
|
||||
assert jf.match_rating_comparison(u'abc', u'abc') is True
|
||||
assert jf.match_rating_comparison(u"abc", u"abc") is True
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.match_rating_comparison(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.match_rating_comparison(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_hamming_type(jf):
|
||||
assert jf.hamming_distance(u'abc', u'abc') == 0
|
||||
assert jf.hamming_distance(u"abc", u"abc") == 0
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.hamming_distance(b'abc', b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.hamming_distance(b"abc", b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_soundex_type(jf):
|
||||
assert jf.soundex(u'ABC') == 'A120'
|
||||
assert jf.soundex(u"ABC") == "A120"
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.soundex(b'ABC')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.soundex(b"ABC")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_metaphone_type(jf):
|
||||
assert jf.metaphone(u'abc') == 'ABK'
|
||||
assert jf.metaphone(u"abc") == "ABK"
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.metaphone(b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.metaphone(b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_nysiis_type(jf):
|
||||
assert jf.nysiis(u'abc') == 'ABC'
|
||||
assert jf.nysiis(u"abc") == "ABC"
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.nysiis(b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.nysiis(b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_mr_codex_type(jf):
|
||||
assert jf.match_rating_codex(u'abc') == 'ABC'
|
||||
assert jf.match_rating_codex(u"abc") == "ABC"
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.match_rating_codex(b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.match_rating_codex(b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
||||
|
||||
def test_porter_type(jf):
|
||||
assert jf.porter_stem(u'abc') == 'abc'
|
||||
assert jf.porter_stem(u"abc") == "abc"
|
||||
with pytest.raises(TypeError) as exc:
|
||||
jf.porter_stem(b'abc')
|
||||
assert 'expected' in str(exc.value)
|
||||
jf.porter_stem(b"abc")
|
||||
assert "expected" in str(exc.value)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue