Update vendored beets to 1.6.0

Updates colorama to 0.4.6
Adds confuse version 1.7.0
Updates jellyfish to 0.9.0
Adds mediafile 0.10.1
Updates munkres to 1.1.4
Updates musicbrainzngs to 0.7.1
Updates mutagen to 1.46.0
Updates pyyaml to 6.0
Updates unidecode to 1.3.6
This commit is contained in:
Labrys of Knossos 2022-11-28 18:02:40 -05:00
commit 56c6773c6b
385 changed files with 25143 additions and 18080 deletions

View file

@ -1,6 +1,28 @@
import warnings
try:
from .cjellyfish import * # noqa
from .cjellyfish import * # noqa
library = "C"
except ImportError:
from ._jellyfish import * # noqa
from ._jellyfish import * # noqa
library = "Python"
def jaro_winkler(s1, s2, long_tolerance=False):
    """Deprecated alias for ``jaro_winkler_similarity``.

    Kept only for backwards compatibility; scheduled for removal in
    jellyfish 1.0.  Emits a :class:`DeprecationWarning` and delegates to
    :func:`jaro_winkler_similarity`.

    :param s1: first string to compare
    :param s2: second string to compare
    :param long_tolerance: enable the long-string adjustment (see
        ``jaro_winkler_similarity``)
    :return: similarity score as a float in [0.0, 1.0]
    """
    warnings.warn(
        "the name 'jaro_winkler' is deprecated and will be removed in jellyfish 1.0, "
        "for the same functionality please use jaro_winkler_similarity",
        DeprecationWarning,
        # stacklevel=2 attributes the warning to the caller's line,
        # not to this compatibility shim.
        stacklevel=2,
    )
    return jaro_winkler_similarity(s1, s2, long_tolerance)  # noqa
def jaro_distance(s1, s2):
    """Deprecated alias for ``jaro_similarity``.

    Despite its name this function has always returned the Jaro
    *similarity*, not a distance.  Emits a :class:`DeprecationWarning`
    and delegates to :func:`jaro_similarity`; scheduled for removal in
    jellyfish 1.0.

    :param s1: first string to compare
    :param s2: second string to compare
    :return: similarity score as a float in [0.0, 1.0]
    """
    warnings.warn(
        "the jaro_distance function incorrectly returns the jaro similarity, "
        "replace your usage with jaro_similarity before 1.0",
        DeprecationWarning,
        # stacklevel=2 attributes the warning to the caller's line,
        # not to this compatibility shim.
        stacklevel=2,
    )
    return jaro_similarity(s1, s2)  # noqa

View file

@ -0,0 +1,11 @@
# Type stubs for the public jellyfish API. These signatures cover both
# backends (the C extension and the pure-Python implementation).

# Edit-distance metrics: integer counts of operations.
def levenshtein_distance(s1: str, s2: str) -> int: ...

# Jaro family: similarity scores in [0.0, 1.0].
def jaro_similarity(s1: str, s2: str) -> float: ...
def jaro_winkler_similarity(s1: str, s2: str, long_tolerance: bool = ...) -> float: ...
def damerau_levenshtein_distance(s1: str, s2: str) -> int: ...

# Phonetic encodings: map a word to a sound-based code string.
def soundex(s: str) -> str: ...
def hamming_distance(s1: str, s2: str) -> int: ...
def nysiis(s: str) -> str: ...
def match_rating_codex(s: str) -> str: ...

# Match Rating Approach comparison: True/False match verdict
# (implementations may yield no result for very different lengths).
def match_rating_comparison(s1: str, s2: str) -> bool: ...
def metaphone(s: str) -> str: ...

# Porter stemming: reduce an English word to its stem.
def porter_stem(s: str) -> str: ...

View file

@ -1,18 +1,16 @@
import unicodedata
from collections import defaultdict
from .compat import _range, _zip_longest, IS_PY3
from itertools import zip_longest
from .porter import Stemmer
def _normalize(s):
return unicodedata.normalize('NFKD', s)
return unicodedata.normalize("NFKD", s)
def _check_type(s):
if IS_PY3 and not isinstance(s, str):
raise TypeError('expected str or unicode, got %s' % type(s).__name__)
elif not IS_PY3 and not isinstance(s, unicode):
raise TypeError('expected unicode, got %s' % type(s).__name__)
if not isinstance(s, str):
raise TypeError("expected str or unicode, got %s" % type(s).__name__)
def levenshtein_distance(s1, s2):
@ -21,53 +19,54 @@ def levenshtein_distance(s1, s2):
if s1 == s2:
return 0
rows = len(s1)+1
cols = len(s2)+1
rows = len(s1) + 1
cols = len(s2) + 1
if not s1:
return cols-1
return cols - 1
if not s2:
return rows-1
return rows - 1
prev = None
cur = range(cols)
for r in _range(1, rows):
prev, cur = cur, [r] + [0]*(cols-1)
for c in _range(1, cols):
for r in range(1, rows):
prev, cur = cur, [r] + [0] * (cols - 1)
for c in range(1, cols):
deletion = prev[c] + 1
insertion = cur[c-1] + 1
edit = prev[c-1] + (0 if s1[r-1] == s2[c-1] else 1)
insertion = cur[c - 1] + 1
edit = prev[c - 1] + (0 if s1[r - 1] == s2[c - 1] else 1)
cur[c] = min(edit, deletion, insertion)
return cur[-1]
def _jaro_winkler(ying, yang, long_tolerance, winklerize):
_check_type(ying)
_check_type(yang)
def _jaro_winkler(s1, s2, long_tolerance, winklerize):
_check_type(s1)
_check_type(s2)
ying_len = len(ying)
yang_len = len(yang)
s1_len = len(s1)
s2_len = len(s2)
if not ying_len or not yang_len:
if not s1_len or not s2_len:
return 0.0
min_len = max(ying_len, yang_len)
search_range = (min_len // 2) - 1
min_len = min(s1_len, s2_len)
search_range = max(s1_len, s2_len)
search_range = (search_range // 2) - 1
if search_range < 0:
search_range = 0
ying_flags = [False]*ying_len
yang_flags = [False]*yang_len
s1_flags = [False] * s1_len
s2_flags = [False] * s2_len
# looking only within search range, count & flag matched pairs
common_chars = 0
for i, ying_ch in enumerate(ying):
low = i - search_range if i > search_range else 0
hi = i + search_range if i + search_range < yang_len else yang_len - 1
for j in _range(low, hi+1):
if not yang_flags[j] and yang[j] == ying_ch:
ying_flags[i] = yang_flags[j] = True
for i, s1_ch in enumerate(s1):
low = max(0, i - search_range)
hi = min(i + search_range, s2_len - 1)
for j in range(low, hi + 1):
if not s2_flags[j] and s2[j] == s1_ch:
s1_flags[i] = s2_flags[j] = True
common_chars += 1
break
@ -77,27 +76,32 @@ def _jaro_winkler(ying, yang, long_tolerance, winklerize):
# count transpositions
k = trans_count = 0
for i, ying_f in enumerate(ying_flags):
if ying_f:
for j in _range(k, yang_len):
if yang_flags[j]:
for i, s1_f in enumerate(s1_flags):
if s1_f:
for j in range(k, s2_len):
if s2_flags[j]:
k = j + 1
break
if ying[i] != yang[j]:
if s1[i] != s2[j]:
trans_count += 1
trans_count /= 2
trans_count //= 2
# adjust for similarities in nonmatched characters
common_chars = float(common_chars)
weight = ((common_chars/ying_len + common_chars/yang_len +
(common_chars-trans_count) / common_chars)) / 3
weight = (
(
common_chars / s1_len
+ common_chars / s2_len
+ (common_chars - trans_count) / common_chars
)
) / 3
# winkler modification: continue to boost if strings are similar
if winklerize and weight > 0.7 and ying_len > 3 and yang_len > 3:
if winklerize and weight > 0.7:
# adjust for up to first 4 chars in common
j = min(min_len, 4)
i = 0
while i < j and ying[i] == yang[i] and ying[i]:
while i < j and s1[i] == s2[i]:
i += 1
if i:
weight += i * 0.1 * (1.0 - weight)
@ -105,13 +109,27 @@ def _jaro_winkler(ying, yang, long_tolerance, winklerize):
# optionally adjust for long strings
# after agreeing beginning chars, at least two or more must agree and
# agreed characters must be > half of remaining characters
if (long_tolerance and min_len > 4 and common_chars > i+1 and
2 * common_chars >= min_len + i):
weight += ((1.0 - weight) * (float(common_chars-i-1) / float(ying_len+yang_len-i*2+2)))
if (
long_tolerance
and min_len > 4
and common_chars > i + 1
and 2 * common_chars >= min_len + i
):
weight += (1.0 - weight) * (
float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2)
)
return weight
def jaro_similarity(s1, s2):
return _jaro_winkler(s1, s2, False, False) # noqa
def jaro_winkler_similarity(s1, s2, long_tolerance=False):
return _jaro_winkler(s1, s2, long_tolerance, True) # noqa
def damerau_levenshtein_distance(s1, s2):
_check_type(s1)
_check_type(s2)
@ -124,41 +142,35 @@ def damerau_levenshtein_distance(s1, s2):
da = defaultdict(int)
# distance matrix
score = [[0]*(len2+2) for x in _range(len1+2)]
score = [[0] * (len2 + 2) for x in range(len1 + 2)]
score[0][0] = infinite
for i in _range(0, len1+1):
score[i+1][0] = infinite
score[i+1][1] = i
for i in _range(0, len2+1):
score[0][i+1] = infinite
score[1][i+1] = i
for i in range(0, len1 + 1):
score[i + 1][0] = infinite
score[i + 1][1] = i
for i in range(0, len2 + 1):
score[0][i + 1] = infinite
score[1][i + 1] = i
for i in _range(1, len1+1):
for i in range(1, len1 + 1):
db = 0
for j in _range(1, len2+1):
i1 = da[s2[j-1]]
for j in range(1, len2 + 1):
i1 = da[s2[j - 1]]
j1 = db
cost = 1
if s1[i-1] == s2[j-1]:
if s1[i - 1] == s2[j - 1]:
cost = 0
db = j
score[i+1][j+1] = min(score[i][j] + cost,
score[i+1][j] + 1,
score[i][j+1] + 1,
score[i1][j1] + (i-i1-1) + 1 + (j-j1-1))
da[s1[i-1]] = i
score[i + 1][j + 1] = min(
score[i][j] + cost,
score[i + 1][j] + 1,
score[i][j + 1] + 1,
score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1),
)
da[s1[i - 1]] = i
return score[len1+1][len2+1]
def jaro_distance(s1, s2):
return _jaro_winkler(s1, s2, False, False)
def jaro_winkler(s1, s2, long_tolerance=False):
return _jaro_winkler(s1, s2, long_tolerance, True)
return score[len1 + 1][len2 + 1]
def soundex(s):
@ -166,21 +178,23 @@ def soundex(s):
_check_type(s)
if not s:
return ''
return ""
s = _normalize(s)
s = s.upper()
replacements = (('BFPV', '1'),
('CGJKQSXZ', '2'),
('DT', '3'),
('L', '4'),
('MN', '5'),
('R', '6'))
replacements = (
("BFPV", "1"),
("CGJKQSXZ", "2"),
("DT", "3"),
("L", "4"),
("MN", "5"),
("R", "6"),
)
result = [s[0]]
count = 1
# find would-be replacment for first character
# find would-be replacement for first character
for lset, sub in replacements:
if s[0] in lset:
last = sub
@ -197,12 +211,14 @@ def soundex(s):
last = sub
break
else:
last = None
if letter != "H" and letter != "W":
# leave last alone if middle letter is H or W
last = None
if count == 4:
break
result += '0'*(4-count)
return ''.join(result)
result += "0" * (4 - count)
return "".join(result)
def hamming_distance(s1, s2):
@ -227,28 +243,28 @@ def nysiis(s):
_check_type(s)
if not s:
return ''
return ""
s = s.upper()
key = []
# step 1 - prefixes
if s.startswith('MAC'):
s = 'MCC' + s[3:]
elif s.startswith('KN'):
if s.startswith("MAC"):
s = "MCC" + s[3:]
elif s.startswith("KN"):
s = s[1:]
elif s.startswith('K'):
s = 'C' + s[1:]
elif s.startswith(('PH', 'PF')):
s = 'FF' + s[2:]
elif s.startswith('SCH'):
s = 'SSS' + s[3:]
elif s.startswith("K"):
s = "C" + s[1:]
elif s.startswith(("PH", "PF")):
s = "FF" + s[2:]
elif s.startswith("SCH"):
s = "SSS" + s[3:]
# step 2 - suffixes
if s.endswith(('IE', 'EE')):
s = s[:-2] + 'Y'
elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
s = s[:-2] + 'D'
if s.endswith(("IE", "EE")):
s = s[:-2] + "Y"
elif s.endswith(("DT", "RT", "RD", "NT", "ND")):
s = s[:-2] + "D"
# step 3 - first character of key comes from name
key.append(s[0])
@ -258,53 +274,57 @@ def nysiis(s):
len_s = len(s)
while i < len_s:
ch = s[i]
if ch == 'E' and i+1 < len_s and s[i+1] == 'V':
ch = 'AF'
if ch == "E" and i + 1 < len_s and s[i + 1] == "V":
ch = "AF"
i += 1
elif ch in 'AEIOU':
ch = 'A'
elif ch == 'Q':
ch = 'G'
elif ch == 'Z':
ch = 'S'
elif ch == 'M':
ch = 'N'
elif ch == 'K':
if i+1 < len(s) and s[i+1] == 'N':
ch = 'N'
elif ch in "AEIOU":
ch = "A"
elif ch == "Q":
ch = "G"
elif ch == "Z":
ch = "S"
elif ch == "M":
ch = "N"
elif ch == "K":
if i + 1 < len(s) and s[i + 1] == "N":
ch = "N"
else:
ch = 'C'
elif ch == 'S' and s[i+1:i+3] == 'CH':
ch = 'SS'
ch = "C"
elif ch == "S" and s[i + 1 : i + 3] == "CH":
ch = "SS"
i += 2
elif ch == 'P' and i+1 < len(s) and s[i+1] == 'H':
ch = 'F'
elif ch == "P" and i + 1 < len(s) and s[i + 1] == "H":
ch = "F"
i += 1
elif ch == 'H' and (s[i-1] not in 'AEIOU' or (i+1 < len(s) and s[i+1] not in 'AEIOU')):
if s[i-1] in 'AEIOU':
ch = 'A'
elif ch == "H" and (
s[i - 1] not in "AEIOU"
or (i + 1 < len(s) and s[i + 1] not in "AEIOU")
or (i + 1 == len(s))
):
if s[i - 1] in "AEIOU":
ch = "A"
else:
ch = s[i-1]
elif ch == 'W' and s[i-1] in 'AEIOU':
ch = s[i-1]
ch = s[i - 1]
elif ch == "W" and s[i - 1] in "AEIOU":
ch = s[i - 1]
if ch[-1] != key[-1][-1]:
key.append(ch)
i += 1
key = ''.join(key)
key = "".join(key)
# step 5 - remove trailing S
if key.endswith('S') and key != 'S':
if key.endswith("S") and key != "S":
key = key[:-1]
# step 6 - replace AY w/ Y
if key.endswith('AY'):
key = key[:-2] + 'Y'
if key.endswith("AY"):
key = key[:-2] + "Y"
# step 7 - remove trailing A
if key.endswith('A') and key != 'A':
if key.endswith("A") and key != "A":
key = key[:-1]
# step 8 was already done
@ -315,24 +335,26 @@ def nysiis(s):
def match_rating_codex(s):
_check_type(s)
s = s.upper()
# we ignore spaces
s = s.upper().replace(" ", "")
codex = []
prev = None
for i, c in enumerate(s):
# not a space OR
# starting character & vowel
first = True
for c in s:
# starting character
# or consonant not preceded by same consonant
if (c != ' ' and (i == 0 and c in 'AEIOU') or (c not in 'AEIOU' and c != prev)):
if first or (c not in "AEIOU" and c != prev):
codex.append(c)
prev = c
first = False
# just use first/last 3
if len(codex) > 6:
return ''.join(codex[:3]+codex[-3:])
return "".join(codex[:3] + codex[-3:])
else:
return ''.join(codex)
return "".join(codex)
def match_rating_comparison(s1, s2):
@ -344,7 +366,7 @@ def match_rating_comparison(s1, s2):
res2 = []
# length differs by 3 or more, no result
if abs(len1-len2) >= 3:
if abs(len1 - len2) >= 3:
return None
# get minimum rating based on sums of codexes
@ -359,7 +381,7 @@ def match_rating_comparison(s1, s2):
min_rating = 2
# strip off common prefixes
for c1, c2 in _zip_longest(codex1, codex2):
for c1, c2 in zip_longest(codex1, codex2):
if c1 != c2:
if c1:
res1.append(c1)
@ -367,7 +389,7 @@ def match_rating_comparison(s1, s2):
res2.append(c2)
unmatched_count1 = unmatched_count2 = 0
for c1, c2 in _zip_longest(reversed(res1), reversed(res2)):
for c1, c2 in zip_longest(reversed(res1), reversed(res2)):
if c1 != c2:
if c1:
unmatched_count1 += 1
@ -385,112 +407,113 @@ def metaphone(s):
s = _normalize(s.lower())
# skip first character if s starts with these
if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
if s.startswith(("kn", "gn", "pn", "wr", "ae")):
s = s[1:]
i = 0
while i < len(s):
c = s[i]
next = s[i+1] if i < len(s)-1 else '*****'
nextnext = s[i+2] if i < len(s)-2 else '*****'
next = s[i + 1] if i < len(s) - 1 else "*****"
nextnext = s[i + 2] if i < len(s) - 2 else "*****"
# skip doubles except for cc
if c == next and c != 'c':
if c == next and c != "c":
i += 1
continue
if c in 'aeiou':
if i == 0 or s[i-1] == ' ':
if c in "aeiou":
if i == 0 or s[i - 1] == " ":
result.append(c)
elif c == 'b':
if (not (i != 0 and s[i-1] == 'm')) or next:
result.append('b')
elif c == 'c':
if next == 'i' and nextnext == 'a' or next == 'h':
result.append('x')
elif c == "b":
if (not (i != 0 and s[i - 1] == "m")) or next:
result.append("b")
elif c == "c":
if next == "i" and nextnext == "a" or next == "h":
result.append("x")
i += 1
elif next in 'iey':
result.append('s')
elif next in "iey":
result.append("s")
i += 1
else:
result.append('k')
elif c == 'd':
if next == 'g' and nextnext in 'iey':
result.append('j')
result.append("k")
elif c == "d":
if next == "g" and nextnext in "iey":
result.append("j")
i += 2
else:
result.append('t')
elif c in 'fjlmnr':
result.append("t")
elif c in "fjlmnr":
result.append(c)
elif c == 'g':
if next in 'iey':
result.append('j')
elif next not in 'hn':
result.append('k')
elif next == 'h' and nextnext and nextnext not in 'aeiou':
elif c == "g":
if next in "iey":
result.append("j")
elif next == "h" and nextnext and nextnext not in "aeiou":
i += 1
elif c == 'h':
if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
result.append('h')
elif c == 'k':
if i == 0 or s[i-1] != 'c':
result.append('k')
elif c == 'p':
if next == 'h':
result.append('f')
elif next == "n" and not nextnext:
i += 1
else:
result.append('p')
elif c == 'q':
result.append('k')
elif c == 's':
if next == 'h':
result.append('x')
result.append("k")
elif c == "h":
if i == 0 or next in "aeiou" or s[i - 1] not in "aeiou":
result.append("h")
elif c == "k":
if i == 0 or s[i - 1] != "c":
result.append("k")
elif c == "p":
if next == "h":
result.append("f")
i += 1
elif next == 'i' and nextnext in 'oa':
result.append('x')
else:
result.append("p")
elif c == "q":
result.append("k")
elif c == "s":
if next == "h":
result.append("x")
i += 1
elif next == "i" and nextnext in "oa":
result.append("x")
i += 2
else:
result.append('s')
elif c == 't':
if next == 'i' and nextnext in 'oa':
result.append('x')
elif next == 'h':
result.append('0')
result.append("s")
elif c == "t":
if next == "i" and nextnext in "oa":
result.append("x")
elif next == "h":
result.append("0")
i += 1
elif next != 'c' or nextnext != 'h':
result.append('t')
elif c == 'v':
result.append('f')
elif c == 'w':
if i == 0 and next == 'h':
elif next != "c" or nextnext != "h":
result.append("t")
elif c == "v":
result.append("f")
elif c == "w":
if i == 0 and next == "h":
i += 1
if nextnext in 'aeiou' or nextnext == '*****':
result.append('w')
elif next in 'aeiou' or next == '*****':
result.append('w')
elif c == 'x':
result.append("w")
elif next in "aeiou":
result.append("w")
elif c == "x":
if i == 0:
if next == 'h' or (next == 'i' and nextnext in 'oa'):
result.append('x')
if next == "h" or (next == "i" and nextnext in "oa"):
result.append("x")
else:
result.append('s')
result.append("s")
else:
result.append('k')
result.append('s')
elif c == 'y':
if next in 'aeiou':
result.append('y')
elif c == 'z':
result.append('s')
elif c == ' ':
if len(result) > 0 and result[-1] != ' ':
result.append(' ')
result.append("k")
result.append("s")
elif c == "y":
if next in "aeiou":
result.append("y")
elif c == "z":
result.append("s")
elif c == " ":
if len(result) > 0 and result[-1] != " ":
result.append(" ")
i += 1
return ''.join(result).upper()
return "".join(result).upper()
def porter_stem(s):

Binary file not shown.

View file

@ -1,11 +0,0 @@
import sys
import itertools
IS_PY3 = sys.version_info[0] == 3
if IS_PY3:
_range = range
_zip_longest = itertools.zip_longest
else:
_range = xrange
_zip_longest = itertools.izip_longest

View file

@ -1,69 +1,84 @@
from .compat import _range
_s2_options = {
'a': ((['a', 't', 'i', 'o', 'n', 'a', 'l'], ['a', 't', 'e']),
(['t', 'i', 'o', 'n', 'a', 'l'], ['t', 'i', 'o', 'n'])),
'c': ((['e', 'n', 'c', 'i'], ['e', 'n', 'c', 'e']),
(['a', 'n', 'c', 'i'], ['a', 'n', 'c', 'e']),),
'e': ((['i', 'z', 'e', 'r'], ['i', 'z', 'e']),),
'l': ((['b', 'l', 'i'], ['b', 'l', 'e']),
(['a', 'l', 'l', 'i'], ['a', 'l']),
(['e', 'n', 't', 'l', 'i'], ['e', 'n', 't']),
(['e', 'l', 'i'], ['e']),
(['o', 'u', 's', 'l', 'i'], ['o', 'u', 's']),),
'o': ((['i', 'z', 'a', 't', 'i', 'o', 'n'], ['i', 'z', 'e']),
(['a', 't', 'i', 'o', 'n'], ['a', 't', 'e']),
(['a', 't', 'o', 'r'], ['a', 't', 'e']),),
's': ((['a', 'l', 'i', 's', 'm'], ['a', 'l']),
(['i', 'v', 'e', 'n', 'e', 's', 's'], ['i', 'v', 'e']),
(['f', 'u', 'l', 'n', 'e', 's', 's'], ['f', 'u', 'l']),
(['o', 'u', 's', 'n', 'e', 's', 's'], ['o', 'u', 's']),),
't': ((['a', 'l', 'i', 't', 'i'], ['a', 'l']),
(['i', 'v', 'i', 't', 'i'], ['i', 'v', 'e']),
(['b', 'i', 'l', 'i', 't', 'i'], ['b', 'l', 'e']),),
'g': ((['l', 'o', 'g', 'i'], ['l', 'o', 'g']),),
"a": (
(["a", "t", "i", "o", "n", "a", "l"], ["a", "t", "e"]),
(["t", "i", "o", "n", "a", "l"], ["t", "i", "o", "n"]),
),
"c": (
(["e", "n", "c", "i"], ["e", "n", "c", "e"]),
(["a", "n", "c", "i"], ["a", "n", "c", "e"]),
),
"e": ((["i", "z", "e", "r"], ["i", "z", "e"]),),
"l": (
(["b", "l", "i"], ["b", "l", "e"]),
(["a", "l", "l", "i"], ["a", "l"]),
(["e", "n", "t", "l", "i"], ["e", "n", "t"]),
(["e", "l", "i"], ["e"]),
(["o", "u", "s", "l", "i"], ["o", "u", "s"]),
),
"o": (
(["i", "z", "a", "t", "i", "o", "n"], ["i", "z", "e"]),
(["a", "t", "i", "o", "n"], ["a", "t", "e"]),
(["a", "t", "o", "r"], ["a", "t", "e"]),
),
"s": (
(["a", "l", "i", "s", "m"], ["a", "l"]),
(["i", "v", "e", "n", "e", "s", "s"], ["i", "v", "e"]),
(["f", "u", "l", "n", "e", "s", "s"], ["f", "u", "l"]),
(["o", "u", "s", "n", "e", "s", "s"], ["o", "u", "s"]),
),
"t": (
(["a", "l", "i", "t", "i"], ["a", "l"]),
(["i", "v", "i", "t", "i"], ["i", "v", "e"]),
(["b", "i", "l", "i", "t", "i"], ["b", "l", "e"]),
),
"g": ((["l", "o", "g", "i"], ["l", "o", "g"]),),
}
_s3_options = {
'e': ((['i', 'c', 'a', 't', 'e'], ['i', 'c']),
(['a', 't', 'i', 'v', 'e'], []),
(['a', 'l', 'i', 'z', 'e'], ['a', 'l']),),
'i': ((['i', 'c', 'i', 't', 'i'], ['i', 'c']),),
'l': ((['i', 'c', 'a', 'l'], ['i', 'c']),
(['f', 'u', 'l'], []),),
's': ((['n', 'e', 's', 's'], []),),
"e": (
(["i", "c", "a", "t", "e"], ["i", "c"]),
(["a", "t", "i", "v", "e"], []),
(["a", "l", "i", "z", "e"], ["a", "l"]),
),
"i": ((["i", "c", "i", "t", "i"], ["i", "c"]),),
"l": ((["i", "c", "a", "l"], ["i", "c"]), (["f", "u", "l"], [])),
"s": ((["n", "e", "s", "s"], []),),
}
_s4_endings = {
'a': (['a', 'l'],),
'c': (['a', 'n', 'c', 'e'], ['e', 'n', 'c', 'e']),
'e': (['e', 'r'],),
'i': (['i', 'c'],),
'l': (['a', 'b', 'l', 'e'], ['i', 'b', 'l', 'e']),
'n': (['a', 'n', 't'], ['e', 'm', 'e', 'n', 't'], ['m', 'e', 'n', 't'],
['e', 'n', 't']),
"a": (["a", "l"],),
"c": (["a", "n", "c", "e"], ["e", "n", "c", "e"]),
"e": (["e", "r"],),
"i": (["i", "c"],),
"l": (["a", "b", "l", "e"], ["i", "b", "l", "e"]),
"n": (
["a", "n", "t"],
["e", "m", "e", "n", "t"],
["m", "e", "n", "t"],
["e", "n", "t"],
),
# handle 'o' separately
's': (['i', 's', 'm'],),
't': (['a', 't', 'e'], ['i', 't', 'i']),
'u': (['o', 'u', 's'],),
'v': (['i', 'v', 'e'],),
'z': (['i', 'z', 'e'],),
"s": (["i", "s", "m"],),
"t": (["a", "t", "e"], ["i", "t", "i"]),
"u": (["o", "u", "s"],),
"v": (["i", "v", "e"],),
"z": (["i", "z", "e"],),
}
class Stemmer(object):
def __init__(self, b):
self.b = list(b)
self.k = len(b)-1
self.k = len(b) - 1
self.j = 0
def cons(self, i):
""" True iff b[i] is a consonant """
if self.b[i] in 'aeiou':
if self.b[i] in "aeiou":
return False
elif self.b[i] == 'y':
return True if i == 0 else not self.cons(i-1)
elif self.b[i] == "y":
return True if i == 0 else not self.cons(i - 1)
return True
def m(self):
@ -96,31 +111,36 @@ class Stemmer(object):
def vowel_in_stem(self):
""" True iff 0...j contains vowel """
for i in _range(0, self.j+1):
for i in range(0, self.j + 1):
if not self.cons(i):
return True
return False
def doublec(self, j):
""" True iff j, j-1 contains double consonant """
if j < 1 or self.b[j] != self.b[j-1]:
if j < 1 or self.b[j] != self.b[j - 1]:
return False
return self.cons(j)
def cvc(self, i):
""" True iff i-2,i-1,i is consonent-vowel consonant
""" True iff i-2,i-1,i is consonant-vowel consonant
and if second c isn't w,x, or y.
used to restore e at end of short words like cave, love, hope, crime
"""
if (i < 2 or not self.cons(i) or self.cons(i-1) or not self.cons(i-2) or
self.b[i] in 'wxy'):
if (
i < 2
or not self.cons(i)
or self.cons(i - 1)
or not self.cons(i - 2)
or self.b[i] in "wxy"
):
return False
return True
def ends(self, s):
length = len(s)
""" True iff 0...k ends with string s """
res = (self.b[self.k-length+1:self.k+1] == s)
res = self.b[self.k - length + 1 : self.k + 1] == s
if res:
self.j = self.k - length
return res
@ -128,7 +148,7 @@ class Stemmer(object):
def setto(self, s):
""" set j+1...k to string s, readjusting k """
length = len(s)
self.b[self.j+1:self.j+1+length] = s
self.b[self.j + 1 : self.j + 1 + length] = s
self.k = self.j + length
def r(self, s):
@ -136,39 +156,40 @@ class Stemmer(object):
self.setto(s)
def step1ab(self):
if self.b[self.k] == 's':
if self.ends(['s', 's', 'e', 's']):
if self.b[self.k] == "s":
if self.ends(["s", "s", "e", "s"]):
self.k -= 2
elif self.ends(['i', 'e', 's']):
self.setto(['i'])
elif self.b[self.k-1] != 's':
elif self.ends(["i", "e", "s"]):
self.setto(["i"])
elif self.b[self.k - 1] != "s":
self.k -= 1
if self.ends(['e', 'e', 'd']):
if self.ends(["e", "e", "d"]):
if self.m() > 0:
self.k -= 1
elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g'])) and
self.vowel_in_stem()):
elif (
self.ends(["e", "d"]) or self.ends(["i", "n", "g"])
) and self.vowel_in_stem():
self.k = self.j
if self.ends(['a', 't']):
self.setto(['a', 't', 'e'])
elif self.ends(['b', 'l']):
self.setto(['b', 'l', 'e'])
elif self.ends(['i', 'z']):
self.setto(['i', 'z', 'e'])
if self.ends(["a", "t"]):
self.setto(["a", "t", "e"])
elif self.ends(["b", "l"]):
self.setto(["b", "l", "e"])
elif self.ends(["i", "z"]):
self.setto(["i", "z", "e"])
elif self.doublec(self.k):
self.k -= 1
if self.b[self.k] in 'lsz':
if self.b[self.k] in "lsz":
self.k += 1
elif self.m() == 1 and self.cvc(self.k):
self.setto(['e'])
self.setto(["e"])
def step1c(self):
""" turn terminal y into i if there's a vowel in stem """
if self.ends(['y']) and self.vowel_in_stem():
self.b[self.k] = 'i'
if self.ends(["y"]) and self.vowel_in_stem():
self.b[self.k] = "i"
def step2and3(self):
for end, repl in _s2_options.get(self.b[self.k-1], []):
for end, repl in _s2_options.get(self.b[self.k - 1], []):
if self.ends(end):
self.r(repl)
break
@ -179,11 +200,13 @@ class Stemmer(object):
break
def step4(self):
ch = self.b[self.k-1]
ch = self.b[self.k - 1]
if ch == 'o':
if not ((self.ends(['i', 'o', 'n']) and self.b[self.j] in 'st') or
self.ends(['o', 'u'])):
if ch == "o":
if not (
(self.ends(["i", "o", "n"]) and self.b[self.j] in "st")
or self.ends(["o", "u"])
):
return
else:
endings = _s4_endings.get(ch, [])
@ -198,15 +221,15 @@ class Stemmer(object):
def step5(self):
self.j = self.k
if self.b[self.k] == 'e':
if self.b[self.k] == "e":
a = self.m()
if a > 1 or a == 1 and not self.cvc(self.k-1):
if a > 1 or a == 1 and not self.cvc(self.k - 1):
self.k -= 1
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
if self.b[self.k] == "l" and self.doublec(self.k) and self.m() > 1:
self.k -= 1
def result(self):
return ''.join(self.b[:self.k+1])
return "".join(self.b[: self.k + 1])
def stem(self):
if self.k > 1:

View file

View file

@ -1,28 +1,24 @@
# -*- coding: utf-8 -*-
import sys
if sys.version_info[0] < 3:
import unicodecsv as csv
open_kwargs = {}
else:
import csv
open_kwargs = {'encoding': 'utf8'}
import csv
import platform
import pytest
open_kwargs = {"encoding": "utf8"}
def assertAlmostEqual(a, b, places=3):
assert abs(a - b) < (0.1**places)
assert abs(a - b) < (0.1 ** places)
if platform.python_implementation() == 'CPython':
implementations = ['python', 'c']
if platform.python_implementation() == "CPython":
implementations = ["python", "c"]
else:
implementations = ['python']
implementations = ["python"]
@pytest.fixture(params=implementations)
def jf(request):
if request.param == 'python':
if request.param == "python":
from jellyfish import _jellyfish as jf
else:
from jellyfish import cjellyfish as jf
@ -30,64 +26,86 @@ def jf(request):
def _load_data(name):
with open('testdata/{}.csv'.format(name), **open_kwargs) as f:
with open("testdata/{}.csv".format(name), **open_kwargs) as f:
for data in csv.reader(f):
yield data
@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_winkler'), ids=str)
def test_jaro_winkler(jf, s1, s2, value):
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler"), ids=str)
def test_jaro_winkler_similarity(jf, s1, s2, value):
value = float(value)
assertAlmostEqual(jf.jaro_winkler(s1, s2), value, places=3)
assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2), value, places=3)
@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_distance'), ids=str)
def test_jaro_distance(jf, s1, s2, value):
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler_longtol"), ids=str)
def test_jaro_winkler_similarity_longtol(jf, s1, s2, value):
value = float(value)
assertAlmostEqual(jf.jaro_distance(s1, s2), value, places=3)
assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2, True), value, places=3)
@pytest.mark.parametrize("s1,s2,value", _load_data('hamming'), ids=str)
def test_jaro_winkler_deprecation(jf):
# backwards compatibility function
from jellyfish import jaro_winkler
with pytest.deprecated_call():
assert jaro_winkler("a", "a") == 1
def test_jaro_distance_deprecation():
# backwards compatibility function
from jellyfish import jaro_distance
with pytest.deprecated_call():
assert jaro_distance("a", "a") == 1
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_distance"), ids=str)
def test_jaro_similarity(jf, s1, s2, value):
value = float(value)
assertAlmostEqual(jf.jaro_similarity(s1, s2), value, places=3)
@pytest.mark.parametrize("s1,s2,value", _load_data("hamming"), ids=str)
def test_hamming_distance(jf, s1, s2, value):
value = int(value)
assert jf.hamming_distance(s1, s2) == value
@pytest.mark.parametrize("s1,s2,value", _load_data('levenshtein'), ids=str)
@pytest.mark.parametrize("s1,s2,value", _load_data("levenshtein"), ids=str)
def test_levenshtein_distance(jf, s1, s2, value):
value = int(value)
assert jf.levenshtein_distance(s1, s2) == value
@pytest.mark.parametrize("s1,s2,value", _load_data('damerau_levenshtein'), ids=str)
@pytest.mark.parametrize("s1,s2,value", _load_data("damerau_levenshtein"), ids=str)
def test_damerau_levenshtein_distance(jf, s1, s2, value):
value = int(value)
assert jf.damerau_levenshtein_distance(s1, s2) == value
@pytest.mark.parametrize("s1,code", _load_data('soundex'), ids=str)
@pytest.mark.parametrize("s1,code", _load_data("soundex"), ids=str)
def test_soundex(jf, s1, code):
assert jf.soundex(s1) == code
@pytest.mark.parametrize("s1,code", _load_data('metaphone'), ids=str)
@pytest.mark.parametrize("s1,code", _load_data("metaphone"), ids=str)
def test_metaphone(jf, s1, code):
assert jf.metaphone(s1) == code
@pytest.mark.parametrize("s1,s2", _load_data('nysiis'), ids=str)
@pytest.mark.parametrize("s1,s2", _load_data("nysiis"), ids=str)
def test_nysiis(jf, s1, s2):
assert jf.nysiis(s1) == s2
@pytest.mark.parametrize("s1,s2", _load_data('match_rating_codex'), ids=str)
@pytest.mark.parametrize("s1,s2", _load_data("match_rating_codex"), ids=str)
def test_match_rating_codex(jf, s1, s2):
assert jf.match_rating_codex(s1) == s2
@pytest.mark.parametrize("s1,s2,value", _load_data('match_rating_comparison'), ids=str)
@pytest.mark.parametrize("s1,s2,value", _load_data("match_rating_comparison"), ids=str)
def test_match_rating_comparison(jf, s1, s2, value):
value = {'True': True, 'False': False, 'None': None}[value]
value = {"True": True, "False": False, "None": None}[value]
assert jf.match_rating_comparison(s1, s2) is value
@ -96,117 +114,125 @@ def test_match_rating_comparison(jf, s1, s2, value):
# def test_porter_stem(jf, a, b):
# assert jf.porter_stem(a) == b
def test_porter_stem(jf):
with open('testdata/porter.csv', **open_kwargs) as f:
with open("testdata/porter.csv", **open_kwargs) as f:
reader = csv.reader(f)
for (a, b) in reader:
assert jf.porter_stem(a) == b
if platform.python_implementation() == 'CPython':
if platform.python_implementation() == "CPython":
def test_match_rating_comparison_segfault():
import hashlib
from jellyfish import cjellyfish as jf
sha1s = [u'{}'.format(hashlib.sha1(str(v).encode('ascii')).hexdigest())
for v in range(100)]
sha1s = [
u"{}".format(hashlib.sha1(str(v).encode("ascii")).hexdigest())
for v in range(100)
]
# this segfaulted on 0.1.2
assert [[jf.match_rating_comparison(h1, h2) for h1 in sha1s] for h2 in sha1s]
def test_damerau_levenshtein_unicode_segfault():
# unfortunate difference in behavior between Py & C versions
# test that unicode works in C & Python versions now
from jellyfish.cjellyfish import damerau_levenshtein_distance as c_dl
from jellyfish._jellyfish import damerau_levenshtein_distance as py_dl
s1 = u'mylifeoutdoors'
s2 = u'нахлыст'
with pytest.raises(ValueError):
c_dl(s1, s2)
with pytest.raises(ValueError):
c_dl(s2, s1)
s1 = u"mylifeoutdoors"
s2 = u"нахлыст"
assert c_dl(s1, s2) == 14
assert c_dl(s2, s1) == 14
assert py_dl(s1, s2) == 14
assert py_dl(s2, s1) == 14
def test_jaro_winkler_long_tolerance(jf):
no_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=False)
with_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=True)
no_lt = jf.jaro_winkler_similarity(
u"two long strings", u"two long stringz", long_tolerance=False
)
with_lt = jf.jaro_winkler_similarity(
u"two long strings", u"two long stringz", long_tolerance=True
)
# make sure long_tolerance does something
assertAlmostEqual(no_lt, 0.975)
assertAlmostEqual(with_lt, 0.984)
def test_damerau_levenshtein_distance_type(jf):
jf.damerau_levenshtein_distance(u'abc', u'abc')
jf.damerau_levenshtein_distance(u"abc", u"abc")
with pytest.raises(TypeError) as exc:
jf.damerau_levenshtein_distance(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.damerau_levenshtein_distance(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_levenshtein_distance_type(jf):
assert jf.levenshtein_distance(u'abc', u'abc') == 0
assert jf.levenshtein_distance(u"abc", u"abc") == 0
with pytest.raises(TypeError) as exc:
jf.levenshtein_distance(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.levenshtein_distance(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_jaro_distance_type(jf):
assert jf.jaro_distance(u'abc', u'abc') == 1
def test_jaro_similarity_type(jf):
assert jf.jaro_similarity(u"abc", u"abc") == 1
with pytest.raises(TypeError) as exc:
jf.jaro_distance(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.jaro_similarity(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_jaro_winkler_type(jf):
assert jf.jaro_winkler(u'abc', u'abc') == 1
assert jf.jaro_winkler_similarity(u"abc", u"abc") == 1
with pytest.raises(TypeError) as exc:
jf.jaro_winkler(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.jaro_winkler_similarity(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_mra_comparison_type(jf):
assert jf.match_rating_comparison(u'abc', u'abc') is True
assert jf.match_rating_comparison(u"abc", u"abc") is True
with pytest.raises(TypeError) as exc:
jf.match_rating_comparison(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.match_rating_comparison(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_hamming_type(jf):
assert jf.hamming_distance(u'abc', u'abc') == 0
assert jf.hamming_distance(u"abc", u"abc") == 0
with pytest.raises(TypeError) as exc:
jf.hamming_distance(b'abc', b'abc')
assert 'expected' in str(exc.value)
jf.hamming_distance(b"abc", b"abc")
assert "expected" in str(exc.value)
def test_soundex_type(jf):
assert jf.soundex(u'ABC') == 'A120'
assert jf.soundex(u"ABC") == "A120"
with pytest.raises(TypeError) as exc:
jf.soundex(b'ABC')
assert 'expected' in str(exc.value)
jf.soundex(b"ABC")
assert "expected" in str(exc.value)
def test_metaphone_type(jf):
assert jf.metaphone(u'abc') == 'ABK'
assert jf.metaphone(u"abc") == "ABK"
with pytest.raises(TypeError) as exc:
jf.metaphone(b'abc')
assert 'expected' in str(exc.value)
jf.metaphone(b"abc")
assert "expected" in str(exc.value)
def test_nysiis_type(jf):
assert jf.nysiis(u'abc') == 'ABC'
assert jf.nysiis(u"abc") == "ABC"
with pytest.raises(TypeError) as exc:
jf.nysiis(b'abc')
assert 'expected' in str(exc.value)
jf.nysiis(b"abc")
assert "expected" in str(exc.value)
def test_mr_codex_type(jf):
assert jf.match_rating_codex(u'abc') == 'ABC'
assert jf.match_rating_codex(u"abc") == "ABC"
with pytest.raises(TypeError) as exc:
jf.match_rating_codex(b'abc')
assert 'expected' in str(exc.value)
jf.match_rating_codex(b"abc")
assert "expected" in str(exc.value)
def test_porter_type(jf):
assert jf.porter_stem(u'abc') == 'abc'
assert jf.porter_stem(u"abc") == "abc"
with pytest.raises(TypeError) as exc:
jf.porter_stem(b'abc')
assert 'expected' in str(exc.value)
jf.porter_stem(b"abc")
assert "expected" in str(exc.value)