mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-08-14 10:36:52 -07:00
Move common libs to libs/common
This commit is contained in:
parent
8dbb1a2451
commit
1f4bd41bcc
1612 changed files with 962 additions and 10 deletions
499
libs/common/jellyfish/_jellyfish.py
Normal file
499
libs/common/jellyfish/_jellyfish.py
Normal file
|
@ -0,0 +1,499 @@
|
|||
import unicodedata
|
||||
from collections import defaultdict
|
||||
from .compat import _range, _zip_longest, IS_PY3
|
||||
from .porter import Stemmer
|
||||
|
||||
|
||||
def _normalize(s):
|
||||
return unicodedata.normalize('NFKD', s)
|
||||
|
||||
|
||||
def _check_type(s):
|
||||
if IS_PY3 and not isinstance(s, str):
|
||||
raise TypeError('expected str or unicode, got %s' % type(s).__name__)
|
||||
elif not IS_PY3 and not isinstance(s, unicode):
|
||||
raise TypeError('expected unicode, got %s' % type(s).__name__)
|
||||
|
||||
|
||||
def levenshtein_distance(s1, s2):
|
||||
_check_type(s1)
|
||||
_check_type(s2)
|
||||
|
||||
if s1 == s2:
|
||||
return 0
|
||||
rows = len(s1)+1
|
||||
cols = len(s2)+1
|
||||
|
||||
if not s1:
|
||||
return cols-1
|
||||
if not s2:
|
||||
return rows-1
|
||||
|
||||
prev = None
|
||||
cur = range(cols)
|
||||
for r in _range(1, rows):
|
||||
prev, cur = cur, [r] + [0]*(cols-1)
|
||||
for c in _range(1, cols):
|
||||
deletion = prev[c] + 1
|
||||
insertion = cur[c-1] + 1
|
||||
edit = prev[c-1] + (0 if s1[r-1] == s2[c-1] else 1)
|
||||
cur[c] = min(edit, deletion, insertion)
|
||||
|
||||
return cur[-1]
|
||||
|
||||
|
||||
def _jaro_winkler(ying, yang, long_tolerance, winklerize):
|
||||
_check_type(ying)
|
||||
_check_type(yang)
|
||||
|
||||
ying_len = len(ying)
|
||||
yang_len = len(yang)
|
||||
|
||||
if not ying_len or not yang_len:
|
||||
return 0.0
|
||||
|
||||
min_len = max(ying_len, yang_len)
|
||||
search_range = (min_len // 2) - 1
|
||||
if search_range < 0:
|
||||
search_range = 0
|
||||
|
||||
ying_flags = [False]*ying_len
|
||||
yang_flags = [False]*yang_len
|
||||
|
||||
# looking only within search range, count & flag matched pairs
|
||||
common_chars = 0
|
||||
for i, ying_ch in enumerate(ying):
|
||||
low = i - search_range if i > search_range else 0
|
||||
hi = i + search_range if i + search_range < yang_len else yang_len - 1
|
||||
for j in _range(low, hi+1):
|
||||
if not yang_flags[j] and yang[j] == ying_ch:
|
||||
ying_flags[i] = yang_flags[j] = True
|
||||
common_chars += 1
|
||||
break
|
||||
|
||||
# short circuit if no characters match
|
||||
if not common_chars:
|
||||
return 0.0
|
||||
|
||||
# count transpositions
|
||||
k = trans_count = 0
|
||||
for i, ying_f in enumerate(ying_flags):
|
||||
if ying_f:
|
||||
for j in _range(k, yang_len):
|
||||
if yang_flags[j]:
|
||||
k = j + 1
|
||||
break
|
||||
if ying[i] != yang[j]:
|
||||
trans_count += 1
|
||||
trans_count /= 2
|
||||
|
||||
# adjust for similarities in nonmatched characters
|
||||
common_chars = float(common_chars)
|
||||
weight = ((common_chars/ying_len + common_chars/yang_len +
|
||||
(common_chars-trans_count) / common_chars)) / 3
|
||||
|
||||
# winkler modification: continue to boost if strings are similar
|
||||
if winklerize and weight > 0.7 and ying_len > 3 and yang_len > 3:
|
||||
# adjust for up to first 4 chars in common
|
||||
j = min(min_len, 4)
|
||||
i = 0
|
||||
while i < j and ying[i] == yang[i] and ying[i]:
|
||||
i += 1
|
||||
if i:
|
||||
weight += i * 0.1 * (1.0 - weight)
|
||||
|
||||
# optionally adjust for long strings
|
||||
# after agreeing beginning chars, at least two or more must agree and
|
||||
# agreed characters must be > half of remaining characters
|
||||
if (long_tolerance and min_len > 4 and common_chars > i+1 and
|
||||
2 * common_chars >= min_len + i):
|
||||
weight += ((1.0 - weight) * (float(common_chars-i-1) / float(ying_len+yang_len-i*2+2)))
|
||||
|
||||
return weight
|
||||
|
||||
|
||||
def damerau_levenshtein_distance(s1, s2):
|
||||
_check_type(s1)
|
||||
_check_type(s2)
|
||||
|
||||
len1 = len(s1)
|
||||
len2 = len(s2)
|
||||
infinite = len1 + len2
|
||||
|
||||
# character array
|
||||
da = defaultdict(int)
|
||||
|
||||
# distance matrix
|
||||
score = [[0]*(len2+2) for x in _range(len1+2)]
|
||||
|
||||
score[0][0] = infinite
|
||||
for i in _range(0, len1+1):
|
||||
score[i+1][0] = infinite
|
||||
score[i+1][1] = i
|
||||
for i in _range(0, len2+1):
|
||||
score[0][i+1] = infinite
|
||||
score[1][i+1] = i
|
||||
|
||||
for i in _range(1, len1+1):
|
||||
db = 0
|
||||
for j in _range(1, len2+1):
|
||||
i1 = da[s2[j-1]]
|
||||
j1 = db
|
||||
cost = 1
|
||||
if s1[i-1] == s2[j-1]:
|
||||
cost = 0
|
||||
db = j
|
||||
|
||||
score[i+1][j+1] = min(score[i][j] + cost,
|
||||
score[i+1][j] + 1,
|
||||
score[i][j+1] + 1,
|
||||
score[i1][j1] + (i-i1-1) + 1 + (j-j1-1))
|
||||
da[s1[i-1]] = i
|
||||
|
||||
return score[len1+1][len2+1]
|
||||
|
||||
|
||||
def jaro_distance(s1, s2):
|
||||
return _jaro_winkler(s1, s2, False, False)
|
||||
|
||||
|
||||
def jaro_winkler(s1, s2, long_tolerance=False):
|
||||
return _jaro_winkler(s1, s2, long_tolerance, True)
|
||||
|
||||
|
||||
def soundex(s):
|
||||
|
||||
_check_type(s)
|
||||
|
||||
if not s:
|
||||
return ''
|
||||
|
||||
s = _normalize(s)
|
||||
s = s.upper()
|
||||
|
||||
replacements = (('BFPV', '1'),
|
||||
('CGJKQSXZ', '2'),
|
||||
('DT', '3'),
|
||||
('L', '4'),
|
||||
('MN', '5'),
|
||||
('R', '6'))
|
||||
result = [s[0]]
|
||||
count = 1
|
||||
|
||||
# find would-be replacment for first character
|
||||
for lset, sub in replacements:
|
||||
if s[0] in lset:
|
||||
last = sub
|
||||
break
|
||||
else:
|
||||
last = None
|
||||
|
||||
for letter in s[1:]:
|
||||
for lset, sub in replacements:
|
||||
if letter in lset:
|
||||
if sub != last:
|
||||
result.append(sub)
|
||||
count += 1
|
||||
last = sub
|
||||
break
|
||||
else:
|
||||
last = None
|
||||
if count == 4:
|
||||
break
|
||||
|
||||
result += '0'*(4-count)
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def hamming_distance(s1, s2):
|
||||
_check_type(s1)
|
||||
_check_type(s2)
|
||||
|
||||
# ensure length of s1 >= s2
|
||||
if len(s2) > len(s1):
|
||||
s1, s2 = s2, s1
|
||||
|
||||
# distance is difference in length + differing chars
|
||||
distance = len(s1) - len(s2)
|
||||
for i, c in enumerate(s2):
|
||||
if c != s1[i]:
|
||||
distance += 1
|
||||
|
||||
return distance
|
||||
|
||||
|
||||
def nysiis(s):
|
||||
|
||||
_check_type(s)
|
||||
|
||||
if not s:
|
||||
return ''
|
||||
|
||||
s = s.upper()
|
||||
key = []
|
||||
|
||||
# step 1 - prefixes
|
||||
if s.startswith('MAC'):
|
||||
s = 'MCC' + s[3:]
|
||||
elif s.startswith('KN'):
|
||||
s = s[1:]
|
||||
elif s.startswith('K'):
|
||||
s = 'C' + s[1:]
|
||||
elif s.startswith(('PH', 'PF')):
|
||||
s = 'FF' + s[2:]
|
||||
elif s.startswith('SCH'):
|
||||
s = 'SSS' + s[3:]
|
||||
|
||||
# step 2 - suffixes
|
||||
if s.endswith(('IE', 'EE')):
|
||||
s = s[:-2] + 'Y'
|
||||
elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
|
||||
s = s[:-2] + 'D'
|
||||
|
||||
# step 3 - first character of key comes from name
|
||||
key.append(s[0])
|
||||
|
||||
# step 4 - translate remaining chars
|
||||
i = 1
|
||||
len_s = len(s)
|
||||
while i < len_s:
|
||||
ch = s[i]
|
||||
if ch == 'E' and i+1 < len_s and s[i+1] == 'V':
|
||||
ch = 'AF'
|
||||
i += 1
|
||||
elif ch in 'AEIOU':
|
||||
ch = 'A'
|
||||
elif ch == 'Q':
|
||||
ch = 'G'
|
||||
elif ch == 'Z':
|
||||
ch = 'S'
|
||||
elif ch == 'M':
|
||||
ch = 'N'
|
||||
elif ch == 'K':
|
||||
if i+1 < len(s) and s[i+1] == 'N':
|
||||
ch = 'N'
|
||||
else:
|
||||
ch = 'C'
|
||||
elif ch == 'S' and s[i+1:i+3] == 'CH':
|
||||
ch = 'SS'
|
||||
i += 2
|
||||
elif ch == 'P' and i+1 < len(s) and s[i+1] == 'H':
|
||||
ch = 'F'
|
||||
i += 1
|
||||
elif ch == 'H' and (s[i-1] not in 'AEIOU' or (i+1 < len(s) and s[i+1] not in 'AEIOU')):
|
||||
if s[i-1] in 'AEIOU':
|
||||
ch = 'A'
|
||||
else:
|
||||
ch = s[i-1]
|
||||
elif ch == 'W' and s[i-1] in 'AEIOU':
|
||||
ch = s[i-1]
|
||||
|
||||
if ch[-1] != key[-1][-1]:
|
||||
key.append(ch)
|
||||
|
||||
i += 1
|
||||
|
||||
key = ''.join(key)
|
||||
|
||||
# step 5 - remove trailing S
|
||||
if key.endswith('S') and key != 'S':
|
||||
key = key[:-1]
|
||||
|
||||
# step 6 - replace AY w/ Y
|
||||
if key.endswith('AY'):
|
||||
key = key[:-2] + 'Y'
|
||||
|
||||
# step 7 - remove trailing A
|
||||
if key.endswith('A') and key != 'A':
|
||||
key = key[:-1]
|
||||
|
||||
# step 8 was already done
|
||||
|
||||
return key
|
||||
|
||||
|
||||
def match_rating_codex(s):
|
||||
_check_type(s)
|
||||
|
||||
s = s.upper()
|
||||
codex = []
|
||||
|
||||
prev = None
|
||||
for i, c in enumerate(s):
|
||||
# not a space OR
|
||||
# starting character & vowel
|
||||
# or consonant not preceded by same consonant
|
||||
if (c != ' ' and (i == 0 and c in 'AEIOU') or (c not in 'AEIOU' and c != prev)):
|
||||
codex.append(c)
|
||||
|
||||
prev = c
|
||||
|
||||
# just use first/last 3
|
||||
if len(codex) > 6:
|
||||
return ''.join(codex[:3]+codex[-3:])
|
||||
else:
|
||||
return ''.join(codex)
|
||||
|
||||
|
||||
def match_rating_comparison(s1, s2):
|
||||
codex1 = match_rating_codex(s1)
|
||||
codex2 = match_rating_codex(s2)
|
||||
len1 = len(codex1)
|
||||
len2 = len(codex2)
|
||||
res1 = []
|
||||
res2 = []
|
||||
|
||||
# length differs by 3 or more, no result
|
||||
if abs(len1-len2) >= 3:
|
||||
return None
|
||||
|
||||
# get minimum rating based on sums of codexes
|
||||
lensum = len1 + len2
|
||||
if lensum <= 4:
|
||||
min_rating = 5
|
||||
elif lensum <= 7:
|
||||
min_rating = 4
|
||||
elif lensum <= 11:
|
||||
min_rating = 3
|
||||
else:
|
||||
min_rating = 2
|
||||
|
||||
# strip off common prefixes
|
||||
for c1, c2 in _zip_longest(codex1, codex2):
|
||||
if c1 != c2:
|
||||
if c1:
|
||||
res1.append(c1)
|
||||
if c2:
|
||||
res2.append(c2)
|
||||
|
||||
unmatched_count1 = unmatched_count2 = 0
|
||||
for c1, c2 in _zip_longest(reversed(res1), reversed(res2)):
|
||||
if c1 != c2:
|
||||
if c1:
|
||||
unmatched_count1 += 1
|
||||
if c2:
|
||||
unmatched_count2 += 1
|
||||
|
||||
return (6 - max(unmatched_count1, unmatched_count2)) >= min_rating
|
||||
|
||||
|
||||
def metaphone(s):
|
||||
_check_type(s)
|
||||
|
||||
result = []
|
||||
|
||||
s = _normalize(s.lower())
|
||||
|
||||
# skip first character if s starts with these
|
||||
if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
|
||||
s = s[1:]
|
||||
|
||||
i = 0
|
||||
|
||||
while i < len(s):
|
||||
c = s[i]
|
||||
next = s[i+1] if i < len(s)-1 else '*****'
|
||||
nextnext = s[i+2] if i < len(s)-2 else '*****'
|
||||
|
||||
# skip doubles except for cc
|
||||
if c == next and c != 'c':
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if c in 'aeiou':
|
||||
if i == 0 or s[i-1] == ' ':
|
||||
result.append(c)
|
||||
elif c == 'b':
|
||||
if (not (i != 0 and s[i-1] == 'm')) or next:
|
||||
result.append('b')
|
||||
elif c == 'c':
|
||||
if next == 'i' and nextnext == 'a' or next == 'h':
|
||||
result.append('x')
|
||||
i += 1
|
||||
elif next in 'iey':
|
||||
result.append('s')
|
||||
i += 1
|
||||
else:
|
||||
result.append('k')
|
||||
elif c == 'd':
|
||||
if next == 'g' and nextnext in 'iey':
|
||||
result.append('j')
|
||||
i += 2
|
||||
else:
|
||||
result.append('t')
|
||||
elif c in 'fjlmnr':
|
||||
result.append(c)
|
||||
elif c == 'g':
|
||||
if next in 'iey':
|
||||
result.append('j')
|
||||
elif next not in 'hn':
|
||||
result.append('k')
|
||||
elif next == 'h' and nextnext and nextnext not in 'aeiou':
|
||||
i += 1
|
||||
elif c == 'h':
|
||||
if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
|
||||
result.append('h')
|
||||
elif c == 'k':
|
||||
if i == 0 or s[i-1] != 'c':
|
||||
result.append('k')
|
||||
elif c == 'p':
|
||||
if next == 'h':
|
||||
result.append('f')
|
||||
i += 1
|
||||
else:
|
||||
result.append('p')
|
||||
elif c == 'q':
|
||||
result.append('k')
|
||||
elif c == 's':
|
||||
if next == 'h':
|
||||
result.append('x')
|
||||
i += 1
|
||||
elif next == 'i' and nextnext in 'oa':
|
||||
result.append('x')
|
||||
i += 2
|
||||
else:
|
||||
result.append('s')
|
||||
elif c == 't':
|
||||
if next == 'i' and nextnext in 'oa':
|
||||
result.append('x')
|
||||
elif next == 'h':
|
||||
result.append('0')
|
||||
i += 1
|
||||
elif next != 'c' or nextnext != 'h':
|
||||
result.append('t')
|
||||
elif c == 'v':
|
||||
result.append('f')
|
||||
elif c == 'w':
|
||||
if i == 0 and next == 'h':
|
||||
i += 1
|
||||
if nextnext in 'aeiou' or nextnext == '*****':
|
||||
result.append('w')
|
||||
elif next in 'aeiou' or next == '*****':
|
||||
result.append('w')
|
||||
elif c == 'x':
|
||||
if i == 0:
|
||||
if next == 'h' or (next == 'i' and nextnext in 'oa'):
|
||||
result.append('x')
|
||||
else:
|
||||
result.append('s')
|
||||
else:
|
||||
result.append('k')
|
||||
result.append('s')
|
||||
elif c == 'y':
|
||||
if next in 'aeiou':
|
||||
result.append('y')
|
||||
elif c == 'z':
|
||||
result.append('s')
|
||||
elif c == ' ':
|
||||
if len(result) > 0 and result[-1] != ' ':
|
||||
result.append(' ')
|
||||
|
||||
i += 1
|
||||
|
||||
return ''.join(result).upper()
|
||||
|
||||
|
||||
def porter_stem(s):
|
||||
_check_type(s)
|
||||
|
||||
return Stemmer(s).stem()
|
Loading…
Add table
Add a link
Reference in a new issue