mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-08-21 13:53:15 -07:00
Move common libs to libs/common
This commit is contained in:
parent
8dbb1a2451
commit
1f4bd41bcc
1612 changed files with 962 additions and 10 deletions
6
libs/common/jellyfish/__init__.py
Normal file
6
libs/common/jellyfish/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Prefer the compiled C extension for speed; fall back to the pure-Python
# implementation when cjellyfish is not built/installed.  `library` records
# which backend is active so callers/tests can introspect it.
try:
    from .cjellyfish import *  # noqa
    library = "C"
except ImportError:
    from ._jellyfish import *  # noqa
    library = "Python"
|
499
libs/common/jellyfish/_jellyfish.py
Normal file
499
libs/common/jellyfish/_jellyfish.py
Normal file
|
@ -0,0 +1,499 @@
|
|||
import unicodedata
|
||||
from collections import defaultdict
|
||||
from .compat import _range, _zip_longest, IS_PY3
|
||||
from .porter import Stemmer
|
||||
|
||||
|
||||
def _normalize(s):
    """Return *s* normalized to Unicode NFKD (compatibility decomposition)."""
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed
|
||||
|
||||
|
||||
def _check_type(s):
    """Raise TypeError unless *s* is a text string.

    On Python 3 that means ``str``; on Python 2, ``unicode``.  The Py2
    branch is only reached when IS_PY3 is false, so the ``unicode`` name
    is never looked up on Py3.
    """
    if IS_PY3:
        if not isinstance(s, str):
            raise TypeError('expected str or unicode, got %s' % type(s).__name__)
    elif not isinstance(s, unicode):
        raise TypeError('expected unicode, got %s' % type(s).__name__)
|
||||
|
||||
|
||||
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two strings.

    Uses the classic two-row dynamic-programming formulation: only the
    previous and current rows of the full distance matrix are kept.
    """
    _check_type(s1)
    _check_type(s2)

    # identical strings need no edits at all
    if s1 == s2:
        return 0

    rows = len(s1) + 1
    cols = len(s2) + 1

    # if either string is empty the distance is the other string's length
    if not s1:
        return cols - 1
    if not s2:
        return rows - 1

    # row 0 is simply 0..cols-1 (cost of inserting each prefix of s2)
    current = range(cols)
    for row in _range(1, rows):
        previous, current = current, [row] + [0] * (cols - 1)
        for col in _range(1, cols):
            cost_delete = previous[col] + 1
            cost_insert = current[col - 1] + 1
            same = s1[row - 1] == s2[col - 1]
            cost_substitute = previous[col - 1] + (0 if same else 1)
            current[col] = min(cost_substitute, cost_delete, cost_insert)

    return current[-1]
|
||||
|
||||
|
||||
def _jaro_winkler(ying, yang, long_tolerance, winklerize):
    """Core Jaro / Jaro-Winkler similarity in [0.0, 1.0].

    When ``winklerize`` is true a bonus is added for a common prefix; when
    ``long_tolerance`` is also true a further adjustment is made for long
    strings.  Mirrors the C implementation's behavior, quirks included.
    """
    _check_type(ying)
    _check_type(yang)

    ying_len = len(ying)
    yang_len = len(yang)

    # empty input on either side -> no similarity
    if not ying_len or not yang_len:
        return 0.0

    # NOTE(review): despite the name, min_len holds the LONGER of the two
    # lengths (max) -- confirm against the reference C code before renaming
    # or "fixing"; the search_range/prefix math below depends on it as-is.
    min_len = max(ying_len, yang_len)
    search_range = (min_len // 2) - 1
    if search_range < 0:
        search_range = 0

    ying_flags = [False]*ying_len
    yang_flags = [False]*yang_len

    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, ying_ch in enumerate(ying):
        low = i - search_range if i > search_range else 0
        hi = i + search_range if i + search_range < yang_len else yang_len - 1
        for j in _range(low, hi+1):
            if not yang_flags[j] and yang[j] == ying_ch:
                ying_flags[i] = yang_flags[j] = True
                common_chars += 1
                break

    # short circuit if no characters match
    if not common_chars:
        return 0.0

    # count transpositions: walk flagged positions in both strings in order
    k = trans_count = 0
    for i, ying_f in enumerate(ying_flags):
        if ying_f:
            # NOTE(review): relies on the leaked loop variable `j` after the
            # inner loop; when no flagged yang position remains, `j` keeps
            # its last value -- matches the reference implementation.
            for j in _range(k, yang_len):
                if yang_flags[j]:
                    k = j + 1
                    break
            if ying[i] != yang[j]:
                trans_count += 1
    # each transposition was counted twice (true division: Py3 float here)
    trans_count /= 2

    # adjust for similarities in nonmatched characters
    common_chars = float(common_chars)
    weight = ((common_chars/ying_len + common_chars/yang_len +
              (common_chars-trans_count) / common_chars)) / 3

    # winkler modification: continue to boost if strings are similar
    if winklerize and weight > 0.7 and ying_len > 3 and yang_len > 3:
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and ying[i] == yang[i] and ying[i]:
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if (long_tolerance and min_len > 4 and common_chars > i+1 and
                2 * common_chars >= min_len + i):
            weight += ((1.0 - weight) * (float(common_chars-i-1) / float(ying_len+yang_len-i*2+2)))

    return weight
|
||||
|
||||
|
||||
def damerau_levenshtein_distance(s1, s2):
    """Return the Damerau-Levenshtein distance between two strings.

    Like Levenshtein distance but also counts transposition of two
    adjacent characters as a single edit.  This is the full (unrestricted)
    variant using the ``da``/``db`` last-occurrence bookkeeping.
    """
    _check_type(s1)
    _check_type(s2)

    len1 = len(s1)
    len2 = len(s2)
    # an upper bound on the distance, used as the matrix "infinity" sentinel
    infinite = len1 + len2

    # character array: last row in which each character of s1 was seen
    da = defaultdict(int)

    # distance matrix, padded with an extra sentinel row/column at index 0
    score = [[0]*(len2+2) for x in _range(len1+2)]

    score[0][0] = infinite
    for i in _range(0, len1+1):
        score[i+1][0] = infinite
        score[i+1][1] = i
    for i in _range(0, len2+1):
        score[0][i+1] = infinite
        score[1][i+1] = i

    for i in _range(1, len1+1):
        # db: last column in this row where the characters matched
        db = 0
        for j in _range(1, len2+1):
            i1 = da[s2[j-1]]
            j1 = db
            cost = 1
            if s1[i-1] == s2[j-1]:
                cost = 0
                db = j

            # min over substitution, insertion, deletion, and transposition
            # (the last term bridges back to the previous occurrence pair)
            score[i+1][j+1] = min(score[i][j] + cost,
                                  score[i+1][j] + 1,
                                  score[i][j+1] + 1,
                                  score[i1][j1] + (i-i1-1) + 1 + (j-j1-1))
        da[s1[i-1]] = i

    return score[len1+1][len2+1]
|
||||
|
||||
|
||||
def jaro_distance(s1, s2):
    """Return the plain Jaro similarity of *s1* and *s2* (no prefix bonus)."""
    return _jaro_winkler(s1, s2, long_tolerance=False, winklerize=False)
|
||||
|
||||
|
||||
def jaro_winkler(s1, s2, long_tolerance=False):
    """Return the Jaro-Winkler similarity: Jaro plus a common-prefix bonus."""
    return _jaro_winkler(s1, s2, long_tolerance=long_tolerance, winklerize=True)
|
||||
|
||||
|
||||
def soundex(s):
    """Return the 4-character American Soundex code of *s*.

    The first letter is kept verbatim; subsequent consonants map to digit
    groups (B/F/P/V -> 1, ...), consecutive identical digits collapse, and
    the result is zero-padded or truncated to length 4.
    """
    _check_type(s)

    if not s:
        return ''

    s = _normalize(s)
    s = s.upper()

    # build a letter -> digit lookup from the classic Soundex groups;
    # letters absent from every group (vowels, H, W, Y, ...) map to None
    digit_for = {}
    for group, digit in (('BFPV', '1'), ('CGJKQSXZ', '2'), ('DT', '3'),
                         ('L', '4'), ('MN', '5'), ('R', '6')):
        for letter in group:
            digit_for[letter] = digit

    result = [s[0]]
    count = 1
    # digit the first character WOULD encode to, so an immediately
    # repeated group is not emitted twice
    last = digit_for.get(s[0])

    for char in s[1:]:
        digit = digit_for.get(char)
        if digit is not None:
            if digit != last:
                result.append(digit)
                count += 1
            last = digit
        else:
            # ungrouped letters reset the repeat suppression
            last = None
        if count == 4:
            break

    # pad to exactly four characters
    result += '0' * (4 - count)
    return ''.join(result)
|
||||
|
||||
|
||||
def hamming_distance(s1, s2):
    """Return the Hamming distance, extended to unequal-length strings.

    Counts positions where the strings differ; every character beyond the
    shorter string's length also counts as one difference.
    """
    _check_type(s1)
    _check_type(s2)

    # ensure length of s1 >= s2
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    # length surplus plus position-wise mismatches over the overlap
    distance = len(s1) - len(s2)
    distance += sum(1 for a, b in zip(s1, s2) if a != b)

    return distance
|
||||
|
||||
|
||||
def nysiis(s):
    """Return the NYSIIS phonetic code of *s*.

    New York State Identification and Intelligence System: rewrites
    prefixes/suffixes, transcodes the remaining letters left-to-right, and
    cleans up the tail.  Matches the C implementation's behavior.
    """
    _check_type(s)

    if not s:
        return ''

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith('MAC'):
        s = 'MCC' + s[3:]
    elif s.startswith('KN'):
        s = s[1:]
    elif s.startswith('K'):
        s = 'C' + s[1:]
    elif s.startswith(('PH', 'PF')):
        s = 'FF' + s[2:]
    elif s.startswith('SCH'):
        s = 'SSS' + s[3:]

    # step 2 - suffixes
    if s.endswith(('IE', 'EE')):
        s = s[:-2] + 'Y'
    elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    # NOTE(review): len_s is computed once but some branches below use
    # len(s) directly; s is not mutated in this loop so they agree.
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        # EV -> AF (consumes the V as well)
        if ch == 'E' and i+1 < len_s and s[i+1] == 'V':
            ch = 'AF'
            i += 1
        elif ch in 'AEIOU':
            ch = 'A'
        elif ch == 'Q':
            ch = 'G'
        elif ch == 'Z':
            ch = 'S'
        elif ch == 'M':
            ch = 'N'
        elif ch == 'K':
            if i+1 < len(s) and s[i+1] == 'N':
                ch = 'N'
            else:
                ch = 'C'
        # SCH -> SS (slice is safely empty past the end of s)
        elif ch == 'S' and s[i+1:i+3] == 'CH':
            ch = 'SS'
            i += 2
        elif ch == 'P' and i+1 < len(s) and s[i+1] == 'H':
            ch = 'F'
            i += 1
        # H is dropped/replaced unless flanked by vowels on both sides
        elif ch == 'H' and (s[i-1] not in 'AEIOU' or (i+1 < len(s) and s[i+1] not in 'AEIOU')):
            if s[i-1] in 'AEIOU':
                ch = 'A'
            else:
                ch = s[i-1]
        elif ch == 'W' and s[i-1] in 'AEIOU':
            ch = s[i-1]

        # append only when it does not repeat the key's last character
        # (ch may be two chars, e.g. 'AF'/'SS', hence the [-1] indexing)
        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = ''.join(key)

    # step 5 - remove trailing S
    if key.endswith('S') and key != 'S':
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith('AY'):
        key = key[:-2] + 'Y'

    # step 7 - remove trailing A
    if key.endswith('A') and key != 'A':
        key = key[:-1]

    # step 8 was already done

    return key
|
||||
|
||||
|
||||
def match_rating_codex(s):
    """Return the Match Rating Approach codex (phonetic encoding) of *s*.

    Keeps a leading vowel, keeps consonants that are not immediate
    repeats, and truncates long results to the first three plus last
    three characters.
    """
    _check_type(s)

    s = s.upper()
    codex = []

    prev = None
    for i, c in enumerate(s):
        # not a space OR
        # starting character & vowel
        # or consonant not preceded by same consonant
        # NOTE(review): Python precedence parses this condition as
        # (c != ' ' and i == 0 and c in 'AEIOU') or
        # (c not in 'AEIOU' and c != prev) -- so the space guard does NOT
        # apply to the second disjunct and a space differing from `prev`
        # would be appended.  Confirm against the C implementation before
        # changing; inputs are normally single alphabetic names.
        if (c != ' ' and (i == 0 and c in 'AEIOU') or (c not in 'AEIOU' and c != prev)):
            codex.append(c)

        prev = c

    # just use first/last 3
    if len(codex) > 6:
        return ''.join(codex[:3]+codex[-3:])
    else:
        return ''.join(codex)
|
||||
|
||||
|
||||
def match_rating_comparison(s1, s2):
    """Compare two strings with the Match Rating Approach.

    Returns True when the names are considered a match, False when not,
    and None when the codexes differ too much in length to be compared.
    """
    codex1 = match_rating_codex(s1)
    codex2 = match_rating_codex(s2)
    len1 = len(codex1)
    len2 = len(codex2)

    # codex lengths differing by 3 or more -> comparison is undefined
    if abs(len1 - len2) >= 3:
        return None

    # minimum rating threshold, derived from the combined codex length
    lensum = len1 + len2
    if lensum <= 4:
        minimum_rating = 5
    elif lensum <= 7:
        minimum_rating = 4
    elif lensum <= 11:
        minimum_rating = 3
    else:
        minimum_rating = 2

    # first pass: drop characters that agree position-wise from the left
    leftover1 = []
    leftover2 = []
    for ch1, ch2 in _zip_longest(codex1, codex2):
        if ch1 == ch2:
            continue
        if ch1:
            leftover1.append(ch1)
        if ch2:
            leftover2.append(ch2)

    # second pass: compare the leftovers from the right and count what
    # still disagrees on each side
    unmatched1 = unmatched2 = 0
    for ch1, ch2 in _zip_longest(reversed(leftover1), reversed(leftover2)):
        if ch1 == ch2:
            continue
        if ch1:
            unmatched1 += 1
        if ch2:
            unmatched2 += 1

    similarity = 6 - max(unmatched1, unmatched2)
    return similarity >= minimum_rating
|
||||
|
||||
|
||||
def metaphone(s):
    """Return the Metaphone phonetic encoding of *s*, uppercased.

    Walks the lowercased, NFKD-normalized string one character at a time,
    emitting the Metaphone symbol for each letter in context ('0' stands
    for the TH sound, 'x' for SH/CH).  Matches the C implementation.
    """
    _check_type(s)

    result = []

    s = _normalize(s.lower())

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        # NOTE(review): `next` shadows the builtin; '*****' is a sentinel
        # for "past end of string" that can never equal a real character.
        next = s[i+1] if i < len(s)-1 else '*****'
        nextnext = s[i+2] if i < len(s)-2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            # vowels are kept only at the start of a word
            if i == 0 or s[i-1] == ' ':
                result.append(c)
        elif c == 'b':
            # silent b in terminal 'mb' (next is '*****' -> truthy, so
            # this only drops b when preceded by m at the very end)
            if (not (i != 0 and s[i-1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            # h is kept at word start, before a vowel, or after a consonant
            if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            # k after c is already covered by the 'c' branch
            if i == 0 or s[i-1] != 'c':
                result.append('k')
        elif c == 'p':
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            if i == 0 and next == 'h':
                i += 1
                if nextnext in 'aeiou' or nextnext == '*****':
                    result.append('w')
            elif next in 'aeiou' or next == '*****':
                result.append('w')
        elif c == 'x':
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            # collapse runs of spaces
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()
|
||||
|
||||
|
||||
def porter_stem(s):
    """Return the Porter stem of *s* (delegates to porter.Stemmer)."""
    _check_type(s)

    stemmer = Stemmer(s)
    return stemmer.stem()
|
11
libs/common/jellyfish/compat.py
Normal file
11
libs/common/jellyfish/compat.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
import sys
import itertools

# True when running under Python 3.x
IS_PY3 = sys.version_info[0] == 3

# Version-neutral aliases used throughout the package: on Py2 pick the
# lazy/iterator variants so semantics match Py3.
if IS_PY3:
    _range = range
    _zip_longest = itertools.zip_longest
else:
    _range = xrange
    _zip_longest = itertools.izip_longest
|
218
libs/common/jellyfish/porter.py
Normal file
218
libs/common/jellyfish/porter.py
Normal file
|
@ -0,0 +1,218 @@
|
|||
from .compat import _range

# Porter step 2 suffix rewrites, keyed by the word's PENULTIMATE letter.
# Each entry pairs a suffix (as a list of chars, matching Stemmer.ends's
# list-slice comparison) with its replacement; applied only when the
# stem's measure m() > 0.
_s2_options = {
    'a': ((['a', 't', 'i', 'o', 'n', 'a', 'l'], ['a', 't', 'e']),
          (['t', 'i', 'o', 'n', 'a', 'l'], ['t', 'i', 'o', 'n'])),
    'c': ((['e', 'n', 'c', 'i'], ['e', 'n', 'c', 'e']),
          (['a', 'n', 'c', 'i'], ['a', 'n', 'c', 'e']),),
    'e': ((['i', 'z', 'e', 'r'], ['i', 'z', 'e']),),
    'l': ((['b', 'l', 'i'], ['b', 'l', 'e']),
          (['a', 'l', 'l', 'i'], ['a', 'l']),
          (['e', 'n', 't', 'l', 'i'], ['e', 'n', 't']),
          (['e', 'l', 'i'], ['e']),
          (['o', 'u', 's', 'l', 'i'], ['o', 'u', 's']),),
    'o': ((['i', 'z', 'a', 't', 'i', 'o', 'n'], ['i', 'z', 'e']),
          (['a', 't', 'i', 'o', 'n'], ['a', 't', 'e']),
          (['a', 't', 'o', 'r'], ['a', 't', 'e']),),
    's': ((['a', 'l', 'i', 's', 'm'], ['a', 'l']),
          (['i', 'v', 'e', 'n', 'e', 's', 's'], ['i', 'v', 'e']),
          (['f', 'u', 'l', 'n', 'e', 's', 's'], ['f', 'u', 'l']),
          (['o', 'u', 's', 'n', 'e', 's', 's'], ['o', 'u', 's']),),
    't': ((['a', 'l', 'i', 't', 'i'], ['a', 'l']),
          (['i', 'v', 'i', 't', 'i'], ['i', 'v', 'e']),
          (['b', 'i', 'l', 'i', 't', 'i'], ['b', 'l', 'e']),),
    'g': ((['l', 'o', 'g', 'i'], ['l', 'o', 'g']),),
}


# Porter step 3 suffix rewrites, keyed by the word's FINAL letter.
_s3_options = {
    'e': ((['i', 'c', 'a', 't', 'e'], ['i', 'c']),
          (['a', 't', 'i', 'v', 'e'], []),
          (['a', 'l', 'i', 'z', 'e'], ['a', 'l']),),
    'i': ((['i', 'c', 'i', 't', 'i'], ['i', 'c']),),
    'l': ((['i', 'c', 'a', 'l'], ['i', 'c']),
          (['f', 'u', 'l'], []),),
    's': ((['n', 'e', 's', 's'], []),),
}

# Porter step 4 suffixes (deleted outright when m() > 1), keyed by the
# word's penultimate letter.
_s4_endings = {
    'a': (['a', 'l'],),
    'c': (['a', 'n', 'c', 'e'], ['e', 'n', 'c', 'e']),
    'e': (['e', 'r'],),
    'i': (['i', 'c'],),
    'l': (['a', 'b', 'l', 'e'], ['i', 'b', 'l', 'e']),
    'n': (['a', 'n', 't'], ['e', 'm', 'e', 'n', 't'], ['m', 'e', 'n', 't'],
          ['e', 'n', 't']),
    # handle 'o' separately
    's': (['i', 's', 'm'],),
    't': (['a', 't', 'e'], ['i', 't', 'i']),
    'u': (['o', 'u', 's'],),
    'v': (['i', 'v', 'e'],),
    'z': (['i', 'z', 'e'],),
}
|
||||
|
||||
|
||||
class Stemmer(object):
    """Porter stemmer operating on a mutable list of characters.

    ``b`` is the word buffer, ``k`` the index of its last valid character,
    and ``j`` a scratch index set by ``ends`` marking where a matched
    suffix begins.  The public entry point is ``stem()``.
    """

    def __init__(self, b):
        self.b = list(b)
        self.k = len(b)-1
        self.j = 0

    def cons(self, i):
        """ True iff b[i] is a consonant """
        if self.b[i] in 'aeiou':
            return False
        elif self.b[i] == 'y':
            # y is a consonant at word start, or after a vowel
            return True if i == 0 else not self.cons(i-1)
        return True

    def m(self):
        """Return the "measure" of b[0..j]: the number of VC sequences."""
        n = i = 0
        # skip the initial consonant run
        while True:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i += 1
        i += 1
        # count alternating vowel-run / consonant-run pairs
        while True:
            while True:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i += 1

            i += 1
            n += 1

            while True:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i += 1
            i += 1

    def vowel_in_stem(self):
        """ True iff 0...j contains vowel """
        for i in _range(0, self.j+1):
            if not self.cons(i):
                return True
        return False

    def doublec(self, j):
        """ True iff j, j-1 contains double consonant """
        if j < 1 or self.b[j] != self.b[j-1]:
            return False
        return self.cons(j)

    def cvc(self, i):
        """ True iff i-2,i-1,i is consonent-vowel consonant
        and if second c isn't w,x, or y.
        used to restore e at end of short words like cave, love, hope, crime
        """
        if (i < 2 or not self.cons(i) or self.cons(i-1) or not self.cons(i-2) or
                self.b[i] in 'wxy'):
            return False
        return True

    def ends(self, s):
        """ True iff 0...k ends with string s; sets j to the suffix start-1 """
        length = len(s)
        res = (self.b[self.k-length+1:self.k+1] == s)
        if res:
            self.j = self.k - length
        return res

    def setto(self, s):
        """ set j+1...k to string s, readjusting k """
        length = len(s)
        self.b[self.j+1:self.j+1+length] = s
        self.k = self.j + length

    def r(self, s):
        """Replace the matched suffix with s, but only when m() > 0."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self):
        """Strip plurals (-sses/-ies/-s) and -ed/-ing, with repairs."""
        if self.b[self.k] == 's':
            if self.ends(['s', 's', 'e', 's']):
                self.k -= 2
            elif self.ends(['i', 'e', 's']):
                self.setto(['i'])
            elif self.b[self.k-1] != 's':
                self.k -= 1
        if self.ends(['e', 'e', 'd']):
            if self.m() > 0:
                self.k -= 1
        elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g'])) and
                self.vowel_in_stem()):
            self.k = self.j
            # repair the stem left behind by -ed/-ing removal
            if self.ends(['a', 't']):
                self.setto(['a', 't', 'e'])
            elif self.ends(['b', 'l']):
                self.setto(['b', 'l', 'e'])
            elif self.ends(['i', 'z']):
                self.setto(['i', 'z', 'e'])
            elif self.doublec(self.k):
                self.k -= 1
                # but keep the double consonant for l, s, z
                if self.b[self.k] in 'lsz':
                    self.k += 1
            elif self.m() == 1 and self.cvc(self.k):
                self.setto(['e'])

    def step1c(self):
        """ turn terminal y into i if there's a vowel in stem """
        if self.ends(['y']) and self.vowel_in_stem():
            self.b[self.k] = 'i'

    def step2and3(self):
        """Map double suffixes to single ones via the module tables."""
        for end, repl in _s2_options.get(self.b[self.k-1], []):
            if self.ends(end):
                self.r(repl)
                break

        for end, repl in _s3_options.get(self.b[self.k], []):
            if self.ends(end):
                self.r(repl)
                break

    def step4(self):
        """Delete remaining standard suffixes when the measure allows."""
        ch = self.b[self.k-1]

        if ch == 'o':
            # -ion only after s/t; -ou handled here too
            if not ((self.ends(['i', 'o', 'n']) and self.b[self.j] in 'st') or
                    self.ends(['o', 'u'])):
                return
        else:
            endings = _s4_endings.get(ch, [])
            for end in endings:
                if self.ends(end):
                    break
            else:
                return

        if self.m() > 1:
            self.k = self.j

    def step5(self):
        """Drop a final -e and reduce a final double l, measure permitting."""
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or a == 1 and not self.cvc(self.k-1):
                self.k -= 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k -= 1

    def result(self):
        """Return the stemmed word as a string (b[0..k])."""
        return ''.join(self.b[:self.k+1])

    def stem(self):
        """Run all Porter steps and return the stem; short words pass through."""
        if self.k > 1:
            self.step1ab()
            self.step1c()
            self.step2and3()
            self.step4()
            self.step5()
        return self.result()
|
212
libs/common/jellyfish/test.py
Normal file
212
libs/common/jellyfish/test.py
Normal file
|
@ -0,0 +1,212 @@
|
|||
# -*- coding: utf-8 -*-
# Test suite: exercises every jellyfish function against the CSV fixtures
# in testdata/, parameterized over the pure-Python backend and (on
# CPython) the C extension so both stay in agreement.
import sys
if sys.version_info[0] < 3:
    # Py2's csv module is bytes-only; unicodecsv handles unicode rows
    import unicodecsv as csv
    open_kwargs = {}
else:
    import csv
    open_kwargs = {'encoding': 'utf8'}
import platform
import pytest


def assertAlmostEqual(a, b, places=3):
    # unittest-style float comparison helper for plain pytest asserts
    assert abs(a - b) < (0.1**places)


# the C extension is only expected to build/import on CPython
if platform.python_implementation() == 'CPython':
    implementations = ['python', 'c']
else:
    implementations = ['python']


@pytest.fixture(params=implementations)
def jf(request):
    # yields each backend module in turn, so every test runs against both
    if request.param == 'python':
        from jellyfish import _jellyfish as jf
    else:
        from jellyfish import cjellyfish as jf
    return jf


def _load_data(name):
    # stream (input..., expected) rows from testdata/<name>.csv
    with open('testdata/{}.csv'.format(name), **open_kwargs) as f:
        for data in csv.reader(f):
            yield data


@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_winkler'), ids=str)
def test_jaro_winkler(jf, s1, s2, value):
    value = float(value)
    assertAlmostEqual(jf.jaro_winkler(s1, s2), value, places=3)


@pytest.mark.parametrize("s1,s2,value", _load_data('jaro_distance'), ids=str)
def test_jaro_distance(jf, s1, s2, value):
    value = float(value)
    assertAlmostEqual(jf.jaro_distance(s1, s2), value, places=3)


@pytest.mark.parametrize("s1,s2,value", _load_data('hamming'), ids=str)
def test_hamming_distance(jf, s1, s2, value):
    value = int(value)
    assert jf.hamming_distance(s1, s2) == value


@pytest.mark.parametrize("s1,s2,value", _load_data('levenshtein'), ids=str)
def test_levenshtein_distance(jf, s1, s2, value):
    value = int(value)
    assert jf.levenshtein_distance(s1, s2) == value


@pytest.mark.parametrize("s1,s2,value", _load_data('damerau_levenshtein'), ids=str)
def test_damerau_levenshtein_distance(jf, s1, s2, value):
    value = int(value)
    assert jf.damerau_levenshtein_distance(s1, s2) == value


@pytest.mark.parametrize("s1,code", _load_data('soundex'), ids=str)
def test_soundex(jf, s1, code):
    assert jf.soundex(s1) == code


@pytest.mark.parametrize("s1,code", _load_data('metaphone'), ids=str)
def test_metaphone(jf, s1, code):
    assert jf.metaphone(s1) == code


@pytest.mark.parametrize("s1,s2", _load_data('nysiis'), ids=str)
def test_nysiis(jf, s1, s2):
    assert jf.nysiis(s1) == s2


@pytest.mark.parametrize("s1,s2", _load_data('match_rating_codex'), ids=str)
def test_match_rating_codex(jf, s1, s2):
    assert jf.match_rating_codex(s1) == s2


@pytest.mark.parametrize("s1,s2,value", _load_data('match_rating_comparison'), ids=str)
def test_match_rating_comparison(jf, s1, s2, value):
    # fixture stores the tri-state result as a string
    value = {'True': True, 'False': False, 'None': None}[value]
    assert jf.match_rating_comparison(s1, s2) is value


# use non-parameterized version for speed
# @pytest.mark.parametrize("a,b", _load_data('porter'), ids=str)
# def test_porter_stem(jf, a, b):
#     assert jf.porter_stem(a) == b

def test_porter_stem(jf):
    with open('testdata/porter.csv', **open_kwargs) as f:
        reader = csv.reader(f)
        for (a, b) in reader:
            assert jf.porter_stem(a) == b


if platform.python_implementation() == 'CPython':
    def test_match_rating_comparison_segfault():
        import hashlib
        from jellyfish import cjellyfish as jf
        sha1s = [u'{}'.format(hashlib.sha1(str(v).encode('ascii')).hexdigest())
                 for v in range(100)]
        # this segfaulted on 0.1.2
        assert [[jf.match_rating_comparison(h1, h2) for h1 in sha1s] for h2 in sha1s]

    def test_damerau_levenshtein_unicode_segfault():
        # unfortunate difference in behavior between Py & C versions
        from jellyfish.cjellyfish import damerau_levenshtein_distance as c_dl
        from jellyfish._jellyfish import damerau_levenshtein_distance as py_dl
        s1 = u'mylifeoutdoors'
        s2 = u'нахлыст'
        with pytest.raises(ValueError):
            c_dl(s1, s2)
        with pytest.raises(ValueError):
            c_dl(s2, s1)

        assert py_dl(s1, s2) == 14
        assert py_dl(s2, s1) == 14


def test_jaro_winkler_long_tolerance(jf):
    no_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=False)
    with_lt = jf.jaro_winkler(u'two long strings', u'two long stringz', long_tolerance=True)
    # make sure long_tolerance does something
    assertAlmostEqual(no_lt, 0.975)
    assertAlmostEqual(with_lt, 0.984)


def test_damerau_levenshtein_distance_type(jf):
    jf.damerau_levenshtein_distance(u'abc', u'abc')
    with pytest.raises(TypeError) as exc:
        jf.damerau_levenshtein_distance(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_levenshtein_distance_type(jf):
    assert jf.levenshtein_distance(u'abc', u'abc') == 0
    with pytest.raises(TypeError) as exc:
        jf.levenshtein_distance(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_jaro_distance_type(jf):
    assert jf.jaro_distance(u'abc', u'abc') == 1
    with pytest.raises(TypeError) as exc:
        jf.jaro_distance(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_jaro_winkler_type(jf):
    assert jf.jaro_winkler(u'abc', u'abc') == 1
    with pytest.raises(TypeError) as exc:
        jf.jaro_winkler(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_mra_comparison_type(jf):
    assert jf.match_rating_comparison(u'abc', u'abc') is True
    with pytest.raises(TypeError) as exc:
        jf.match_rating_comparison(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_hamming_type(jf):
    assert jf.hamming_distance(u'abc', u'abc') == 0
    with pytest.raises(TypeError) as exc:
        jf.hamming_distance(b'abc', b'abc')
    assert 'expected' in str(exc.value)


def test_soundex_type(jf):
    assert jf.soundex(u'ABC') == 'A120'
    with pytest.raises(TypeError) as exc:
        jf.soundex(b'ABC')
    assert 'expected' in str(exc.value)


def test_metaphone_type(jf):
    assert jf.metaphone(u'abc') == 'ABK'
    with pytest.raises(TypeError) as exc:
        jf.metaphone(b'abc')
    assert 'expected' in str(exc.value)


def test_nysiis_type(jf):
    assert jf.nysiis(u'abc') == 'ABC'
    with pytest.raises(TypeError) as exc:
        jf.nysiis(b'abc')
    assert 'expected' in str(exc.value)


def test_mr_codex_type(jf):
    assert jf.match_rating_codex(u'abc') == 'ABC'
    with pytest.raises(TypeError) as exc:
        jf.match_rating_codex(b'abc')
    assert 'expected' in str(exc.value)


def test_porter_type(jf):
    assert jf.porter_stem(u'abc') == 'abc'
    with pytest.raises(TypeError) as exc:
        jf.porter_stem(b'abc')
    assert 'expected' in str(exc.value)
|
Loading…
Add table
Add a link
Reference in a new issue