From c1a1354636131303ce9f2587bf054756a25686e6 Mon Sep 17 00:00:00 2001 From: echel0n Date: Sat, 19 Apr 2014 12:28:55 -0700 Subject: [PATCH] Added GuessIt library and required libs for it. We now perform a guessit on the nzbName to extract movie title and year instead of a regex, this is more accurate. --- lib/babelfish/__init__.py | 18 + lib/babelfish/converters/__init__.py | 280 ++++++++ lib/babelfish/converters/alpha2.py | 17 + lib/babelfish/converters/alpha3b.py | 17 + lib/babelfish/converters/alpha3t.py | 17 + lib/babelfish/converters/countryname.py | 31 + lib/babelfish/converters/name.py | 17 + lib/babelfish/converters/opensubtitles.py | 36 + lib/babelfish/converters/scope.py | 23 + lib/babelfish/converters/type.py | 23 + lib/babelfish/country.py | 95 +++ lib/babelfish/exceptions.py | 85 +++ lib/babelfish/language.py | 174 +++++ lib/babelfish/script.py | 66 ++ lib/babelfish/tests.py | 353 ++++++++++ lib/guessit/ISO-3166-1_utf8.txt | 249 +++++++ lib/guessit/ISO-639-2_utf-8.txt | 485 ++++++++++++++ lib/guessit/__init__.py | 232 +++++++ lib/guessit/__main__.py | 217 ++++++ lib/guessit/__version__.py | 20 + lib/guessit/containers.py | 615 ++++++++++++++++++ lib/guessit/country.py | 111 ++++ lib/guessit/date.py | 146 +++++ lib/guessit/fileutils.py | 87 +++ lib/guessit/guess.py | 452 +++++++++++++ lib/guessit/hash_ed2k.py | 67 ++ lib/guessit/hash_mpc.py | 58 ++ lib/guessit/language.py | 401 ++++++++++++ lib/guessit/matcher.py | 247 +++++++ lib/guessit/matchtree.py | 439 +++++++++++++ lib/guessit/options.py | 25 + lib/guessit/patterns/__init__.py | 77 +++ lib/guessit/patterns/extension.py | 32 + lib/guessit/patterns/numeral.py | 150 +++++ lib/guessit/plugins/__init__.py | 21 + lib/guessit/plugins/transformers.py | 186 ++++++ lib/guessit/quality.py | 65 ++ lib/guessit/slogging.py | 89 +++ lib/guessit/textutils.py | 352 ++++++++++ lib/guessit/tlds-alpha-by-domain.txt | 341 ++++++++++ lib/guessit/transfo/__init__.py | 30 + lib/guessit/transfo/guess_bonus_features.py | 
67 ++ lib/guessit/transfo/guess_country.py | 69 ++ lib/guessit/transfo/guess_date.py | 43 ++ .../guess_episode_info_from_position.py | 162 +++++ lib/guessit/transfo/guess_episode_special.py | 62 ++ lib/guessit/transfo/guess_episodes_rexps.py | 80 +++ lib/guessit/transfo/guess_filetype.py | 213 ++++++ lib/guessit/transfo/guess_idnumber.py | 69 ++ lib/guessit/transfo/guess_language.py | 169 +++++ .../guess_movie_title_from_position.py | 177 +++++ lib/guessit/transfo/guess_properties.py | 230 +++++++ lib/guessit/transfo/guess_release_group.py | 149 +++++ lib/guessit/transfo/guess_video_rexps.py | 58 ++ .../transfo/guess_weak_episodes_rexps.py | 69 ++ lib/guessit/transfo/guess_website.py | 66 ++ lib/guessit/transfo/guess_year.py | 49 ++ lib/guessit/transfo/split_explicit_groups.py | 49 ++ lib/guessit/transfo/split_on_dash.py | 47 ++ lib/guessit/transfo/split_path_components.py | 45 ++ lib/stevedore/__init__.py | 36 + lib/stevedore/dispatch.py | 216 ++++++ lib/stevedore/driver.py | 126 ++++ lib/stevedore/enabled.py | 71 ++ lib/stevedore/extension.py | 276 ++++++++ lib/stevedore/hook.py | 64 ++ lib/stevedore/named.py | 124 ++++ nzbtomedia/__init__.py | 49 +- nzbtomedia/nzbToMediaUtil.py | 20 +- 69 files changed, 9263 insertions(+), 38 deletions(-) create mode 100644 lib/babelfish/__init__.py create mode 100644 lib/babelfish/converters/__init__.py create mode 100644 lib/babelfish/converters/alpha2.py create mode 100644 lib/babelfish/converters/alpha3b.py create mode 100644 lib/babelfish/converters/alpha3t.py create mode 100644 lib/babelfish/converters/countryname.py create mode 100644 lib/babelfish/converters/name.py create mode 100644 lib/babelfish/converters/opensubtitles.py create mode 100644 lib/babelfish/converters/scope.py create mode 100644 lib/babelfish/converters/type.py create mode 100644 lib/babelfish/country.py create mode 100644 lib/babelfish/exceptions.py create mode 100644 lib/babelfish/language.py create mode 100644 lib/babelfish/script.py create mode 
100644 lib/babelfish/tests.py create mode 100644 lib/guessit/ISO-3166-1_utf8.txt create mode 100644 lib/guessit/ISO-639-2_utf-8.txt create mode 100644 lib/guessit/__init__.py create mode 100644 lib/guessit/__main__.py create mode 100644 lib/guessit/__version__.py create mode 100644 lib/guessit/containers.py create mode 100644 lib/guessit/country.py create mode 100644 lib/guessit/date.py create mode 100644 lib/guessit/fileutils.py create mode 100644 lib/guessit/guess.py create mode 100644 lib/guessit/hash_ed2k.py create mode 100644 lib/guessit/hash_mpc.py create mode 100644 lib/guessit/language.py create mode 100644 lib/guessit/matcher.py create mode 100644 lib/guessit/matchtree.py create mode 100644 lib/guessit/options.py create mode 100644 lib/guessit/patterns/__init__.py create mode 100644 lib/guessit/patterns/extension.py create mode 100644 lib/guessit/patterns/numeral.py create mode 100644 lib/guessit/plugins/__init__.py create mode 100644 lib/guessit/plugins/transformers.py create mode 100644 lib/guessit/quality.py create mode 100644 lib/guessit/slogging.py create mode 100644 lib/guessit/textutils.py create mode 100644 lib/guessit/tlds-alpha-by-domain.txt create mode 100644 lib/guessit/transfo/__init__.py create mode 100644 lib/guessit/transfo/guess_bonus_features.py create mode 100644 lib/guessit/transfo/guess_country.py create mode 100644 lib/guessit/transfo/guess_date.py create mode 100644 lib/guessit/transfo/guess_episode_info_from_position.py create mode 100644 lib/guessit/transfo/guess_episode_special.py create mode 100644 lib/guessit/transfo/guess_episodes_rexps.py create mode 100644 lib/guessit/transfo/guess_filetype.py create mode 100644 lib/guessit/transfo/guess_idnumber.py create mode 100644 lib/guessit/transfo/guess_language.py create mode 100644 lib/guessit/transfo/guess_movie_title_from_position.py create mode 100644 lib/guessit/transfo/guess_properties.py create mode 100644 lib/guessit/transfo/guess_release_group.py create mode 100644 
lib/guessit/transfo/guess_video_rexps.py create mode 100644 lib/guessit/transfo/guess_weak_episodes_rexps.py create mode 100644 lib/guessit/transfo/guess_website.py create mode 100644 lib/guessit/transfo/guess_year.py create mode 100644 lib/guessit/transfo/split_explicit_groups.py create mode 100644 lib/guessit/transfo/split_on_dash.py create mode 100644 lib/guessit/transfo/split_path_components.py create mode 100644 lib/stevedore/__init__.py create mode 100644 lib/stevedore/dispatch.py create mode 100644 lib/stevedore/driver.py create mode 100644 lib/stevedore/enabled.py create mode 100644 lib/stevedore/extension.py create mode 100644 lib/stevedore/hook.py create mode 100644 lib/stevedore/named.py diff --git a/lib/babelfish/__init__.py b/lib/babelfish/__init__.py new file mode 100644 index 00000000..52b7ac28 --- /dev/null +++ b/lib/babelfish/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +__title__ = 'babelfish' +__version__ = '0.5.1' +__author__ = 'Antoine Bertin' +__license__ = 'BSD' +__copyright__ = 'Copyright 2013 the BabelFish authors' + +from .converters import (LanguageConverter, LanguageReverseConverter, LanguageEquivalenceConverter, CountryConverter, + CountryReverseConverter) +from .country import country_converters, COUNTRIES, COUNTRY_MATRIX, Country +from .exceptions import Error, LanguageConvertError, LanguageReverseError, CountryConvertError, CountryReverseError +from .language import language_converters, LANGUAGES, LANGUAGE_MATRIX, Language +from .script import SCRIPTS, SCRIPT_MATRIX, Script diff --git a/lib/babelfish/converters/__init__.py b/lib/babelfish/converters/__init__.py new file mode 100644 index 00000000..9a0a1bd9 --- /dev/null +++ b/lib/babelfish/converters/__init__.py @@ -0,0 +1,280 @@ +# Copyright (c) 2013 the BabelFish authors. All rights reserved. 
+# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +import collections +from pkg_resources import iter_entry_points, EntryPoint +from ..exceptions import LanguageConvertError, LanguageReverseError + + +# from https://github.com/kennethreitz/requests/blob/master/requests/structures.py +class CaseInsensitiveDict(collections.MutableMapping): + """A case-insensitive ``dict``-like object. + + Implements all methods and operations of + ``collections.MutableMapping`` as well as dict's ``copy``. Also + provides ``lower_items``. + + All keys are expected to be strings. The structure remembers the + case of the last key to be set, and ``iter(instance)``, + ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()`` + will contain case-sensitive keys. However, querying and contains + testing is case insensitive: + + cid = CaseInsensitiveDict() + cid['English'] = 'eng' + cid['ENGLISH'] == 'eng' # True + list(cid) == ['English'] # True + + If the constructor, ``.update``, or equality comparison + operations are given keys that have equal ``.lower()``s, the + behavior is undefined. + + """ + def __init__(self, data=None, **kwargs): + self._store = dict() + if data is None: + data = {} + self.update(data, **kwargs) + + def __setitem__(self, key, value): + # Use the lowercased key for lookups, but store the actual + # key alongside the value. 
+ self._store[key.lower()] = (key, value) + + def __getitem__(self, key): + return self._store[key.lower()][1] + + def __delitem__(self, key): + del self._store[key.lower()] + + def __iter__(self): + return (casedkey for casedkey, mappedvalue in self._store.values()) + + def __len__(self): + return len(self._store) + + def lower_items(self): + """Like iteritems(), but with all lowercase keys.""" + return ( + (lowerkey, keyval[1]) + for (lowerkey, keyval) + in self._store.items() + ) + + def __eq__(self, other): + if isinstance(other, collections.Mapping): + other = CaseInsensitiveDict(other) + else: + return NotImplemented + # Compare insensitively + return dict(self.lower_items()) == dict(other.lower_items()) + + # Copy is required + def copy(self): + return CaseInsensitiveDict(self._store.values()) + + def __repr__(self): + return '%s(%r)' % (self.__class__.__name__, dict(self.items())) + + +class LanguageConverter(object): + """A :class:`LanguageConverter` supports converting an alpha3 language code with an + alpha2 country code and a script code into a custom code + + .. 
attribute:: codes + + Set of possible custom codes + + """ + def convert(self, alpha3, country=None, script=None): + """Convert an alpha3 language code with an alpha2 country code and a script code + into a custom code + + :param string alpha3: ISO-639-3 language code + :param country: ISO-3166 country code, if any + :type country: string or None + :param script: ISO-15924 script code, if any + :type script: string or None + :return: the corresponding custom code + :rtype: string + :raise: :class:`~babelfish.exceptions.LanguageConvertError` + + """ + raise NotImplementedError + + +class LanguageReverseConverter(LanguageConverter): + """A :class:`LanguageConverter` able to reverse a custom code into a alpha3 + ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code + + """ + def reverse(self, code): + """Reverse a custom code into alpha3, country and script code + + :param string code: custom code to reverse + :return: the corresponding alpha3 ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code + :rtype: tuple + :raise: :class:`~babelfish.exceptions.LanguageReverseError` + + """ + raise NotImplementedError + + +class LanguageEquivalenceConverter(LanguageReverseConverter): + """A :class:`LanguageEquivalenceConverter` is a utility class that allows you to easily define a + :class:`LanguageReverseConverter` by only specifying the dict from alpha3 to their corresponding symbols. + + You must specify the dict of equivalence as a class variable named SYMBOLS. + + If you also set the class variable CASE_SENSITIVE to ``True`` then the reverse conversion function will be + case-sensitive (it is case-insensitive by default). 
+ + Example:: + + class MyCodeConverter(babelfish.LanguageEquivalenceConverter): + CASE_SENSITIVE = True + SYMBOLS = {'fra': 'mycode1', 'eng': 'mycode2'} + + """ + CASE_SENSITIVE = False + + def __init__(self): + self.codes = set() + self.to_symbol = {} + if self.CASE_SENSITIVE: + self.from_symbol = {} + else: + self.from_symbol = CaseInsensitiveDict() + + for alpha3, symbol in self.SYMBOLS.items(): + self.to_symbol[alpha3] = symbol + self.from_symbol[symbol] = (alpha3, None, None) + self.codes.add(symbol) + + def convert(self, alpha3, country=None, script=None): + try: + return self.to_symbol[alpha3] + except KeyError: + raise LanguageConvertError(alpha3, country, script) + + def reverse(self, code): + try: + return self.from_symbol[code] + except KeyError: + raise LanguageReverseError(code) + + +class CountryConverter(object): + """A :class:`CountryConverter` supports converting an alpha2 country code + into a custom code + + .. attribute:: codes + + Set of possible custom codes + + """ + def convert(self, alpha2): + """Convert an alpha2 country code into a custom code + + :param string alpha2: ISO-3166-1 language code + :return: the corresponding custom code + :rtype: string + :raise: :class:`~babelfish.exceptions.CountryConvertError` + + """ + raise NotImplementedError + + +class CountryReverseConverter(CountryConverter): + """A :class:`CountryConverter` able to reverse a custom code into a alpha2 + ISO-3166-1 country code + + """ + def reverse(self, code): + """Reverse a custom code into alpha2 code + + :param string code: custom code to reverse + :return: the corresponding alpha2 ISO-3166-1 country code + :rtype: string + :raise: :class:`~babelfish.exceptions.CountryReverseError` + + """ + raise NotImplementedError + + +class ConverterManager(object): + """Manager for babelfish converters behaving like a dict with lazy loading + + Loading is done in this order: + + * Entry point converters + * Registered converters + * Internal converters + + .. 
attribute:: entry_point + + The entry point where to look for converters + + .. attribute:: internal_converters + + Internal converters with entry point syntax + + """ + entry_point = '' + internal_converters = [] + + def __init__(self): + #: Registered converters with entry point syntax + self.registered_converters = [] + + #: Loaded converters + self.converters = {} + + def __getitem__(self, name): + """Get a converter, lazy loading it if necessary""" + if name in self.converters: + return self.converters[name] + for ep in iter_entry_points(self.entry_point): + if ep.name == name: + self.converters[ep.name] = ep.load()() + return self.converters[ep.name] + for ep in (EntryPoint.parse(c) for c in self.registered_converters + self.internal_converters): + if ep.name == name: + self.converters[ep.name] = ep.load(require=False)() + return self.converters[ep.name] + raise KeyError(name) + + def __setitem__(self, name, converter): + """Load a converter""" + self.converters[name] = converter + + def __delitem__(self, name): + """Unload a converter""" + del self.converters[name] + + def __iter__(self): + """Iterator over loaded converters""" + return iter(self.converters) + + def register(self, entry_point): + """Register a converter + + :param string entry_point: converter to register (entry point syntax) + :raise: ValueError if already registered + + """ + if entry_point in self.registered_converters: + raise ValueError('Already registered') + self.registered_converters.insert(0, entry_point) + + def unregister(self, entry_point): + """Unregister a converter + + :param string entry_point: converter to unregister (entry point syntax) + + """ + self.registered_converters.remove(entry_point) + + def __contains__(self, name): + return name in self.converters diff --git a/lib/babelfish/converters/alpha2.py b/lib/babelfish/converters/alpha2.py new file mode 100644 index 00000000..aca973dd --- /dev/null +++ b/lib/babelfish/converters/alpha2.py @@ -0,0 +1,17 @@ +# -*- coding: 
utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . import LanguageEquivalenceConverter +from ..language import LANGUAGE_MATRIX + + +class Alpha2Converter(LanguageEquivalenceConverter): + CASE_SENSITIVE = True + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + if iso_language.alpha2: + SYMBOLS[iso_language.alpha3] = iso_language.alpha2 diff --git a/lib/babelfish/converters/alpha3b.py b/lib/babelfish/converters/alpha3b.py new file mode 100644 index 00000000..e90c5f5e --- /dev/null +++ b/lib/babelfish/converters/alpha3b.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . import LanguageEquivalenceConverter +from ..language import LANGUAGE_MATRIX + + +class Alpha3BConverter(LanguageEquivalenceConverter): + CASE_SENSITIVE = True + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + if iso_language.alpha3b: + SYMBOLS[iso_language.alpha3] = iso_language.alpha3b diff --git a/lib/babelfish/converters/alpha3t.py b/lib/babelfish/converters/alpha3t.py new file mode 100644 index 00000000..6de6e4c6 --- /dev/null +++ b/lib/babelfish/converters/alpha3t.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . 
import LanguageEquivalenceConverter +from ..language import LANGUAGE_MATRIX + + +class Alpha3TConverter(LanguageEquivalenceConverter): + CASE_SENSITIVE = True + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + if iso_language.alpha3t: + SYMBOLS[iso_language.alpha3] = iso_language.alpha3t diff --git a/lib/babelfish/converters/countryname.py b/lib/babelfish/converters/countryname.py new file mode 100644 index 00000000..ff36c878 --- /dev/null +++ b/lib/babelfish/converters/countryname.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . import CountryReverseConverter, CaseInsensitiveDict +from ..country import COUNTRY_MATRIX +from ..exceptions import CountryConvertError, CountryReverseError + + +class CountryNameConverter(CountryReverseConverter): + def __init__(self): + self.codes = set() + self.to_name = {} + self.from_name = CaseInsensitiveDict() + for country in COUNTRY_MATRIX: + self.codes.add(country.name) + self.to_name[country.alpha2] = country.name + self.from_name[country.name] = country.alpha2 + + def convert(self, alpha2): + if alpha2 not in self.to_name: + raise CountryConvertError(alpha2) + return self.to_name[alpha2] + + def reverse(self, name): + if name not in self.from_name: + raise CountryReverseError(name) + return self.from_name[name] diff --git a/lib/babelfish/converters/name.py b/lib/babelfish/converters/name.py new file mode 100644 index 00000000..8dd865b7 --- /dev/null +++ b/lib/babelfish/converters/name.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . 
import LanguageEquivalenceConverter +from ..language import LANGUAGE_MATRIX + + +class NameConverter(LanguageEquivalenceConverter): + CASE_SENSITIVE = False + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + if iso_language.name: + SYMBOLS[iso_language.alpha3] = iso_language.name diff --git a/lib/babelfish/converters/opensubtitles.py b/lib/babelfish/converters/opensubtitles.py new file mode 100644 index 00000000..101c40fd --- /dev/null +++ b/lib/babelfish/converters/opensubtitles.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . import LanguageReverseConverter, CaseInsensitiveDict +from ..exceptions import LanguageReverseError +from ..language import language_converters + + +class OpenSubtitlesConverter(LanguageReverseConverter): + def __init__(self): + self.alpha3b_converter = language_converters['alpha3b'] + self.alpha2_converter = language_converters['alpha2'] + self.to_opensubtitles = {('por', 'BR'): 'pob', ('gre', None): 'ell', ('srp', None): 'scc', ('srp', 'ME'): 'mne'} + self.from_opensubtitles = CaseInsensitiveDict({'pob': ('por', 'BR'), 'pb': ('por', 'BR'), 'ell': ('ell', None), + 'scc': ('srp', None), 'mne': ('srp', 'ME')}) + self.codes = (self.alpha2_converter.codes | self.alpha3b_converter.codes | set(['pob', 'pb', 'scc', 'mne'])) + + def convert(self, alpha3, country=None, script=None): + alpha3b = self.alpha3b_converter.convert(alpha3, country, script) + if (alpha3b, country) in self.to_opensubtitles: + return self.to_opensubtitles[(alpha3b, country)] + return alpha3b + + def reverse(self, opensubtitles): + if opensubtitles in self.from_opensubtitles: + return self.from_opensubtitles[opensubtitles] + for conv in [self.alpha3b_converter, self.alpha2_converter]: + try: + return conv.reverse(opensubtitles) + except 
LanguageReverseError: + pass + raise LanguageReverseError(opensubtitles) diff --git a/lib/babelfish/converters/scope.py b/lib/babelfish/converters/scope.py new file mode 100644 index 00000000..73540063 --- /dev/null +++ b/lib/babelfish/converters/scope.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . import LanguageConverter +from ..exceptions import LanguageConvertError +from ..language import LANGUAGE_MATRIX + + +class ScopeConverter(LanguageConverter): + FULLNAME = {'I': 'individual', 'M': 'macrolanguage', 'S': 'special'} + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + SYMBOLS[iso_language.alpha3] = iso_language.scope + codes = set(SYMBOLS.values()) + + def convert(self, alpha3, country=None, script=None): + if self.SYMBOLS[alpha3] in self.FULLNAME: + return self.FULLNAME[self.SYMBOLS[alpha3]] + raise LanguageConvertError(alpha3, country, script) diff --git a/lib/babelfish/converters/type.py b/lib/babelfish/converters/type.py new file mode 100644 index 00000000..3b7378c2 --- /dev/null +++ b/lib/babelfish/converters/type.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from . 
import LanguageConverter +from ..exceptions import LanguageConvertError +from ..language import LANGUAGE_MATRIX + + +class LanguageTypeConverter(LanguageConverter): + FULLNAME = {'A': 'ancient', 'C': 'constructed', 'E': 'extinct', 'H': 'historical', 'L': 'living', 'S': 'special'} + SYMBOLS = {} + for iso_language in LANGUAGE_MATRIX: + SYMBOLS[iso_language.alpha3] = iso_language.type + codes = set(SYMBOLS.values()) + + def convert(self, alpha3, country=None, script=None): + if self.SYMBOLS[alpha3] in self.FULLNAME: + return self.FULLNAME[self.SYMBOLS[alpha3]] + raise LanguageConvertError(alpha3, country, script) diff --git a/lib/babelfish/country.py b/lib/babelfish/country.py new file mode 100644 index 00000000..83641fc9 --- /dev/null +++ b/lib/babelfish/country.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. +# +from __future__ import unicode_literals +from collections import namedtuple +from functools import partial +from pkg_resources import resource_stream # @UnresolvedImport +from .converters import ConverterManager + + +COUNTRIES = {} +COUNTRY_MATRIX = [] + +#: The namedtuple used in the :data:`COUNTRY_MATRIX` +IsoCountry = namedtuple('IsoCountry', ['name', 'alpha2']) + +f = resource_stream('babelfish', 'data/iso-3166-1.txt') +f.readline() +for l in f: + iso_country = IsoCountry(*l.decode('utf-8').strip().split(';')) + COUNTRIES[iso_country.alpha2] = iso_country.name + COUNTRY_MATRIX.append(iso_country) +f.close() + + +class CountryConverterManager(ConverterManager): + """:class:`~babelfish.converters.ConverterManager` for country converters""" + entry_point = 'babelfish.country_converters' + internal_converters = ['name = babelfish.converters.countryname:CountryNameConverter'] + +country_converters = CountryConverterManager() + + +class CountryMeta(type): + """The :class:`Country` 
metaclass + + Dynamically redirect :meth:`Country.frommycode` to :meth:`Country.fromcode` with the ``mycode`` `converter` + + """ + def __getattr__(cls, name): + if name.startswith('from'): + return partial(cls.fromcode, converter=name[4:]) + return getattr(cls, name) + + +class Country(CountryMeta(str('CountryBase'), (object,), {})): + """A country on Earth + + A country is represented by a 2-letter code from the ISO-3166 standard + + :param string country: 2-letter ISO-3166 country code + + """ + def __init__(self, country): + if country not in COUNTRIES: + raise ValueError('%r is not a valid country' % country) + + #: ISO-3166 2-letter country code + self.alpha2 = country + + @classmethod + def fromcode(cls, code, converter): + """Create a :class:`Country` by its `code` using `converter` to + :meth:`~babelfish.converters.CountryReverseConverter.reverse` it + + :param string code: the code to reverse + :param string converter: name of the :class:`~babelfish.converters.CountryReverseConverter` to use + :return: the corresponding :class:`Country` instance + :rtype: :class:`Country` + + """ + return cls(country_converters[converter].reverse(code)) + + def __getattr__(self, name): + return country_converters[name].convert(self.alpha2) + + def __hash__(self): + return hash(self.alpha2) + + def __eq__(self, other): + if other is None: + return False + return self.alpha2 == other.alpha2 + + def __ne__(self, other): + return not self == other + + def __repr__(self): + return '<Country [%s]>' % self + + def __str__(self): + return self.alpha2 diff --git a/lib/babelfish/exceptions.py b/lib/babelfish/exceptions.py new file mode 100644 index 00000000..bbc6efe3 --- /dev/null +++ b/lib/babelfish/exceptions.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2013 the BabelFish authors. All rights reserved. +# Use of this source code is governed by the 3-clause BSD license +# that can be found in the LICENSE file. 
+# +from __future__ import unicode_literals + + +class Error(Exception): + """Base class for all exceptions in babelfish""" + pass + + +class LanguageError(Error, AttributeError): + """Base class for all language exceptions in babelfish""" + pass + + +class LanguageConvertError(LanguageError): + """Exception raised by converters when :meth:`~babelfish.converters.LanguageConverter.convert` fails + + :param string alpha3: alpha3 code that failed conversion + :param country: country code that failed conversion, if any + :type country: string or None + :param script: script code that failed conversion, if any + :type script: string or None + + """ + def __init__(self, alpha3, country=None, script=None): + self.alpha3 = alpha3 + self.country = country + self.script = script + + def __str__(self): + s = self.alpha3 + if self.country is not None: + s += '-' + self.country + if self.script is not None: + s += '-' + self.script + return s + + +class LanguageReverseError(LanguageError): + """Exception raised by converters when :meth:`~babelfish.converters.LanguageReverseConverter.reverse` fails + + :param string code: code that failed reverse conversion + + """ + def __init__(self, code): + self.code = code + + def __str__(self): + return repr(self.code) + + +class CountryError(Error, AttributeError): + """Base class for all country exceptions in babelfish""" + pass + + +class CountryConvertError(CountryError): + """Exception raised by converters when :meth:`~babelfish.converters.CountryConverter.convert` fails + + :param string alpha2: alpha2 code that failed conversion + + """ + def __init__(self, alpha2): + self.alpha2 = alpha2 + + def __str__(self): + return self.alpha2 + + +class CountryReverseError(CountryError): + """Exception raised by converters when :meth:`~babelfish.converters.CountryReverseConverter.reverse` fails + + :param string code: code that failed reverse conversion + + """ + def __init__(self, code): + self.code = code + + def __str__(self): + return 
repr(self.code)
diff --git a/lib/babelfish/language.py b/lib/babelfish/language.py
new file mode 100644
index 00000000..ffa48ef5
--- /dev/null
+++ b/lib/babelfish/language.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 the BabelFish authors. All rights reserved.
+# Use of this source code is governed by the 3-clause BSD license
+# that can be found in the LICENSE file.
+#
+from __future__ import unicode_literals
+from collections import namedtuple
+from functools import partial
+from pkg_resources import resource_stream  # @UnresolvedImport
+from .converters import ConverterManager
+from .country import Country
+from .exceptions import LanguageConvertError
+from .script import Script
+
+
+#: Set of the alpha3 codes read from ``data/iso-639-3.tab``
+LANGUAGES = set()
+
+#: List of :class:`IsoLanguage` rows read from ``data/iso-639-3.tab``
+LANGUAGE_MATRIX = []
+
+#: The namedtuple used in the :data:`LANGUAGE_MATRIX`
+IsoLanguage = namedtuple('IsoLanguage', ['alpha3', 'alpha3b', 'alpha3t', 'alpha2', 'scope', 'type', 'name', 'comment'])
+
+# Fill LANGUAGES and LANGUAGE_MATRIX at import time. The first line of the
+# table is read and discarded before the rows are parsed.
+f = resource_stream('babelfish', 'data/iso-639-3.tab')
+try:
+    f.readline()
+    for l in f:
+        iso_language = IsoLanguage(*l.decode('utf-8').split('\t'))
+        LANGUAGES.add(iso_language.alpha3)
+        LANGUAGE_MATRIX.append(iso_language)
+finally:
+    # Close the stream even when a row cannot be decoded or unpacked
+    f.close()
+
+
+class LanguageConverterManager(ConverterManager):
+    """:class:`~babelfish.converters.ConverterManager` for language converters"""
+    entry_point = 'babelfish.language_converters'
+    internal_converters = ['alpha2 = babelfish.converters.alpha2:Alpha2Converter',
+                           'alpha3b = babelfish.converters.alpha3b:Alpha3BConverter',
+                           'alpha3t = babelfish.converters.alpha3t:Alpha3TConverter',
+                           'name = babelfish.converters.name:NameConverter',
+                           'scope = babelfish.converters.scope:ScopeConverter',
+                           'type = babelfish.converters.type:LanguageTypeConverter',
+                           'opensubtitles = babelfish.converters.opensubtitles:OpenSubtitlesConverter']
+
+language_converters = LanguageConverterManager()
+
+
+class LanguageMeta(type):
+    """The :class:`Language` metaclass
+
+    Dynamically redirect :meth:`Language.frommycode` to :meth:`Language.fromcode` with the ``mycode`` `converter`
+
+    """
+    def __getattr__(cls, name):
+        # Only called once the regular class attribute lookup has failed
+        if name.startswith('from'):
+            return partial(cls.fromcode, converter=name[4:])
+        # getattr(cls, name) would re-enter this method and recurse forever
+        raise AttributeError('type object %r has no attribute %r' % (cls.__name__, name))
+
+
+class Language(LanguageMeta(str('LanguageBase'), (object,), {})):
+    """A human language
+
+    A human language is composed of a language part following the ISO-639
+    standard and can be country-specific when a :class:`~babelfish.country.Country`
+    is specified.
+
+    The :class:`Language` is extensible with custom converters (see :ref:`custom_converters`)
+
+    :param string language: the language as a 3-letter ISO-639-3 code
+    :param country: the country (if any) as a 2-letter ISO-3166 code or :class:`~babelfish.country.Country` instance
+    :type country: string or :class:`~babelfish.country.Country` or None
+    :param script: the script (if any) as a 4-letter ISO-15924 code or :class:`~babelfish.script.Script` instance
+    :type script: string or :class:`~babelfish.script.Script` or None
+    :param unknown: the unknown language as a three-letters ISO-639-3 code to use as fallback
+    :type unknown: string or None
+    :raise: ValueError if the language could not be recognized and `unknown` is ``None``
+
+    """
+    def __init__(self, language, country=None, script=None, unknown=None):
+        # Fall back to `unknown`, when given, for an unrecognized code
+        if unknown is not None and language not in LANGUAGES:
+            language = unknown
+        if language not in LANGUAGES:
+            raise ValueError('%r is not a valid language' % language)
+
+        #: 3-letter ISO-639-3 language code
+        self.alpha3 = language
+
+        #: :class:`~babelfish.country.Country` of the language or None
+        self.country = None
+        if isinstance(country, Country):
+            self.country = country
+        elif country is not None:
+            self.country = Country(country)
+
+        #: :class:`~babelfish.script.Script` of the language or None
+        self.script = None
+        if isinstance(script, Script):
+            self.script = script
+        elif script is not None:
+            self.script = Script(script)
+
+    @classmethod
+    def fromcode(cls, code, converter):
+        """Create a :class:`Language` by its `code` using `converter` to
+        :meth:`~babelfish.converters.LanguageReverseConverter.reverse` it
+
+        :param string code: the code to reverse
+        :param string converter: name of the :class:`~babelfish.converters.LanguageReverseConverter` to use
+        :return: the corresponding :class:`Language` instance
+        :rtype: :class:`Language`
+
+        """
+        return cls(*language_converters[converter].reverse(code))
+
+    @classmethod
+    def fromietf(cls, ietf):
+        """Create a :class:`Language` from an IETF language code
+
+        The first ``-`` separated subtag is the language: a 2-letter subtag is
+        reversed with the ``alpha2`` converter, anything else is used as the
+        alpha3 code. A following 2-letter subtag sets the country, a subtag of
+        any other length sets the script and has to be the last one.
+
+        :param string ietf: the ietf code
+        :return: the corresponding :class:`Language` instance
+        :rtype: :class:`Language`
+        :raise: ValueError if subtags are left after the script
+
+        """
+        subtags = ietf.split('-')
+        language_subtag = subtags.pop(0).lower()
+        if len(language_subtag) == 2:
+            language = cls.fromalpha2(language_subtag)
+        else:
+            language = cls(language_subtag)
+        while subtags:
+            subtag = subtags.pop(0)
+            if len(subtag) == 2:
+                language.country = Country(subtag.upper())
+            else:
+                language.script = Script(subtag.capitalize())
+            if language.script is not None:
+                if subtags:
+                    raise ValueError('Wrong IETF format. Unmatched subtags: %r' % subtags)
+                break
+        return language
+
+    def __getattr__(self, name):
+        # alpha3, country and script are set in __init__; they only reach this
+        # method on an instance that was never initialized (e.g. one being
+        # rebuilt by copy or pickle), where reading self.alpha3 below would
+        # recurse forever
+        if name in ('alpha3', 'country', 'script'):
+            raise AttributeError(name)
+        # Every other unknown attribute is computed by the converter of that name
+        alpha3 = self.alpha3
+        country = self.country.alpha2 if self.country is not None else None
+        script = self.script.code if self.script is not None else None
+        try:
+            return language_converters[name].convert(alpha3, country, script)
+        except KeyError:
+            raise AttributeError(name)
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __eq__(self, other):
+        # None, plain strings and any other non-Language never compare equal
+        if not isinstance(other, Language):
+            return False
+        return self.alpha3 == other.alpha3 and self.country == other.country and self.script == other.script
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __bool__(self):
+        # Only the 'und' language is falsy
+        return self.alpha3 != 'und'
+    __nonzero__ = __bool__
+
+    def __repr__(self):
+        return '<Language [%s]>' % self
+
+    def __str__(self):
+        try:
+            s = self.alpha2
+        except LanguageConvertError:
+            s = self.alpha3
+        if self.country is not None:
+            s += '-' + str(self.country)
+        if self.script is not None:
+            s += '-' + str(self.script)
+        return s
diff --git a/lib/babelfish/script.py b/lib/babelfish/script.py
new file mode 100644
index 00000000..9d3b26ee
--- /dev/null
+++ b/lib/babelfish/script.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2013 the BabelFish authors. All rights reserved.
+# Use of this source code is governed by the 3-clause BSD license
+# that can be found in the LICENSE file.
+#
+from __future__ import unicode_literals
+from collections import namedtuple
+from pkg_resources import resource_stream  # @UnresolvedImport
+
+
+#: Script code to script name mapping
+SCRIPTS = {}
+
+#: List of scripts in the ISO-15924 as namedtuple of code, number, name, french_name, pva and date
+SCRIPT_MATRIX = []
+
+#: The namedtuple used in the :data:`SCRIPT_MATRIX`
+IsoScript = namedtuple('IsoScript', ['code', 'number', 'name', 'french_name', 'pva', 'date'])
+
+# Fill SCRIPTS and SCRIPT_MATRIX at import time. The first line is read and
+# discarded; blank lines and lines starting with '#' are skipped.
+f = resource_stream('babelfish', 'data/iso15924-utf8-20131012.txt')
+try:
+    f.readline()
+    for l in f:
+        l = l.decode('utf-8').strip()
+        if not l or l.startswith('#'):
+            continue
+        script = IsoScript._make(l.split(';'))
+        SCRIPT_MATRIX.append(script)
+        SCRIPTS[script.code] = script.name
+finally:
+    # Close the stream even when a row cannot be decoded or unpacked
+    f.close()
+
+
+class Script(object):
+    """A human writing system
+
+    A script is represented by a 4-letter code from the ISO-15924 standard
+
+    :param string script: 4-letter ISO-15924 script code
+    :raise: ValueError if `script` is not a key of :data:`SCRIPTS`
+
+    """
+    def __init__(self, script):
+        if script not in SCRIPTS:
+            raise ValueError('%r is not a valid script' % script)
+
+        #: ISO-15924 4-letter script code
+        self.code = script
+
+    @property
+    def name(self):
+        """English name of the script"""
+        return SCRIPTS[self.code]
+
+    def __hash__(self):
+        return hash(self.code)
+
+    def __eq__(self, other):
+        # None (a language without script) and any other non-Script are unequal
+        if not isinstance(other, Script):
+            return False
+        return self.code == other.code
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return '