Added GuessIt library and required libs for it.

We now perform a guessit on the nzbName to extract movie title and year instead of a regex, this is more accurate.
This commit is contained in:
echel0n 2014-04-19 12:28:55 -07:00
parent d26cc388d1
commit c1a1354636
69 changed files with 9263 additions and 38 deletions

18
lib/babelfish/__init__.py Normal file
View file

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
__title__ = 'babelfish'
__version__ = '0.5.1'
__author__ = 'Antoine Bertin'
__license__ = 'BSD'
__copyright__ = 'Copyright 2013 the BabelFish authors'
from .converters import (LanguageConverter, LanguageReverseConverter, LanguageEquivalenceConverter, CountryConverter,
CountryReverseConverter)
from .country import country_converters, COUNTRIES, COUNTRY_MATRIX, Country
from .exceptions import Error, LanguageConvertError, LanguageReverseError, CountryConvertError, CountryReverseError
from .language import language_converters, LANGUAGES, LANGUAGE_MATRIX, Language
from .script import SCRIPTS, SCRIPT_MATRIX, Script

View file

@ -0,0 +1,280 @@
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
import collections
from pkg_resources import iter_entry_points, EntryPoint
from ..exceptions import LanguageConvertError, LanguageReverseError
# from https://github.com/kennethreitz/requests/blob/master/requests/structures.py
class CaseInsensitiveDict(collections.MutableMapping):
"""A case-insensitive ``dict``-like object.
Implements all methods and operations of
``collections.MutableMapping`` as well as dict's ``copy``. Also
provides ``lower_items``.
All keys are expected to be strings. The structure remembers the
case of the last key to be set, and ``iter(instance)``,
``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()``
will contain case-sensitive keys. However, querying and contains
testing is case insensitive:
cid = CaseInsensitiveDict()
cid['English'] = 'eng'
cid['ENGLISH'] == 'eng' # True
list(cid) == ['English'] # True
If the constructor, ``.update``, or equality comparison
operations are given keys that have equal ``.lower()``s, the
behavior is undefined.
"""
def __init__(self, data=None, **kwargs):
self._store = dict()
if data is None:
data = {}
self.update(data, **kwargs)
def __setitem__(self, key, value):
# Use the lowercased key for lookups, but store the actual
# key alongside the value.
self._store[key.lower()] = (key, value)
def __getitem__(self, key):
return self._store[key.lower()][1]
def __delitem__(self, key):
del self._store[key.lower()]
def __iter__(self):
return (casedkey for casedkey, mappedvalue in self._store.values())
def __len__(self):
return len(self._store)
def lower_items(self):
"""Like iteritems(), but with all lowercase keys."""
return (
(lowerkey, keyval[1])
for (lowerkey, keyval)
in self._store.items()
)
def __eq__(self, other):
if isinstance(other, collections.Mapping):
other = CaseInsensitiveDict(other)
else:
return NotImplemented
# Compare insensitively
return dict(self.lower_items()) == dict(other.lower_items())
# Copy is required
def copy(self):
return CaseInsensitiveDict(self._store.values())
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, dict(self.items()))
class LanguageConverter(object):
"""A :class:`LanguageConverter` supports converting an alpha3 language code with an
alpha2 country code and a script code into a custom code
.. attribute:: codes
Set of possible custom codes
"""
def convert(self, alpha3, country=None, script=None):
"""Convert an alpha3 language code with an alpha2 country code and a script code
into a custom code
:param string alpha3: ISO-639-3 language code
:param country: ISO-3166 country code, if any
:type country: string or None
:param script: ISO-15924 script code, if any
:type script: string or None
:return: the corresponding custom code
:rtype: string
:raise: :class:`~babelfish.exceptions.LanguageConvertError`
"""
raise NotImplementedError
class LanguageReverseConverter(LanguageConverter):
"""A :class:`LanguageConverter` able to reverse a custom code into a alpha3
ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code
"""
def reverse(self, code):
"""Reverse a custom code into alpha3, country and script code
:param string code: custom code to reverse
:return: the corresponding alpha3 ISO-639-3 language code, alpha2 ISO-3166-1 country code and ISO-15924 script code
:rtype: tuple
:raise: :class:`~babelfish.exceptions.LanguageReverseError`
"""
raise NotImplementedError
class LanguageEquivalenceConverter(LanguageReverseConverter):
"""A :class:`LanguageEquivalenceConverter` is a utility class that allows you to easily define a
:class:`LanguageReverseConverter` by only specifying the dict from alpha3 to their corresponding symbols.
You must specify the dict of equivalence as a class variable named SYMBOLS.
If you also set the class variable CASE_SENSITIVE to ``True`` then the reverse conversion function will be
case-sensitive (it is case-insensitive by default).
Example::
class MyCodeConverter(babelfish.LanguageEquivalenceConverter):
CASE_SENSITIVE = True
SYMBOLS = {'fra': 'mycode1', 'eng': 'mycode2'}
"""
CASE_SENSITIVE = False
def __init__(self):
self.codes = set()
self.to_symbol = {}
if self.CASE_SENSITIVE:
self.from_symbol = {}
else:
self.from_symbol = CaseInsensitiveDict()
for alpha3, symbol in self.SYMBOLS.items():
self.to_symbol[alpha3] = symbol
self.from_symbol[symbol] = (alpha3, None, None)
self.codes.add(symbol)
def convert(self, alpha3, country=None, script=None):
try:
return self.to_symbol[alpha3]
except KeyError:
raise LanguageConvertError(alpha3, country, script)
def reverse(self, code):
try:
return self.from_symbol[code]
except KeyError:
raise LanguageReverseError(code)
class CountryConverter(object):
"""A :class:`CountryConverter` supports converting an alpha2 country code
into a custom code
.. attribute:: codes
Set of possible custom codes
"""
def convert(self, alpha2):
"""Convert an alpha2 country code into a custom code
:param string alpha2: ISO-3166-1 language code
:return: the corresponding custom code
:rtype: string
:raise: :class:`~babelfish.exceptions.CountryConvertError`
"""
raise NotImplementedError
class CountryReverseConverter(CountryConverter):
"""A :class:`CountryConverter` able to reverse a custom code into a alpha2
ISO-3166-1 country code
"""
def reverse(self, code):
"""Reverse a custom code into alpha2 code
:param string code: custom code to reverse
:return: the corresponding alpha2 ISO-3166-1 country code
:rtype: string
:raise: :class:`~babelfish.exceptions.CountryReverseError`
"""
raise NotImplementedError
class ConverterManager(object):
"""Manager for babelfish converters behaving like a dict with lazy loading
Loading is done in this order:
* Entry point converters
* Registered converters
* Internal converters
.. attribute:: entry_point
The entry point where to look for converters
.. attribute:: internal_converters
Internal converters with entry point syntax
"""
entry_point = ''
internal_converters = []
def __init__(self):
#: Registered converters with entry point syntax
self.registered_converters = []
#: Loaded converters
self.converters = {}
def __getitem__(self, name):
"""Get a converter, lazy loading it if necessary"""
if name in self.converters:
return self.converters[name]
for ep in iter_entry_points(self.entry_point):
if ep.name == name:
self.converters[ep.name] = ep.load()()
return self.converters[ep.name]
for ep in (EntryPoint.parse(c) for c in self.registered_converters + self.internal_converters):
if ep.name == name:
self.converters[ep.name] = ep.load(require=False)()
return self.converters[ep.name]
raise KeyError(name)
def __setitem__(self, name, converter):
"""Load a converter"""
self.converters[name] = converter
def __delitem__(self, name):
"""Unload a converter"""
del self.converters[name]
def __iter__(self):
"""Iterator over loaded converters"""
return iter(self.converters)
def register(self, entry_point):
"""Register a converter
:param string entry_point: converter to register (entry point syntax)
:raise: ValueError if already registered
"""
if entry_point in self.registered_converters:
raise ValueError('Already registered')
self.registered_converters.insert(0, entry_point)
def unregister(self, entry_point):
"""Unregister a converter
:param string entry_point: converter to unregister (entry point syntax)
"""
self.registered_converters.remove(entry_point)
def __contains__(self, name):
return name in self.converters

View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha2Converter(LanguageEquivalenceConverter):
CASE_SENSITIVE = True
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
if iso_language.alpha2:
SYMBOLS[iso_language.alpha3] = iso_language.alpha2

View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha3BConverter(LanguageEquivalenceConverter):
CASE_SENSITIVE = True
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
if iso_language.alpha3b:
SYMBOLS[iso_language.alpha3] = iso_language.alpha3b

View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class Alpha3TConverter(LanguageEquivalenceConverter):
CASE_SENSITIVE = True
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
if iso_language.alpha3t:
SYMBOLS[iso_language.alpha3] = iso_language.alpha3t

View file

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import CountryReverseConverter, CaseInsensitiveDict
from ..country import COUNTRY_MATRIX
from ..exceptions import CountryConvertError, CountryReverseError
class CountryNameConverter(CountryReverseConverter):
def __init__(self):
self.codes = set()
self.to_name = {}
self.from_name = CaseInsensitiveDict()
for country in COUNTRY_MATRIX:
self.codes.add(country.name)
self.to_name[country.alpha2] = country.name
self.from_name[country.name] = country.alpha2
def convert(self, alpha2):
if alpha2 not in self.to_name:
raise CountryConvertError(alpha2)
return self.to_name[alpha2]
def reverse(self, name):
if name not in self.from_name:
raise CountryReverseError(name)
return self.from_name[name]

View file

@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageEquivalenceConverter
from ..language import LANGUAGE_MATRIX
class NameConverter(LanguageEquivalenceConverter):
CASE_SENSITIVE = False
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
if iso_language.name:
SYMBOLS[iso_language.alpha3] = iso_language.name

View file

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageReverseConverter, CaseInsensitiveDict
from ..exceptions import LanguageReverseError
from ..language import language_converters
class OpenSubtitlesConverter(LanguageReverseConverter):
def __init__(self):
self.alpha3b_converter = language_converters['alpha3b']
self.alpha2_converter = language_converters['alpha2']
self.to_opensubtitles = {('por', 'BR'): 'pob', ('gre', None): 'ell', ('srp', None): 'scc', ('srp', 'ME'): 'mne'}
self.from_opensubtitles = CaseInsensitiveDict({'pob': ('por', 'BR'), 'pb': ('por', 'BR'), 'ell': ('ell', None),
'scc': ('srp', None), 'mne': ('srp', 'ME')})
self.codes = (self.alpha2_converter.codes | self.alpha3b_converter.codes | set(['pob', 'pb', 'scc', 'mne']))
def convert(self, alpha3, country=None, script=None):
alpha3b = self.alpha3b_converter.convert(alpha3, country, script)
if (alpha3b, country) in self.to_opensubtitles:
return self.to_opensubtitles[(alpha3b, country)]
return alpha3b
def reverse(self, opensubtitles):
if opensubtitles in self.from_opensubtitles:
return self.from_opensubtitles[opensubtitles]
for conv in [self.alpha3b_converter, self.alpha2_converter]:
try:
return conv.reverse(opensubtitles)
except LanguageReverseError:
pass
raise LanguageReverseError(opensubtitles)

View file

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageConverter
from ..exceptions import LanguageConvertError
from ..language import LANGUAGE_MATRIX
class ScopeConverter(LanguageConverter):
FULLNAME = {'I': 'individual', 'M': 'macrolanguage', 'S': 'special'}
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
SYMBOLS[iso_language.alpha3] = iso_language.scope
codes = set(SYMBOLS.values())
def convert(self, alpha3, country=None, script=None):
if self.SYMBOLS[alpha3] in self.FULLNAME:
return self.FULLNAME[self.SYMBOLS[alpha3]]
raise LanguageConvertError(alpha3, country, script)

View file

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from . import LanguageConverter
from ..exceptions import LanguageConvertError
from ..language import LANGUAGE_MATRIX
class LanguageTypeConverter(LanguageConverter):
FULLNAME = {'A': 'ancient', 'C': 'constructed', 'E': 'extinct', 'H': 'historical', 'L': 'living', 'S': 'special'}
SYMBOLS = {}
for iso_language in LANGUAGE_MATRIX:
SYMBOLS[iso_language.alpha3] = iso_language.type
codes = set(SYMBOLS.values())
def convert(self, alpha3, country=None, script=None):
if self.SYMBOLS[alpha3] in self.FULLNAME:
return self.FULLNAME[self.SYMBOLS[alpha3]]
raise LanguageConvertError(alpha3, country, script)

95
lib/babelfish/country.py Normal file
View file

@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport
from .converters import ConverterManager
COUNTRIES = {}
COUNTRY_MATRIX = []
#: The namedtuple used in the :data:`COUNTRY_MATRIX`
IsoCountry = namedtuple('IsoCountry', ['name', 'alpha2'])
f = resource_stream('babelfish', 'data/iso-3166-1.txt')
f.readline()
for l in f:
iso_country = IsoCountry(*l.decode('utf-8').strip().split(';'))
COUNTRIES[iso_country.alpha2] = iso_country.name
COUNTRY_MATRIX.append(iso_country)
f.close()
class CountryConverterManager(ConverterManager):
""":class:`~babelfish.converters.ConverterManager` for country converters"""
entry_point = 'babelfish.country_converters'
internal_converters = ['name = babelfish.converters.countryname:CountryNameConverter']
country_converters = CountryConverterManager()
class CountryMeta(type):
"""The :class:`Country` metaclass
Dynamically redirect :meth:`Country.frommycode` to :meth:`Country.fromcode` with the ``mycode`` `converter`
"""
def __getattr__(cls, name):
if name.startswith('from'):
return partial(cls.fromcode, converter=name[4:])
return getattr(cls, name)
class Country(CountryMeta(str('CountryBase'), (object,), {})):
"""A country on Earth
A country is represented by a 2-letter code from the ISO-3166 standard
:param string country: 2-letter ISO-3166 country code
"""
def __init__(self, country):
if country not in COUNTRIES:
raise ValueError('%r is not a valid country' % country)
#: ISO-3166 2-letter country code
self.alpha2 = country
@classmethod
def fromcode(cls, code, converter):
"""Create a :class:`Country` by its `code` using `converter` to
:meth:`~babelfish.converters.CountryReverseConverter.reverse` it
:param string code: the code to reverse
:param string converter: name of the :class:`~babelfish.converters.CountryReverseConverter` to use
:return: the corresponding :class:`Country` instance
:rtype: :class:`Country`
"""
return cls(country_converters[converter].reverse(code))
def __getattr__(self, name):
return country_converters[name].convert(self.alpha2)
def __hash__(self):
return hash(self.alpha2)
def __eq__(self, other):
if other is None:
return False
return self.alpha2 == other.alpha2
def __ne__(self, other):
return not self == other
def __repr__(self):
return '<Country [%s]>' % self
def __str__(self):
return self.alpha2

View file

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
class Error(Exception):
"""Base class for all exceptions in babelfish"""
pass
class LanguageError(Error, AttributeError):
"""Base class for all language exceptions in babelfish"""
pass
class LanguageConvertError(LanguageError):
"""Exception raised by converters when :meth:`~babelfish.converters.LanguageConverter.convert` fails
:param string alpha3: alpha3 code that failed conversion
:param country: country code that failed conversion, if any
:type country: string or None
:param script: script code that failed conversion, if any
:type script: string or None
"""
def __init__(self, alpha3, country=None, script=None):
self.alpha3 = alpha3
self.country = country
self.script = script
def __str__(self):
s = self.alpha3
if self.country is not None:
s += '-' + self.country
if self.script is not None:
s += '-' + self.script
return s
class LanguageReverseError(LanguageError):
"""Exception raised by converters when :meth:`~babelfish.converters.LanguageReverseConverter.reverse` fails
:param string code: code that failed reverse conversion
"""
def __init__(self, code):
self.code = code
def __str__(self):
return repr(self.code)
class CountryError(Error, AttributeError):
"""Base class for all country exceptions in babelfish"""
pass
class CountryConvertError(CountryError):
"""Exception raised by converters when :meth:`~babelfish.converters.CountryConverter.convert` fails
:param string alpha2: alpha2 code that failed conversion
"""
def __init__(self, alpha2):
self.alpha2 = alpha2
def __str__(self):
return self.alpha2
class CountryReverseError(CountryError):
"""Exception raised by converters when :meth:`~babelfish.converters.CountryReverseConverter.reverse` fails
:param string code: code that failed reverse conversion
"""
def __init__(self, code):
self.code = code
def __str__(self):
return repr(self.code)

174
lib/babelfish/language.py Normal file
View file

@ -0,0 +1,174 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from functools import partial
from pkg_resources import resource_stream # @UnresolvedImport
from .converters import ConverterManager
from .country import Country
from .exceptions import LanguageConvertError
from .script import Script
LANGUAGES = set()
LANGUAGE_MATRIX = []
#: The namedtuple used in the :data:`LANGUAGE_MATRIX`
IsoLanguage = namedtuple('IsoLanguage', ['alpha3', 'alpha3b', 'alpha3t', 'alpha2', 'scope', 'type', 'name', 'comment'])
f = resource_stream('babelfish', 'data/iso-639-3.tab')
f.readline()
for l in f:
iso_language = IsoLanguage(*l.decode('utf-8').split('\t'))
LANGUAGES.add(iso_language.alpha3)
LANGUAGE_MATRIX.append(iso_language)
f.close()
class LanguageConverterManager(ConverterManager):
""":class:`~babelfish.converters.ConverterManager` for language converters"""
entry_point = 'babelfish.language_converters'
internal_converters = ['alpha2 = babelfish.converters.alpha2:Alpha2Converter',
'alpha3b = babelfish.converters.alpha3b:Alpha3BConverter',
'alpha3t = babelfish.converters.alpha3t:Alpha3TConverter',
'name = babelfish.converters.name:NameConverter',
'scope = babelfish.converters.scope:ScopeConverter',
'type = babelfish.converters.type:LanguageTypeConverter',
'opensubtitles = babelfish.converters.opensubtitles:OpenSubtitlesConverter']
language_converters = LanguageConverterManager()
class LanguageMeta(type):
"""The :class:`Language` metaclass
Dynamically redirect :meth:`Language.frommycode` to :meth:`Language.fromcode` with the ``mycode`` `converter`
"""
def __getattr__(cls, name):
if name.startswith('from'):
return partial(cls.fromcode, converter=name[4:])
return getattr(cls, name)
class Language(LanguageMeta(str('LanguageBase'), (object,), {})):
"""A human language
A human language is composed of a language part following the ISO-639
standard and can be country-specific when a :class:`~babelfish.country.Country`
is specified.
The :class:`Language` is extensible with custom converters (see :ref:`custom_converters`)
:param string language: the language as a 3-letter ISO-639-3 code
:param country: the country (if any) as a 2-letter ISO-3166 code or :class:`~babelfish.country.Country` instance
:type country: string or :class:`~babelfish.country.Country` or None
:param script: the script (if any) as a 4-letter ISO-15924 code or :class:`~babelfish.script.Script` instance
:type script: string or :class:`~babelfish.script.Script` or None
:param unknown: the unknown language as a three-letters ISO-639-3 code to use as fallback
:type unknown: string or None
:raise: ValueError if the language could not be recognized and `unknown` is ``None``
"""
def __init__(self, language, country=None, script=None, unknown=None):
if unknown is not None and language not in LANGUAGES:
language = unknown
if language not in LANGUAGES:
raise ValueError('%r is not a valid language' % language)
self.alpha3 = language
self.country = None
if isinstance(country, Country):
self.country = country
elif country is None:
self.country = None
else:
self.country = Country(country)
self.script = None
if isinstance(script, Script):
self.script = script
elif script is None:
self.script = None
else:
self.script = Script(script)
@classmethod
def fromcode(cls, code, converter):
"""Create a :class:`Language` by its `code` using `converter` to
:meth:`~babelfish.converters.LanguageReverseConverter.reverse` it
:param string code: the code to reverse
:param string converter: name of the :class:`~babelfish.converters.LanguageReverseConverter` to use
:return: the corresponding :class:`Language` instance
:rtype: :class:`Language`
"""
return cls(*language_converters[converter].reverse(code))
@classmethod
def fromietf(cls, ietf):
"""Create a :class:`Language` by from an IETF language code
:param string ietf: the ietf code
:return: the corresponding :class:`Language` instance
:rtype: :class:`Language`
"""
subtags = ietf.split('-')
language_subtag = subtags.pop(0).lower()
if len(language_subtag) == 2:
language = cls.fromalpha2(language_subtag)
else:
language = cls(language_subtag)
while subtags:
subtag = subtags.pop(0)
if len(subtag) == 2:
language.country = Country(subtag.upper())
else:
language.script = Script(subtag.capitalize())
if language.script is not None:
if subtags:
raise ValueError('Wrong IETF format. Unmatched subtags: %r' % subtags)
break
return language
def __getattr__(self, name):
alpha3 = self.alpha3
country = self.country.alpha2 if self.country is not None else None
script = self.script.code if self.script is not None else None
try:
return language_converters[name].convert(alpha3, country, script)
except KeyError:
raise AttributeError(name)
def __hash__(self):
return hash(str(self))
def __eq__(self, other):
if other is None:
return False
return self.alpha3 == other.alpha3 and self.country == other.country and self.script == other.script
def __ne__(self, other):
return not self == other
def __bool__(self):
return self.alpha3 != 'und'
__nonzero__ = __bool__
def __repr__(self):
return '<Language [%s]>' % self
def __str__(self):
try:
s = self.alpha2
except LanguageConvertError:
s = self.alpha3
if self.country is not None:
s += '-' + str(self.country)
if self.script is not None:
s += '-' + str(self.script)
return s

66
lib/babelfish/script.py Normal file
View file

@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
from collections import namedtuple
from pkg_resources import resource_stream # @UnresolvedImport
#: Script code to script name mapping
SCRIPTS = {}
#: List of countries in the ISO-15924 as namedtuple of code, number, name, french_name, pva and date
SCRIPT_MATRIX = []
#: The namedtuple used in the :data:`SCRIPT_MATRIX`
IsoScript = namedtuple('IsoScript', ['code', 'number', 'name', 'french_name', 'pva', 'date'])
f = resource_stream('babelfish', 'data/iso15924-utf8-20131012.txt')
f.readline()
for l in f:
l = l.decode('utf-8').strip()
if not l or l.startswith('#'):
continue
script = IsoScript._make(l.split(';'))
SCRIPT_MATRIX.append(script)
SCRIPTS[script.code] = script.name
f.close()
class Script(object):
"""A human writing system
A script is represented by a 4-letter code from the ISO-15924 standard
:param string script: 4-letter ISO-15924 script code
"""
def __init__(self, script):
if script not in SCRIPTS:
raise ValueError('%r is not a valid script' % script)
#: ISO-15924 4-letter script code
self.code = script
@property
def name(self):
"""English name of the script"""
return SCRIPTS[self.code]
def __hash__(self):
return hash(self.code)
def __eq__(self, other):
return self.code == other.code
def __ne__(self, other):
return not self == other
def __repr__(self):
return '<Script [%s]>' % self
def __str__(self):
return self.code

353
lib/babelfish/tests.py Normal file
View file

@ -0,0 +1,353 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 the BabelFish authors. All rights reserved.
# Use of this source code is governed by the 3-clause BSD license
# that can be found in the LICENSE file.
#
from __future__ import unicode_literals
import re
import sys
from unittest import TestCase, TestSuite, TestLoader, TextTestRunner
from pkg_resources import resource_stream # @UnresolvedImport
from babelfish import (LANGUAGES, Language, Country, Script, language_converters, country_converters,
LanguageReverseConverter, LanguageConvertError, LanguageReverseError, CountryReverseError)
if sys.version_info[:2] <= (2, 6):
_MAX_LENGTH = 80
def safe_repr(obj, short=False):
try:
result = repr(obj)
except Exception:
result = object.__repr__(obj)
if not short or len(result) < _MAX_LENGTH:
return result
return result[:_MAX_LENGTH] + ' [truncated]...'
class _AssertRaisesContext(object):
"""A context manager used to implement TestCase.assertRaises* methods."""
def __init__(self, expected, test_case, expected_regexp=None):
self.expected = expected
self.failureException = test_case.failureException
self.expected_regexp = expected_regexp
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, tb):
if exc_type is None:
try:
exc_name = self.expected.__name__
except AttributeError:
exc_name = str(self.expected)
raise self.failureException(
"{0} not raised".format(exc_name))
if not issubclass(exc_type, self.expected):
# let unexpected exceptions pass through
return False
self.exception = exc_value # store for later retrieval
if self.expected_regexp is None:
return True
expected_regexp = self.expected_regexp
if isinstance(expected_regexp, basestring):
expected_regexp = re.compile(expected_regexp)
if not expected_regexp.search(str(exc_value)):
raise self.failureException('"%s" does not match "%s"' %
(expected_regexp.pattern, str(exc_value)))
return True
class _Py26FixTestCase(object):
def assertIsNone(self, obj, msg=None):
"""Same as self.assertTrue(obj is None), with a nicer default message."""
if obj is not None:
standardMsg = '%s is not None' % (safe_repr(obj),)
self.fail(self._formatMessage(msg, standardMsg))
def assertIsNotNone(self, obj, msg=None):
"""Included for symmetry with assertIsNone."""
if obj is None:
standardMsg = 'unexpectedly None'
self.fail(self._formatMessage(msg, standardMsg))
def assertIn(self, member, container, msg=None):
"""Just like self.assertTrue(a in b), but with a nicer default message."""
if member not in container:
standardMsg = '%s not found in %s' % (safe_repr(member),
safe_repr(container))
self.fail(self._formatMessage(msg, standardMsg))
def assertNotIn(self, member, container, msg=None):
"""Just like self.assertTrue(a not in b), but with a nicer default message."""
if member in container:
standardMsg = '%s unexpectedly found in %s' % (safe_repr(member),
safe_repr(container))
self.fail(self._formatMessage(msg, standardMsg))
def assertIs(self, expr1, expr2, msg=None):
"""Just like self.assertTrue(a is b), but with a nicer default message."""
if expr1 is not expr2:
standardMsg = '%s is not %s' % (safe_repr(expr1),
safe_repr(expr2))
self.fail(self._formatMessage(msg, standardMsg))
def assertIsNot(self, expr1, expr2, msg=None):
"""Just like self.assertTrue(a is not b), but with a nicer default message."""
if expr1 is expr2:
standardMsg = 'unexpectedly identical: %s' % (safe_repr(expr1),)
self.fail(self._formatMessage(msg, standardMsg))
else:
class _Py26FixTestCase(object):
pass
class TestScript(TestCase, _Py26FixTestCase):
def test_wrong_script(self):
self.assertRaises(ValueError, lambda: Script('Azer'))
def test_eq(self):
self.assertEqual(Script('Latn'), Script('Latn'))
def test_ne(self):
self.assertNotEqual(Script('Cyrl'), Script('Latn'))
def test_hash(self):
self.assertEqual(hash(Script('Hira')), hash('Hira'))
class TestCountry(TestCase, _Py26FixTestCase):
def test_wrong_country(self):
self.assertRaises(ValueError, lambda: Country('ZZ'))
def test_eq(self):
self.assertEqual(Country('US'), Country('US'))
def test_ne(self):
self.assertNotEqual(Country('GB'), Country('US'))
self.assertIsNotNone(Country('US'))
def test_hash(self):
self.assertEqual(hash(Country('US')), hash('US'))
def test_converter_name(self):
self.assertEqual(Country('US').name, 'UNITED STATES')
self.assertEqual(Country.fromname('UNITED STATES'), Country('US'))
self.assertEqual(Country.fromcode('UNITED STATES', 'name'), Country('US'))
self.assertRaises(CountryReverseError, lambda: Country.fromname('ZZZZZ'))
self.assertEqual(len(country_converters['name'].codes), 249)
class TestLanguage(TestCase, _Py26FixTestCase):
def test_languages(self):
self.assertEqual(len(LANGUAGES), 7874)
def test_wrong_language(self):
self.assertRaises(ValueError, lambda: Language('zzz'))
def test_unknown_language(self):
self.assertEqual(Language('zzzz', unknown='und'), Language('und'))
def test_converter_alpha2(self):
self.assertEqual(Language('eng').alpha2, 'en')
self.assertEqual(Language.fromalpha2('en'), Language('eng'))
self.assertEqual(Language.fromcode('en', 'alpha2'), Language('eng'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha2('zz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha2)
self.assertEqual(len(language_converters['alpha2'].codes), 184)
def test_converter_alpha3b(self):
self.assertEqual(Language('fra').alpha3b, 'fre')
self.assertEqual(Language.fromalpha3b('fre'), Language('fra'))
self.assertEqual(Language.fromcode('fre', 'alpha3b'), Language('fra'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3b('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3b)
self.assertEqual(len(language_converters['alpha3b'].codes), 418)
def test_converter_alpha3t(self):
self.assertEqual(Language('fra').alpha3t, 'fra')
self.assertEqual(Language.fromalpha3t('fra'), Language('fra'))
self.assertEqual(Language.fromcode('fra', 'alpha3t'), Language('fra'))
self.assertRaises(LanguageReverseError, lambda: Language.fromalpha3t('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').alpha3t)
self.assertEqual(len(language_converters['alpha3t'].codes), 418)
def test_converter_name(self):
self.assertEqual(Language('eng').name, 'English')
self.assertEqual(Language.fromname('English'), Language('eng'))
self.assertEqual(Language.fromcode('English', 'name'), Language('eng'))
self.assertRaises(LanguageReverseError, lambda: Language.fromname('Zzzzzzzzz'))
self.assertEqual(len(language_converters['name'].codes), 7874)
def test_converter_scope(self):
self.assertEqual(language_converters['scope'].codes, set(['I', 'S', 'M']))
self.assertEqual(Language('eng').scope, 'individual')
self.assertEqual(Language('und').scope, 'special')
def test_converter_type(self):
self.assertEqual(language_converters['type'].codes, set(['A', 'C', 'E', 'H', 'L', 'S']))
self.assertEqual(Language('eng').type, 'living')
self.assertEqual(Language('und').type, 'special')
def test_converter_opensubtitles(self):
self.assertEqual(Language('fra').opensubtitles, Language('fra').alpha3b)
self.assertEqual(Language('por', 'BR').opensubtitles, 'pob')
self.assertEqual(Language.fromopensubtitles('fre'), Language('fra'))
self.assertEqual(Language.fromopensubtitles('pob'), Language('por', 'BR'))
self.assertEqual(Language.fromopensubtitles('pb'), Language('por', 'BR'))
# Montenegrin is not recognized as an ISO language (yet?) but for now it is
# unofficially accepted as Serbian from Montenegro
self.assertEqual(Language.fromopensubtitles('mne'), Language('srp', 'ME'))
self.assertEqual(Language.fromcode('pob', 'opensubtitles'), Language('por', 'BR'))
self.assertRaises(LanguageReverseError, lambda: Language.fromopensubtitles('zzz'))
self.assertRaises(LanguageConvertError, lambda: Language('aaa').opensubtitles)
self.assertEqual(len(language_converters['opensubtitles'].codes), 606)
# test with all the LANGUAGES from the opensubtitles api
# downloaded from: http://www.opensubtitles.org/addons/export_languages.php
f = resource_stream('babelfish', 'data/opensubtitles_languages.txt')
f.readline()
for l in f:
idlang, alpha2, _, upload_enabled, web_enabled = l.decode('utf-8').strip().split('\t')
if not int(upload_enabled) and not int(web_enabled):
# do not test LANGUAGES that are too esoteric / not widely available
continue
self.assertEqual(Language.fromopensubtitles(idlang).opensubtitles, idlang)
if alpha2:
self.assertEqual(Language.fromopensubtitles(idlang), Language.fromopensubtitles(alpha2))
f.close()
def test_fromietf_country_script(self):
language = Language.fromietf('fra-FR-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_country_no_script(self):
language = Language.fromietf('fra-FR')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertIsNone(language.script)
def test_fromietf_no_country_no_script(self):
language = Language.fromietf('fra-FR')
self.assertEqual(language.alpha3, 'fra')
self.assertEqual(language.country, Country('FR'))
self.assertIsNone(language.script)
def test_fromietf_no_country_script(self):
language = Language.fromietf('fra-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertIsNone(language.country)
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_alpha2_language(self):
language = Language.fromietf('fr-Latn')
self.assertEqual(language.alpha3, 'fra')
self.assertIsNone(language.country)
self.assertEqual(language.script, Script('Latn'))
def test_fromietf_wrong_language(self):
self.assertRaises(ValueError, lambda: Language.fromietf('xyz-FR'))
def test_fromietf_wrong_country(self):
self.assertRaises(ValueError, lambda: Language.fromietf('fra-YZ'))
def test_fromietf_wrong_script(self):
self.assertRaises(ValueError, lambda: Language.fromietf('fra-FR-Wxyz'))
def test_eq(self):
self.assertEqual(Language('eng'), Language('eng'))
def test_ne(self):
self.assertNotEqual(Language('fra'), Language('eng'))
self.assertIsNotNone(Language('fra'))
def test_nonzero(self):
self.assertFalse(bool(Language('und')))
self.assertTrue(bool(Language('eng')))
def test_language_hasattr(self):
self.assertTrue(hasattr(Language('fra'), 'alpha3'))
self.assertTrue(hasattr(Language('fra'), 'alpha2'))
self.assertFalse(hasattr(Language('bej'), 'alpha2'))
def test_country(self):
self.assertEqual(Language('por', 'BR').country, Country('BR'))
self.assertEqual(Language('eng', Country('US')).country, Country('US'))
def test_eq_with_country(self):
self.assertEqual(Language('eng', 'US'), Language('eng', Country('US')))
def test_ne_with_country(self):
self.assertNotEqual(Language('eng', 'US'), Language('eng', Country('GB')))
def test_script(self):
self.assertEqual(Language('srp', script='Latn').script, Script('Latn'))
self.assertEqual(Language('srp', script=Script('Cyrl')).script, Script('Cyrl'))
def test_eq_with_script(self):
self.assertEqual(Language('srp', script='Latn'), Language('srp', script=Script('Latn')))
def test_ne_with_script(self):
self.assertNotEqual(Language('srp', script='Latn'), Language('srp', script=Script('Cyrl')))
def test_eq_with_country_and_script(self):
self.assertEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Latn')))
def test_ne_with_country_and_script(self):
self.assertNotEqual(Language('srp', 'SR', 'Latn'), Language('srp', Country('SR'), Script('Cyrl')))
def test_hash(self):
self.assertEqual(hash(Language('fra')), hash('fr'))
self.assertEqual(hash(Language('ace')), hash('ace'))
self.assertEqual(hash(Language('por', 'BR')), hash('pt-BR'))
self.assertEqual(hash(Language('srp', script='Cyrl')), hash('sr-Cyrl'))
self.assertEqual(hash(Language('eng', 'US', 'Latn')), hash('en-US-Latn'))
def test_str(self):
self.assertEqual(Language.fromietf(str(Language('eng', 'US', 'Latn'))), Language('eng', 'US', 'Latn'))
self.assertEqual(Language.fromietf(str(Language('fra', 'FR'))), Language('fra', 'FR'))
self.assertEqual(Language.fromietf(str(Language('bel'))), Language('bel'))
def test_register_converter(self):
class TestConverter(LanguageReverseConverter):
def __init__(self):
self.to_test = {'fra': 'test1', 'eng': 'test2'}
self.from_test = {'test1': 'fra', 'test2': 'eng'}
def convert(self, alpha3, country=None, script=None):
if alpha3 not in self.to_test:
raise LanguageConvertError(alpha3, country, script)
return self.to_test[alpha3]
def reverse(self, test):
if test not in self.from_test:
raise LanguageReverseError(test)
return (self.from_test[test], None)
language = Language('fra')
self.assertFalse(hasattr(language, 'test'))
language_converters['test'] = TestConverter()
self.assertTrue(hasattr(language, 'test'))
self.assertIn('test', language_converters)
self.assertEqual(Language('fra').test, 'test1')
self.assertEqual(Language.fromtest('test2').alpha3, 'eng')
del language_converters['test']
self.assertNotIn('test', language_converters)
self.assertRaises(KeyError, lambda: Language.fromtest('test1'))
self.assertRaises(AttributeError, lambda: Language('fra').test)
def suite():
suite = TestSuite()
suite.addTest(TestLoader().loadTestsFromTestCase(TestScript))
suite.addTest(TestLoader().loadTestsFromTestCase(TestCountry))
suite.addTest(TestLoader().loadTestsFromTestCase(TestLanguage))
return suite
if __name__ == '__main__':
TextTestRunner().run(suite())

View file

@ -0,0 +1,249 @@
Afghanistan|AF|AFG|004|ISO 3166-2:AF
Åland Islands|AX|ALA|248|ISO 3166-2:AX
Albania|AL|ALB|008|ISO 3166-2:AL
Algeria|DZ|DZA|012|ISO 3166-2:DZ
American Samoa|AS|ASM|016|ISO 3166-2:AS
Andorra|AD|AND|020|ISO 3166-2:AD
Angola|AO|AGO|024|ISO 3166-2:AO
Anguilla|AI|AIA|660|ISO 3166-2:AI
Antarctica|AQ|ATA|010|ISO 3166-2:AQ
Antigua and Barbuda|AG|ATG|028|ISO 3166-2:AG
Argentina|AR|ARG|032|ISO 3166-2:AR
Armenia|AM|ARM|051|ISO 3166-2:AM
Aruba|AW|ABW|533|ISO 3166-2:AW
Australia|AU|AUS|036|ISO 3166-2:AU
Austria|AT|AUT|040|ISO 3166-2:AT
Azerbaijan|AZ|AZE|031|ISO 3166-2:AZ
Bahamas|BS|BHS|044|ISO 3166-2:BS
Bahrain|BH|BHR|048|ISO 3166-2:BH
Bangladesh|BD|BGD|050|ISO 3166-2:BD
Barbados|BB|BRB|052|ISO 3166-2:BB
Belarus|BY|BLR|112|ISO 3166-2:BY
Belgium|BE|BEL|056|ISO 3166-2:BE
Belize|BZ|BLZ|084|ISO 3166-2:BZ
Benin|BJ|BEN|204|ISO 3166-2:BJ
Bermuda|BM|BMU|060|ISO 3166-2:BM
Bhutan|BT|BTN|064|ISO 3166-2:BT
Bolivia, Plurinational State of|BO|BOL|068|ISO 3166-2:BO
Bonaire, Sint Eustatius and Saba|BQ|BES|535|ISO 3166-2:BQ
Bosnia and Herzegovina|BA|BIH|070|ISO 3166-2:BA
Botswana|BW|BWA|072|ISO 3166-2:BW
Bouvet Island|BV|BVT|074|ISO 3166-2:BV
Brazil|BR|BRA|076|ISO 3166-2:BR
British Indian Ocean Territory|IO|IOT|086|ISO 3166-2:IO
Brunei Darussalam|BN|BRN|096|ISO 3166-2:BN
Bulgaria|BG|BGR|100|ISO 3166-2:BG
Burkina Faso|BF|BFA|854|ISO 3166-2:BF
Burundi|BI|BDI|108|ISO 3166-2:BI
Cambodia|KH|KHM|116|ISO 3166-2:KH
Cameroon|CM|CMR|120|ISO 3166-2:CM
Canada|CA|CAN|124|ISO 3166-2:CA
Cape Verde|CV|CPV|132|ISO 3166-2:CV
Cayman Islands|KY|CYM|136|ISO 3166-2:KY
Central African Republic|CF|CAF|140|ISO 3166-2:CF
Chad|TD|TCD|148|ISO 3166-2:TD
Chile|CL|CHL|152|ISO 3166-2:CL
China|CN|CHN|156|ISO 3166-2:CN
Christmas Island|CX|CXR|162|ISO 3166-2:CX
Cocos (Keeling) Islands|CC|CCK|166|ISO 3166-2:CC
Colombia|CO|COL|170|ISO 3166-2:CO
Comoros|KM|COM|174|ISO 3166-2:KM
Congo|CG|COG|178|ISO 3166-2:CG
Congo, the Democratic Republic of the|CD|COD|180|ISO 3166-2:CD
Cook Islands|CK|COK|184|ISO 3166-2:CK
Costa Rica|CR|CRI|188|ISO 3166-2:CR
Côte d'Ivoire|CI|CIV|384|ISO 3166-2:CI
Croatia|HR|HRV|191|ISO 3166-2:HR
Cuba|CU|CUB|192|ISO 3166-2:CU
Curaçao|CW|CUW|531|ISO 3166-2:CW
Cyprus|CY|CYP|196|ISO 3166-2:CY
Czech Republic|CZ|CZE|203|ISO 3166-2:CZ
Denmark|DK|DNK|208|ISO 3166-2:DK
Djibouti|DJ|DJI|262|ISO 3166-2:DJ
Dominica|DM|DMA|212|ISO 3166-2:DM
Dominican Republic|DO|DOM|214|ISO 3166-2:DO
Ecuador|EC|ECU|218|ISO 3166-2:EC
Egypt|EG|EGY|818|ISO 3166-2:EG
El Salvador|SV|SLV|222|ISO 3166-2:SV
Equatorial Guinea|GQ|GNQ|226|ISO 3166-2:GQ
Eritrea|ER|ERI|232|ISO 3166-2:ER
Estonia|EE|EST|233|ISO 3166-2:EE
Ethiopia|ET|ETH|231|ISO 3166-2:ET
Falkland Islands (Malvinas|FK|FLK|238|ISO 3166-2:FK
Faroe Islands|FO|FRO|234|ISO 3166-2:FO
Fiji|FJ|FJI|242|ISO 3166-2:FJ
Finland|FI|FIN|246|ISO 3166-2:FI
France|FR|FRA|250|ISO 3166-2:FR
French Guiana|GF|GUF|254|ISO 3166-2:GF
French Polynesia|PF|PYF|258|ISO 3166-2:PF
French Southern Territories|TF|ATF|260|ISO 3166-2:TF
Gabon|GA|GAB|266|ISO 3166-2:GA
Gambia|GM|GMB|270|ISO 3166-2:GM
Georgia|GE|GEO|268|ISO 3166-2:GE
Germany|DE|DEU|276|ISO 3166-2:DE
Ghana|GH|GHA|288|ISO 3166-2:GH
Gibraltar|GI|GIB|292|ISO 3166-2:GI
Greece|GR|GRC|300|ISO 3166-2:GR
Greenland|GL|GRL|304|ISO 3166-2:GL
Grenada|GD|GRD|308|ISO 3166-2:GD
Guadeloupe|GP|GLP|312|ISO 3166-2:GP
Guam|GU|GUM|316|ISO 3166-2:GU
Guatemala|GT|GTM|320|ISO 3166-2:GT
Guernsey|GG|GGY|831|ISO 3166-2:GG
Guinea|GN|GIN|324|ISO 3166-2:GN
Guinea-Bissau|GW|GNB|624|ISO 3166-2:GW
Guyana|GY|GUY|328|ISO 3166-2:GY
Haiti|HT|HTI|332|ISO 3166-2:HT
Heard Island and McDonald Islands|HM|HMD|334|ISO 3166-2:HM
Holy See (Vatican City State|VA|VAT|336|ISO 3166-2:VA
Honduras|HN|HND|340|ISO 3166-2:HN
Hong Kong|HK|HKG|344|ISO 3166-2:HK
Hungary|HU|HUN|348|ISO 3166-2:HU
Iceland|IS|ISL|352|ISO 3166-2:IS
India|IN|IND|356|ISO 3166-2:IN
Indonesia|ID|IDN|360|ISO 3166-2:ID
Iran, Islamic Republic of|IR|IRN|364|ISO 3166-2:IR
Iraq|IQ|IRQ|368|ISO 3166-2:IQ
Ireland|IE|IRL|372|ISO 3166-2:IE
Isle of Man|IM|IMN|833|ISO 3166-2:IM
Israel|IL|ISR|376|ISO 3166-2:IL
Italy|IT|ITA|380|ISO 3166-2:IT
Jamaica|JM|JAM|388|ISO 3166-2:JM
Japan|JP|JPN|392|ISO 3166-2:JP
Jersey|JE|JEY|832|ISO 3166-2:JE
Jordan|JO|JOR|400|ISO 3166-2:JO
Kazakhstan|KZ|KAZ|398|ISO 3166-2:KZ
Kenya|KE|KEN|404|ISO 3166-2:KE
Kiribati|KI|KIR|296|ISO 3166-2:KI
Korea, Democratic People's Republic of|KP|PRK|408|ISO 3166-2:KP
Korea, Republic of|KR|KOR|410|ISO 3166-2:KR
Kuwait|KW|KWT|414|ISO 3166-2:KW
Kyrgyzstan|KG|KGZ|417|ISO 3166-2:KG
Lao People's Democratic Republic|LA|LAO|418|ISO 3166-2:LA
Latvia|LV|LVA|428|ISO 3166-2:LV
Lebanon|LB|LBN|422|ISO 3166-2:LB
Lesotho|LS|LSO|426|ISO 3166-2:LS
Liberia|LR|LBR|430|ISO 3166-2:LR
Libya|LY|LBY|434|ISO 3166-2:LY
Liechtenstein|LI|LIE|438|ISO 3166-2:LI
Lithuania|LT|LTU|440|ISO 3166-2:LT
Luxembourg|LU|LUX|442|ISO 3166-2:LU
Macao|MO|MAC|446|ISO 3166-2:MO
Macedonia, the former Yugoslav Republic of|MK|MKD|807|ISO 3166-2:MK
Madagascar|MG|MDG|450|ISO 3166-2:MG
Malawi|MW|MWI|454|ISO 3166-2:MW
Malaysia|MY|MYS|458|ISO 3166-2:MY
Maldives|MV|MDV|462|ISO 3166-2:MV
Mali|ML|MLI|466|ISO 3166-2:ML
Malta|MT|MLT|470|ISO 3166-2:MT
Marshall Islands|MH|MHL|584|ISO 3166-2:MH
Martinique|MQ|MTQ|474|ISO 3166-2:MQ
Mauritania|MR|MRT|478|ISO 3166-2:MR
Mauritius|MU|MUS|480|ISO 3166-2:MU
Mayotte|YT|MYT|175|ISO 3166-2:YT
Mexico|MX|MEX|484|ISO 3166-2:MX
Micronesia, Federated States of|FM|FSM|583|ISO 3166-2:FM
Moldova, Republic of|MD|MDA|498|ISO 3166-2:MD
Monaco|MC|MCO|492|ISO 3166-2:MC
Mongolia|MN|MNG|496|ISO 3166-2:MN
Montenegro|ME|MNE|499|ISO 3166-2:ME
Montserrat|MS|MSR|500|ISO 3166-2:MS
Morocco|MA|MAR|504|ISO 3166-2:MA
Mozambique|MZ|MOZ|508|ISO 3166-2:MZ
Myanmar|MM|MMR|104|ISO 3166-2:MM
Namibia|NA|NAM|516|ISO 3166-2:NA
Nauru|NR|NRU|520|ISO 3166-2:NR
Nepal|NP|NPL|524|ISO 3166-2:NP
Netherlands|NL|NLD|528|ISO 3166-2:NL
New Caledonia|NC|NCL|540|ISO 3166-2:NC
New Zealand|NZ|NZL|554|ISO 3166-2:NZ
Nicaragua|NI|NIC|558|ISO 3166-2:NI
Niger|NE|NER|562|ISO 3166-2:NE
Nigeria|NG|NGA|566|ISO 3166-2:NG
Niue|NU|NIU|570|ISO 3166-2:NU
Norfolk Island|NF|NFK|574|ISO 3166-2:NF
Northern Mariana Islands|MP|MNP|580|ISO 3166-2:MP
Norway|NO|NOR|578|ISO 3166-2:NO
Oman|OM|OMN|512|ISO 3166-2:OM
Pakistan|PK|PAK|586|ISO 3166-2:PK
Palau|PW|PLW|585|ISO 3166-2:PW
Palestinian Territory, Occupied|PS|PSE|275|ISO 3166-2:PS
Panama|PA|PAN|591|ISO 3166-2:PA
Papua New Guinea|PG|PNG|598|ISO 3166-2:PG
Paraguay|PY|PRY|600|ISO 3166-2:PY
Peru|PE|PER|604|ISO 3166-2:PE
Philippines|PH|PHL|608|ISO 3166-2:PH
Pitcairn|PN|PCN|612|ISO 3166-2:PN
Poland|PL|POL|616|ISO 3166-2:PL
Portugal|PT|PRT|620|ISO 3166-2:PT
Puerto Rico|PR|PRI|630|ISO 3166-2:PR
Qatar|QA|QAT|634|ISO 3166-2:QA
Réunion|RE|REU|638|ISO 3166-2:RE
Romania|RO|ROU|642|ISO 3166-2:RO
Russian Federation|RU|RUS|643|ISO 3166-2:RU
Rwanda|RW|RWA|646|ISO 3166-2:RW
Saint Barthélemy|BL|BLM|652|ISO 3166-2:BL
Saint Helena, Ascension and Tristan da Cunha|SH|SHN|654|ISO 3166-2:SH
Saint Kitts and Nevis|KN|KNA|659|ISO 3166-2:KN
Saint Lucia|LC|LCA|662|ISO 3166-2:LC
Saint Martin (French part|MF|MAF|663|ISO 3166-2:MF
Saint Pierre and Miquelon|PM|SPM|666|ISO 3166-2:PM
Saint Vincent and the Grenadines|VC|VCT|670|ISO 3166-2:VC
Samoa|WS|WSM|882|ISO 3166-2:WS
San Marino|SM|SMR|674|ISO 3166-2:SM
Sao Tome and Principe|ST|STP|678|ISO 3166-2:ST
Saudi Arabia|SA|SAU|682|ISO 3166-2:SA
Senegal|SN|SEN|686|ISO 3166-2:SN
Serbia|RS|SRB|688|ISO 3166-2:RS
Seychelles|SC|SYC|690|ISO 3166-2:SC
Sierra Leone|SL|SLE|694|ISO 3166-2:SL
Singapore|SG|SGP|702|ISO 3166-2:SG
Sint Maarten (Dutch part|SX|SXM|534|ISO 3166-2:SX
Slovakia|SK|SVK|703|ISO 3166-2:SK
Slovenia|SI|SVN|705|ISO 3166-2:SI
Solomon Islands|SB|SLB|090|ISO 3166-2:SB
Somalia|SO|SOM|706|ISO 3166-2:SO
South Africa|ZA|ZAF|710|ISO 3166-2:ZA
South Georgia and the South Sandwich Islands|GS|SGS|239|ISO 3166-2:GS
South Sudan|SS|SSD|728|ISO 3166-2:SS
Spain|ES|ESP|724|ISO 3166-2:ES
Sri Lanka|LK|LKA|144|ISO 3166-2:LK
Sudan|SD|SDN|729|ISO 3166-2:SD
Suriname|SR|SUR|740|ISO 3166-2:SR
Svalbard and Jan Mayen|SJ|SJM|744|ISO 3166-2:SJ
Swaziland|SZ|SWZ|748|ISO 3166-2:SZ
Sweden|SE|SWE|752|ISO 3166-2:SE
Switzerland|CH|CHE|756|ISO 3166-2:CH
Syrian Arab Republic|SY|SYR|760|ISO 3166-2:SY
Taiwan, Province of China|TW|TWN|158|ISO 3166-2:TW
Tajikistan|TJ|TJK|762|ISO 3166-2:TJ
Tanzania, United Republic of|TZ|TZA|834|ISO 3166-2:TZ
Thailand|TH|THA|764|ISO 3166-2:TH
Timor-Leste|TL|TLS|626|ISO 3166-2:TL
Togo|TG|TGO|768|ISO 3166-2:TG
Tokelau|TK|TKL|772|ISO 3166-2:TK
Tonga|TO|TON|776|ISO 3166-2:TO
Trinidad and Tobago|TT|TTO|780|ISO 3166-2:TT
Tunisia|TN|TUN|788|ISO 3166-2:TN
Turkey|TR|TUR|792|ISO 3166-2:TR
Turkmenistan|TM|TKM|795|ISO 3166-2:TM
Turks and Caicos Islands|TC|TCA|796|ISO 3166-2:TC
Tuvalu|TV|TUV|798|ISO 3166-2:TV
Uganda|UG|UGA|800|ISO 3166-2:UG
Ukraine|UA|UKR|804|ISO 3166-2:UA
United Arab Emirates|AE|ARE|784|ISO 3166-2:AE
United Kingdom|GB|GBR|826|ISO 3166-2:GB
United States|US|USA|840|ISO 3166-2:US
United States Minor Outlying Islands|UM|UMI|581|ISO 3166-2:UM
Uruguay|UY|URY|858|ISO 3166-2:UY
Uzbekistan|UZ|UZB|860|ISO 3166-2:UZ
Vanuatu|VU|VUT|548|ISO 3166-2:VU
Venezuela, Bolivarian Republic of|VE|VEN|862|ISO 3166-2:VE
Viet Nam|VN|VNM|704|ISO 3166-2:VN
Virgin Islands, British|VG|VGB|092|ISO 3166-2:VG
Virgin Islands, U.S|VI|VIR|850|ISO 3166-2:VI
Wallis and Futuna|WF|WLF|876|ISO 3166-2:WF
Western Sahara|EH|ESH|732|ISO 3166-2:EH
Yemen|YE|YEM|887|ISO 3166-2:YE
Zambia|ZM|ZMB|894|ISO 3166-2:ZM
Zimbabwe|ZW|ZWE|716|ISO 3166-2:ZW

View file

@ -0,0 +1,485 @@
aar||aa|Afar|afar
abk||ab|Abkhazian|abkhaze
ace|||Achinese|aceh
ach|||Acoli|acoli
ada|||Adangme|adangme
ady|||Adyghe; Adygei|adyghé
afa|||Afro-Asiatic languages|afro-asiatiques, langues
afh|||Afrihili|afrihili
afr||af|Afrikaans|afrikaans
ain|||Ainu|aïnou
aka||ak|Akan|akan
akk|||Akkadian|akkadien
alb|sqi|sq|Albanian|albanais
ale|||Aleut|aléoute
alg|||Algonquian languages|algonquines, langues
alt|||Southern Altai|altai du Sud
amh||am|Amharic|amharique
ang|||English, Old (ca.450-1100)|anglo-saxon (ca.450-1100)
anp|||Angika|angika
apa|||Apache languages|apaches, langues
ara||ar|Arabic|arabe
arc|||Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE)|araméen d'empire (700-300 BCE)
arg||an|Aragonese|aragonais
arm|hye|hy|Armenian|arménien
arn|||Mapudungun; Mapuche|mapudungun; mapuche; mapuce
arp|||Arapaho|arapaho
art|||Artificial languages|artificielles, langues
arw|||Arawak|arawak
asm||as|Assamese|assamais
ast|||Asturian; Bable; Leonese; Asturleonese|asturien; bable; léonais; asturoléonais
ath|||Athapascan languages|athapascanes, langues
aus|||Australian languages|australiennes, langues
ava||av|Avaric|avar
ave||ae|Avestan|avestique
awa|||Awadhi|awadhi
aym||ay|Aymara|aymara
aze||az|Azerbaijani|azéri
bad|||Banda languages|banda, langues
bai|||Bamileke languages|bamiléké, langues
bak||ba|Bashkir|bachkir
bal|||Baluchi|baloutchi
bam||bm|Bambara|bambara
ban|||Balinese|balinais
baq|eus|eu|Basque|basque
bas|||Basa|basa
bat|||Baltic languages|baltes, langues
bej|||Beja; Bedawiyet|bedja
bel||be|Belarusian|biélorusse
bem|||Bemba|bemba
ben||bn|Bengali|bengali
ber|||Berber languages|berbères, langues
bho|||Bhojpuri|bhojpuri
bih||bh|Bihari languages|langues biharis
bik|||Bikol|bikol
bin|||Bini; Edo|bini; edo
bis||bi|Bislama|bichlamar
bla|||Siksika|blackfoot
bnt|||Bantu (Other)|bantoues, autres langues
bos||bs|Bosnian|bosniaque
bra|||Braj|braj
bre||br|Breton|breton
btk|||Batak languages|batak, langues
bua|||Buriat|bouriate
bug|||Buginese|bugi
bul||bg|Bulgarian|bulgare
bur|mya|my|Burmese|birman
byn|||Blin; Bilin|blin; bilen
cad|||Caddo|caddo
cai|||Central American Indian languages|amérindiennes de L'Amérique centrale, langues
car|||Galibi Carib|karib; galibi; carib
cat||ca|Catalan; Valencian|catalan; valencien
cau|||Caucasian languages|caucasiennes, langues
ceb|||Cebuano|cebuano
cel|||Celtic languages|celtiques, langues; celtes, langues
cha||ch|Chamorro|chamorro
chb|||Chibcha|chibcha
che||ce|Chechen|tchétchène
chg|||Chagatai|djaghataï
chi|zho|zh|Chinese|chinois
chk|||Chuukese|chuuk
chm|||Mari|mari
chn|||Chinook jargon|chinook, jargon
cho|||Choctaw|choctaw
chp|||Chipewyan; Dene Suline|chipewyan
chr|||Cherokee|cherokee
chu||cu|Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic|slavon d'église; vieux slave; slavon liturgique; vieux bulgare
chv||cv|Chuvash|tchouvache
chy|||Cheyenne|cheyenne
cmc|||Chamic languages|chames, langues
cop|||Coptic|copte
cor||kw|Cornish|cornique
cos||co|Corsican|corse
cpe|||Creoles and pidgins, English based|créoles et pidgins basés sur l'anglais
cpf|||Creoles and pidgins, French-based |créoles et pidgins basés sur le français
cpp|||Creoles and pidgins, Portuguese-based |créoles et pidgins basés sur le portugais
cre||cr|Cree|cree
crh|||Crimean Tatar; Crimean Turkish|tatar de Crimé
crp|||Creoles and pidgins |créoles et pidgins
csb|||Kashubian|kachoube
cus|||Cushitic languages|couchitiques, langues
cze|ces|cs|Czech|tchèque
dak|||Dakota|dakota
dan||da|Danish|danois
dar|||Dargwa|dargwa
day|||Land Dayak languages|dayak, langues
del|||Delaware|delaware
den|||Slave (Athapascan)|esclave (athapascan)
dgr|||Dogrib|dogrib
din|||Dinka|dinka
div||dv|Divehi; Dhivehi; Maldivian|maldivien
doi|||Dogri|dogri
dra|||Dravidian languages|dravidiennes, langues
dsb|||Lower Sorbian|bas-sorabe
dua|||Duala|douala
dum|||Dutch, Middle (ca.1050-1350)|néerlandais moyen (ca. 1050-1350)
dut|nld|nl|Dutch; Flemish|néerlandais; flamand
dyu|||Dyula|dioula
dzo||dz|Dzongkha|dzongkha
efi|||Efik|efik
egy|||Egyptian (Ancient)|égyptien
eka|||Ekajuk|ekajuk
elx|||Elamite|élamite
eng||en|English|anglais
enm|||English, Middle (1100-1500)|anglais moyen (1100-1500)
epo||eo|Esperanto|espéranto
est||et|Estonian|estonien
ewe||ee|Ewe|éwé
ewo|||Ewondo|éwondo
fan|||Fang|fang
fao||fo|Faroese|féroïen
fat|||Fanti|fanti
fij||fj|Fijian|fidjien
fil|||Filipino; Pilipino|filipino; pilipino
fin||fi|Finnish|finnois
fiu|||Finno-Ugrian languages|finno-ougriennes, langues
fon|||Fon|fon
fre|fra|fr|French|français
frm|||French, Middle (ca.1400-1600)|français moyen (1400-1600)
fro|||French, Old (842-ca.1400)|français ancien (842-ca.1400)
frr|||Northern Frisian|frison septentrional
frs|||Eastern Frisian|frison oriental
fry||fy|Western Frisian|frison occidental
ful||ff|Fulah|peul
fur|||Friulian|frioulan
gaa|||Ga|ga
gay|||Gayo|gayo
gba|||Gbaya|gbaya
gem|||Germanic languages|germaniques, langues
geo|kat|ka|Georgian|géorgien
ger|deu|de|German|allemand
gez|||Geez|guèze
gil|||Gilbertese|kiribati
gla||gd|Gaelic; Scottish Gaelic|gaélique; gaélique écossais
gle||ga|Irish|irlandais
glg||gl|Galician|galicien
glv||gv|Manx|manx; mannois
gmh|||German, Middle High (ca.1050-1500)|allemand, moyen haut (ca. 1050-1500)
goh|||German, Old High (ca.750-1050)|allemand, vieux haut (ca. 750-1050)
gon|||Gondi|gond
gor|||Gorontalo|gorontalo
got|||Gothic|gothique
grb|||Grebo|grebo
grc|||Greek, Ancient (to 1453)|grec ancien (jusqu'à 1453)
gre|ell|el|Greek, Modern (1453-)|grec moderne (après 1453)
grn||gn|Guarani|guarani
gsw|||Swiss German; Alemannic; Alsatian|suisse alémanique; alémanique; alsacien
guj||gu|Gujarati|goudjrati
gwi|||Gwich'in|gwich'in
hai|||Haida|haida
hat||ht|Haitian; Haitian Creole|haïtien; créole haïtien
hau||ha|Hausa|haoussa
haw|||Hawaiian|hawaïen
heb||he|Hebrew|hébreu
her||hz|Herero|herero
hil|||Hiligaynon|hiligaynon
him|||Himachali languages; Western Pahari languages|langues himachalis; langues paharis occidentales
hin||hi|Hindi|hindi
hit|||Hittite|hittite
hmn|||Hmong; Mong|hmong
hmo||ho|Hiri Motu|hiri motu
hrv||hr|Croatian|croate
hsb|||Upper Sorbian|haut-sorabe
hun||hu|Hungarian|hongrois
hup|||Hupa|hupa
iba|||Iban|iban
ibo||ig|Igbo|igbo
ice|isl|is|Icelandic|islandais
ido||io|Ido|ido
iii||ii|Sichuan Yi; Nuosu|yi de Sichuan
ijo|||Ijo languages|ijo, langues
iku||iu|Inuktitut|inuktitut
ile||ie|Interlingue; Occidental|interlingue
ilo|||Iloko|ilocano
ina||ia|Interlingua (International Auxiliary Language Association)|interlingua (langue auxiliaire internationale)
inc|||Indic languages|indo-aryennes, langues
ind||id|Indonesian|indonésien
ine|||Indo-European languages|indo-européennes, langues
inh|||Ingush|ingouche
ipk||ik|Inupiaq|inupiaq
ira|||Iranian languages|iraniennes, langues
iro|||Iroquoian languages|iroquoises, langues
ita||it|Italian|italien
jav||jv|Javanese|javanais
jbo|||Lojban|lojban
jpn||ja|Japanese|japonais
jpr|||Judeo-Persian|judéo-persan
jrb|||Judeo-Arabic|judéo-arabe
kaa|||Kara-Kalpak|karakalpak
kab|||Kabyle|kabyle
kac|||Kachin; Jingpho|kachin; jingpho
kal||kl|Kalaallisut; Greenlandic|groenlandais
kam|||Kamba|kamba
kan||kn|Kannada|kannada
kar|||Karen languages|karen, langues
kas||ks|Kashmiri|kashmiri
kau||kr|Kanuri|kanouri
kaw|||Kawi|kawi
kaz||kk|Kazakh|kazakh
kbd|||Kabardian|kabardien
kha|||Khasi|khasi
khi|||Khoisan languages|khoïsan, langues
khm||km|Central Khmer|khmer central
kho|||Khotanese; Sakan|khotanais; sakan
kik||ki|Kikuyu; Gikuyu|kikuyu
kin||rw|Kinyarwanda|rwanda
kir||ky|Kirghiz; Kyrgyz|kirghiz
kmb|||Kimbundu|kimbundu
kok|||Konkani|konkani
kom||kv|Komi|kom
kon||kg|Kongo|kongo
kor||ko|Korean|coréen
kos|||Kosraean|kosrae
kpe|||Kpelle|kpellé
krc|||Karachay-Balkar|karatchai balkar
krl|||Karelian|carélien
kro|||Kru languages|krou, langues
kru|||Kurukh|kurukh
kua||kj|Kuanyama; Kwanyama|kuanyama; kwanyama
kum|||Kumyk|koumyk
kur||ku|Kurdish|kurde
kut|||Kutenai|kutenai
lad|||Ladino|judéo-espagnol
lah|||Lahnda|lahnda
lam|||Lamba|lamba
lao||lo|Lao|lao
lat||la|Latin|latin
lav||lv|Latvian|letton
lez|||Lezghian|lezghien
lim||li|Limburgan; Limburger; Limburgish|limbourgeois
lin||ln|Lingala|lingala
lit||lt|Lithuanian|lituanien
lol|||Mongo|mongo
loz|||Lozi|lozi
ltz||lb|Luxembourgish; Letzeburgesch|luxembourgeois
lua|||Luba-Lulua|luba-lulua
lub||lu|Luba-Katanga|luba-katanga
lug||lg|Ganda|ganda
lui|||Luiseno|luiseno
lun|||Lunda|lunda
luo|||Luo (Kenya and Tanzania)|luo (Kenya et Tanzanie)
lus|||Lushai|lushai
mac|mkd|mk|Macedonian|macédonien
mad|||Madurese|madourais
mag|||Magahi|magahi
mah||mh|Marshallese|marshall
mai|||Maithili|maithili
mak|||Makasar|makassar
mal||ml|Malayalam|malayalam
man|||Mandingo|mandingue
mao|mri|mi|Maori|maori
map|||Austronesian languages|austronésiennes, langues
mar||mr|Marathi|marathe
mas|||Masai|massaï
may|msa|ms|Malay|malais
mdf|||Moksha|moksa
mdr|||Mandar|mandar
men|||Mende|mendé
mga|||Irish, Middle (900-1200)|irlandais moyen (900-1200)
mic|||Mi'kmaq; Micmac|mi'kmaq; micmac
min|||Minangkabau|minangkabau
mis|||Uncoded languages|langues non codées
mkh|||Mon-Khmer languages|môn-khmer, langues
mlg||mg|Malagasy|malgache
mlt||mt|Maltese|maltais
mnc|||Manchu|mandchou
mni|||Manipuri|manipuri
mno|||Manobo languages|manobo, langues
moh|||Mohawk|mohawk
mon||mn|Mongolian|mongol
mos|||Mossi|moré
mul|||Multiple languages|multilingue
mun|||Munda languages|mounda, langues
mus|||Creek|muskogee
mwl|||Mirandese|mirandais
mwr|||Marwari|marvari
myn|||Mayan languages|maya, langues
myv|||Erzya|erza
nah|||Nahuatl languages|nahuatl, langues
nai|||North American Indian languages|nord-amérindiennes, langues
nap|||Neapolitan|napolitain
nau||na|Nauru|nauruan
nav||nv|Navajo; Navaho|navaho
nbl||nr|Ndebele, South; South Ndebele|ndébélé du Sud
nde||nd|Ndebele, North; North Ndebele|ndébélé du Nord
ndo||ng|Ndonga|ndonga
nds|||Low German; Low Saxon; German, Low; Saxon, Low|bas allemand; bas saxon; allemand, bas; saxon, bas
nep||ne|Nepali|népalais
new|||Nepal Bhasa; Newari|nepal bhasa; newari
nia|||Nias|nias
nic|||Niger-Kordofanian languages|nigéro-kordofaniennes, langues
niu|||Niuean|niué
nno||nn|Norwegian Nynorsk; Nynorsk, Norwegian|norvégien nynorsk; nynorsk, norvégien
nob||nb|Bokmål, Norwegian; Norwegian Bokmål|norvégien bokmål
nog|||Nogai|nogaï; nogay
non|||Norse, Old|norrois, vieux
nor||no|Norwegian|norvégien
nqo|||N'Ko|n'ko
nso|||Pedi; Sepedi; Northern Sotho|pedi; sepedi; sotho du Nord
nub|||Nubian languages|nubiennes, langues
nwc|||Classical Newari; Old Newari; Classical Nepal Bhasa|newari classique
nya||ny|Chichewa; Chewa; Nyanja|chichewa; chewa; nyanja
nym|||Nyamwezi|nyamwezi
nyn|||Nyankole|nyankolé
nyo|||Nyoro|nyoro
nzi|||Nzima|nzema
oci||oc|Occitan (post 1500); Provençal|occitan (après 1500); provençal
oji||oj|Ojibwa|ojibwa
ori||or|Oriya|oriya
orm||om|Oromo|galla
osa|||Osage|osage
oss||os|Ossetian; Ossetic|ossète
ota|||Turkish, Ottoman (1500-1928)|turc ottoman (1500-1928)
oto|||Otomian languages|otomi, langues
paa|||Papuan languages|papoues, langues
pag|||Pangasinan|pangasinan
pal|||Pahlavi|pahlavi
pam|||Pampanga; Kapampangan|pampangan
pan||pa|Panjabi; Punjabi|pendjabi
pap|||Papiamento|papiamento
pau|||Palauan|palau
peo|||Persian, Old (ca.600-400 B.C.)|perse, vieux (ca. 600-400 av. J.-C.)
per|fas|fa|Persian|persan
phi|||Philippine languages|philippines, langues
phn|||Phoenician|phénicien
pli||pi|Pali|pali
pol||pl|Polish|polonais
pon|||Pohnpeian|pohnpei
por||pt|Portuguese|portugais
pra|||Prakrit languages|prâkrit, langues
pro|||Provençal, Old (to 1500)|provençal ancien (jusqu'à 1500)
pus||ps|Pushto; Pashto|pachto
qaa-qtz|||Reserved for local use|réservée à l'usage local
que||qu|Quechua|quechua
raj|||Rajasthani|rajasthani
rap|||Rapanui|rapanui
rar|||Rarotongan; Cook Islands Maori|rarotonga; maori des îles Cook
roa|||Romance languages|romanes, langues
roh||rm|Romansh|romanche
rom|||Romany|tsigane
rum|ron|ro|Romanian; Moldavian; Moldovan|roumain; moldave
run||rn|Rundi|rundi
rup|||Aromanian; Arumanian; Macedo-Romanian|aroumain; macédo-roumain
rus||ru|Russian|russe
sad|||Sandawe|sandawe
sag||sg|Sango|sango
sah|||Yakut|iakoute
sai|||South American Indian (Other)|indiennes d'Amérique du Sud, autres langues
sal|||Salishan languages|salishennes, langues
sam|||Samaritan Aramaic|samaritain
san||sa|Sanskrit|sanskrit
sas|||Sasak|sasak
sat|||Santali|santal
scn|||Sicilian|sicilien
sco|||Scots|écossais
sel|||Selkup|selkoupe
sem|||Semitic languages|sémitiques, langues
sga|||Irish, Old (to 900)|irlandais ancien (jusqu'à 900)
sgn|||Sign Languages|langues des signes
shn|||Shan|chan
sid|||Sidamo|sidamo
sin||si|Sinhala; Sinhalese|singhalais
sio|||Siouan languages|sioux, langues
sit|||Sino-Tibetan languages|sino-tibétaines, langues
sla|||Slavic languages|slaves, langues
slo|slk|sk|Slovak|slovaque
slv||sl|Slovenian|slovène
sma|||Southern Sami|sami du Sud
sme||se|Northern Sami|sami du Nord
smi|||Sami languages|sames, langues
smj|||Lule Sami|sami de Lule
smn|||Inari Sami|sami d'Inari
smo||sm|Samoan|samoan
sms|||Skolt Sami|sami skolt
sna||sn|Shona|shona
snd||sd|Sindhi|sindhi
snk|||Soninke|soninké
sog|||Sogdian|sogdien
som||so|Somali|somali
son|||Songhai languages|songhai, langues
sot||st|Sotho, Southern|sotho du Sud
spa||es|Spanish; Castilian|espagnol; castillan
srd||sc|Sardinian|sarde
srn|||Sranan Tongo|sranan tongo
srp||sr|Serbian|serbe
srr|||Serer|sérère
ssa|||Nilo-Saharan languages|nilo-sahariennes, langues
ssw||ss|Swati|swati
suk|||Sukuma|sukuma
sun||su|Sundanese|soundanais
sus|||Susu|soussou
sux|||Sumerian|sumérien
swa||sw|Swahili|swahili
swe||sv|Swedish|suédois
syc|||Classical Syriac|syriaque classique
syr|||Syriac|syriaque
tah||ty|Tahitian|tahitien
tai|||Tai languages|tai, langues
tam||ta|Tamil|tamoul
tat||tt|Tatar|tatar
tel||te|Telugu|télougou
tem|||Timne|temne
ter|||Tereno|tereno
tet|||Tetum|tetum
tgk||tg|Tajik|tadjik
tgl||tl|Tagalog|tagalog
tha||th|Thai|thaï
tib|bod|bo|Tibetan|tibétain
tig|||Tigre|tigré
tir||ti|Tigrinya|tigrigna
tiv|||Tiv|tiv
tkl|||Tokelau|tokelau
tlh|||Klingon; tlhIngan-Hol|klingon
tli|||Tlingit|tlingit
tmh|||Tamashek|tamacheq
tog|||Tonga (Nyasa)|tonga (Nyasa)
ton||to|Tonga (Tonga Islands)|tongan (Îles Tonga)
tpi|||Tok Pisin|tok pisin
tsi|||Tsimshian|tsimshian
tsn||tn|Tswana|tswana
tso||ts|Tsonga|tsonga
tuk||tk|Turkmen|turkmène
tum|||Tumbuka|tumbuka
tup|||Tupi languages|tupi, langues
tur||tr|Turkish|turc
tut|||Altaic languages|altaïques, langues
tvl|||Tuvalu|tuvalu
twi||tw|Twi|twi
tyv|||Tuvinian|touva
udm|||Udmurt|oudmourte
uga|||Ugaritic|ougaritique
uig||ug|Uighur; Uyghur|ouïgour
ukr||uk|Ukrainian|ukrainien
umb|||Umbundu|umbundu
und|||Undetermined|indéterminée
urd||ur|Urdu|ourdou
uzb||uz|Uzbek|ouszbek
vai|||Vai|vaï
ven||ve|Venda|venda
vie||vi|Vietnamese|vietnamien
vol||vo|Volapük|volapük
vot|||Votic|vote
wak|||Wakashan languages|wakashanes, langues
wal|||Walamo|walamo
war|||Waray|waray
was|||Washo|washo
wel|cym|cy|Welsh|gallois
wen|||Sorbian languages|sorabes, langues
wln||wa|Walloon|wallon
wol||wo|Wolof|wolof
xal|||Kalmyk; Oirat|kalmouk; oïrat
xho||xh|Xhosa|xhosa
yao|||Yao|yao
yap|||Yapese|yapois
yid||yi|Yiddish|yiddish
yor||yo|Yoruba|yoruba
ypk|||Yupik languages|yupik, langues
zap|||Zapotec|zapotèque
zbl|||Blissymbols; Blissymbolics; Bliss|symboles Bliss; Bliss
zen|||Zenaga|zenaga
zha||za|Zhuang; Chuang|zhuang; chuang
znd|||Zande languages|zandé, langues
zul||zu|Zulu|zoulou
zun|||Zuni|zuni
zxx|||No linguistic content; Not applicable|pas de contenu linguistique; non applicable
zza|||Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki|zaza; dimili; dimli; kirdki; kirmanjki; zazaki

232
lib/guessit/__init__.py Normal file
View file

@ -0,0 +1,232 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import pkg_resources
from .__version__ import __version__
__all__ = ['Guess', 'Language',
'guess_file_info', 'guess_video_info',
'guess_movie_info', 'guess_episode_info']
# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys
if sys.version_info[0] >= 3: # pragma: no cover
PY2, PY3 = False, True
unicode_text_type = str
native_text_type = str
base_text_type = str
def u(x):
return str(x)
def s(x):
return x
class UnicodeMixin(object):
__str__ = lambda x: x.__unicode__()
import binascii
def to_hex(x):
return binascii.hexlify(x).decode('utf-8')
else: # pragma: no cover
PY2, PY3 = True, False
__all__ = [str(s) for s in __all__] # fix imports for python2
unicode_text_type = unicode
native_text_type = str
base_text_type = basestring
def u(x):
if isinstance(x, str):
return x.decode('utf-8')
if isinstance(x, list):
return [u(s) for s in x]
return unicode(x)
def s(x):
if isinstance(x, unicode):
return x.encode('utf-8')
if isinstance(x, list):
return [s(y) for y in x]
if isinstance(x, tuple):
return tuple(s(y) for y in x)
if isinstance(x, dict):
return dict((s(key), s(value)) for key, value in x.items())
return x
class UnicodeMixin(object):
__str__ = lambda x: unicode(x).encode('utf-8')
def to_hex(x):
return x.encode('hex')
range = xrange
from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string, is_camel, from_camel
import os.path
import logging
import json
log = logging.getLogger(__name__)
class NullHandler(logging.Handler):
def emit(self, record):
pass
# let's be a nicely behaving library
h = NullHandler()
log.addHandler(h)
def _guess_filename(filename, options=None, **kwargs):
mtree = _build_filename_mtree(filename, options=options, **kwargs)
_add_camel_properties(mtree, options=options)
return mtree.matched()
def _build_filename_mtree(filename, options=None, **kwargs):
mtree = IterativeMatcher(filename, options=options, **kwargs)
second_pass_options = mtree.second_pass_options
if second_pass_options:
log.info("Running 2nd pass")
merged_options = dict(options)
merged_options.update(second_pass_options)
mtree = IterativeMatcher(filename, options=merged_options, **kwargs)
return mtree
def _add_camel_properties(mtree, options=None, **kwargs):
prop = 'title' if mtree.matched().get('type') != 'episode' else 'series'
value = mtree.matched().get(prop)
_guess_camel_string(mtree, value, options=options, skip_title=False, **kwargs)
for leaf in mtree.match_tree.unidentified_leaves():
value = leaf.value
_guess_camel_string(mtree, value, options=options, skip_title=True, **kwargs)
def _guess_camel_string(mtree, string, options=None, skip_title=False, **kwargs):
if string and is_camel(string):
log.info('"%s" is camel cased. Try to detect more properties.' % (string,))
uncameled_value = from_camel(string)
camel_tree = _build_filename_mtree(uncameled_value, options=options, name_only=True, skip_title=skip_title, **kwargs)
if len(camel_tree.matched()) > 0:
# Title has changed.
mtree.matched().update(camel_tree.matched())
return True
return False
def guess_file_info(filename, info=None, options=None, **kwargs):
"""info can contain the names of the various plugins, such as 'filename' to
detect filename info, or 'hash_md5' to get the md5 hash of the file.
>>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt')
>>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1'])
>>> g['hash_md5'], g['hash_sha1']
('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c')
"""
info = info or 'filename'
options = options or {}
result = []
hashers = []
# Force unicode as soon as possible
filename = u(filename)
if isinstance(info, base_text_type):
info = [info]
for infotype in info:
if infotype == 'filename':
result.append(_guess_filename(filename, options, **kwargs))
elif infotype == 'hash_mpc':
from guessit.hash_mpc import hash_file
try:
result.append(Guess({infotype: hash_file(filename)},
confidence=1.0))
except Exception as e:
log.warning('Could not compute MPC-style hash because: %s' % e)
elif infotype == 'hash_ed2k':
from guessit.hash_ed2k import hash_file
try:
result.append(Guess({infotype: hash_file(filename)},
confidence=1.0))
except Exception as e:
log.warning('Could not compute ed2k hash because: %s' % e)
elif infotype.startswith('hash_'):
import hashlib
hashname = infotype[5:]
try:
hasher = getattr(hashlib, hashname)()
hashers.append((infotype, hasher))
except AttributeError:
log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)
else:
log.warning('Invalid infotype: %s' % infotype)
# do all the hashes now, but on a single pass
if hashers:
try:
blocksize = 8192
hasherobjs = dict(hashers).values()
with open(filename, 'rb') as f:
chunk = f.read(blocksize)
while chunk:
for hasher in hasherobjs:
hasher.update(chunk)
chunk = f.read(blocksize)
for infotype, hasher in hashers:
result.append(Guess({infotype: hasher.hexdigest()},
confidence=1.0))
except Exception as e:
log.warning('Could not compute hash because: %s' % e)
result = merge_all(result)
return result
def guess_video_info(filename, info=None, options=None, **kwargs):
return guess_file_info(filename, info=info, options=options, type='video', **kwargs)
def guess_movie_info(filename, info=None, options=None, **kwargs):
return guess_file_info(filename, info=info, options=options, type='movie', **kwargs)
def guess_episode_info(filename, info=None, options=None, **kwargs):
return guess_file_info(filename, info=info, options=options, type='episode', **kwargs)

217
lib/guessit/__main__.py Normal file
View file

@ -0,0 +1,217 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
from guessit import PY2, u, guess_file_info
from guessit.options import option_parser
def guess_file(filename, info='filename', options=None, **kwargs):
options = options or {}
filename = u(filename)
print('For:', filename)
guess = guess_file_info(filename, info, options, **kwargs)
if options.get('yaml'):
try:
import yaml
for k, v in guess.items():
if isinstance(v, list) and len(v) == 1:
guess[k] = v[0]
ystr = yaml.safe_dump({filename: dict(guess)}, default_flow_style=False)
i = 0
for yline in ystr.splitlines():
if i == 0:
print("? " + yline[:-1])
elif i == 1:
print(":" + yline[1:])
else:
print(yline)
i = i + 1
return
except ImportError: # pragma: no cover
print('PyYAML not found. Using default output.')
print('GuessIt found:', guess.nice_string(options.get('advanced')))
def _supported_properties():
from guessit.plugins import transformers
all_properties = {}
transformers_properties = []
for transformer in transformers.all_transformers():
supported_properties = transformer.supported_properties()
transformers_properties.append((transformer, supported_properties))
if isinstance(supported_properties, dict):
for property_name, possible_values in supported_properties.items():
current_possible_values = all_properties.get(property_name)
if current_possible_values is None:
current_possible_values = []
all_properties[property_name] = current_possible_values
if possible_values:
current_possible_values.extend(possible_values)
else:
for property_name in supported_properties:
current_possible_values = all_properties.get(property_name)
if current_possible_values is None:
current_possible_values = []
all_properties[property_name] = current_possible_values
return (all_properties, transformers_properties)
def display_transformers():
print('GuessIt transformers:')
_, transformers_properties = _supported_properties()
for transformer, _ in transformers_properties:
print('[@] %s (%s)' % (transformer.name, transformer.priority))
def display_properties(values, transformers):
print('GuessIt properties:')
all_properties, transformers_properties = _supported_properties()
if transformers:
for transformer, properties_list in transformers_properties:
print('[@] %s (%s)' % (transformer.name, transformer.priority))
for property_name in properties_list:
property_values = all_properties.get(property_name)
print(' [+] %s' % (property_name,))
if property_values and values:
_display_property_values(property_name, indent=4)
else:
properties_list = []
properties_list.extend(all_properties.keys())
properties_list.sort()
for property_name in properties_list:
property_values = all_properties.get(property_name)
print(' [+] %s' % (property_name,))
if property_values and values:
_display_property_values(property_name, indent=4)
def _display_property_values(property_name, indent=2):
all_properties, _ = _supported_properties()
property_values = all_properties.get(property_name)
for property_value in property_values:
print(indent * ' ' + '[!] %s' % (property_value,))
def run_demo(episodes=True, movies=True, options=None):
# NOTE: tests should not be added here but rather in the tests/ folder
# this is just intended as a quick example
if episodes:
testeps = ['Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi',
'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi',
'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi',
'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi',
'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi',
'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg',
'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi',
'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi',
'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi'
]
for f in testeps:
print('-' * 80)
guess_file(f, options=options, type='episode')
if movies:
testmovies = ['Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv',
'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi',
'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi',
'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv',
'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv',
'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten
'[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten
'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi',
'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt',
'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv',
'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv',
'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi',
'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi',
'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi',
'Movies/Juno (2007)/Juno KLAXXON.avi',
'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv',
'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt',
'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi',
'testsmewt_bugs/movies/Baraka_Edition_Collector.avi'
]
for f in testmovies:
print('-' * 80)
guess_file(f, options=options, type='movie')
def main(args=None, setup_logging=True):
if setup_logging:
from guessit import slogging
slogging.setupLogging()
if PY2: # pragma: no cover
import codecs
import locale
import sys
# see http://bugs.python.org/issue2128
if os.name == 'nt':
for i, a in enumerate(sys.argv):
sys.argv[i] = a.decode(locale.getpreferredencoding())
# see https://github.com/wackou/guessit/issues/43
# and http://stackoverflow.com/questions/4545661/unicodedecodeerror-when-redirecting-to-file
# Wrap sys.stdout into a StreamWriter to allow writing unicode.
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
if args:
options, args = option_parser.parse_args(args)
else: # pragma: no cover
options, args = option_parser.parse_args()
if options.verbose:
logging.getLogger().setLevel(logging.DEBUG)
help_required = True
if options.properties or options.values:
display_properties(options.values, options.transformers)
help_required = False
elif options.transformers:
display_transformers()
help_required = False
if options.demo:
run_demo(episodes=True, movies=True, options=vars(options))
help_required = False
else:
if args:
help_required = False
for filename in args:
guess_file(filename,
info=options.info.split(','),
options=vars(options)
)
if help_required: # pragma: no cover
option_parser.print_help()
if __name__ == '__main__':
main()

View file

@ -0,0 +1,20 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__version__ = '0.7.1'

615
lib/guessit/containers.py Normal file
View file

@ -0,0 +1,615 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from .patterns import compile_pattern, sep
from . import base_text_type
from .guess import Guess
import types
def _get_span(prop, match):
"""Retrieves span for a match"""
if not prop.global_span and match.re.groups:
start = None
end = None
for i in range(1, match.re.groups + 1):
span = match.span(i)
if start is None or span[0] < start:
start = span[0]
if end is None or span[1] > end:
end = span[1]
return (start, end)
else:
return match.span()
start = span[0]
end = span[1]
def _get_groups(compiled_re):
"""
Retrieves groups from re
:return: list of group names
"""
if compiled_re.groups:
indexgroup = {}
for k, i in compiled_re.groupindex.items():
indexgroup[i] = k
ret = []
for i in range(1, compiled_re.groups + 1):
ret.append(indexgroup.get(i, i))
return ret
else:
return [None]
class NoValidator(object):
def validate(self, prop, string, node, match, entry_start, entry_end):
return True
class DefaultValidator(object):
"""Make sure our match is surrounded by separators, or by another entry"""
def validate(self, prop, string, node, match, entry_start, entry_end):
start, end = _get_span(prop, match)
sep_start = start <= 0 or string[start - 1] in sep
sep_end = end >= len(string) or string[end] in sep
start_by_other = start in entry_end
end_by_other = end in entry_start
if (sep_start or start_by_other) and (sep_end or end_by_other):
return True
return False
class WeakValidator(DefaultValidator):
"""Make sure our match is surrounded by separators and is the first or last element in the string"""
def validate(self, prop, string, node, match, entry_start, entry_end):
if super(WeakValidator, self).validate(prop, string, node, match, entry_start, entry_end):
span = match.span()
start = span[0]
end = span[1]
at_start = True
at_end = True
while start > 0:
start = start - 1
if string[start] not in sep:
at_start = False
break
if at_start:
return True
while end < len(string) - 1:
end = end + 1
if string[end] not in sep:
at_end = False
break
if at_end:
return True
return False
class LeavesValidator(DefaultValidator):
def __init__(self, lambdas=None, previous_lambdas=None, next_lambdas=None, both_side=False, default_=True):
self.previous_lambdas = previous_lambdas if not previous_lambdas is None else []
self.next_lambdas = next_lambdas if not next_lambdas is None else []
if lambdas:
self.previous_lambdas.extend(lambdas)
self.next_lambdas.extend(lambdas)
self.both_side = both_side
self.default_ = default_
"""Make sure our match is surrounded by separators and validates defined lambdas"""
def validate(self, prop, string, node, match, entry_start, entry_end):
if self.default_:
super_ret = super(LeavesValidator, self).validate(prop, string, node, match, entry_start, entry_end)
else:
super_ret = True
if not super_ret:
return False
previous_ = self._validate_previous(prop, string, node, match, entry_start, entry_end)
if previous_ and self.both_side:
return previous_
next_ = self._validate_next(prop, string, node, match, entry_start, entry_end)
if previous_ is None and next_ is None:
return super_ret
if self.both_side:
return previous_ and next_
else:
return previous_ or next_
def _validate_previous(self, prop, string, node, match, entry_start, entry_end):
if self.previous_lambdas:
for leaf in node.root.previous_leaves(node):
for lambda_ in self.previous_lambdas:
ret = self._check_rule(lambda_, leaf)
if not ret is None:
return ret
return False
def _validate_next(self, prop, string, node, match, entry_start, entry_end):
if self.next_lambdas:
for leaf in node.root.next_leaves(node):
for lambda_ in self.next_lambdas:
ret = self._check_rule(lambda_, leaf)
if not ret is None:
return ret
return False
def _check_rule(self, lambda_, previous_leaf):
return lambda_(previous_leaf)
class _Property:
"""Represents a property configuration."""
def __init__(self, keys=None, pattern=None, canonical_form=None, canonical_from_pattern=True, confidence=1.0, enhance=True, global_span=False, validator=DefaultValidator(), formatter=None):
"""
:param keys: Keys of the property (format, screenSize, ...)
:type keys: string
:param canonical_form: Unique value of the property (DVD, 720p, ...)
:type canonical_form: string
:param pattern: Regexp pattern
:type pattern: string
:param confidence: confidence
:type confidence: float
:param enhance: enhance the pattern
:type enhance: boolean
:param global_span: if True, the whole match span will used to create the Guess.
Else, the span from the capturing groups will be used.
:type global_span: boolean
:param validator: Validator to use
:type validator: :class:`DefaultValidator`
:param formatter: Formater to use
:type formatter: function
"""
if isinstance(keys, list):
self.keys = keys
elif isinstance(keys, base_text_type):
self.keys = [keys]
else:
self.keys = []
self.canonical_form = canonical_form
if not pattern is None:
self.pattern = pattern
else:
self.pattern = canonical_form
if self.canonical_form is None and canonical_from_pattern:
self.canonical_form = self.pattern
self.compiled = compile_pattern(self.pattern, enhance=enhance)
for group_name in _get_groups(self.compiled):
if isinstance(group_name, base_text_type) and not group_name in self.keys:
self.keys.append(group_name)
if not self.keys:
raise ValueError("No property key is defined")
self.confidence = confidence
self.global_span = global_span
self.validator = validator
self.formatter = formatter
def format(self, value, group_name=None):
"""Retrieves the final value from re group match value"""
formatter = None
if isinstance(self.formatter, dict):
formatter = self.formatter.get(group_name)
if formatter is None and not group_name is None:
formatter = self.formatter.get(None)
else:
formatter = self.formatter
if isinstance(formatter, types.FunctionType):
return formatter(value)
elif not formatter is None:
return formatter.format(value)
return value
def __repr__(self):
return "%s: %s" % (self.keys, self.canonical_form if self.canonical_form else self.pattern)
class PropertiesContainer(object):
def __init__(self, **kwargs):
self._properties = []
self.default_property_kwargs = kwargs
def unregister_property(self, name, *canonical_forms):
"""Unregister a property canonical forms
If canonical_forms are specified, only those values will be unregistered
:param name: Property name to unregister
:type name: string
:param canonical_forms: Values to unregister
:type canonical_forms: varargs of string
"""
_properties = [prop for prop in self._properties if prop.name == name and (not canonical_forms or prop.canonical_form in canonical_forms)]
def register_property(self, name, *patterns, **property_params):
"""Register property with defined canonical form and patterns.
:param name: name of the property (format, screenSize, ...)
:type name: string
:param patterns: regular expression patterns to register for the property canonical_form
:type patterns: varargs of string
"""
properties = []
for pattern in patterns:
params = dict(self.default_property_kwargs)
params.update(property_params)
if isinstance(pattern, dict):
params.update(pattern)
prop = _Property(name, **params)
else:
prop = _Property(name, pattern, **params)
self._properties.append(prop)
properties.append(prop)
return properties
def register_canonical_properties(self, name, *canonical_forms, **property_params):
"""Register properties from their canonical forms.
:param name: name of the property (releaseGroup, ...)
:type name: string
:param canonical_forms: values of the property ('ESiR', 'WAF', 'SEPTiC', ...)
:type canonical_forms: varargs of strings
"""
properties = []
for canonical_form in canonical_forms:
params = dict(property_params)
params['canonical_form'] = canonical_form
properties.extend(self.register_property(name, canonical_form, **property_params))
return properties
def unregister_all_properties(self):
"""Unregister all defined properties"""
self._properties.clear()
def find_properties(self, string, node, name=None, validate=True, re_match=False, sort=True, multiple=False):
"""Find all distinct properties for given string
If no capturing group is defined in the property, value will be grabbed from the entire match.
If one ore more unnamed capturing group is defined in the property, first capturing group will be used.
If named capturing group are defined in the property, they will be returned as property key.
If validate, found properties will be validated by their defined validator
If re_match, re.match will be used instead of re.search.
if sort, found properties will be sorted from longer match to shorter match.
If multiple is False and multiple values are found for the same property, the more confident one will be returned.
If multiple is False and multiple values are found for the same property and the same confidence, the longer will be returned.
:param string: input string
:type string: string
:param node: current node of the matching tree
:type node: :class:`guessit.matchtree.MatchTree`
:param name: name of property to find
:type name: string
:param re_match: use re.match instead of re.search
:type re_match: bool
:param multiple: Allows multiple property values to be returned
:type multiple: bool
:return: found properties
:rtype: list of tuples (:class:`_Property`, match, list of tuples (property_name, tuple(value_start, value_end)))
:see: `_Property`
:see: `register_property`
:see: `register_canonical_properties`
"""
entry_start = {}
entry_end = {}
entries = []
ret = []
if not string.strip():
return ret
# search all properties
for prop in self.get_properties(name):
match = prop.compiled.match(string) if re_match else prop.compiled.search(string)
if match:
entry = prop, match
entries.append(entry)
if validate:
# compute entries start and ends
for prop, match in entries:
start, end = _get_span(prop, match)
if start not in entry_start:
entry_start[start] = [prop]
else:
entry_start[start].append(prop)
if end not in entry_end:
entry_end[end] = [prop]
else:
entry_end[end].append(prop)
# remove invalid values
while True:
invalid_entries = []
for entry in entries:
prop, match = entry
if not prop.validator.validate(prop, string, node, match, entry_start, entry_end):
invalid_entries.append(entry)
if not invalid_entries:
break
for entry in invalid_entries:
prop, match = entry
entries.remove(entry)
invalid_span = _get_span(prop, match)
start = invalid_span[0]
end = invalid_span[1]
entry_start[start].remove(prop)
if not entry_start.get(start):
del entry_start[start]
entry_end[end].remove(prop)
if not entry_end.get(end):
del entry_end[end]
if multiple:
ret = entries
else:
# keep only best match if multiple values where found
entries_dict = {}
for entry in entries:
for key in prop.keys:
if not key in entries_dict:
entries_dict[key] = []
entries_dict[key].append(entry)
for entries in entries_dict.values():
if multiple:
for entry in entries:
ret.append(entry)
else:
best_ret = {}
best_prop, best_match = None, None
if len(entries) == 1:
best_prop, best_match = entries[0]
else:
for prop, match in entries:
start, end = _get_span(prop, match)
if not best_prop or \
best_prop.confidence < best_prop.confidence or \
best_prop.confidence == best_prop.confidence and \
best_match.span()[1] - best_match.span()[0] < match.span()[1] - match.span()[0]:
best_prop, best_match = prop, match
best_ret[best_prop] = best_match
for prop, match in best_ret.items():
ret.append((prop, match))
if sort:
def _sorting(x):
_, x_match = x
x_start, x_end = x_match.span()
return (x_start - x_end)
ret.sort(key=_sorting)
return ret
def as_guess(self, found_properties, input=None, filter=None, sep_replacement=None, multiple=False, *args, **kwargs):
if filter is None:
filter = lambda property, *args, **kwargs: True
guesses = [] if multiple else None
for property in found_properties:
prop, match = property
first_key = None
for key in prop.keys:
# First property key will be used as base for effective name
if isinstance(key, base_text_type):
if first_key is None:
first_key = key
break
property_name = first_key if first_key else None
span = _get_span(prop, match)
guess = Guess(confidence=prop.confidence, input=input, span=span, prop=property_name)
groups = _get_groups(match.re)
for group_name in groups:
name = group_name if isinstance(group_name, base_text_type) else property_name if property_name not in groups else None
if name:
value = self._effective_prop_value(prop, group_name, input, match.span(group_name) if group_name else match.span(), sep_replacement)
if not value is None:
is_string = isinstance(value, base_text_type)
if not is_string or is_string and value: # Keep non empty strings and other defined objects
if isinstance(value, dict):
for k, v in value.items():
if k is None:
k = name
guess[k] = v
else:
guess[name] = value
if group_name:
guess.metadata(prop).span = match.span(group_name)
if filter(guess):
if multiple:
guesses.append(guess)
else:
return guess
return guesses
def _effective_prop_value(self, prop, group_name, input=None, span=None, sep_replacement=None):
if prop.canonical_form:
return prop.canonical_form
if input is None:
return None
value = input
if not span is None:
value = value[span[0]:span[1]]
value = input[span[0]:span[1]] if input else None
if sep_replacement:
for sep_char in sep:
value = value.replace(sep_char, sep_replacement)
if value:
value = prop.format(value, group_name)
return value
def get_properties(self, name=None, canonical_form=None):
"""Retrieve properties
:return: Properties
:rtype: generator
"""
for prop in self._properties:
if (name is None or name in prop.keys) and (canonical_form is None or prop.canonical_form == canonical_form):
yield prop
def get_supported_properties(self):
supported_properties = {}
for prop in self.get_properties():
for k in prop.keys:
values = supported_properties.get(k)
if not values:
values = set()
supported_properties[k] = values
if prop.canonical_form:
values.add(prop.canonical_form)
return supported_properties
class QualitiesContainer():
def __init__(self):
self._qualities = {}
def register_quality(self, name, canonical_form, rating):
"""Register a quality rating.
:param name: Name of the property
:type name: string
:param canonical_form: Value of the property
:type canonical_form: string
:param rating: Estimated quality rating for the property
:type rating: int
"""
property_qualities = self._qualities.get(name)
if property_qualities is None:
property_qualities = {}
self._qualities[name] = property_qualities
property_qualities[canonical_form] = rating
def unregister_quality(self, name, *canonical_forms):
"""Unregister quality ratings for given property name.
If canonical_forms are specified, only those values will be unregistered
:param name: Name of the property
:type name: string
:param canonical_forms: Value of the property
:type canonical_forms: string
"""
if not canonical_forms:
if name in self._qualities:
del self._qualities[name]
else:
property_qualities = self._qualities.get(name)
if not property_qualities is None:
for property_canonical_form in canonical_forms:
if property_canonical_form in property_qualities:
del property_qualities[property_canonical_form]
if not property_qualities:
del self._qualities[name]
def clear_qualities(self,):
"""Unregister all defined quality ratings.
"""
self._qualities.clear()
def rate_quality(self, guess, *props):
"""Rate the quality of guess.
:param guess: Guess to rate
:type guess: :class:`guessit.guess.Guess`
:param props: Properties to include in the rating. if empty, rating will be performed for all guess properties.
:type props: varargs of string
:return: Quality of the guess. The higher, the better.
:rtype: int
"""
rate = 0
if not props:
props = guess.keys()
for prop in props:
prop_value = guess.get(prop)
prop_qualities = self._qualities.get(prop)
if not prop_value is None and not prop_qualities is None:
rate += prop_qualities.get(prop_value, 0)
return rate
def best_quality_properties(self, props, *guesses):
"""Retrieve the best quality guess, based on given properties
:param props: Properties to include in the rating
:type props: list of strings
:param guesses: Guesses to rate
:type guesses: :class:`guessit.guess.Guess`
:return: Best quality guess from all passed guesses
:rtype: :class:`guessit.guess.Guess`
"""
best_guess = None
best_rate = None
for guess in guesses:
rate = self.rate_quality(guess, *props)
if best_rate is None or best_rate < rate:
best_rate = rate
best_guess = guess
return best_guess
def best_quality(self, *guesses):
"""Retrieve the best quality guess.
:param guesses: Guesses to rate
:type guesses: :class:`guessit.guess.Guess`
:return: Best quality guess from all passed guesses
:rtype: :class:`guessit.guess.Guess`
"""
best_guess = None
best_rate = None
for guess in guesses:
rate = self.rate_quality(guess)
if best_rate is None or best_rate < rate:
best_rate = rate
best_guess = guess
return best_guess

111
lib/guessit/country.py Normal file
View file

@ -0,0 +1,111 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import UnicodeMixin, base_text_type, u
from guessit.fileutils import load_file_in_same_dir
import logging
__all__ = ['Country']
log = logging.getLogger(__name__)
# parsed from http://en.wikipedia.org/wiki/ISO_3166-1
#
# Description of the fields:
# "An English name, an alpha-2 code (when given),
# an alpha-3 code (when given), a numeric code, and an ISO 31666-2 code
# are all separated by pipe (|) characters."
_iso3166_contents = load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt')
country_matrix = [l.strip().split('|')
for l in _iso3166_contents.strip().split('\n')]
country_matrix += [['Unknown', 'un', 'unk', '', ''],
['Latin America', '', 'lat', '', '']
]
country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix)
country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix))
country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matrix))
# add here exceptions / non ISO representations
# Note: remember to put those exceptions in lower-case, they won't work otherwise
country_to_alpha3.update({'latinoamérica': 'lat',
'brazilian': 'bra',
'españa': 'esp',
'uk': 'gbr'
})
country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix)
country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix)
class Country(UnicodeMixin):
"""This class represents a country.
You can initialize it with pretty much anything, as it knows conversion
from ISO-3166 2-letter and 3-letter codes, and an English name.
"""
def __init__(self, country, strict=False):
country = u(country.strip().lower())
self.alpha3 = country_to_alpha3.get(country)
if self.alpha3 is None and strict:
msg = 'The given string "%s" could not be identified as a country'
raise ValueError(msg % country)
if self.alpha3 is None:
self.alpha3 = 'unk'
@property
def alpha2(self):
return country_alpha3_to_alpha2[self.alpha3]
@property
def english_name(self):
return country_alpha3_to_en_name[self.alpha3]
def __hash__(self):
return hash(self.alpha3)
def __eq__(self, other):
if isinstance(other, Country):
return self.alpha3 == other.alpha3
if isinstance(other, base_text_type):
try:
return self == Country(other)
except ValueError:
return False
return False
def __ne__(self, other):
return not self == other
def __unicode__(self):
return self.english_name
def __repr__(self):
return 'Country(%s)' % self.english_name

146
lib/guessit/date.py Normal file
View file

@ -0,0 +1,146 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import datetime
import re
import math
_dsep = r'[-/ \.]'
_date_rexps = [re.compile(
# 20010823
r'[^0-9]' +
r'(?P<year>[0-9]{4})' +
r'(?P<month>[0-9]{2})' +
r'(?P<day>[0-9]{2})' +
r'[^0-9]'),
# 2001-08-23
re.compile(r'[^0-9]' +
r'(?P<year>[0-9]{4})' + _dsep +
r'(?P<month>[0-9]{2})' + _dsep +
r'(?P<day>[0-9]{2})' +
r'[^0-9]'),
# 23-08-2001
re.compile(r'[^0-9]' +
r'(?P<day>[0-9]{2})' + _dsep +
r'(?P<month>[0-9]{2})' + _dsep +
r'(?P<year>[0-9]{4})' +
r'[^0-9]'),
# 23-08-01
re.compile(r'[^0-9]' +
r'(?P<day>[0-9]{2})' + _dsep +
r'(?P<month>[0-9]{2})' + _dsep +
r'(?P<year>[0-9]{2})' +
r'[^0-9]'),
]
def valid_year(year, today=None):
"""Check if number is a valid year"""
if not today:
today = datetime.date.today()
return 1920 < year < today.year + 5
def search_year(string):
"""Looks for year patterns, and if found return the year and group span.
Assumes there are sentinels at the beginning and end of the string that
always allow matching a non-digit delimiting the date.
Note this only looks for valid production years, that is between 1920
and now + 5 years, so for instance 2000 would be returned as a valid
year but 1492 would not.
>>> search_year(' in the year 2000... ')
(2000, (13, 17))
>>> search_year(' they arrived in 1492. ')
(None, None)
"""
match = re.search(r'[^0-9]([0-9]{4})[^0-9]', string)
if match:
year = int(match.group(1))
if valid_year(year):
return (year, match.span(1))
return (None, None)
def search_date(string):
"""Looks for date patterns, and if found return the date and group span.
Assumes there are sentinels at the beginning and end of the string that
always allow matching a non-digit delimiting the date.
Year can be defined on two digit only. It will return the nearest possible
date from today.
>>> search_date(' This happened on 2002-04-22. ')
(datetime.date(2002, 4, 22), (18, 28))
>>> search_date(' And this on 17-06-1998. ')
(datetime.date(1998, 6, 17), (13, 23))
>>> search_date(' no date in here ')
(None, None)
"""
today = datetime.date.today()
for drexp in _date_rexps:
match = re.search(drexp, string)
if match:
d = match.groupdict()
year, month, day = int(d['year']), int(d['month']), int(d['day'])
# years specified as 2 digits should be adjusted here
if year < 100:
if year > (today.year % 100) + 5:
year = 1900 + year
else:
year = 2000 + year
date = None
try:
date = datetime.date(year, month, day)
except ValueError:
try:
date = datetime.date(year, day, month)
except ValueError:
pass
if date is None:
continue
# check date plausibility
if not valid_year(date.year, today=today):
continue
# looks like we have a valid date
# note: span is [+1,-1] because we don't want to include the
# non-digit char
start, end = match.span()
return (date, (start + 1, end - 1))
return None, None

87
lib/guessit/fileutils.py Normal file
View file

@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import s, u
import os.path
import zipfile
import io
def split_path(path):
r"""Splits the given path into the list of folders and the filename (or the
last folder if you gave it a folder path.
If the given path was an absolute path, the first element will always be:
- the '/' root folder on Unix systems
- the drive letter on Windows systems (eg: r'C:\')
- the mount point '\\' on Windows systems (eg: r'\\host\share')
>>> s(split_path('/usr/bin/smewt'))
['/', 'usr', 'bin', 'smewt']
>>> s(split_path('relative_path/to/my_folder/'))
['relative_path', 'to', 'my_folder']
"""
result = []
while True:
head, tail = os.path.split(path)
if not head and not tail:
return result
if not tail and head == path:
# Make sure we won't have an infinite loop.
result = [head] + result
return result
# we just split a directory ending with '/', so tail is empty
if not tail:
path = head
continue
# otherwise, add the last path fragment and keep splitting
result = [tail] + result
path = head
def file_in_same_dir(ref_file, desired_file):
"""Return the path for a file in the same dir as a given reference file.
>>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) == os.path.normpath('~/smewt/smewt.settings')
True
"""
return os.path.join(*(split_path(ref_file)[:-1] + [desired_file]))
def load_file_in_same_dir(ref_file, filename):
"""Load a given file. Works even when the file is contained inside a zip."""
path = split_path(ref_file)[:-1] + [filename]
for i, p in enumerate(path):
if p.endswith('.zip'):
zfilename = os.path.join(*path[:i + 1])
zfile = zipfile.ZipFile(zfilename)
return zfile.read('/'.join(path[i + 1:]))
return u(io.open(os.path.join(*path), encoding='utf-8').read())

452
lib/guessit/guess.py Normal file
View file

@ -0,0 +1,452 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import UnicodeMixin, s, u, base_text_type
import json
import datetime
import logging
log = logging.getLogger(__name__)
class GuessMetadata(object):
"""GuessMetadata contains confidence, an input string, span and related property.
If defined on a property of Guess object, it overrides the object defined as global.
:param parent: The parent metadata, used for undefined properties in self object
:type parent: :class: `GuessMedata`
:param confidence: The confidence (from 0.0 to 1.0)
:type confidence: number
:param input: The input string
:type input: string
:param span: The input string
:type span: tuple (int, int)
:param prop: The found property definition
:type prop: :class `guessit.containers._Property`
"""
def __init__(self, parent=None, confidence=None, input=None, span=None, prop=None, *args, **kwargs):
self.parent = parent
if confidence is None and self.parent is None:
self._confidence = 1.0
else:
self._confidence = confidence
self._input = input
self._span = span
self._prop = prop
@property
def confidence(self):
"""The confidence
:rtype: int
:return: confidence value
"""
return self._confidence if not self._confidence is None else self.parent.confidence if self.parent else None
@confidence.setter
def confidence(self, confidence):
self._confidence = confidence
@property
def input(self):
"""The input
:rtype: string
:return: String used to find this guess value
"""
return self._input if not self._input is None else self.parent.input if self.parent else None
@property
def span(self):
"""The span
:rtype: tuple (int, int)
:return: span of input string used to find this guess value
"""
return self._span if not self._span is None else self.parent.span if self.parent else None
@span.setter
def span(self, span):
"""The span
:rtype: tuple (int, int)
:return: span of input string used to find this guess value
"""
self._span = span
@property
def prop(self):
"""The property
:rtype: :class:`_Property`
:return: The property
"""
return self._prop if not self._prop is None else self.parent.prop if self.parent else None
@property
def raw(self):
"""Return the raw information (original match from the string,
not the cleaned version) associated with the given property name."""
if self.input and self.span:
return self.input[self.span[0]:self.span[1]]
return None
def __repr__(self, *args, **kwargs):
return object.__repr__(self, *args, **kwargs)
def _split_kwargs(**kwargs):
metadata_args = {}
for prop in dir(GuessMetadata):
try:
metadata_args[prop] = kwargs.pop(prop)
except KeyError:
pass
return metadata_args, kwargs
class Guess(UnicodeMixin, dict):
"""A Guess is a dictionary which has an associated confidence for each of
its values.
As it is a subclass of dict, you can use it everywhere you expect a
simple dict."""
def __init__(self, *args, **kwargs):
metadata_kwargs, kwargs = _split_kwargs(**kwargs)
self._global_metadata = GuessMetadata(**metadata_kwargs)
dict.__init__(self, *args, **kwargs)
self._metadata = {}
for prop in self:
self._metadata[prop] = GuessMetadata(parent=self._global_metadata)
def to_dict(self, advanced=False):
"""Return the guess as a dict containing only base types, ie:
where dates, languages, countries, etc. are converted to strings.
if advanced is True, return the data as a json string containing
also the raw information of the properties."""
data = dict(self)
for prop, value in data.items():
if isinstance(value, datetime.date):
data[prop] = value.isoformat()
elif isinstance(value, (UnicodeMixin, base_text_type)):
data[prop] = u(value)
elif isinstance(value, list):
data[prop] = [u(x) for x in value]
if advanced:
metadata = self.metadata(prop)
prop_data = {'value': data[prop]}
if metadata.raw:
prop_data['raw'] = metadata.raw
if metadata.confidence:
prop_data['confidence'] = metadata.confidence
data[prop] = prop_data
return data
def nice_string(self, advanced=False):
"""Return a string with the property names and their values,
that also displays the associated confidence to each property.
FIXME: doc with param"""
if advanced:
data = self.to_dict(advanced)
return json.dumps(data, indent=4)
else:
data = self.to_dict()
parts = json.dumps(data, indent=4).split('\n')
for i, p in enumerate(parts):
if p[:5] != ' "':
continue
prop = p.split('"')[1]
parts[i] = (' [%.2f] "' % self.confidence(prop)) + p[5:]
return '\n'.join(parts)
def __unicode__(self):
return u(self.to_dict())
def metadata(self, prop=None):
"""Return the metadata associated with the given property name
If no property name is given, get the global_metadata
"""
if prop is None:
return self._global_metadata
if not prop in self._metadata:
self._metadata[prop] = GuessMetadata(parent=self._global_metadata)
return self._metadata[prop]
def confidence(self, prop=None):
return self.metadata(prop).confidence
def set_confidence(self, prop, confidence):
self.metadata(prop).confidence = confidence
def raw(self, prop):
return self.metadata(prop).raw
def set(self, prop_name, value, *args, **kwargs):
self[prop_name] = value
self._metadata[prop_name] = GuessMetadata(parent=self._global_metadata, *args, **kwargs)
def update(self, other, confidence=None):
dict.update(self, other)
if isinstance(other, Guess):
for prop in other:
try:
self._metadata[prop] = other._metadata[prop]
except KeyError:
pass
if not confidence is None:
for prop in other:
self.set_confidence(prop, confidence)
def update_highest_confidence(self, other):
"""Update this guess with the values from the given one. In case
there is property present in both, only the one with the highest one
is kept."""
if not isinstance(other, Guess):
raise ValueError('Can only call this function on Guess instances')
for prop in other:
if prop in self and self.metadata(prop).confidence >= other.metadata(prop).confidence:
continue
self[prop] = other[prop]
self._metadata[prop] = other.metadata(prop)
def choose_int(g1, g2):
"""Function used by merge_similar_guesses to choose between 2 possible
properties when they are integers."""
v1, c1 = g1 # value, confidence
v2, c2 = g2
if (v1 == v2):
return (v1, 1 - (1 - c1) * (1 - c2))
else:
if c1 > c2:
return (v1, c1 - c2)
else:
return (v2, c2 - c1)
def choose_string(g1, g2):
"""Function used by merge_similar_guesses to choose between 2 possible
properties when they are strings.
If the 2 strings are similar, or one is contained in the other, the latter is returned
with an increased confidence.
If the 2 strings are dissimilar, the one with the higher confidence is returned, with
a weaker confidence.
Note that here, 'similar' means that 2 strings are either equal, or that they
differ very little, such as one string being the other one with the 'the' word
prepended to it.
>>> s(choose_string(('Hello', 0.75), ('World', 0.5)))
('Hello', 0.25)
>>> s(choose_string(('Hello', 0.5), ('hello', 0.5)))
('Hello', 0.75)
>>> s(choose_string(('Hello', 0.4), ('Hello World', 0.4)))
('Hello', 0.64)
>>> s(choose_string(('simpsons', 0.5), ('The Simpsons', 0.5)))
('The Simpsons', 0.75)
"""
v1, c1 = g1 # value, confidence
v2, c2 = g2
if not v1:
return g2
elif not v2:
return g1
v1, v2 = v1.strip(), v2.strip()
v1l, v2l = v1.lower(), v2.lower()
combined_prob = 1 - (1 - c1) * (1 - c2)
if v1l == v2l:
return (v1, combined_prob)
# check for common patterns
elif v1l == 'the ' + v2l:
return (v1, combined_prob)
elif v2l == 'the ' + v1l:
return (v2, combined_prob)
# if one string is contained in the other, return the shortest one
elif v2l in v1l:
return (v2, combined_prob)
elif v1l in v2l:
return (v1, combined_prob)
# in case of conflict, return the one with highest confidence
else:
if c1 > c2:
return (v1, c1 - c2)
else:
return (v2, c2 - c1)
def _merge_similar_guesses_nocheck(guesses, prop, choose):
"""Take a list of guesses and merge those which have the same properties,
increasing or decreasing the confidence depending on whether their values
are similar.
This function assumes there are at least 2 valid guesses."""
similar = [guess for guess in guesses if prop in guess]
g1, g2 = similar[0], similar[1]
other_props = set(g1) & set(g2) - set([prop])
if other_props:
log.debug('guess 1: %s' % g1)
log.debug('guess 2: %s' % g2)
for prop in other_props:
if g1[prop] != g2[prop]:
log.warning('both guesses to be merged have more than one '
'different property in common, bailing out...')
return
# merge all props of s2 into s1, updating the confidence for the
# considered property
v1, v2 = g1[prop], g2[prop]
c1, c2 = g1.confidence(prop), g2.confidence(prop)
new_value, new_confidence = choose((v1, c1), (v2, c2))
if new_confidence >= c1:
msg = "Updating matching property '%s' with confidence %.2f"
else:
msg = "Updating non-matching property '%s' with confidence %.2f"
log.debug(msg % (prop, new_confidence))
g2[prop] = new_value
g2.set_confidence(prop, new_confidence)
g1.update(g2)
guesses.remove(g2)
def merge_similar_guesses(guesses, prop, choose):
"""Take a list of guesses and merge those which have the same properties,
increasing or decreasing the confidence depending on whether their values
are similar."""
similar = [guess for guess in guesses if prop in guess]
if len(similar) < 2:
# nothing to merge
return
if len(similar) == 2:
_merge_similar_guesses_nocheck(guesses, prop, choose)
if len(similar) > 2:
log.debug('complex merge, trying our best...')
before = len(guesses)
_merge_similar_guesses_nocheck(guesses, prop, choose)
after = len(guesses)
if after < before:
# recurse only when the previous call actually did something,
# otherwise we end up in an infinite loop
merge_similar_guesses(guesses, prop, choose)
def merge_all(guesses, append=None):
"""Merge all the guesses in a single result, remove very unlikely values,
and return it.
You can specify a list of properties that should be appended into a list
instead of being merged.
>>> s(merge_all([ Guess({'season': 2}, confidence=0.6),
... Guess({'episodeNumber': 13}, confidence=0.8) ])
... ) == {'season': 2, 'episodeNumber': 13}
True
>>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02),
... Guess({'season': 1}, confidence=0.2) ])
... ) == {'season': 1}
True
>>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8),
... Guess({'releaseGroup': '2HD'}, confidence=0.8) ],
... append=['other'])
... ) == {'releaseGroup': '2HD', 'other': ['PROPER']}
True
"""
result = Guess()
if not guesses:
return result
if append is None:
append = []
for g in guesses:
# first append our appendable properties
for prop in append:
if prop in g:
result.set(prop, result.get(prop, []) + [g[prop]],
# TODO: what to do with confidence here? maybe an
# arithmetic mean...
confidence=g.metadata(prop).confidence,
input=g.metadata(prop).input,
span=g.metadata(prop).span,
prop=g.metadata(prop).prop)
del g[prop]
# then merge the remaining ones
dups = set(result) & set(g)
if dups:
log.warning('duplicate properties %s in merged result...' % [(result[p], g[p]) for p in dups])
result.update_highest_confidence(g)
# delete very unlikely values
for p in list(result.keys()):
if result.confidence(p) < 0.05:
del result[p]
# make sure our appendable properties contain unique values
for prop in append:
try:
value = result[prop]
if isinstance(value, list):
result[prop] = list(set(value))
else:
result[prop] = [value]
except KeyError:
pass
return result

67
lib/guessit/hash_ed2k.py Normal file
View file

@ -0,0 +1,67 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import s, to_hex
import hashlib
import os.path
def hash_file(filename):
"""Returns the ed2k hash of a given file.
>>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt')
>>> s(hash_file(testfile))
'ed2k://|file|dummy.srt|59|41F58B913AB3973F593BEBA8B8DF6510|/'
"""
return 'ed2k://|file|%s|%d|%s|/' % (os.path.basename(filename),
os.path.getsize(filename),
hash_filehash(filename).upper())
def hash_filehash(filename):
"""Returns the ed2k hash of a given file.
This function is taken from:
http://www.radicand.org/blog/orz/2010/2/21/edonkey2000-hash-in-python/
"""
md4 = hashlib.new('md4').copy
def gen(f):
while True:
x = f.read(9728000)
if x:
yield x
else:
return
def md4_hash(data):
m = md4()
m.update(data)
return m
with open(filename, 'rb') as f:
a = gen(f)
hashes = [md4_hash(data).digest() for data in a]
if len(hashes) == 1:
return to_hex(hashes[0])
else:
return md4_hash(reduce(lambda a, d: a + d, hashes, "")).hexd

58
lib/guessit/hash_mpc.py Normal file
View file

@ -0,0 +1,58 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import struct
import os
def hash_file(filename):
"""This function is taken from:
http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
and is licensed under the GPL."""
longlongformat = b'q' # long long
bytesize = struct.calcsize(longlongformat)
f = open(filename, "rb")
filesize = os.path.getsize(filename)
hash_value = filesize
if filesize < 65536 * 2:
raise Exception("SizeError: size is %d, should be > 132K..." % filesize)
for x in range(int(65536 / bytesize)):
buf = f.read(bytesize)
(l_value,) = struct.unpack(longlongformat, buf)
hash_value += l_value
hash_value = hash_value & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
f.seek(max(0, filesize - 65536), 0)
for x in range(int(65536 / bytesize)):
buf = f.read(bytesize)
(l_value,) = struct.unpack(longlongformat, buf)
hash_value += l_value
hash_value = hash_value & 0xFFFFFFFFFFFFFFFF
f.close()
return "%016x" % hash_value

401
lib/guessit/language.py Normal file
View file

@ -0,0 +1,401 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import UnicodeMixin, base_text_type, u
from guessit.textutils import find_words
from babelfish import Language
import babelfish
import re
import logging
from guessit.guess import Guess
__all__ = ['Language', 'UNDETERMINED',
'search_language', 'guess_language']
log = logging.getLogger(__name__)
UNDETERMINED = babelfish.Language('und')
SYN = {('und', None): ['unknown', 'inconnu', 'unk', 'un'],
('ell', None): ['gr', 'greek'],
('spa', None): ['esp', 'español'],
('fra', None): ['français', 'vf', 'vff', 'vfi'],
('swe', None): ['se'],
('por', 'BR'): ['po', 'pb', 'pob', 'br', 'brazilian'],
('cat', None): ['català'],
('ces', None): ['cz'],
('ukr', None): ['ua'],
('zho', None): ['cn'],
('jpn', None): ['jp'],
('hrv', None): ['scr'],
('mul', None): ['multi', 'dl'], # http://scenelingo.wordpress.com/2009/03/24/what-does-dl-mean/
}
class GuessitConverter(babelfish.LanguageReverseConverter):
_with_country_regexp = re.compile('(.*)\((.*)\)')
_with_country_regexp2 = re.compile('(.*)-(.*)')
def __init__(self):
self.guessit_exceptions = {}
for (alpha3, country), synlist in SYN.items():
for syn in synlist:
self.guessit_exceptions[syn.lower()] = (alpha3, country, None)
@property
def codes(self):
return (babelfish.language_converters['alpha3b'].codes |
babelfish.language_converters['alpha2'].codes |
babelfish.language_converters['name'].codes |
babelfish.language_converters['opensubtitles'].codes |
babelfish.country_converters['name'].codes |
frozenset(self.guessit_exceptions.keys()))
def convert(self, alpha3, country=None, script=None):
return str(babelfish.Language(alpha3, country, script))
def reverse(self, name):
with_country = (GuessitConverter._with_country_regexp.match(name) or
GuessitConverter._with_country_regexp2.match(name))
if with_country:
lang = babelfish.Language.fromguessit(with_country.group(1).strip())
lang.country = babelfish.Country.fromguessit(with_country.group(2).strip())
return (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None)
# exceptions come first, as they need to override a potential match
# with any of the other guessers
try:
return self.guessit_exceptions[name.lower()]
except KeyError:
pass
for conv in [babelfish.Language,
babelfish.Language.fromalpha3b,
babelfish.Language.fromalpha2,
babelfish.Language.fromname,
babelfish.Language.fromopensubtitles]:
try:
c = conv(name)
return c.alpha3, c.country, c.script
except (ValueError, babelfish.LanguageReverseError):
pass
raise babelfish.LanguageReverseError(name)
babelfish.language_converters['guessit'] = GuessitConverter()
COUNTRIES_SYN = {'ES': ['españa'],
'GB': ['UK'],
'BR': ['brazilian', 'bra'],
# FIXME: this one is a bit of a stretch, not sure how to do
# it properly, though...
'MX': ['Latinoamérica', 'latin america']
}
class GuessitCountryConverter(babelfish.CountryReverseConverter):
def __init__(self):
self.guessit_exceptions = {}
for alpha2, synlist in COUNTRIES_SYN.items():
for syn in synlist:
self.guessit_exceptions[syn.lower()] = alpha2
@property
def codes(self):
return (babelfish.country_converters['name'].codes |
frozenset(babelfish.COUNTRIES.values()) |
frozenset(self.guessit_exceptions.keys()))
def convert(self, alpha2):
return str(babelfish.Country(alpha2))
def reverse(self, name):
# exceptions come first, as they need to override a potential match
# with any of the other guessers
try:
return self.guessit_exceptions[name.lower()]
except KeyError:
pass
try:
return babelfish.Country(name.upper()).alpha2
except ValueError:
pass
for conv in [babelfish.Country.fromname]:
try:
return conv(name).alpha2
except babelfish.CountryReverseError:
pass
raise babelfish.CountryReverseError(name)
babelfish.country_converters['guessit'] = GuessitCountryConverter()
class Language(UnicodeMixin):
"""This class represents a human language.
You can initialize it with pretty much anything, as it knows conversion
from ISO-639 2-letter and 3-letter codes, English and French names.
You can also distinguish languages for specific countries, such as
Portuguese and Brazilian Portuguese.
There are various properties on the language object that give you the
representation of the language for a specific usage, such as .alpha3
to get the ISO 3-letter code, or .opensubtitles to get the OpenSubtitles
language code.
>>> Language('fr')
Language(French)
>>> (Language('eng').english_name) == 'English'
True
>>> (Language('pt(br)').country.name) == 'BRAZIL'
True
>>> (Language('zz', strict=False).english_name) == 'Undetermined'
True
>>> (Language('pt(br)').opensubtitles) == 'pob'
True
"""
def __init__(self, language, country=None, strict=False):
language = u(language.strip().lower())
country = babelfish.Country(country.upper()) if country else None
try:
self.lang = babelfish.Language.fromguessit(language)
# user given country overrides guessed one
if country:
self.lang.country = country
except babelfish.LanguageReverseError:
msg = 'The given string "%s" could not be identified as a language' % language
if strict:
raise ValueError(msg)
log.debug(msg)
self.lang = UNDETERMINED
@property
def country(self):
return self.lang.country
@property
def alpha2(self):
return self.lang.alpha2
@property
def alpha3(self):
return self.lang.alpha3
@property
def alpha3term(self):
return self.lang.alpha3b
@property
def english_name(self):
return self.lang.name
@property
def opensubtitles(self):
return self.lang.opensubtitles
@property
def tmdb(self):
if self.country:
return '%s-%s' % (self.alpha2, self.country.alpha2)
return self.alpha2
def __hash__(self):
return hash(self.lang)
def __eq__(self, other):
if isinstance(other, Language):
# in Guessit, languages are considered equal if their main languages are equal
return self.alpha3 == other.alpha3
if isinstance(other, base_text_type):
try:
return self == Language(other)
except ValueError:
return False
return False
def __ne__(self, other):
return not self == other
def __bool__(self):
return self.lang != UNDETERMINED
__nonzero__ = __bool__
def __unicode__(self):
if self.lang.country:
return '%s(%s)' % (self.english_name, self.country.alpha2)
else:
return self.english_name
def __repr__(self):
if self.lang.country:
return 'Language(%s, country=%s)' % (self.english_name, self.lang.country)
else:
return 'Language(%s)' % self.english_name
# list of common words which could be interpreted as languages, but which
# are far too common to be able to say they represent a language in the
# middle of a string (where they most likely carry their commmon meaning)
LNG_COMMON_WORDS = frozenset([
# english words
'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
'fry', 'cop', 'zen', 'gay', 'fat', 'one', 'cherokee', 'got', 'an', 'as',
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi',
# french words
'bas', 'de', 'le', 'son', 'ne', 'ca', 'ce', 'et', 'que',
'mal', 'est', 'vol', 'or', 'mon', 'se',
# spanish words
'la', 'el', 'del', 'por', 'mar',
# other
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
'vi', 'ben', 'da', 'lt', 'ch',
# new from babelfish
'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and',
'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy',
'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur',
'fer', 'fun', 'two', 'big', 'psy', 'air',
# release groups
'bs' # Bosnian
])
subtitle_prefixes = ['sub', 'subs', 'st', 'vost', 'subforced', 'fansub', 'hardsub']
subtitle_suffixes = ['subforced', 'fansub', 'hardsub']
lang_prefixes = ['true']
def find_possible_languages(string):
"""Find possible languages in the string
:return: list of tuple (property, Language, lang_word, word)
"""
words = find_words(string)
valid_words = []
for word in words:
lang_word = word.lower()
key = 'language'
for prefix in subtitle_prefixes:
if lang_word.startswith(prefix):
lang_word = lang_word[len(prefix):]
key = 'subtitleLanguage'
for suffix in subtitle_suffixes:
if lang_word.endswith(suffix):
lang_word = lang_word[:len(suffix)]
key = 'subtitleLanguage'
for prefix in lang_prefixes:
if lang_word.startswith(prefix):
lang_word = lang_word[len(prefix):]
if not lang_word in LNG_COMMON_WORDS:
try:
lang = Language(lang_word)
# Keep language with alpha2 equilavent. Others are probably an uncommon language.
if lang == 'mul' or hasattr(lang, 'alpha2'):
valid_words.append((key, lang, lang_word, word))
except babelfish.Error:
pass
return valid_words
def search_language(string, lang_filter=None):
"""Looks for language patterns, and if found return the language object,
its group span and an associated confidence.
you can specify a list of allowed languages using the lang_filter argument,
as in lang_filter = [ 'fr', 'eng', 'spanish' ]
>>> search_language('movie [en].avi')['language']
Language(English)
>>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es'])
"""
if lang_filter:
lang_filter = set(babelfish.Language.fromguessit(lang) for lang in lang_filter)
confidence = 1.0 # for all of them
for prop, language, lang, word in find_possible_languages(string):
pos = string.find(word)
end = pos + len(word)
if lang_filter and language not in lang_filter:
continue
# only allow those languages that have a 2-letter code, those that
# don't are too esoteric and probably false matches
#if language.lang not in lng3_to_lng2:
# continue
# confidence depends on alpha2, alpha3, english name, ...
if len(lang) == 2:
confidence = 0.8
elif len(lang) == 3:
confidence = 0.9
elif prop == 'subtitleLanguage':
confidence = 0.6 # Subtitle prefix found with language
else:
# Note: we could either be really confident that we found a
# language or assume that full language names are too
# common words and lower their confidence accordingly
confidence = 0.3 # going with the low-confidence route here
return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end))
return None
def guess_language(text): # pragma: no cover
"""Guess the language in which a body of text is written.
This uses the external guess-language python module, and will fail and return
Language(Undetermined) if it is not installed.
"""
try:
from guess_language import guessLanguage
return babelfish.Language.fromguessit(guessLanguage(text))
except ImportError:
log.error('Cannot detect the language of the given text body, missing dependency: guess-language')
log.error('Please install it from PyPI, by doing eg: pip install guess-language')
return UNDETERMINED

247
lib/guessit/matcher.py Normal file
View file

@ -0,0 +1,247 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, \
unicode_literals
import logging
from guessit import PY3, u
from guessit.transfo import TransformerException
from guessit.matchtree import MatchTree
from guessit.textutils import normalize_unicode, clean_string
from guessit.guess import Guess
import inspect
log = logging.getLogger(__name__)
class IterativeMatcher(object):
"""An iterative matcher tries to match different patterns that appear
in the filename.
The ``filetype`` argument indicates which type of file you want to match.
If it is undefined, the matcher will try to see whether it can guess
that the file corresponds to an episode, or otherwise will assume it is
a movie.
The recognized ``filetype`` values are:
``['subtitle', 'info', 'movie', 'moviesubtitle', 'movieinfo', 'episode',
'episodesubtitle', 'episodeinfo']``
``options`` is a dict of options values to be passed to the transformations used
by the matcher.
The IterativeMatcher works mainly in 2 steps:
First, it splits the filename into a match_tree, which is a tree of groups
which have a semantic meaning, such as episode number, movie title,
etc...
The match_tree created looks like the following::
0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
__________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc
[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv
The first 3 lines indicates the group index in which a char in the
filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and
it corresponds to a video codec, denoted by the letter ``v`` in the 4th line.
(for more info, see guess.matchtree.to_string)
Second, it tries to merge all this information into a single object
containing all the found properties, and does some (basic) conflict
resolution when they arise.
"""
def __init__(self, filename, options=None, **kwargs):
options = dict(options or {})
for k, v in kwargs.items():
if k not in options or not options[k]:
options[k] = v # options dict has priority over keyword arguments
self._validate_options(options)
if not PY3 and not isinstance(filename, unicode):
log.warning('Given filename to matcher is not unicode...')
filename = filename.decode('utf-8')
filename = normalize_unicode(filename)
self.match_tree = MatchTree(filename)
self.options = options
self._transfo_calls = []
# sanity check: make sure we don't process a (mostly) empty string
if clean_string(filename) == '':
return
from guessit.plugins import transformers
try:
mtree = self.match_tree
if 'type' in self.options:
mtree.guess.set('type', self.options['type'], confidence=0.0)
# Process
for transformer in transformers.all_transformers():
self._process(transformer, False)
# Post-process
for transformer in transformers.all_transformers():
self._process(transformer, True)
log.debug('Found match tree:\n%s' % u(mtree))
except TransformerException as e:
log.debug('An error has occured in Transformer %s: %s' % (e.transformer, e))
def _process(self, transformer, post=False):
if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options):
if post:
transformer.post_process(self.match_tree, self.options)
else:
transformer.process(self.match_tree, self.options)
self._transfo_calls.append(transformer)
@property
def second_pass_options(self):
second_pass_options = {}
for transformer in self._transfo_calls:
if hasattr(transformer, 'second_pass_options'):
transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options)
if transformer_second_pass_options:
second_pass_options.update(transformer_second_pass_options)
return second_pass_options
def _validate_options(self, options):
valid_filetypes = ('subtitle', 'info', 'video',
'movie', 'moviesubtitle', 'movieinfo',
'episode', 'episodesubtitle', 'episodeinfo')
type = options.get('type')
if type and type not in valid_filetypes:
raise ValueError("filetype needs to be one of %s" % valid_filetypes)
def matched(self):
return self.match_tree.matched()
def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None):
# automatically retrieve the log object from the caller frame
if not logger:
caller_frame = inspect.stack()[1][0]
logger = caller_frame.f_locals['self'].log
guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
return found_guess(node, guess, update_guess=update_guess, logger=logger)
def found_guess(node, guess, update_guess=True, logger=None):
if node.guess:
if update_guess:
node.guess.update_highest_confidence(guess)
else:
child = node.add_child(guess.metadata().span)
child.guess = guess
else:
node.guess = guess
log_found_guess(guess, logger)
return node.guess
def log_found_guess(guess, logger=None):
for k, v in guess.items():
(logger or log).debug('Property found: %s=%s (confidence=%.2f)' % (k, v, guess.confidence(k)))
class GuessFinder(object):
def __init__(self, guess_func, confidence=None, logger=None, options=None):
self.guess_func = guess_func
self.confidence = confidence
self.logger = logger or log
self.options = options
def process_nodes(self, nodes):
for node in nodes:
self.process_node(node)
def process_node(self, node, iterative=True, partial_span=None):
value = None
if partial_span:
value = node.value[partial_span[0]:partial_span[1]]
else:
value = node.value
string = ' %s ' % value # add sentinels
if not self.options:
matcher_result = self.guess_func(string, node)
else:
matcher_result = self.guess_func(string, node, self.options)
if matcher_result:
if not isinstance(matcher_result, Guess):
result, span = matcher_result
else:
result, span = matcher_result, matcher_result.metadata().span
if result:
# readjust span to compensate for sentinels
span = (span[0] - 1, span[1] - 1)
# readjust span to compensate for partial_span
if partial_span:
span = (span[0] + partial_span[0], span[1] + partial_span[0])
partition_spans = None
if self.options and 'skip_nodes' in self.options:
skip_nodes = self.options.get('skip_nodes')
for skip_node in skip_nodes:
if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
skip_node.span == span:
partition_spans = node.get_partition_spans(skip_node.span)
partition_spans.remove(skip_node.span)
break
if not partition_spans:
# restore sentinels compensation
guess = None
if isinstance(result, Guess):
guess = result
else:
guess = Guess(result, confidence=self.confidence, input=string, span=span)
if not iterative:
node.guess.update(guess)
else:
absolute_span = (span[0] + node.offset, span[1] + node.offset)
node.partition(span)
found_child = None
for child in node.children:
if child.span == absolute_span:
found_guess(child, guess, self.logger)
found_child = child
break
for child in node.children:
if not child is found_child:
self.process_node(child)
else:
for partition_span in partition_spans:
self.process_node(node, partial_span=partition_span)

439
lib/guessit/matchtree.py Normal file
View file

@ -0,0 +1,439 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import guessit # @UnusedImport needed for doctests
from guessit import UnicodeMixin, base_text_type
from guessit.textutils import clean_string, str_fill
from guessit.patterns import group_delimiters
from guessit.guess import (merge_similar_guesses, merge_all,
choose_int, choose_string, Guess)
import copy
import logging
log = logging.getLogger(__name__)
class BaseMatchTree(UnicodeMixin):
"""A BaseMatchTree is a tree covering the filename, where each
node represents a substring in the filename and can have a ``Guess``
associated with it that contains the information that has been guessed
in this node. Nodes can be further split into subnodes until a proper
split has been found.
Each node has the following attributes:
- string = the original string of which this node represents a region
- span = a pair of (begin, end) indices delimiting the substring
- parent = parent node
- children = list of children nodes
- guess = Guess()
BaseMatchTrees are displayed in the following way:
>>> path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv'
>>> print(guessit.IterativeMatcher(path).match_tree)
000000 1111111111111111 2222222222222222222222222222222222222222222 333
000000 0000000000111111 0000000000111111222222222222222222222222222 000
011112 011112000011111222222222222222222 000
011112222222222222
0000011112222
01112 0111
Movies/__________(____)/Dark.City.(____).DC._____.____.___.____-___.___
tttttttttt yyyy yyyy fffff ssss aaa vvvv rrr ccc
Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv
The last line contains the filename, which you can use a reference.
The previous line contains the type of property that has been found.
The line before that contains the filename, where all the found groups
have been blanked. Basically, what is left on this line are the leftover
groups which could not be identified.
The lines before that indicate the indices of the groups in the tree.
For instance, the part of the filename 'BDRip' is the leaf with index
``(2, 2, 1)`` (read from top to bottom), and its meaning is 'format'
(as shown by the ``f``'s on the last-but-one line).
"""
def __init__(self, string='', span=None, parent=None):
self.string = string
self.span = span or (0, len(string))
self.parent = parent
self.children = []
self.guess = Guess()
@property
def value(self):
"""Return the substring that this node matches."""
return self.string[self.span[0]:self.span[1]]
@property
def clean_value(self):
"""Return a cleaned value of the matched substring, with better
presentation formatting (punctuation marks removed, duplicate
spaces, ...)"""
return clean_string(self.value)
@property
def offset(self):
return self.span[0]
@property
def info(self):
"""Return a dict containing all the info guessed by this node,
subnodes included."""
result = dict(self.guess)
for c in self.children:
result.update(c.info)
return result
@property
def root(self):
"""Return the root node of the tree."""
if not self.parent:
return self
return self.parent.root
@property
def depth(self):
"""Return the depth of this node."""
if self.is_leaf():
return 0
return 1 + max(c.depth for c in self.children)
def is_leaf(self):
"""Return whether this node is a leaf or not."""
return self.children == []
def add_child(self, span):
"""Add a new child node to this node with the given span."""
child = MatchTree(self.string, span=span, parent=self)
self.children.append(child)
return child
def get_partition_spans(self, indices):
"""Return the list of absolute spans for the regions of the original
string defined by splitting this node at the given indices (relative
to this node)"""
indices = sorted(indices)
if indices[0] != 0:
indices.insert(0, 0)
if indices[-1] != len(self.value):
indices.append(len(self.value))
spans = []
for start, end in zip(indices[:-1], indices[1:]):
spans.append((self.offset + start,
self.offset + end))
return spans
def partition(self, indices):
"""Partition this node by splitting it at the given indices,
relative to this node."""
for partition_span in self.get_partition_spans(indices):
self.add_child(span=partition_span)
def split_on_components(self, components):
offset = 0
for c in components:
start = self.value.find(c, offset)
end = start + len(c)
self.add_child(span=(self.offset + start,
self.offset + end))
offset = end
def nodes_at_depth(self, depth):
"""Return all the nodes at a given depth in the tree"""
if depth == 0:
yield self
for child in self.children:
for node in child.nodes_at_depth(depth - 1):
yield node
@property
def node_idx(self):
"""Return this node's index in the tree, as a tuple.
If this node is the root of the tree, then return ()."""
if self.parent is None:
return ()
return self.parent.node_idx + (self.parent.children.index(self),)
def node_at(self, idx):
"""Return the node at the given index in the subtree rooted at
this node."""
if not idx:
return self
try:
return self.children[idx[0]].node_at(idx[1:])
except IndexError:
raise ValueError('Non-existent node index: %s' % (idx,))
def nodes(self):
"""Return all the nodes and subnodes in this tree."""
yield self
for child in self.children:
for node in child.nodes():
yield node
def _leaves(self):
"""Return a generator over all the nodes that are leaves."""
if self.is_leaf():
yield self
else:
for child in self.children:
# pylint: disable=W0212
for leaf in child._leaves():
yield leaf
def group_node(self):
return self._other_group_node(0)
def previous_group_node(self):
return self._other_group_node(-1)
def next_group_node(self):
return self._other_group_node(+1)
def _other_group_node(self, offset):
if len(self.node_idx) > 1:
group_idx = self.node_idx[:2]
if group_idx[1] + offset >= 0:
other_group_idx = (group_idx[0], group_idx[1] + offset)
try:
other_group_node = self.root.node_at(other_group_idx)
return other_group_node
except ValueError:
pass
return None
def leaves(self):
"""Return a list of all the nodes that are leaves."""
return list(self._leaves())
def previous_leaf(self, leaf):
"""Return previous leaf for this node"""
return self._other_leaf(leaf, -1)
def next_leaf(self, leaf):
"""Return next leaf for this node"""
return self._other_leaf(leaf, +1)
def _other_leaf(self, leaf, offset):
leaves = self.leaves()
index = leaves.index(leaf) + offset
if index > 0 and index < len(leaves):
return leaves[index]
return None
def previous_leaves(self, leaf):
"""Return previous leaves for this node"""
leaves = self.leaves()
index = leaves.index(leaf)
if index > 0 and index < len(leaves):
previous_leaves = leaves[:index]
previous_leaves.reverse()
return previous_leaves
return []
def next_leaves(self, leaf):
"""Return next leaves for this node"""
leaves = self.leaves()
index = leaves.index(leaf)
if index > 0 and index < len(leaves):
return leaves[index + 1:len(leaves)]
return []
def to_string(self):
"""Return a readable string representation of this tree.
The result is a multi-line string, where the lines are:
- line 1 -> N-2: each line contains the nodes at the given depth in the tree
- line N-2: original string where all the found groups have been blanked
- line N-1: type of property that has been found
- line N: the original string, which you can use a reference.
"""
empty_line = ' ' * len(self.string)
def to_hex(x):
if isinstance(x, int):
return str(x) if x < 10 else chr(55 + x)
return x
def meaning(result):
mmap = {'episodeNumber': 'E',
'season': 'S',
'extension': 'e',
'format': 'f',
'language': 'l',
'country': 'C',
'videoCodec': 'v',
'videoProfile': 'v',
'audioCodec': 'a',
'audioProfile': 'a',
'audioChannels': 'a',
'website': 'w',
'container': 'c',
'series': 'T',
'title': 't',
'date': 'd',
'year': 'y',
'releaseGroup': 'r',
'screenSize': 's',
'other': 'o'
}
if result is None:
return ' '
for prop, l in mmap.items():
if prop in result:
return l
return 'x'
lines = [empty_line] * (self.depth + 2) # +2: remaining, meaning
lines[-2] = self.string
for node in self.nodes():
if node == self:
continue
idx = node.node_idx
depth = len(idx) - 1
if idx:
lines[depth] = str_fill(lines[depth], node.span,
to_hex(idx[-1]))
if node.guess:
lines[-2] = str_fill(lines[-2], node.span, '_')
lines[-1] = str_fill(lines[-1], node.span, meaning(node.guess))
lines.append(self.string)
return '\n'.join(l.rstrip() for l in lines)
def __unicode__(self):
return self.to_string()
def __repr__(self):
return '<MatchTree: root=%s>' % self.value
class MatchTree(BaseMatchTree):
"""The MatchTree contains a few "utility" methods which are not necessary
for the BaseMatchTree, but add a lot of convenience for writing
higher-level rules.
"""
_matched_result = None
def _unidentified_leaves(self,
valid=lambda leaf: len(leaf.clean_value) >= 2):
for leaf in self._leaves():
if not leaf.guess and valid(leaf):
yield leaf
def unidentified_leaves(self,
valid=lambda leaf: len(leaf.clean_value) >= 2):
"""Return a list of leaves that are not empty."""
return list(self._unidentified_leaves(valid))
def _leaves_containing(self, property_name):
if isinstance(property_name, base_text_type):
property_name = [property_name]
for leaf in self._leaves():
for prop in property_name:
if prop in leaf.guess:
yield leaf
break
def leaves_containing(self, property_name):
"""Return a list of leaves that guessed the given property."""
return list(self._leaves_containing(property_name))
def first_leaf_containing(self, property_name):
"""Return the first leaf containing the given property."""
try:
return next(self._leaves_containing(property_name))
except StopIteration:
return None
def _previous_unidentified_leaves(self, node):
node_idx = node.node_idx
for leaf in self._unidentified_leaves():
if leaf.node_idx < node_idx:
yield leaf
def previous_unidentified_leaves(self, node):
"""Return a list of non-empty leaves that are before the given
node (in the string)."""
return list(self._previous_unidentified_leaves(node))
def _previous_leaves_containing(self, node, property_name):
node_idx = node.node_idx
for leaf in self._leaves_containing(property_name):
if leaf.node_idx < node_idx:
yield leaf
def previous_leaves_containing(self, node, property_name):
"""Return a list of leaves containing the given property that are
before the given node (in the string)."""
return list(self._previous_leaves_containing(node, property_name))
def is_explicit(self):
"""Return whether the group was explicitly enclosed by
parentheses/square brackets/etc."""
return (self.value[0] + self.value[-1]) in group_delimiters
def matched(self):
"""Return a single guess that contains all the info found in the
nodes of this tree, trying to merge properties as good as possible.
"""
if not self._matched_result:
# we need to make a copy here, as the merge functions work in place and
# calling them on the match tree would modify it
parts = [copy.copy(node.guess) for node in self.nodes() if node.guess]
# 1- try to merge similar information together and give it a higher
# confidence
for int_part in ('year', 'season', 'episodeNumber'):
merge_similar_guesses(parts, int_part, choose_int)
for string_part in ('title', 'series', 'container', 'format',
'releaseGroup', 'website', 'audioCodec',
'videoCodec', 'screenSize', 'episodeFormat',
'audioChannels', 'idNumber'):
merge_similar_guesses(parts, string_part, choose_string)
# 2- merge the rest, potentially discarding information not properly
# merged before
result = merge_all(parts,
append=['language', 'subtitleLanguage', 'other', 'special'])
log.debug('Final result: ' + result.nice_string())
self._matched_result = result
return self._matched_result

25
lib/guessit/options.py Normal file
View file

@ -0,0 +1,25 @@
from optparse import OptionParser
option_parser = OptionParser(usage='usage: %prog [options] file1 [file2...]')
option_parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
help='display debug output')
option_parser.add_option('-p', '--properties', dest='properties', action='store_true', default=False,
help='Display properties that can be guessed.')
option_parser.add_option('-l', '--values', dest='values', action='store_true', default=False,
help='Display property values that can be guessed.')
option_parser.add_option('-s', '--transformers', dest='transformers', action='store_true', default=False,
help='Display transformers that can be used.')
option_parser.add_option('-i', '--info', dest='info', default='filename',
help='the desired information type: filename, hash_mpc or a hash from python\'s '
'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of '
'them, comma-separated')
option_parser.add_option('-n', '--name-only', dest='name_only', action='store_true', default=False,
help='Parse files as name only. Disable folder parsing, extension parsing, and file content analysis.')
option_parser.add_option('-t', '--type', dest='type', default=None,
help='the suggested file type: movie, episode. If undefined, type will be guessed.')
option_parser.add_option('-a', '--advanced', dest='advanced', action='store_true', default=False,
help='display advanced information for filename guesses, as json output')
option_parser.add_option('-y', '--yaml', dest='yaml', action='store_true', default=False,
help='display information for filename guesses as yaml output (like unit-test)')
option_parser.add_option('-d', '--demo', action='store_true', dest='demo', default=False,
help='run a few builtin tests instead of analyzing a file')

View file

@ -0,0 +1,77 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import re
from guessit import base_text_type
group_delimiters = ['()', '[]', '{}']
# separator character regexp
sep = r'[][,)(}:{+ /\._-]' # regexp art, hehe :D
_dash = '-'
_psep = '[\W_]?'
def build_or_pattern(patterns):
"""Build a or pattern string from a list of possible patterns
"""
or_pattern = ''
for pattern in patterns:
if not or_pattern:
or_pattern += '(?:'
else:
or_pattern += '|'
or_pattern += ('(?:%s)' % pattern)
or_pattern += ')'
return or_pattern
def compile_pattern(pattern, enhance=True):
"""Compile and enhance a pattern
:param pattern: Pattern to compile (regexp).
:type pattern: string
:param pattern: Enhance pattern before compiling.
:type pattern: string
:return: The compiled pattern
:rtype: regular expression object
"""
return re.compile(enhance_pattern(pattern) if enhance else pattern, re.IGNORECASE)
def enhance_pattern(pattern):
"""Enhance pattern to match more equivalent values.
'-' are replaced by '[\W_]?', which matches more types of separators (or none)
:param pattern: Pattern to enhance (regexp).
:type pattern: string
:return: The enhanced pattern
:rtype: string
"""
return pattern.replace(_dash, _psep)

View file

@ -0,0 +1,32 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
# Copyright (c) 2011 Ricard Marxer <ricardmp@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
subtitle_exts = ['srt', 'idx', 'sub', 'ssa']
info_exts = ['nfo']
video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2',
'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm',
'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv',
'iso']

View file

@ -0,0 +1,150 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import re
digital_numeral = '\d{1,3}'
roman_numeral = "(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})"
english_word_numeral_list = [
'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty'
]
french_word_numeral_list = [
'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt'
]
french_alt_word_numeral_list = [
'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt'
]
def __build_word_numeral(*args, **kwargs):
re = None
for word_list in args:
for word in word_list:
if not re:
re = '(?:(?=\w+)'
else:
re += '|'
re += word
re += ')'
return re
word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list)
numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')'
__romanNumeralMap = (
('M', 1000),
('CM', 900),
('D', 500),
('CD', 400),
('C', 100),
('XC', 90),
('L', 50),
('XL', 40),
('X', 10),
('IX', 9),
('V', 5),
('IV', 4),
('I', 1)
)
__romanNumeralPattern = re.compile('^' + roman_numeral + '$')
def __parse_roman(value):
"""convert Roman numeral to integer"""
if not __romanNumeralPattern.search(value):
raise ValueError('Invalid Roman numeral: %s' % value)
result = 0
index = 0
for numeral, integer in __romanNumeralMap:
while value[index:index + len(numeral)] == numeral:
result += integer
index += len(numeral)
return result
def __parse_word(value):
"""Convert Word numeral to integer"""
for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]:
try:
return word_list.index(value)
except ValueError:
pass
raise ValueError
_clean_re = re.compile('[^\d]*(\d+)[^\d]*')
def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True):
"""Parse a numeric value into integer.
input can be an integer as a string, a roman numeral or a word
:param value: Value to parse. Can be an integer, roman numeral or word.
:type value: string
:return: Numeric value, or None if value can't be parsed
:rtype: int
"""
if int_enabled:
try:
if clean:
match = _clean_re.match(value)
if match:
clean_value = match.group(1)
return int(clean_value)
return int(value)
except ValueError:
pass
if roman_enabled:
try:
if clean:
for word in value.split():
try:
return __parse_roman(word)
except ValueError:
pass
return __parse_roman(value)
except ValueError:
pass
if word_enabled:
try:
if clean:
for word in value.split():
try:
return __parse_word(word)
except ValueError:
pass
return __parse_word(value)
except ValueError:
pass
raise ValueError('Invalid numeral: ' + value)

View file

@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals

View file

@ -0,0 +1,186 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from stevedore import ExtensionManager
from pkg_resources import EntryPoint
from stevedore.extension import Extension
from logging import getLogger
log = getLogger(__name__)
class Transformer(object): # pragma: no cover
def __init__(self, priority=0):
self.priority = priority
self.log = getLogger(self.name)
@property
def name(self):
return self.__class__.__name__
def supported_properties(self):
return {}
def second_pass_options(self, mtree, options=None):
return None
def should_process(self, mtree, options=None):
return True
def process(self, mtree, options=None):
pass
def post_process(self, mtree, options=None):
pass
def rate_quality(self, guess, *props):
return 0
class CustomTransformerExtensionManager(ExtensionManager):
def __init__(self, namespace='guessit.transformer', invoke_on_load=True,
invoke_args=(), invoke_kwds={}, propagate_map_exceptions=True, on_load_failure_callback=None,
verify_requirements=False):
super(CustomTransformerExtensionManager, self).__init__(namespace=namespace,
invoke_on_load=invoke_on_load,
invoke_args=invoke_args,
invoke_kwds=invoke_kwds,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements)
def order_extensions(self, extensions):
"""Order the loaded transformers
It should follow those rules
- website before language (eg: tvu.org.ru vs russian)
- language before episodes_rexps
- properties before language (eg: he-aac vs hebrew)
- release_group before properties (eg: XviD-?? vs xvid)
"""
extensions.sort(key=lambda ext: -ext.obj.priority)
return extensions
def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds, verify_requirements):
if not ep.dist:
plugin = ep.load(require=False)
else:
plugin = ep.load(require=verify_requirements)
if invoke_on_load:
obj = plugin(*invoke_args, **invoke_kwds)
else:
obj = None
return Extension(ep.name, ep, plugin, obj)
def _load_plugins(self, invoke_on_load, invoke_args, invoke_kwds, verify_requirements):
return self.order_extensions(super(CustomTransformerExtensionManager, self)._load_plugins(invoke_on_load, invoke_args, invoke_kwds, verify_requirements))
def objects(self):
return self.map(self._get_obj)
def _get_obj(self, ext):
return ext.obj
def object(self, name):
try:
return self[name].obj
except KeyError:
return None
def register_module(self, name, module_name):
ep = EntryPoint(name, module_name)
loaded = self._load_one_plugin(ep, invoke_on_load=True, invoke_args=(), invoke_kwds={})
if loaded:
self.extensions.append(loaded)
self.extensions = self.order_extensions(self.extensions)
self._extensions_by_name = None
class DefaultTransformerExtensionManager(CustomTransformerExtensionManager):
@property
def _internal_entry_points(self):
return ['split_path_components = guessit.transfo.split_path_components:SplitPathComponents',
'guess_filetype = guessit.transfo.guess_filetype:GuessFiletype',
'split_explicit_groups = guessit.transfo.split_explicit_groups:SplitExplicitGroups',
'guess_date = guessit.transfo.guess_date:GuessDate',
'guess_website = guessit.transfo.guess_website:GuessWebsite',
'guess_release_group = guessit.transfo.guess_release_group:GuessReleaseGroup',
'guess_properties = guessit.transfo.guess_properties:GuessProperties',
'guess_language = guessit.transfo.guess_language:GuessLanguage',
'guess_video_rexps = guessit.transfo.guess_video_rexps:GuessVideoRexps',
'guess_episodes_rexps = guessit.transfo.guess_episodes_rexps:GuessEpisodesRexps',
'guess_weak_episodes_rexps = guessit.transfo.guess_weak_episodes_rexps:GuessWeakEpisodesRexps',
'guess_bonus_features = guessit.transfo.guess_bonus_features:GuessBonusFeatures',
'guess_year = guessit.transfo.guess_year:GuessYear',
'guess_country = guessit.transfo.guess_country:GuessCountry',
'guess_idnumber = guessit.transfo.guess_idnumber:GuessIdnumber',
'split_on_dash = guessit.transfo.split_on_dash:SplitOnDash',
'guess_episode_info_from_position = guessit.transfo.guess_episode_info_from_position:GuessEpisodeInfoFromPosition',
'guess_movie_title_from_position = guessit.transfo.guess_movie_title_from_position:GuessMovieTitleFromPosition',
'guess_episode_special = guessit.transfo.guess_episode_special:GuessEpisodeSpecial']
def _find_entry_points(self, namespace):
entry_points = {}
# Internal entry points
if namespace == self.namespace:
for internal_entry_point_str in self._internal_entry_points:
internal_entry_point = EntryPoint.parse(internal_entry_point_str)
entry_points[internal_entry_point.name] = internal_entry_point
# Package entry points
setuptools_entrypoints = super(DefaultTransformerExtensionManager, self)._find_entry_points(namespace)
for setuptools_entrypoint in setuptools_entrypoints:
entry_points[setuptools_entrypoint.name] = setuptools_entrypoint
return list(entry_points.values())
_extensions = None
def all_transformers():
return _extensions.objects()
def get_transformer(name):
return _extensions.object(name)
def add_transformer(name, module_name):
_extensions.register_module(name, module_name)
def reload(custom=False):
"""
Reload extension manager with default or custom one.
:param custom: if True, custom manager will be used, else default one.
Default manager will load default extensions from guessit and setuptools packaging extensions
Custom manager will not load default extensions from guessit, using only setuptools packaging extensions.
:type custom: boolean
"""
global _extensions
if custom:
_extensions = CustomTransformerExtensionManager()
else:
_extensions = DefaultTransformerExtensionManager()
reload()

65
lib/guessit/quality.py Normal file
View file

@ -0,0 +1,65 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import all_transformers
def best_quality_properties(props, *guesses):
"""Retrieve the best quality guess, based on given properties
:param props: Properties to include in the rating
:type props: list of strings
:param guesses: Guesses to rate
:type guesses: :class:`guessit.guess.Guess`
:return: Best quality guess from all passed guesses
:rtype: :class:`guessit.guess.Guess`
"""
best_guess = None
best_rate = None
for guess in guesses:
for transformer in all_transformers():
rate = transformer.rate_quality(guess, *props)
if best_rate is None or best_rate < rate:
best_rate = rate
best_guess = guess
return best_guess
def best_quality(*guesses):
"""Retrieve the best quality guess.
:param guesses: Guesses to rate
:type guesses: :class:`guessit.guess.Guess`
:return: Best quality guess from all passed guesses
:rtype: :class:`guessit.guess.Guess`
"""
best_guess = None
best_rate = None
for guess in guesses:
for transformer in all_transformers():
rate = transformer.rate_quality(guess)
if best_rate is None or best_rate < rate:
best_rate = rate
best_guess = guess
return best_guess

89
lib/guessit/slogging.py Normal file
View file

@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import sys
import os
GREEN_FONT = "\x1B[0;32m"
YELLOW_FONT = "\x1B[0;33m"
BLUE_FONT = "\x1B[0;34m"
RED_FONT = "\x1B[0;31m"
RESET_FONT = "\x1B[0m"
def setupLogging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): # pragma: no cover
"""Set up a nice colored logger as the main application logger."""
class SimpleFormatter(logging.Formatter):
def __init__(self, with_time, with_thread):
self.fmt = (('%(asctime)s ' if with_time else '') +
'%(levelname)-8s ' +
'[%(name)s:%(funcName)s' +
(':%(lineno)s' if with_lineno else '') + ']' +
('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt)
class ColoredFormatter(logging.Formatter):
def __init__(self, with_time, with_thread):
self.fmt = (('%(asctime)s ' if with_time else '') +
'-CC-%(levelname)-8s ' +
BLUE_FONT + '[%(name)s:%(funcName)s' +
(':%(lineno)s' if with_lineno else '') + ']' +
RESET_FONT + ('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt)
def format(self, record):
modpath = record.name.split('.')
record.mname = modpath[0]
record.mmodule = '.'.join(modpath[1:])
result = logging.Formatter.format(self, record)
if record.levelno == logging.DEBUG:
color = BLUE_FONT
elif record.levelno == logging.INFO:
color = GREEN_FONT
elif record.levelno == logging.WARNING:
color = YELLOW_FONT
else:
color = RED_FONT
result = result.replace('-CC-', color)
return result
if filename is not None:
# make sure we can write to our log file
logdir = os.path.dirname(filename)
if not os.path.exists(logdir):
os.makedirs(logdir)
ch = logging.FileHandler(filename, mode='w')
ch.setFormatter(SimpleFormatter(with_time, with_thread))
else:
ch = logging.StreamHandler()
if colored and sys.platform != 'win32':
ch.setFormatter(ColoredFormatter(with_time, with_thread))
else:
ch.setFormatter(SimpleFormatter(with_time, with_thread))
logging.getLogger().addHandler(ch)

352
lib/guessit/textutils.py Normal file
View file

@ -0,0 +1,352 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import s
from guessit.patterns import sep
import functools
import unicodedata
import re
# string-related functions
def normalize_unicode(s):
return unicodedata.normalize('NFC', s)
def strip_brackets(s):
if not s:
return s
if ((s[0] == '[' and s[-1] == ']') or
(s[0] == '(' and s[-1] == ')') or
(s[0] == '{' and s[-1] == '}')):
return s[1:-1]
return s
_dotted_rexp = re.compile(r'(?:\W|^)(([A-Za-z]\.){2,}[A-Za-z]\.?)')
def clean_string(st):
for c in sep:
# do not remove certain chars
if c in ['-', ',']:
continue
if c == '.':
# we should not remove the dots for acronyms and such
dotted = _dotted_rexp.search(st)
if dotted:
s = dotted.group(1)
exclude_begin, exclude_end = dotted.span(1)
st = (st[:exclude_begin].replace(c, ' ') +
st[exclude_begin:exclude_end] +
st[exclude_end:].replace(c, ' '))
continue
st = st.replace(c, ' ')
parts = st.split()
result = ' '.join(p for p in parts if p != '')
# now also remove dashes on the outer part of the string
while result and result[0] in '-':
result = result[1:]
while result and result[-1] in '-':
result = result[:-1]
return result
_words_rexp = re.compile('\w+', re.UNICODE)
def find_words(s):
return _words_rexp.findall(s.replace('_', ' '))
def reorder_title(title, articles=('the',), separators=(',', ', ')):
ltitle = title.lower()
for article in articles:
for separator in separators:
suffix = separator + article
if ltitle[-len(suffix):] == suffix:
return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)]
return title
def str_replace(string, pos, c):
return string[:pos] + c + string[pos + 1:]
def str_fill(string, region, c):
start, end = region
return string[:start] + c * (end - start) + string[end:]
def levenshtein(a, b):
if not a:
return len(b)
if not b:
return len(a)
m = len(a)
n = len(b)
d = []
for i in range(m + 1):
d.append([0] * (n + 1))
for i in range(m + 1):
d[i][0] = i
for j in range(n + 1):
d[0][j] = j
for i in range(1, m + 1):
for j in range(1, n + 1):
if a[i - 1] == b[j - 1]:
cost = 0
else:
cost = 1
d[i][j] = min(d[i - 1][j] + 1, # deletion
d[i][j - 1] + 1, # insertion
d[i - 1][j - 1] + cost # substitution
)
return d[m][n]
# group-related functions
def find_first_level_groups_span(string, enclosing):
"""Return a list of pairs (start, end) for the groups delimited by the given
enclosing characters.
This does not return nested groups, ie: '(ab(c)(d))' will return a single group
containing the whole string.
>>> find_first_level_groups_span('abcd', '()')
[]
>>> find_first_level_groups_span('abc(de)fgh', '()')
[(3, 7)]
>>> find_first_level_groups_span('(ab(c)(d))', '()')
[(0, 10)]
>>> find_first_level_groups_span('ab[c]de[f]gh(i)', '[]')
[(2, 5), (7, 10)]
"""
opening, closing = enclosing
depth = [] # depth is a stack of indices where we opened a group
result = []
for i, c, in enumerate(string):
if c == opening:
depth.append(i)
elif c == closing:
try:
start = depth.pop()
end = i
if not depth:
# we emptied our stack, so we have a 1st level group
result.append((start, end + 1))
except IndexError:
# we closed a group which was not opened before
pass
return result
def split_on_groups(string, groups):
"""Split the given string using the different known groups for boundaries.
>>> s(split_on_groups('0123456789', [ (2, 4) ]))
['01', '23', '456789']
>>> s(split_on_groups('0123456789', [ (2, 4), (4, 6) ]))
['01', '23', '45', '6789']
>>> s(split_on_groups('0123456789', [ (5, 7), (2, 4) ]))
['01', '23', '4', '56', '789']
"""
if not groups:
return [string]
boundaries = sorted(set(functools.reduce(lambda l, x: l + list(x), groups, [])))
if boundaries[0] != 0:
boundaries.insert(0, 0)
if boundaries[-1] != len(string):
boundaries.append(len(string))
groups = [string[start:end] for start, end in zip(boundaries[:-1],
boundaries[1:])]
return [g for g in groups if g] # return only non-empty groups
def find_first_level_groups(string, enclosing, blank_sep=None):
"""Return a list of groups that could be split because of explicit grouping.
The groups are delimited by the given enclosing characters.
You can also specify if you want to blank the separator chars in the returned
list of groups by specifying a character for it. None means it won't be replaced.
This does not return nested groups, ie: '(ab(c)(d))' will return a single group
containing the whole string.
>>> s(find_first_level_groups('', '()'))
['']
>>> s(find_first_level_groups('abcd', '()'))
['abcd']
>>> s(find_first_level_groups('abc(de)fgh', '()'))
['abc', '(de)', 'fgh']
>>> s(find_first_level_groups('(ab(c)(d))', '()', blank_sep = '_'))
['_ab(c)(d)_']
>>> s(find_first_level_groups('ab[c]de[f]gh(i)', '[]'))
['ab', '[c]', 'de', '[f]', 'gh(i)']
>>> s(find_first_level_groups('()[]()', '()', blank_sep = '-'))
['--', '[]', '--']
"""
groups = find_first_level_groups_span(string, enclosing)
if blank_sep:
for start, end in groups:
string = str_replace(string, start, blank_sep)
string = str_replace(string, end - 1, blank_sep)
return split_on_groups(string, groups)
_camel_word2_set = set(('is', 'to',))
_camel_word3_set = set(('the',))
def _camel_split_and_lower(string, i):
"""Retrieves a tuple (need_split, need_lower)
need_split is True if this char is a first letter in a camelCasedString.
need_lower is True if this char should be lowercased.
"""
def islower(c):
return c.isalpha() and not c.isupper()
previous_char2 = string[i - 2] if i > 1 else None
previous_char = string[i - 1] if i > 0 else None
char = string[i]
next_char = string[i + 1] if i + 1 < len(string) else None
next_char2 = string[i + 2] if i + 2 < len(string) else None
char_upper = char.isupper()
char_lower = islower(char)
# previous_char2_lower = islower(previous_char2) if previous_char2 else False
previous_char2_upper = previous_char2.isupper() if previous_char2 else False
previous_char_lower = islower(previous_char) if previous_char else False
previous_char_upper = previous_char.isupper() if previous_char else False
next_char_upper = next_char.isupper() if next_char else False
next_char_lower = islower(next_char) if next_char else False
next_char2_upper = next_char2.isupper() if next_char2 else False
# next_char2_lower = islower(next_char2) if next_char2 else False
mixedcase_word = (previous_char_upper and char_lower and next_char_upper) or \
(previous_char_lower and char_upper and next_char_lower and next_char2_upper) or \
(previous_char2_upper and previous_char_lower and char_upper)
if mixedcase_word:
word2 = (char + next_char).lower() if next_char else None
word3 = (char + next_char + next_char2).lower() if next_char and next_char2 else None
word2b = (previous_char2 + previous_char).lower() if previous_char2 and previous_char else None
if word2 in _camel_word2_set or word2b in _camel_word2_set or word3 in _camel_word3_set:
mixedcase_word = False
uppercase_word = previous_char_upper and char_upper and next_char_upper or (char_upper and next_char_upper and next_char2_upper)
need_split = char_upper and previous_char_lower and not mixedcase_word
if not need_split:
previous_char_upper = string[i - 1].isupper() if i > 0 else False
next_char_lower = (string[i + 1].isalpha() and not string[i + 1].isupper()) if i + 1 < len(string) else False
need_split = char_upper and previous_char_upper and next_char_lower
uppercase_word = previous_char_upper and not next_char_lower
need_lower = not uppercase_word and not mixedcase_word and need_split
return (need_split, need_lower)
def is_camel(string):
"""
>>> is_camel('dogEATDog')
True
>>> is_camel('DeathToCamelCase')
True
>>> is_camel('death_to_camel_case')
False
>>> is_camel('TheBest')
True
>>> is_camel('The Best')
False
"""
for i in range(0, len(string)):
need_split, _ = _camel_split_and_lower(string, i)
if need_split:
return True
return False
def from_camel(string):
"""
>>> from_camel('dogEATDog') == 'dog EAT dog'
True
>>> from_camel('DeathToCamelCase') == 'Death to camel case'
True
>>> from_camel('TheBest') == 'The best'
True
>>> from_camel('MiXedCaSe is not camelCase') == 'MiXedCaSe is not camel case'
True
"""
if not string:
return string
pieces = []
for i in range(0, len(string)):
char = string[i]
need_split, need_lower = _camel_split_and_lower(string, i)
if need_split:
pieces.append(' ')
if need_lower:
pieces.append(char.lower())
else:
pieces.append(char)
return ''.join(pieces)

View file

@ -0,0 +1,341 @@
# Version 2013112900, Last Updated Fri Nov 29 07:07:01 2013 UTC
AC
AD
AE
AERO
AF
AG
AI
AL
AM
AN
AO
AQ
AR
ARPA
AS
ASIA
AT
AU
AW
AX
AZ
BA
BB
BD
BE
BF
BG
BH
BI
BIKE
BIZ
BJ
BM
BN
BO
BR
BS
BT
BV
BW
BY
BZ
CA
CAMERA
CAT
CC
CD
CF
CG
CH
CI
CK
CL
CLOTHING
CM
CN
CO
COM
CONSTRUCTION
CONTRACTORS
COOP
CR
CU
CV
CW
CX
CY
CZ
DE
DIAMONDS
DIRECTORY
DJ
DK
DM
DO
DZ
EC
EDU
EE
EG
ENTERPRISES
EQUIPMENT
ER
ES
ESTATE
ET
EU
FI
FJ
FK
FM
FO
FR
GA
GALLERY
GB
GD
GE
GF
GG
GH
GI
GL
GM
GN
GOV
GP
GQ
GR
GRAPHICS
GS
GT
GU
GURU
GW
GY
HK
HM
HN
HOLDINGS
HR
HT
HU
ID
IE
IL
IM
IN
INFO
INT
IO
IQ
IR
IS
IT
JE
JM
JO
JOBS
JP
KE
KG
KH
KI
KITCHEN
KM
KN
KP
KR
KW
KY
KZ
LA
LAND
LB
LC
LI
LIGHTING
LK
LR
LS
LT
LU
LV
LY
MA
MC
MD
ME
MG
MH
MIL
MK
ML
MM
MN
MO
MOBI
MP
MQ
MR
MS
MT
MU
MUSEUM
MV
MW
MX
MY
MZ
NA
NAME
NC
NE
NET
NF
NG
NI
NL
NO
NP
NR
NU
NZ
OM
ORG
PA
PE
PF
PG
PH
PHOTOGRAPHY
PK
PL
PLUMBING
PM
PN
POST
PR
PRO
PS
PT
PW
PY
QA
RE
RO
RS
RU
RW
SA
SB
SC
SD
SE
SEXY
SG
SH
SI
SINGLES
SJ
SK
SL
SM
SN
SO
SR
ST
SU
SV
SX
SY
SZ
TATTOO
TC
TD
TECHNOLOGY
TEL
TF
TG
TH
TIPS
TJ
TK
TL
TM
TN
TO
TODAY
TP
TR
TRAVEL
TT
TV
TW
TZ
UA
UG
UK
US
UY
UZ
VA
VC
VE
VENTURES
VG
VI
VN
VOYAGE
VU
WF
WS
XN--3E0B707E
XN--45BRJ9C
XN--80AO21A
XN--80ASEHDB
XN--80ASWG
XN--90A3AC
XN--CLCHC0EA0B2G2A9GCD
XN--FIQS8S
XN--FIQZ9S
XN--FPCRJ9C3D
XN--FZC2C9E2C
XN--GECRJ9C
XN--H2BRJ9C
XN--J1AMH
XN--J6W193G
XN--KPRW13D
XN--KPRY57D
XN--L1ACC
XN--LGBBAT1AD8J
XN--MGB9AWBF
XN--MGBA3A4F16A
XN--MGBAAM7A8H
XN--MGBAYH7GPA
XN--MGBBH1A71E
XN--MGBC0A9AZCG
XN--MGBERP4A5D4AR
XN--MGBX4CD0AB
XN--NGBC5AZD
XN--O3CW4H
XN--OGBPF8FL
XN--P1AI
XN--PGBS0DH
XN--Q9JYB4C
XN--S9BRJ9C
XN--UNUP4Y
XN--WGBH1C
XN--WGBL6A
XN--XKC2AL3HYE2A
XN--XKC2DL3A5EE0H
XN--YFRO4I67O
XN--YGBI2AMMX
XXX
YE
YT
ZA
ZM
ZW

View file

@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
class TransformerException(Exception):
def __init__(self, transformer, message):
# Call the base class constructor with the parameters it needs
Exception.__init__(self, message)
self.transformer = transformer

View file

@ -0,0 +1,67 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import found_property
class GuessBonusFeatures(Transformer):
def __init__(self):
Transformer.__init__(self, -150)
def supported_properties(self):
return ['bonusNumber', 'bonusTitle', 'filmNumber', 'filmSeries', 'title', 'series']
def process(self, mtree, options=None):
def previous_group(g):
for leaf in mtree.unidentified_leaves()[::-1]:
if leaf.node_idx < g.node_idx:
return leaf
def next_group(g):
for leaf in mtree.unidentified_leaves():
if leaf.node_idx > g.node_idx:
return leaf
def same_group(g1, g2):
return g1.node_idx[:2] == g2.node_idx[:2]
bonus = [node for node in mtree.leaves() if 'bonusNumber' in node.guess]
if bonus:
bonusTitle = next_group(bonus[0])
if bonusTitle and same_group(bonusTitle, bonus[0]):
found_property(bonusTitle, 'bonusTitle', confidence=0.8)
filmNumber = [node for node in mtree.leaves()
if 'filmNumber' in node.guess]
if filmNumber:
filmSeries = previous_group(filmNumber[0])
found_property(filmSeries, 'filmSeries', confidence=0.9)
title = next_group(filmNumber[0])
found_property(title, 'title', confidence=0.9)
season = [node for node in mtree.leaves() if 'season' in node.guess]
if season and 'bonusNumber' in mtree.info:
series = previous_group(season[0])
if same_group(series, season[0]):
found_property(series, 'series', confidence=0.9)

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.country import Country
from guessit import Guess
class GuessCountry(Transformer):
def __init__(self):
Transformer.__init__(self, -170)
# list of common words which could be interpreted as countries, but which
# are far too common to be able to say they represent a country
self.country_common_words = frozenset(['bt', 'bb'])
def supported_properties(self):
return ['country']
def should_process(self, mtree, options=None):
options = options or {}
return 'nocountry' not in options.keys()
def process(self, mtree, options=None):
for node in mtree.unidentified_leaves():
if len(node.node_idx) == 2:
c = node.value[1:-1].lower()
if c in self.country_common_words:
continue
# only keep explicit groups (enclosed in parentheses/brackets)
if not node.is_explicit():
continue
try:
country = Country(c, strict=True)
except ValueError:
continue
node.guess = Guess(country=country, confidence=1.0, input=node.value, span=node.span)
def post_process(self, mtree, options=None, *args, **kwargs):
# if country is in the guessed properties, make it part of the series name
series_leaves = mtree.leaves_containing('series')
country_leaves = mtree.leaves_containing('country')
if series_leaves and country_leaves:
country_leaf = country_leaves[0]
for serie_leaf in series_leaves:
serie_leaf.guess['series'] += ' (%s)' % country_leaf.guess['country'].alpha2.upper()
#result['series'] += ' (%s)' % result['country'].alpha2.upper()

View file

@ -0,0 +1,43 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from guessit.date import search_date
class GuessDate(Transformer):
def __init__(self):
Transformer.__init__(self, 50)
def supported_properties(self):
return ['date']
def guess_date(self, string, node=None, options=None):
date, span = search_date(string)
if date:
return {'date': date}, span
else:
return None, None
def process(self, mtree, options=None):
GuessFinder(self.guess_date, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,162 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer, get_transformer
from guessit.textutils import reorder_title
from guessit.matcher import found_property
class GuessEpisodeInfoFromPosition(Transformer):
def __init__(self):
Transformer.__init__(self, -200)
def supported_properties(self):
return ['title', 'series']
def match_from_epnum_position(self, mtree, node):
epnum_idx = node.node_idx
# a few helper functions to be able to filter using high-level semantics
def before_epnum_in_same_pathgroup():
return [leaf for leaf in mtree.unidentified_leaves()
if (leaf.node_idx[0] == epnum_idx[0] and
leaf.node_idx[1:] < epnum_idx[1:])]
def after_epnum_in_same_pathgroup():
return [leaf for leaf in mtree.unidentified_leaves()
if (leaf.node_idx[0] == epnum_idx[0] and
leaf.node_idx[1:] > epnum_idx[1:])]
def after_epnum_in_same_explicitgroup():
return [leaf for leaf in mtree.unidentified_leaves()
if (leaf.node_idx[:2] == epnum_idx[:2] and
leaf.node_idx[2:] > epnum_idx[2:])]
# epnumber is the first group and there are only 2 after it in same
# path group
# -> series title - episode title
title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup())
if ('title' not in mtree.info and # no title
before_epnum_in_same_pathgroup() == [] and # no groups before
len(title_candidates) == 2): # only 2 groups after
found_property(title_candidates[0], 'series', confidence=0.4)
found_property(title_candidates[1], 'title', confidence=0.4)
return
# if we have at least 1 valid group before the episodeNumber, then it's
# probably the series name
series_candidates = before_epnum_in_same_pathgroup()
if len(series_candidates) >= 1:
found_property(series_candidates[0], 'series', confidence=0.7)
# only 1 group after (in the same path group) and it's probably the
# episode title
title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup())
if len(title_candidates) == 1:
found_property(title_candidates[0], 'title', confidence=0.5)
return
else:
# try in the same explicit group, with lower confidence
title_candidates = self._filter_candidates(after_epnum_in_same_explicitgroup())
if len(title_candidates) == 1:
found_property(title_candidates[0], 'title', confidence=0.4)
return
elif len(title_candidates) > 1:
found_property(title_candidates[0], 'title', confidence=0.3)
return
# get the one with the longest value
title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup())
if title_candidates:
maxidx = -1
maxv = -1
for i, c in enumerate(title_candidates):
if len(c.clean_value) > maxv:
maxidx = i
maxv = len(c.clean_value)
found_property(title_candidates[maxidx], 'title', confidence=0.3)
def should_process(self, mtree, options=None):
options = options or {}
return not options.get('skip_title') and mtree.guess.get('type', '').startswith('episode')
def _filter_candidates(self, candidates):
episode_special_transformer = get_transformer('guess_episode_special')
if episode_special_transformer:
return [n for n in candidates if not episode_special_transformer.container.find_properties(n.value, n, re_match=True)]
else:
return candidates
def process(self, mtree, options=None):
"""
try to identify the remaining unknown groups by looking at their
position relative to other known elements
"""
eps = [node for node in mtree.leaves() if 'episodeNumber' in node.guess]
if eps:
self.match_from_epnum_position(mtree, eps[0])
else:
# if we don't have the episode number, but at least 2 groups in the
# basename, then it's probably series - eptitle
basename = mtree.node_at((-2,))
title_candidates = self._filter_candidates(basename.unidentified_leaves())
if len(title_candidates) >= 2:
found_property(title_candidates[0], 'series', confidence=0.4)
found_property(title_candidates[1], 'title', confidence=0.4)
elif len(title_candidates) == 1:
# but if there's only one candidate, it's probably the series name
found_property(title_candidates[0], 'series', confidence=0.4)
# if we only have 1 remaining valid group in the folder containing the
# file, then it's likely that it is the series name
try:
series_candidates = mtree.node_at((-3,)).unidentified_leaves()
except ValueError:
series_candidates = []
if len(series_candidates) == 1:
found_property(series_candidates[0], 'series', confidence=0.3)
# if there's a path group that only contains the season info, then the
# previous one is most likely the series title (ie: ../series/season X/..)
eps = [node for node in mtree.nodes()
if 'season' in node.guess and 'episodeNumber' not in node.guess]
if eps:
previous = [node for node in mtree.unidentified_leaves()
if node.node_idx[0] == eps[0].node_idx[0] - 1]
if len(previous) == 1:
found_property(previous[0], 'series', confidence=0.5)
def post_process(self, mtree, options=None):
for node in mtree.nodes():
if 'series' not in node.guess:
continue
node.guess['series'] = reorder_title(node.guess['series'])

View file

@ -0,0 +1,62 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import found_guess
from guessit.containers import PropertiesContainer
class GuessEpisodeSpecial(Transformer):
def __init__(self):
Transformer.__init__(self, -205)
self.container = PropertiesContainer()
self.container.register_property('special', 'Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired')
self.container.register_property('special', 'Extras?', canonical_form='Extras')
def guess_special(self, string, node=None, options=None):
properties = self.container.find_properties(string, node, 'special', multiple=True)
guesses = self.container.as_guess(properties, multiple=True)
return guesses
def second_pass_options(self, mtree, options=None):
if not mtree.guess.get('type', '').startswith('episode'):
for unidentified_leaf in mtree.unidentified_leaves():
properties = self.container.find_properties(unidentified_leaf.value, unidentified_leaf, 'special')
guess = self.container.as_guess(properties)
if guess:
return {'type': 'episode'}
return None
def supported_properties(self):
return self.container.get_supported_properties()
def process(self, mtree, options=None):
if mtree.guess.get('type', '').startswith('episode') and (not mtree.info.get('episodeNumber') or mtree.info.get('season') == 0):
for title_leaf in mtree.leaves_containing('title'):
guesses = self.guess_special(title_leaf.value, title_leaf, options)
for guess in guesses:
found_guess(title_leaf, guess, update_guess=False)
for unidentified_leaf in mtree.unidentified_leaves():
guesses = self.guess_special(unidentified_leaf.value, unidentified_leaf, options)
for guess in guesses:
found_guess(unidentified_leaf, guess, update_guess=False)
return None

View file

@ -0,0 +1,80 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from guessit.patterns import sep
from guessit.containers import PropertiesContainer, WeakValidator, NoValidator
from guessit.patterns.numeral import numeral, digital_numeral, parse_numeral
from re import split as re_split
class GuessEpisodesRexps(Transformer):
def __init__(self):
Transformer.__init__(self, 20)
self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False)
def episode_parser(value):
values = re_split('[a-zA-Z]', value)
values = [x for x in values if x]
ret = []
for letters_elt in values:
dashed_values = letters_elt.split('-')
dashed_values = [x for x in dashed_values if x]
if len(dashed_values) > 1:
for _ in range(0, len(dashed_values) - 1):
start_dash_ep = parse_numeral(dashed_values[0])
end_dash_ep = parse_numeral(dashed_values[1])
for dash_ep in range(start_dash_ep, end_dash_ep + 1):
ret.append(dash_ep)
else:
ret.append(parse_numeral(letters_elt))
if len(ret) > 1:
return {None: ret[0], 'episodeList': ret} # TODO: Should support seasonList also
elif len(ret) > 0:
return ret[0]
else:
return None
self.container.register_property(None, r'((?:season|saison)' + sep + '?(?P<season>' + numeral + '))', confidence=1.0, formatter=parse_numeral)
self.container.register_property(None, r'(s(?P<season>' + digital_numeral + ')[^0-9]?' + sep + '?(?P<episodeNumber>(?:e' + digital_numeral + '(?:' + sep + '?[e-]' + digital_numeral + ')*)))[^0-9]', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser}, validator=NoValidator())
self.container.register_property(None, r'[^0-9]((?P<season>' + digital_numeral + ')[^0-9 .-]?-?(?P<episodeNumber>(?:x' + digital_numeral + '(?:' + sep + '?[x-]' + digital_numeral + ')*)))[^0-9]', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser})
self.container.register_property(None, r'(s(?P<season>' + digital_numeral + '))[^0-9]', confidence=0.6, formatter=parse_numeral, validator=NoValidator())
self.container.register_property(None, r'((?P<episodeNumber>' + digital_numeral + ')v[23])', confidence=0.6, formatter=parse_numeral)
self.container.register_property(None, r'((?:ep)' + sep + r'(?P<episodeNumber>' + numeral + '))[^0-9]', confidence=0.7, formatter=parse_numeral)
self.container.register_property(None, r'(e(?P<episodeNumber>' + digital_numeral + '))', confidence=0.6, formatter=parse_numeral)
self.container.register_canonical_properties('other', 'FiNAL', 'Complete', validator=WeakValidator())
def supported_properties(self):
return ['episodeNumber', 'season']
def guess_episodes_rexps(self, string, node=None, options=None):
found = self.container.find_properties(string, node)
return self.container.as_guess(found, string)
def should_process(self, mtree, options=None):
return mtree.guess.get('type', '').startswith('episode')
def process(self, mtree, options=None):
GuessFinder(self.guess_episodes_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,213 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import mimetypes
import os.path
import re
from guessit.guess import Guess
from guessit.patterns.extension import subtitle_exts, info_exts, video_exts
from guessit.transfo import TransformerException
from guessit.plugins.transformers import Transformer, get_transformer
from guessit.matcher import log_found_guess, found_guess
from guessit.textutils import clean_string
class GuessFiletype(Transformer):
def __init__(self):
Transformer.__init__(self, 250)
# List of well known movies and series, hardcoded because they cannot be
# guessed appropriately otherwise
MOVIES = ['OSS 117']
SERIES = ['Band of Brothers']
MOVIES = [m.lower() for m in MOVIES]
SERIES = [s.lower() for s in SERIES]
def guess_filetype(self, mtree, options=None):
options = options or {}
# put the filetype inside a dummy container to be able to have the
# following functions work correctly as closures
# this is a workaround for python 2 which doesn't have the
# 'nonlocal' keyword which we could use here in the upgrade_* functions
# (python 3 does have it)
filetype_container = [mtree.guess.get('type')]
other = {}
filename = mtree.string
def upgrade_episode():
if filetype_container[0] == 'subtitle':
filetype_container[0] = 'episodesubtitle'
elif filetype_container[0] == 'info':
filetype_container[0] = 'episodeinfo'
elif not filetype_container[0]:
filetype_container[0] = 'episode'
def upgrade_movie():
if filetype_container[0] == 'subtitle':
filetype_container[0] = 'moviesubtitle'
elif filetype_container[0] == 'info':
filetype_container[0] = 'movieinfo'
elif not filetype_container[0]:
filetype_container[0] = 'movie'
def upgrade_subtitle():
if filetype_container[0] == 'movie':
filetype_container[0] = 'moviesubtitle'
elif filetype_container[0] == 'episode':
filetype_container[0] = 'episodesubtitle'
elif not filetype_container[0]:
filetype_container[0] = 'subtitle'
def upgrade_info():
if filetype_container[0] == 'movie':
filetype_container[0] = 'movieinfo'
elif filetype_container[0] == 'episode':
filetype_container[0] = 'episodeinfo'
elif not filetype_container[0]:
filetype_container[0] = 'info'
# look at the extension first
fileext = os.path.splitext(filename)[1][1:].lower()
if fileext in subtitle_exts:
upgrade_subtitle()
other = {'container': fileext}
elif fileext in info_exts:
upgrade_info()
other = {'container': fileext}
elif fileext in video_exts:
other = {'container': fileext}
else:
if fileext and not options.get('name_only'):
other = {'extension': fileext}
# check whether we are in a 'Movies', 'Tv Shows', ... folder
folder_rexps = [
(r'Movies?', upgrade_movie),
(r'Films?', upgrade_movie),
(r'Tv[ _-]?Shows?', upgrade_episode),
(r'Series?', upgrade_episode),
(r'Episodes?', upgrade_episode),
]
for frexp, upgrade_func in folder_rexps:
frexp = re.compile(frexp, re.IGNORECASE)
for pathgroup in mtree.children:
if frexp.match(pathgroup.value):
upgrade_func()
return filetype_container[0], other
# check for a few specific cases which will unintentionally make the
# following heuristics confused (eg: OSS 117 will look like an episode,
# season 1, epnum 17, when it is in fact a movie)
fname = clean_string(filename).lower()
for m in self.MOVIES:
if m in fname:
self.log.debug('Found in exception list of movies -> type = movie')
upgrade_movie()
return filetype_container[0], other
for s in self.SERIES:
if s in fname:
self.log.debug('Found in exception list of series -> type = episode')
upgrade_episode()
return filetype_container[0], other
# now look whether there are some specific hints for episode vs movie
# if we have an episode_rexp (eg: s02e13), it is an episode
episode_transformer = get_transformer('guess_episodes_rexps')
if episode_transformer:
guess = episode_transformer.guess_episodes_rexps(filename)
if guess:
self.log.debug('Found guess_episodes_rexps: %s -> type = episode', guess)
upgrade_episode()
return filetype_container[0], other
properties_transformer = get_transformer('guess_properties')
if properties_transformer:
# if we have certain properties characteristic of episodes, it is an ep
found = properties_transformer.container.find_properties(filename, mtree, 'episodeFormat')
guess = properties_transformer.container.as_guess(found, filename)
if guess:
self.log.debug('Found characteristic property of episodes: %s"', guess)
upgrade_episode()
return filetype_container[0], other
found = properties_transformer.container.find_properties(filename, mtree, 'format')
guess = properties_transformer.container.as_guess(found, filename)
if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'):
# Use weak episodes only if TV or WEB source
weak_episode_transformer = get_transformer('guess_weak_episodes_rexps')
if weak_episode_transformer:
guess = weak_episode_transformer.guess_weak_episodes_rexps(filename)
if guess:
self.log.debug('Found guess_weak_episodes_rexps: %s -> type = episode', guess)
upgrade_episode()
return filetype_container[0], other
website_transformer = get_transformer('guess_website')
if website_transformer:
found = website_transformer.container.find_properties(filename, mtree, 'website')
guess = website_transformer.container.as_guess(found, filename)
if guess:
for namepart in ('tv', 'serie', 'episode'):
if namepart in guess['website']:
# origin-specific type
self.log.debug('Found characteristic property of episodes: %s', guess)
upgrade_episode()
return filetype_container[0], other
if filetype_container[0] in ('subtitle', 'info') or (not filetype_container[0] and fileext in video_exts):
# if no episode info found, assume it's a movie
self.log.debug('Nothing characteristic found, assuming type = movie')
upgrade_movie()
if not filetype_container[0]:
self.log.debug('Nothing characteristic found, assuming type = unknown')
filetype_container[0] = 'unknown'
return filetype_container[0], other
def process(self, mtree, options=None):
"""guess the file type now (will be useful later)
"""
filetype, other = self.guess_filetype(mtree, options)
mtree.guess.set('type', filetype, confidence=1.0)
log_found_guess(mtree.guess)
filetype_info = Guess(other, confidence=1.0)
# guess the mimetype of the filename
# TODO: handle other mimetypes not found on the default type_maps
# mimetypes.types_map['.srt']='text/subtitle'
mime, _ = mimetypes.guess_type(mtree.string, strict=False)
if mime is not None:
filetype_info.update({'mimetype': mime}, confidence=1.0)
node_ext = mtree.node_at((-1,))
found_guess(node_ext, filetype_info)
if mtree.guess.get('type') in [None, 'unknown']:
if options.get('name_only'):
mtree.guess.set('type', 'movie', confidence=0.6)
else:
raise TransformerException(__name__, 'Unknown file type')

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
import re
class GuessIdnumber(Transformer):
def __init__(self):
Transformer.__init__(self, -180)
def supported_properties(self):
return ['idNumber']
_idnum = re.compile(r'(?P<idNumber>[a-zA-Z0-9-]{20,})') # 1.0, (0, 0))
def guess_idnumber(self, string, node=None, options=None):
match = self._idnum.search(string)
if match is not None:
result = match.groupdict()
switch_count = 0
DIGIT = 0
LETTER = 1
OTHER = 2
last = LETTER
for c in result['idNumber']:
if c in '0123456789':
ci = DIGIT
elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
ci = LETTER
else:
ci = OTHER
if ci != last:
switch_count += 1
last = ci
switch_ratio = float(switch_count) / len(result['idNumber'])
# only return the result as probable if we alternate often between
# char type (more likely for hash values than for common words)
if switch_ratio > 0.4:
return result, match.span()
return None, None
def process(self, mtree, options=None):
GuessFinder(self.guess_idnumber, 0.4, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,169 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.language import search_language, subtitle_prefixes, subtitle_suffixes
from guessit.patterns.extension import subtitle_exts
from guessit.textutils import clean_string, find_words
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
class GuessLanguage(Transformer):
def __init__(self):
Transformer.__init__(self, 30)
def supported_properties(self):
return ['language', 'subtitleLanguage']
def guess_language(self, string, node=None, options=None):
guess = search_language(string)
return guess
def _skip_language_on_second_pass(self, mtree, node):
"""Check if found node is a valid language node, or if it's a false positive.
:param mtree: Tree detected on first pass.
:type mtree: :class:`guessit.matchtree.MatchTree`
:param node: Node that contains a language Guess
:type node: :class:`guessit.matchtree.MatchTree`
:return: True if a second pass skipping this node is required
:rtype: bool
"""
unidentified_starts = {}
unidentified_ends = {}
property_starts = {}
property_ends = {}
title_starts = {}
title_ends = {}
for unidentified_node in mtree.unidentified_leaves():
unidentified_starts[unidentified_node.span[0]] = unidentified_node
unidentified_ends[unidentified_node.span[1]] = unidentified_node
for property_node in mtree.leaves_containing('year'):
property_starts[property_node.span[0]] = property_node
property_ends[property_node.span[1]] = property_node
for title_node in mtree.leaves_containing(['title', 'series']):
title_starts[title_node.span[0]] = title_node
title_ends[title_node.span[1]] = title_node
return node.span[0] in title_ends.keys() and (node.span[1] in unidentified_starts.keys() or node.span[1] + 1 in property_starts.keys()) or\
node.span[1] in title_starts.keys() and (node.span[0] == 0 or node.span[0] in unidentified_ends.keys() or node.span[0] in property_ends.keys())
def second_pass_options(self, mtree, options=None):
m = mtree.matched()
to_skip_language_nodes = []
for lang_key in ('language', 'subtitleLanguage'):
langs = {}
lang_nodes = set(n for n in mtree.leaves_containing(lang_key))
for lang_node in lang_nodes:
lang = lang_node.guess.get(lang_key, None)
if self._skip_language_on_second_pass(mtree, lang_node):
# Language probably split the title. Add to skip for 2nd pass.
# if filetype is subtitle and the language appears last, just before
# the extension, then it is likely a subtitle language
parts = clean_string(lang_node.root.value).split()
if (m.get('type') in ['moviesubtitle', 'episodesubtitle'] and
(parts.index(lang_node.value) == len(parts) - 2)):
continue
to_skip_language_nodes.append(lang_node)
elif not lang in langs:
langs[lang] = lang_node
else:
# The same language was found. Keep the more confident one,
# and add others to skip for 2nd pass.
existing_lang_node = langs[lang]
to_skip = None
if (existing_lang_node.guess.confidence('language') >=
lang_node.guess.confidence('language')):
# lang_node is to remove
to_skip = lang_node
else:
# existing_lang_node is to remove
langs[lang] = lang_node
to_skip = existing_lang_node
to_skip_language_nodes.append(to_skip)
if to_skip_language_nodes:
return {'skip_nodes': to_skip_language_nodes}
return None
def should_process(self, mtree, options=None):
options = options or {}
return 'nolanguage' not in options
def process(self, mtree, options=None):
GuessFinder(self.guess_language, None, self.log, options).process_nodes(mtree.unidentified_leaves())
def promote_subtitle(self, node):
node.guess.set('subtitleLanguage', node.guess['language'],
confidence=node.guess.confidence('language'))
del node.guess['language']
def post_process(self, mtree, options=None):
# 1- try to promote language to subtitle language where it makes sense
for node in mtree.nodes():
if 'language' not in node.guess:
continue
# - if we matched a language in a file with a sub extension and that
# the group is the last group of the filename, it is probably the
# language of the subtitle
# (eg: 'xxx.english.srt')
if (mtree.node_at((-1,)).value.lower() in subtitle_exts and
node == mtree.leaves()[-2]):
self.promote_subtitle(node)
# - if we find in the same explicit group
# a subtitle prefix before the language,
# or a subtitle suffix after the language,
# then upgrade the language
explicit_group = mtree.node_at(node.node_idx[:2])
group_str = explicit_group.value.lower()
for sub_prefix in subtitle_prefixes:
if (sub_prefix in find_words(group_str) and
0 <= group_str.find(sub_prefix) < (node.span[0] - explicit_group.span[0])):
self.promote_subtitle(node)
for sub_suffix in subtitle_suffixes:
if (sub_suffix in find_words(group_str) and
(node.span[0] - explicit_group.span[0]) < group_str.find(sub_suffix)):
self.promote_subtitle(node)
# - if a language is in an explicit group just preceded by "st",
# it is a subtitle language (eg: '...st[fr-eng]...')
try:
idx = node.node_idx
previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1]
if previous.value.lower()[-2:] == 'st':
self.promote_subtitle(node)
except IndexError:
pass

View file

@ -0,0 +1,177 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import found_property
from guessit import u
class GuessMovieTitleFromPosition(Transformer):
def __init__(self):
Transformer.__init__(self, -200)
def supported_properties(self):
return ['title']
def should_process(self, mtree, options=None):
options = options or {}
return not options.get('skip_title') and not mtree.guess.get('type', '').startswith('episode')
def process(self, mtree, options=None):
"""
try to identify the remaining unknown groups by looking at their
position relative to other known elements
"""
basename = mtree.node_at((-2,))
all_valid = lambda leaf: len(leaf.clean_value) > 0
basename_leftover = basename.unidentified_leaves(valid=all_valid)
try:
folder = mtree.node_at((-3,))
folder_leftover = folder.unidentified_leaves()
except ValueError:
folder = None
folder_leftover = []
self.log.debug('folder: %s' % u(folder_leftover))
self.log.debug('basename: %s' % u(basename_leftover))
# specific cases:
# if we find the same group both in the folder name and the filename,
# it's a good candidate for title
if (folder_leftover and basename_leftover and
folder_leftover[0].clean_value == basename_leftover[0].clean_value):
found_property(folder_leftover[0], 'title', confidence=0.8)
return
# specific cases:
# if the basename contains a number first followed by an unidentified
# group, and the folder only contains 1 unidentified one, then we have
# a series
# ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv
try:
series = folder_leftover[0]
filmNumber = basename_leftover[0]
title = basename_leftover[1]
basename_leaves = basename.leaves()
num = int(filmNumber.clean_value)
self.log.debug('series: %s' % series.clean_value)
self.log.debug('title: %s' % title.clean_value)
if (series.clean_value != title.clean_value and
series.clean_value != filmNumber.clean_value and
basename_leaves.index(filmNumber) == 0 and
basename_leaves.index(title) == 1):
found_property(title, 'title', confidence=0.6)
found_property(series, 'filmSeries', confidence=0.6)
found_property(filmNumber, 'filmNumber', num, confidence=0.6)
return
except Exception:
pass
# specific cases:
# - movies/tttttt (yyyy)/tttttt.ccc
try:
if mtree.node_at((-4, 0)).value.lower() == 'movies':
folder = mtree.node_at((-3,))
# Note:too generic, might solve all the unittests as they all
# contain 'movies' in their path
#
# if containing_folder.is_leaf() and not containing_folder.guess:
# containing_folder.guess =
# Guess({ 'title': clean_string(containing_folder.value) },
# confidence=0.7)
year_group = folder.first_leaf_containing('year')
groups_before = folder.previous_unidentified_leaves(year_group)
found_property(groups_before[0], 'title', confidence=0.8)
return
except Exception:
pass
# if we have either format or videoCodec in the folder containing the file
# or one of its parents, then we should probably look for the title in
# there rather than in the basename
try:
props = mtree.previous_leaves_containing(mtree.children[-2],
['videoCodec', 'format',
'language'])
except IndexError:
props = []
if props:
group_idx = props[0].node_idx[0]
if all(g.node_idx[0] == group_idx for g in props):
# if they're all in the same group, take leftover info from there
leftover = mtree.node_at((group_idx,)).unidentified_leaves()
if leftover:
found_property(leftover[0], 'title', confidence=0.7)
return
# look for title in basename if there are some remaining unidentified
# groups there
if basename_leftover:
# if basename is only one word and the containing folder has at least
# 3 words in it, we should take the title from the folder name
# ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi
# ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here?
if (basename_leftover[0].clean_value.count(' ') == 0 and
folder_leftover and
folder_leftover[0].clean_value.count(' ') >= 2):
found_property(folder_leftover[0], 'title', confidence=0.7)
return
# if there are only many unidentified groups, take the first of which is
# not inside brackets or parentheses.
# ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi
if basename_leftover[0].is_explicit():
for basename_leftover_elt in basename_leftover:
if not basename_leftover_elt.is_explicit():
found_property(basename_leftover_elt, 'title', confidence=0.8)
return
# if all else fails, take the first remaining unidentified group in the
# basename as title
found_property(basename_leftover[0], 'title', confidence=0.6)
return
# if there are no leftover groups in the basename, look in the folder name
if folder_leftover:
found_property(folder_leftover[0], 'title', confidence=0.5)
return
# if nothing worked, look if we have a very small group at the beginning
# of the basename
basename = mtree.node_at((-2,))
basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True)
if basename_leftover:
found_property(basename_leftover[0], 'title', confidence=0.4)
return

View file

@ -0,0 +1,230 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.containers import PropertiesContainer, WeakValidator, LeavesValidator, QualitiesContainer
from guessit.patterns.extension import subtitle_exts, video_exts, info_exts
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
class GuessProperties(Transformer):
def __init__(self):
Transformer.__init__(self, 35)
self.container = PropertiesContainer()
self.qualities = QualitiesContainer()
def register_property(propname, props):
"""props a dict of {value: [patterns]}"""
for canonical_form, patterns in props.items():
if isinstance(patterns, tuple):
patterns2, kwargs = patterns
kwargs = dict(kwargs)
kwargs['canonical_form'] = canonical_form
self.container.register_property(propname, *patterns2, **kwargs)
else:
self.container.register_property(propname, *patterns, canonical_form=canonical_form)
def register_quality(propname, quality_dict):
"""props a dict of {canonical_form: quality}"""
for canonical_form, quality in quality_dict.items():
self.qualities.register_quality(propname, canonical_form, quality)
register_property('container', {'mp4': ['MP4']})
# http://en.wikipedia.org/wiki/Pirated_movie_release_types
register_property('format', {'VHS': ['VHS'],
'Cam': ['CAM', 'CAMRip'],
'Telesync': ['TELESYNC', 'PDVD'],
'Telesync': (['TS'], {'confidence': 0.2}),
'Workprint': ['WORKPRINT', 'WP'],
'Telecine': ['TELECINE', 'TC'],
'PPV': ['PPV', 'PPV-Rip'], # Pay Per View
'TV': ['SD-TV', 'SD-TV-Rip', 'Rip-SD-TV', 'TV-Rip', 'Rip-TV'],
'DVB': ['DVB-Rip', 'DVB', 'PD-TV'],
'DVD': ['DVD', 'DVD-Rip', 'VIDEO-TS'],
'HDTV': ['HD-TV', 'TV-RIP-HD', 'HD-TV-RIP'],
'VOD': ['VOD', 'VOD-Rip'],
'WEBRip': ['WEB-Rip'],
'WEB-DL': ['WEB-DL'],
'HD-DVD': ['HD-(?:DVD)?-Rip', 'HD-DVD'],
'BluRay': ['Blu-ray', 'B[DR]', 'B[DR]-Rip', 'BD[59]', 'BD25', 'BD50']
})
register_quality('format', {'VHS': -100,
'Cam': -90,
'Telesync': -80,
'Workprint': -70,
'Telecine': -60,
'PPV': -50,
'TV': -30,
'DVB': -20,
'DVD': 0,
'HDTV': 20,
'VOD': 40,
'WEBRip': 50,
'WEB-DL': 60,
'HD-DVD': 80,
'BluRay': 100
})
register_property('screenSize', {'360p': ['(?:\d{3,}(?:\\|\/|x|\*))?360(?:i|p?x?)'],
'368p': ['(?:\d{3,}(?:\\|\/|x|\*))?368(?:i|p?x?)'],
'480p': ['(?:\d{3,}(?:\\|\/|x|\*))?480(?:i|p?x?)'],
'480p': (['hr'], {'confidence': 0.2}),
'576p': ['(?:\d{3,}(?:\\|\/|x|\*))?576(?:i|p?x?)'],
'720p': ['(?:\d{3,}(?:\\|\/|x|\*))?720(?:i|p?x?)'],
'900p': ['(?:\d{3,}(?:\\|\/|x|\*))?900(?:i|p?x?)'],
'1080i': ['(?:\d{3,}(?:\\|\/|x|\*))?1080i'],
'1080p': ['(?:\d{3,}(?:\\|\/|x|\*))?1080(?:p?x?)'],
'4K': ['(?:\d{3,}(?:\\|\/|x|\*))?2160(?:i|p?x?)']
})
register_quality('screenSize', {'360p': -300,
'368p': -200,
'480p': -100,
'576p': 0,
'720p': 100,
'900p': 130,
'1080i': 180,
'1080p': 200,
'4K': 400
})
_videoCodecProperty = {'Real': ['Rv\d{2}'], # http://en.wikipedia.org/wiki/RealVideo
'Mpeg2': ['Mpeg2'],
'DivX': ['DVDivX', 'DivX'],
'XviD': ['XviD'],
'h264': ['[hx]-264(?:-AVC)?', 'MPEG-4(?:-AVC)'],
'h265': ['[hx]-265(?:-HEVC)?', 'HEVC']
}
register_property('videoCodec', _videoCodecProperty)
register_quality('videoCodec', {'Real': -50,
'Mpeg2': -30,
'DivX': -10,
'XviD': 0,
'h264': 100,
'h265': 150
})
# http://blog.mediacoderhq.com/h264-profiles-and-levels/
# http://fr.wikipedia.org/wiki/H.264
self.container.register_property('videoProfile', 'BP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', 'XP', 'EP', canonical_form='XP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', 'MP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', 'HP', 'HiP', canonical_form='HP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', '10.?bit', 'Hi10P', canonical_form='10bit', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', 'Hi422P', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
self.container.register_property('videoProfile', 'Hi444PP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess]))
register_quality('videoProfile', {'BP': -20,
'XP': -10,
'MP': 0,
'HP': 10,
'10bit': 15,
'Hi422P': 25,
'Hi444PP': 35
})
# has nothing to do here (or on filenames for that matter), but some
# releases use it and it helps to identify release groups, so we adapt
register_property('videoApi', {'DXVA': ['DXVA']})
register_property('audioCodec', {'MP3': ['MP3'],
'DolbyDigital': ['DD'],
'AAC': ['AAC'],
'AC3': ['AC3'],
'Flac': ['FLAC'],
'DTS': ['DTS'],
'TrueHD': ['True-HD']
})
register_quality('audioCodec', {'MP3': 10,
'DolbyDigital': 30,
'AAC': 35,
'AC3': 40,
'Flac': 45,
'DTS': 60,
'TrueHD': 70
})
self.container.register_property('audioProfile', 'HD', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS']))
self.container.register_property('audioProfile', 'HD-MA', canonical_form='HDMA', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS']))
self.container.register_property('audioProfile', 'HE', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC']))
self.container.register_property('audioProfile', 'LC', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC']))
self.container.register_property('audioProfile', 'HQ', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AC3']))
register_quality('audioProfile', {'HD': 20,
'HDMA': 50,
'LC': 0,
'HQ': 0,
'HE': 20
})
register_property('audioChannels', {'7.1': ['7[\W_]1', '7ch'],
'5.1': ['5[\W_]1', '5ch'],
'2.0': ['2[\W_]0', '2ch', 'stereo'],
'1.0': ['1[\W_]0', '1ch', 'mono']
})
register_quality('audioChannels', {'7.1': 200,
'5.1': 100,
'2.0': 0,
'1.0': -100
})
self.container.register_property('episodeFormat', r'Minisodes?', canonical_form='Minisode')
register_property('other', {'AudioFix': ['Audio-Fix', 'Audio-Fixed'],
'SyncFix': ['Sync-Fix', 'Sync-Fixed'],
'DualAudio': ['Dual-Audio'],
'WideScreen': ['ws', 'wide-screen'],
})
self.container.register_property('other', 'Real', 'Fix', canonical_form="Proper", validator=WeakValidator())
self.container.register_property('other', 'Proper', 'Repack', 'Rerip', canonical_form="Proper")
self.container.register_canonical_properties('other', 'R5', 'Screener', '3D', 'HD', 'HQ', 'DDC')
self.container.register_canonical_properties('other', 'Limited', 'Complete', 'Classic', 'Unrated', 'LiNE', 'Bonus', 'Trailer', validator=WeakValidator())
for prop in self.container.get_properties('format'):
self.container.register_property('other', prop.pattern + '(-?Scr(?:eener)?)', canonical_form='Screener')
for exts in (subtitle_exts, info_exts, video_exts):
for container in exts:
self.container.register_property('container', container, confidence=0.3)
def guess_properties(self, string, node=None, options=None):
found = self.container.find_properties(string, node)
return self.container.as_guess(found, string)
def supported_properties(self):
return self.container.get_supported_properties()
def process(self, mtree, options=None):
GuessFinder(self.guess_properties, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves())
def rate_quality(self, guess, *props):
return self.qualities.rate_quality(guess, *props)

View file

@ -0,0 +1,149 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder, found_property, found_guess
from guessit.containers import PropertiesContainer
from guessit.patterns import sep
from guessit.guess import Guess
from guessit.textutils import strip_brackets
class GuessReleaseGroup(Transformer):
def __init__(self):
Transformer.__init__(self, -190)
self.container = PropertiesContainer(canonical_from_pattern=False)
self._allowed_groupname_pattern = '[\w@#€£$&]'
self._forbidden_groupname_lambda = [lambda elt: elt in ['rip', 'by', 'for', 'par', 'pour', 'bonus'],
lambda elt: self._is_number(elt),
]
# If the previous property in this list, the match will be considered as safe
# and group name can contain a separator.
self.previous_safe_properties = ['videoCodec', 'format', 'videoApi', 'audioCodec', 'audioProfile', 'videoProfile', 'audioChannels']
self.container.sep_replace_char = '-'
self.container.canonical_from_pattern = False
self.container.enhance = True
self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+')
self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+-' + self._allowed_groupname_pattern + '+')
def supported_properties(self):
return self.container.get_supported_properties()
def _is_number(self, s):
try:
int(s)
return True
except ValueError:
return False
def validate_group_name(self, guess):
val = guess['releaseGroup']
if len(val) >= 2:
if '-' in val:
checked_val = ""
for elt in val.split('-'):
forbidden = False
for forbidden_lambda in self._forbidden_groupname_lambda:
forbidden = forbidden_lambda(elt.lower())
if forbidden:
break
if not forbidden:
if checked_val:
checked_val += '-'
checked_val += elt
else:
break
val = checked_val
if not val:
return False
guess['releaseGroup'] = val
forbidden = False
for forbidden_lambda in self._forbidden_groupname_lambda:
forbidden = forbidden_lambda(val.lower())
if forbidden:
break
if not forbidden:
return True
return False
def is_leaf_previous(self, leaf, node):
if leaf.span[1] <= node.span[0]:
for idx in range(leaf.span[1], node.span[0]):
if not leaf.root.value[idx] in sep:
return False
return True
return False
def guess_release_group(self, string, node=None, options=None):
found = self.container.find_properties(string, node, 'releaseGroup')
guess = self.container.as_guess(found, string, self.validate_group_name, sep_replacement='-')
validated_guess = None
if guess:
explicit_group_node = node.group_node()
if explicit_group_node:
for leaf in explicit_group_node.leaves_containing(self.previous_safe_properties):
if self.is_leaf_previous(leaf, node):
if leaf.root.value[leaf.span[1]] == '-':
guess.metadata().confidence = 1
else:
guess.metadata().confidence = 0.7
validated_guess = guess
if not validated_guess:
# If previous group last leaf is identified as a safe property,
# consider the raw value as a releaseGroup
previous_group_node = node.previous_group_node()
if previous_group_node:
for leaf in previous_group_node.leaves_containing(self.previous_safe_properties):
if self.is_leaf_previous(leaf, node):
guess = Guess({'releaseGroup': node.value}, confidence=1, input=node.value, span=(0, len(node.value)))
if self.validate_group_name(guess):
node.guess = guess
validated_guess = guess
if validated_guess:
# If following group nodes have only one unidentified leaf, it belongs to the release group
next_group_node = node
while True:
next_group_node = next_group_node.next_group_node()
if next_group_node:
leaves = next_group_node.leaves()
if len(leaves) == 1 and not leaves[0].guess:
validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value
leaves[0].guess = validated_guess
else:
break
else:
break
if validated_guess:
# Strip brackets
validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup'])
return validated_guess
def process(self, mtree, options=None):
GuessFinder(self.guess_release_group, None, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,58 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, \
unicode_literals
from guessit.patterns import _psep
from guessit.containers import PropertiesContainer
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from guessit.patterns.numeral import parse_numeral
class GuessVideoRexps(Transformer):
def __init__(self):
Transformer.__init__(self, 25)
self.container = PropertiesContainer(canonical_from_pattern=False)
self.container.register_property(None, 'cd' + _psep + '(?P<cdNumber>[0-9])(?:' + _psep + 'of' + _psep + '(?P<cdNumberTotal>[0-9]))?', confidence=1.0, enhance=False, global_span=True, formatter=parse_numeral)
self.container.register_property('cdNumberTotal', '([1-9])' + _psep + 'cds?', confidence=0.9, enhance=False, formatter=parse_numeral)
self.container.register_property('bonusNumber', 'x([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral)
self.container.register_property('filmNumber', 'f([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral)
self.container.register_property('edition', 'collector', 'collector-edition', 'edition-collector', canonical_form='Collector Edition')
self.container.register_property('edition', 'special-edition', 'edition-special', canonical_form='Special Edition')
self.container.register_property('edition', 'criterion', 'criterion-edition', 'edition-criterion', canonical_form='Criterion Edition')
self.container.register_property('edition', 'deluxe', 'cdeluxe-edition', 'edition-deluxe', canonical_form='Deluxe Edition')
self.container.register_property('edition', 'director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', canonical_form='Director\'s cut')
def supported_properties(self):
return self.container.get_supported_properties()
def guess_video_rexps(self, string, node=None, options=None):
found = self.container.find_properties(string, node)
return self.container.as_guess(found, string)
def process(self, mtree, options=None):
GuessFinder(self.guess_video_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,69 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from guessit.patterns import sep
from guessit.containers import PropertiesContainer
from guessit.patterns.numeral import numeral, parse_numeral
from guessit.date import valid_year
class GuessWeakEpisodesRexps(Transformer):
def __init__(self):
Transformer.__init__(self, 15)
self.properties = PropertiesContainer(enhance=False, canonical_from_pattern=False)
def _formater(episodeNumber):
epnum = parse_numeral(episodeNumber)
if not valid_year(epnum):
if epnum > 100:
season, epnum = epnum // 100, epnum % 100
# episodes which have a season > 50 are most likely errors
# (Simpson is at 25!)
if season > 50:
return None
return {'season': season, 'episodeNumber': epnum}
else:
return epnum
self.properties.register_property(['episodeNumber', 'season'], '[0-9]{2,4}', confidence=0.6, formatter=_formater)
self.properties.register_property('episodeNumber', '(?:episode)' + sep + '(' + numeral + ')[^0-9]', confidence=0.3)
def supported_properties(self):
return self.properties.get_supported_properties()
def guess_weak_episodes_rexps(self, string, node=None, options=None):
if node and 'episodeNumber' in node.root.info:
return None
properties = self.properties.find_properties(string, node)
guess = self.properties.as_guess(properties, string)
return guess
def should_process(self, mtree, options=None):
return mtree.guess.get('type', '').startswith('episode')
def process(self, mtree, options=None):
GuessFinder(self.guess_weak_episodes_rexps, 0.6, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,66 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, \
unicode_literals
from guessit.patterns import build_or_pattern
from guessit.containers import PropertiesContainer
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from pkg_resources import resource_stream # @UnresolvedImport
class GuessWebsite(Transformer):
def __init__(self):
Transformer.__init__(self, 45)
self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False)
tlds = []
f = resource_stream('guessit', 'tlds-alpha-by-domain.txt')
f.readline()
next(f)
for tld in f:
tld = tld.strip()
if b'--' in tld:
continue
tlds.append(tld.decode("utf-8"))
f.close()
tlds_pattern = build_or_pattern(tlds) # All registered domain extension
safe_tlds_pattern = build_or_pattern(['com', 'org', 'net']) # For sure a website extension
safe_subdomains_pattern = build_or_pattern(['www']) # For sure a website subdomain
safe_prefix_tlds_pattern = build_or_pattern(['co', 'com', 'org', 'net']) # Those words before a tlds are sure
self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)+' + r'(?:[a-z-]+\.)+' + r'(?:' + tlds_pattern + r')+')
self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_tlds_pattern + r')+')
self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_prefix_tlds_pattern + r'\.)+' + r'(?:' + tlds_pattern + r')+')
def supported_properties(self):
return self.container.get_supported_properties()
def guess_website(self, string, node=None, options=None):
found = self.container.find_properties(string, node, 'website')
return self.container.as_guess(found, string)
def process(self, mtree, options=None):
GuessFinder(self.guess_website, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.matcher import GuessFinder
from guessit.date import search_year
class GuessYear(Transformer):
def __init__(self):
Transformer.__init__(self, -160)
def supported_properties(self):
return ['year']
def guess_year(self, string, node=None, options=None):
year, span = search_year(string)
if year:
return {'year': year}, span
else:
return None, None
def second_pass_options(self, mtree, options=None):
year_nodes = mtree.leaves_containing('year')
if len(year_nodes) > 1:
return {'skip_nodes': year_nodes[:len(year_nodes) - 1]}
return None
def process(self, mtree, options=None):
GuessFinder(self.guess_year, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves())

View file

@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.textutils import find_first_level_groups
from guessit.patterns import group_delimiters
from functools import reduce
class SplitExplicitGroups(Transformer):
def __init__(self):
Transformer.__init__(self, 245)
def process(self, mtree, options=None):
"""split each of those into explicit groups (separated by parentheses or square brackets)
:return: return the string split into explicit groups, that is, those either
between parenthese, square brackets or curly braces, and those separated
by a dash."""
for c in mtree.children:
groups = find_first_level_groups(c.value, group_delimiters[0])
for delimiters in group_delimiters:
flatten = lambda l, x: l + find_first_level_groups(x, delimiters)
groups = reduce(flatten, groups, [])
# do not do this at this moment, it is not strong enough and can break other
# patterns, such as dates, etc...
# groups = functools.reduce(lambda l, x: l + x.split('-'), groups, [])
c.split_on_components(groups)

View file

@ -0,0 +1,47 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit.patterns import sep
import re
class SplitOnDash(Transformer):
def __init__(self):
Transformer.__init__(self, 190)
def process(self, mtree, options=None):
"""split into '-' separated subgroups (with required separator chars
around the dash)
"""
for node in mtree.unidentified_leaves():
indices = []
pattern = re.compile(sep + '-' + sep)
match = pattern.search(node.value)
while match:
span = match.span()
indices.extend([span[0], span[1]])
match = pattern.search(node.value, span[1])
if indices:
node.partition(indices)

View file

@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit.plugins.transformers import Transformer
from guessit import fileutils
from os.path import splitext
class SplitPathComponents(Transformer):
def __init__(self):
Transformer.__init__(self, 255)
def process(self, mtree, options=None):
"""first split our path into dirs + basename + ext
:return: the filename split into [ dir*, basename, ext ]
"""
if not options.get('name_only'):
components = fileutils.split_path(mtree.value)
basename = components.pop(-1)
components += list(splitext(basename))
components[-1] = components[-1][1:] # remove the '.' from the extension
mtree.split_on_components(components)
else:
mtree.split_on_components([mtree.value, ''])

36
lib/stevedore/__init__.py Normal file
View file

@ -0,0 +1,36 @@
# flake8: noqa
__all__ = [
'ExtensionManager',
'EnabledExtensionManager',
'NamedExtensionManager',
'HookManager',
'DriverManager',
]
from .extension import ExtensionManager
from .enabled import EnabledExtensionManager
from .named import NamedExtensionManager
from .hook import HookManager
from .driver import DriverManager
import logging
# Configure a NullHandler for our log messages in case
# the app we're used from does not set up logging.
LOG = logging.getLogger('stevedore')
if hasattr(logging, 'NullHandler'):
LOG.addHandler(logging.NullHandler())
else:
class NullHandler(logging.Handler):
def handle(self, record):
pass
def emit(self, record):
pass
def createLock(self):
self.lock = None
LOG.addHandler(NullHandler())

216
lib/stevedore/dispatch.py Normal file
View file

@ -0,0 +1,216 @@
import logging
from .enabled import EnabledExtensionManager
LOG = logging.getLogger(__name__)
class DispatchExtensionManager(EnabledExtensionManager):
"""Loads all plugins and filters on execution.
This is useful for long-running processes that need to pass
different inputs to different extensions.
:param namespace: The namespace for the entry points.
:type namespace: str
:param check_func: Function to determine which extensions to load.
:type check_func: callable
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged and
then ignored
:type invoke_on_load: bool
"""
def map(self, filter_func, func, *args, **kwds):
"""Iterate over the extensions invoking func() for any where
filter_func() returns True.
The signature of filter_func() should be::
def filter_func(ext, *args, **kwds):
pass
The first argument to filter_func(), 'ext', is the
:class:`~stevedore.extension.Extension`
instance. filter_func() should return True if the extension
should be invoked for the input arguments.
The signature for func() should be::
def func(ext, *args, **kwds):
pass
The first argument to func(), 'ext', is the
:class:`~stevedore.extension.Extension` instance.
Exceptions raised from within func() are propagated up and
processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
:param filter_func: Callable to test each extension.
:param func: Callable to invoke for each extension.
:param args: Variable arguments to pass to func()
:param kwds: Keyword arguments to pass to func()
:returns: List of values returned from func()
"""
if not self.extensions:
# FIXME: Use a more specific exception class here.
raise RuntimeError('No %s extensions found' % self.namespace)
response = []
for e in self.extensions:
if filter_func(e, *args, **kwds):
self._invoke_one_plugin(response.append, func, e, args, kwds)
return response
def map_method(self, filter_func, method_name, *args, **kwds):
"""Iterate over the extensions invoking each one's object method called
`method_name` for any where filter_func() returns True.
This is equivalent of using :meth:`map` with func set to
`lambda x: x.obj.method_name()`
while being more convenient.
Exceptions raised from within the called method are propagated up
and processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
.. versionadded:: 0.12
:param filter_func: Callable to test each extension.
:param method_name: The extension method name to call
for each extension.
:param args: Variable arguments to pass to method
:param kwds: Keyword arguments to pass to method
:returns: List of values returned from methods
"""
return self.map(filter_func, self._call_extension_method,
method_name, *args, **kwds)
class NameDispatchExtensionManager(DispatchExtensionManager):
"""Loads all plugins and filters on execution.
This is useful for long-running processes that need to pass
different inputs to different extensions and can predict the name
of the extensions before calling them.
The check_func argument should return a boolean, with ``True``
indicating that the extension should be loaded and made available
and ``False`` indicating that the extension should be ignored.
:param namespace: The namespace for the entry points.
:type namespace: str
:param check_func: Function to determine which extensions to load.
:type check_func: callable
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged and
then ignored
:type invoke_on_load: bool
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace, check_func, invoke_on_load=False,
invoke_args=(), invoke_kwds={},
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
super(NameDispatchExtensionManager, self).__init__(
namespace=namespace,
check_func=check_func,
invoke_on_load=invoke_on_load,
invoke_args=invoke_args,
invoke_kwds=invoke_kwds,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements,
)
def _init_plugins(self, extensions):
super(NameDispatchExtensionManager, self)._init_plugins(extensions)
self.by_name = dict((e.name, e) for e in self.extensions)
def map(self, names, func, *args, **kwds):
"""Iterate over the extensions invoking func() for any where
the name is in the given list of names.
The signature for func() should be::
def func(ext, *args, **kwds):
pass
The first argument to func(), 'ext', is the
:class:`~stevedore.extension.Extension` instance.
Exceptions raised from within func() are propagated up and
processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
:param names: List or set of name(s) of extension(s) to invoke.
:param func: Callable to invoke for each extension.
:param args: Variable arguments to pass to func()
:param kwds: Keyword arguments to pass to func()
:returns: List of values returned from func()
"""
response = []
for name in names:
try:
e = self.by_name[name]
except KeyError:
LOG.debug('Missing extension %r being ignored', name)
else:
self._invoke_one_plugin(response.append, func, e, args, kwds)
return response
def map_method(self, names, method_name, *args, **kwds):
"""Iterate over the extensions invoking each one's object method called
`method_name` for any where the name is in the given list of names.
This is equivalent of using :meth:`map` with func set to
`lambda x: x.obj.method_name()`
while being more convenient.
Exceptions raised from within the called method are propagated up
and processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
.. versionadded:: 0.12
:param names: List or set of name(s) of extension(s) to invoke.
:param method_name: The extension method name
to call for each extension.
:param args: Variable arguments to pass to method
:param kwds: Keyword arguments to pass to method
:returns: List of values returned from methods
"""
return self.map(names, self._call_extension_method,
method_name, *args, **kwds)

126
lib/stevedore/driver.py Normal file
View file

@ -0,0 +1,126 @@
from .named import NamedExtensionManager
class DriverManager(NamedExtensionManager):
"""Load a single plugin with a given name from the namespace.
:param namespace: The namespace for the entry points.
:type namespace: str
:param name: The name of the driver to load.
:type name: str
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace, name,
invoke_on_load=False, invoke_args=(), invoke_kwds={},
on_load_failure_callback=None,
verify_requirements=False):
super(DriverManager, self).__init__(
namespace=namespace,
names=[name],
invoke_on_load=invoke_on_load,
invoke_args=invoke_args,
invoke_kwds=invoke_kwds,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements,
)
@classmethod
def make_test_instance(cls, extension, namespace='TESTING',
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
"""Construct a test DriverManager
Test instances are passed a list of extensions to work from rather
than loading them from entry points.
:param extension: Pre-configured Extension instance
:type extension: :class:`~stevedore.extension.Extension`
:param namespace: The namespace for the manager; used only for
identification since the extensions are passed in.
:type namespace: str
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged
and then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will
be called when a entrypoint can not be loaded. The
arguments that will be provided when this is called (when
an entrypoint fails to load) are (manager, entrypoint,
exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
:return: The manager instance, initialized for testing
"""
o = super(DriverManager, cls).make_test_instance(
[extension], namespace=namespace,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements)
return o
def _init_plugins(self, extensions):
super(DriverManager, self)._init_plugins(extensions)
if not self.extensions:
name = self._names[0]
raise RuntimeError('No %r driver found, looking for %r' %
(self.namespace, name))
if len(self.extensions) > 1:
discovered_drivers = ','.join(e.entry_point_target
for e in self.extensions)
raise RuntimeError('Multiple %r drivers found: %s' %
(self.namespace, discovered_drivers))
def __call__(self, func, *args, **kwds):
"""Invokes func() for the single loaded extension.
The signature for func() should be::
def func(ext, *args, **kwds):
pass
The first argument to func(), 'ext', is the
:class:`~stevedore.extension.Extension` instance.
Exceptions raised from within func() are logged and ignored.
:param func: Callable to invoke for each extension.
:param args: Variable arguments to pass to func()
:param kwds: Keyword arguments to pass to func()
:returns: List of values returned from func()
"""
results = self.map(func, *args, **kwds)
if results:
return results[0]
@property
def driver(self):
"""Returns the driver being used by this manager.
"""
ext = self.extensions[0]
return ext.obj if ext.obj else ext.plugin

71
lib/stevedore/enabled.py Normal file
View file

@ -0,0 +1,71 @@
import logging
from .extension import ExtensionManager
LOG = logging.getLogger(__name__)
class EnabledExtensionManager(ExtensionManager):
"""Loads only plugins that pass a check function.
The check_func argument should return a boolean, with ``True``
indicating that the extension should be loaded and made available
and ``False`` indicating that the extension should be ignored.
:param namespace: The namespace for the entry points.
:type namespace: str
:param check_func: Function to determine which extensions to load.
:type check_func: callable
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged and
then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace, check_func, invoke_on_load=False,
invoke_args=(), invoke_kwds={},
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False,):
self.check_func = check_func
super(EnabledExtensionManager, self).__init__(
namespace,
invoke_on_load=invoke_on_load,
invoke_args=invoke_args,
invoke_kwds=invoke_kwds,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements,
)
def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements):
ext = super(EnabledExtensionManager, self)._load_one_plugin(
ep, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements,
)
if ext and not self.check_func(ext):
LOG.debug('ignoring extension %r', ep.name)
return None
return ext

276
lib/stevedore/extension.py Normal file
View file

@ -0,0 +1,276 @@
"""ExtensionManager
"""
import pkg_resources
import logging
LOG = logging.getLogger(__name__)
class Extension(object):
"""Book-keeping object for tracking extensions.
The arguments passed to the constructor are saved as attributes of
the instance using the same names, and can be accessed by the
callables passed to :meth:`map` or when iterating over an
:class:`ExtensionManager` directly.
:param name: The entry point name.
:type name: str
:param entry_point: The EntryPoint instance returned by
:mod:`pkg_resources`.
:type entry_point: EntryPoint
:param plugin: The value returned by entry_point.load()
:param obj: The object returned by ``plugin(*args, **kwds)`` if the
manager invoked the extension on load.
"""
def __init__(self, name, entry_point, plugin, obj):
self.name = name
self.entry_point = entry_point
self.plugin = plugin
self.obj = obj
@property
def entry_point_target(self):
"""The module and attribute referenced by this extension's entry_point.
:return: A string representation of the target of the entry point in
'dotted.module:object' format.
"""
return '%s:%s' % (self.entry_point.module_name,
self.entry_point.attrs[0])
class ExtensionManager(object):
"""Base class for all of the other managers.
:param namespace: The namespace for the entry points.
:type namespace: str
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged and
then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace,
invoke_on_load=False,
invoke_args=(),
invoke_kwds={},
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
self._init_attributes(
namespace,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
extensions = self._load_plugins(invoke_on_load,
invoke_args,
invoke_kwds,
verify_requirements)
self._init_plugins(extensions)
@classmethod
def make_test_instance(cls, extensions, namespace='TESTING',
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
"""Construct a test ExtensionManager
Test instances are passed a list of extensions to work from rather
than loading them from entry points.
:param extensions: Pre-configured Extension instances to use
:type extensions: list of :class:`~stevedore.extension.Extension`
:param namespace: The namespace for the manager; used only for
identification since the extensions are passed in.
:type namespace: str
:param propagate_map_exceptions: When calling map, controls whether
exceptions are propagated up through the map call or whether they
are logged and then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will
be called when a entrypoint can not be loaded. The
arguments that will be provided when this is called (when
an entrypoint fails to load) are (manager, entrypoint,
exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
:return: The manager instance, initialized for testing
"""
o = cls.__new__(cls)
o._init_attributes(namespace,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
o._init_plugins(extensions)
return o
def _init_attributes(self, namespace, propagate_map_exceptions=False,
on_load_failure_callback=None):
self.namespace = namespace
self.propagate_map_exceptions = propagate_map_exceptions
self._on_load_failure_callback = on_load_failure_callback
def _init_plugins(self, extensions):
self.extensions = extensions
self._extensions_by_name = None
ENTRY_POINT_CACHE = {}
def _find_entry_points(self, namespace):
if namespace not in self.ENTRY_POINT_CACHE:
eps = list(pkg_resources.iter_entry_points(namespace))
self.ENTRY_POINT_CACHE[namespace] = eps
return self.ENTRY_POINT_CACHE[namespace]
def _load_plugins(self, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements):
extensions = []
for ep in self._find_entry_points(self.namespace):
LOG.debug('found extension %r', ep)
try:
ext = self._load_one_plugin(ep,
invoke_on_load,
invoke_args,
invoke_kwds,
verify_requirements,
)
if ext:
extensions.append(ext)
except (KeyboardInterrupt, AssertionError):
raise
except Exception as err:
if self._on_load_failure_callback is not None:
self._on_load_failure_callback(self, ep, err)
else:
LOG.error('Could not load %r: %s', ep.name, err)
LOG.exception(err)
return extensions
def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements):
plugin = ep.load(require=verify_requirements)
if invoke_on_load:
obj = plugin(*invoke_args, **invoke_kwds)
else:
obj = None
return Extension(ep.name, ep, plugin, obj)
def names(self):
"Returns the names of the discovered extensions"
# We want to return the names of the extensions in the order
# they would be used by map(), since some subclasses change
# that order.
return [e.name for e in self.extensions]
def map(self, func, *args, **kwds):
"""Iterate over the extensions invoking func() for each.
The signature for func() should be::
def func(ext, *args, **kwds):
pass
The first argument to func(), 'ext', is the
:class:`~stevedore.extension.Extension` instance.
Exceptions raised from within func() are propagated up and
processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
:param func: Callable to invoke for each extension.
:param args: Variable arguments to pass to func()
:param kwds: Keyword arguments to pass to func()
:returns: List of values returned from func()
"""
if not self.extensions:
# FIXME: Use a more specific exception class here.
raise RuntimeError('No %s extensions found' % self.namespace)
response = []
for e in self.extensions:
self._invoke_one_plugin(response.append, func, e, args, kwds)
return response
@staticmethod
def _call_extension_method(extension, method_name, *args, **kwds):
return getattr(extension.obj, method_name)(*args, **kwds)
def map_method(self, method_name, *args, **kwds):
"""Iterate over the extensions invoking a method by name.
This is equivalent of using :meth:`map` with func set to
`lambda x: x.obj.method_name()`
while being more convenient.
Exceptions raised from within the called method are propagated up
and processing stopped if self.propagate_map_exceptions is True,
otherwise they are logged and ignored.
.. versionadded:: 0.12
:param method_name: The extension method name
to call for each extension.
:param args: Variable arguments to pass to method
:param kwds: Keyword arguments to pass to method
:returns: List of values returned from methods
"""
return self.map(self._call_extension_method,
method_name, *args, **kwds)
def _invoke_one_plugin(self, response_callback, func, e, args, kwds):
try:
response_callback(func(e, *args, **kwds))
except Exception as err:
if self.propagate_map_exceptions:
raise
else:
LOG.error('error calling %r: %s', e.name, err)
LOG.exception(err)
def __iter__(self):
"""Produce iterator for the manager.
Iterating over an ExtensionManager produces the :class:`Extension`
instances in the order they would be invoked.
"""
return iter(self.extensions)
def __getitem__(self, name):
"""Return the named extension.
Accessing an ExtensionManager as a dictionary (``em['name']``)
produces the :class:`Extension` instance with the
specified name.
"""
if self._extensions_by_name is None:
d = {}
for e in self.extensions:
d[e.name] = e
self._extensions_by_name = d
return self._extensions_by_name[name]

64
lib/stevedore/hook.py Normal file
View file

@ -0,0 +1,64 @@
from .named import NamedExtensionManager
class HookManager(NamedExtensionManager):
"""Coordinate execution of multiple extensions using a common name.
:param namespace: The namespace for the entry points.
:type namespace: str
:param name: The name of the hooks to load.
:type name: str
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace, name,
invoke_on_load=False, invoke_args=(), invoke_kwds={},
on_load_failure_callback=None,
verify_requirements=False):
super(HookManager, self).__init__(
namespace,
[name],
invoke_on_load=invoke_on_load,
invoke_args=invoke_args,
invoke_kwds=invoke_kwds,
on_load_failure_callback=on_load_failure_callback,
verify_requirements=verify_requirements,
)
def _init_attributes(self, namespace, names, name_order=False,
propagate_map_exceptions=False,
on_load_failure_callback=None):
super(HookManager, self)._init_attributes(
namespace, names,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
self._name = names[0]
def __getitem__(self, name):
"""Return the named extensions.
Accessing a HookManager as a dictionary (``em['name']``)
produces a list of the :class:`Extension` instance(s) with the
specified name, in the order they would be invoked by map().
"""
if name != self._name:
raise KeyError(name)
return self.extensions

124
lib/stevedore/named.py Normal file
View file

@ -0,0 +1,124 @@
from .extension import ExtensionManager
class NamedExtensionManager(ExtensionManager):
"""Loads only the named extensions.
This is useful for explicitly enabling extensions in a
configuration file, for example.
:param namespace: The namespace for the entry points.
:type namespace: str
:param names: The names of the extensions to load.
:type names: list(str)
:param invoke_on_load: Boolean controlling whether to invoke the
object returned by the entry point after the driver is loaded.
:type invoke_on_load: bool
:param invoke_args: Positional arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_args: tuple
:param invoke_kwds: Named arguments to pass when invoking
the object returned by the entry point. Only used if invoke_on_load
is True.
:type invoke_kwds: dict
:param name_order: If true, sort the loaded extensions to match the
order used in ``names``.
:type name_order: bool
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged and
then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will be called when
a entrypoint can not be loaded. The arguments that will be provided
when this is called (when an entrypoint fails to load) are
(manager, entrypoint, exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
"""
def __init__(self, namespace, names,
invoke_on_load=False, invoke_args=(), invoke_kwds={},
name_order=False, propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
self._init_attributes(
namespace, names, name_order=name_order,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
extensions = self._load_plugins(invoke_on_load,
invoke_args,
invoke_kwds,
verify_requirements)
self._init_plugins(extensions)
@classmethod
def make_test_instance(cls, extensions, namespace='TESTING',
propagate_map_exceptions=False,
on_load_failure_callback=None,
verify_requirements=False):
"""Construct a test NamedExtensionManager
Test instances are passed a list of extensions to use rather than
loading them from entry points.
:param extensions: Pre-configured Extension instances
:type extensions: list of :class:`~stevedore.extension.Extension`
:param namespace: The namespace for the manager; used only for
identification since the extensions are passed in.
:type namespace: str
:param propagate_map_exceptions: Boolean controlling whether exceptions
are propagated up through the map call or whether they are logged
and then ignored
:type propagate_map_exceptions: bool
:param on_load_failure_callback: Callback function that will
be called when a entrypoint can not be loaded. The
arguments that will be provided when this is called (when
an entrypoint fails to load) are (manager, entrypoint,
exception)
:type on_load_failure_callback: function
:param verify_requirements: Use setuptools to enforce the
dependencies of the plugin(s) being loaded. Defaults to False.
:type verify_requirements: bool
:return: The manager instance, initialized for testing
"""
o = cls.__new__(cls)
names = [e.name for e in extensions]
o._init_attributes(namespace, names,
propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
o._init_plugins(extensions)
return o
def _init_attributes(self, namespace, names, name_order=False,
propagate_map_exceptions=False,
on_load_failure_callback=None):
super(NamedExtensionManager, self)._init_attributes(
namespace, propagate_map_exceptions=propagate_map_exceptions,
on_load_failure_callback=on_load_failure_callback)
self._names = names
self._name_order = name_order
def _init_plugins(self, extensions):
super(NamedExtensionManager, self)._init_plugins(extensions)
if self._name_order:
self.extensions = [self[n] for n in self._names]
def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements):
# Check the name before going any further to prevent
# undesirable code from being loaded at all if we are not
# going to use it.
if ep.name not in self._names:
return None
return super(NamedExtensionManager, self)._load_one_plugin(
ep, invoke_on_load, invoke_args, invoke_kwds,
verify_requirements,
)

View file

@ -3,6 +3,22 @@ import os
import subprocess
import sys
import platform
# init preliminaries
SYS_ARGV = sys.argv[1:]
APP_FILENAME = sys.argv[0]
APP_NAME = os.path.basename(APP_FILENAME)
PROGRAM_DIR = os.path.dirname(os.path.normpath(os.path.abspath(os.path.join(__file__, os.pardir))))
LOG_DIR = os.path.join(PROGRAM_DIR, 'logs')
LOG_FILE = os.path.join(LOG_DIR, 'postprocess.log')
CONFIG_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMedia.cfg')
CONFIG_SPEC_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMedia.cfg.spec')
CONFIG_MOVIE_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMovie.cfg')
CONFIG_TV_FILE = os.path.join(PROGRAM_DIR, 'autoProcessTv.cfg')
# add our custom libs to the system path
sys.path.insert(0, os.path.abspath(os.path.join(PROGRAM_DIR, 'lib')))
from nzbtomedia import logger, versionCheck
from nzbtomedia.nzbToMediaConfig import config
from nzbtomedia.nzbToMediaUtil import WakeUp, makeDir
@ -28,21 +44,9 @@ NZBGET_POSTPROCESS_SUCCESS = 93
NZBGET_POSTPROCESS_ERROR = 94
NZBGET_POSTPROCESS_NONE = 95
# config constants
CFG = None
CFG_LOGGING = None
APP_FILENAME = None
APP_NAME = None
PROGRAM_DIR = None
LOG_DIR = None
LOG_FILE = None
LOG_DEBUG = None
CONFIG_FILE = None
CONFIG_SPEC_FILE = None
CONFIG_MOVIE_FILE = None
CONFIG_TV_FILE = None
SYS_ENCODING = None
SYS_ARGV = None
AUTO_UPDATE = None
NZBTOMEDIA_VERSION = None
@ -119,8 +123,7 @@ __INITIALIZED__ = False
def initialize(section=None):
global NZBGET_POSTPROCESS_ERROR, NZBGET_POSTPROCESS_NONE, NZBGET_POSTPROCESS_PARCHECK, NZBGET_POSTPROCESS_SUCCESS, \
NZBTOMEDIA_TIMEOUT, FORKS, FORK_DEFAULT, FORK_FAILED_TORRENT, FORK_FAILED, SICKBEARD_TORRENT, SICKBEARD_FAILED, \
PROGRAM_DIR, CFG, CFG_LOGGING, CONFIG_FILE, CONFIG_MOVIE_FILE, CONFIG_SPEC_FILE, LOG_DIR, NZBTOMEDIA_BRANCH, \
CONFIG_TV_FILE, LOG_FILE, NZBTOMEDIA_VERSION, NEWEST_VERSION, NEWEST_VERSION_STRING, VERSION_NOTIFY, SYS_ARGV, \
NZBTOMEDIA_BRANCH, NZBTOMEDIA_VERSION, NEWEST_VERSION, NEWEST_VERSION_STRING, VERSION_NOTIFY, SYS_ARGV, CFG, \
SABNZB_NO_OF_ARGUMENTS, SABNZB_0717_NO_OF_ARGUMENTS, CATEGORIES, TORRENT_CLIENTAGENT, USELINK, OUTPUTDIRECTORY, NOFLATTEN, \
UTORRENTPWD, UTORRENTUSR, UTORRENTWEBUI, DELUGEHOST, DELUGEPORT, DELUGEUSR, DELUGEPWD, TRANSMISSIONHOST, TRANSMISSIONPORT, \
TRANSMISSIONPWD, TRANSMISSIONUSR, COMPRESSEDCONTAINER, MEDIACONTAINER, METACONTAINER, MINSAMPLESIZE, SAMPLEIDS, \
@ -134,21 +137,6 @@ def initialize(section=None):
if __INITIALIZED__:
return False
# init preliminaries
SYS_ARGV = sys.argv[1:]
APP_FILENAME = sys.argv[0]
APP_NAME = os.path.basename(APP_FILENAME)
PROGRAM_DIR = os.path.dirname(os.path.normpath(os.path.abspath(os.path.join(__file__, os.pardir))))
LOG_DIR = os.path.join(PROGRAM_DIR, 'logs')
LOG_FILE = os.path.join(LOG_DIR, 'postprocess.log')
CONFIG_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMedia.cfg')
CONFIG_SPEC_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMedia.cfg.spec')
CONFIG_MOVIE_FILE = os.path.join(PROGRAM_DIR, 'autoProcessMovie.cfg')
CONFIG_TV_FILE = os.path.join(PROGRAM_DIR, 'autoProcessTv.cfg')
# add our custom libs to the system path
sys.path.insert(0, os.path.abspath(os.path.join(PROGRAM_DIR, 'lib')))
try:
locale.setlocale(locale.LC_ALL, "")
SYS_ENCODING = locale.getpreferredencoding()
@ -219,6 +207,7 @@ def initialize(section=None):
# Set Current Version
logger.info('nzbToMedia Version:' + NZBTOMEDIA_VERSION + ' Branch:' + GIT_BRANCH + ' (' + platform.system() + ' ' + platform.release() + ')')
if int(CFG["WakeOnLan"]["wake"]) == 1:
WakeUp()
NZB_CLIENTAGENT = CFG["Nzb"]["clientAgent"] # sabnzbd

View file

@ -8,6 +8,7 @@ import time
import nzbtomedia
from lib import requests
from lib import guessit
from nzbtomedia.linktastic import linktastic
from nzbtomedia import logger
from nzbtomedia.synchronousdeluge.client import DelugeClient
@ -248,10 +249,6 @@ def TestCon(host, port):
def WakeUp():
wake = int(nzbtomedia.CFG["WakeOnLan"]["wake"])
if wake == 0: # just return if we don't need to wake anything.
return
logger.info(("Loading WakeOnLan config from %s" % (nzbtomedia.CONFIG_FILE)))
host = nzbtomedia.CFG["WakeOnLan"]["host"]
port = int(nzbtomedia.CFG["WakeOnLan"]["port"])
mac = nzbtomedia.CFG["WakeOnLan"]["mac"]
@ -646,10 +643,17 @@ def find_imdbid(dirName, nzbName):
return imdbid
logger.info('Searching IMDB for imdbID ...')
m = re.search("^(.+)(\d{4})\W", nzbName)
if m:
title = m.group(1)
year = m.group(2)
guess = guessit.guess_movie_info(nzbName)
if guess:
# Movie Title
title = None
if 'title' in guess:
title = guess['title']
# Movie Year
year = None
if 'year' in guess:
year = guess['year']
url = "http://www.omdbapi.com"