From 6fea9ddb4050fa565b6a565f7d131e98e722531f Mon Sep 17 00:00:00 2001 From: echel0n Date: Mon, 21 Apr 2014 23:07:04 -0700 Subject: [PATCH] Switched out guessit libs for the one CP uses, seems to have less depends --- lib/guessit/__init__.py | 209 +++--- lib/guessit/__main__.py | 241 +++---- lib/guessit/__version__.py | 20 - lib/guessit/containers.py | 615 ----------------- lib/guessit/country.py | 29 +- lib/guessit/date.py | 107 ++- lib/guessit/fileutils.py | 23 +- lib/guessit/guess.py | 247 ++----- lib/guessit/hash_ed2k.py | 10 +- lib/guessit/hash_mpc.py | 13 +- lib/guessit/language.py | 501 +++++++------- lib/guessit/matcher.py | 311 ++++----- lib/guessit/matchtree.py | 252 ++----- lib/guessit/options.py | 25 - lib/guessit/patterns.py | 250 +++++++ lib/guessit/patterns/__init__.py | 77 --- lib/guessit/patterns/extension.py | 32 - lib/guessit/patterns/numeral.py | 150 ----- lib/guessit/plugins/__init__.py | 21 - lib/guessit/plugins/transformers.py | 186 ------ lib/guessit/quality.py | 65 -- lib/guessit/slogging.py | 22 +- lib/guessit/test/__init__.py | 26 - lib/guessit/test/__main__.py | 40 -- lib/guessit/test/autodetect.yaml | 289 -------- lib/guessit/test/dummy.srt | 1 - lib/guessit/test/episodes.yaml | 569 ---------------- lib/guessit/test/guessittest.py | 168 ----- lib/guessit/test/movies.yaml | 626 ------------------ .../opensubtitles_languages_2012_05_09.txt | 473 ------------- lib/guessit/test/test_api.py | 54 -- lib/guessit/test/test_autodetect.py | 45 -- lib/guessit/test/test_autodetect_all.py | 46 -- lib/guessit/test/test_doctests.py | 45 -- lib/guessit/test/test_episode.py | 35 - lib/guessit/test/test_hashes.py | 46 -- lib/guessit/test/test_language.py | 138 ---- lib/guessit/test/test_main.py | 70 -- lib/guessit/test/test_matchtree.py | 93 --- lib/guessit/test/test_movie.py | 35 - lib/guessit/test/test_quality.py | 126 ---- lib/guessit/test/test_utils.py | 155 ----- lib/guessit/textutils.py | 196 +----- lib/guessit/tlds-alpha-by-domain.txt | 341 ---------- lib/guessit/transfo/__init__.py | 93 ++- lib/guessit/transfo/guess_bonus_features.py | 72 +- lib/guessit/transfo/guess_country.py | 63 +- lib/guessit/transfo/guess_date.py | 33 +- .../guess_episode_info_from_position.py | 224 +++---- lib/guessit/transfo/guess_episode_special.py | 62 -- lib/guessit/transfo/guess_episodes_rexps.py | 102 ++- lib/guessit/transfo/guess_filetype.py | 324 +++++---- lib/guessit/transfo/guess_idnumber.py | 90 +-- lib/guessit/transfo/guess_language.py | 170 +---- .../guess_movie_title_from_position.py | 265 ++++---- lib/guessit/transfo/guess_properties.py | 220 +----- lib/guessit/transfo/guess_release_group.py | 181 ++--- lib/guessit/transfo/guess_video_rexps.py | 58 +- .../transfo/guess_weak_episodes_rexps.py | 73 +- lib/guessit/transfo/guess_website.py | 57 +- lib/guessit/transfo/guess_year.py | 47 +- lib/guessit/transfo/post_process.py | 73 ++ lib/guessit/transfo/split_explicit_groups.py | 43 +- lib/guessit/transfo/split_on_dash.py | 39 +- lib/guessit/transfo/split_path_components.py | 35 +- 65 files changed, 2034 insertions(+), 7313 deletions(-) delete mode 100644 lib/guessit/__version__.py delete mode 100644 lib/guessit/containers.py delete mode 100644 lib/guessit/options.py create mode 100644 lib/guessit/patterns.py delete mode 100644 lib/guessit/patterns/__init__.py delete mode 100644 lib/guessit/patterns/extension.py delete mode 100644 lib/guessit/patterns/numeral.py delete mode 100644 lib/guessit/plugins/__init__.py delete mode 100644 lib/guessit/plugins/transformers.py delete mode 100644 lib/guessit/quality.py delete mode 100644 lib/guessit/test/__init__.py delete mode 100644 lib/guessit/test/__main__.py delete mode 100644 lib/guessit/test/autodetect.yaml delete mode 100644 lib/guessit/test/dummy.srt delete mode 100644 lib/guessit/test/episodes.yaml delete mode 100644 lib/guessit/test/guessittest.py delete mode 100644 lib/guessit/test/movies.yaml delete mode 100644 lib/guessit/test/opensubtitles_languages_2012_05_09.txt delete mode 100644 lib/guessit/test/test_api.py delete mode 100644 lib/guessit/test/test_autodetect.py delete mode 100644 lib/guessit/test/test_autodetect_all.py delete mode 100644 lib/guessit/test/test_doctests.py delete mode 100644 lib/guessit/test/test_episode.py delete mode 100644 lib/guessit/test/test_hashes.py delete mode 100644 lib/guessit/test/test_language.py delete mode 100644 lib/guessit/test/test_main.py delete mode 100644 lib/guessit/test/test_matchtree.py delete mode 100644 lib/guessit/test/test_movie.py delete mode 100644 lib/guessit/test/test_quality.py delete mode 100644 lib/guessit/test/test_utils.py delete mode 100644 lib/guessit/tlds-alpha-by-domain.txt delete mode 100644 lib/guessit/transfo/guess_episode_special.py create mode 100644 lib/guessit/transfo/post_process.py diff --git a/lib/guessit/__init__.py b/lib/guessit/__init__.py index 017f1223..e6cfa276 100644 --- a/lib/guessit/__init__.py +++ b/lib/guessit/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,11 +18,9 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -import pkg_resources -from .__version__ import __version__ +from __future__ import unicode_literals +__version__ = '0.6.2' __all__ = ['Guess', 'Language', 'guess_file_info', 'guess_video_info', 'guess_movie_info', 'guess_episode_info'] @@ -32,69 +30,58 @@ __all__ = ['Guess', 'Language', # it will then always be available # with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/ import sys -if sys.version_info[0] >= 3: # pragma: no cover - PY2, PY3 = False, True +if sys.version_info[0] >= 3: + PY3 = True unicode_text_type = str native_text_type = str base_text_type = str - def u(x): return str(x) - def s(x): return x - class UnicodeMixin(object): __str__ = lambda x: x.__unicode__() import binascii - def to_hex(x): return binascii.hexlify(x).decode('utf-8') -else: # pragma: no cover - PY2, PY3 = True, False - __all__ = [str(s) for s in __all__] # fix imports for python2 +else: + PY3 = False + __all__ = [ str(s) for s in __all__ ] # fix imports for python2 unicode_text_type = unicode native_text_type = str base_text_type = basestring - def u(x): if isinstance(x, str): return x.decode('utf-8') - if isinstance(x, list): - return [u(s) for s in x] return unicode(x) - def s(x): if isinstance(x, unicode): return x.encode('utf-8') if isinstance(x, list): - return [s(y) for y in x] + return [ s(y) for y in x ] if isinstance(x, tuple): return tuple(s(y) for y in x) if isinstance(x, dict): return dict((s(key), s(value)) for key, value in x.items()) return x - class UnicodeMixin(object): __str__ = lambda x: unicode(x).encode('utf-8') - def to_hex(x): return x.encode('hex') - range = xrange from guessit.guess import Guess, merge_all from guessit.language import Language from guessit.matcher import IterativeMatcher -from guessit.textutils import clean_string, is_camel, from_camel -import os.path +from guessit.textutils import clean_string import logging import json log = logging.getLogger(__name__) + class NullHandler(logging.Handler): def emit(self, record): pass @@ -104,74 +91,137 @@ h = NullHandler() log.addHandler(h) -def _guess_filename(filename, options=None, **kwargs): - mtree = _build_filename_mtree(filename, options=options, **kwargs) - _add_camel_properties(mtree, options=options) - return mtree.matched() +def _guess_filename(filename, filetype): + def find_nodes(tree, props): + """Yields all nodes containing any of the given props.""" + if isinstance(props, base_text_type): + props = [props] + for node in tree.nodes(): + if any(prop in node.guess for prop in props): + yield node + + def warning(title): + log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) + return m + + mtree = IterativeMatcher(filename, filetype=filetype) + + m = mtree.matched() + + second_pass_opts = [] + second_pass_transfo_opts = {} + + # if there are multiple possible years found, we assume the first one is + # part of the title, reparse the tree taking this into account + years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) + if len(years) >= 2: + second_pass_opts.append('skip_first_year') + + to_skip_language_nodes = [] + + title_nodes = set(n for n in find_nodes(mtree.match_tree, ['title', 'series'])) + title_spans = {} + for title_node in title_nodes: + title_spans[title_node.span[0]] = title_node + title_spans[title_node.span[1]] = title_node + + for lang_key in ('language', 'subtitleLanguage'): + langs = {} + lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key)) + + for lang_node in lang_nodes: + lang = lang_node.guess.get(lang_key, None) + if len(lang_node.value) > 3 and (lang_node.span[0] in title_spans.keys() or lang_node.span[1] in title_spans.keys()): + # Language is next or before title, and is not a language code. Add to skip for 2nd pass. + + # if filetype is subtitle and the language appears last, just before + # the extension, then it is likely a subtitle language + parts = clean_string(lang_node.root.value).split() + if m['type'] in ['moviesubtitle', 'episodesubtitle'] and (parts.index(lang_node.value) == len(parts) - 2): + continue + + to_skip_language_nodes.append(lang_node) + elif not lang in langs: + langs[lang] = lang_node + else: + # The same language was found. Keep the more confident one, and add others to skip for 2nd pass. + existing_lang_node = langs[lang] + to_skip = None + if existing_lang_node.guess.confidence('language') >= lang_node.guess.confidence('language'): + # lang_node is to remove + to_skip = lang_node + else: + # existing_lang_node is to remove + langs[lang] = lang_node + to_skip = existing_lang_node + to_skip_language_nodes.append(to_skip) -def _build_filename_mtree(filename, options=None, **kwargs): - mtree = IterativeMatcher(filename, options=options, **kwargs) - second_pass_options = mtree.second_pass_options - if second_pass_options: - log.info("Running 2nd pass") - merged_options = dict(options) - merged_options.update(second_pass_options) - mtree = IterativeMatcher(filename, options=merged_options, **kwargs) - return mtree + if to_skip_language_nodes: + second_pass_transfo_opts['guess_language'] = ( + ((), { 'skip': [ { 'node_idx': node.parent.node_idx, + 'span': node.span } + for node in to_skip_language_nodes ] })) + + if second_pass_opts or second_pass_transfo_opts: + # 2nd pass is needed + log.info("Running 2nd pass with options: %s" % second_pass_opts) + log.info("Transfo options: %s" % second_pass_transfo_opts) + mtree = IterativeMatcher(filename, filetype=filetype, + opts=second_pass_opts, + transfo_opts=second_pass_transfo_opts) + + m = mtree.matched() + + if 'language' not in m and 'subtitleLanguage' not in m or 'title' not in m: + return m + + # if we found some language, make sure we didn't cut a title or sth... + mtree2 = IterativeMatcher(filename, filetype=filetype, + opts=['nolanguage', 'nocountry']) + m2 = mtree2.matched() + + if m.get('title') != m2.get('title'): + title = next(find_nodes(mtree.match_tree, 'title')) + title2 = next(find_nodes(mtree2.match_tree, 'title')) + + # if a node is in an explicit group, then the correct title is probably + # the other one + if title.root.node_at(title.node_idx[:2]).is_explicit(): + return m2 + elif title2.root.node_at(title2.node_idx[:2]).is_explicit(): + return m + + return m -def _add_camel_properties(mtree, options=None, **kwargs): - prop = 'title' if mtree.matched().get('type') != 'episode' else 'series' - value = mtree.matched().get(prop) - _guess_camel_string(mtree, value, options=options, skip_title=False, **kwargs) - - for leaf in mtree.match_tree.unidentified_leaves(): - value = leaf.value - _guess_camel_string(mtree, value, options=options, skip_title=True, **kwargs) - - -def _guess_camel_string(mtree, string, options=None, skip_title=False, **kwargs): - if string and is_camel(string): - log.info('"%s" is camel cased. Try to detect more properties.' % (string,)) - uncameled_value = from_camel(string) - camel_tree = _build_filename_mtree(uncameled_value, options=options, name_only=True, skip_title=skip_title, **kwargs) - if len(camel_tree.matched()) > 0: - # Title has changed. - mtree.matched().update(camel_tree.matched()) - return True - return False - - -def guess_file_info(filename, info=None, options=None, **kwargs): +def guess_file_info(filename, filetype, info=None): """info can contain the names of the various plugins, such as 'filename' to detect filename info, or 'hash_md5' to get the md5 hash of the file. - >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') - >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1']) - >>> g['hash_md5'], g['hash_sha1'] - ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c') + >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1']) + {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'} """ - info = info or 'filename' - options = options or {} - result = [] hashers = [] # Force unicode as soon as possible filename = u(filename) + if info is None: + info = ['filename'] + if isinstance(info, base_text_type): info = [info] for infotype in info: if infotype == 'filename': - result.append(_guess_filename(filename, options, **kwargs)) + result.append(_guess_filename(filename, filetype)) elif infotype == 'hash_mpc': from guessit.hash_mpc import hash_file try: - result.append(Guess({infotype: hash_file(filename)}, + result.append(Guess({'hash_mpc': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute MPC-style hash because: %s' % e) @@ -179,7 +229,7 @@ def guess_file_info(filename, info=None, options=None, **kwargs): elif infotype == 'hash_ed2k': from guessit.hash_ed2k import hash_file try: - result.append(Guess({infotype: hash_file(filename)}, + result.append(Guess({'hash_ed2k': hash_file(filename)}, confidence=1.0)) except Exception as e: log.warning('Could not compute ed2k hash because: %s' % e) @@ -217,16 +267,23 @@ def guess_file_info(filename, info=None, options=None, **kwargs): result = merge_all(result) + # last minute adjustments + + # if country is in the guessed properties, make it part of the filename + if 'series' in result and 'country' in result: + result['series'] += ' (%s)' % result['country'].alpha2.upper() + + return result -def guess_video_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='video', **kwargs) +def guess_video_info(filename, info=None): + return guess_file_info(filename, 'autodetect', info) -def guess_movie_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='movie', **kwargs) +def guess_movie_info(filename, info=None): + return guess_file_info(filename, 'movie', info) -def guess_episode_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='episode', **kwargs) +def guess_episode_info(filename, info=None): + return guess_file_info(filename, 'episode', info) diff --git a/lib/guessit/__main__.py b/lib/guessit/__main__.py index a7b286a6..ccfa3af6 100644 --- a/lib/guessit/__main__.py +++ b/lib/guessit/__main__.py @@ -2,8 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -19,199 +18,109 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from __future__ import print_function +from guessit import u +from guessit import slogging, guess_file_info +from optparse import OptionParser import logging +import sys import os - -from guessit import PY2, u, guess_file_info -from guessit.options import option_parser +import locale -def guess_file(filename, info='filename', options=None, **kwargs): - options = options or {} +def detect_filename(filename, filetype, info=['filename'], advanced = False): filename = u(filename) print('For:', filename) - guess = guess_file_info(filename, info, options, **kwargs) - if options.get('yaml'): - try: - import yaml - for k, v in guess.items(): - if isinstance(v, list) and len(v) == 1: - guess[k] = v[0] - ystr = yaml.safe_dump({filename: dict(guess)}, default_flow_style=False) - i = 0 - for yline in ystr.splitlines(): - if i == 0: - print("? " + yline[:-1]) - elif i == 1: - print(":" + yline[1:]) - else: - print(yline) - i = i + 1 - return - except ImportError: # pragma: no cover - print('PyYAML not found. Using default output.') - print('GuessIt found:', guess.nice_string(options.get('advanced'))) + print('GuessIt found:', guess_file_info(filename, filetype, info).nice_string(advanced)) -def _supported_properties(): - from guessit.plugins import transformers - - all_properties = {} - transformers_properties = [] - for transformer in transformers.all_transformers(): - supported_properties = transformer.supported_properties() - transformers_properties.append((transformer, supported_properties)) - - if isinstance(supported_properties, dict): - for property_name, possible_values in supported_properties.items(): - current_possible_values = all_properties.get(property_name) - if current_possible_values is None: - current_possible_values = [] - all_properties[property_name] = current_possible_values - if possible_values: - current_possible_values.extend(possible_values) - else: - for property_name in supported_properties: - current_possible_values = all_properties.get(property_name) - if current_possible_values is None: - current_possible_values = [] - all_properties[property_name] = current_possible_values - - return (all_properties, transformers_properties) - - -def display_transformers(): - print('GuessIt transformers:') - _, transformers_properties = _supported_properties() - for transformer, _ in transformers_properties: - print('[@] %s (%s)' % (transformer.name, transformer.priority)) - - -def display_properties(values, transformers): - print('GuessIt properties:') - all_properties, transformers_properties = _supported_properties() - if transformers: - for transformer, properties_list in transformers_properties: - print('[@] %s (%s)' % (transformer.name, transformer.priority)) - for property_name in properties_list: - property_values = all_properties.get(property_name) - print(' [+] %s' % (property_name,)) - if property_values and values: - _display_property_values(property_name, indent=4) - else: - properties_list = [] - properties_list.extend(all_properties.keys()) - properties_list.sort() - for property_name in properties_list: - property_values = all_properties.get(property_name) - print(' [+] %s' % (property_name,)) - if property_values and values: - _display_property_values(property_name, indent=4) - - -def _display_property_values(property_name, indent=2): - all_properties, _ = _supported_properties() - property_values = all_properties.get(property_name) - for property_value in property_values: - print(indent * ' ' + '[!] %s' % (property_value,)) - - -def run_demo(episodes=True, movies=True, options=None): +def run_demo(episodes=True, movies=True, advanced=False): # NOTE: tests should not be added here but rather in the tests/ folder # this is just intended as a quick example if episodes: - testeps = ['Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi', - 'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi', - 'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi', - 'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi', - 'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi', - 'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg', - 'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi', - 'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi', - 'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi' - ] + testeps = [ 'Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi', + 'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi', + 'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi', + 'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi', + 'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi', + 'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg', + 'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi', + 'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi', + 'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi' + ] for f in testeps: - print('-' * 80) - guess_file(f, options=options, type='episode') + print('-'*80) + detect_filename(f, filetype='episode', advanced=advanced) + if movies: - testmovies = ['Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv', - 'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi', - 'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi', - 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv', - 'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv', - 'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten - '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten - 'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi', - 'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt', - 'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv', - 'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv', - 'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi', - 'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi', - 'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi', - 'Movies/Juno (2007)/Juno KLAXXON.avi', - 'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv', - 'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt', - 'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi', - 'testsmewt_bugs/movies/Baraka_Edition_Collector.avi' - ] + testmovies = [ 'Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv', + 'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi', + 'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi', + 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv', + 'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv', + 'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten + '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten + 'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi', + 'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt', + 'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv', + 'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv', + 'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi', + 'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi', + 'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi', + 'Movies/Juno (2007)/Juno KLAXXON.avi', + 'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv', + 'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt', + 'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi', + 'testsmewt_bugs/movies/Baraka_Edition_Collector.avi' + ] for f in testmovies: - print('-' * 80) - guess_file(f, options=options, type='movie') + print('-'*80) + detect_filename(f, filetype = 'movie', advanced = advanced) -def main(args=None, setup_logging=True): - if setup_logging: - from guessit import slogging - slogging.setupLogging() +def main(): + slogging.setupLogging() - if PY2: # pragma: no cover - import codecs - import locale - import sys + # see http://bugs.python.org/issue2128 + if sys.version_info.major < 3 and os.name == 'nt': + for i, a in enumerate(sys.argv): + sys.argv[i] = a.decode(locale.getpreferredencoding()) + + parser = OptionParser(usage = 'usage: %prog [options] file1 [file2...]') + parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, + help = 'display debug output') + parser.add_option('-i', '--info', dest = 'info', default = 'filename', + help = 'the desired information type: filename, hash_mpc or a hash from python\'s ' + 'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of ' + 'them, comma-separated') + parser.add_option('-t', '--type', dest = 'filetype', default = 'autodetect', + help = 'the suggested file type: movie, episode or autodetect') + parser.add_option('-a', '--advanced', dest = 'advanced', action='store_true', default = False, + help = 'display advanced information for filename guesses, as json output') + parser.add_option('-d', '--demo', action='store_true', dest='demo', default=False, + help = 'run a few builtin tests instead of analyzing a file') - # see http://bugs.python.org/issue2128 - if os.name == 'nt': - for i, a in enumerate(sys.argv): - sys.argv[i] = a.decode(locale.getpreferredencoding()) - - # see https://github.com/wackou/guessit/issues/43 - # and http://stackoverflow.com/questions/4545661/unicodedecodeerror-when-redirecting-to-file - # Wrap sys.stdout into a StreamWriter to allow writing unicode. - sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) - - if args: - options, args = option_parser.parse_args(args) - else: # pragma: no cover - options, args = option_parser.parse_args() + options, args = parser.parse_args() if options.verbose: - logging.getLogger().setLevel(logging.DEBUG) + logging.getLogger('guessit').setLevel(logging.DEBUG) - help_required = True - if options.properties or options.values: - display_properties(options.values, options.transformers) - help_required = False - elif options.transformers: - display_transformers() - help_required = False if options.demo: - run_demo(episodes=True, movies=True, options=vars(options)) - help_required = False + run_demo(episodes=True, movies=True, advanced=options.advanced) else: if args: - help_required = False for filename in args: - guess_file(filename, - info=options.info.split(','), - options=vars(options) - ) + detect_filename(filename, + filetype = options.filetype, + info = options.info.split(','), + advanced = options.advanced) - if help_required: # pragma: no cover - option_parser.print_help() + else: + parser.print_help() if __name__ == '__main__': main() diff --git a/lib/guessit/__version__.py b/lib/guessit/__version__.py deleted file mode 100644 index cb875200..00000000 --- a/lib/guessit/__version__.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# -__version__ = '0.7.1' diff --git a/lib/guessit/containers.py b/lib/guessit/containers.py deleted file mode 100644 index 2f5d5c26..00000000 --- a/lib/guessit/containers.py +++ /dev/null @@ -1,615 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from .patterns import compile_pattern, sep -from . import base_text_type -from .guess import Guess -import types - - -def _get_span(prop, match): - """Retrieves span for a match""" - if not prop.global_span and match.re.groups: - start = None - end = None - for i in range(1, match.re.groups + 1): - span = match.span(i) - if start is None or span[0] < start: - start = span[0] - if end is None or span[1] > end: - end = span[1] - return (start, end) - else: - return match.span() - start = span[0] - end = span[1] - - -def _get_groups(compiled_re): - """ - Retrieves groups from re - - :return: list of group names - """ - if compiled_re.groups: - indexgroup = {} - for k, i in compiled_re.groupindex.items(): - indexgroup[i] = k - ret = [] - for i in range(1, compiled_re.groups + 1): - ret.append(indexgroup.get(i, i)) - return ret - else: - return [None] - - -class NoValidator(object): - def validate(self, prop, string, node, match, entry_start, entry_end): - return True - - -class DefaultValidator(object): - """Make sure our match is surrounded by separators, or by another entry""" - def validate(self, prop, string, node, match, entry_start, entry_end): - start, end = _get_span(prop, match) - - sep_start = start <= 0 or string[start - 1] in sep - sep_end = end >= len(string) or string[end] in sep - start_by_other = start in entry_end - end_by_other = end in entry_start - if (sep_start or start_by_other) and (sep_end or end_by_other): - return True - return False - - -class WeakValidator(DefaultValidator): - """Make sure our match is surrounded by separators and is the first or last element in the string""" - def validate(self, prop, string, node, match, entry_start, entry_end): - if super(WeakValidator, self).validate(prop, string, node, match, entry_start, entry_end): - span = match.span() - start = span[0] - end = span[1] - - at_start = True - at_end = True - - while start > 0: - start = start - 1 - if string[start] not in sep: - at_start = False - break - if at_start: - return True - while end < len(string) - 1: - end = end + 1 - if string[end] not in sep: - at_end = False - break - if at_end: - return True - return False - - -class LeavesValidator(DefaultValidator): - def __init__(self, lambdas=None, previous_lambdas=None, next_lambdas=None, both_side=False, default_=True): - self.previous_lambdas = previous_lambdas if not previous_lambdas is None else [] - self.next_lambdas = next_lambdas if not next_lambdas is None else [] - if lambdas: - self.previous_lambdas.extend(lambdas) - self.next_lambdas.extend(lambdas) - self.both_side = both_side - self.default_ = default_ - - """Make sure our match is surrounded by separators and validates defined lambdas""" - def validate(self, prop, string, node, match, entry_start, entry_end): - if self.default_: - super_ret = super(LeavesValidator, self).validate(prop, string, node, match, entry_start, entry_end) - else: - super_ret = True - if not super_ret: - return False - - previous_ = self._validate_previous(prop, string, node, match, entry_start, entry_end) - if previous_ and self.both_side: - return previous_ - next_ = self._validate_next(prop, string, node, match, entry_start, entry_end) - - if previous_ is None and next_ is None: - return super_ret - - if self.both_side: - return previous_ and next_ - else: - return previous_ or next_ - - def _validate_previous(self, prop, string, node, match, entry_start, entry_end): - if self.previous_lambdas: - for leaf in node.root.previous_leaves(node): - for lambda_ in self.previous_lambdas: - ret = self._check_rule(lambda_, leaf) - if not ret is None: - return ret - return False - - def _validate_next(self, prop, string, node, match, entry_start, entry_end): - if self.next_lambdas: - for leaf in node.root.next_leaves(node): - for lambda_ in self.next_lambdas: - ret = self._check_rule(lambda_, leaf) - if not ret is None: - return ret - return False - - def _check_rule(self, lambda_, previous_leaf): - return lambda_(previous_leaf) - - -class _Property: - """Represents a property configuration.""" - def __init__(self, keys=None, pattern=None, canonical_form=None, canonical_from_pattern=True, confidence=1.0, enhance=True, global_span=False, validator=DefaultValidator(), formatter=None): - """ - :param keys: Keys of the property (format, screenSize, ...) - :type keys: string - :param canonical_form: Unique value of the property (DVD, 720p, ...) - :type canonical_form: string - :param pattern: Regexp pattern - :type pattern: string - :param confidence: confidence - :type confidence: float - :param enhance: enhance the pattern - :type enhance: boolean - :param global_span: if True, the whole match span will used to create the Guess. - Else, the span from the capturing groups will be used. - :type global_span: boolean - :param validator: Validator to use - :type validator: :class:`DefaultValidator` - :param formatter: Formater to use - :type formatter: function - """ - if isinstance(keys, list): - self.keys = keys - elif isinstance(keys, base_text_type): - self.keys = [keys] - else: - self.keys = [] - self.canonical_form = canonical_form - if not pattern is None: - self.pattern = pattern - else: - self.pattern = canonical_form - if self.canonical_form is None and canonical_from_pattern: - self.canonical_form = self.pattern - self.compiled = compile_pattern(self.pattern, enhance=enhance) - for group_name in _get_groups(self.compiled): - if isinstance(group_name, base_text_type) and not group_name in self.keys: - self.keys.append(group_name) - if not self.keys: - raise ValueError("No property key is defined") - self.confidence = confidence - self.global_span = global_span - self.validator = validator - self.formatter = formatter - - def format(self, value, group_name=None): - """Retrieves the final value from re group match value""" - formatter = None - if isinstance(self.formatter, dict): - formatter = self.formatter.get(group_name) - if formatter is None and not group_name is None: - formatter = self.formatter.get(None) - else: - formatter = self.formatter - if isinstance(formatter, types.FunctionType): - return formatter(value) - elif not formatter is None: - return formatter.format(value) - return value - - def __repr__(self): - return "%s: %s" % (self.keys, self.canonical_form if self.canonical_form else self.pattern) - - -class PropertiesContainer(object): - def __init__(self, **kwargs): - self._properties = [] - self.default_property_kwargs = kwargs - - def unregister_property(self, name, *canonical_forms): - """Unregister a property canonical forms - - If canonical_forms are specified, only those values will be unregistered - - :param name: Property name to unregister - :type name: string - :param canonical_forms: Values to unregister - :type canonical_forms: varargs of string - """ - _properties = [prop for prop in self._properties if prop.name == name and (not canonical_forms or prop.canonical_form in canonical_forms)] - - def register_property(self, name, *patterns, **property_params): - """Register property with defined canonical form and patterns. - - :param name: name of the property (format, screenSize, ...) - :type name: string - :param patterns: regular expression patterns to register for the property canonical_form - :type patterns: varargs of string - """ - properties = [] - for pattern in patterns: - params = dict(self.default_property_kwargs) - params.update(property_params) - if isinstance(pattern, dict): - params.update(pattern) - prop = _Property(name, **params) - else: - prop = _Property(name, pattern, **params) - self._properties.append(prop) - properties.append(prop) - return properties - - def register_canonical_properties(self, name, *canonical_forms, **property_params): - """Register properties from their canonical forms. - - :param name: name of the property (releaseGroup, ...) - :type name: string - :param canonical_forms: values of the property ('ESiR', 'WAF', 'SEPTiC', ...) - :type canonical_forms: varargs of strings - """ - properties = [] - for canonical_form in canonical_forms: - params = dict(property_params) - params['canonical_form'] = canonical_form - properties.extend(self.register_property(name, canonical_form, **property_params)) - return properties - - def unregister_all_properties(self): - """Unregister all defined properties""" - self._properties.clear() - - def find_properties(self, string, node, name=None, validate=True, re_match=False, sort=True, multiple=False): - """Find all distinct properties for given string - - If no capturing group is defined in the property, value will be grabbed from the entire match. - - If one ore more unnamed capturing group is defined in the property, first capturing group will be used. - - If named capturing group are defined in the property, they will be returned as property key. - - If validate, found properties will be validated by their defined validator - - If re_match, re.match will be used instead of re.search. - - if sort, found properties will be sorted from longer match to shorter match. - - If multiple is False and multiple values are found for the same property, the more confident one will be returned. - - If multiple is False and multiple values are found for the same property and the same confidence, the longer will be returned. - - :param string: input string - :type string: string - - :param node: current node of the matching tree - :type node: :class:`guessit.matchtree.MatchTree` - - :param name: name of property to find - :type name: string - - :param re_match: use re.match instead of re.search - :type re_match: bool - - :param multiple: Allows multiple property values to be returned - :type multiple: bool - - :return: found properties - :rtype: list of tuples (:class:`_Property`, match, list of tuples (property_name, tuple(value_start, value_end))) - - :see: `_Property` - :see: `register_property` - :see: `register_canonical_properties` - """ - entry_start = {} - entry_end = {} - - entries = [] - - ret = [] - - if not string.strip(): - return ret - - # search all properties - for prop in self.get_properties(name): - match = prop.compiled.match(string) if re_match else prop.compiled.search(string) - if match: - entry = prop, match - entries.append(entry) - - if validate: - # compute entries start and ends - for prop, match in entries: - start, end = _get_span(prop, match) - - if start not in entry_start: - entry_start[start] = [prop] - else: - entry_start[start].append(prop) - - if end not in entry_end: - entry_end[end] = [prop] - else: - entry_end[end].append(prop) - - # remove invalid values - while True: - invalid_entries = [] - for entry in entries: - prop, match = entry - if not prop.validator.validate(prop, string, node, match, entry_start, entry_end): - invalid_entries.append(entry) - if not invalid_entries: - break - for entry in invalid_entries: - prop, match = entry - entries.remove(entry) - invalid_span = _get_span(prop, match) - start = invalid_span[0] - end = invalid_span[1] - entry_start[start].remove(prop) - if not entry_start.get(start): - del entry_start[start] - entry_end[end].remove(prop) - if not entry_end.get(end): - del entry_end[end] - - if multiple: - ret = entries - else: - # keep only best match if multiple values where found - entries_dict = {} - for entry in entries: - for key in prop.keys: - if not key in entries_dict: - entries_dict[key] = [] - entries_dict[key].append(entry) - - for entries in entries_dict.values(): - if multiple: - for entry in entries: - ret.append(entry) - else: - best_ret = {} - - best_prop, best_match = None, None - if len(entries) == 1: - best_prop, best_match = entries[0] - else: - for prop, match in entries: - start, end = _get_span(prop, match) - if not best_prop or \ - best_prop.confidence < best_prop.confidence or \ - best_prop.confidence == best_prop.confidence and \ - best_match.span()[1] - best_match.span()[0] < match.span()[1] - match.span()[0]: - best_prop, best_match = prop, match - - best_ret[best_prop] = best_match - - for prop, match in best_ret.items(): - ret.append((prop, match)) - - if sort: - def _sorting(x): - _, x_match = x - x_start, x_end = x_match.span() - return (x_start - x_end) - - ret.sort(key=_sorting) - - return ret - - def as_guess(self, found_properties, input=None, filter=None, sep_replacement=None, multiple=False, *args, **kwargs): - if filter is None: - filter = lambda property, *args, **kwargs: True - guesses = [] if multiple else None - for property in found_properties: - prop, match = property - first_key = None - for key in prop.keys: - # First property key will be used as base for effective name - if isinstance(key, base_text_type): - if first_key is None: - first_key = key - break - property_name = first_key if first_key else None - span = _get_span(prop, match) - guess = Guess(confidence=prop.confidence, input=input, span=span, prop=property_name) - groups = _get_groups(match.re) - for group_name in groups: - name = group_name if isinstance(group_name, base_text_type) else property_name if property_name not in groups else None - if name: - value = self._effective_prop_value(prop, group_name, input, match.span(group_name) if group_name else match.span(), sep_replacement) - if not value is None: - is_string = isinstance(value, base_text_type) - if not is_string or is_string and value: # Keep non empty strings and other defined objects - if isinstance(value, dict): - for k, v in value.items(): - if k is None: - k = name - guess[k] = v - else: - guess[name] = value - if group_name: - guess.metadata(prop).span = match.span(group_name) - if filter(guess): - if multiple: - guesses.append(guess) - else: - return guess - return guesses - - def _effective_prop_value(self, prop, group_name, input=None, span=None, sep_replacement=None): - if prop.canonical_form: - return prop.canonical_form - if input is None: - return None - value = input - if not span is None: - value = value[span[0]:span[1]] - value = input[span[0]:span[1]] if input else None - if sep_replacement: - for sep_char in sep: - value = value.replace(sep_char, sep_replacement) - if value: - value = prop.format(value, group_name) - return value - - def get_properties(self, name=None, canonical_form=None): - """Retrieve properties - - :return: Properties - :rtype: generator - """ - for prop in self._properties: - if (name is None or name in prop.keys) and (canonical_form is None or prop.canonical_form == canonical_form): - yield prop - - def get_supported_properties(self): - supported_properties = {} - for prop in self.get_properties(): - for k in prop.keys: - values = supported_properties.get(k) - if not values: - values = set() - supported_properties[k] = values - if prop.canonical_form: - values.add(prop.canonical_form) - return supported_properties - - -class QualitiesContainer(): - def __init__(self): - self._qualities = {} - - def register_quality(self, name, canonical_form, rating): - """Register a quality rating. - - :param name: Name of the property - :type name: string - :param canonical_form: Value of the property - :type canonical_form: string - :param rating: Estimated quality rating for the property - :type rating: int - """ - property_qualities = self._qualities.get(name) - - if property_qualities is None: - property_qualities = {} - self._qualities[name] = property_qualities - - property_qualities[canonical_form] = rating - - def unregister_quality(self, name, *canonical_forms): - """Unregister quality ratings for given property name. - - If canonical_forms are specified, only those values will be unregistered - - :param name: Name of the property - :type name: string - :param canonical_forms: Value of the property - :type canonical_forms: string - """ - if not canonical_forms: - if name in self._qualities: - del self._qualities[name] - else: - property_qualities = self._qualities.get(name) - if not property_qualities is None: - for property_canonical_form in canonical_forms: - if property_canonical_form in property_qualities: - del property_qualities[property_canonical_form] - if not property_qualities: - del self._qualities[name] - - def clear_qualities(self,): - """Unregister all defined quality ratings. - """ - self._qualities.clear() - - def rate_quality(self, guess, *props): - """Rate the quality of guess. - - :param guess: Guess to rate - :type guess: :class:`guessit.guess.Guess` - :param props: Properties to include in the rating. if empty, rating will be performed for all guess properties. - :type props: varargs of string - - :return: Quality of the guess. The higher, the better. - :rtype: int - """ - rate = 0 - if not props: - props = guess.keys() - for prop in props: - prop_value = guess.get(prop) - prop_qualities = self._qualities.get(prop) - if not prop_value is None and not prop_qualities is None: - rate += prop_qualities.get(prop_value, 0) - return rate - - def best_quality_properties(self, props, *guesses): - """Retrieve the best quality guess, based on given properties - - :param props: Properties to include in the rating - :type props: list of strings - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - rate = self.rate_quality(guess, *props) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - - def best_quality(self, *guesses): - """Retrieve the best quality guess. - - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - rate = self.rate_quality(guess) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - diff --git a/lib/guessit/country.py b/lib/guessit/country.py index a734836d..944b7df6 100644 --- a/lib/guessit/country.py +++ b/lib/guessit/country.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,13 +18,12 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals from guessit import UnicodeMixin, base_text_type, u from guessit.fileutils import load_file_in_same_dir import logging -__all__ = ['Country'] +__all__ = [ 'Country' ] log = logging.getLogger(__name__) @@ -37,12 +36,12 @@ log = logging.getLogger(__name__) # are all separated by pipe (|) characters." _iso3166_contents = load_file_in_same_dir(__file__, 'ISO-3166-1_utf8.txt') -country_matrix = [l.strip().split('|') - for l in _iso3166_contents.strip().split('\n')] +country_matrix = [ l.strip().split('|') + for l in _iso3166_contents.strip().split('\n') ] -country_matrix += [['Unknown', 'un', 'unk', '', ''], - ['Latin America', '', 'lat', '', ''] - ] +country_matrix += [ [ 'Unknown', 'un', 'unk', '', '' ], + [ 'Latin America', '', 'lat', '', '' ] + ] country_to_alpha3 = dict((c[0].lower(), c[2].lower()) for c in country_matrix) country_to_alpha3.update(dict((c[1].lower(), c[2].lower()) for c in country_matrix)) @@ -50,16 +49,17 @@ country_to_alpha3.update(dict((c[2].lower(), c[2].lower()) for c in country_matr # add here exceptions / non ISO representations # Note: remember to put those exceptions in lower-case, they won't work otherwise -country_to_alpha3.update({'latinoamérica': 'lat', - 'brazilian': 'bra', - 'españa': 'esp', - 'uk': 'gbr' - }) +country_to_alpha3.update({ 'latinoamérica': 'lat', + 'brazilian': 'bra', + 'españa': 'esp', + 'uk': 'gbr' + }) country_alpha3_to_en_name = dict((c[2].lower(), c[0]) for c in country_matrix) country_alpha3_to_alpha2 = dict((c[2].lower(), c[1].lower()) for c in country_matrix) + class Country(UnicodeMixin): """This class represents a country. @@ -78,6 +78,7 @@ class Country(UnicodeMixin): if self.alpha3 is None: self.alpha3 = 'unk' + @property def alpha2(self): return country_alpha3_to_alpha2[self.alpha3] diff --git a/lib/guessit/date.py b/lib/guessit/date.py index 6a015e76..bd84c65d 100644 --- a/lib/guessit/date.py +++ b/lib/guessit/date.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,55 +18,15 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals import datetime import re -import math - - -_dsep = r'[-/ \.]' -_date_rexps = [re.compile( - # 20010823 - r'[^0-9]' + - r'(?P[0-9]{4})' + - r'(?P[0-9]{2})' + - r'(?P[0-9]{2})' + - r'[^0-9]'), - - # 2001-08-23 - re.compile(r'[^0-9]' + - r'(?P[0-9]{4})' + _dsep + - r'(?P[0-9]{2})' + _dsep + - r'(?P[0-9]{2})' + - r'[^0-9]'), - - # 23-08-2001 - re.compile(r'[^0-9]' + - r'(?P[0-9]{2})' + _dsep + - r'(?P[0-9]{2})' + _dsep + - r'(?P[0-9]{4})' + - r'[^0-9]'), - - # 23-08-01 - re.compile(r'[^0-9]' + - r'(?P[0-9]{2})' + _dsep + - r'(?P[0-9]{2})' + _dsep + - r'(?P[0-9]{2})' + - r'[^0-9]'), - ] - - -def valid_year(year, today=None): - """Check if number is a valid year""" - if not today: - today = datetime.date.today() - return 1920 < year < today.year + 5 +def valid_year(year): + return 1920 < year < datetime.date.today().year + 5 def search_year(string): """Looks for year patterns, and if found return the year and group span. - Assumes there are sentinels at the beginning and end of the string that always allow matching a non-digit delimiting the date. @@ -74,10 +34,10 @@ def search_year(string): and now + 5 years, so for instance 2000 would be returned as a valid year but 1492 would not. - >>> search_year(' in the year 2000... ') - (2000, (13, 17)) + >>> search_year('in the year 2000...') + (2000, (12, 16)) - >>> search_year(' they arrived in 1492. ') + >>> search_year('they arrived in 1492.') (None, None) """ match = re.search(r'[^0-9]([0-9]{4})[^0-9]', string) @@ -91,32 +51,59 @@ def search_year(string): def search_date(string): """Looks for date patterns, and if found return the date and group span. - Assumes there are sentinels at the beginning and end of the string that always allow matching a non-digit delimiting the date. - Year can be defined on two digit only. It will return the nearest possible - date from today. + >>> search_date('This happened on 2002-04-22.') + (datetime.date(2002, 4, 22), (17, 27)) - >>> search_date(' This happened on 2002-04-22. ') - (datetime.date(2002, 4, 22), (18, 28)) + >>> search_date('And this on 17-06-1998.') + (datetime.date(1998, 6, 17), (12, 22)) - >>> search_date(' And this on 17-06-1998. ') - (datetime.date(1998, 6, 17), (13, 23)) - - >>> search_date(' no date in here ') + >>> search_date('no date in here') (None, None) """ - today = datetime.date.today() - for drexp in _date_rexps: + dsep = r'[-/ \.]' + + date_rexps = [ + # 20010823 + r'[^0-9]' + + r'(?P[0-9]{4})' + + r'(?P[0-9]{2})' + + r'(?P[0-9]{2})' + + r'[^0-9]', + + # 2001-08-23 + r'[^0-9]' + + r'(?P[0-9]{4})' + dsep + + r'(?P[0-9]{2})' + dsep + + r'(?P[0-9]{2})' + + r'[^0-9]', + + # 23-08-2001 + r'[^0-9]' + + r'(?P[0-9]{2})' + dsep + + r'(?P[0-9]{2})' + dsep + + r'(?P[0-9]{4})' + + r'[^0-9]', + + # 23-08-01 + r'[^0-9]' + + r'(?P[0-9]{2})' + dsep + + r'(?P[0-9]{2})' + dsep + + r'(?P[0-9]{2})' + + r'[^0-9]', + ] + + for drexp in date_rexps: match = re.search(drexp, string) if match: d = match.groupdict() year, month, day = int(d['year']), int(d['month']), int(d['day']) # years specified as 2 digits should be adjusted here if year < 100: - if year > (today.year % 100) + 5: + if year > (datetime.date.today().year % 100) + 5: year = 1900 + year else: year = 2000 + year @@ -134,7 +121,7 @@ def search_date(string): continue # check date plausibility - if not valid_year(date.year, today=today): + if not 1900 < date.year < datetime.date.today().year + 5: continue # looks like we have a valid date diff --git a/lib/guessit/fileutils.py b/lib/guessit/fileutils.py index 11597c76..9531f82a 100644 --- a/lib/guessit/fileutils.py +++ b/lib/guessit/fileutils.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,8 +18,7 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals from guessit import s, u import os.path import zipfile @@ -45,13 +44,17 @@ def split_path(path): result = [] while True: head, tail = os.path.split(path) + headlen = len(head) - if not head and not tail: - return result + # on Unix systems, the root folder is '/' + if head and head == '/'*headlen and tail == '': + return ['/'] + result - if not tail and head == path: - # Make sure we won't have an infinite loop. - result = [head] + result + # on Windows, the root folder is a drive letter (eg: 'C:\') or for shares \\ + if ((headlen == 3 and head[1:] == ':\\') or (headlen == 2 and head == '\\\\')) and tail == '': + return [head] + result + + if head == '' and tail == '': return result # we just split a directory ending with '/', so tail is empty @@ -67,8 +70,8 @@ def split_path(path): def file_in_same_dir(ref_file, desired_file): """Return the path for a file in the same dir as a given reference file. - >>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) == os.path.normpath('~/smewt/smewt.settings') - True + >>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) + '~/smewt/smewt.settings' """ return os.path.join(*(split_path(ref_file)[:-1] + [desired_file])) diff --git a/lib/guessit/guess.py b/lib/guessit/guess.py index be130d47..73babceb 100644 --- a/lib/guessit/guess.py +++ b/lib/guessit/guess.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,9 +18,10 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals from guessit import UnicodeMixin, s, u, base_text_type +from guessit.language import Language +from guessit.country import Country import json import datetime import logging @@ -28,103 +29,6 @@ import logging log = logging.getLogger(__name__) -class GuessMetadata(object): - """GuessMetadata contains confidence, an input string, span and related property. - - If defined on a property of Guess object, it overrides the object defined as global. - - :param parent: The parent metadata, used for undefined properties in self object - :type parent: :class: `GuessMedata` - :param confidence: The confidence (from 0.0 to 1.0) - :type confidence: number - :param input: The input string - :type input: string - :param span: The input string - :type span: tuple (int, int) - :param prop: The found property definition - :type prop: :class `guessit.containers._Property` - """ - def __init__(self, parent=None, confidence=None, input=None, span=None, prop=None, *args, **kwargs): - self.parent = parent - if confidence is None and self.parent is None: - self._confidence = 1.0 - else: - self._confidence = confidence - self._input = input - self._span = span - self._prop = prop - - @property - def confidence(self): - """The confidence - - :rtype: int - :return: confidence value - """ - return self._confidence if not self._confidence is None else self.parent.confidence if self.parent else None - - @confidence.setter - def confidence(self, confidence): - self._confidence = confidence - - @property - def input(self): - """The input - - :rtype: string - :return: String used to find this guess value - """ - return self._input if not self._input is None else self.parent.input if self.parent else None - - @property - def span(self): - """The span - - :rtype: tuple (int, int) - :return: span of input string used to find this guess value - """ - return self._span if not self._span is None else self.parent.span if self.parent else None - - @span.setter - def span(self, span): - """The span - - :rtype: tuple (int, int) - :return: span of input string used to find this guess value - """ - self._span = span - - @property - def prop(self): - """The property - - :rtype: :class:`_Property` - :return: The property - """ - return self._prop if not self._prop is None else self.parent.prop if self.parent else None - - @property - def raw(self): - """Return the raw information (original match from the string, - not the cleaned version) associated with the given property name.""" - if self.input and self.span: - return self.input[self.span[0]:self.span[1]] - return None - - def __repr__(self, *args, **kwargs): - return object.__repr__(self, *args, **kwargs) - - -def _split_kwargs(**kwargs): - metadata_args = {} - for prop in dir(GuessMetadata): - try: - metadata_args[prop] = kwargs.pop(prop) - except KeyError: - pass - return metadata_args, kwargs - - class Guess(UnicodeMixin, dict): """A Guess is a dictionary which has an associated confidence for each of its values. @@ -133,98 +37,91 @@ class Guess(UnicodeMixin, dict): simple dict.""" def __init__(self, *args, **kwargs): - metadata_kwargs, kwargs = _split_kwargs(**kwargs) - self._global_metadata = GuessMetadata(**metadata_kwargs) + try: + confidence = kwargs.pop('confidence') + except KeyError: + confidence = 0 + + try: + raw = kwargs.pop('raw') + except KeyError: + raw = None + dict.__init__(self, *args, **kwargs) - self._metadata = {} + self._confidence = {} + self._raw = {} for prop in self: - self._metadata[prop] = GuessMetadata(parent=self._global_metadata) - + self._confidence[prop] = confidence + self._raw[prop] = raw + def to_dict(self, advanced=False): - """Return the guess as a dict containing only base types, ie: - where dates, languages, countries, etc. are converted to strings. - - if advanced is True, return the data as a json string containing - also the raw information of the properties.""" data = dict(self) for prop, value in data.items(): if isinstance(value, datetime.date): data[prop] = value.isoformat() - elif isinstance(value, (UnicodeMixin, base_text_type)): + elif isinstance(value, (Language, Country, base_text_type)): data[prop] = u(value) elif isinstance(value, list): data[prop] = [u(x) for x in value] if advanced: - metadata = self.metadata(prop) - prop_data = {'value': data[prop]} - if metadata.raw: - prop_data['raw'] = metadata.raw - if metadata.confidence: - prop_data['confidence'] = metadata.confidence - data[prop] = prop_data + data[prop] = {"value": data[prop], "raw": self.raw(prop), "confidence": self.confidence(prop)} return data def nice_string(self, advanced=False): - """Return a string with the property names and their values, - that also displays the associated confidence to each property. - - FIXME: doc with param""" if advanced: data = self.to_dict(advanced) return json.dumps(data, indent=4) - else: + else: data = self.to_dict() - + parts = json.dumps(data, indent=4).split('\n') for i, p in enumerate(parts): if p[:5] != ' "': continue - + prop = p.split('"')[1] parts[i] = (' [%.2f] "' % self.confidence(prop)) + p[5:] - + return '\n'.join(parts) def __unicode__(self): return u(self.to_dict()) - def metadata(self, prop=None): - """Return the metadata associated with the given property name - - If no property name is given, get the global_metadata - """ - if prop is None: - return self._global_metadata - if not prop in self._metadata: - self._metadata[prop] = GuessMetadata(parent=self._global_metadata) - return self._metadata[prop] - - def confidence(self, prop=None): - return self.metadata(prop).confidence - - def set_confidence(self, prop, confidence): - self.metadata(prop).confidence = confidence - + def confidence(self, prop): + return self._confidence.get(prop, -1) + def raw(self, prop): - return self.metadata(prop).raw + return self._raw.get(prop, None) - def set(self, prop_name, value, *args, **kwargs): - self[prop_name] = value - self._metadata[prop_name] = GuessMetadata(parent=self._global_metadata, *args, **kwargs) + def set(self, prop, value, confidence=None, raw=None): + self[prop] = value + if confidence is not None: + self._confidence[prop] = confidence + if raw is not None: + self._raw[prop] = raw - def update(self, other, confidence=None): + def set_confidence(self, prop, value): + self._confidence[prop] = value + + def set_raw(self, prop, value): + self._raw[prop] = value + + def update(self, other, confidence=None, raw=None): dict.update(self, other) if isinstance(other, Guess): for prop in other: - try: - self._metadata[prop] = other._metadata[prop] - except KeyError: - pass - if not confidence is None: + self._confidence[prop] = other.confidence(prop) + self._raw[prop] = other.raw(prop) + + if confidence is not None: for prop in other: - self.set_confidence(prop, confidence) + self._confidence[prop] = confidence + + if raw is not None: + for prop in other: + self._raw[prop] = raw def update_highest_confidence(self, other): """Update this guess with the values from the given one. In case @@ -234,16 +131,17 @@ class Guess(UnicodeMixin, dict): raise ValueError('Can only call this function on Guess instances') for prop in other: - if prop in self and self.metadata(prop).confidence >= other.metadata(prop).confidence: + if prop in self and self.confidence(prop) >= other.confidence(prop): continue self[prop] = other[prop] - self._metadata[prop] = other.metadata(prop) + self._confidence[prop] = other.confidence(prop) + self._raw[prop] = other.raw(prop) def choose_int(g1, g2): """Function used by merge_similar_guesses to choose between 2 possible properties when they are integers.""" - v1, c1 = g1 # value, confidence + v1, c1 = g1 # value, confidence v2, c2 = g2 if (v1 == v2): return (v1, 1 - (1 - c1) * (1 - c2)) @@ -281,7 +179,7 @@ def choose_string(g1, g2): ('The Simpsons', 0.75) """ - v1, c1 = g1 # value, confidence + v1, c1 = g1 # value, confidence v2, c2 = g2 if not v1: @@ -388,48 +286,43 @@ def merge_all(guesses, append=None): instead of being merged. >>> s(merge_all([ Guess({'season': 2}, confidence=0.6), - ... Guess({'episodeNumber': 13}, confidence=0.8) ]) - ... ) == {'season': 2, 'episodeNumber': 13} - True - + ... Guess({'episodeNumber': 13}, confidence=0.8) ])) + {'season': 2, 'episodeNumber': 13} >>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02), - ... Guess({'season': 1}, confidence=0.2) ]) - ... ) == {'season': 1} - True + ... Guess({'season': 1}, confidence=0.2) ])) + {'season': 1} >>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8), ... Guess({'releaseGroup': '2HD'}, confidence=0.8) ], - ... append=['other']) - ... ) == {'releaseGroup': '2HD', 'other': ['PROPER']} - True + ... append=['other'])) + {'releaseGroup': '2HD', 'other': ['PROPER']} + """ - result = Guess() if not guesses: - return result + return Guess() + result = guesses[0] if append is None: append = [] - for g in guesses: + for g in guesses[1:]: # first append our appendable properties for prop in append: if prop in g: result.set(prop, result.get(prop, []) + [g[prop]], # TODO: what to do with confidence here? maybe an # arithmetic mean... - confidence=g.metadata(prop).confidence, - input=g.metadata(prop).input, - span=g.metadata(prop).span, - prop=g.metadata(prop).prop) + confidence=g.confidence(prop), + raw=g.raw(prop)) del g[prop] # then merge the remaining ones dups = set(result) & set(g) if dups: - log.warning('duplicate properties %s in merged result...' % [(result[p], g[p]) for p in dups]) + log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] ) result.update_highest_confidence(g) @@ -445,7 +338,7 @@ def merge_all(guesses, append=None): if isinstance(value, list): result[prop] = list(set(value)) else: - result[prop] = [value] + result[prop] = [ value ] except KeyError: pass diff --git a/lib/guessit/hash_ed2k.py b/lib/guessit/hash_ed2k.py index 6361856b..7422d4e9 100644 --- a/lib/guessit/hash_ed2k.py +++ b/lib/guessit/hash_ed2k.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,8 +18,7 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals from guessit import s, to_hex import hashlib import os.path @@ -28,9 +27,8 @@ import os.path def hash_file(filename): """Returns the ed2k hash of a given file. - >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') - >>> s(hash_file(testfile)) - 'ed2k://|file|dummy.srt|59|41F58B913AB3973F593BEBA8B8DF6510|/' + >>> s(hash_file('tests/dummy.srt')) + 'ed2k://|file|dummy.srt|44|1CA0B9DED3473B926AA93A0A546138BB|/' """ return 'ed2k://|file|%s|%d|%s|/' % (os.path.basename(filename), os.path.getsize(filename), diff --git a/lib/guessit/hash_mpc.py b/lib/guessit/hash_mpc.py index da0fb159..c9dd4292 100644 --- a/lib/guessit/hash_mpc.py +++ b/lib/guessit/hash_mpc.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,8 +18,7 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals import struct import os @@ -29,7 +28,7 @@ def hash_file(filename): http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes and is licensed under the GPL.""" - longlongformat = b'q' # long long + longlongformat = 'q' # long long bytesize = struct.calcsize(longlongformat) f = open(filename, "rb") @@ -40,14 +39,14 @@ def hash_file(filename): if filesize < 65536 * 2: raise Exception("SizeError: size is %d, should be > 132K..." % filesize) - for x in range(int(65536 / bytesize)): + for x in range(65536 / bytesize): buf = f.read(bytesize) (l_value,) = struct.unpack(longlongformat, buf) hash_value += l_value - hash_value = hash_value & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number + hash_value = hash_value & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number f.seek(max(0, filesize - 65536), 0) - for x in range(int(65536 / bytesize)): + for x in range(65536 / bytesize): buf = f.read(bytesize) (l_value,) = struct.unpack(longlongformat, buf) hash_value += l_value diff --git a/lib/guessit/language.py b/lib/guessit/language.py index d8a1ab94..4d22cf05 100644 --- a/lib/guessit/language.py +++ b/lib/guessit/language.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,143 +18,122 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import UnicodeMixin, base_text_type, u +from __future__ import unicode_literals +from guessit import UnicodeMixin, base_text_type, u, s +from guessit.fileutils import load_file_in_same_dir from guessit.textutils import find_words -from babelfish import Language -import babelfish +from guessit.country import Country import re import logging -from guessit.guess import Guess -__all__ = ['Language', 'UNDETERMINED', - 'search_language', 'guess_language'] +__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language', + 'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED', + 'search_language', 'guess_language' ] + log = logging.getLogger(__name__) -UNDETERMINED = babelfish.Language('und') -SYN = {('und', None): ['unknown', 'inconnu', 'unk', 'un'], - ('ell', None): ['gr', 'greek'], - ('spa', None): ['esp', 'español'], - ('fra', None): ['français', 'vf', 'vff', 'vfi'], - ('swe', None): ['se'], - ('por', 'BR'): ['po', 'pb', 'pob', 'br', 'brazilian'], - ('cat', None): ['català'], - ('ces', None): ['cz'], - ('ukr', None): ['ua'], - ('zho', None): ['cn'], - ('jpn', None): ['jp'], - ('hrv', None): ['scr'], - ('mul', None): ['multi', 'dl'], # http://scenelingo.wordpress.com/2009/03/24/what-does-dl-mean/ - } +# downloaded from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt +# +# Description of the fields: +# "An alpha-3 (bibliographic) code, an alpha-3 (terminologic) code (when given), +# an alpha-2 code (when given), an English name, and a French name of a language +# are all separated by pipe (|) characters." +_iso639_contents = load_file_in_same_dir(__file__, 'ISO-639-2_utf-8.txt') + +# drop the BOM from the beginning of the file +_iso639_contents = _iso639_contents[1:] + +language_matrix = [ l.strip().split('|') + for l in _iso639_contents.strip().split('\n') ] -class GuessitConverter(babelfish.LanguageReverseConverter): +# update information in the language matrix +language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'], + ['ass', '', '', 'Assyrian', 'assyrien']] - _with_country_regexp = re.compile('(.*)\((.*)\)') - _with_country_regexp2 = re.compile('(.*)-(.*)') - - def __init__(self): - self.guessit_exceptions = {} - for (alpha3, country), synlist in SYN.items(): - for syn in synlist: - self.guessit_exceptions[syn.lower()] = (alpha3, country, None) - - @property - def codes(self): - return (babelfish.language_converters['alpha3b'].codes | - babelfish.language_converters['alpha2'].codes | - babelfish.language_converters['name'].codes | - babelfish.language_converters['opensubtitles'].codes | - babelfish.country_converters['name'].codes | - frozenset(self.guessit_exceptions.keys())) - - def convert(self, alpha3, country=None, script=None): - return str(babelfish.Language(alpha3, country, script)) - - def reverse(self, name): - with_country = (GuessitConverter._with_country_regexp.match(name) or - GuessitConverter._with_country_regexp2.match(name)) - - if with_country: - lang = babelfish.Language.fromguessit(with_country.group(1).strip()) - lang.country = babelfish.Country.fromguessit(with_country.group(2).strip()) - return (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None) - - # exceptions come first, as they need to override a potential match - # with any of the other guessers - try: - return self.guessit_exceptions[name.lower()] - except KeyError: - pass - - for conv in [babelfish.Language, - babelfish.Language.fromalpha3b, - babelfish.Language.fromalpha2, - babelfish.Language.fromname, - babelfish.Language.fromopensubtitles]: - try: - c = conv(name) - return c.alpha3, c.country, c.script - except (ValueError, babelfish.LanguageReverseError): - pass - - raise babelfish.LanguageReverseError(name) +for lang in language_matrix: + # remove unused languages that shadow other common ones with a non-official form + if (lang[2] == 'se' or # Northern Sami shadows Swedish + lang[2] == 'br'): # Breton shadows Brazilian + lang[2] = '' + # add missing information + if lang[0] == 'und': + lang[2] = 'un' + if lang[0] == 'srp': + lang[1] = 'scc' # from OpenSubtitles -babelfish.language_converters['guessit'] = GuessitConverter() +lng3 = frozenset(l[0] for l in language_matrix if l[0]) +lng3term = frozenset(l[1] for l in language_matrix if l[1]) +lng2 = frozenset(l[2] for l in language_matrix if l[2]) +lng_en_name = frozenset(lng for l in language_matrix + for lng in l[3].lower().split('; ') if lng) +lng_fr_name = frozenset(lng for l in language_matrix + for lng in l[4].lower().split('; ') if lng) +lng_all_names = lng3 | lng3term | lng2 | lng_en_name | lng_fr_name -COUNTRIES_SYN = {'ES': ['españa'], - 'GB': ['UK'], - 'BR': ['brazilian', 'bra'], - # FIXME: this one is a bit of a stretch, not sure how to do - # it properly, though... - 'MX': ['Latinoamérica', 'latin america'] - } +lng3_to_lng3term = dict((l[0], l[1]) for l in language_matrix if l[1]) +lng3term_to_lng3 = dict((l[1], l[0]) for l in language_matrix if l[1]) + +lng3_to_lng2 = dict((l[0], l[2]) for l in language_matrix if l[2]) +lng2_to_lng3 = dict((l[2], l[0]) for l in language_matrix if l[2]) + +# we only return the first given english name, hoping it is the most used one +lng3_to_lng_en_name = dict((l[0], l[3].split('; ')[0]) + for l in language_matrix if l[3]) +lng_en_name_to_lng3 = dict((en_name.lower(), l[0]) + for l in language_matrix if l[3] + for en_name in l[3].split('; ')) + +# we only return the first given french name, hoping it is the most used one +lng3_to_lng_fr_name = dict((l[0], l[4].split('; ')[0]) + for l in language_matrix if l[4]) +lng_fr_name_to_lng3 = dict((fr_name.lower(), l[0]) + for l in language_matrix if l[4] + for fr_name in l[4].split('; ')) + +# contains a list of exceptions: strings that should be parsed as a language +# but which are not in an ISO form +lng_exceptions = { 'unknown': ('und', None), + 'inconnu': ('und', None), + 'unk': ('und', None), + 'un': ('und', None), + 'gr': ('gre', None), + 'greek': ('gre', None), + 'esp': ('spa', None), + 'español': ('spa', None), + 'se': ('swe', None), + 'po': ('pt', 'br'), + 'pb': ('pt', 'br'), + 'pob': ('pt', 'br'), + 'br': ('pt', 'br'), + 'brazilian': ('pt', 'br'), + 'català': ('cat', None), + 'cz': ('cze', None), + 'ua': ('ukr', None), + 'cn': ('chi', None), + 'chs': ('chi', None), + 'jp': ('jpn', None), + 'scr': ('hrv', None) + } -class GuessitCountryConverter(babelfish.CountryReverseConverter): - def __init__(self): - self.guessit_exceptions = {} +def is_iso_language(language): + return language.lower() in lng_all_names - for alpha2, synlist in COUNTRIES_SYN.items(): - for syn in synlist: - self.guessit_exceptions[syn.lower()] = alpha2 +def is_language(language): + return is_iso_language(language) or language in lng_exceptions - @property - def codes(self): - return (babelfish.country_converters['name'].codes | - frozenset(babelfish.COUNTRIES.values()) | - frozenset(self.guessit_exceptions.keys())) +def lang_set(languages, strict=False): + """Return a set of guessit.Language created from their given string + representation. - def convert(self, alpha2): - return str(babelfish.Country(alpha2)) - - def reverse(self, name): - # exceptions come first, as they need to override a potential match - # with any of the other guessers - try: - return self.guessit_exceptions[name.lower()] - except KeyError: - pass - - try: - return babelfish.Country(name.upper()).alpha2 - except ValueError: - pass - - for conv in [babelfish.Country.fromname]: - try: - return conv(name).alpha2 - except babelfish.CountryReverseError: - pass - - raise babelfish.CountryReverseError(name) - - -babelfish.country_converters['guessit'] = GuessitCountryConverter() + if strict is True, then this will raise an exception if any language + could not be identified. + """ + return set(Language(l, strict=strict) for l in languages) class Language(UnicodeMixin): @@ -174,65 +153,109 @@ class Language(UnicodeMixin): >>> Language('fr') Language(French) - >>> (Language('eng').english_name) == 'English' + >>> s(Language('eng').french_name) + 'anglais' + + >>> s(Language('pt(br)').country.english_name) + 'Brazil' + + >>> s(Language('Español (Latinoamérica)').country.english_name) + 'Latin America' + + >>> Language('Spanish (Latin America)') == Language('Español (Latinoamérica)') True - >>> (Language('pt(br)').country.name) == 'BRAZIL' - True + >>> s(Language('zz', strict=False).english_name) + 'Undetermined' - >>> (Language('zz', strict=False).english_name) == 'Undetermined' - True - - >>> (Language('pt(br)').opensubtitles) == 'pob' - True + >>> s(Language('pt(br)').opensubtitles) + 'pob' """ - def __init__(self, language, country=None, strict=False): + _with_country_regexp = re.compile('(.*)\((.*)\)') + _with_country_regexp2 = re.compile('(.*)-(.*)') + + def __init__(self, language, country=None, strict=False, scheme=None): language = u(language.strip().lower()) - country = babelfish.Country(country.upper()) if country else None + with_country = (Language._with_country_regexp.match(language) or + Language._with_country_regexp2.match(language)) + if with_country: + self.lang = Language(with_country.group(1)).lang + self.country = Country(with_country.group(2)) + return - try: - self.lang = babelfish.Language.fromguessit(language) - # user given country overrides guessed one - if country: - self.lang.country = country + self.lang = None + self.country = Country(country) if country else None - except babelfish.LanguageReverseError: - msg = 'The given string "%s" could not be identified as a language' % language - if strict: - raise ValueError(msg) + # first look for scheme specific languages + if scheme == 'opensubtitles': + if language == 'br': + self.lang = 'bre' + return + elif language == 'se': + self.lang = 'sme' + return + elif scheme is not None: + log.warning('Unrecognized scheme: "%s" - Proceeding with standard one' % scheme) + # look for ISO language codes + if len(language) == 2: + self.lang = lng2_to_lng3.get(language) + elif len(language) == 3: + self.lang = (language + if language in lng3 + else lng3term_to_lng3.get(language)) + else: + self.lang = (lng_en_name_to_lng3.get(language) or + lng_fr_name_to_lng3.get(language)) + + # general language exceptions + if self.lang is None and language in lng_exceptions: + lang, country = lng_exceptions[language] + self.lang = Language(lang).alpha3 + self.country = Country(country) if country else None + + msg = 'The given string "%s" could not be identified as a language' % language + + if self.lang is None and strict: + raise ValueError(msg) + + if self.lang is None: log.debug(msg) - self.lang = UNDETERMINED - - @property - def country(self): - return self.lang.country + self.lang = 'und' @property def alpha2(self): - return self.lang.alpha2 + return lng3_to_lng2[self.lang] @property def alpha3(self): - return self.lang.alpha3 + return self.lang @property def alpha3term(self): - return self.lang.alpha3b + return lng3_to_lng3term[self.lang] @property def english_name(self): - return self.lang.name + return lng3_to_lng_en_name[self.lang] + + @property + def french_name(self): + return lng3_to_lng_fr_name[self.lang] @property def opensubtitles(self): - return self.lang.opensubtitles + if self.lang == 'por' and self.country and self.country.alpha2 == 'br': + return 'pob' + elif self.lang in ['gre', 'srp']: + return self.alpha3term + return self.alpha3 @property def tmdb(self): if self.country: - return '%s-%s' % (self.alpha2, self.country.alpha2) + return '%s-%s' % (self.alpha2, self.country.alpha2.upper()) return self.alpha2 def __hash__(self): @@ -240,8 +263,7 @@ class Language(UnicodeMixin): def __eq__(self, other): if isinstance(other, Language): - # in Guessit, languages are considered equal if their main languages are equal - return self.alpha3 == other.alpha3 + return self.lang == other.lang if isinstance(other, base_text_type): try: @@ -254,138 +276,115 @@ class Language(UnicodeMixin): def __ne__(self, other): return not self == other - def __bool__(self): - return self.lang != UNDETERMINED - __nonzero__ = __bool__ + def __nonzero__(self): + return self.lang != 'und' def __unicode__(self): - if self.lang.country: + if self.country: return '%s(%s)' % (self.english_name, self.country.alpha2) else: return self.english_name def __repr__(self): - if self.lang.country: - return 'Language(%s, country=%s)' % (self.english_name, self.lang.country) + if self.country: + return 'Language(%s, country=%s)' % (self.english_name, self.country) else: return 'Language(%s)' % self.english_name -# list of common words which could be interpreted as languages, but which -# are far too common to be able to say they represent a language in the -# middle of a string (where they most likely carry their commmon meaning) -LNG_COMMON_WORDS = frozenset([ - # english words - 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', - 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', - 'fry', 'cop', 'zen', 'gay', 'fat', 'one', 'cherokee', 'got', 'an', 'as', - 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', - # french words - 'bas', 'de', 'le', 'son', 'ne', 'ca', 'ce', 'et', 'que', - 'mal', 'est', 'vol', 'or', 'mon', 'se', - # spanish words - 'la', 'el', 'del', 'por', 'mar', - # other - 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', - 'vi', 'ben', 'da', 'lt', 'ch', - # new from babelfish - 'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and', - 'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy', - 'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur', - 'fer', 'fun', 'two', 'big', 'psy', 'air', - # release groups - 'bs' # Bosnian - ]) +UNDETERMINED = Language('und') +ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([UNDETERMINED]) +ALL_LANGUAGES_NAMES = lng_all_names - -subtitle_prefixes = ['sub', 'subs', 'st', 'vost', 'subforced', 'fansub', 'hardsub'] -subtitle_suffixes = ['subforced', 'fansub', 'hardsub'] -lang_prefixes = ['true'] - - -def find_possible_languages(string): - """Find possible languages in the string - - :return: list of tuple (property, Language, lang_word, word) - """ - words = find_words(string) - - valid_words = [] - for word in words: - lang_word = word.lower() - key = 'language' - for prefix in subtitle_prefixes: - if lang_word.startswith(prefix): - lang_word = lang_word[len(prefix):] - key = 'subtitleLanguage' - for suffix in subtitle_suffixes: - if lang_word.endswith(suffix): - lang_word = lang_word[:len(suffix)] - key = 'subtitleLanguage' - for prefix in lang_prefixes: - if lang_word.startswith(prefix): - lang_word = lang_word[len(prefix):] - if not lang_word in LNG_COMMON_WORDS: - try: - lang = Language(lang_word) - # Keep language with alpha2 equilavent. Others are probably an uncommon language. - if lang == 'mul' or hasattr(lang, 'alpha2'): - valid_words.append((key, lang, lang_word, word)) - except babelfish.Error: - pass - return valid_words - - -def search_language(string, lang_filter=None): +def search_language(string, lang_filter=None, skip=None): """Looks for language patterns, and if found return the language object, its group span and an associated confidence. you can specify a list of allowed languages using the lang_filter argument, as in lang_filter = [ 'fr', 'eng', 'spanish' ] - >>> search_language('movie [en].avi')['language'] - Language(English) + >>> search_language('movie [en].avi') + (Language(English), (7, 9), 0.8) >>> search_language('the zen fat cat and the gay mad men got a new fan', lang_filter = ['en', 'fr', 'es']) - + (None, None, None) """ + # list of common words which could be interpreted as languages, but which + # are far too common to be able to say they represent a language in the + # middle of a string (where they most likely carry their commmon meaning) + lng_common_words = frozenset([ + # english words + 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', + 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', + 'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as', + 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', + # french words + 'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que', + 'mal', 'est', 'vol', 'or', 'mon', 'se', + # spanish words + 'la', 'el', 'del', 'por', 'mar', + # other + 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', + 'vi', 'ben', 'da', 'lt' + ]) + sep = r'[](){} \._-+' + if lang_filter: - lang_filter = set(babelfish.Language.fromguessit(lang) for lang in lang_filter) + lang_filter = lang_set(lang_filter) - confidence = 1.0 # for all of them + slow = ' %s ' % string.lower() + confidence = 1.0 # for all of them - for prop, language, lang, word in find_possible_languages(string): - pos = string.find(word) - end = pos + len(word) + for lang in set(find_words(slow)) & lng_all_names: - if lang_filter and language not in lang_filter: + if lang in lng_common_words: continue - # only allow those languages that have a 2-letter code, those that - # don't are too esoteric and probably false matches - #if language.lang not in lng3_to_lng2: - # continue + pos = slow.find(lang) - # confidence depends on alpha2, alpha3, english name, ... - if len(lang) == 2: - confidence = 0.8 - elif len(lang) == 3: - confidence = 0.9 - elif prop == 'subtitleLanguage': - confidence = 0.6 # Subtitle prefix found with language - else: - # Note: we could either be really confident that we found a - # language or assume that full language names are too - # common words and lower their confidence accordingly - confidence = 0.3 # going with the low-confidence route here + if pos != -1: + end = pos + len(lang) + + # skip if span in in skip list + while skip and (pos - 1, end - 1) in skip: + pos = slow.find(lang, end) + if pos == -1: + continue + end = pos + len(lang) + if pos == -1: + continue + + # make sure our word is always surrounded by separators + if slow[pos - 1] not in sep or slow[end] not in sep: + continue - return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end)) + language = Language(slow[pos:end]) + if lang_filter and language not in lang_filter: + continue - return None + # only allow those languages that have a 2-letter code, those that + # don't are too esoteric and probably false matches + if language.lang not in lng3_to_lng2: + continue + + # confidence depends on lng2, lng3, english name, ... + if len(lang) == 2: + confidence = 0.8 + elif len(lang) == 3: + confidence = 0.9 + else: + # Note: we could either be really confident that we found a + # language or assume that full language names are too + # common words and lower their confidence accordingly + confidence = 0.3 # going with the low-confidence route here + + return language, (pos - 1, end - 1), confidence + + return None, None, None -def guess_language(text): # pragma: no cover +def guess_language(text): """Guess the language in which a body of text is written. This uses the external guess-language python module, and will fail and return @@ -393,7 +392,7 @@ def guess_language(text): # pragma: no cover """ try: from guess_language import guessLanguage - return babelfish.Language.fromguessit(guessLanguage(text)) + return Language(guessLanguage(text)) except ImportError: log.error('Cannot detect the language of the given text body, missing dependency: guess-language') diff --git a/lib/guessit/matcher.py b/lib/guessit/matcher.py index 8233bdf7..1984c01c 100644 --- a/lib/guessit/matcher.py +++ b/lib/guessit/matcher.py @@ -2,8 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -19,229 +18,163 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, \ - unicode_literals - -import logging - -from guessit import PY3, u -from guessit.transfo import TransformerException +from __future__ import unicode_literals +from guessit import PY3, u, base_text_type from guessit.matchtree import MatchTree from guessit.textutils import normalize_unicode, clean_string -from guessit.guess import Guess -import inspect +import logging log = logging.getLogger(__name__) class IterativeMatcher(object): - """An iterative matcher tries to match different patterns that appear - in the filename. + def __init__(self, filename, filetype='autodetect', opts=None, transfo_opts=None): + """An iterative matcher tries to match different patterns that appear + in the filename. - The ``filetype`` argument indicates which type of file you want to match. - If it is undefined, the matcher will try to see whether it can guess - that the file corresponds to an episode, or otherwise will assume it is - a movie. + The 'filetype' argument indicates which type of file you want to match. + If it is 'autodetect', the matcher will try to see whether it can guess + that the file corresponds to an episode, or otherwise will assume it is + a movie. - The recognized ``filetype`` values are: - ``['subtitle', 'info', 'movie', 'moviesubtitle', 'movieinfo', 'episode', - 'episodesubtitle', 'episodeinfo']`` + The recognized 'filetype' values are: + [ autodetect, subtitle, info, movie, moviesubtitle, movieinfo, episode, + episodesubtitle, episodeinfo ] - ``options`` is a dict of options values to be passed to the transformations used - by the matcher. - The IterativeMatcher works mainly in 2 steps: + The IterativeMatcher works mainly in 2 steps: - First, it splits the filename into a match_tree, which is a tree of groups - which have a semantic meaning, such as episode number, movie title, - etc... + First, it splits the filename into a match_tree, which is a tree of groups + which have a semantic meaning, such as episode number, movie title, + etc... - The match_tree created looks like the following:: + The match_tree created looks like the following: - 0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111 - 0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000 - 0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000 - __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___ - xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc - [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv + 0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111 + 0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000 + 0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000 + __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___ + xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc + [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv - The first 3 lines indicates the group index in which a char in the - filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and - it corresponds to a video codec, denoted by the letter ``v`` in the 4th line. - (for more info, see guess.matchtree.to_string) + The first 3 lines indicates the group index in which a char in the + filename is located. So for instance, x264 is the group (0, 4, 1), and + it corresponds to a video codec, denoted by the letter'v' in the 4th line. + (for more info, see guess.matchtree.to_string) - Second, it tries to merge all this information into a single object - containing all the found properties, and does some (basic) conflict - resolution when they arise. - """ - def __init__(self, filename, options=None, **kwargs): - options = dict(options or {}) - for k, v in kwargs.items(): - if k not in options or not options[k]: - options[k] = v # options dict has priority over keyword arguments - self._validate_options(options) + Second, it tries to merge all this information into a single object + containing all the found properties, and does some (basic) conflict + resolution when they arise. + + + When you create the Matcher, you can pass it: + - a list 'opts' of option names, that act as global flags + - a dict 'transfo_opts' of { transfo_name: (transfo_args, transfo_kwargs) } + with which to call the transfo.process() function. + """ + + valid_filetypes = ('autodetect', 'subtitle', 'info', 'video', + 'movie', 'moviesubtitle', 'movieinfo', + 'episode', 'episodesubtitle', 'episodeinfo') + if filetype not in valid_filetypes: + raise ValueError("filetype needs to be one of %s" % valid_filetypes) if not PY3 and not isinstance(filename, unicode): log.warning('Given filename to matcher is not unicode...') filename = filename.decode('utf-8') filename = normalize_unicode(filename) + + if opts is None: + opts = [] + if not isinstance(opts, list): + raise ValueError('opts must be a list of option names! Received: type=%s val=%s', + type(opts), opts) + + if transfo_opts is None: + transfo_opts = {} + if not isinstance(transfo_opts, dict): + raise ValueError('transfo_opts must be a dict of { transfo_name: (args, kwargs) }. '+ + 'Received: type=%s val=%s', type(transfo_opts), transfo_opts) + self.match_tree = MatchTree(filename) - self.options = options - self._transfo_calls = [] # sanity check: make sure we don't process a (mostly) empty string if clean_string(filename) == '': return - from guessit.plugins import transformers + mtree = self.match_tree + mtree.guess.set('type', filetype, confidence=1.0) - try: - mtree = self.match_tree - if 'type' in self.options: - mtree.guess.set('type', self.options['type'], confidence=0.0) + def apply_transfo(transfo_name, *args, **kwargs): + transfo = __import__('guessit.transfo.' + transfo_name, + globals=globals(), locals=locals(), + fromlist=['process'], level=0) + default_args, default_kwargs = transfo_opts.get(transfo_name, ((), {})) + all_args = args or default_args + all_kwargs = dict(default_kwargs) + all_kwargs.update(kwargs) # keep all kwargs merged together + transfo.process(mtree, *all_args, **all_kwargs) - # Process - for transformer in transformers.all_transformers(): - self._process(transformer, False) + # 1- first split our path into dirs + basename + ext + apply_transfo('split_path_components') - # Post-process - for transformer in transformers.all_transformers(): - self._process(transformer, True) + # 2- guess the file type now (will be useful later) + apply_transfo('guess_filetype', filetype) + if mtree.guess['type'] == 'unknown': + return - log.debug('Found match tree:\n%s' % u(mtree)) - except TransformerException as e: - log.debug('An error has occured in Transformer %s: %s' % (e.transformer, e)) + # 3- split each of those into explicit groups (separated by parentheses + # or square brackets) + apply_transfo('split_explicit_groups') - def _process(self, transformer, post=False): - if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options): - if post: - transformer.post_process(self.match_tree, self.options) - else: - transformer.process(self.match_tree, self.options) - self._transfo_calls.append(transformer) + # 4- try to match information for specific patterns + # NOTE: order needs to comply to the following: + # - website before language (eg: tvu.org.ru vs russian) + # - language before episodes_rexps + # - properties before language (eg: he-aac vs hebrew) + # - release_group before properties (eg: XviD-?? vs xvid) + if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'): + strategy = [ 'guess_date', 'guess_website', 'guess_release_group', + 'guess_properties', 'guess_language', + 'guess_video_rexps', + 'guess_episodes_rexps', 'guess_weak_episodes_rexps' ] + else: + strategy = [ 'guess_date', 'guess_website', 'guess_release_group', + 'guess_properties', 'guess_language', + 'guess_video_rexps' ] - @property - def second_pass_options(self): - second_pass_options = {} - for transformer in self._transfo_calls: - if hasattr(transformer, 'second_pass_options'): - transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options) - if transformer_second_pass_options: - second_pass_options.update(transformer_second_pass_options) + if 'nolanguage' in opts: + strategy.remove('guess_language') - return second_pass_options - def _validate_options(self, options): - valid_filetypes = ('subtitle', 'info', 'video', - 'movie', 'moviesubtitle', 'movieinfo', - 'episode', 'episodesubtitle', 'episodeinfo') + for name in strategy: + apply_transfo(name) - type = options.get('type') - if type and type not in valid_filetypes: - raise ValueError("filetype needs to be one of %s" % valid_filetypes) + # more guessers for both movies and episodes + apply_transfo('guess_bonus_features') + apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts)) + + if 'nocountry' not in opts: + apply_transfo('guess_country') + + apply_transfo('guess_idnumber') + + + # split into '-' separated subgroups (with required separator chars + # around the dash) + apply_transfo('split_on_dash') + + # 5- try to identify the remaining unknown groups by looking at their + # position relative to other known elements + if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'): + apply_transfo('guess_episode_info_from_position') + else: + apply_transfo('guess_movie_title_from_position') + + # 6- perform some post-processing steps + apply_transfo('post_process') + + log.debug('Found match tree:\n%s' % u(mtree)) def matched(self): return self.match_tree.matched() - - -def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None): - # automatically retrieve the log object from the caller frame - if not logger: - caller_frame = inspect.stack()[1][0] - logger = caller_frame.f_locals['self'].log - guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence) - return found_guess(node, guess, update_guess=update_guess, logger=logger) - - -def found_guess(node, guess, update_guess=True, logger=None): - if node.guess: - if update_guess: - node.guess.update_highest_confidence(guess) - else: - child = node.add_child(guess.metadata().span) - child.guess = guess - else: - node.guess = guess - log_found_guess(guess, logger) - return node.guess - - -def log_found_guess(guess, logger=None): - for k, v in guess.items(): - (logger or log).debug('Property found: %s=%s (confidence=%.2f)' % (k, v, guess.confidence(k))) - - -class GuessFinder(object): - def __init__(self, guess_func, confidence=None, logger=None, options=None): - self.guess_func = guess_func - self.confidence = confidence - self.logger = logger or log - self.options = options - - def process_nodes(self, nodes): - for node in nodes: - self.process_node(node) - - def process_node(self, node, iterative=True, partial_span=None): - value = None - if partial_span: - value = node.value[partial_span[0]:partial_span[1]] - else: - value = node.value - string = ' %s ' % value # add sentinels - - if not self.options: - matcher_result = self.guess_func(string, node) - else: - matcher_result = self.guess_func(string, node, self.options) - - if matcher_result: - if not isinstance(matcher_result, Guess): - result, span = matcher_result - else: - result, span = matcher_result, matcher_result.metadata().span - - if result: - # readjust span to compensate for sentinels - span = (span[0] - 1, span[1] - 1) - - # readjust span to compensate for partial_span - if partial_span: - span = (span[0] + partial_span[0], span[1] + partial_span[0]) - - partition_spans = None - if self.options and 'skip_nodes' in self.options: - skip_nodes = self.options.get('skip_nodes') - for skip_node in skip_nodes: - if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\ - skip_node.span == span: - partition_spans = node.get_partition_spans(skip_node.span) - partition_spans.remove(skip_node.span) - break - - if not partition_spans: - # restore sentinels compensation - - guess = None - if isinstance(result, Guess): - guess = result - else: - guess = Guess(result, confidence=self.confidence, input=string, span=span) - - if not iterative: - node.guess.update(guess) - else: - absolute_span = (span[0] + node.offset, span[1] + node.offset) - node.partition(span) - found_child = None - for child in node.children: - if child.span == absolute_span: - found_guess(child, guess, self.logger) - found_child = child - break - for child in node.children: - if not child is found_child: - self.process_node(child) - else: - for partition_span in partition_spans: - self.process_node(node, partial_span=partition_span) diff --git a/lib/guessit/matchtree.py b/lib/guessit/matchtree.py index e2a7aa2a..0725e835 100644 --- a/lib/guessit/matchtree.py +++ b/lib/guessit/matchtree.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2011 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,14 +18,12 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -import guessit # @UnusedImport needed for doctests -from guessit import UnicodeMixin, base_text_type +from __future__ import unicode_literals +from guessit import UnicodeMixin, base_text_type, Guess from guessit.textutils import clean_string, str_fill from guessit.patterns import group_delimiters from guessit.guess import (merge_similar_guesses, merge_all, - choose_int, choose_string, Guess) + choose_int, choose_string) import copy import logging @@ -33,45 +31,8 @@ log = logging.getLogger(__name__) class BaseMatchTree(UnicodeMixin): - """A BaseMatchTree is a tree covering the filename, where each - node represents a substring in the filename and can have a ``Guess`` - associated with it that contains the information that has been guessed - in this node. Nodes can be further split into subnodes until a proper - split has been found. - - Each node has the following attributes: - - string = the original string of which this node represents a region - - span = a pair of (begin, end) indices delimiting the substring - - parent = parent node - - children = list of children nodes - - guess = Guess() - - BaseMatchTrees are displayed in the following way: - - >>> path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' - >>> print(guessit.IterativeMatcher(path).match_tree) - 000000 1111111111111111 2222222222222222222222222222222222222222222 333 - 000000 0000000000111111 0000000000111111222222222222222222222222222 000 - 011112 011112000011111222222222222222222 000 - 011112222222222222 - 0000011112222 - 01112 0111 - Movies/__________(____)/Dark.City.(____).DC._____.____.___.____-___.___ - tttttttttt yyyy yyyy fffff ssss aaa vvvv rrr ccc - Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv - - The last line contains the filename, which you can use a reference. - The previous line contains the type of property that has been found. - The line before that contains the filename, where all the found groups - have been blanked. Basically, what is left on this line are the leftover - groups which could not be identified. - - The lines before that indicate the indices of the groups in the tree. - - For instance, the part of the filename 'BDRip' is the leaf with index - ``(2, 2, 1)`` (read from top to bottom), and its meaning is 'format' - (as shown by the ``f``'s on the last-but-one line). - """ + """A MatchTree represents the hierarchical split of a string into its + constituent semantic groups.""" def __init__(self, string='', span=None, parent=None): self.string = string @@ -82,14 +43,10 @@ class BaseMatchTree(UnicodeMixin): @property def value(self): - """Return the substring that this node matches.""" return self.string[self.span[0]:self.span[1]] @property def clean_value(self): - """Return a cleaned value of the matched substring, with better - presentation formatting (punctuation marks removed, duplicate - spaces, ...)""" return clean_string(self.value) @property @@ -98,8 +55,6 @@ class BaseMatchTree(UnicodeMixin): @property def info(self): - """Return a dict containing all the info guessed by this node, - subnodes included.""" result = dict(self.guess) for c in self.children: @@ -109,7 +64,6 @@ class BaseMatchTree(UnicodeMixin): @property def root(self): - """Return the root node of the tree.""" if not self.parent: return self @@ -117,43 +71,28 @@ class BaseMatchTree(UnicodeMixin): @property def depth(self): - """Return the depth of this node.""" if self.is_leaf(): return 0 return 1 + max(c.depth for c in self.children) def is_leaf(self): - """Return whether this node is a leaf or not.""" return self.children == [] def add_child(self, span): - """Add a new child node to this node with the given span.""" child = MatchTree(self.string, span=span, parent=self) self.children.append(child) - return child - def get_partition_spans(self, indices): - """Return the list of absolute spans for the regions of the original - string defined by splitting this node at the given indices (relative - to this node)""" + def partition(self, indices): indices = sorted(indices) if indices[0] != 0: indices.insert(0, 0) if indices[-1] != len(self.value): indices.append(len(self.value)) - spans = [] for start, end in zip(indices[:-1], indices[1:]): - spans.append((self.offset + start, - self.offset + end)) - return spans - - def partition(self, indices): - """Partition this node by splitting it at the given indices, - relative to this node.""" - for partition_span in self.get_partition_spans(indices): - self.add_child(span=partition_span) + self.add_child(span=(self.offset + start, + self.offset + end)) def split_on_components(self, components): offset = 0 @@ -165,7 +104,6 @@ class BaseMatchTree(UnicodeMixin): offset = end def nodes_at_depth(self, depth): - """Return all the nodes at a given depth in the tree""" if depth == 0: yield self @@ -175,32 +113,26 @@ class BaseMatchTree(UnicodeMixin): @property def node_idx(self): - """Return this node's index in the tree, as a tuple. - If this node is the root of the tree, then return ().""" if self.parent is None: return () return self.parent.node_idx + (self.parent.children.index(self),) def node_at(self, idx): - """Return the node at the given index in the subtree rooted at - this node.""" if not idx: return self try: return self.children[idx[0]].node_at(idx[1:]) - except IndexError: + except: raise ValueError('Non-existent node index: %s' % (idx,)) def nodes(self): - """Return all the nodes and subnodes in this tree.""" yield self for child in self.children: for node in child.nodes(): yield node def _leaves(self): - """Return a generator over all the nodes that are leaves.""" if self.is_leaf(): yield self else: @@ -209,73 +141,10 @@ class BaseMatchTree(UnicodeMixin): for leaf in child._leaves(): yield leaf - def group_node(self): - return self._other_group_node(0) - - def previous_group_node(self): - return self._other_group_node(-1) - - def next_group_node(self): - return self._other_group_node(+1) - - def _other_group_node(self, offset): - if len(self.node_idx) > 1: - group_idx = self.node_idx[:2] - if group_idx[1] + offset >= 0: - other_group_idx = (group_idx[0], group_idx[1] + offset) - try: - other_group_node = self.root.node_at(other_group_idx) - return other_group_node - except ValueError: - pass - return None - def leaves(self): - """Return a list of all the nodes that are leaves.""" return list(self._leaves()) - def previous_leaf(self, leaf): - """Return previous leaf for this node""" - return self._other_leaf(leaf, -1) - - def next_leaf(self, leaf): - """Return next leaf for this node""" - return self._other_leaf(leaf, +1) - - def _other_leaf(self, leaf, offset): - leaves = self.leaves() - index = leaves.index(leaf) + offset - if index > 0 and index < len(leaves): - return leaves[index] - return None - - def previous_leaves(self, leaf): - """Return previous leaves for this node""" - leaves = self.leaves() - index = leaves.index(leaf) - if index > 0 and index < len(leaves): - previous_leaves = leaves[:index] - previous_leaves.reverse() - return previous_leaves - return [] - - def next_leaves(self, leaf): - """Return next leaves for this node""" - leaves = self.leaves() - index = leaves.index(leaf) - if index > 0 and index < len(leaves): - return leaves[index + 1:len(leaves)] - return [] - def to_string(self): - """Return a readable string representation of this tree. - - The result is a multi-line string, where the lines are: - - line 1 -> N-2: each line contains the nodes at the given depth in the tree - - line N-2: original string where all the found groups have been blanked - - line N-1: type of property that has been found - - line N: the original string, which you can use a reference. - """ empty_line = ' ' * len(self.string) def to_hex(x): @@ -284,27 +153,23 @@ class BaseMatchTree(UnicodeMixin): return x def meaning(result): - mmap = {'episodeNumber': 'E', - 'season': 'S', - 'extension': 'e', - 'format': 'f', - 'language': 'l', - 'country': 'C', - 'videoCodec': 'v', - 'videoProfile': 'v', - 'audioCodec': 'a', - 'audioProfile': 'a', - 'audioChannels': 'a', - 'website': 'w', - 'container': 'c', - 'series': 'T', - 'title': 't', - 'date': 'd', - 'year': 'y', - 'releaseGroup': 'r', - 'screenSize': 's', - 'other': 'o' - } + mmap = { 'episodeNumber': 'E', + 'season': 'S', + 'extension': 'e', + 'format': 'f', + 'language': 'l', + 'country': 'C', + 'videoCodec': 'v', + 'audioCodec': 'a', + 'website': 'w', + 'container': 'c', + 'series': 'T', + 'title': 't', + 'date': 'd', + 'year': 'y', + 'releaseGroup': 'r', + 'screenSize': 's' + } if result is None: return ' ' @@ -315,7 +180,7 @@ class BaseMatchTree(UnicodeMixin): return 'x' - lines = [empty_line] * (self.depth + 2) # +2: remaining, meaning + lines = [ empty_line ] * (self.depth + 2) # +2: remaining, meaning lines[-2] = self.string for node in self.nodes(): @@ -333,22 +198,16 @@ class BaseMatchTree(UnicodeMixin): lines.append(self.string) - return '\n'.join(l.rstrip() for l in lines) + return '\n'.join(lines) def __unicode__(self): return self.to_string() - def __repr__(self): - return '' % self.value - class MatchTree(BaseMatchTree): """The MatchTree contains a few "utility" methods which are not necessary for the BaseMatchTree, but add a lot of convenience for writing - higher-level rules. - """ - - _matched_result = None + higher-level rules.""" def _unidentified_leaves(self, valid=lambda leaf: len(leaf.clean_value) >= 2): @@ -358,12 +217,11 @@ class MatchTree(BaseMatchTree): def unidentified_leaves(self, valid=lambda leaf: len(leaf.clean_value) >= 2): - """Return a list of leaves that are not empty.""" return list(self._unidentified_leaves(valid)) def _leaves_containing(self, property_name): if isinstance(property_name, base_text_type): - property_name = [property_name] + property_name = [ property_name ] for leaf in self._leaves(): for prop in property_name: @@ -372,11 +230,9 @@ class MatchTree(BaseMatchTree): break def leaves_containing(self, property_name): - """Return a list of leaves that guessed the given property.""" return list(self._leaves_containing(property_name)) def first_leaf_containing(self, property_name): - """Return the first leaf containing the given property.""" try: return next(self._leaves_containing(property_name)) except StopIteration: @@ -389,8 +245,6 @@ class MatchTree(BaseMatchTree): yield leaf def previous_unidentified_leaves(self, node): - """Return a list of non-empty leaves that are before the given - node (in the string).""" return list(self._previous_unidentified_leaves(node)) def _previous_leaves_containing(self, node, property_name): @@ -400,8 +254,6 @@ class MatchTree(BaseMatchTree): yield leaf def previous_leaves_containing(self, node, property_name): - """Return a list of leaves containing the given property that are - before the given node (in the string).""" return list(self._previous_leaves_containing(node, property_name)) def is_explicit(self): @@ -410,30 +262,26 @@ class MatchTree(BaseMatchTree): return (self.value[0] + self.value[-1]) in group_delimiters def matched(self): - """Return a single guess that contains all the info found in the - nodes of this tree, trying to merge properties as good as possible. - """ - if not self._matched_result: - # we need to make a copy here, as the merge functions work in place and - # calling them on the match tree would modify it - parts = [copy.copy(node.guess) for node in self.nodes() if node.guess] + # we need to make a copy here, as the merge functions work in place and + # calling them on the match tree would modify it + parts = [node.guess for node in self.nodes() if node.guess] + parts = copy.deepcopy(parts) - # 1- try to merge similar information together and give it a higher - # confidence - for int_part in ('year', 'season', 'episodeNumber'): - merge_similar_guesses(parts, int_part, choose_int) + # 1- try to merge similar information together and give it a higher + # confidence + for int_part in ('year', 'season', 'episodeNumber'): + merge_similar_guesses(parts, int_part, choose_int) - for string_part in ('title', 'series', 'container', 'format', - 'releaseGroup', 'website', 'audioCodec', - 'videoCodec', 'screenSize', 'episodeFormat', - 'audioChannels', 'idNumber'): - merge_similar_guesses(parts, string_part, choose_string) + for string_part in ('title', 'series', 'container', 'format', + 'releaseGroup', 'website', 'audioCodec', + 'videoCodec', 'screenSize', 'episodeFormat', + 'audioChannels', 'idNumber'): + merge_similar_guesses(parts, string_part, choose_string) - # 2- merge the rest, potentially discarding information not properly - # merged before - result = merge_all(parts, - append=['language', 'subtitleLanguage', 'other', 'special']) + # 2- merge the rest, potentially discarding information not properly + # merged before + result = merge_all(parts, + append=['language', 'subtitleLanguage', 'other']) - log.debug('Final result: ' + result.nice_string()) - self._matched_result = result - return self._matched_result + log.debug('Final result: ' + result.nice_string()) + return result diff --git a/lib/guessit/options.py b/lib/guessit/options.py deleted file mode 100644 index 003ca86d..00000000 --- a/lib/guessit/options.py +++ /dev/null @@ -1,25 +0,0 @@ -from optparse import OptionParser - -option_parser = OptionParser(usage='usage: %prog [options] file1 [file2...]') -option_parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, - help='display debug output') -option_parser.add_option('-p', '--properties', dest='properties', action='store_true', default=False, - help='Display properties that can be guessed.') -option_parser.add_option('-l', '--values', dest='values', action='store_true', default=False, - help='Display property values that can be guessed.') -option_parser.add_option('-s', '--transformers', dest='transformers', action='store_true', default=False, - help='Display transformers that can be used.') -option_parser.add_option('-i', '--info', dest='info', default='filename', - help='the desired information type: filename, hash_mpc or a hash from python\'s ' - 'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of ' - 'them, comma-separated') -option_parser.add_option('-n', '--name-only', dest='name_only', action='store_true', default=False, - help='Parse files as name only. Disable folder parsing, extension parsing, and file content analysis.') -option_parser.add_option('-t', '--type', dest='type', default=None, - help='the suggested file type: movie, episode. If undefined, type will be guessed.') -option_parser.add_option('-a', '--advanced', dest='advanced', action='store_true', default=False, - help='display advanced information for filename guesses, as json output') -option_parser.add_option('-y', '--yaml', dest='yaml', action='store_true', default=False, - help='display information for filename guesses as yaml output (like unit-test)') -option_parser.add_option('-d', '--demo', action='store_true', dest='demo', default=False, - help='run a few builtin tests instead of analyzing a file') diff --git a/lib/guessit/patterns.py b/lib/guessit/patterns.py new file mode 100644 index 00000000..f803a11c --- /dev/null +++ b/lib/guessit/patterns.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2011 Nicolas Wack +# Copyright (c) 2011 Ricard Marxer +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import unicode_literals +import re + + +subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa' ] + +info_exts = [ 'nfo' ] + +video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', + 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', + 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv'] + +group_delimiters = [ '()', '[]', '{}' ] + +# separator character regexp +sep = r'[][,)(}{+ /\._-]' # regexp art, hehe :D + +# character used to represent a deleted char (when matching groups) +deleted = '_' + +# format: [ (regexp, confidence, span_adjust) ] +episode_rexps = [ # ... Season 2 ... + (r'season (?P[0-9]+)', 1.0, (0, 0)), + (r'saison (?P[0-9]+)', 1.0, (0, 0)), + + # ... s02e13 ... + (r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), + + # ... s03-x02 ... # FIXME: redundant? remove it? + #(r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), + + # ... 2x13 ... + (r'[^0-9](?P[0-9]{1,2})[^0-9 .-]?(?P(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)), + + # ... s02 ... + #(sep + r's(?P[0-9]{1,2})' + sep, 0.6, (1, -1)), + (r's(?P[0-9]{1,2})[^0-9]', 0.6, (0, -1)), + + # v2 or v3 for some mangas which have multiples rips + (r'(?P[0-9]{1,3})v[23]' + sep, 0.6, (0, 0)), + + # ... ep 23 ... + ('ep' + sep + r'(?P[0-9]{1,2})[^0-9]', 0.7, (0, -1)), + + # ... e13 ... for a mini-series without a season number + (sep + r'e(?P[0-9]{1,2})' + sep, 0.6, (1, -1)) + + ] + + +weak_episode_rexps = [ # ... 213 or 0106 ... + (sep + r'(?P[0-9]{2,4})' + sep, (1, -1)) + ] + +non_episode_title = [ 'extras', 'rip' ] + + +video_rexps = [ # cd number + (r'cd ?(?P[0-9])( ?of ?(?P[0-9]))?', 1.0, (0, 0)), + (r'(?P[1-9]) cds?', 0.9, (0, 0)), + + # special editions + (r'edition' + sep + r'(?Pcollector)', 1.0, (0, 0)), + (r'(?Pcollector)' + sep + 'edition', 1.0, (0, 0)), + (r'(?Pspecial)' + sep + 'edition', 1.0, (0, 0)), + (r'(?Pcriterion)' + sep + 'edition', 1.0, (0, 0)), + + # director's cut + (r"(?Pdirector'?s?" + sep + "cut)", 1.0, (0, 0)), + + # video size + (r'(?P[0-9]{3,4})x(?P[0-9]{3,4})', 0.9, (0, 0)), + + # website + (r'(?Pwww(\.[a-zA-Z0-9]+){2,3})', 0.8, (0, 0)), + + # bonusNumber: ... x01 ... + (r'x(?P[0-9]{1,2})', 1.0, (0, 0)), + + # filmNumber: ... f01 ... + (r'f(?P[0-9]{1,2})', 1.0, (0, 0)) + ] + +websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com', + 'sharethefiles.com' ] + +unlikely_series = [ 'series' ] + + +# prop_multi is a dict of { property_name: { canonical_form: [ pattern ] } } +# pattern is a string considered as a regexp, with the addition that dashes are +# replaced with '([ \.-_])?' which matches more types of separators (or none) +# note: simpler patterns need to be at the end of the list to not shadow more +# complete ones, eg: 'AAC' needs to come after 'He-AAC' +# ie: from most specific to less specific +prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ], + 'HD-DVD': [ 'HD-(?:DVD)?-Rip', 'HD-DVD' ], + 'BluRay': [ 'Blu-ray', 'B[DR]Rip' ], + 'HDTV': [ 'HD-TV' ], + 'DVB': [ 'DVB-Rip', 'DVB', 'PD-TV' ], + 'WEBRip': [ 'WEB-Rip' ], + 'Screener': [ 'DVD-SCR', 'Screener' ], + 'VHS': [ 'VHS' ], + 'WEB-DL': [ 'WEB-DL' ] }, + + 'is3D': { True: [ '3D' ] }, + + 'screenSize': { '480p': [ '480[pi]?' ], + '720p': [ '720[pi]?' ], + '1080i': [ '1080i' ], + '1080p': [ '1080p', '1080[^i]' ] }, + + 'videoCodec': { 'XviD': [ 'Xvid' ], + 'DivX': [ 'DVDivX', 'DivX' ], + 'h264': [ '[hx]-264' ], + 'Rv10': [ 'Rv10' ], + 'Mpeg2': [ 'Mpeg2' ] }, + + # has nothing to do here (or on filenames for that matter), but some + # releases use it and it helps to identify release groups, so we adapt + 'videoApi': { 'DXVA': [ 'DXVA' ] }, + + 'audioCodec': { 'AC3': [ 'AC3' ], + 'DTS': [ 'DTS' ], + 'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] }, + + 'audioChannels': { '5.1': [ r'5\.1', 'DD5[._ ]1', '5ch' ] }, + + 'episodeFormat': { 'Minisode': [ 'Minisodes?' ] } + + } + +# prop_single dict of { property_name: [ canonical_form ] } +prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA', + 'CHD', 'ViTE', 'TLF', 'FLAiTE', + 'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', + 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL', + 'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM', + '2HD', 'CTU', 'HALCYON', 'EbP', 'SiTV', + 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV', + 'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3', + 'TrollHD', 'ECI' + ], + + # potentially confusing release group names (they are words) + 'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION', + 'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD', + 'REPTiLE', + ], + + 'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5', + 'complete', 'classic', # not so sure about these ones, could appear in a title + 'ws' ] # widescreen + } + +_dash = '-' +_psep = '[-. _]?' + +def _to_rexp(prop): + return re.compile(prop.replace(_dash, _psep), re.IGNORECASE) + +# properties_rexps dict of { property_name: { canonical_form: [ rexp ] } } +# containing the rexps compiled from both prop_multi and prop_single +properties_rexps = dict((type, dict((canonical_form, + [ _to_rexp(pattern) for pattern in patterns ]) + for canonical_form, patterns in props.items())) + for type, props in prop_multi.items()) + +properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_form) ]) + for canonical_form in props)) + for type, props in prop_single.items())) + + + +def find_properties(string): + result = [] + for property_name, props in properties_rexps.items(): + # FIXME: this should be done in a more flexible way... + if property_name in ['weakReleaseGroup']: + continue + + for canonical_form, rexps in props.items(): + for value_rexp in rexps: + match = value_rexp.search(string) + if match: + start, end = match.span() + # make sure our word is always surrounded by separators + # note: sep is a regexp, but in this case using it as + # a char sequence achieves the same goal + if ((start > 0 and string[start-1] not in sep) or + (end < len(string) and string[end] not in sep)): + continue + + result.append((property_name, canonical_form, start, end)) + return result + + +property_synonyms = { 'Special Edition': [ 'Special' ], + 'Collector Edition': [ 'Collector' ], + 'Criterion Edition': [ 'Criterion' ] + } + + +def revert_synonyms(): + reverse = {} + + for canonical, synonyms in property_synonyms.items(): + for synonym in synonyms: + reverse[synonym.lower()] = canonical + + return reverse + + +reverse_synonyms = revert_synonyms() + + +def canonical_form(string): + return reverse_synonyms.get(string.lower(), string) + + +def compute_canonical_form(property_name, value): + """Return the canonical form of a property given its type if it is a valid + one, None otherwise.""" + if isinstance(value, basestring): + for canonical_form, rexps in properties_rexps[property_name].items(): + for rexp in rexps: + if rexp.match(value): + return canonical_form + return None diff --git a/lib/guessit/patterns/__init__.py b/lib/guessit/patterns/__init__.py deleted file mode 100644 index 0210c4d9..00000000 --- a/lib/guessit/patterns/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import re - -from guessit import base_text_type - -group_delimiters = ['()', '[]', '{}'] - -# separator character regexp -sep = r'[][,)(}:{+ /\._-]' # regexp art, hehe :D - -_dash = '-' -_psep = '[\W_]?' - - -def build_or_pattern(patterns): - """Build a or pattern string from a list of possible patterns - """ - or_pattern = '' - for pattern in patterns: - if not or_pattern: - or_pattern += '(?:' - else: - or_pattern += '|' - or_pattern += ('(?:%s)' % pattern) - or_pattern += ')' - return or_pattern - - -def compile_pattern(pattern, enhance=True): - """Compile and enhance a pattern - - :param pattern: Pattern to compile (regexp). - :type pattern: string - - :param pattern: Enhance pattern before compiling. - :type pattern: string - - :return: The compiled pattern - :rtype: regular expression object - """ - return re.compile(enhance_pattern(pattern) if enhance else pattern, re.IGNORECASE) - - -def enhance_pattern(pattern): - """Enhance pattern to match more equivalent values. - - '-' are replaced by '[\W_]?', which matches more types of separators (or none) - - :param pattern: Pattern to enhance (regexp). - :type pattern: string - - :return: The enhanced pattern - :rtype: string - """ - return pattern.replace(_dash, _psep) diff --git a/lib/guessit/patterns/extension.py b/lib/guessit/patterns/extension.py deleted file mode 100644 index a8d931af..00000000 --- a/lib/guessit/patterns/extension.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# Copyright (c) 2011 Ricard Marxer -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -subtitle_exts = ['srt', 'idx', 'sub', 'ssa'] - -info_exts = ['nfo'] - -video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', - 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', - 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv', - 'iso'] diff --git a/lib/guessit/patterns/numeral.py b/lib/guessit/patterns/numeral.py deleted file mode 100644 index 5d0e468b..00000000 --- a/lib/guessit/patterns/numeral.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import re - -digital_numeral = '\d{1,3}' - -roman_numeral = "(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})" - -english_word_numeral_list = [ - 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', - 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty' -] - -french_word_numeral_list = [ - 'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', - 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt' -] - -french_alt_word_numeral_list = [ - 'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', - 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt' -] - - -def __build_word_numeral(*args, **kwargs): - re = None - for word_list in args: - for word in word_list: - if not re: - re = '(?:(?=\w+)' - else: - re += '|' - re += word - re += ')' - return re - -word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list) - -numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')' - -__romanNumeralMap = ( - ('M', 1000), - ('CM', 900), - ('D', 500), - ('CD', 400), - ('C', 100), - ('XC', 90), - ('L', 50), - ('XL', 40), - ('X', 10), - ('IX', 9), - ('V', 5), - ('IV', 4), - ('I', 1) - ) - -__romanNumeralPattern = re.compile('^' + roman_numeral + '$') - - -def __parse_roman(value): - """convert Roman numeral to integer""" - if not __romanNumeralPattern.search(value): - raise ValueError('Invalid Roman numeral: %s' % value) - - result = 0 - index = 0 - for numeral, integer in __romanNumeralMap: - while value[index:index + len(numeral)] == numeral: - result += integer - index += len(numeral) - return result - - -def __parse_word(value): - """Convert Word numeral to integer""" - for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]: - try: - return word_list.index(value) - except ValueError: - pass - raise ValueError - - -_clean_re = re.compile('[^\d]*(\d+)[^\d]*') - - -def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True): - """Parse a numeric value into integer. - - input can be an integer as a string, a roman numeral or a word - - :param value: Value to parse. Can be an integer, roman numeral or word. - :type value: string - - :return: Numeric value, or None if value can't be parsed - :rtype: int - """ - if int_enabled: - try: - if clean: - match = _clean_re.match(value) - if match: - clean_value = match.group(1) - return int(clean_value) - return int(value) - except ValueError: - pass - if roman_enabled: - try: - if clean: - for word in value.split(): - try: - return __parse_roman(word) - except ValueError: - pass - return __parse_roman(value) - except ValueError: - pass - if word_enabled: - try: - if clean: - for word in value.split(): - try: - return __parse_word(word) - except ValueError: - pass - return __parse_word(value) - except ValueError: - pass - raise ValueError('Invalid numeral: ' + value) diff --git a/lib/guessit/plugins/__init__.py b/lib/guessit/plugins/__init__.py deleted file mode 100644 index 6a63e4e1..00000000 --- a/lib/guessit/plugins/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals diff --git a/lib/guessit/plugins/transformers.py b/lib/guessit/plugins/transformers.py deleted file mode 100644 index bbf8d407..00000000 --- a/lib/guessit/plugins/transformers.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from stevedore import ExtensionManager -from pkg_resources import EntryPoint - -from stevedore.extension import Extension -from logging import getLogger - -log = getLogger(__name__) - - -class Transformer(object): # pragma: no cover - def __init__(self, priority=0): - self.priority = priority - self.log = getLogger(self.name) - - @property - def name(self): - return self.__class__.__name__ - - def supported_properties(self): - return {} - - def second_pass_options(self, mtree, options=None): - return None - - def should_process(self, mtree, options=None): - return True - - def process(self, mtree, options=None): - pass - - def post_process(self, mtree, options=None): - pass - - def rate_quality(self, guess, *props): - return 0 - - -class CustomTransformerExtensionManager(ExtensionManager): - def __init__(self, namespace='guessit.transformer', invoke_on_load=True, - invoke_args=(), invoke_kwds={}, propagate_map_exceptions=True, on_load_failure_callback=None, - verify_requirements=False): - super(CustomTransformerExtensionManager, self).__init__(namespace=namespace, - invoke_on_load=invoke_on_load, - invoke_args=invoke_args, - invoke_kwds=invoke_kwds, - propagate_map_exceptions=propagate_map_exceptions, - on_load_failure_callback=on_load_failure_callback, - verify_requirements=verify_requirements) - - def order_extensions(self, extensions): - """Order the loaded transformers - - It should follow those rules - - website before language (eg: tvu.org.ru vs russian) - - language before episodes_rexps - - properties before language (eg: he-aac vs hebrew) - - release_group before properties (eg: XviD-?? vs xvid) - """ - extensions.sort(key=lambda ext: -ext.obj.priority) - return extensions - - def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds, verify_requirements): - if not ep.dist: - plugin = ep.load(require=False) - else: - plugin = ep.load(require=verify_requirements) - if invoke_on_load: - obj = plugin(*invoke_args, **invoke_kwds) - else: - obj = None - return Extension(ep.name, ep, plugin, obj) - - def _load_plugins(self, invoke_on_load, invoke_args, invoke_kwds, verify_requirements): - return self.order_extensions(super(CustomTransformerExtensionManager, self)._load_plugins(invoke_on_load, invoke_args, invoke_kwds, verify_requirements)) - - def objects(self): - return self.map(self._get_obj) - - def _get_obj(self, ext): - return ext.obj - - def object(self, name): - try: - return self[name].obj - except KeyError: - return None - - def register_module(self, name, module_name): - ep = EntryPoint(name, module_name) - loaded = self._load_one_plugin(ep, invoke_on_load=True, invoke_args=(), invoke_kwds={}) - if loaded: - self.extensions.append(loaded) - self.extensions = self.order_extensions(self.extensions) - self._extensions_by_name = None - - -class DefaultTransformerExtensionManager(CustomTransformerExtensionManager): - @property - def _internal_entry_points(self): - return ['split_path_components = guessit.transfo.split_path_components:SplitPathComponents', - 'guess_filetype = guessit.transfo.guess_filetype:GuessFiletype', - 'split_explicit_groups = guessit.transfo.split_explicit_groups:SplitExplicitGroups', - 'guess_date = guessit.transfo.guess_date:GuessDate', - 'guess_website = guessit.transfo.guess_website:GuessWebsite', - 'guess_release_group = guessit.transfo.guess_release_group:GuessReleaseGroup', - 'guess_properties = guessit.transfo.guess_properties:GuessProperties', - 'guess_language = guessit.transfo.guess_language:GuessLanguage', - 'guess_video_rexps = guessit.transfo.guess_video_rexps:GuessVideoRexps', - 'guess_episodes_rexps = guessit.transfo.guess_episodes_rexps:GuessEpisodesRexps', - 'guess_weak_episodes_rexps = guessit.transfo.guess_weak_episodes_rexps:GuessWeakEpisodesRexps', - 'guess_bonus_features = guessit.transfo.guess_bonus_features:GuessBonusFeatures', - 'guess_year = guessit.transfo.guess_year:GuessYear', - 'guess_country = guessit.transfo.guess_country:GuessCountry', - 'guess_idnumber = guessit.transfo.guess_idnumber:GuessIdnumber', - 'split_on_dash = guessit.transfo.split_on_dash:SplitOnDash', - 'guess_episode_info_from_position = guessit.transfo.guess_episode_info_from_position:GuessEpisodeInfoFromPosition', - 'guess_movie_title_from_position = guessit.transfo.guess_movie_title_from_position:GuessMovieTitleFromPosition', - 'guess_episode_special = guessit.transfo.guess_episode_special:GuessEpisodeSpecial'] - - def _find_entry_points(self, namespace): - entry_points = {} - # Internal entry points - if namespace == self.namespace: - for internal_entry_point_str in self._internal_entry_points: - internal_entry_point = EntryPoint.parse(internal_entry_point_str) - entry_points[internal_entry_point.name] = internal_entry_point - - # Package entry points - setuptools_entrypoints = super(DefaultTransformerExtensionManager, self)._find_entry_points(namespace) - for setuptools_entrypoint in setuptools_entrypoints: - entry_points[setuptools_entrypoint.name] = setuptools_entrypoint - - return list(entry_points.values()) - -_extensions = None - - -def all_transformers(): - return _extensions.objects() - - -def get_transformer(name): - return _extensions.object(name) - - -def add_transformer(name, module_name): - _extensions.register_module(name, module_name) - - -def reload(custom=False): - """ - Reload extension manager with default or custom one. - :param custom: if True, custom manager will be used, else default one. - Default manager will load default extensions from guessit and setuptools packaging extensions - Custom manager will not load default extensions from guessit, using only setuptools packaging extensions. - :type custom: boolean - """ - global _extensions - if custom: - _extensions = CustomTransformerExtensionManager() - else: - _extensions = DefaultTransformerExtensionManager() - -reload() diff --git a/lib/guessit/quality.py b/lib/guessit/quality.py deleted file mode 100644 index 870bbdbb..00000000 --- a/lib/guessit/quality.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import all_transformers - - -def best_quality_properties(props, *guesses): - """Retrieve the best quality guess, based on given properties - - :param props: Properties to include in the rating - :type props: list of strings - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - for transformer in all_transformers(): - rate = transformer.rate_quality(guess, *props) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - - -def best_quality(*guesses): - """Retrieve the best quality guess. - - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - for transformer in all_transformers(): - rate = transformer.rate_quality(guess) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess diff --git a/lib/guessit/slogging.py b/lib/guessit/slogging.py index 5ccf8425..39591a20 100644 --- a/lib/guessit/slogging.py +++ b/lib/guessit/slogging.py @@ -1,28 +1,28 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Smewt - A smart collection manager +# Copyright (c) 2011 Nicolas Wack # -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by +# Smewt is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # -# GuessIt is distributed in the hope that it will be useful, +# Smewt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. +# GNU General Public License for more details. # -# You should have received a copy of the Lesser GNU General Public License +# You should have received a copy of the GNU General Public License # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals import logging import sys -import os +import os, os.path + GREEN_FONT = "\x1B[0;32m" YELLOW_FONT = "\x1B[0;33m" @@ -31,7 +31,7 @@ RED_FONT = "\x1B[0;31m" RESET_FONT = "\x1B[0m" -def setupLogging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): # pragma: no cover +def setupLogging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): """Set up a nice colored logger as the main application logger.""" class SimpleFormatter(logging.Formatter): diff --git a/lib/guessit/test/__init__.py b/lib/guessit/test/__init__.py deleted file mode 100644 index 650a3116..00000000 --- a/lib/guessit/test/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import logging -from guessit.slogging import setupLogging -setupLogging() -logging.disable(logging.INFO) diff --git a/lib/guessit/test/__main__.py b/lib/guessit/test/__main__.py deleted file mode 100644 index 32b8dd10..00000000 --- a/lib/guessit/test/__main__.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals -from guessit.test import (test_api, test_autodetect, test_autodetect_all, test_doctests, - test_episode, test_hashes, test_language, test_main, - test_matchtree, test_movie, test_quality, test_utils) -from unittest import TextTestRunner - - -import logging - -def main(): - for suite in [test_api.suite, test_autodetect.suite, - test_autodetect_all.suite, test_doctests.suite, - test_episode.suite, test_hashes.suite, test_language.suite, - test_main.suite, test_matchtree.suite, test_movie.suite, - test_quality.suite, test_utils.suite]: - TextTestRunner(verbosity=2).run(suite) - - -if __name__ == '__main__': - main() diff --git a/lib/guessit/test/autodetect.yaml b/lib/guessit/test/autodetect.yaml deleted file mode 100644 index c2da5ba8..00000000 --- a/lib/guessit/test/autodetect.yaml +++ /dev/null @@ -1,289 +0,0 @@ -? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv -: type: movie - title: Fear and Loathing in Las Vegas - year: 1998 - screenSize: 720p - format: HD-DVD - audioCodec: DTS - videoCodec: h264 - releaseGroup: ESiR - -? Leopard.dmg -: type: unknown - extension: dmg - -? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi -: type: episode - series: Duckman - season: 1 - episodeNumber: 1 - title: I, Duckman - date: 2002-11-07 - -? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi -: type: episode - series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? Neverwhere.05.Down.Street.[tvu.org.ru].avi -: type: episode - series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi -: type: episode - series: Breaking Bad - episodeFormat: Minisode - episodeNumber: 1 - title: Good Cop Bad Cop - format: WEBRip - videoCodec: XviD - -? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi -: type: episode - series: Kaamelott - episodeNumber: 23 - title: Le Forfait - -? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv -: type: movie - title: The Doors - year: 1991 - date: 2008-03-09 - format: BluRay - screenSize: 720p - audioCodec: AC3 - videoCodec: h264 - releaseGroup: HiS@SiLUHD - language: english - website: sharethefiles.com - -? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm -: type: movie - title: M.A.S.H. - year: 1970 - videoCodec: DivX - format: DVD - -? the.mentalist.501.hdtv-lol.mp4 -: type: episode - series: The Mentalist - season: 5 - episodeNumber: 1 - format: HDTV - releaseGroup: LOL - -? the.simpsons.2401.hdtv-lol.mp4 -: type: episode - series: The Simpsons - season: 24 - episodeNumber: 1 - format: HDTV - releaseGroup: LOL - -? Homeland.S02E01.HDTV.x264-EVOLVE.mp4 -: type: episode - series: Homeland - season: 2 - episodeNumber: 1 - format: HDTV - videoCodec: h264 - releaseGroup: EVOLVE - -? /media/Band_of_Brothers-e01-Currahee.mkv -: type: episode - series: Band of Brothers - episodeNumber: 1 - title: Currahee - -? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv -: type: episode - series: Band of Brothers - bonusNumber: 2 - bonusTitle: We Stand Alone Together - -? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv -: type: movie - title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 2 - bonusTitle: Stunts - -? /TV Shows/new.girl.117.hdtv-lol.mp4 -: type: episode - series: New Girl - season: 1 - episodeNumber: 17 - format: HDTV - releaseGroup: LOL - -? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi -: type: episode - series: The Office (US) - country: US - season: 1 - episodeNumber: 3 - title: Health Care - format: HDTV - videoCodec: XviD - releaseGroup: LOL - -? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 -: type: movie - title: The Insider - year: 1999 - bonusNumber: 2 - bonusTitle: 60 Minutes Interview-1996 - -? OSS_117--Cairo,_Nest_of_Spies.mkv -: type: movie - title: OSS 117--Cairo, Nest of Spies - -? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv -: type: movie - title: Rush Beyond The Lighted Stage - bonusNumber: 9 - bonusTitle: Between Sun and Moon-2002 Hartford - -? House.Hunters.International.S56E06.720p.hdtv.x264.mp4 -: type: episode - series: House Hunters International - season: 56 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - -? White.House.Down.2013.1080p.BluRay.DTS-HD.MA.5.1.x264-PublicHD.mkv -: type: movie - title: White House Down - year: 2013 - screenSize: 1080p - format: BluRay - audioCodec: DTS - audioProfile: HDMA - videoCodec: h264 - releaseGroup: PublicHD - audioChannels: "5.1" - -? Hostages.S01E01.Pilot.for.Air.720p.WEB-DL.DD5.1.H.264-NTb.nfo -: type: episodeinfo - series: Hostages - title: Pilot for Air - season: 1 - episodeNumber: 1 - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - videoCodec: h264 - audioCodec: DolbyDigital - releaseGroup: NTb - -? Despicable.Me.2.2013.1080p.BluRay.x264-VeDeTT.nfo -: type: movieinfo - title: Despicable Me 2 - year: 2013 - screenSize: 1080p - format: BluRay - videoCodec: h264 - releaseGroup: VeDeTT - -? Le Cinquieme Commando 1971 SUBFORCED FRENCH DVDRiP XViD AC3 Bandix.mkv -: type: movie - audioCodec: AC3 - format: DVD - releaseGroup: Bandix - subtitleLanguage: French - title: Le Cinquieme Commando - videoCodec: XviD - year: 1971 - -? Le Seigneur des Anneaux - La Communauté de l'Anneau - Version Longue - BDRip.mkv -: type: movie - format: BluRay - title: Le Seigneur des Anneaux - -? La petite bande (Michel Deville - 1983) VF PAL MP4 x264 AAC.mkv -: type: movie - audioCodec: AAC - language: French - title: La petite bande - videoCodec: h264 - year: 1983 - -? Retour de Flammes (Gregor Schnitzler 2003) FULL DVD.iso -: type: movie - format: DVD - title: Retour de Flammes - type: movie - year: 2003 - -? A.Common.Title.Special.2014.avi -: type: movie - year: 2014 - title: A Common Title Special - -? A.Common.Title.2014.Special.avi -: type: episode - year: 2014 - series: A Common Title - title: Special - special: Special - -? A.Common.Title.2014.Special.Edition.avi -: type: movie - year: 2014 - title: A Common Title - edition: Special Edition - -? Downton.Abbey.2013.Christmas.Special.HDTV.x264-FoV.mp4 -: type: episode - year: 2013 - series: Downton Abbey - title: Christmas Special - videoCodec: h264 - releaseGroup: FoV - format: HDTV - special: Special - -? Doctor_Who_2013_Christmas_Special.The_Time_of_The_Doctor.HD -: options: -n - type: episode - series: Doctor Who - other: HD - special: Special - title: Christmas Special The Time of The Doctor - year: 2013 - -? Doctor Who 2005 50th Anniversary Special The Day of the Doctor 3.avi -: type: episode - series: Doctor Who - special: Special - title: 50th Anniversary Special The Day of the Doctor 3 - year: 2005 - -? Robot Chicken S06-Born Again Virgin Christmas Special HDTV x264.avi -: type: episode - series: Robot Chicken - format: HDTV - season: 6 - title: Born Again Virgin Christmas Special - videoCodec: h264 - special: Special - -? Wicked.Tuna.S03E00.Head.To.Tail.Special.HDTV.x264-YesTV -: options: -n - type: episode - series: Wicked Tuna - title: Head To Tail Special - releaseGroup: YesTV - season: 3 - episodeNumber: 0 - videoCodec: h264 - format: HDTV - special: Special diff --git a/lib/guessit/test/dummy.srt b/lib/guessit/test/dummy.srt deleted file mode 100644 index ca4cf8b8..00000000 --- a/lib/guessit/test/dummy.srt +++ /dev/null @@ -1 +0,0 @@ -Just a dummy srt file (used for unittests: do not remove!) diff --git a/lib/guessit/test/episodes.yaml b/lib/guessit/test/episodes.yaml deleted file mode 100644 index 31c0cae7..00000000 --- a/lib/guessit/test/episodes.yaml +++ /dev/null @@ -1,569 +0,0 @@ -# Dubious tests -# -#? "finale " -#: releaseGroup: FiNaLe -# extension: "" - - -? Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.avi -: series: Californication - season: 2 - episodeNumber: 5 - title: Vaginatown - format: HDTV - videoCodec: XviD - releaseGroup: 0TV - -? Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi -: series: Dexter - season: 5 - episodeNumber: 2 - title: Hello, Bandit - language: English - subtitleLanguage: French - format: HDTV - videoCodec: XviD - releaseGroup: AlFleNi-TeaM - website: tvu.org.ru - -? Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi -: series: Treme - season: 1 - episodeNumber: 3 - title: Right Place, Wrong Time - format: HDTV - videoCodec: XviD - releaseGroup: NoTV - -? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi -: series: Duckman - season: 1 - episodeNumber: 1 - title: I, Duckman - date: 2002-11-07 - -? Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi -: series: Duckman - season: 1 - episodeNumber: 13 - title: Joking The Chicken - -? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi -: series: The Simpsons - season: 12 - episodeNumber: 8 - title: A Bas Le Sergent Skinner - language: French - -? Series/Futurama/Season 3 (mkv)/[™] Futurama - S03E22 - Le chef de fer à 30% ( 30 Percent Iron Chef ).mkv -: series: Futurama - season: 3 - episodeNumber: 22 - title: Le chef de fer à 30% - -? Series/The Office/Season 6/The Office - S06xE01.avi -: series: The Office - season: 6 - episodeNumber: 1 - -? series/The Office/Season 4/The Office [401] Fun Run.avi -: series: The Office - season: 4 - episodeNumber: 1 - title: Fun Run - -? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi -: series: Mad Men - season: 1 - episodeNumber: 1 - other: complete - -? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E02.65.Million.Years.Off.avi -: series: Psych - season: 2 - episodeNumber: 2 - title: 65 Million Years Off - language: english - format: DVD - other: complete - -? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E03.Psy.Vs.Psy.Français.srt -: series: Psych - season: 2 - episodeNumber: 3 - title: Psy Vs Psy - format: DVD - language: English - subtitleLanguage: French - other: complete - -? Series/Pure Laine/Pure.Laine.1x01.Toutes.Couleurs.Unies.FR.(Québec).DVB-Kceb.[tvu.org.ru].avi -: series: Pure Laine - season: 1 - episodeNumber: 1 - title: Toutes Couleurs Unies - format: DVB - releaseGroup: Kceb - language: french - website: tvu.org.ru - -? Series/Pure Laine/2x05 - Pure Laine - Je Me Souviens.avi -: series: Pure Laine - season: 2 - episodeNumber: 5 - title: Je Me Souviens - -? Series/Tout sur moi/Tout sur moi - S02E02 - Ménage à trois (14-01-2008) [Rip by Ampli].avi -: series: Tout sur moi - season: 2 - episodeNumber: 2 - title: Ménage à trois - date: 2008-01-14 - -? The.Mentalist.2x21.18-5-4.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi -: series: The Mentalist - season: 2 - episodeNumber: 21 - title: 18-5-4 - language: english - subtitleLanguage: french - format: HDTV - videoCodec: Xvid - releaseGroup: AlFleNi-TeaM - website: tvu.org.ru - -? series/__ Incomplete __/Dr Slump (Catalan)/Dr._Slump_-_003_DVB-Rip_Catalan_by_kelf.avi -: series: Dr Slump - episodeNumber: 3 - format: DVB - language: catalan - -? series/Ren and Stimpy - Black_hole_[DivX].avi -: series: Ren and Stimpy - title: Black hole - videoCodec: DivX - -? Series/Walt Disney/Donald.Duck.-.Good.Scouts.[www.bigernie.jump.to].avi -: series: Donald Duck - title: Good Scouts - website: www.bigernie.jump.to - -? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi -: series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? Series/South Park/Season 4/South.Park.4x07.Cherokee.Hair.Tampons.DVDRip.[tvu.org.ru].avi -: series: South Park - season: 4 - episodeNumber: 7 - title: Cherokee Hair Tampons - format: DVD - website: tvu.org.ru - -? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi -: series: Kaamelott - episodeNumber: 23 - title: Le Forfait - -? Series/Duckman/Duckman - 110 (10) - 20021218 - Cellar Beware.avi -: series: Duckman - season: 1 - episodeNumber: 10 - date: 2002-12-18 - title: Cellar Beware - -? Series/Ren & Stimpy/Ren And Stimpy - Onward & Upward-Adult Party Cartoon.avi -: series: Ren And Stimpy - title: Onward & Upward-Adult Party Cartoon - -? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi -: series: Breaking Bad - episodeFormat: Minisode - episodeNumber: 1 - title: Good Cop Bad Cop - format: WEBRip - videoCodec: XviD - -? Series/My Name Is Earl/My.Name.Is.Earl.S01Extras.-.Bad.Karma.DVDRip.XviD.avi -: series: My Name Is Earl - season: 1 - title: Bad Karma - format: DVD - special: Extras - videoCodec: XviD - -? /mnt/series/The Big Bang Theory/S01/The.Big.Bang.Theory.S01E01.mkv -: series: The Big Bang Theory - season: 1 - episodeNumber: 1 - -? /media/Parks_and_Recreation-s03-e01.mkv -: series: Parks and Recreation - season: 3 - episodeNumber: 1 - -? /media/Parks_and_Recreation-s03-e02-Flu_Season.mkv -: series: Parks and Recreation - season: 3 - title: Flu Season - episodeNumber: 2 - -? /media/Parks_and_Recreation-s03-x01.mkv -: series: Parks and Recreation - season: 3 - bonusNumber: 1 - -? /media/Parks_and_Recreation-s03-x02-Gag_Reel.mkv -: series: Parks and Recreation - season: 3 - bonusNumber: 2 - bonusTitle: Gag Reel - -? /media/Band_of_Brothers-e01-Currahee.mkv -: series: Band of Brothers - episodeNumber: 1 - title: Currahee - -? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv -: series: Band of Brothers - bonusNumber: 2 - bonusTitle: We Stand Alone Together - -? /TV Shows/Mad.M-5x9.mkv -: series: Mad M - season: 5 - episodeNumber: 9 - -? /TV Shows/new.girl.117.hdtv-lol.mp4 -: series: New Girl - season: 1 - episodeNumber: 17 - format: HDTV - releaseGroup: LOL - -? Kaamelott - 5x44x45x46x47x48x49x50.avi -: series: Kaamelott - season: 5 - episodeNumber: 44 - episodeList: [44, 45, 46, 47, 48, 49, 50] - -? Example S01E01-02.avi -: series: Example - season: 1 - episodeNumber: 1 - episodeList: [1, 2] - -? Example S01E01E02.avi -: series: Example - season: 1 - episodeNumber: 1 - episodeList: [1, 2] - -? Series/Baccano!/Baccano!_-_T1_-_Trailer_-_[Ayu](dae8173e).mkv -: series: Baccano! - other: Trailer - -? Series/Doctor Who (2005)/Season 06/Doctor Who (2005) - S06E01 - The Impossible Astronaut (1).avi -: series: Doctor Who - year: 2005 - season: 6 - episodeNumber: 1 - title: The Impossible Astronaut - -? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi -: series: The Office (US) - country: US - season: 1 - episodeNumber: 3 - title: Health Care - format: HDTV - videoCodec: XviD - releaseGroup: LOL - -? /Volumes/data-1/Series/Futurama/Season 3/Futurama_-_S03_DVD_Bonus_-_Deleted_Scenes_Part_3.ogm -: series: Futurama - season: 3 - other: Bonus - title: Deleted Scenes Part 3 - format: DVD - -? Ben.and.Kate.S01E02.720p.HDTV.X264-DIMENSION.mkv -: series: Ben and Kate - season: 1 - episodeNumber: 2 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? /volume1/TV Series/Drawn Together/Season 1/Drawn Together 1x04 Requiem for a Reality Show.avi -: series: Drawn Together - season: 1 - episodeNumber: 4 - title: Requiem for a Reality Show - -? Sons.of.Anarchy.S05E06.720p.WEB.DL.DD5.1.H.264-CtrlHD.mkv -: series: Sons of Anarchy - season: 5 - episodeNumber: 6 - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - releaseGroup: CtrlHD - -? /media/bdc64bfe-e36f-4af8-b550-e6fd2dfaa507/TV_Shows/Doctor Who (2005)/Saison 6/Doctor Who (2005) - S06E13 - The Wedding of River Song.mkv -: series: Doctor Who - season: 6 - episodeNumber: 13 - year: 2005 - title: The Wedding of River Song - idNumber: bdc64bfe-e36f-4af8-b550-e6fd2dfaa507 - -? /mnt/videos/tvshows/Doctor Who/Season 06/E13 - The Wedding of River Song.mkv -: series: Doctor Who - season: 6 - episodeNumber: 13 - title: The Wedding of River Song - -? The.Simpsons.S24E03.Adventures.in.Baby-Getting.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv -: series: The Simpsons - season: 24 - episodeNumber: 3 - title: Adventures in Baby-Getting - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - releaseGroup: CtrlHD - -? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv -: series: Merlin - season: 5 - episodeNumber: 2 - title: Arthurs bane part two - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: Fov - year: 2008 - other: Proper - -? "Da Vinci's Demons - 1x04 - The Magician.mkv" -: series: "Da Vinci's Demons" - season: 1 - episodeNumber: 4 - title: The Magician - -? CSI.S013E18.Sheltered.720p.WEB-DL.DD5.1.H.264.mkv -: series: CSI - season: 13 - episodeNumber: 18 - title: Sheltered - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - -? Game of Thrones S03E06 1080i HDTV DD5.1 MPEG2-TrollHD.ts -: series: Game of Thrones - season: 3 - episodeNumber: 6 - screenSize: 1080i - format: HDTV - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: MPEG2 - releaseGroup: TrollHD - -? gossip.girl.s01e18.hdtv.xvid-2hd.eng.srt -: series: gossip girl - season: 1 - episodeNumber: 18 - format: HDTV - videoCodec: XviD - releaseGroup: 2HD - subtitleLanguage: english - -? Wheels.S03E01E02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-E02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-03.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2, 3] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.HDTV.X264-DIMENSION.mkv -: series: Marvels Agents of S.H.I.E.L.D. - season: 1 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? Marvels.Agents.of.S.H.I.E.L.D..S01E06.720p.HDTV.X264-DIMENSION.mkv -: series: Marvels Agents of S.H.I.E.L.D. - season: 1 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? Series/Friday Night Lights/Season 1/Friday Night Lights S01E19 - Ch-Ch-Ch-Ch-Changes.avi -: series: Friday Night Lights - season: 1 - episodeNumber: 19 - title: Ch-Ch-Ch-Ch-Changes - -? Dexter Saison VII FRENCH.BDRip.XviD-MiND.nfo -: series: Dexter - season: 7 - videoCodec: XviD - language: French - format: BluRay - releaseGroup: MiND - -? Dexter Saison sept FRENCH.BDRip.XviD-MiND.nfo -: series: Dexter - season: 7 - videoCodec: XviD - language: French - format: BluRay - releaseGroup: MiND - -? "Pokémon S16 - E29 - 1280*720 HDTV VF.mkv" -: series: Pokémon - format: HDTV - language: French - season: 16 - episodeNumber: 29 - screenSize: 720p - -? One.Piece.E576.VOSTFR.720p.HDTV.x264-MARINE-FORD.mkv -: episodeNumber: 576 - videoCodec: h264 - format: HDTV - series: One Piece - releaseGroup: MARINE-FORD - subtitleLanguage: French - screenSize: 720p - -? Dexter.S08E12.FINAL.MULTi.1080p.BluRay.x264-MiND.mkv -: videoCodec: h264 - episodeNumber: 12 - season: 8 - format: BluRay - series: Dexter - other: final - language: Multiple languages - releaseGroup: MiND - screenSize: 1080p - -? One Piece - E623 VOSTFR HD [www.manga-ddl-free.com].mkv -: website: www.manga-ddl-free.com - episodeNumber: 623 - subtitleLanguage: French - series: One Piece - other: HD - -? Falling Skies Saison 1.HDLight.720p.x264.VFF.mkv -: language: French - screenSize: 720p - season: 1 - series: Falling Skies - videoCodec: h264 - -? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BP.mkv -: episodeNumber: 9 - videoCodec: h264 - format: WEB-DL - series: Sleepy Hollow - audioChannels: "5.1" - screenSize: 720p - season: 1 - videoProfile: BP - audioCodec: DolbyDigital - -? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BS.mkv -: episodeNumber: 9 - videoCodec: h264 - format: WEB-DL - series: Sleepy Hollow - audioChannels: "5.1" - screenSize: 720p - season: 1 - releaseGroup: BS - audioCodec: DolbyDigital - -? Battlestar.Galactica.S00.Pilot.FRENCH.DVDRip.XviD-NOTAG.avi -: series: Battlestar Galactica - season: 0 - title: Pilot - special: Pilot - language: French - format: DVD - videoCodec: XviD - releaseGroup: NOTAG - -? The Big Bang Theory S00E00 Unaired Pilot VOSTFR TVRip XviD-VioCs -: options: -n - series: The Big Bang Theory - season: 0 - episodeNumber: 0 - subtitleLanguage: French - format: TV - videoCodec: XviD - releaseGroup: VioCs - special: [Unaired, Pilot] - title: Unaired Pilot - -? The Big Bang Theory S01E00 PROPER Unaired Pilot TVRip XviD-GIGGITY -: options: -n - series: The Big Bang Theory - season: 1 - episodeNumber: 0 - format: TV - videoCodec: XviD - releaseGroup: GIGGITY - other: proper - special: [Unaired, Pilot] - title: Unaired Pilot diff --git a/lib/guessit/test/guessittest.py b/lib/guessit/test/guessittest.py deleted file mode 100644 index 9362ce68..00000000 --- a/lib/guessit/test/guessittest.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import base_text_type, u - -from unittest import TestCase, TestLoader, TextTestRunner -import shlex - -import yaml, logging, sys, os -from os.path import * - - -def currentPath(): - '''Returns the path in which the calling file is located.''' - return dirname(join(os.getcwd(), sys._getframe(1).f_globals['__file__'])) - - -def addImportPath(path): - '''Function that adds the specified path to the import path. The path can be - absolute or relative to the calling file.''' - importPath = abspath(join(currentPath(), path)) - sys.path = [importPath] + sys.path - -log = logging.getLogger(__name__) - -from guessit.plugins import transformers -import guessit -from guessit.options import option_parser -from guessit import * -from guessit.matcher import * -from guessit.fileutils import * - - -def allTests(testClass): - return TestLoader().loadTestsFromTestCase(testClass) - - -class TestGuessit(TestCase): - - def checkMinimumFieldsCorrect(self, filename, filetype=None, remove_type=True, - exclude_files=None): - groundTruth = yaml.load(load_file_in_same_dir(__file__, filename)) - - def guess_func(string, options=None): - return guess_file_info(string, options=options, type=filetype) - - return self.checkFields(groundTruth, guess_func, remove_type, exclude_files) - - def checkFields(self, groundTruth, guess_func, remove_type=True, - exclude_files=None): - total = 0 - exclude_files = exclude_files or [] - - fails = {} - additionals = {} - - for filename, required_fields in groundTruth.items(): - filename = u(filename) - if filename in exclude_files: - continue - - log.debug('\n' + '-' * 120) - log.info('Guessing information for file: %s' % filename) - - options = required_fields.pop('options') if 'options' in required_fields else None - - if options: - args = shlex.split(options) - options, _ = option_parser.parse_args(args) - options = vars(options) - found = guess_func(filename, options) - - total = total + 1 - - # no need for these in the unittests - if remove_type: - try: - del found['type'] - except: - pass - for prop in ('container', 'mimetype'): - if prop in found: - del found[prop] - - # props which are list of just 1 elem should be opened for easier writing of the tests - for prop in ('language', 'subtitleLanguage', 'other', 'special'): - value = found.get(prop, None) - if isinstance(value, list) and len(value) == 1: - found[prop] = value[0] - - # look for missing properties - for prop, value in required_fields.items(): - if prop not in found: - log.debug("Prop '%s' not found in: %s" % (prop, filename)) - if not filename in fails: - fails[filename] = [] - fails[filename].append("'%s' not found in: %s" % (prop, filename)) - continue - - # if both properties are strings, do a case-insensitive comparison - if (isinstance(value, base_text_type) and - isinstance(found[prop], base_text_type)): - if value.lower() != found[prop].lower(): - log.debug("Wrong prop value [str] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - if not filename in fails: - fails[filename] = [] - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - - # if both are lists, we assume list of strings and do a case-insensitive - # comparison on their elements - elif isinstance(value, list) and isinstance(found[prop], list): - s1 = set(u(s).lower() for s in value) - s2 = set(u(s).lower() for s in found[prop]) - if s1 != s2: - log.debug("Wrong prop value [list] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - if not filename in fails: - fails[filename] = [] - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - # otherwise, just compare their values directly - else: - if found[prop] != value: - log.debug("Wrong prop value for '%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) - if not filename in fails: - fails[filename] = [] - fails[filename].append("'%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) - - # look for additional properties - for prop, value in found.items(): - if prop not in required_fields: - log.debug("Found additional info for prop = '%s': '%s'" % (prop, u(value))) - if not filename in additionals: - additionals[filename] = [] - additionals[filename].append("'%s': '%s'" % (prop, u(value))) - - correct = total - len(fails) - log.info('SUMMARY: Guessed correctly %d out of %d filenames' % (correct, total)) - - for failed_entry, failed_properties in fails.items(): - log.error('---- ' + failed_entry + ' ----') - for failed_property in failed_properties: - log.error("FAILED: " + failed_property) - - for additional_entry, additional_properties in additionals.items(): - log.warn('---- ' + additional_entry + ' ----') - for additional_property in additional_properties: - log.warn("ADDITIONAL: " + additional_property) - - self.assertTrue(correct == total, - msg='Correct: %d < Total: %d' % (correct, total)) diff --git a/lib/guessit/test/movies.yaml b/lib/guessit/test/movies.yaml deleted file mode 100644 index 5651d23d..00000000 --- a/lib/guessit/test/movies.yaml +++ /dev/null @@ -1,626 +0,0 @@ - -? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv -: title: Fear and Loathing in Las Vegas - year: 1998 - screenSize: 720p - format: HD-DVD - audioCodec: DTS - videoCodec: h264 - releaseGroup: ESiR - -? Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi -: title: El Dia de la Bestia - year: 1995 - format: DVD - language: spanish - videoCodec: DivX - -? Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv -: title: Dark City - year: 1998 - format: BluRay - screenSize: 720p - audioCodec: DTS - videoCodec: h264 - releaseGroup: CHD - -? Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv -: title: Sin City - year: 2005 - format: BluRay - screenSize: 720p - videoCodec: h264 - audioCodec: AC3 - releaseGroup: SEPTiC - - -? Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi -: title: Borat - year: 2006 - other: PROPER - format: DVD - other: [ R5, Proper ] - videoCodec: XviD - releaseGroup: PUKKA - - -? "[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" -: title: Le Prestige - format: DVD - videoCodec: h264 - videoProfile: HP - audioCodec: AAC - audioProfile: HE - language: [ french, english ] - subtitleLanguage: [ french, english ] - -? Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi -: title: Battle Royale - year: 2000 - edition: special edition - cdNumber: 1 - cdNumberTotal: 2 - format: DVD - videoCodec: XviD - releaseGroup: ZeaL - -? Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.avi -: title: Brazil - edition: Criterion Edition - year: 1985 - cdNumber: 2 - -? Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv -: title: Persepolis - year: 2007 - videoCodec: h264 - audioCodec: AAC - language: [ French, English ] - subtitleLanguage: [ French, English ] - -? Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv -: title: Toy Story - year: 1995 - format: HDTV - screenSize: 720p - language: [ english, spanish ] - -? Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi -: title: Office Space - year: 1999 - format: DVD - language: [ english, spanish ] - videoCodec: XviD - audioCodec: AC3 - -? Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.avi -: title: Wild Zero - year: 2000 - videoCodec: DivX - releaseGroup: EPiC - -? movies/Baraka_Edition_Collector.avi -: title: Baraka - edition: collector edition - -? Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director's.Cut).CD1.DVDRip.XviD.AC3-WAF.avi -: title: Blade Runner - year: 1982 - edition: Director's Cut - cdNumber: 1 - format: DVD - videoCodec: XviD - audioCodec: AC3 - releaseGroup: WAF - -? movies/American.The.Bill.Hicks.Story.2009.DVDRip.XviD-EPiSODE.[UsaBit.com]/UsaBit.com_esd-americanbh.avi -: title: American The Bill Hicks Story - year: 2009 - format: DVD - videoCodec: XviD - releaseGroup: EPiSODE - website: UsaBit.com - -? movies/Charlie.And.Boots.DVDRip.XviD-TheWretched/wthd-cab.avi -: title: Charlie And Boots - format: DVD - videoCodec: XviD - releaseGroup: TheWretched - -? movies/Steig Larsson Millenium Trilogy (2009) BRrip 720 AAC x264/(1)The Girl With The Dragon Tattoo (2009) BRrip 720 AAC x264.mkv -: title: The Girl With The Dragon Tattoo - filmSeries: Steig Larsson Millenium Trilogy - filmNumber: 1 - year: 2009 - format: BluRay - audioCodec: AAC - videoCodec: h264 - screenSize: 720p - -? movies/Greenberg.REPACK.LiMiTED.DVDRip.XviD-ARROW/arw-repack-greenberg.dvdrip.xvid.avi -: title: Greenberg - format: DVD - videoCodec: XviD - releaseGroup: ARROW - other: ['Proper', 'Limited'] - -? Movies/Fr - Paris 2054, Renaissance (2005) - De Christian Volckman - (Film Divx Science Fiction Fantastique Thriller Policier N&B).avi -: title: Paris 2054, Renaissance - year: 2005 - language: french - videoCodec: DivX - -? Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi -: title: Avida - year: 2006 - language: french - format: DVD - videoCodec: XviD - releaseGroup: PROD - -? Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi -: title: Alice in Wonderland - format: DVD - videoCodec: XviD - releaseGroup: DiAMOND - -? Movies/Ne.Le.Dis.A.Personne.Fr 2 cd/personnea_mp.avi -: title: Ne Le Dis A Personne - language: french - cdNumberTotal: 2 - -? Movies/Bunker Palace Hôtel (Enki Bilal) (1989)/Enki Bilal - Bunker Palace Hotel (Fr Vhs Rip).avi -: title: Bunker Palace Hôtel - year: 1989 - language: french - format: VHS - -? Movies/21 (2008)/21.(2008).DVDRip.x264.AC3-FtS.[sharethefiles.com].mkv -: title: "21" - year: 2008 - format: DVD - videoCodec: h264 - audioCodec: AC3 - releaseGroup: FtS - website: sharethefiles.com - -? Movies/9 (2009)/9.2009.Blu-ray.DTS.720p.x264.HDBRiSe.[sharethefiles.com].mkv -: title: "9" - year: 2009 - format: BluRay - audioCodec: DTS - screenSize: 720p - videoCodec: h264 - releaseGroup: HDBRiSe - website: sharethefiles.com - -? Movies/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam.avi -: title: Mamma Mia - year: 2008 - format: DVD - audioCodec: AC3 - videoCodec: XviD - releaseGroup: CrazyTeam - -? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm -: title: M.A.S.H. - year: 1970 - videoCodec: DivX - format: DVD - -? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv -: title: The Doors - year: 1991 - date: 2008-03-09 - format: BluRay - screenSize: 720p - audioCodec: AC3 - videoCodec: h264 - releaseGroup: HiS@SiLUHD - language: english - website: sharethefiles.com - -? Movies/Ratatouille/video_ts-ratatouille.srt -: title: Ratatouille - format: DVD - -? Movies/001 __ A classer/Fantomas se déchaine - Louis de Funès.avi -: title: Fantomas se déchaine - -? Movies/Comme une Image (2004)/Comme.Une.Image.FRENCH.DVDRiP.XViD-NTK.par-www.divx-overnet.com.avi -: title: Comme une Image - year: 2004 - language: french - format: DVD - videoCodec: XviD - releaseGroup: NTK - website: www.divx-overnet.com - -? Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv -: title: Fantastic Mr Fox - year: 2009 - format: DVD - videoCodec: h264 - audioCodec: AAC - audioProfile: LC - audioChannels: "5.1" - language: [ french, english ] - subtitleLanguage: [ french, english ] - website: sharethefiles.com - -? Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi -: title: Somewhere - year: 2010 - format: DVD - videoCodec: XviD - releaseGroup: iLG - -? Movies/Moon_(2009).mkv -: title: Moon - year: 2009 - -? Movies/Moon_(2009)-x01.mkv -: title: Moon - year: 2009 - bonusNumber: 1 - -? Movies/Moon_(2009)-x02-Making_Of.mkv -: title: Moon - year: 2009 - bonusNumber: 2 - bonusTitle: Making Of - -? movies/James_Bond-f17-Goldeneye.mkv -: title: Goldeneye - filmSeries: James Bond - filmNumber: 17 - -? /movies/James_Bond-f21-Casino_Royale.mkv -: title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - -? /movies/James_Bond-f21-Casino_Royale-x01-Becoming_Bond.mkv -: title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 1 - bonusTitle: Becoming Bond - -? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv -: title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 2 - bonusTitle: Stunts - -? OSS_117--Cairo,_Nest_of_Spies.mkv -: title: OSS 117--Cairo, Nest of Spies - -? The Godfather Part III.mkv -: title: The Godfather Part III - -? Foobar Part VI.mkv -: title: Foobar Part VI - -? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 -: title: The Insider - year: 1999 - bonusNumber: 2 - bonusTitle: 60 Minutes Interview-1996 - -? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv -: title: Rush Beyond The Lighted Stage - bonusNumber: 9 - bonusTitle: Between Sun and Moon-2002 Hartford - -? /public/uTorrent/Downloads Finished/Movies/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX.mkv -: title: Indiana Jones and the Temple of Doom - year: 1984 - format: HDTV - screenSize: 720p - videoCodec: h264 - audioCodec: AC3 - audioChannels: "5.1" - releaseGroup: REDµX - -? The.Director’s.Notebook.2006.Blu-Ray.x264.DXVA.720p.AC3-de[42].mkv -: title: The Director’s Notebook - year: 2006 - format: BluRay - videoCodec: h264 - videoApi: DXVA - screenSize: 720p - audioCodec: AC3 - releaseGroup: de[42] - -? Movies/Cosmopolis.2012.LiMiTED.720p.BluRay.x264-AN0NYM0US[bb]/ano-cosmo.720p.mkv -: title: Cosmopolis - year: 2012 - screenSize: 720p - videoCodec: h264 - releaseGroup: AN0NYM0US[bb] - format: BluRay - other: LIMITED - -? movies/La Science des Rêves (2006)/La.Science.Des.Reves.FRENCH.DVDRip.XviD-MP-AceBot.avi -: title: La Science des Rêves - year: 2006 - format: DVD - videoCodec: XviD - videoProfile: MP - releaseGroup: AceBot - language: French - -? The_Italian_Job.mkv -: title: The Italian Job - -? The.Rum.Diary.2011.1080p.BluRay.DTS.x264.D-Z0N3.mkv -: title: The Rum Diary - year: 2011 - screenSize: 1080p - format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: D-Z0N3 - -? Life.Of.Pi.2012.1080p.BluRay.DTS.x264.D-Z0N3.mkv -: title: Life Of Pi - year: 2012 - screenSize: 1080p - format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: D-Z0N3 - -? The.Kings.Speech.2010.1080p.BluRay.DTS.x264.D Z0N3.mkv -: title: The Kings Speech - year: 2010 - screenSize: 1080p - format: BluRay - audioCodec: DTS - videoCodec: h264 - releaseGroup: D-Z0N3 - -? Street.Kings.2008.BluRay.1080p.DTS.x264.dxva EuReKA.mkv -: title: Street Kings - year: 2008 - format: BluRay - screenSize: 1080p - audioCodec: DTS - videoCodec: h264 - videoApi: DXVA - releaseGroup: EuReKa - -? 2001.A.Space.Odyssey.1968.HDDVD.1080p.DTS.x264.dxva EuReKA.mkv -: title: 2001 A Space Odyssey - year: 1968 - format: HD-DVD - screenSize: 1080p - audioCodec: DTS - videoCodec: h264 - videoApi: DXVA - releaseGroup: EuReKa - -? 2012.2009.720p.BluRay.x264.DTS WiKi.mkv -: title: "2012" - year: 2009 - screenSize: 720p - format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: WiKi - -? /share/Download/movie/Dead Man Down (2013) BRRiP XViD DD5_1 Custom NLSubs =-_lt Q_o_Q gt-=_/XD607ebb-BRc59935-5155473f-1c5f49/XD607ebb-BRc59935-5155473f-1c5f49.avi -: title: Dead Man Down - year: 2013 - format: BluRay - videoCodec: XviD - audioChannels: "5.1" - audioCodec: DolbyDigital - idNumber: XD607ebb-BRc59935-5155473f-1c5f49 - -? Pacific.Rim.3D.2013.COMPLETE.BLURAY-PCH.avi -: title: Pacific Rim - year: 2013 - format: BluRay - other: - - complete - - 3D - releaseGroup: PCH - -? Immersion.French.2011.STV.READNFO.QC.FRENCH.ENGLISH.NTSC.DVDR.nfo -: title: Immersion French - year: 2011 - language: - - French - - English - -? Immersion.French.2011.STV.READNFO.QC.FRENCH.NTSC.DVDR.nfo -: title: Immersion French - year: 2011 - language: French - -? Immersion.French.2011.STV.READNFO.QC.NTSC.DVDR.nfo -: title: Immersion French - year: 2011 - -? French.Immersion.2011.STV.READNFO.QC.ENGLISH.NTSC.DVDR.nfo -: title: French Immersion - year: 2011 - language: ENGLISH - -? Howl's_Moving_Castle_(2004)_[720p,HDTV,x264,DTS]-FlexGet.avi -: videoCodec: h264 - format: HDTV - title: Howl's Moving Castle - screenSize: 720p - year: 2004 - audioCodec: DTS - releaseGroup: FlexGet - -? Pirates de langkasuka.2008.FRENCH.1920X1080.h264.AVC.AsiaRa.mkv -: screenSize: 1080p - year: 2008 - language: French - videoCodec: h264 - title: Pirates de langkasuka - releaseGroup: AsiaRa - -? Masala (2013) Telugu Movie HD DVDScr XviD - Exclusive.avi -: year: 2013 - videoCodec: XviD - title: Masala - format: HD-DVD - other: screener - language: Telugu - releaseGroup: Exclusive - -? Django Unchained 2012 DVDSCR X264 AAC-P2P.nfo -: year: 2012 - other: screener - videoCodec: h264 - title: Django Unchained - audioCodec: AAC - format: DVD - releaseGroup: P2P - -? Ejecutiva.En.Apuros(2009).BLURAY.SCR.Xvid.Spanish.LanzamientosD.nfo -: year: 2009 - other: screener - format: BluRay - videoCodec: XviD - language: Spanish - title: Ejecutiva En Apuros - -? Die.Schluempfe.2.German.DL.1080p.BluRay.x264-EXQUiSiTE.mkv -: title: Die Schluempfe 2 - format: BluRay - language: - - Multiple languages - - German - videoCodec: h264 - releaseGroup: EXQUiSiTE - screenSize: 1080p - -? Rocky 1976 French SubForced BRRip x264 AC3-FUNKY.mkv -: title: Rocky - year: 1976 - subtitleLanguage: French - format: BluRay - videoCodec: h264 - audioCodec: AC3 - releaseGroup: FUNKY - -? REDLINE (BD 1080p H264 10bit FLAC) [3xR].mkv -: title: REDLINE - format: BluRay - videoCodec: h264 - videoProfile: 10bit - audioCodec: Flac - screenSize: 1080p - -? The.Lizzie.McGuire.Movie.(2003).HR.DVDRiP.avi -: title: The Lizzie McGuire Movie - year: 2003 - screenSize: 480p - format: DVD - -? Hua.Mulan.BRRIP.MP4.x264.720p-HR.avi -: title: Hua Mulan - videoCodec: h264 - format: BluRay - screenSize: 720p - -? Dr.Seuss.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 -: videoCodec: XviD - title: Dr Seuss The Lorax - format: DVD - other: LiNE - year: 2012 - audioCodec: AC3 - audioProfile: HQ - releaseGroup: Hive-CM8 - - -? "Star Wars: Episode IV - A New Hope (2004) Special Edition.MKV" -: title: Star Wars Episode IV - year: 2004 - edition: Special Edition - -? Dr.LiNE.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 -: videoCodec: XviD - title: Dr LiNE The Lorax - format: DVD - other: LiNE - year: 2012 - audioCodec: AC3 - audioProfile: HQ - releaseGroup: Hive-CM8 - -? Perfect Child-2007-TRUEFRENCH-TVRip.Xvid-h@mster.avi -: releaseGroup: h@mster - title: Perfect Child - videoCodec: XviD - language: French - format: TV - year: 2007 - -? entre.ciel.et.terre.(1994).dvdrip.h264.aac-psypeon.avi -: audioCodec: AAC - format: DVD - releaseGroup: psypeon - title: entre ciel et terre - videoCodec: h264 - year: 1994 - -? Yves.Saint.Laurent.2013.FRENCH.DVDSCR.MD.XviD-ViVARiUM.avi -: format: DVD - language: French - other: Screener - releaseGroup: ViVARiUM - title: Yves Saint Laurent - videoCodec: XviD - year: 2013 - -? Echec et Mort - Hard to Kill - Steven Seagal Multi 1080p BluRay x264 CCATS.avi -: format: BluRay - language: Multiple languages - releaseGroup: CCATS - screenSize: 1080p - title: Echec et Mort - videoCodec: h264 - -? Paparazzi - Timsit/Lindon (MKV 1080p tvripHD) -: options: -n - title: Paparazzi - screenSize: 1080p - format: HDTV - -? some.movie.720p.bluray.x264-mind -: options: -n - title: some movie - screenSize: 720p - videoCodec: h264 - releaseGroup: mind - format: BluRay - -? Dr LiNE The Lorax 720p h264 BluRay -: options: -n - title: Dr LiNE The Lorax - screenSize: 720p - videoCodec: h264 - format: BluRay - -? BeatdownFrenchDVDRip.mkv -: title: Beatdown - language: French - format: DVD - -? YvesSaintLaurent2013FrenchDVDScrXvid.avi -: format: DVD - language: French - other: Screener - title: Yves saint laurent - videoCodec: XviD - year: 2013 \ No newline at end of file diff --git a/lib/guessit/test/opensubtitles_languages_2012_05_09.txt b/lib/guessit/test/opensubtitles_languages_2012_05_09.txt deleted file mode 100644 index 4a08d9b5..00000000 --- a/lib/guessit/test/opensubtitles_languages_2012_05_09.txt +++ /dev/null @@ -1,473 +0,0 @@ -IdSubLanguage ISO639 LanguageName UploadEnabled WebEnabled -aar aa Afar, afar 0 0 -abk ab Abkhazian 0 0 -ace Achinese 0 0 -ach Acoli 0 0 -ada Adangme 0 0 -ady adyghé 0 0 -afa Afro-Asiatic (Other) 0 0 -afh Afrihili 0 0 -afr af Afrikaans 0 0 -ain Ainu 0 0 -aka ak Akan 0 0 -akk Akkadian 0 0 -alb sq Albanian 1 1 -ale Aleut 0 0 -alg Algonquian languages 0 0 -alt Southern Altai 0 0 -amh am Amharic 0 0 -ang English, Old (ca.450-1100) 0 0 -apa Apache languages 0 0 -ara ar Arabic 1 1 -arc Aramaic 0 0 -arg an Aragonese 0 0 -arm hy Armenian 1 0 -arn Araucanian 0 0 -arp Arapaho 0 0 -art Artificial (Other) 0 0 -arw Arawak 0 0 -asm as Assamese 0 0 -ast Asturian, Bable 0 0 -ath Athapascan languages 0 0 -aus Australian languages 0 0 -ava av Avaric 0 0 -ave ae Avestan 0 0 -awa Awadhi 0 0 -aym ay Aymara 0 0 -aze az Azerbaijani 0 0 -bad Banda 0 0 -bai Bamileke languages 0 0 -bak ba Bashkir 0 0 -bal Baluchi 0 0 -bam bm Bambara 0 0 -ban Balinese 0 0 -baq eu Basque 1 1 -bas Basa 0 0 -bat Baltic (Other) 0 0 -bej Beja 0 0 -bel be Belarusian 0 0 -bem Bemba 0 0 -ben bn Bengali 1 0 -ber Berber (Other) 0 0 -bho Bhojpuri 0 0 -bih bh Bihari 0 0 -bik Bikol 0 0 -bin Bini 0 0 -bis bi Bislama 0 0 -bla Siksika 0 0 -bnt Bantu (Other) 0 0 -bos bs Bosnian 1 0 -bra Braj 0 0 -bre br Breton 1 0 -btk Batak (Indonesia) 0 0 -bua Buriat 0 0 -bug Buginese 0 0 -bul bg Bulgarian 1 1 -bur my Burmese 0 0 -byn Blin 0 0 -cad Caddo 0 0 -cai Central American Indian (Other) 0 0 -car Carib 0 0 -cat ca Catalan 1 1 -cau Caucasian (Other) 0 0 -ceb Cebuano 0 0 -cel Celtic (Other) 0 0 -cha ch Chamorro 0 0 -chb Chibcha 0 0 -che ce Chechen 0 0 -chg Chagatai 0 0 -chi zh Chinese 1 1 -chk Chuukese 0 0 -chm Mari 0 0 -chn Chinook jargon 0 0 -cho Choctaw 0 0 -chp Chipewyan 0 0 -chr Cherokee 0 0 -chu cu Church Slavic 0 0 -chv cv Chuvash 0 0 -chy Cheyenne 0 0 -cmc Chamic languages 0 0 -cop Coptic 0 0 -cor kw Cornish 0 0 -cos co Corsican 0 0 -cpe Creoles and pidgins, English based (Other) 0 0 -cpf Creoles and pidgins, French-based (Other) 0 0 -cpp Creoles and pidgins, Portuguese-based (Other) 0 0 -cre cr Cree 0 0 -crh Crimean Tatar 0 0 -crp Creoles and pidgins (Other) 0 0 -csb Kashubian 0 0 -cus Cushitic (Other)' couchitiques, autres langues 0 0 -cze cs Czech 1 1 -dak Dakota 0 0 -dan da Danish 1 1 -dar Dargwa 0 0 -day Dayak 0 0 -del Delaware 0 0 -den Slave (Athapascan) 0 0 -dgr Dogrib 0 0 -din Dinka 0 0 -div dv Divehi 0 0 -doi Dogri 0 0 -dra Dravidian (Other) 0 0 -dua Duala 0 0 -dum Dutch, Middle (ca.1050-1350) 0 0 -dut nl Dutch 1 1 -dyu Dyula 0 0 -dzo dz Dzongkha 0 0 -efi Efik 0 0 -egy Egyptian (Ancient) 0 0 -eka Ekajuk 0 0 -elx Elamite 0 0 -eng en English 1 1 -enm English, Middle (1100-1500) 0 0 -epo eo Esperanto 1 0 -est et Estonian 1 1 -ewe ee Ewe 0 0 -ewo Ewondo 0 0 -fan Fang 0 0 -fao fo Faroese 0 0 -fat Fanti 0 0 -fij fj Fijian 0 0 -fil Filipino 0 0 -fin fi Finnish 1 1 -fiu Finno-Ugrian (Other) 0 0 -fon Fon 0 0 -fre fr French 1 1 -frm French, Middle (ca.1400-1600) 0 0 -fro French, Old (842-ca.1400) 0 0 -fry fy Frisian 0 0 -ful ff Fulah 0 0 -fur Friulian 0 0 -gaa Ga 0 0 -gay Gayo 0 0 -gba Gbaya 0 0 -gem Germanic (Other) 0 0 -geo ka Georgian 1 1 -ger de German 1 1 -gez Geez 0 0 -gil Gilbertese 0 0 -gla gd Gaelic 0 0 -gle ga Irish 0 0 -glg gl Galician 1 1 -glv gv Manx 0 0 -gmh German, Middle High (ca.1050-1500) 0 0 -goh German, Old High (ca.750-1050) 0 0 -gon Gondi 0 0 -gor Gorontalo 0 0 -got Gothic 0 0 -grb Grebo 0 0 -grc Greek, Ancient (to 1453) 0 0 -ell el Greek 1 1 -grn gn Guarani 0 0 -guj gu Gujarati 0 0 -gwi Gwich´in 0 0 -hai Haida 0 0 -hat ht Haitian 0 0 -hau ha Hausa 0 0 -haw Hawaiian 0 0 -heb he Hebrew 1 1 -her hz Herero 0 0 -hil Hiligaynon 0 0 -him Himachali 0 0 -hin hi Hindi 1 1 -hit Hittite 0 0 -hmn Hmong 0 0 -hmo ho Hiri Motu 0 0 -hrv hr Croatian 1 1 -hun hu Hungarian 1 1 -hup Hupa 0 0 -iba Iban 0 0 -ibo ig Igbo 0 0 -ice is Icelandic 1 1 -ido io Ido 0 0 -iii ii Sichuan Yi 0 0 -ijo Ijo 0 0 -iku iu Inuktitut 0 0 -ile ie Interlingue 0 0 -ilo Iloko 0 0 -ina ia Interlingua (International Auxiliary Language Asso 0 0 -inc Indic (Other) 0 0 -ind id Indonesian 1 1 -ine Indo-European (Other) 0 0 -inh Ingush 0 0 -ipk ik Inupiaq 0 0 -ira Iranian (Other) 0 0 -iro Iroquoian languages 0 0 -ita it Italian 1 1 -jav jv Javanese 0 0 -jpn ja Japanese 1 1 -jpr Judeo-Persian 0 0 -jrb Judeo-Arabic 0 0 -kaa Kara-Kalpak 0 0 -kab Kabyle 0 0 -kac Kachin 0 0 -kal kl Kalaallisut 0 0 -kam Kamba 0 0 -kan kn Kannada 0 0 -kar Karen 0 0 -kas ks Kashmiri 0 0 -kau kr Kanuri 0 0 -kaw Kawi 0 0 -kaz kk Kazakh 1 0 -kbd Kabardian 0 0 -kha Khasi 0 0 -khi Khoisan (Other) 0 0 -khm km Khmer 1 1 -kho Khotanese 0 0 -kik ki Kikuyu 0 0 -kin rw Kinyarwanda 0 0 -kir ky Kirghiz 0 0 -kmb Kimbundu 0 0 -kok Konkani 0 0 -kom kv Komi 0 0 -kon kg Kongo 0 0 -kor ko Korean 1 1 -kos Kosraean 0 0 -kpe Kpelle 0 0 -krc Karachay-Balkar 0 0 -kro Kru 0 0 -kru Kurukh 0 0 -kua kj Kuanyama 0 0 -kum Kumyk 0 0 -kur ku Kurdish 0 0 -kut Kutenai 0 0 -lad Ladino 0 0 -lah Lahnda 0 0 -lam Lamba 0 0 -lao lo Lao 0 0 -lat la Latin 0 0 -lav lv Latvian 1 0 -lez Lezghian 0 0 -lim li Limburgan 0 0 -lin ln Lingala 0 0 -lit lt Lithuanian 1 0 -lol Mongo 0 0 -loz Lozi 0 0 -ltz lb Luxembourgish 1 0 -lua Luba-Lulua 0 0 -lub lu Luba-Katanga 0 0 -lug lg Ganda 0 0 -lui Luiseno 0 0 -lun Lunda 0 0 -luo Luo (Kenya and Tanzania) 0 0 -lus lushai 0 0 -mac mk Macedonian 1 1 -mad Madurese 0 0 -mag Magahi 0 0 -mah mh Marshallese 0 0 -mai Maithili 0 0 -mak Makasar 0 0 -mal ml Malayalam 0 0 -man Mandingo 0 0 -mao mi Maori 0 0 -map Austronesian (Other) 0 0 -mar mr Marathi 0 0 -mas Masai 0 0 -may ms Malay 1 1 -mdf Moksha 0 0 -mdr Mandar 0 0 -men Mende 0 0 -mga Irish, Middle (900-1200) 0 0 -mic Mi'kmaq 0 0 -min Minangkabau 0 0 -mis Miscellaneous languages 0 0 -mkh Mon-Khmer (Other) 0 0 -mlg mg Malagasy 0 0 -mlt mt Maltese 0 0 -mnc Manchu 0 0 -mni Manipuri 0 0 -mno Manobo languages 0 0 -moh Mohawk 0 0 -mol mo Moldavian 0 0 -mon mn Mongolian 1 0 -mos Mossi 0 0 -mwl Mirandese 0 0 -mul Multiple languages 0 0 -mun Munda languages 0 0 -mus Creek 0 0 -mwr Marwari 0 0 -myn Mayan languages 0 0 -myv Erzya 0 0 -nah Nahuatl 0 0 -nai North American Indian 0 0 -nap Neapolitan 0 0 -nau na Nauru 0 0 -nav nv Navajo 0 0 -nbl nr Ndebele, South 0 0 -nde nd Ndebele, North 0 0 -ndo ng Ndonga 0 0 -nds Low German 0 0 -nep ne Nepali 0 0 -new Nepal Bhasa 0 0 -nia Nias 0 0 -nic Niger-Kordofanian (Other) 0 0 -niu Niuean 0 0 -nno nn Norwegian Nynorsk 0 0 -nob nb Norwegian Bokmal 0 0 -nog Nogai 0 0 -non Norse, Old 0 0 -nor no Norwegian 1 1 -nso Northern Sotho 0 0 -nub Nubian languages 0 0 -nwc Classical Newari 0 0 -nya ny Chichewa 0 0 -nym Nyamwezi 0 0 -nyn Nyankole 0 0 -nyo Nyoro 0 0 -nzi Nzima 0 0 -oci oc Occitan 1 1 -oji oj Ojibwa 0 0 -ori or Oriya 0 0 -orm om Oromo 0 0 -osa Osage 0 0 -oss os Ossetian 0 0 -ota Turkish, Ottoman (1500-1928) 0 0 -oto Otomian languages 0 0 -paa Papuan (Other) 0 0 -pag Pangasinan 0 0 -pal Pahlavi 0 0 -pam Pampanga 0 0 -pan pa Panjabi 0 0 -pap Papiamento 0 0 -pau Palauan 0 0 -peo Persian, Old (ca.600-400 B.C.) 0 0 -per fa Persian 1 1 -phi Philippine (Other) 0 0 -phn Phoenician 0 0 -pli pi Pali 0 0 -pol pl Polish 1 1 -pon Pohnpeian 0 0 -por pt Portuguese 1 1 -pra Prakrit languages 0 0 -pro Provençal, Old (to 1500) 0 0 -pus ps Pushto 0 0 -que qu Quechua 0 0 -raj Rajasthani 0 0 -rap Rapanui 0 0 -rar Rarotongan 0 0 -roa Romance (Other) 0 0 -roh rm Raeto-Romance 0 0 -rom Romany 0 0 -run rn Rundi 0 0 -rup Aromanian 0 0 -rus ru Russian 1 1 -sad Sandawe 0 0 -sag sg Sango 0 0 -sah Yakut 0 0 -sai South American Indian (Other) 0 0 -sal Salishan languages 0 0 -sam Samaritan Aramaic 0 0 -san sa Sanskrit 0 0 -sas Sasak 0 0 -sat Santali 0 0 -scc sr Serbian 1 1 -scn Sicilian 0 0 -sco Scots 0 0 -sel Selkup 0 0 -sem Semitic (Other) 0 0 -sga Irish, Old (to 900) 0 0 -sgn Sign Languages 0 0 -shn Shan 0 0 -sid Sidamo 0 0 -sin si Sinhalese 1 1 -sio Siouan languages 0 0 -sit Sino-Tibetan (Other) 0 0 -sla Slavic (Other) 0 0 -slo sk Slovak 1 1 -slv sl Slovenian 1 1 -sma Southern Sami 0 0 -sme se Northern Sami 0 0 -smi Sami languages (Other) 0 0 -smj Lule Sami 0 0 -smn Inari Sami 0 0 -smo sm Samoan 0 0 -sms Skolt Sami 0 0 -sna sn Shona 0 0 -snd sd Sindhi 0 0 -snk Soninke 0 0 -sog Sogdian 0 0 -som so Somali 0 0 -son Songhai 0 0 -sot st Sotho, Southern 0 0 -spa es Spanish 1 1 -srd sc Sardinian 0 0 -srr Serer 0 0 -ssa Nilo-Saharan (Other) 0 0 -ssw ss Swati 0 0 -suk Sukuma 0 0 -sun su Sundanese 0 0 -sus Susu 0 0 -sux Sumerian 0 0 -swa sw Swahili 1 0 -swe sv Swedish 1 1 -syr Syriac 1 0 -tah ty Tahitian 0 0 -tai Tai (Other) 0 0 -tam ta Tamil 0 0 -tat tt Tatar 0 0 -tel te Telugu 0 0 -tem Timne 0 0 -ter Tereno 0 0 -tet Tetum 0 0 -tgk tg Tajik 0 0 -tgl tl Tagalog 1 1 -tha th Thai 1 1 -tib bo Tibetan 0 0 -tig Tigre 0 0 -tir ti Tigrinya 0 0 -tiv Tiv 0 0 -tkl Tokelau 0 0 -tlh Klingon 0 0 -tli Tlingit 0 0 -tmh Tamashek 0 0 -tog Tonga (Nyasa) 0 0 -ton to Tonga (Tonga Islands) 0 0 -tpi Tok Pisin 0 0 -tsi Tsimshian 0 0 -tsn tn Tswana 0 0 -tso ts Tsonga 0 0 -tuk tk Turkmen 0 0 -tum Tumbuka 0 0 -tup Tupi languages 0 0 -tur tr Turkish 1 1 -tut Altaic (Other) 0 0 -tvl Tuvalu 0 0 -twi tw Twi 0 0 -tyv Tuvinian 0 0 -udm Udmurt 0 0 -uga Ugaritic 0 0 -uig ug Uighur 0 0 -ukr uk Ukrainian 1 1 -umb Umbundu 0 0 -und Undetermined 0 0 -urd ur Urdu 1 0 -uzb uz Uzbek 0 0 -vai Vai 0 0 -ven ve Venda 0 0 -vie vi Vietnamese 1 1 -vol vo Volapük 0 0 -vot Votic 0 0 -wak Wakashan languages 0 0 -wal Walamo 0 0 -war Waray 0 0 -was Washo 0 0 -wel cy Welsh 0 0 -wen Sorbian languages 0 0 -wln wa Walloon 0 0 -wol wo Wolof 0 0 -xal Kalmyk 0 0 -xho xh Xhosa 0 0 -yao Yao 0 0 -yap Yapese 0 0 -yid yi Yiddish 0 0 -yor yo Yoruba 0 0 -ypk Yupik languages 0 0 -zap Zapotec 0 0 -zen Zenaga 0 0 -zha za Zhuang 0 0 -znd Zande 0 0 -zul zu Zulu 0 0 -zun Zuni 0 0 -rum ro Romanian 1 1 -pob pb Brazilian 1 1 diff --git a/lib/guessit/test/test_api.py b/lib/guessit/test/test_api.py deleted file mode 100644 index 92cef41b..00000000 --- a/lib/guessit/test/test_api.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestApi(TestGuessit): - def test_api(self): - movie_path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' - - movie_info = guessit.guess_movie_info(movie_path) - video_info = guessit.guess_video_info(movie_path) - episode_info = guessit.guess_episode_info(movie_path) - file_info = guessit.guess_file_info(movie_path) - - self.assertEqual(guessit.guess_file_info(movie_path, type='movie'), movie_info) - self.assertEqual(guessit.guess_file_info(movie_path, type='video'), video_info) - self.assertEqual(guessit.guess_file_info(movie_path, type='episode'), episode_info) - - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'movie'}), movie_info) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'video'}), video_info) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}), episode_info) - - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}, type='movie'), episode_info) # kwargs priority other options - - movie_path_name_only = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD' - file_info_name_only = guessit.guess_file_info(movie_path_name_only, options={"name_only": True}) - - self.assertFalse('container' in file_info_name_only) - self.assertTrue('container' in file_info) - -suite = allTests(TestApi) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_autodetect.py b/lib/guessit/test/test_autodetect.py deleted file mode 100644 index 229b491f..00000000 --- a/lib/guessit/test/test_autodetect.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestAutoDetect(TestGuessit): - def testEmpty(self): - result = guessit.guess_file_info('') - self.assertEqual(result, {}) - - result = guessit.guess_file_info('___-__') - self.assertEqual(result, {}) - - result = guessit.guess_file_info('__-.avc') - self.assertEqual(result, {'type': 'unknown', 'extension': 'avc'}) - - def testAutoDetect(self): - self.checkMinimumFieldsCorrect(filename='autodetect.yaml', - remove_type=False) - - -suite = allTests(TestAutoDetect) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_autodetect_all.py b/lib/guessit/test/test_autodetect_all.py deleted file mode 100644 index 033e1571..00000000 --- a/lib/guessit/test/test_autodetect_all.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -IGNORE_EPISODES = [] -IGNORE_MOVIES = [] - - -class TestAutoDetectAll(TestGuessit): - def testAutoMatcher(self): - self.checkMinimumFieldsCorrect(filename='autodetect.yaml', - remove_type=False) - - def testAutoMatcherMovies(self): - self.checkMinimumFieldsCorrect(filename='movies.yaml', - exclude_files=IGNORE_MOVIES) - - def testAutoMatcherEpisodes(self): - self.checkMinimumFieldsCorrect(filename='episodes.yaml', - exclude_files=IGNORE_EPISODES) - - -suite = allTests(TestAutoDetectAll) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_doctests.py b/lib/guessit/test/test_doctests.py deleted file mode 100644 index 9fedeb0f..00000000 --- a/lib/guessit/test/test_doctests.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * -import guessit -import guessit.hash_ed2k -import unittest -import doctest - - -def load_tests(loader, tests, ignore): - tests.addTests(doctest.DocTestSuite(guessit)) - tests.addTests(doctest.DocTestSuite(guessit.date)) - tests.addTests(doctest.DocTestSuite(guessit.fileutils)) - tests.addTests(doctest.DocTestSuite(guessit.guess)) - tests.addTests(doctest.DocTestSuite(guessit.hash_ed2k)) - tests.addTests(doctest.DocTestSuite(guessit.language)) - tests.addTests(doctest.DocTestSuite(guessit.matchtree)) - tests.addTests(doctest.DocTestSuite(guessit.textutils)) - return tests - -suite = unittest.TestSuite() -load_tests(None, suite, None) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_episode.py b/lib/guessit/test/test_episode.py deleted file mode 100644 index 03abf6b0..00000000 --- a/lib/guessit/test/test_episode.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestEpisode(TestGuessit): - def testEpisodes(self): - self.checkMinimumFieldsCorrect(filetype='episode', - filename='episodes.yaml') - - -suite = allTests(TestEpisode) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_hashes.py b/lib/guessit/test/test_hashes.py deleted file mode 100644 index a8bc763c..00000000 --- a/lib/guessit/test/test_hashes.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestHashes(TestGuessit): - def test_hashes(self): - hashes = ( - ('hash_mpc', '1MB', u'8542ad406c15c8bd'), # TODO: Check if this value is valid - ('hash_ed2k', '1MB', u'ed2k://|file|1MB|1048576|AA3CC5552A9931A76B61A41D306735F7|/'), # TODO: Check if this value is valid - ('hash_md5', '1MB', u'5d8dcbca8d8ac21766f28797d6c3954c'), - ('hash_sha1', '1MB', u'51d2b8f3248d7ee495b7750c8da5aa3b3819de9d'), - ('hash_md5', 'dummy.srt', u'64de6b5893cac24456c46a935ef9c359'), - ('hash_sha1', 'dummy.srt', u'a703fc0fa4518080505809bf562c6fc6f7b3c98c') - ) - - for hash_type, filename, expected_value in hashes: - guess = guess_file_info(file_in_same_dir(__file__, filename), hash_type) - computed_value = guess.get(hash_type) - self.assertEqual(expected_value, guess.get(hash_type), "Invalid %s for %s: %s != %s" % (hash_type, filename, computed_value, expected_value)) - - -suite = allTests(TestHashes) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_language.py b/lib/guessit/test/test_language.py deleted file mode 100644 index 406d92c1..00000000 --- a/lib/guessit/test/test_language.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -import io - - -class TestLanguage(TestGuessit): - - def check_languages(self, languages): - for lang1, lang2 in languages.items(): - self.assertEqual(Language(lang1), - Language(lang2)) - - def test_addic7ed(self): - languages = {'English': 'en', - 'English (US)': 'en', - 'English (UK)': 'en', - 'Italian': 'it', - 'Portuguese': 'pt', - 'Portuguese (Brazilian)': 'pt', - 'Romanian': 'ro', - 'Español (Latinoamérica)': 'es', - 'Español (España)': 'es', - 'Spanish (Latin America)': 'es', - 'Español': 'es', - 'Spanish': 'es', - 'Spanish (Spain)': 'es', - 'French': 'fr', - 'Greek': 'el', - 'Arabic': 'ar', - 'German': 'de', - 'Croatian': 'hr', - 'Indonesian': 'id', - 'Hebrew': 'he', - 'Russian': 'ru', - 'Turkish': 'tr', - 'Swedish': 'se', - 'Czech': 'cs', - 'Dutch': 'nl', - 'Hungarian': 'hu', - 'Norwegian': 'no', - 'Polish': 'pl', - 'Persian': 'fa'} - - self.check_languages(languages) - - def test_subswiki(self): - languages = {'English (US)': 'en', 'English (UK)': 'en', 'English': 'en', - 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', - 'Español (Latinoamérica)': 'es', 'Español (España)': 'es', - 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} - - self.check_languages(languages) - - def test_tvsubtitles(self): - languages = {'English': 'en', 'Español': 'es', 'French': 'fr', 'German': 'de', - 'Brazilian': 'br', 'Russian': 'ru', 'Ukrainian': 'ua', 'Italian': 'it', - 'Greek': 'gr', 'Arabic': 'ar', 'Hungarian': 'hu', 'Polish': 'pl', - 'Turkish': 'tr', 'Dutch': 'nl', 'Portuguese': 'pt', 'Swedish': 'sv', - 'Danish': 'da', 'Finnish': 'fi', 'Korean': 'ko', 'Chinese': 'cn', - 'Japanese': 'jp', 'Bulgarian': 'bg', 'Czech': 'cz', 'Romanian': 'ro'} - - self.check_languages(languages) - - def test_opensubtitles(self): - opensubtitles_langfile = file_in_same_dir(__file__, 'opensubtitles_languages_2012_05_09.txt') - for l in [u(l).strip() for l in io.open(opensubtitles_langfile, encoding='utf-8')][1:]: - idlang, alpha2, _, upload_enabled, web_enabled = l.strip().split('\t') - # do not test languages that are too esoteric / not widely available - if int(upload_enabled) and int(web_enabled): - # check that we recognize the opensubtitles language code correctly - # and that we are able to output this code from a language - self.assertEqual(idlang, Language(idlang).opensubtitles) - if alpha2: - # check we recognize the opensubtitles 2-letter code correctly - self.check_languages({idlang: alpha2}) - - def test_tmdb(self): - # examples from http://api.themoviedb.org/2.1/language-tags - for lang in ['en-US', 'en-CA', 'es-MX', 'fr-PF']: - self.assertEqual(lang, Language(lang).tmdb) - - def test_subtitulos(self): - languages = {'English (US)': 'en', 'English (UK)': 'en', 'English': 'en', - 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', - 'Español (Latinoamérica)': 'es', 'Español (España)': 'es', - 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} - - self.check_languages(languages) - - def test_thesubdb(self): - languages = {'af': 'af', 'cs': 'cs', 'da': 'da', 'de': 'de', 'en': 'en', 'es': 'es', 'fi': 'fi', - 'fr': 'fr', 'hu': 'hu', 'id': 'id', 'it': 'it', 'la': 'la', 'nl': 'nl', 'no': 'no', - 'oc': 'oc', 'pl': 'pl', 'pt': 'pt', 'ro': 'ro', 'ru': 'ru', 'sl': 'sl', 'sr': 'sr', - 'sv': 'sv', 'tr': 'tr'} - - self.check_languages(languages) - - def test_language_object(self): - self.assertEqual(len(list(set([Language('qwerty'), Language('asdf')]))), 1) - d = {Language('qwerty'): 7} - d[Language('asdf')] = 23 - self.assertEqual(d[Language('qwerty')], 23) - - def test_exceptions(self): - self.assertEqual(Language('br'), Language('pt(br)')) - - # languages should be equal regardless of country - self.assertEqual(Language('br'), Language('pt')) - - self.assertEqual(Language('unknown'), Language('und')) - - -suite = allTests(TestLanguage) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_main.py b/lib/guessit/test/test_main.py deleted file mode 100644 index 19540d8a..00000000 --- a/lib/guessit/test/test_main.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * -from guessit.fileutils import split_path, file_in_same_dir -from guessit.textutils import strip_brackets, str_replace, str_fill -from guessit import PY2 -from guessit import __main__ - -if PY2: - from StringIO import StringIO -else: - from io import StringIO - - -class TestMain(TestGuessit): - def setUp(self): - self._stdout = sys.stdout - string_out = StringIO() - sys.stdout = string_out - - def tearDown(self): - sys.stdout = self._stdout - - def test_list_properties(self): - __main__.main(["-p"], False) - __main__.main(["-l"], False) - - def test_list_transformers(self): - __main__.main(["--transformers"], False) - __main__.main(["-l", "--transformers"], False) - - def test_demo(self): - __main__.main(["-d"], False) - __main__.main(["-l"], False) - - def test_filename(self): - __main__.main(["A.Movie.2014.avi"], False) - __main__.main(["A.Movie.2014.avi", "A.2nd.Movie.2014.avi"], False) - __main__.main(["-y", "A.Movie.2014.avi"], False) - __main__.main(["-a", "A.Movie.2014.avi"], False) - __main__.main(["-v", "A.Movie.2014.avi"], False) - __main__.main(["-t", "movie", "A.Movie.2014.avi"], False) - __main__.main(["-t", "episode", "A.Serie.S02E06.avi"], False) - __main__.main(["-i", "hash_mpc", file_in_same_dir(__file__, "1MB")], False) - __main__.main(["-i", "hash_md5", file_in_same_dir(__file__, "1MB")], False) - -suite = allTests(TestMain) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_matchtree.py b/lib/guessit/test/test_matchtree.py deleted file mode 100644 index c10840a0..00000000 --- a/lib/guessit/test/test_matchtree.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -from guessit.transfo.guess_release_group import GuessReleaseGroup -from guessit.transfo.guess_properties import GuessProperties -from guessit.matchtree import BaseMatchTree - -keywords = yaml.load(""" - -? Xvid PROPER -: videoCodec: Xvid - other: PROPER - -? PROPER-Xvid -: videoCodec: Xvid - other: PROPER - -""") - - -def guess_info(string, options=None): - mtree = MatchTree(string) - GuessReleaseGroup().process(mtree, options) - GuessProperties().process(mtree, options) - return mtree.matched() - - -class TestMatchTree(TestGuessit): - def test_base_tree(self): - t = BaseMatchTree('One Two Three(Three) Four') - t.partition((3, 7, 20)) - leaves = t.leaves() - - self.assertEqual(leaves[0].span, (0, 3)) - - self.assertEqual('One', leaves[0].value) - self.assertEqual(' Two', leaves[1].value) - self.assertEqual(' Three(Three)', leaves[2].value) - self.assertEqual(' Four', leaves[3].value) - - leaves[2].partition((1, 6, 7, 12)) - three_leaves = leaves[2].leaves() - - self.assertEqual('Three', three_leaves[1].value) - self.assertEqual('Three', three_leaves[3].value) - - leaves = t.leaves() - - self.assertEqual(len(leaves), 8) - - self.assertEqual(leaves[5], three_leaves[3]) - - self.assertEqual(t.previous_leaf(leaves[5]), leaves[4]) - self.assertEqual(t.next_leaf(leaves[5]), leaves[6]) - - self.assertEqual(t.next_leaves(leaves[5]), [leaves[6], leaves[7]]) - self.assertEqual(t.previous_leaves(leaves[5]), [leaves[4], leaves[3], leaves[2], leaves[1], leaves[0]]) - - self.assertEqual(t.next_leaf(leaves[7]), None) - self.assertEqual(t.previous_leaf(leaves[0]), None) - - self.assertEqual(t.next_leaves(leaves[7]), []) - self.assertEqual(t.previous_leaves(leaves[0]), []) - - def test_match(self): - self.checkFields(keywords, guess_info) - - -suite = allTests(TestMatchTree) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_movie.py b/lib/guessit/test/test_movie.py deleted file mode 100644 index eecbf49d..00000000 --- a/lib/guessit/test/test_movie.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestMovie(TestGuessit): - def testMovies(self): - self.checkMinimumFieldsCorrect(filetype='movie', - filename='movies.yaml') - - -suite = allTests(TestMovie) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_quality.py b/lib/guessit/test/test_quality.py deleted file mode 100644 index 52e21791..00000000 --- a/lib/guessit/test/test_quality.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.quality import best_quality, best_quality_properties -from guessit.containers import QualitiesContainer -from guessit.test.guessittest import * - - -class TestQuality(TestGuessit): - def test_container(self): - container = QualitiesContainer() - - container.register_quality('color', 'red', 10) - container.register_quality('color', 'orange', 20) - container.register_quality('color', 'green', 30) - - container.register_quality('context', 'sun', 100) - container.register_quality('context', 'sea', 200) - container.register_quality('context', 'sex', 300) - - g1 = Guess() - g1['color'] = 'red' - - g2 = Guess() - g2['color'] = 'green' - - g3 = Guess() - g3['color'] = 'orange' - - q3 = container.rate_quality(g3) - self.assertEqual(q3, 20, "ORANGE should be rated 20. Don't ask why!") - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q2 > q1, "GREEN should be greater than RED. Don't ask why!") - - g1['context'] = 'sex' - g2['context'] = 'sun' - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q1 > q2, "SEX should be greater than SUN. Don't ask why!") - - self.assertEqual(container.best_quality(g1, g2), g1, "RED&SEX should be better than GREEN&SUN. Don't ask why!") - - self.assertEqual(container.best_quality_properties(['color'], g1, g2), g2, "GREEN should be better than RED. Don't ask why!") - - self.assertEqual(container.best_quality_properties(['context'], g1, g2), g1, "SEX should be better than SUN. Don't ask why!") - - q1 = container.rate_quality(g1, 'color') - q2 = container.rate_quality(g2, 'color') - - self.assertTrue(q2 > q1, "GREEN should be greater than RED. Don't ask why!") - - container.unregister_quality('context', 'sex') - container.unregister_quality('context', 'sun') - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q2 > q1, "GREEN&SUN should be greater than RED&SEX. Don't ask why!") - - g3['context'] = 'sea' - container.unregister_quality('context', 'sea') - - q3 = container.rate_quality(g3, 'context') - self.assertEqual(q3, 0, "Context should be unregistered.") - - container.unregister_quality('color') - q3 = container.rate_quality(g3, 'color') - - self.assertEqual(q3, 0, "Color should be unregistered.") - - container.clear_qualities() - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q1 == q2 == 0, "Empty quality container should rate each guess to 0") - - def test_quality_transformers(self): - guess_720p = guessit.guess_file_info("2012.2009.720p.BluRay.x264.DTS WiKi.mkv") - guess_1080p = guessit.guess_file_info("2012.2009.1080p.BluRay.x264.MP3 WiKi.mkv") - - self.assertTrue('audioCodec' in guess_720p, "audioCodec should be present") - self.assertTrue('audioCodec' in guess_1080p, "audioCodec should be present") - self.assertTrue('screenSize' in guess_720p, "screenSize should be present") - self.assertTrue('screenSize' in guess_1080p, "screenSize should be present") - - best_quality_guess = best_quality(guess_720p, guess_1080p) - - self.assertTrue(guess_1080p == best_quality_guess, "1080p+MP3 is not the best global quality") - - best_quality_guess = best_quality_properties(['screenSize'], guess_720p, guess_1080p) - - self.assertTrue(guess_1080p == best_quality_guess, "1080p is not the best screenSize") - - best_quality_guess = best_quality_properties(['audioCodec'], guess_720p, guess_1080p) - - self.assertTrue(guess_720p == best_quality_guess, "DTS is not the best audioCodec") - -suite = allTests(TestQuality) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/test/test_utils.py b/lib/guessit/test/test_utils.py deleted file mode 100644 index 8cf4028b..00000000 --- a/lib/guessit/test/test_utils.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * -from guessit.fileutils import split_path -from guessit.textutils import strip_brackets, str_replace, str_fill, from_camel, is_camel,\ - levenshtein, reorder_title -from guessit import PY2 -from guessit.date import search_date, search_year -from datetime import datetime, date, timedelta - - -class TestUtils(TestGuessit): - def test_splitpath(self): - alltests = {False: {'/usr/bin/smewt': ['/', 'usr', 'bin', 'smewt'], - 'relative_path/to/my_folder/': ['relative_path', 'to', 'my_folder'], - '//some/path': ['//', 'some', 'path'], - '//some//path': ['//', 'some', 'path'], - '///some////path': ['///', 'some', 'path'] - - }, - True: {'C:\\Program Files\\Smewt\\smewt.exe': ['C:\\', 'Program Files', 'Smewt', 'smewt.exe'], - 'Documents and Settings\\User\\config': ['Documents and Settings', 'User', 'config'], - 'C:\\Documents and Settings\\User\\config': ['C:\\', 'Documents and Settings', 'User', 'config'], - # http://bugs.python.org/issue19945 - '\\\\netdrive\\share': ['\\\\', 'netdrive', 'share'] if PY2 else ['\\\\netdrive\\share'], - '\\\\netdrive\\share\\folder': ['\\\\', 'netdrive', 'share', 'folder'] if PY2 else ['\\\\netdrive\\share\\', 'folder'], - } - } - tests = alltests[sys.platform == 'win32'] - for path, split in tests.items(): - self.assertEqual(split, split_path(path)) - - def test_strip_brackets(self): - allTests = (('', ''), - ('[test]', 'test'), - ('{test2}', 'test2'), - ('(test3)', 'test3'), - ('(test4]', '(test4]'), - ) - - for i, e in allTests: - self.assertEqual(e, strip_brackets(i)) - - def test_levenshtein(self): - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmno"), 0) - self.assertEqual(levenshtein("abcdef ghijk lmnop", "abcdef ghijk lmno"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmn"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnp"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnq"), 1) - self.assertEqual(levenshtein("cbcdef ghijk lmno", "abcdef ghijk lmnq"), 2) - self.assertEqual(levenshtein("cbcdef ghihk lmno", "abcdef ghijk lmnq"), 3) - - def test_reorder_title(self): - self.assertEqual(reorder_title("Simpsons, The"), "The Simpsons") - self.assertEqual(reorder_title("Simpsons,The"), "The Simpsons") - self.assertEqual(reorder_title("Simpsons,Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons") - self.assertEqual(reorder_title("Simpsons, Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons") - - def test_camel(self): - self.assertEqual("", from_camel("")) - - self.assertEqual("Hello world", str_replace("Hello World", 6, 'w')) - self.assertEqual("Hello *****", str_fill("Hello World", (6, 11), '*')) - - self.assertTrue("This is camel", from_camel("ThisIsCamel")) - - self.assertEqual('camel case', from_camel('camelCase')) - self.assertEqual('A case', from_camel('ACase')) - self.assertEqual('MiXedCaSe is not camel case', from_camel('MiXedCaSe is not camelCase')) - - self.assertEqual("This is camel cased title", from_camel("ThisIsCamelCasedTitle")) - self.assertEqual("This is camel CASED title", from_camel("ThisIsCamelCASEDTitle")) - - self.assertEqual("These are camel CASED title", from_camel("TheseAreCamelCASEDTitle")) - - self.assertEqual("Give a camel case string", from_camel("GiveACamelCaseString")) - - self.assertEqual("Death TO camel case", from_camel("DeathTOCamelCase")) - self.assertEqual("But i like java too:)", from_camel("ButILikeJavaToo:)")) - - self.assertEqual("Beatdown french DVD rip.mkv", from_camel("BeatdownFrenchDVDRip.mkv")) - self.assertEqual("DO NOTHING ON UPPER CASE", from_camel("DO NOTHING ON UPPER CASE")) - - self.assertFalse(is_camel("this_is_not_camel")) - self.assertTrue(is_camel("ThisIsCamel")) - - self.assertEqual("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv", from_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv")) - self.assertFalse(is_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv")) - - self.assertEqual("A2LiNE", from_camel("A2LiNE")) - - def test_date(self): - self.assertEqual(search_year(' in the year 2000... '), (2000, (13, 17))) - self.assertEqual(search_year(' they arrived in 1492. '), (None, None)) - - today = date.today() - today_year_2 = int(str(today.year)[2:]) - - future = today + timedelta(days=1000) - future_year_2 = int(str(future.year)[2:]) - - past = today - timedelta(days=10000) - past_year_2 = int(str(past.year)[2:]) - - self.assertEqual(search_date(' Something before 2002-04-22 '), (date(2002, 4, 22), (18, 28))) - self.assertEqual(search_date(' 2002-04-22 Something after '), (date(2002, 4, 22), (1, 11))) - - self.assertEqual(search_date(' This happened on 2002-04-22. '), (date(2002, 4, 22), (18, 28))) - self.assertEqual(search_date(' This happened on 22-04-2002. '), (date(2002, 4, 22), (18, 28))) - - self.assertEqual(search_date(' This happened on 13-04-%s. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26))) - self.assertEqual(search_date(' This happened on 22-04-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26))) - self.assertEqual(search_date(' This happened on 20-04-%s. ' % (past_year_2)), (date(past.year, 4, 20), (18, 26))) - - self.assertEqual(search_date(' This happened on 04-13-%s. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26))) - self.assertEqual(search_date(' This happened on 04-22-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26))) - self.assertEqual(search_date(' This happened on 04-20-%s. ' % (past_year_2)), (date(past.year, 4, 20), (18, 26))) - - self.assertEqual(search_date(' This happened on 35-12-%s. ' % (today_year_2,)), (None, None)) - self.assertEqual(search_date(' This happened on 37-18-%s. ' % (future_year_2,)), (None, None)) - self.assertEqual(search_date(' This happened on 44-42-%s. ' % (past_year_2)), (None, None)) - - self.assertEqual(search_date(' This happened on %s. ' % (today, )), (today, (18, 28))) - self.assertEqual(search_date(' This happened on %s. ' % (future, )), (future, (18, 28))) - self.assertEqual(search_date(' This happened on %s. ' % (past, )), (past, (18, 28))) - - self.assertEqual(search_date(' released date: 04-03-1901? '), (None, None)) - - self.assertEqual(search_date(' There\'s no date in here. '), (None, None)) - - -suite = allTests(TestUtils) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/lib/guessit/textutils.py b/lib/guessit/textutils.py index eecfce1f..ae9d28c3 100644 --- a/lib/guessit/textutils.py +++ b/lib/guessit/textutils.py @@ -1,25 +1,24 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Smewt - A smart collection manager +# Copyright (c) 2008-2012 Nicolas Wack # -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by +# Smewt is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # -# GuessIt is distributed in the hope that it will be useful, +# Smewt is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. +# GNU General Public License for more details. # -# You should have received a copy of the Lesser GNU General Public License +# You should have received a copy of the GNU General Public License # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - +from __future__ import unicode_literals from guessit import s from guessit.patterns import sep import functools @@ -28,7 +27,6 @@ import re # string-related functions - def normalize_unicode(s): return unicodedata.normalize('NFC', s) @@ -45,36 +43,19 @@ def strip_brackets(s): return s -_dotted_rexp = re.compile(r'(?:\W|^)(([A-Za-z]\.){2,}[A-Za-z]\.?)') - - def clean_string(st): for c in sep: # do not remove certain chars if c in ['-', ',']: continue - - if c == '.': - # we should not remove the dots for acronyms and such - dotted = _dotted_rexp.search(st) - if dotted: - s = dotted.group(1) - exclude_begin, exclude_end = dotted.span(1) - - st = (st[:exclude_begin].replace(c, ' ') + - st[exclude_begin:exclude_end] + - st[exclude_end:].replace(c, ' ')) - continue - st = st.replace(c, ' ') - parts = st.split() result = ' '.join(p for p in parts if p != '') # now also remove dashes on the outer part of the string - while result and result[0] in '-': + while result and result[0] in sep: result = result[1:] - while result and result[-1] in '-': + while result and result[-1] in sep: result = result[:-1] return result @@ -82,23 +63,21 @@ def clean_string(st): _words_rexp = re.compile('\w+', re.UNICODE) - def find_words(s): return _words_rexp.findall(s.replace('_', ' ')) -def reorder_title(title, articles=('the',), separators=(',', ', ')): +def reorder_title(title): ltitle = title.lower() - for article in articles: - for separator in separators: - suffix = separator + article - if ltitle[-len(suffix):] == suffix: - return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)] + if ltitle[-4:] == ',the': + return title[-3:] + ' ' + title[:-4] + if ltitle[-5:] == ', the': + return title[-3:] + ' ' + title[:-5] return title def str_replace(string, pos, c): - return string[:pos] + c + string[pos + 1:] + return string[:pos] + c + string[pos+1:] def str_fill(string, region, c): @@ -106,6 +85,7 @@ def str_fill(string, region, c): return string[:start] + c * (end - start) + string[end:] + def levenshtein(a, b): if not a: return len(b) @@ -115,25 +95,25 @@ def levenshtein(a, b): m = len(a) n = len(b) d = [] - for i in range(m + 1): - d.append([0] * (n + 1)) + for i in range(m+1): + d.append([0] * (n+1)) - for i in range(m + 1): + for i in range(m+1): d[i][0] = i - for j in range(n + 1): + for j in range(n+1): d[0][j] = j - for i in range(1, m + 1): - for j in range(1, n + 1): - if a[i - 1] == b[j - 1]: + for i in range(1, m+1): + for j in range(1, n+1): + if a[i-1] == b[j-1]: cost = 0 else: cost = 1 - d[i][j] = min(d[i - 1][j] + 1, # deletion - d[i][j - 1] + 1, # insertion - d[i - 1][j - 1] + cost # substitution + d[i][j] = min(d[i-1][j] + 1, # deletion + d[i][j-1] + 1, # insertion + d[i-1][j-1] + cost # substitution ) return d[m][n] @@ -160,7 +140,7 @@ def find_first_level_groups_span(string, enclosing): [(2, 5), (7, 10)] """ opening, closing = enclosing - depth = [] # depth is a stack of indices where we opened a group + depth = [] # depth is a stack of indices where we opened a group result = [] for i, c, in enumerate(string): if c == opening: @@ -171,7 +151,7 @@ def find_first_level_groups_span(string, enclosing): end = i if not depth: # we emptied our stack, so we have a 1st level group - result.append((start, end + 1)) + result.append((start, end+1)) except IndexError: # we closed a group which was not opened before pass @@ -192,7 +172,7 @@ def split_on_groups(string, groups): """ if not groups: - return [string] + return [ string ] boundaries = sorted(set(functools.reduce(lambda l, x: l + list(x), groups, []))) if boundaries[0] != 0: @@ -200,10 +180,10 @@ def split_on_groups(string, groups): if boundaries[-1] != len(string): boundaries.append(len(string)) - groups = [string[start:end] for start, end in zip(boundaries[:-1], - boundaries[1:])] + groups = [ string[start:end] for start, end in zip(boundaries[:-1], + boundaries[1:]) ] - return [g for g in groups if g] # return only non-empty groups + return [ g for g in groups if g ] # return only non-empty groups def find_first_level_groups(string, enclosing, blank_sep=None): @@ -239,114 +219,6 @@ def find_first_level_groups(string, enclosing, blank_sep=None): if blank_sep: for start, end in groups: string = str_replace(string, start, blank_sep) - string = str_replace(string, end - 1, blank_sep) + string = str_replace(string, end-1, blank_sep) return split_on_groups(string, groups) - - -_camel_word2_set = set(('is', 'to',)) -_camel_word3_set = set(('the',)) - - -def _camel_split_and_lower(string, i): - """Retrieves a tuple (need_split, need_lower) - - need_split is True if this char is a first letter in a camelCasedString. - need_lower is True if this char should be lowercased. - """ - - def islower(c): - return c.isalpha() and not c.isupper() - - previous_char2 = string[i - 2] if i > 1 else None - previous_char = string[i - 1] if i > 0 else None - char = string[i] - next_char = string[i + 1] if i + 1 < len(string) else None - next_char2 = string[i + 2] if i + 2 < len(string) else None - - char_upper = char.isupper() - char_lower = islower(char) - - # previous_char2_lower = islower(previous_char2) if previous_char2 else False - previous_char2_upper = previous_char2.isupper() if previous_char2 else False - - previous_char_lower = islower(previous_char) if previous_char else False - previous_char_upper = previous_char.isupper() if previous_char else False - - next_char_upper = next_char.isupper() if next_char else False - next_char_lower = islower(next_char) if next_char else False - - next_char2_upper = next_char2.isupper() if next_char2 else False - # next_char2_lower = islower(next_char2) if next_char2 else False - - mixedcase_word = (previous_char_upper and char_lower and next_char_upper) or \ - (previous_char_lower and char_upper and next_char_lower and next_char2_upper) or \ - (previous_char2_upper and previous_char_lower and char_upper) - if mixedcase_word: - word2 = (char + next_char).lower() if next_char else None - word3 = (char + next_char + next_char2).lower() if next_char and next_char2 else None - word2b = (previous_char2 + previous_char).lower() if previous_char2 and previous_char else None - if word2 in _camel_word2_set or word2b in _camel_word2_set or word3 in _camel_word3_set: - mixedcase_word = False - - uppercase_word = previous_char_upper and char_upper and next_char_upper or (char_upper and next_char_upper and next_char2_upper) - - need_split = char_upper and previous_char_lower and not mixedcase_word - - if not need_split: - previous_char_upper = string[i - 1].isupper() if i > 0 else False - next_char_lower = (string[i + 1].isalpha() and not string[i + 1].isupper()) if i + 1 < len(string) else False - need_split = char_upper and previous_char_upper and next_char_lower - uppercase_word = previous_char_upper and not next_char_lower - - need_lower = not uppercase_word and not mixedcase_word and need_split - - return (need_split, need_lower) - - -def is_camel(string): - """ - >>> is_camel('dogEATDog') - True - >>> is_camel('DeathToCamelCase') - True - >>> is_camel('death_to_camel_case') - False - >>> is_camel('TheBest') - True - >>> is_camel('The Best') - False - """ - for i in range(0, len(string)): - need_split, _ = _camel_split_and_lower(string, i) - if need_split: - return True - return False - - -def from_camel(string): - """ - >>> from_camel('dogEATDog') == 'dog EAT dog' - True - >>> from_camel('DeathToCamelCase') == 'Death to camel case' - True - >>> from_camel('TheBest') == 'The best' - True - >>> from_camel('MiXedCaSe is not camelCase') == 'MiXedCaSe is not camel case' - True - """ - if not string: - return string - pieces = [] - - for i in range(0, len(string)): - char = string[i] - need_split, need_lower = _camel_split_and_lower(string, i) - if need_split: - pieces.append(' ') - - if need_lower: - pieces.append(char.lower()) - else: - pieces.append(char) - return ''.join(pieces) diff --git a/lib/guessit/tlds-alpha-by-domain.txt b/lib/guessit/tlds-alpha-by-domain.txt deleted file mode 100644 index 280c794c..00000000 --- a/lib/guessit/tlds-alpha-by-domain.txt +++ /dev/null @@ -1,341 +0,0 @@ -# Version 2013112900, Last Updated Fri Nov 29 07:07:01 2013 UTC -AC -AD -AE -AERO -AF -AG -AI -AL -AM -AN -AO -AQ -AR -ARPA -AS -ASIA -AT -AU -AW -AX -AZ -BA -BB -BD -BE -BF -BG -BH -BI -BIKE -BIZ -BJ -BM -BN -BO -BR -BS -BT -BV -BW -BY -BZ -CA -CAMERA -CAT -CC -CD -CF -CG -CH -CI -CK -CL -CLOTHING -CM -CN -CO -COM -CONSTRUCTION -CONTRACTORS -COOP -CR -CU -CV -CW -CX -CY -CZ -DE -DIAMONDS -DIRECTORY -DJ -DK -DM -DO -DZ -EC -EDU -EE -EG -ENTERPRISES -EQUIPMENT -ER -ES -ESTATE -ET -EU -FI -FJ -FK -FM -FO -FR -GA -GALLERY -GB -GD -GE -GF -GG -GH -GI -GL -GM -GN -GOV -GP -GQ -GR -GRAPHICS -GS -GT -GU -GURU -GW -GY -HK -HM -HN -HOLDINGS -HR -HT -HU -ID -IE -IL -IM -IN -INFO -INT -IO -IQ -IR -IS -IT -JE -JM -JO -JOBS -JP -KE -KG -KH -KI -KITCHEN -KM -KN -KP -KR -KW -KY -KZ -LA -LAND -LB -LC -LI -LIGHTING -LK -LR -LS -LT -LU -LV -LY -MA -MC -MD -ME -MG -MH -MIL -MK -ML -MM -MN -MO -MOBI -MP -MQ -MR -MS -MT -MU -MUSEUM -MV -MW -MX -MY -MZ -NA -NAME -NC -NE -NET -NF -NG -NI -NL -NO -NP -NR -NU -NZ -OM -ORG -PA -PE -PF -PG -PH -PHOTOGRAPHY -PK -PL -PLUMBING -PM -PN -POST -PR -PRO -PS -PT -PW -PY -QA -RE -RO -RS -RU -RW -SA -SB -SC -SD -SE -SEXY -SG -SH -SI -SINGLES -SJ -SK -SL -SM -SN -SO -SR -ST -SU -SV -SX -SY -SZ -TATTOO -TC -TD -TECHNOLOGY -TEL -TF -TG -TH -TIPS -TJ -TK -TL -TM -TN -TO -TODAY -TP -TR -TRAVEL -TT -TV -TW -TZ -UA -UG -UK -US -UY -UZ -VA -VC -VE -VENTURES -VG -VI -VN -VOYAGE -VU -WF -WS -XN--3E0B707E -XN--45BRJ9C -XN--80AO21A -XN--80ASEHDB -XN--80ASWG -XN--90A3AC -XN--CLCHC0EA0B2G2A9GCD -XN--FIQS8S -XN--FIQZ9S -XN--FPCRJ9C3D -XN--FZC2C9E2C -XN--GECRJ9C -XN--H2BRJ9C -XN--J1AMH -XN--J6W193G -XN--KPRW13D -XN--KPRY57D -XN--L1ACC -XN--LGBBAT1AD8J -XN--MGB9AWBF -XN--MGBA3A4F16A -XN--MGBAAM7A8H -XN--MGBAYH7GPA -XN--MGBBH1A71E -XN--MGBC0A9AZCG -XN--MGBERP4A5D4AR -XN--MGBX4CD0AB -XN--NGBC5AZD -XN--O3CW4H -XN--OGBPF8FL -XN--P1AI -XN--PGBS0DH -XN--Q9JYB4C -XN--S9BRJ9C -XN--UNUP4Y -XN--WGBH1C -XN--WGBL6A -XN--XKC2AL3HYE2A -XN--XKC2DL3A5EE0H -XN--YFRO4I67O -XN--YGBI2AMMX -XXX -YE -YT -ZA -ZM -ZW diff --git a/lib/guessit/transfo/__init__.py b/lib/guessit/transfo/__init__.py index cce2dfda..a28aa988 100644 --- a/lib/guessit/transfo/__init__.py +++ b/lib/guessit/transfo/__init__.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,13 +18,92 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit import base_text_type, Guess +from guessit.patterns import canonical_form +from guessit.textutils import clean_string +import logging + +log = logging.getLogger(__name__) -class TransformerException(Exception): - def __init__(self, transformer, message): +def found_property(node, name, confidence): + node.guess = Guess({name: node.clean_value}, confidence=confidence, raw=node.value) + log.debug('Found with confidence %.2f: %s' % (confidence, node.guess)) - # Call the base class constructor with the parameters it needs - Exception.__init__(self, message) - self.transformer = transformer \ No newline at end of file +def format_guess(guess): + """Format all the found values to their natural type. + For instance, a year would be stored as an int value, etc... + + Note that this modifies the dictionary given as input. + """ + for prop, value in guess.items(): + if prop in ('season', 'episodeNumber', 'year', 'cdNumber', + 'cdNumberTotal', 'bonusNumber', 'filmNumber'): + guess[prop] = int(guess[prop]) + elif isinstance(value, base_text_type): + if prop in ('edition',): + value = clean_string(value) + guess[prop] = canonical_form(value).replace('\\', '') + + return guess + + +def find_and_split_node(node, strategy, logger): + string = ' %s ' % node.value # add sentinels + for matcher, confidence, args, kwargs in strategy: + all_args = [string] + if getattr(matcher, 'use_node', False): + all_args.append(node) + if args: + all_args.append(args) + + if kwargs: + result, span = matcher(*all_args, **kwargs) + else: + result, span = matcher(*all_args) + + if result: + # readjust span to compensate for sentinels + span = (span[0] - 1, span[1] - 1) + + if isinstance(result, Guess): + if confidence is None: + confidence = result.confidence(list(result.keys())[0]) + else: + if confidence is None: + confidence = 1.0 + + guess = format_guess(Guess(result, confidence=confidence, raw=string[span[0] + 1:span[1] + 1])) + msg = 'Found with confidence %.2f: %s' % (confidence, guess) + (logger or log).debug(msg) + + node.partition(span) + absolute_span = (span[0] + node.offset, span[1] + node.offset) + for child in node.children: + if child.span == absolute_span: + child.guess = guess + else: + find_and_split_node(child, strategy, logger) + return + + +class SingleNodeGuesser(object): + def __init__(self, guess_func, confidence, logger, *args, **kwargs): + self.guess_func = guess_func + self.confidence = confidence + self.logger = logger + self.args = args + self.kwargs = kwargs + + def process(self, mtree): + # strategy is a list of pairs (guesser, confidence) + # - if the guesser returns a guessit.Guess and confidence is specified, + # it will override it, otherwise it will leave the guess confidence + # - if the guesser returns a simple dict as a guess and confidence is + # specified, it will use it, or 1.0 otherwise + strategy = [ (self.guess_func, self.confidence, self.args, self.kwargs) ] + + for node in mtree.unidentified_leaves(): + find_and_split_node(node, strategy, self.logger) diff --git a/lib/guessit/transfo/guess_bonus_features.py b/lib/guessit/transfo/guess_bonus_features.py index e904f690..8c7ac013 100644 --- a/lib/guessit/transfo/guess_bonus_features.py +++ b/lib/guessit/transfo/guess_bonus_features.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,50 +18,44 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit.transfo import found_property +import logging -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_property +log = logging.getLogger(__name__) -class GuessBonusFeatures(Transformer): - def __init__(self): - Transformer.__init__(self, -150) +def process(mtree): + def previous_group(g): + for leaf in mtree.unidentified_leaves()[::-1]: + if leaf.node_idx < g.node_idx: + return leaf - def supported_properties(self): - return ['bonusNumber', 'bonusTitle', 'filmNumber', 'filmSeries', 'title', 'series'] + def next_group(g): + for leaf in mtree.unidentified_leaves(): + if leaf.node_idx > g.node_idx: + return leaf - def process(self, mtree, options=None): - def previous_group(g): - for leaf in mtree.unidentified_leaves()[::-1]: - if leaf.node_idx < g.node_idx: - return leaf + def same_group(g1, g2): + return g1.node_idx[:2] == g2.node_idx[:2] - def next_group(g): - for leaf in mtree.unidentified_leaves(): - if leaf.node_idx > g.node_idx: - return leaf + bonus = [ node for node in mtree.leaves() if 'bonusNumber' in node.guess ] + if bonus: + bonusTitle = next_group(bonus[0]) + if same_group(bonusTitle, bonus[0]): + found_property(bonusTitle, 'bonusTitle', 0.8) - def same_group(g1, g2): - return g1.node_idx[:2] == g2.node_idx[:2] + filmNumber = [ node for node in mtree.leaves() + if 'filmNumber' in node.guess ] + if filmNumber: + filmSeries = previous_group(filmNumber[0]) + found_property(filmSeries, 'filmSeries', 0.9) - bonus = [node for node in mtree.leaves() if 'bonusNumber' in node.guess] - if bonus: - bonusTitle = next_group(bonus[0]) - if bonusTitle and same_group(bonusTitle, bonus[0]): - found_property(bonusTitle, 'bonusTitle', confidence=0.8) + title = next_group(filmNumber[0]) + found_property(title, 'title', 0.9) - filmNumber = [node for node in mtree.leaves() - if 'filmNumber' in node.guess] - if filmNumber: - filmSeries = previous_group(filmNumber[0]) - found_property(filmSeries, 'filmSeries', confidence=0.9) - - title = next_group(filmNumber[0]) - found_property(title, 'title', confidence=0.9) - - season = [node for node in mtree.leaves() if 'season' in node.guess] - if season and 'bonusNumber' in mtree.info: - series = previous_group(season[0]) - if same_group(series, season[0]): - found_property(series, 'series', confidence=0.9) + season = [ node for node in mtree.leaves() if 'season' in node.guess ] + if season and 'bonusNumber' in mtree.info: + series = previous_group(season[0]) + if same_group(series, season[0]): + found_property(series, 'series', 0.9) diff --git a/lib/guessit/transfo/guess_country.py b/lib/guessit/transfo/guess_country.py index 6fbbb659..aadb84f7 100644 --- a/lib/guessit/transfo/guess_country.py +++ b/lib/guessit/transfo/guess_country.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,52 +18,31 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer +from __future__ import unicode_literals from guessit.country import Country from guessit import Guess +import logging +log = logging.getLogger(__name__) -class GuessCountry(Transformer): - def __init__(self): - Transformer.__init__(self, -170) - # list of common words which could be interpreted as countries, but which - # are far too common to be able to say they represent a country - self.country_common_words = frozenset(['bt', 'bb']) +# list of common words which could be interpreted as countries, but which +# are far too common to be able to say they represent a country +country_common_words = frozenset([ 'bt', 'bb' ]) - def supported_properties(self): - return ['country'] +def process(mtree): + for node in mtree.unidentified_leaves(): + if len(node.node_idx) == 2: + c = node.value[1:-1].lower() + if c in country_common_words: + continue - def should_process(self, mtree, options=None): - options = options or {} - return 'nocountry' not in options.keys() + # only keep explicit groups (enclosed in parentheses/brackets) + if node.value[0] + node.value[-1] not in ['()', '[]', '{}']: + continue - def process(self, mtree, options=None): - for node in mtree.unidentified_leaves(): - if len(node.node_idx) == 2: - c = node.value[1:-1].lower() - if c in self.country_common_words: - continue + try: + country = Country(c, strict=True) + except ValueError: + continue - # only keep explicit groups (enclosed in parentheses/brackets) - if not node.is_explicit(): - continue - - try: - country = Country(c, strict=True) - except ValueError: - continue - - node.guess = Guess(country=country, confidence=1.0, input=node.value, span=node.span) - - def post_process(self, mtree, options=None, *args, **kwargs): - # if country is in the guessed properties, make it part of the series name - series_leaves = mtree.leaves_containing('series') - country_leaves = mtree.leaves_containing('country') - - if series_leaves and country_leaves: - country_leaf = country_leaves[0] - for serie_leaf in series_leaves: - serie_leaf.guess['series'] += ' (%s)' % country_leaf.guess['country'].alpha2.upper() - #result['series'] += ' (%s)' % result['country'].alpha2.upper() + node.guess = Guess(country=country, confidence=1.0, raw=c) diff --git a/lib/guessit/transfo/guess_date.py b/lib/guessit/transfo/guess_date.py index fa3a62d9..34a85989 100644 --- a/lib/guessit/transfo/guess_date.py +++ b/lib/guessit/transfo/guess_date.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,26 +18,21 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser from guessit.date import search_date +import logging + +log = logging.getLogger(__name__) -class GuessDate(Transformer): - def __init__(self): - Transformer.__init__(self, 50) +def guess_date(string): + date, span = search_date(string) + if date: + return { 'date': date }, span + else: + return None, None - def supported_properties(self): - return ['date'] - def guess_date(self, string, node=None, options=None): - date, span = search_date(string) - if date: - return {'date': date}, span - else: - return None, None - - def process(self, mtree, options=None): - GuessFinder(self.guess_date, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_date, 1.0, log).process(mtree) diff --git a/lib/guessit/transfo/guess_episode_info_from_position.py b/lib/guessit/transfo/guess_episode_info_from_position.py index ca2696d2..967c3341 100644 --- a/lib/guessit/transfo/guess_episode_info_from_position.py +++ b/lib/guessit/transfo/guess_episode_info_from_position.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,145 +18,129 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit.transfo import found_property +from guessit.patterns import non_episode_title, unlikely_series +import logging -from guessit.plugins.transformers import Transformer, get_transformer -from guessit.textutils import reorder_title - -from guessit.matcher import found_property +log = logging.getLogger(__name__) -class GuessEpisodeInfoFromPosition(Transformer): - def __init__(self): - Transformer.__init__(self, -200) +def match_from_epnum_position(mtree, node): + epnum_idx = node.node_idx - def supported_properties(self): - return ['title', 'series'] + # a few helper functions to be able to filter using high-level semantics + def before_epnum_in_same_pathgroup(): + return [ leaf for leaf in mtree.unidentified_leaves() + if (leaf.node_idx[0] == epnum_idx[0] and + leaf.node_idx[1:] < epnum_idx[1:]) ] - def match_from_epnum_position(self, mtree, node): - epnum_idx = node.node_idx + def after_epnum_in_same_pathgroup(): + return [ leaf for leaf in mtree.unidentified_leaves() + if (leaf.node_idx[0] == epnum_idx[0] and + leaf.node_idx[1:] > epnum_idx[1:]) ] - # a few helper functions to be able to filter using high-level semantics - def before_epnum_in_same_pathgroup(): - return [leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] < epnum_idx[1:])] + def after_epnum_in_same_explicitgroup(): + return [ leaf for leaf in mtree.unidentified_leaves() + if (leaf.node_idx[:2] == epnum_idx[:2] and + leaf.node_idx[2:] > epnum_idx[2:]) ] - def after_epnum_in_same_pathgroup(): - return [leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] > epnum_idx[1:])] + # epnumber is the first group and there are only 2 after it in same + # path group + # -> series title - episode title + title_candidates = [ n for n in after_epnum_in_same_pathgroup() + if n.clean_value.lower() not in non_episode_title ] + if ('title' not in mtree.info and # no title + before_epnum_in_same_pathgroup() == [] and # no groups before + len(title_candidates) == 2): # only 2 groups after - def after_epnum_in_same_explicitgroup(): - return [leaf for leaf in mtree.unidentified_leaves() - if (leaf.node_idx[:2] == epnum_idx[:2] and - leaf.node_idx[2:] > epnum_idx[2:])] + found_property(title_candidates[0], 'series', confidence=0.4) + found_property(title_candidates[1], 'title', confidence=0.4) + return - # epnumber is the first group and there are only 2 after it in same - # path group - # -> series title - episode title - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup()) + # if we have at least 1 valid group before the episodeNumber, then it's + # probably the series name + series_candidates = before_epnum_in_same_pathgroup() + if len(series_candidates) >= 1: + found_property(series_candidates[0], 'series', confidence=0.7) - if ('title' not in mtree.info and # no title - before_epnum_in_same_pathgroup() == [] and # no groups before - len(title_candidates) == 2): # only 2 groups after - - found_property(title_candidates[0], 'series', confidence=0.4) - found_property(title_candidates[1], 'title', confidence=0.4) - return - - # if we have at least 1 valid group before the episodeNumber, then it's - # probably the series name - series_candidates = before_epnum_in_same_pathgroup() - if len(series_candidates) >= 1: - found_property(series_candidates[0], 'series', confidence=0.7) - - # only 1 group after (in the same path group) and it's probably the - # episode title - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup()) + # only 1 group after (in the same path group) and it's probably the + # episode title + title_candidates = [ n for n in after_epnum_in_same_pathgroup() + if n.clean_value.lower() not in non_episode_title ] + if len(title_candidates) == 1: + found_property(title_candidates[0], 'title', confidence=0.5) + return + else: + # try in the same explicit group, with lower confidence + title_candidates = [ n for n in after_epnum_in_same_explicitgroup() + if n.clean_value.lower() not in non_episode_title + ] if len(title_candidates) == 1: - found_property(title_candidates[0], 'title', confidence=0.5) + found_property(title_candidates[0], 'title', confidence=0.4) + return + elif len(title_candidates) > 1: + found_property(title_candidates[0], 'title', confidence=0.3) return - else: - # try in the same explicit group, with lower confidence - title_candidates = self._filter_candidates(after_epnum_in_same_explicitgroup()) - if len(title_candidates) == 1: - found_property(title_candidates[0], 'title', confidence=0.4) - return - elif len(title_candidates) > 1: - found_property(title_candidates[0], 'title', confidence=0.3) - return - # get the one with the longest value - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup()) - if title_candidates: - maxidx = -1 - maxv = -1 - for i, c in enumerate(title_candidates): - if len(c.clean_value) > maxv: - maxidx = i - maxv = len(c.clean_value) - found_property(title_candidates[maxidx], 'title', confidence=0.3) + # get the one with the longest value + title_candidates = [ n for n in after_epnum_in_same_pathgroup() + if n.clean_value.lower() not in non_episode_title ] + if title_candidates: + maxidx = -1 + maxv = -1 + for i, c in enumerate(title_candidates): + if len(c.clean_value) > maxv: + maxidx = i + maxv = len(c.clean_value) + found_property(title_candidates[maxidx], 'title', confidence=0.3) - def should_process(self, mtree, options=None): - options = options or {} - return not options.get('skip_title') and mtree.guess.get('type', '').startswith('episode') - def _filter_candidates(self, candidates): - episode_special_transformer = get_transformer('guess_episode_special') - if episode_special_transformer: - return [n for n in candidates if not episode_special_transformer.container.find_properties(n.value, n, re_match=True)] - else: - return candidates +def process(mtree): + eps = [node for node in mtree.leaves() if 'episodeNumber' in node.guess] + if eps: + match_from_epnum_position(mtree, eps[0]) - def process(self, mtree, options=None): - """ - try to identify the remaining unknown groups by looking at their - position relative to other known elements - """ - eps = [node for node in mtree.leaves() if 'episodeNumber' in node.guess] - if eps: - self.match_from_epnum_position(mtree, eps[0]) + else: + # if we don't have the episode number, but at least 2 groups in the + # basename, then it's probably series - eptitle + basename = mtree.node_at((-2,)) + title_candidates = [ n for n in basename.unidentified_leaves() + if n.clean_value.lower() not in non_episode_title + ] - else: - # if we don't have the episode number, but at least 2 groups in the - # basename, then it's probably series - eptitle - basename = mtree.node_at((-2,)) + if len(title_candidates) >= 2: + found_property(title_candidates[0], 'series', 0.4) + found_property(title_candidates[1], 'title', 0.4) + elif len(title_candidates) == 1: + # but if there's only one candidate, it's probably the series name + found_property(title_candidates[0], 'series', 0.4) - title_candidates = self._filter_candidates(basename.unidentified_leaves()) + # if we only have 1 remaining valid group in the folder containing the + # file, then it's likely that it is the series name + try: + series_candidates = mtree.node_at((-3,)).unidentified_leaves() + except ValueError: + series_candidates = [] - if len(title_candidates) >= 2: - found_property(title_candidates[0], 'series', confidence=0.4) - found_property(title_candidates[1], 'title', confidence=0.4) - elif len(title_candidates) == 1: - # but if there's only one candidate, it's probably the series name - found_property(title_candidates[0], 'series', confidence=0.4) + if len(series_candidates) == 1: + found_property(series_candidates[0], 'series', 0.3) - # if we only have 1 remaining valid group in the folder containing the - # file, then it's likely that it is the series name - try: - series_candidates = mtree.node_at((-3,)).unidentified_leaves() - except ValueError: - series_candidates = [] + # if there's a path group that only contains the season info, then the + # previous one is most likely the series title (ie: ../series/season X/..) + eps = [ node for node in mtree.nodes() + if 'season' in node.guess and 'episodeNumber' not in node.guess ] - if len(series_candidates) == 1: - found_property(series_candidates[0], 'series', confidence=0.3) + if eps: + previous = [ node for node in mtree.unidentified_leaves() + if node.node_idx[0] == eps[0].node_idx[0] - 1 ] + if len(previous) == 1: + found_property(previous[0], 'series', 0.5) - # if there's a path group that only contains the season info, then the - # previous one is most likely the series title (ie: ../series/season X/..) - eps = [node for node in mtree.nodes() - if 'season' in node.guess and 'episodeNumber' not in node.guess] - - if eps: - previous = [node for node in mtree.unidentified_leaves() - if node.node_idx[0] == eps[0].node_idx[0] - 1] - if len(previous) == 1: - found_property(previous[0], 'series', confidence=0.5) - - def post_process(self, mtree, options=None): - for node in mtree.nodes(): - if 'series' not in node.guess: - continue - - node.guess['series'] = reorder_title(node.guess['series']) + # reduce the confidence of unlikely series + for node in mtree.nodes(): + if 'series' in node.guess: + if node.guess['series'].lower() in unlikely_series: + new_confidence = node.guess.confidence('series') * 0.5 + node.guess.set_confidence('series', new_confidence) diff --git a/lib/guessit/transfo/guess_episode_special.py b/lib/guessit/transfo/guess_episode_special.py deleted file mode 100644 index ac497588..00000000 --- a/lib/guessit/transfo/guess_episode_special.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_guess -from guessit.containers import PropertiesContainer - - -class GuessEpisodeSpecial(Transformer): - def __init__(self): - Transformer.__init__(self, -205) - self.container = PropertiesContainer() - self.container.register_property('special', 'Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired') - self.container.register_property('special', 'Extras?', canonical_form='Extras') - - def guess_special(self, string, node=None, options=None): - properties = self.container.find_properties(string, node, 'special', multiple=True) - guesses = self.container.as_guess(properties, multiple=True) - return guesses - - def second_pass_options(self, mtree, options=None): - if not mtree.guess.get('type', '').startswith('episode'): - for unidentified_leaf in mtree.unidentified_leaves(): - properties = self.container.find_properties(unidentified_leaf.value, unidentified_leaf, 'special') - guess = self.container.as_guess(properties) - if guess: - return {'type': 'episode'} - return None - - def supported_properties(self): - return self.container.get_supported_properties() - - def process(self, mtree, options=None): - if mtree.guess.get('type', '').startswith('episode') and (not mtree.info.get('episodeNumber') or mtree.info.get('season') == 0): - for title_leaf in mtree.leaves_containing('title'): - guesses = self.guess_special(title_leaf.value, title_leaf, options) - for guess in guesses: - found_guess(title_leaf, guess, update_guess=False) - for unidentified_leaf in mtree.unidentified_leaves(): - guesses = self.guess_special(unidentified_leaf.value, unidentified_leaf, options) - for guess in guesses: - found_guess(unidentified_leaf, guess, update_guess=False) - return None diff --git a/lib/guessit/transfo/guess_episodes_rexps.py b/lib/guessit/transfo/guess_episodes_rexps.py index 8ec17add..30c2ca2f 100644 --- a/lib/guessit/transfo/guess_episodes_rexps.py +++ b/lib/guessit/transfo/guess_episodes_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,63 +18,49 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit import Guess +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import episode_rexps +import re +import logging -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns import sep -from guessit.containers import PropertiesContainer, WeakValidator, NoValidator -from guessit.patterns.numeral import numeral, digital_numeral, parse_numeral -from re import split as re_split +log = logging.getLogger(__name__) + +def number_list(s): + l = [ int(n) for n in re.sub('[^0-9]+', ' ', s).split() ] + + if len(l) == 2: + # it is an episode interval, return all numbers in between + return range(l[0], l[1]+1) + + return l + +def guess_episodes_rexps(string): + for rexp, confidence, span_adjust in episode_rexps: + match = re.search(rexp, string, re.IGNORECASE) + if match: + span = (match.start() + span_adjust[0], + match.end() + span_adjust[1]) + guess = Guess(match.groupdict(), confidence=confidence, raw=string[span[0]:span[1]]) + + # decide whether we have only a single episode number or an + # episode list + if guess.get('episodeNumber'): + eplist = number_list(guess['episodeNumber']) + guess.set('episodeNumber', eplist[0], confidence=confidence, raw=string[span[0]:span[1]]) + + if len(eplist) > 1: + guess.set('episodeList', eplist, confidence=confidence, raw=string[span[0]:span[1]]) + + if guess.get('bonusNumber'): + eplist = number_list(guess['bonusNumber']) + guess.set('bonusNumber', eplist[0], confidence=confidence, raw=string[span[0]:span[1]]) + + return guess, span + + return None, None -class GuessEpisodesRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 20) - - self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - - def episode_parser(value): - values = re_split('[a-zA-Z]', value) - values = [x for x in values if x] - ret = [] - for letters_elt in values: - dashed_values = letters_elt.split('-') - dashed_values = [x for x in dashed_values if x] - if len(dashed_values) > 1: - for _ in range(0, len(dashed_values) - 1): - start_dash_ep = parse_numeral(dashed_values[0]) - end_dash_ep = parse_numeral(dashed_values[1]) - for dash_ep in range(start_dash_ep, end_dash_ep + 1): - ret.append(dash_ep) - else: - ret.append(parse_numeral(letters_elt)) - if len(ret) > 1: - return {None: ret[0], 'episodeList': ret} # TODO: Should support seasonList also - elif len(ret) > 0: - return ret[0] - else: - return None - - self.container.register_property(None, r'((?:season|saison)' + sep + '?(?P' + numeral + '))', confidence=1.0, formatter=parse_numeral) - self.container.register_property(None, r'(s(?P' + digital_numeral + ')[^0-9]?' + sep + '?(?P(?:e' + digital_numeral + '(?:' + sep + '?[e-]' + digital_numeral + ')*)))[^0-9]', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser}, validator=NoValidator()) - self.container.register_property(None, r'[^0-9]((?P' + digital_numeral + ')[^0-9 .-]?-?(?P(?:x' + digital_numeral + '(?:' + sep + '?[x-]' + digital_numeral + ')*)))[^0-9]', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) - self.container.register_property(None, r'(s(?P' + digital_numeral + '))[^0-9]', confidence=0.6, formatter=parse_numeral, validator=NoValidator()) - self.container.register_property(None, r'((?P' + digital_numeral + ')v[23])', confidence=0.6, formatter=parse_numeral) - self.container.register_property(None, r'((?:ep)' + sep + r'(?P' + numeral + '))[^0-9]', confidence=0.7, formatter=parse_numeral) - self.container.register_property(None, r'(e(?P' + digital_numeral + '))', confidence=0.6, formatter=parse_numeral) - - self.container.register_canonical_properties('other', 'FiNAL', 'Complete', validator=WeakValidator()) - - def supported_properties(self): - return ['episodeNumber', 'season'] - - def guess_episodes_rexps(self, string, node=None, options=None): - found = self.container.find_properties(string, node) - return self.container.as_guess(found, string) - - def should_process(self, mtree, options=None): - return mtree.guess.get('type', '').startswith('episode') - - def process(self, mtree, options=None): - GuessFinder(self.guess_episodes_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_episodes_rexps, None, log).process(mtree) diff --git a/lib/guessit/transfo/guess_filetype.py b/lib/guessit/transfo/guess_filetype.py index 461046c1..4279c0b0 100644 --- a/lib/guessit/transfo/guess_filetype.py +++ b/lib/guessit/transfo/guess_filetype.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,196 +18,182 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -import mimetypes +from __future__ import unicode_literals +from guessit import Guess +from guessit.patterns import (subtitle_exts, info_exts, video_exts, episode_rexps, + find_properties, compute_canonical_form) +from guessit.date import valid_year +from guessit.textutils import clean_string import os.path import re +import mimetypes +import logging -from guessit.guess import Guess -from guessit.patterns.extension import subtitle_exts, info_exts, video_exts -from guessit.transfo import TransformerException -from guessit.plugins.transformers import Transformer, get_transformer -from guessit.matcher import log_found_guess, found_guess -from guessit.textutils import clean_string +log = logging.getLogger(__name__) +# List of well known movies and series, hardcoded because they cannot be +# guessed appropriately otherwise +MOVIES = [ 'OSS 117' ] +SERIES = [ 'Band of Brothers' ] -class GuessFiletype(Transformer): - def __init__(self): - Transformer.__init__(self, 250) +MOVIES = [ m.lower() for m in MOVIES ] +SERIES = [ s.lower() for s in SERIES ] - # List of well known movies and series, hardcoded because they cannot be - # guessed appropriately otherwise - MOVIES = ['OSS 117'] - SERIES = ['Band of Brothers'] +def guess_filetype(mtree, filetype): + # put the filetype inside a dummy container to be able to have the + # following functions work correctly as closures + # this is a workaround for python 2 which doesn't have the + # 'nonlocal' keyword (python 3 does have it) + filetype_container = [filetype] + other = {} + filename = mtree.string - MOVIES = [m.lower() for m in MOVIES] - SERIES = [s.lower() for s in SERIES] + def upgrade_episode(): + if filetype_container[0] == 'video': + filetype_container[0] = 'episode' + elif filetype_container[0] == 'subtitle': + filetype_container[0] = 'episodesubtitle' + elif filetype_container[0] == 'info': + filetype_container[0] = 'episodeinfo' - def guess_filetype(self, mtree, options=None): - options = options or {} + def upgrade_movie(): + if filetype_container[0] == 'video': + filetype_container[0] = 'movie' + elif filetype_container[0] == 'subtitle': + filetype_container[0] = 'moviesubtitle' + elif filetype_container[0] == 'info': + filetype_container[0] = 'movieinfo' - # put the filetype inside a dummy container to be able to have the - # following functions work correctly as closures - # this is a workaround for python 2 which doesn't have the - # 'nonlocal' keyword which we could use here in the upgrade_* functions - # (python 3 does have it) - filetype_container = [mtree.guess.get('type')] - other = {} - filename = mtree.string - - def upgrade_episode(): - if filetype_container[0] == 'subtitle': - filetype_container[0] = 'episodesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'episodeinfo' - elif not filetype_container[0]: - filetype_container[0] = 'episode' - - def upgrade_movie(): - if filetype_container[0] == 'subtitle': - filetype_container[0] = 'moviesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'movieinfo' - elif not filetype_container[0]: - filetype_container[0] = 'movie' - - def upgrade_subtitle(): - if filetype_container[0] == 'movie': - filetype_container[0] = 'moviesubtitle' - elif filetype_container[0] == 'episode': - filetype_container[0] = 'episodesubtitle' - elif not filetype_container[0]: - filetype_container[0] = 'subtitle' - - def upgrade_info(): - if filetype_container[0] == 'movie': - filetype_container[0] = 'movieinfo' - elif filetype_container[0] == 'episode': - filetype_container[0] = 'episodeinfo' - elif not filetype_container[0]: - filetype_container[0] = 'info' - - # look at the extension first - fileext = os.path.splitext(filename)[1][1:].lower() - if fileext in subtitle_exts: - upgrade_subtitle() - other = {'container': fileext} - elif fileext in info_exts: - upgrade_info() - other = {'container': fileext} - elif fileext in video_exts: - other = {'container': fileext} + def upgrade_subtitle(): + if 'movie' in filetype_container[0]: + filetype_container[0] = 'moviesubtitle' + elif 'episode' in filetype_container[0]: + filetype_container[0] = 'episodesubtitle' else: - if fileext and not options.get('name_only'): - other = {'extension': fileext} + filetype_container[0] = 'subtitle' - # check whether we are in a 'Movies', 'Tv Shows', ... folder - folder_rexps = [ - (r'Movies?', upgrade_movie), - (r'Films?', upgrade_movie), - (r'Tv[ _-]?Shows?', upgrade_episode), - (r'Series?', upgrade_episode), - (r'Episodes?', upgrade_episode), - ] - for frexp, upgrade_func in folder_rexps: - frexp = re.compile(frexp, re.IGNORECASE) - for pathgroup in mtree.children: - if frexp.match(pathgroup.value): - upgrade_func() - return filetype_container[0], other + def upgrade_info(): + if 'movie' in filetype_container[0]: + filetype_container[0] = 'movieinfo' + elif 'episode' in filetype_container[0]: + filetype_container[0] = 'episodeinfo' + else: + filetype_container[0] = 'info' - # check for a few specific cases which will unintentionally make the - # following heuristics confused (eg: OSS 117 will look like an episode, - # season 1, epnum 17, when it is in fact a movie) - fname = clean_string(filename).lower() - for m in self.MOVIES: - if m in fname: - self.log.debug('Found in exception list of movies -> type = movie') - upgrade_movie() - return filetype_container[0], other - for s in self.SERIES: - if s in fname: - self.log.debug('Found in exception list of series -> type = episode') - upgrade_episode() - return filetype_container[0], other + def upgrade(type='unknown'): + if filetype_container[0] == 'autodetect': + filetype_container[0] = type - # now look whether there are some specific hints for episode vs movie - # if we have an episode_rexp (eg: s02e13), it is an episode - episode_transformer = get_transformer('guess_episodes_rexps') - if episode_transformer: - guess = episode_transformer.guess_episodes_rexps(filename) - if guess: - self.log.debug('Found guess_episodes_rexps: %s -> type = episode', guess) - upgrade_episode() - return filetype_container[0], other - properties_transformer = get_transformer('guess_properties') - if properties_transformer: - # if we have certain properties characteristic of episodes, it is an ep - found = properties_transformer.container.find_properties(filename, mtree, 'episodeFormat') - guess = properties_transformer.container.as_guess(found, filename) - if guess: - self.log.debug('Found characteristic property of episodes: %s"', guess) - upgrade_episode() - return filetype_container[0], other + # look at the extension first + fileext = os.path.splitext(filename)[1][1:].lower() + if fileext in subtitle_exts: + upgrade_subtitle() + other = { 'container': fileext } + elif fileext in info_exts: + upgrade_info() + other = { 'container': fileext } + elif fileext in video_exts: + upgrade(type='video') + other = { 'container': fileext } + else: + upgrade(type='unknown') + other = { 'extension': fileext } - found = properties_transformer.container.find_properties(filename, mtree, 'format') - guess = properties_transformer.container.as_guess(found, filename) - if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'): - # Use weak episodes only if TV or WEB source - weak_episode_transformer = get_transformer('guess_weak_episodes_rexps') - if weak_episode_transformer: - guess = weak_episode_transformer.guess_weak_episodes_rexps(filename) - if guess: - self.log.debug('Found guess_weak_episodes_rexps: %s -> type = episode', guess) - upgrade_episode() - return filetype_container[0], other - website_transformer = get_transformer('guess_website') - if website_transformer: - found = website_transformer.container.find_properties(filename, mtree, 'website') - guess = website_transformer.container.as_guess(found, filename) - if guess: - for namepart in ('tv', 'serie', 'episode'): - if namepart in guess['website']: - # origin-specific type - self.log.debug('Found characteristic property of episodes: %s', guess) - upgrade_episode() - return filetype_container[0], other - if filetype_container[0] in ('subtitle', 'info') or (not filetype_container[0] and fileext in video_exts): - # if no episode info found, assume it's a movie - self.log.debug('Nothing characteristic found, assuming type = movie') + # check whether we are in a 'Movies', 'Tv Shows', ... folder + folder_rexps = [ (r'Movies?', upgrade_movie), + (r'Tv[ _-]?Shows?', upgrade_episode), + (r'Series', upgrade_episode) + ] + for frexp, upgrade_func in folder_rexps: + frexp = re.compile(frexp, re.IGNORECASE) + for pathgroup in mtree.children: + if frexp.match(pathgroup.value): + upgrade_func() + + # check for a few specific cases which will unintentionally make the + # following heuristics confused (eg: OSS 117 will look like an episode, + # season 1, epnum 17, when it is in fact a movie) + fname = clean_string(filename).lower() + for m in MOVIES: + if m in fname: + log.debug('Found in exception list of movies -> type = movie') upgrade_movie() + for s in SERIES: + if s in fname: + log.debug('Found in exception list of series -> type = episode') + upgrade_episode() - if not filetype_container[0]: - self.log.debug('Nothing characteristic found, assuming type = unknown') - filetype_container[0] = 'unknown' + # now look whether there are some specific hints for episode vs movie + if filetype_container[0] in ('video', 'subtitle', 'info'): + # if we have an episode_rexp (eg: s02e13), it is an episode + for rexp, _, _ in episode_rexps: + match = re.search(rexp, filename, re.IGNORECASE) + if match: + log.debug('Found matching regexp: "%s" (string = "%s") -> type = episode', rexp, match.group()) + upgrade_episode() + break - return filetype_container[0], other + # if we have a 3-4 digit number that's not a year, maybe an episode + match = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename) + if match: + fullnumber = int(match.group()[1:-1]) + #season = fullnumber // 100 + epnumber = fullnumber % 100 + possible = True - def process(self, mtree, options=None): - """guess the file type now (will be useful later) - """ - filetype, other = self.guess_filetype(mtree, options) + # check for validity + if epnumber > 40: + possible = False + if valid_year(fullnumber): + possible = False - mtree.guess.set('type', filetype, confidence=1.0) - log_found_guess(mtree.guess) + if possible: + log.debug('Found possible episode number: %s (from string "%s") -> type = episode', epnumber, match.group()) + upgrade_episode() - filetype_info = Guess(other, confidence=1.0) - # guess the mimetype of the filename - # TODO: handle other mimetypes not found on the default type_maps - # mimetypes.types_map['.srt']='text/subtitle' - mime, _ = mimetypes.guess_type(mtree.string, strict=False) - if mime is not None: - filetype_info.update({'mimetype': mime}, confidence=1.0) + # if we have certain properties characteristic of episodes, it is an ep + for prop, value, _, _ in find_properties(filename): + log.debug('prop: %s = %s' % (prop, value)) + if prop == 'episodeFormat': + log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) + upgrade_episode() + break - node_ext = mtree.node_at((-1,)) - found_guess(node_ext, filetype_info) + elif compute_canonical_form('format', value) == 'DVB': + log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) + upgrade_episode() + break - if mtree.guess.get('type') in [None, 'unknown']: - if options.get('name_only'): - mtree.guess.set('type', 'movie', confidence=0.6) - else: - raise TransformerException(__name__, 'Unknown file type') + # origin-specific type + if 'tvu.org.ru' in filename: + log.debug('Found characteristic property of episodes: %s = "%s"', prop, value) + upgrade_episode() + + # if no episode info found, assume it's a movie + log.debug('Nothing characteristic found, assuming type = movie') + upgrade_movie() + + filetype = filetype_container[0] + return filetype, other + + +def process(mtree, filetype='autodetect'): + filetype, other = guess_filetype(mtree, filetype) + + mtree.guess.set('type', filetype, confidence=1.0) + log.debug('Found with confidence %.2f: %s' % (1.0, mtree.guess)) + + filetype_info = Guess(other, confidence=1.0) + # guess the mimetype of the filename + # TODO: handle other mimetypes not found on the default type_maps + # mimetypes.types_map['.srt']='text/subtitle' + mime, _ = mimetypes.guess_type(mtree.string, strict=False) + if mime is not None: + filetype_info.update({'mimetype': mime}, confidence=1.0) + + node_ext = mtree.node_at((-1,)) + node_ext.guess = filetype_info + log.debug('Found with confidence %.2f: %s' % (1.0, node_ext.guess)) diff --git a/lib/guessit/transfo/guess_idnumber.py b/lib/guessit/transfo/guess_idnumber.py index 7cf368fd..0e15af5c 100644 --- a/lib/guessit/transfo/guess_idnumber.py +++ b/lib/guessit/transfo/guess_idnumber.py @@ -18,52 +18,54 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import find_properties import re +import logging + +log = logging.getLogger(__name__) -class GuessIdnumber(Transformer): - def __init__(self): - Transformer.__init__(self, -180) - - def supported_properties(self): - return ['idNumber'] - - _idnum = re.compile(r'(?P[a-zA-Z0-9-]{20,})') # 1.0, (0, 0)) - - def guess_idnumber(self, string, node=None, options=None): - match = self._idnum.search(string) - if match is not None: - result = match.groupdict() - switch_count = 0 - DIGIT = 0 - LETTER = 1 - OTHER = 2 - last = LETTER - for c in result['idNumber']: - if c in '0123456789': - ci = DIGIT - elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': - ci = LETTER - else: - ci = OTHER - - if ci != last: - switch_count += 1 - - last = ci - - switch_ratio = float(switch_count) / len(result['idNumber']) - - # only return the result as probable if we alternate often between - # char type (more likely for hash values than for common words) - if switch_ratio > 0.4: - return result, match.span() - +def guess_properties(string): + try: + prop, value, pos, end = find_properties(string)[0] + return { prop: value }, (pos, end) + except IndexError: return None, None - def process(self, mtree, options=None): - GuessFinder(self.guess_idnumber, 0.4, self.log, options).process_nodes(mtree.unidentified_leaves()) +_idnum = re.compile(r'(?P[a-zA-Z0-9-]{10,})') # 1.0, (0, 0)) + +def guess_idnumber(string): + match = _idnum.search(string) + if match is not None: + result = match.groupdict() + switch_count = 0 + DIGIT = 0 + LETTER = 1 + OTHER = 2 + last = LETTER + for c in result['idNumber']: + if c in '0123456789': + ci = DIGIT + elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + ci = LETTER + else: + ci = OTHER + + if ci != last: + switch_count += 1 + + last = ci + + switch_ratio = float(switch_count) / len(result['idNumber']) + + # only return the result as probable if we alternate often between + # char type (more likely for hash values than for common words) + if switch_ratio > 0.4: + return result, match.span() + + return None, None + +def process(mtree): + SingleNodeGuesser(guess_idnumber, 0.4, log).process(mtree) diff --git a/lib/guessit/transfo/guess_language.py b/lib/guessit/transfo/guess_language.py index 00e6203e..648a06b1 100644 --- a/lib/guessit/transfo/guess_language.py +++ b/lib/guessit/transfo/guess_language.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,152 +18,38 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit import Guess +from guessit.transfo import SingleNodeGuesser +from guessit.language import search_language +import logging -from guessit.language import search_language, subtitle_prefixes, subtitle_suffixes -from guessit.patterns.extension import subtitle_exts -from guessit.textutils import clean_string, find_words -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder +log = logging.getLogger(__name__) -class GuessLanguage(Transformer): - def __init__(self): - Transformer.__init__(self, 30) +def guess_language(string, node, skip=None): + if skip: + relative_skip = [] + for entry in skip: + node_idx = entry['node_idx'] + span = entry['span'] + if node_idx == node.node_idx[:len(node_idx)]: + relative_span = (span[0] - node.offset + 1, span[1] - node.offset + 1) + relative_skip.append(relative_span) + skip = relative_skip - def supported_properties(self): - return ['language', 'subtitleLanguage'] + language, span, confidence = search_language(string, skip=skip) + if language: + return (Guess({'language': language}, + confidence=confidence, + raw= string[span[0]:span[1]]), + span) - def guess_language(self, string, node=None, options=None): - guess = search_language(string) - return guess + return None, None - def _skip_language_on_second_pass(self, mtree, node): - """Check if found node is a valid language node, or if it's a false positive. +guess_language.use_node = True - :param mtree: Tree detected on first pass. - :type mtree: :class:`guessit.matchtree.MatchTree` - :param node: Node that contains a language Guess - :type node: :class:`guessit.matchtree.MatchTree` - :return: True if a second pass skipping this node is required - :rtype: bool - """ - unidentified_starts = {} - unidentified_ends = {} - - property_starts = {} - property_ends = {} - - title_starts = {} - title_ends = {} - - for unidentified_node in mtree.unidentified_leaves(): - unidentified_starts[unidentified_node.span[0]] = unidentified_node - unidentified_ends[unidentified_node.span[1]] = unidentified_node - - for property_node in mtree.leaves_containing('year'): - property_starts[property_node.span[0]] = property_node - property_ends[property_node.span[1]] = property_node - - for title_node in mtree.leaves_containing(['title', 'series']): - title_starts[title_node.span[0]] = title_node - title_ends[title_node.span[1]] = title_node - - return node.span[0] in title_ends.keys() and (node.span[1] in unidentified_starts.keys() or node.span[1] + 1 in property_starts.keys()) or\ - node.span[1] in title_starts.keys() and (node.span[0] == 0 or node.span[0] in unidentified_ends.keys() or node.span[0] in property_ends.keys()) - - def second_pass_options(self, mtree, options=None): - m = mtree.matched() - to_skip_language_nodes = [] - - for lang_key in ('language', 'subtitleLanguage'): - langs = {} - lang_nodes = set(n for n in mtree.leaves_containing(lang_key)) - - for lang_node in lang_nodes: - lang = lang_node.guess.get(lang_key, None) - if self._skip_language_on_second_pass(mtree, lang_node): - # Language probably split the title. Add to skip for 2nd pass. - - # if filetype is subtitle and the language appears last, just before - # the extension, then it is likely a subtitle language - parts = clean_string(lang_node.root.value).split() - if (m.get('type') in ['moviesubtitle', 'episodesubtitle'] and - (parts.index(lang_node.value) == len(parts) - 2)): - continue - - to_skip_language_nodes.append(lang_node) - elif not lang in langs: - langs[lang] = lang_node - else: - # The same language was found. Keep the more confident one, - # and add others to skip for 2nd pass. - existing_lang_node = langs[lang] - to_skip = None - if (existing_lang_node.guess.confidence('language') >= - lang_node.guess.confidence('language')): - # lang_node is to remove - to_skip = lang_node - else: - # existing_lang_node is to remove - langs[lang] = lang_node - to_skip = existing_lang_node - to_skip_language_nodes.append(to_skip) - - if to_skip_language_nodes: - return {'skip_nodes': to_skip_language_nodes} - return None - - def should_process(self, mtree, options=None): - options = options or {} - return 'nolanguage' not in options - - def process(self, mtree, options=None): - GuessFinder(self.guess_language, None, self.log, options).process_nodes(mtree.unidentified_leaves()) - - def promote_subtitle(self, node): - node.guess.set('subtitleLanguage', node.guess['language'], - confidence=node.guess.confidence('language')) - del node.guess['language'] - - def post_process(self, mtree, options=None): - # 1- try to promote language to subtitle language where it makes sense - for node in mtree.nodes(): - if 'language' not in node.guess: - continue - - # - if we matched a language in a file with a sub extension and that - # the group is the last group of the filename, it is probably the - # language of the subtitle - # (eg: 'xxx.english.srt') - if (mtree.node_at((-1,)).value.lower() in subtitle_exts and - node == mtree.leaves()[-2]): - self.promote_subtitle(node) - - # - if we find in the same explicit group - # a subtitle prefix before the language, - # or a subtitle suffix after the language, - # then upgrade the language - explicit_group = mtree.node_at(node.node_idx[:2]) - group_str = explicit_group.value.lower() - - for sub_prefix in subtitle_prefixes: - if (sub_prefix in find_words(group_str) and - 0 <= group_str.find(sub_prefix) < (node.span[0] - explicit_group.span[0])): - self.promote_subtitle(node) - - for sub_suffix in subtitle_suffixes: - if (sub_suffix in find_words(group_str) and - (node.span[0] - explicit_group.span[0]) < group_str.find(sub_suffix)): - self.promote_subtitle(node) - - # - if a language is in an explicit group just preceded by "st", - # it is a subtitle language (eg: '...st[fr-eng]...') - try: - idx = node.node_idx - previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1] - if previous.value.lower()[-2:] == 'st': - self.promote_subtitle(node) - except IndexError: - pass +def process(mtree, *args, **kwargs): + SingleNodeGuesser(guess_language, None, log, *args, **kwargs).process(mtree) + # Note: 'language' is promoted to 'subtitleLanguage' in the post_process transfo diff --git a/lib/guessit/transfo/guess_movie_title_from_position.py b/lib/guessit/transfo/guess_movie_title_from_position.py index bfab1c89..bcb42b45 100644 --- a/lib/guessit/transfo/guess_movie_title_from_position.py +++ b/lib/guessit/transfo/guess_movie_title_from_position.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,160 +18,157 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit import Guess +import unicodedata +import logging -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_property -from guessit import u +log = logging.getLogger(__name__) -class GuessMovieTitleFromPosition(Transformer): - def __init__(self): - Transformer.__init__(self, -200) +def process(mtree): + def found_property(node, name, value, confidence): + node.guess = Guess({ name: value }, + confidence=confidence, + raw=value) + log.debug('Found with confidence %.2f: %s' % (confidence, node.guess)) - def supported_properties(self): - return ['title'] + def found_title(node, confidence): + found_property(node, 'title', node.clean_value, confidence) - def should_process(self, mtree, options=None): - options = options or {} - return not options.get('skip_title') and not mtree.guess.get('type', '').startswith('episode') + basename = mtree.node_at((-2,)) + all_valid = lambda leaf: len(leaf.clean_value) > 0 + basename_leftover = basename.unidentified_leaves(valid=all_valid) - def process(self, mtree, options=None): - """ - try to identify the remaining unknown groups by looking at their - position relative to other known elements - """ - basename = mtree.node_at((-2,)) - all_valid = lambda leaf: len(leaf.clean_value) > 0 - basename_leftover = basename.unidentified_leaves(valid=all_valid) + try: + folder = mtree.node_at((-3,)) + folder_leftover = folder.unidentified_leaves() + except ValueError: + folder = None + folder_leftover = [] - try: + log.debug('folder: %s' % folder_leftover) + log.debug('basename: %s' % basename_leftover) + + # specific cases: + # if we find the same group both in the folder name and the filename, + # it's a good candidate for title + if (folder_leftover and basename_leftover and + folder_leftover[0].clean_value == basename_leftover[0].clean_value): + + found_title(folder_leftover[0], confidence=0.8) + return + + # specific cases: + # if the basename contains a number first followed by an unidentified + # group, and the folder only contains 1 unidentified one, then we have + # a series + # ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv + try: + series = folder_leftover[0] + filmNumber = basename_leftover[0] + title = basename_leftover[1] + + basename_leaves = basename.leaves() + + num = int(filmNumber.clean_value) + + log.debug('series: %s' % series.clean_value) + log.debug('title: %s' % title.clean_value) + if (series.clean_value != title.clean_value and + series.clean_value != filmNumber.clean_value and + basename_leaves.index(filmNumber) == 0 and + basename_leaves.index(title) == 1): + + found_title(title, confidence=0.6) + found_property(series, 'filmSeries', + series.clean_value, confidence=0.6) + found_property(filmNumber, 'filmNumber', + num, confidence=0.6) + return + except Exception: + pass + + # specific cases: + # - movies/tttttt (yyyy)/tttttt.ccc + try: + if mtree.node_at((-4, 0)).value.lower() == 'movies': folder = mtree.node_at((-3,)) - folder_leftover = folder.unidentified_leaves() - except ValueError: - folder = None - folder_leftover = [] - self.log.debug('folder: %s' % u(folder_leftover)) - self.log.debug('basename: %s' % u(basename_leftover)) + # Note:too generic, might solve all the unittests as they all + # contain 'movies' in their path + # + #if containing_folder.is_leaf() and not containing_folder.guess: + # containing_folder.guess = + # Guess({ 'title': clean_string(containing_folder.value) }, + # confidence=0.7) - # specific cases: - # if we find the same group both in the folder name and the filename, - # it's a good candidate for title - if (folder_leftover and basename_leftover and - folder_leftover[0].clean_value == basename_leftover[0].clean_value): + year_group = folder.first_leaf_containing('year') + groups_before = folder.previous_unidentified_leaves(year_group) - found_property(folder_leftover[0], 'title', confidence=0.8) + found_title(groups_before[0], confidence=0.8) return - # specific cases: - # if the basename contains a number first followed by an unidentified - # group, and the folder only contains 1 unidentified one, then we have - # a series - # ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv - try: - series = folder_leftover[0] - filmNumber = basename_leftover[0] - title = basename_leftover[1] + except Exception: + pass - basename_leaves = basename.leaves() + # if we have either format or videoCodec in the folder containing the file + # or one of its parents, then we should probably look for the title in + # there rather than in the basename + try: + props = mtree.previous_leaves_containing(mtree.children[-2], + [ 'videoCodec', 'format', + 'language' ]) + except IndexError: + props = [] - num = int(filmNumber.clean_value) + if props: + group_idx = props[0].node_idx[0] + if all(g.node_idx[0] == group_idx for g in props): + # if they're all in the same group, take leftover info from there + leftover = mtree.node_at((group_idx,)).unidentified_leaves() - self.log.debug('series: %s' % series.clean_value) - self.log.debug('title: %s' % title.clean_value) - if (series.clean_value != title.clean_value and - series.clean_value != filmNumber.clean_value and - basename_leaves.index(filmNumber) == 0 and - basename_leaves.index(title) == 1): - - found_property(title, 'title', confidence=0.6) - found_property(series, 'filmSeries', confidence=0.6) - found_property(filmNumber, 'filmNumber', num, confidence=0.6) - return - except Exception: - pass - - # specific cases: - # - movies/tttttt (yyyy)/tttttt.ccc - try: - if mtree.node_at((-4, 0)).value.lower() == 'movies': - folder = mtree.node_at((-3,)) - - # Note:too generic, might solve all the unittests as they all - # contain 'movies' in their path - # - # if containing_folder.is_leaf() and not containing_folder.guess: - # containing_folder.guess = - # Guess({ 'title': clean_string(containing_folder.value) }, - # confidence=0.7) - - year_group = folder.first_leaf_containing('year') - groups_before = folder.previous_unidentified_leaves(year_group) - - found_property(groups_before[0], 'title', confidence=0.8) + if leftover: + found_title(leftover[0], confidence=0.7) return - except Exception: - pass + # look for title in basename if there are some remaining undidentified + # groups there + if basename_leftover: + title_candidate = basename_leftover[0] - # if we have either format or videoCodec in the folder containing the file - # or one of its parents, then we should probably look for the title in - # there rather than in the basename - try: - props = mtree.previous_leaves_containing(mtree.children[-2], - ['videoCodec', 'format', - 'language']) - except IndexError: - props = [] + # if basename is only one word and the containing folder has at least + # 3 words in it, we should take the title from the folder name + # ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi + # ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here? + if (title_candidate.clean_value.count(' ') == 0 and + folder_leftover and + folder_leftover[0].clean_value.count(' ') >= 2): - if props: - group_idx = props[0].node_idx[0] - if all(g.node_idx[0] == group_idx for g in props): - # if they're all in the same group, take leftover info from there - leftover = mtree.node_at((group_idx,)).unidentified_leaves() - - if leftover: - found_property(leftover[0], 'title', confidence=0.7) - return - - # look for title in basename if there are some remaining unidentified - # groups there - if basename_leftover: - # if basename is only one word and the containing folder has at least - # 3 words in it, we should take the title from the folder name - # ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi - # ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here? - if (basename_leftover[0].clean_value.count(' ') == 0 and - folder_leftover and - folder_leftover[0].clean_value.count(' ') >= 2): - - found_property(folder_leftover[0], 'title', confidence=0.7) - return - - # if there are only many unidentified groups, take the first of which is - # not inside brackets or parentheses. - # ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi - if basename_leftover[0].is_explicit(): - for basename_leftover_elt in basename_leftover: - if not basename_leftover_elt.is_explicit(): - found_property(basename_leftover_elt, 'title', confidence=0.8) - return - - # if all else fails, take the first remaining unidentified group in the - # basename as title - found_property(basename_leftover[0], 'title', confidence=0.6) + found_title(folder_leftover[0], confidence=0.7) return - # if there are no leftover groups in the basename, look in the folder name - if folder_leftover: - found_property(folder_leftover[0], 'title', confidence=0.5) + # if there are only 2 unidentified groups, the first of which is inside + # brackets or parentheses, we take the second one for the title: + # ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi + if len(basename_leftover) == 2 and basename_leftover[0].is_explicit(): + found_title(basename_leftover[1], confidence=0.8) return - # if nothing worked, look if we have a very small group at the beginning - # of the basename - basename = mtree.node_at((-2,)) - basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True) - if basename_leftover: - found_property(basename_leftover[0], 'title', confidence=0.4) - return + # if all else fails, take the first remaining unidentified group in the + # basename as title + found_title(title_candidate, confidence=0.6) + return + + # if there are no leftover groups in the basename, look in the folder name + if folder_leftover: + found_title(folder_leftover[0], confidence=0.5) + return + + # if nothing worked, look if we have a very small group at the beginning + # of the basename + basename = mtree.node_at((-2,)) + basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True) + if basename_leftover: + found_title(basename_leftover[0], confidence=0.4) + return diff --git a/lib/guessit/transfo/guess_properties.py b/lib/guessit/transfo/guess_properties.py index 2a1544a0..6c72dfd5 100644 --- a/lib/guessit/transfo/guess_properties.py +++ b/lib/guessit/transfo/guess_properties.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,213 +18,21 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import find_properties +import logging -from guessit.containers import PropertiesContainer, WeakValidator, LeavesValidator, QualitiesContainer -from guessit.patterns.extension import subtitle_exts, video_exts, info_exts -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder +log = logging.getLogger(__name__) -class GuessProperties(Transformer): - def __init__(self): - Transformer.__init__(self, 35) +def guess_properties(string): + try: + prop, value, pos, end = find_properties(string)[0] + return { prop: value }, (pos, end) + except IndexError: + return None, None - self.container = PropertiesContainer() - self.qualities = QualitiesContainer() - def register_property(propname, props): - """props a dict of {value: [patterns]}""" - for canonical_form, patterns in props.items(): - if isinstance(patterns, tuple): - patterns2, kwargs = patterns - kwargs = dict(kwargs) - kwargs['canonical_form'] = canonical_form - self.container.register_property(propname, *patterns2, **kwargs) - - else: - self.container.register_property(propname, *patterns, canonical_form=canonical_form) - - def register_quality(propname, quality_dict): - """props a dict of {canonical_form: quality}""" - for canonical_form, quality in quality_dict.items(): - self.qualities.register_quality(propname, canonical_form, quality) - - register_property('container', {'mp4': ['MP4']}) - - # http://en.wikipedia.org/wiki/Pirated_movie_release_types - register_property('format', {'VHS': ['VHS'], - 'Cam': ['CAM', 'CAMRip'], - 'Telesync': ['TELESYNC', 'PDVD'], - 'Telesync': (['TS'], {'confidence': 0.2}), - 'Workprint': ['WORKPRINT', 'WP'], - 'Telecine': ['TELECINE', 'TC'], - 'PPV': ['PPV', 'PPV-Rip'], # Pay Per View - 'TV': ['SD-TV', 'SD-TV-Rip', 'Rip-SD-TV', 'TV-Rip', 'Rip-TV'], - 'DVB': ['DVB-Rip', 'DVB', 'PD-TV'], - 'DVD': ['DVD', 'DVD-Rip', 'VIDEO-TS'], - 'HDTV': ['HD-TV', 'TV-RIP-HD', 'HD-TV-RIP'], - 'VOD': ['VOD', 'VOD-Rip'], - 'WEBRip': ['WEB-Rip'], - 'WEB-DL': ['WEB-DL'], - 'HD-DVD': ['HD-(?:DVD)?-Rip', 'HD-DVD'], - 'BluRay': ['Blu-ray', 'B[DR]', 'B[DR]-Rip', 'BD[59]', 'BD25', 'BD50'] - }) - - register_quality('format', {'VHS': -100, - 'Cam': -90, - 'Telesync': -80, - 'Workprint': -70, - 'Telecine': -60, - 'PPV': -50, - 'TV': -30, - 'DVB': -20, - 'DVD': 0, - 'HDTV': 20, - 'VOD': 40, - 'WEBRip': 50, - 'WEB-DL': 60, - 'HD-DVD': 80, - 'BluRay': 100 - }) - - register_property('screenSize', {'360p': ['(?:\d{3,}(?:\\|\/|x|\*))?360(?:i|p?x?)'], - '368p': ['(?:\d{3,}(?:\\|\/|x|\*))?368(?:i|p?x?)'], - '480p': ['(?:\d{3,}(?:\\|\/|x|\*))?480(?:i|p?x?)'], - '480p': (['hr'], {'confidence': 0.2}), - '576p': ['(?:\d{3,}(?:\\|\/|x|\*))?576(?:i|p?x?)'], - '720p': ['(?:\d{3,}(?:\\|\/|x|\*))?720(?:i|p?x?)'], - '900p': ['(?:\d{3,}(?:\\|\/|x|\*))?900(?:i|p?x?)'], - '1080i': ['(?:\d{3,}(?:\\|\/|x|\*))?1080i'], - '1080p': ['(?:\d{3,}(?:\\|\/|x|\*))?1080(?:p?x?)'], - '4K': ['(?:\d{3,}(?:\\|\/|x|\*))?2160(?:i|p?x?)'] - }) - - register_quality('screenSize', {'360p': -300, - '368p': -200, - '480p': -100, - '576p': 0, - '720p': 100, - '900p': 130, - '1080i': 180, - '1080p': 200, - '4K': 400 - }) - - _videoCodecProperty = {'Real': ['Rv\d{2}'], # http://en.wikipedia.org/wiki/RealVideo - 'Mpeg2': ['Mpeg2'], - 'DivX': ['DVDivX', 'DivX'], - 'XviD': ['XviD'], - 'h264': ['[hx]-264(?:-AVC)?', 'MPEG-4(?:-AVC)'], - 'h265': ['[hx]-265(?:-HEVC)?', 'HEVC'] - } - - register_property('videoCodec', _videoCodecProperty) - - register_quality('videoCodec', {'Real': -50, - 'Mpeg2': -30, - 'DivX': -10, - 'XviD': 0, - 'h264': 100, - 'h265': 150 - }) - - # http://blog.mediacoderhq.com/h264-profiles-and-levels/ - # http://fr.wikipedia.org/wiki/H.264 - self.container.register_property('videoProfile', 'BP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'XP', 'EP', canonical_form='XP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'MP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'HP', 'HiP', canonical_form='HP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', '10.?bit', 'Hi10P', canonical_form='10bit', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'Hi422P', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'Hi444PP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - - register_quality('videoProfile', {'BP': -20, - 'XP': -10, - 'MP': 0, - 'HP': 10, - '10bit': 15, - 'Hi422P': 25, - 'Hi444PP': 35 - }) - - # has nothing to do here (or on filenames for that matter), but some - # releases use it and it helps to identify release groups, so we adapt - register_property('videoApi', {'DXVA': ['DXVA']}) - - register_property('audioCodec', {'MP3': ['MP3'], - 'DolbyDigital': ['DD'], - 'AAC': ['AAC'], - 'AC3': ['AC3'], - 'Flac': ['FLAC'], - 'DTS': ['DTS'], - 'TrueHD': ['True-HD'] - }) - - register_quality('audioCodec', {'MP3': 10, - 'DolbyDigital': 30, - 'AAC': 35, - 'AC3': 40, - 'Flac': 45, - 'DTS': 60, - 'TrueHD': 70 - }) - - self.container.register_property('audioProfile', 'HD', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) - self.container.register_property('audioProfile', 'HD-MA', canonical_form='HDMA', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) - self.container.register_property('audioProfile', 'HE', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC'])) - self.container.register_property('audioProfile', 'LC', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC'])) - self.container.register_property('audioProfile', 'HQ', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AC3'])) - - register_quality('audioProfile', {'HD': 20, - 'HDMA': 50, - 'LC': 0, - 'HQ': 0, - 'HE': 20 - }) - - register_property('audioChannels', {'7.1': ['7[\W_]1', '7ch'], - '5.1': ['5[\W_]1', '5ch'], - '2.0': ['2[\W_]0', '2ch', 'stereo'], - '1.0': ['1[\W_]0', '1ch', 'mono'] - }) - - register_quality('audioChannels', {'7.1': 200, - '5.1': 100, - '2.0': 0, - '1.0': -100 - }) - - self.container.register_property('episodeFormat', r'Minisodes?', canonical_form='Minisode') - - register_property('other', {'AudioFix': ['Audio-Fix', 'Audio-Fixed'], - 'SyncFix': ['Sync-Fix', 'Sync-Fixed'], - 'DualAudio': ['Dual-Audio'], - 'WideScreen': ['ws', 'wide-screen'], - }) - - self.container.register_property('other', 'Real', 'Fix', canonical_form="Proper", validator=WeakValidator()) - self.container.register_property('other', 'Proper', 'Repack', 'Rerip', canonical_form="Proper") - - self.container.register_canonical_properties('other', 'R5', 'Screener', '3D', 'HD', 'HQ', 'DDC') - self.container.register_canonical_properties('other', 'Limited', 'Complete', 'Classic', 'Unrated', 'LiNE', 'Bonus', 'Trailer', validator=WeakValidator()) - - for prop in self.container.get_properties('format'): - self.container.register_property('other', prop.pattern + '(-?Scr(?:eener)?)', canonical_form='Screener') - - for exts in (subtitle_exts, info_exts, video_exts): - for container in exts: - self.container.register_property('container', container, confidence=0.3) - - def guess_properties(self, string, node=None, options=None): - found = self.container.find_properties(string, node) - return self.container.as_guess(found, string) - - def supported_properties(self): - return self.container.get_supported_properties() - - def process(self, mtree, options=None): - GuessFinder(self.guess_properties, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) - - def rate_quality(self, guess, *props): - return self.qualities.rate_quality(guess, *props) +def process(mtree): + SingleNodeGuesser(guess_properties, 1.0, log).process(mtree) diff --git a/lib/guessit/transfo/guess_release_group.py b/lib/guessit/transfo/guess_release_group.py index dfbdd17a..b72c7368 100644 --- a/lib/guessit/transfo/guess_release_group.py +++ b/lib/guessit/transfo/guess_release_group.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,132 +18,69 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import prop_multi, compute_canonical_form, _dash, _psep +import re +import logging -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder, found_property, found_guess -from guessit.containers import PropertiesContainer -from guessit.patterns import sep -from guessit.guess import Guess -from guessit.textutils import strip_brackets +log = logging.getLogger(__name__) + +def get_patterns(property_name): + return [ p.replace(_dash, _psep) for patterns in prop_multi[property_name].values() for p in patterns ] + +CODECS = get_patterns('videoCodec') +FORMATS = get_patterns('format') +VAPIS = get_patterns('videoApi') + +# RG names following a codec or format, with a potential space or dash inside the name +GROUP_NAMES = [ r'(?P' + codec + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' + for codec in CODECS ] +GROUP_NAMES += [ r'(?P' + fmt + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' + for fmt in FORMATS ] +GROUP_NAMES += [ r'(?P' + api + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' + for api in VAPIS ] + +GROUP_NAMES2 = [ r'\.(?P' + codec + r')-(?P.*?)(-(.*?))?[ \.]' + for codec in CODECS ] +GROUP_NAMES2 += [ r'\.(?P' + fmt + r')-(?P.*?)(-(.*?))?[ \.]' + for fmt in FORMATS ] +GROUP_NAMES2 += [ r'\.(?P' + vapi + r')-(?P.*?)(-(.*?))?[ \.]' + for vapi in VAPIS ] + +GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ] +GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ] + +def adjust_metadata(md): + return dict((property_name, compute_canonical_form(property_name, value) or value) + for property_name, value in md.items()) -class GuessReleaseGroup(Transformer): - def __init__(self): - Transformer.__init__(self, -190) - self.container = PropertiesContainer(canonical_from_pattern=False) - self._allowed_groupname_pattern = '[\w@#€£$&]' - self._forbidden_groupname_lambda = [lambda elt: elt in ['rip', 'by', 'for', 'par', 'pour', 'bonus'], - lambda elt: self._is_number(elt), - ] - # If the previous property in this list, the match will be considered as safe - # and group name can contain a separator. - self.previous_safe_properties = ['videoCodec', 'format', 'videoApi', 'audioCodec', 'audioProfile', 'videoProfile', 'audioChannels'] +def guess_release_group(string): + # first try to see whether we have both a known codec and a known release group + for rexp in GROUP_NAMES: + match = rexp.search(string) + while match: + metadata = match.groupdict() + # make sure this is an actual release group we caught + release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or + compute_canonical_form('weakReleaseGroup', metadata['releaseGroup'])) + if release_group: + return adjust_metadata(metadata), (match.start(1), match.end(2)) - self.container.sep_replace_char = '-' - self.container.canonical_from_pattern = False - self.container.enhance = True - self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+') - self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+-' + self._allowed_groupname_pattern + '+') + # we didn't find anything conclusive, keep searching + match = rexp.search(string, match.span()[0]+1) - def supported_properties(self): - return self.container.get_supported_properties() + # pick anything as releaseGroup as long as we have a codec in front + # this doesn't include a potential dash ('-') ending the release group + # eg: [...].X264-HiS@SiLUHD-English.[...] + for rexp in GROUP_NAMES2: + match = rexp.search(string) + if match: + return adjust_metadata(match.groupdict()), (match.start(1), match.end(2)) - def _is_number(self, s): - try: - int(s) - return True - except ValueError: - return False + return None, None - def validate_group_name(self, guess): - val = guess['releaseGroup'] - if len(val) >= 2: - if '-' in val: - checked_val = "" - for elt in val.split('-'): - forbidden = False - for forbidden_lambda in self._forbidden_groupname_lambda: - forbidden = forbidden_lambda(elt.lower()) - if forbidden: - break - if not forbidden: - if checked_val: - checked_val += '-' - checked_val += elt - else: - break - val = checked_val - if not val: - return False - guess['releaseGroup'] = val - - forbidden = False - for forbidden_lambda in self._forbidden_groupname_lambda: - forbidden = forbidden_lambda(val.lower()) - if forbidden: - break - if not forbidden: - return True - return False - - def is_leaf_previous(self, leaf, node): - if leaf.span[1] <= node.span[0]: - for idx in range(leaf.span[1], node.span[0]): - if not leaf.root.value[idx] in sep: - return False - return True - return False - - def guess_release_group(self, string, node=None, options=None): - found = self.container.find_properties(string, node, 'releaseGroup') - guess = self.container.as_guess(found, string, self.validate_group_name, sep_replacement='-') - validated_guess = None - if guess: - explicit_group_node = node.group_node() - if explicit_group_node: - for leaf in explicit_group_node.leaves_containing(self.previous_safe_properties): - if self.is_leaf_previous(leaf, node): - if leaf.root.value[leaf.span[1]] == '-': - guess.metadata().confidence = 1 - else: - guess.metadata().confidence = 0.7 - validated_guess = guess - - if not validated_guess: - # If previous group last leaf is identified as a safe property, - # consider the raw value as a releaseGroup - previous_group_node = node.previous_group_node() - if previous_group_node: - for leaf in previous_group_node.leaves_containing(self.previous_safe_properties): - if self.is_leaf_previous(leaf, node): - guess = Guess({'releaseGroup': node.value}, confidence=1, input=node.value, span=(0, len(node.value))) - if self.validate_group_name(guess): - node.guess = guess - validated_guess = guess - - if validated_guess: - # If following group nodes have only one unidentified leaf, it belongs to the release group - next_group_node = node - - while True: - next_group_node = next_group_node.next_group_node() - if next_group_node: - leaves = next_group_node.leaves() - if len(leaves) == 1 and not leaves[0].guess: - validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value - leaves[0].guess = validated_guess - else: - break - else: - break - - if validated_guess: - # Strip brackets - validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup']) - - return validated_guess - - def process(self, mtree, options=None): - GuessFinder(self.guess_release_group, None, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_release_group, 0.8, log).process(mtree) diff --git a/lib/guessit/transfo/guess_video_rexps.py b/lib/guessit/transfo/guess_video_rexps.py index e8f670fd..1b511f15 100644 --- a/lib/guessit/transfo/guess_video_rexps.py +++ b/lib/guessit/transfo/guess_video_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,41 +18,33 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, \ - unicode_literals +from __future__ import unicode_literals +from guessit import Guess +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import video_rexps, sep +import re +import logging -from guessit.patterns import _psep -from guessit.containers import PropertiesContainer -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns.numeral import parse_numeral +log = logging.getLogger(__name__) -class GuessVideoRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 25) +def guess_video_rexps(string): + string = '-' + string + '-' + for rexp, confidence, span_adjust in video_rexps: + match = re.search(sep + rexp + sep, string, re.IGNORECASE) + if match: + metadata = match.groupdict() + # is this the better place to put it? (maybe, as it is at least + # the soonest that we can catch it) + if metadata.get('cdNumberTotal', -1) is None: + del metadata['cdNumberTotal'] + span = (match.start() + span_adjust[0], + match.end() + span_adjust[1] - 2) + return (Guess(metadata, confidence=confidence, raw=string[span[0]:span[1]]), + span) - self.container = PropertiesContainer(canonical_from_pattern=False) + return None, None - self.container.register_property(None, 'cd' + _psep + '(?P[0-9])(?:' + _psep + 'of' + _psep + '(?P[0-9]))?', confidence=1.0, enhance=False, global_span=True, formatter=parse_numeral) - self.container.register_property('cdNumberTotal', '([1-9])' + _psep + 'cds?', confidence=0.9, enhance=False, formatter=parse_numeral) - self.container.register_property('bonusNumber', 'x([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) - - self.container.register_property('filmNumber', 'f([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) - - self.container.register_property('edition', 'collector', 'collector-edition', 'edition-collector', canonical_form='Collector Edition') - self.container.register_property('edition', 'special-edition', 'edition-special', canonical_form='Special Edition') - self.container.register_property('edition', 'criterion', 'criterion-edition', 'edition-criterion', canonical_form='Criterion Edition') - self.container.register_property('edition', 'deluxe', 'cdeluxe-edition', 'edition-deluxe', canonical_form='Deluxe Edition') - self.container.register_property('edition', 'director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', canonical_form='Director\'s cut') - - def supported_properties(self): - return self.container.get_supported_properties() - - def guess_video_rexps(self, string, node=None, options=None): - found = self.container.find_properties(string, node) - return self.container.as_guess(found, string) - - def process(self, mtree, options=None): - GuessFinder(self.guess_video_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_video_rexps, None, log).process(mtree) diff --git a/lib/guessit/transfo/guess_weak_episodes_rexps.py b/lib/guessit/transfo/guess_weak_episodes_rexps.py index 4519a678..18306b43 100644 --- a/lib/guessit/transfo/guess_weak_episodes_rexps.py +++ b/lib/guessit/transfo/guess_weak_episodes_rexps.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,52 +18,45 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import unicode_literals +from guessit import Guess +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import weak_episode_rexps +import re +import logging -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns import sep -from guessit.containers import PropertiesContainer -from guessit.patterns.numeral import numeral, parse_numeral -from guessit.date import valid_year +log = logging.getLogger(__name__) -class GuessWeakEpisodesRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 15) +def guess_weak_episodes_rexps(string, node): + if 'episodeNumber' in node.root.info: + return None, None - self.properties = PropertiesContainer(enhance=False, canonical_from_pattern=False) + for rexp, span_adjust in weak_episode_rexps: + match = re.search(rexp, string, re.IGNORECASE) + if match: + metadata = match.groupdict() + span = (match.start() + span_adjust[0], + match.end() + span_adjust[1]) - def _formater(episodeNumber): - epnum = parse_numeral(episodeNumber) - if not valid_year(epnum): - if epnum > 100: - season, epnum = epnum // 100, epnum % 100 - # episodes which have a season > 50 are most likely errors - # (Simpson is at 25!) - if season > 50: - return None - return {'season': season, 'episodeNumber': epnum} - else: - return epnum + epnum = int(metadata['episodeNumber']) + if epnum > 100: + season, epnum = epnum // 100, epnum % 100 + # episodes which have a season > 25 are most likely errors + # (Simpsons is at 23!) + if season > 25: + continue + return Guess({ 'season': season, + 'episodeNumber': epnum }, + confidence=0.6, raw=string[span[0]:span[1]]), span + else: + return Guess(metadata, confidence=0.3, raw=string[span[0]:span[1]]), span - self.properties.register_property(['episodeNumber', 'season'], '[0-9]{2,4}', confidence=0.6, formatter=_formater) - self.properties.register_property('episodeNumber', '(?:episode)' + sep + '(' + numeral + ')[^0-9]', confidence=0.3) + return None, None - def supported_properties(self): - return self.properties.get_supported_properties() - def guess_weak_episodes_rexps(self, string, node=None, options=None): - if node and 'episodeNumber' in node.root.info: - return None +guess_weak_episodes_rexps.use_node = True - properties = self.properties.find_properties(string, node) - guess = self.properties.as_guess(properties, string) - return guess - - def should_process(self, mtree, options=None): - return mtree.guess.get('type', '').startswith('episode') - - def process(self, mtree, options=None): - GuessFinder(self.guess_weak_episodes_rexps, 0.6, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_weak_episodes_rexps, 0.6, log).process(mtree) diff --git a/lib/guessit/transfo/guess_website.py b/lib/guessit/transfo/guess_website.py index 2718521d..acfd8e11 100644 --- a/lib/guessit/transfo/guess_website.py +++ b/lib/guessit/transfo/guess_website.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,49 +18,22 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, \ - unicode_literals +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import websites +import logging -from guessit.patterns import build_or_pattern -from guessit.containers import PropertiesContainer -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from pkg_resources import resource_stream # @UnresolvedImport +log = logging.getLogger(__name__) -class GuessWebsite(Transformer): - def __init__(self): - Transformer.__init__(self, 45) +def guess_website(string): + low = string.lower() + for site in websites: + pos = low.find(site.lower()) + if pos != -1: + return {'website': site}, (pos, pos + len(site)) + return None, None - self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - tlds = [] - - f = resource_stream('guessit', 'tlds-alpha-by-domain.txt') - f.readline() - next(f) - for tld in f: - tld = tld.strip() - if b'--' in tld: - continue - tlds.append(tld.decode("utf-8")) - f.close() - - tlds_pattern = build_or_pattern(tlds) # All registered domain extension - safe_tlds_pattern = build_or_pattern(['com', 'org', 'net']) # For sure a website extension - safe_subdomains_pattern = build_or_pattern(['www']) # For sure a website subdomain - safe_prefix_tlds_pattern = build_or_pattern(['co', 'com', 'org', 'net']) # Those words before a tlds are sure - - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)+' + r'(?:[a-z-]+\.)+' + r'(?:' + tlds_pattern + r')+') - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_tlds_pattern + r')+') - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_prefix_tlds_pattern + r'\.)+' + r'(?:' + tlds_pattern + r')+') - - def supported_properties(self): - return self.container.get_supported_properties() - - def guess_website(self, string, node=None, options=None): - found = self.container.find_properties(string, node, 'website') - return self.container.as_guess(found, string) - - def process(self, mtree, options=None): - GuessFinder(self.guess_website, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree): + SingleNodeGuesser(guess_website, 1.0, log).process(mtree) diff --git a/lib/guessit/transfo/guess_year.py b/lib/guessit/transfo/guess_year.py index 58ce07f6..c193af7a 100644 --- a/lib/guessit/transfo/guess_year.py +++ b/lib/guessit/transfo/guess_year.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,32 +18,33 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser from guessit.date import search_year +import logging + +log = logging.getLogger(__name__) -class GuessYear(Transformer): - def __init__(self): - Transformer.__init__(self, -160) +def guess_year(string): + year, span = search_year(string) + if year: + return { 'year': year }, span + else: + return None, None - def supported_properties(self): - return ['year'] +def guess_year_skip_first(string): + year, span = search_year(string) + if year: + year2, span2 = guess_year(string[span[1]:]) + if year2: + return year2, (span2[0]+span[1], span2[1]+span[1]) - def guess_year(self, string, node=None, options=None): - year, span = search_year(string) - if year: - return {'year': year}, span - else: - return None, None + return None, None - def second_pass_options(self, mtree, options=None): - year_nodes = mtree.leaves_containing('year') - if len(year_nodes) > 1: - return {'skip_nodes': year_nodes[:len(year_nodes) - 1]} - return None - def process(self, mtree, options=None): - GuessFinder(self.guess_year, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) +def process(mtree, skip_first_year=False): + if skip_first_year: + SingleNodeGuesser(guess_year_skip_first, 1.0, log).process(mtree) + else: + SingleNodeGuesser(guess_year, 1.0, log).process(mtree) diff --git a/lib/guessit/transfo/post_process.py b/lib/guessit/transfo/post_process.py new file mode 100644 index 00000000..5920e3a4 --- /dev/null +++ b/lib/guessit/transfo/post_process.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2012 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import unicode_literals +from guessit.patterns import subtitle_exts +from guessit.textutils import reorder_title, find_words +import logging + +log = logging.getLogger(__name__) + + +def process(mtree): + # 1- try to promote language to subtitle language where it makes sense + for node in mtree.nodes(): + if 'language' not in node.guess: + continue + + def promote_subtitle(): + # pylint: disable=W0631 + node.guess.set('subtitleLanguage', node.guess['language'], + confidence=node.guess.confidence('language')) + del node.guess['language'] + + # - if we matched a language in a file with a sub extension and that + # the group is the last group of the filename, it is probably the + # language of the subtitle + # (eg: 'xxx.english.srt') + if (mtree.node_at((-1,)).value.lower() in subtitle_exts and + node == mtree.leaves()[-2]): + promote_subtitle() + + # - if we find the word 'sub' before the language, and in the same explicit + # group, then upgrade the language + explicit_group = mtree.node_at(node.node_idx[:2]) + group_str = explicit_group.value.lower() + + if ('sub' in find_words(group_str) and + 0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])): + promote_subtitle() + + # - if a language is in an explicit group just preceded by "st", + # it is a subtitle language (eg: '...st[fr-eng]...') + try: + idx = node.node_idx + previous = mtree.node_at((idx[0], idx[1] - 1)).leaves()[-1] + if previous.value.lower()[-2:] == 'st': + promote_subtitle() + except IndexError: + pass + + # 2- ", the" at the end of a series title should be prepended to it + for node in mtree.nodes(): + if 'series' not in node.guess: + continue + + node.guess['series'] = reorder_title(node.guess['series']) diff --git a/lib/guessit/transfo/split_explicit_groups.py b/lib/guessit/transfo/split_explicit_groups.py index ff46ebc6..7ae5787d 100644 --- a/lib/guessit/transfo/split_explicit_groups.py +++ b/lib/guessit/transfo/split_explicit_groups.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,32 +18,27 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer +from __future__ import unicode_literals from guessit.textutils import find_first_level_groups from guessit.patterns import group_delimiters -from functools import reduce +import functools +import logging + +log = logging.getLogger(__name__) -class SplitExplicitGroups(Transformer): - def __init__(self): - Transformer.__init__(self, 245) +def process(mtree): + """return the string split into explicit groups, that is, those either + between parenthese, square brackets or curly braces, and those separated + by a dash.""" + for c in mtree.children: + groups = find_first_level_groups(c.value, group_delimiters[0]) + for delimiters in group_delimiters: + flatten = lambda l, x: l + find_first_level_groups(x, delimiters) + groups = functools.reduce(flatten, groups, []) - def process(self, mtree, options=None): - """split each of those into explicit groups (separated by parentheses or square brackets) + # do not do this at this moment, it is not strong enough and can break other + # patterns, such as dates, etc... + #groups = functools.reduce(lambda l, x: l + x.split('-'), groups, []) - :return: return the string split into explicit groups, that is, those either - between parenthese, square brackets or curly braces, and those separated - by a dash.""" - for c in mtree.children: - groups = find_first_level_groups(c.value, group_delimiters[0]) - for delimiters in group_delimiters: - flatten = lambda l, x: l + find_first_level_groups(x, delimiters) - groups = reduce(flatten, groups, []) - - # do not do this at this moment, it is not strong enough and can break other - # patterns, such as dates, etc... - # groups = functools.reduce(lambda l, x: l + x.split('-'), groups, []) - - c.split_on_components(groups) + c.split_on_components(groups) diff --git a/lib/guessit/transfo/split_on_dash.py b/lib/guessit/transfo/split_on_dash.py index 85f4016d..031baff6 100644 --- a/lib/guessit/transfo/split_on_dash.py +++ b/lib/guessit/transfo/split_on_dash.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,30 +18,25 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer +from __future__ import unicode_literals from guessit.patterns import sep import re +import logging + +log = logging.getLogger(__name__) -class SplitOnDash(Transformer): - def __init__(self): - Transformer.__init__(self, 190) +def process(mtree): + for node in mtree.unidentified_leaves(): + indices = [] - def process(self, mtree, options=None): - """split into '-' separated subgroups (with required separator chars - around the dash) - """ - for node in mtree.unidentified_leaves(): - indices = [] + didx = 0 + pattern = re.compile(sep + '-' + sep) + match = pattern.search(node.value) + while match: + span = match.span() + indices.extend([ span[0], span[1] ]) + match = pattern.search(node.value, span[1]) - pattern = re.compile(sep + '-' + sep) - match = pattern.search(node.value) - while match: - span = match.span() - indices.extend([span[0], span[1]]) - match = pattern.search(node.value, span[1]) - - if indices: - node.partition(indices) + if indices: + node.partition(indices) diff --git a/lib/guessit/transfo/split_path_components.py b/lib/guessit/transfo/split_path_components.py index c630a30c..35fab405 100644 --- a/lib/guessit/transfo/split_path_components.py +++ b/lib/guessit/transfo/split_path_components.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack +# Copyright (c) 2012 Nicolas Wack # # GuessIt is free software; you can redistribute it and/or modify it under # the terms of the Lesser GNU General Public License as published by @@ -18,28 +18,19 @@ # along with this program. If not, see . # -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer +from __future__ import unicode_literals from guessit import fileutils -from os.path import splitext +import os.path +import logging + +log = logging.getLogger(__name__) -class SplitPathComponents(Transformer): - def __init__(self): - Transformer.__init__(self, 255) +def process(mtree): + """Returns the filename split into [ dir*, basename, ext ].""" + components = fileutils.split_path(mtree.value) + basename = components.pop(-1) + components += list(os.path.splitext(basename)) + components[-1] = components[-1][1:] # remove the '.' from the extension - def process(self, mtree, options=None): - """first split our path into dirs + basename + ext - - :return: the filename split into [ dir*, basename, ext ] - """ - if not options.get('name_only'): - components = fileutils.split_path(mtree.value) - basename = components.pop(-1) - components += list(splitext(basename)) - components[-1] = components[-1][1:] # remove the '.' from the extension - - mtree.split_on_components(components) - else: - mtree.split_on_components([mtree.value, '']) + mtree.split_on_components(components)