update guessit and subliminal libs. Fixes #678

2025-08-20 21:33:13 -07:00 · 2015-01-19 14:22:30 +10:30 · 2015-01-19 14:22:30 +10:30 · f716323b76
commit f716323b76
parent ff50e5144c
72 changed files with 9350 additions and 3032 deletions
--- a/libs/guessit/matcher.py
+++ b/libs/guessit/matcher.py
@ -2,7 +2,8 @@
 # -*- coding: utf-8 -*-
 #
 # GuessIt - A library for guessing information from filenames
-# Copyright (c) 2012 Nicolas Wack <wackou@gmail.com>
+# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
+# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
 #
 # GuessIt is free software; you can redistribute it and/or modify it under
 # the terms of the Lesser GNU General Public License as published by
@ -18,163 +19,288 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #

-from __future__ import unicode_literals
-from guessit import PY3, u, base_text_type
-from guessit.matchtree import MatchTree
-from guessit.textutils import normalize_unicode, clean_string
+from __future__ import absolute_import, division, print_function, \
+    unicode_literals
+
 import logging

+from guessit import PY3, u
+from guessit.transfo import TransformerException
+from guessit.matchtree import MatchTree
+from guessit.textutils import normalize_unicode, clean_default
+from guessit.guess import Guess
+import inspect
+
 log = logging.getLogger(__name__)


 class IterativeMatcher(object):
-    def __init__(self, filename, filetype='autodetect', opts=None, transfo_opts=None):
-        """An iterative matcher tries to match different patterns that appear
-        in the filename.
+    """An iterative matcher tries to match different patterns that appear
+    in the filename.

-        The 'filetype' argument indicates which type of file you want to match.
-        If it is 'autodetect', the matcher will try to see whether it can guess
-        that the file corresponds to an episode, or otherwise will assume it is
-        a movie.
+    The ``type`` option (given in ``options`` or as a keyword argument)
+    indicates which type of file you want to match.
+    If it is undefined, the matcher will try to see whether it can guess
+    that the file corresponds to an episode, or otherwise will assume it is
+    a movie.

-        The recognized 'filetype' values are:
-        [ autodetect, subtitle, info, movie, moviesubtitle, movieinfo, episode,
-        episodesubtitle, episodeinfo ]
+    The recognized ``type`` values are:
+    ``['subtitle', 'info', 'video', 'movie', 'moviesubtitle', 'movieinfo',
+    'episode', 'episodesubtitle', 'episodeinfo']``

+    ``options`` is a dict of option values to be passed to the transformations used
+    by the matcher.

-        The IterativeMatcher works mainly in 2 steps:
+    The IterativeMatcher works mainly in 2 steps:

-        First, it splits the filename into a match_tree, which is a tree of groups
-        which have a semantic meaning, such as episode number, movie title,
-        etc...
+    First, it splits the filename into a match_tree, which is a tree of groups
+    which have a semantic meaning, such as episode number, movie title,
+    etc...

-        The match_tree created looks like the following:
+    The match_tree created looks like the following::

-        0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
-        0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
-        0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
-        __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
-        xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
-        [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv
+      0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
+      0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
+      0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
+      __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
+      xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
+      [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

-        The first 3 lines indicates the group index in which a char in the
-        filename is located. So for instance, x264 is the group (0, 4, 1), and
-        it corresponds to a video codec, denoted by the letter'v' in the 4th line.
-        (for more info, see guess.matchtree.to_string)
+    The first 3 lines indicate the group index in which a char in the
+    filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and
+    it corresponds to a video codec, denoted by the letter ``v`` in the 4th line.
+    (for more info, see guess.matchtree.to_string)

-        Second, it tries to merge all this information into a single object
-        containing all the found properties, and does some (basic) conflict
-        resolution when they arise.
-
-
-        When you create the Matcher, you can pass it:
-         - a list 'opts' of option names, that act as global flags
-         - a dict 'transfo_opts' of { transfo_name: (transfo_args, transfo_kwargs) }
-           with which to call the transfo.process() function.
-        """
-
-        valid_filetypes = ('autodetect', 'subtitle', 'info', 'video',
-                           'movie', 'moviesubtitle', 'movieinfo',
-                           'episode', 'episodesubtitle', 'episodeinfo')
-        if filetype not in valid_filetypes:
-            raise ValueError("filetype needs to be one of %s" % valid_filetypes)
+    Second, it tries to merge all this information into a single object
+    containing all the found properties, and does some (basic) conflict
+    resolution when they arise.
+    """
+    def __init__(self, filename, options=None, **kwargs):
+        """Build the match tree for ``filename`` and run the transformers on it.
+
+        :param filename: name to analyse; on Python 2 a byte string is decoded
+            as UTF-8 (with a warning).
+        :param options: dict of options; it is copied, not modified. Keys read
+            here are ``type``, ``clean_function`` (a callable or a dotted
+            ``module.function`` path; a bare or ``.``-prefixed function name is
+            looked up in ``guessit.textutils``) and ``disabled_transformers``.
+        :param kwargs: extra options, used only for keys that are missing or
+            falsy in ``options``.
+        :raises ValueError: if ``type`` is set to an unrecognized value.
+        """
+        options = dict(options or {})
+        for k, v in kwargs.items():
+            if k not in options or not options[k]:
+                options[k] = v  # options dict has priority over keyword arguments
+        self._validate_options(options)
        if not PY3 and not isinstance(filename, unicode):
            log.warning('Given filename to matcher is not unicode...')
            filename = filename.decode('utf-8')

        filename = normalize_unicode(filename)
+        if options and options.get('clean_function'):
+            clean_function = options.get('clean_function')
+            if not hasattr(clean_function, '__call__'):
+                # The function was given as a dotted path. Split on the LAST
+                # dot only, so that both 'pkg.mod.func' and a bare 'func'
+                # unpack correctly, and import the leaf module itself
+                # (__import__('pkg.mod') would return the top-level 'pkg').
+                import importlib
+                module, sep, function = clean_function.rpartition('.')
+                if not module:
+                    module = 'guessit.textutils'
+                try:
+                    clean_function = getattr(importlib.import_module(module), function)
+                except (ImportError, AttributeError, TypeError, ValueError):
+                    # unresolvable path: fall through to the default below
+                    clean_function = None
+                if not clean_function:
+                    log.error('Can\'t find clean function %s. Default will be used.' % options.get('clean_function'))
+                    clean_function = clean_default
+        else:
+            clean_function = clean_default

-        if opts is None:
-            opts = []
-        if not isinstance(opts, list):
-            raise ValueError('opts must be a list of option names! Received: type=%s val=%s',
-                             type(opts), opts)
-
-        if transfo_opts is None:
-            transfo_opts = {}
-        if not isinstance(transfo_opts, dict):
-            raise ValueError('transfo_opts must be a dict of { transfo_name: (args, kwargs) }. '+
-                             'Received: type=%s val=%s', type(transfo_opts), transfo_opts)
-
-        self.match_tree = MatchTree(filename)
+        self.match_tree = MatchTree(filename, clean_function=clean_function)
+        self.options = options
+        self._transfo_calls = []

        # sanity check: make sure we don't process a (mostly) empty string
-        if clean_string(filename) == '':
+        if clean_function(filename).strip() == '':
            return

-        mtree = self.match_tree
-        mtree.guess.set('type', filetype, confidence=1.0)
+        from guessit.plugins import transformers

-        def apply_transfo(transfo_name, *args, **kwargs):
-            transfo = __import__('guessit.transfo.' + transfo_name,
-                                 globals=globals(), locals=locals(),
-                                 fromlist=['process'], level=0)
-            default_args, default_kwargs = transfo_opts.get(transfo_name, ((), {}))
-            all_args = args or default_args
-            all_kwargs = dict(default_kwargs)
-            all_kwargs.update(kwargs) # keep all kwargs merged together
-            transfo.process(mtree, *all_args, **all_kwargs)
+        try:
+            mtree = self.match_tree
+            if 'type' in self.options:
+                mtree.guess.set('type', self.options['type'], confidence=0.0)

-        # 1- first split our path into dirs + basename + ext
-        apply_transfo('split_path_components')
+            # Process
+            for transformer in transformers.all_transformers():
+                disabled = options.get('disabled_transformers')
+                if not disabled or transformer.name not in disabled:
+                    self._process(transformer, False)

-        # 2- guess the file type now (will be useful later)
-        apply_transfo('guess_filetype', filetype)
-        if mtree.guess['type'] == 'unknown':
-            return
+            # Post-process
+            for transformer in transformers.all_transformers():
+                disabled = options.get('disabled_transformers')
+                if not disabled or transformer.name not in disabled:
+                    self._process(transformer, True)

-        # 3- split each of those into explicit groups (separated by parentheses
-        #    or square brackets)
-        apply_transfo('split_explicit_groups')
+            log.debug('Found match tree:\n%s' % u(mtree))
+        except TransformerException as e:
+            # a failing transformer aborts the remaining ones; the error is
+            # only logged at debug level and the partial tree is kept
+            log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e))

-        # 4- try to match information for specific patterns
-        # NOTE: order needs to comply to the following:
-        #       - website before language (eg: tvu.org.ru vs russian)
-        #       - language before episodes_rexps
-        #       - properties before language (eg: he-aac vs hebrew)
-        #       - release_group before properties (eg: XviD-?? vs xvid)
-        if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'):
-            strategy = [ 'guess_date', 'guess_website', 'guess_release_group',
-                         'guess_properties', 'guess_language',
-                         'guess_video_rexps',
-                         'guess_episodes_rexps', 'guess_weak_episodes_rexps' ]
-        else:
-            strategy = [ 'guess_date', 'guess_website', 'guess_release_group',
-                         'guess_properties', 'guess_language',
-                         'guess_video_rexps' ]
+    def _process(self, transformer, post=False):
+        """Run ``transformer`` on the match tree unless it declines via ``should_process``.
+
+        With ``post`` true, ``post_process`` is called; otherwise ``process``
+        is called and the transformer is recorded in ``_transfo_calls``.
+        """

-        if 'nolanguage' in opts:
-            strategy.remove('guess_language')
+        if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options):
+            if post:
+                transformer.post_process(self.match_tree, self.options)
+            else:
+                transformer.process(self.match_tree, self.options)
+                self._transfo_calls.append(transformer)

+    @property
+    def second_pass_options(self):
+        """Merged dict of the ``second_pass_options`` offered by the transformers that ran.
+
+        Later transformers override keys set by earlier ones.
+        """
+        second_pass_options = {}
+        for transformer in self._transfo_calls:
+            if hasattr(transformer, 'second_pass_options'):
+                transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options)
+                if transformer_second_pass_options:
+                    second_pass_options.update(transformer_second_pass_options)

-        for name in strategy:
-            apply_transfo(name)
+        return second_pass_options

-        # more guessers for both movies and episodes
-        apply_transfo('guess_bonus_features')
-        apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts))
+    def _validate_options(self, options):
+        """Raise ``ValueError`` if ``options['type']`` is set to an unrecognized file type."""
+        valid_filetypes = ('subtitle', 'info', 'video',
+                           'movie', 'moviesubtitle', 'movieinfo',
+                           'episode', 'episodesubtitle', 'episodeinfo')

-        if 'nocountry' not in opts:
-            apply_transfo('guess_country')
-
-        apply_transfo('guess_idnumber')
-
-
-        # split into '-' separated subgroups (with required separator chars
-        # around the dash)
-        apply_transfo('split_on_dash')
-
-        # 5- try to identify the remaining unknown groups by looking at their
-        #    position relative to other known elements
-        if mtree.guess['type'] in ('episode', 'episodesubtitle', 'episodeinfo'):
-            apply_transfo('guess_episode_info_from_position')
-        else:
-            apply_transfo('guess_movie_title_from_position')
-
-        # 6- perform some post-processing steps
-        apply_transfo('post_process')
-
-        log.debug('Found match tree:\n%s' % u(mtree))
+        type_ = options.get('type')
+        if type_ and type_ not in valid_filetypes:
+            raise ValueError("filetype needs to be one of %s" % (valid_filetypes,))

    def matched(self):
+        """Return the result of ``matched()`` on the underlying match tree."""
        return self.match_tree.matched()
+
+
+def build_guess(node, name, value=None, confidence=1.0):
+    """Create a ``Guess`` holding the single property ``name`` found on ``node``.
+
+    :param node: match tree node; ``value``, ``clean_value``, ``span`` and
+        ``offset`` are read from it.
+    :param name: property name stored in the guess.
+    :param value: explicit property value; when ``None`` the node's
+        ``clean_value`` is used and the guess span is computed from the node.
+    :param confidence: confidence passed to the ``Guess``.
+    :return: the new ``Guess``. Its metadata ``input`` is always set; its
+        metadata ``span`` is set only when ``value`` is ``None``.
+    """
+    guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence)
+    guess.metadata().input = node.value if value is None else value
+    if value is None:
+        raw_value = node.value
+        clean_value = node.clean_value
+        left_offset = 0
+        right_offset = 0
+
+        # Trim the span to where the cleaned text sits inside the raw text:
+        # skip raw chars before the first occurrence of the cleaned value's
+        # first char, and after the last occurrence of its last char.
+        # An empty cleaned value has no char to align on, so the span is left
+        # untrimmed instead of raising IndexError.
+        if clean_value:
+            first_idx = raw_value.find(clean_value[0])
+            left_offset = len(raw_value) if first_idx < 0 else first_idx
+
+            last_idx = raw_value.rfind(clean_value[-1])
+            right_offset = len(raw_value) if last_idx < 0 else len(raw_value) - 1 - last_idx
+
+        guess.metadata().span = (node.span[0] - node.offset + left_offset, node.span[1] - node.offset - right_offset)
+    return guess
+
+
+def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None):
+    """Build a guess for property ``name`` on ``node`` and register it with ``found_guess``.
+
+    :param node: match tree node the property was found on.
+    :param name: property name.
+    :param value: explicit value, or ``None`` to use the node's clean value.
+    :param confidence: confidence of the guess.
+    :param update_guess: forwarded to ``found_guess``.
+    :param logger: logger for the "property found" message. When omitted, the
+        ``log`` attribute of the caller's ``self`` is used if there is one;
+        otherwise ``None`` is forwarded and the module logger ends up used.
+    :return: whatever ``found_guess`` returns.
+    """
+    # automatically retrieve the log object from the caller frame
+    if not logger:
+        # currentframe().f_back is the same frame as stack()[1][0] without
+        # building records for the whole stack; it may be None on
+        # interpreters that have no frame support.
+        frame = inspect.currentframe()
+        caller_frame = None
+        try:
+            caller_frame = frame.f_back if frame is not None else None
+            caller_self = caller_frame.f_locals.get('self') if caller_frame is not None else None
+            logger = getattr(caller_self, 'log', None)
+        finally:
+            # drop frame references so they do not keep reference cycles alive
+            del frame, caller_frame
+    guess = build_guess(node, name, value, confidence)
+    return found_guess(node, guess, update_guess=update_guess, logger=logger)
+
+
+def found_guess(node, guess, update_guess=True, logger=None):
+    """Attach ``guess`` to ``node``, log its properties and return ``node.guess``.
+
+    A node without a guess simply receives ``guess``. A node that already has
+    one either merges ``guess`` into it through ``update_highest_confidence``
+    (``update_guess`` true) or gets a new child, spanning the guess's metadata
+    span, that carries ``guess`` (``update_guess`` false). In that last case
+    the returned value is still the node's own guess, not the child's.
+    """
+    if not node.guess:
+        node.guess = guess
+    elif update_guess:
+        node.guess.update_highest_confidence(guess)
+    else:
+        new_child = node.add_child(guess.metadata().span)
+        new_child.guess = guess
+    log_found_guess(guess, logger)
+    return node.guess
+
+
+def log_found_guess(guess, logger=None):
+    """Emit one debug line per property in ``guess``.
+
+    Messages go to ``logger`` when it is truthy, otherwise to the module
+    logger ``log``.
+    """
+    for prop_name, prop_value in guess.items():
+        target = logger or log
+        target.debug('Property found: %s=%s (%s) (confidence=%.2f)' %
+                     (prop_name, prop_value, guess.raw(prop_name), guess.confidence(prop_name)))
+
+
+def _get_split_spans(node, span):
+    """Return ``node.get_partition_spans(span)`` without the entry matching ``span``.
+
+    An entry matches when it starts at ``span[0]`` and ends at ``span[1]`` or
+    ``span[1] + 1``. Only the first match is dropped.
+    """
+    spans = node.get_partition_spans(span)
+    accepted_ends = [span[1], span[1] + 1]
+    for position, candidate in enumerate(spans):
+        if candidate[0] != span[0] or candidate[1] not in accepted_ends:
+            continue
+        del spans[position]
+        break
+    return spans
+
+
+class GuessFinder(object):
+    """Apply a guessing function to match tree nodes and record the guesses it finds.
+
+    ``guess_func`` is called as ``guess_func(string, node)``, or as
+    ``guess_func(string, node, options)`` when ``options`` is truthy. It is
+    expected to return a falsy value, a ``Guess``, or a ``(result, span)`` pair.
+    """
+    def __init__(self, guess_func, confidence=None, logger=None, options=None):
+        """Store the guessing function and its settings.
+
+        :param guess_func: callable doing the actual matching.
+        :param confidence: confidence given to guesses built from a
+            ``(result, span)`` return value.
+        :param logger: logger handed to ``found_guess``; defaults to the module logger.
+        :param options: options forwarded to ``guess_func``; its ``skip_nodes``
+            entry, if present, is read by ``process_node``.
+        """
+        self.guess_func = guess_func
+        self.confidence = confidence
+        self.logger = logger or log
+        self.options = options
+
+    def process_nodes(self, nodes):
+        """Call ``process_node`` on every node of ``nodes`` with default arguments."""
+        for node in nodes:
+            self.process_node(node)
+
+    def process_node(self, node, iterative=True, partial_span=None):
+        """Run the guessing function on ``node`` and attach what it finds.
+
+        :param node: match tree node to examine.
+        :param iterative: when true, the node is partitioned around the match
+            and the remaining children are processed recursively; when false,
+            the guess is attached to ``node`` itself.
+        :param partial_span: optional ``(start, end)`` pair restricting the
+            search to that slice of ``node.value``.
+        """
+        if partial_span:
+            value = node.value[partial_span[0]:partial_span[1]]
+        else:
+            value = node.value
+        string = ' %s ' % value  # add sentinels
+
+        if not self.options:
+            matcher_result = self.guess_func(string, node)
+        else:
+            matcher_result = self.guess_func(string, node, self.options)
+
+        if matcher_result:
+            # guess_func returned either a Guess (span taken from its
+            # metadata) or a (result, span) pair
+            if not isinstance(matcher_result, Guess):
+                result, span = matcher_result
+            else:
+                result, span = matcher_result, matcher_result.metadata().span
+
+            if result:
+                # readjust span to compensate for sentinels
+                span = (span[0] - 1, span[1] - 1)
+
+                # readjust span to compensate for partial_span
+                if partial_span:
+                    span = (span[0] + partial_span[0], span[1] + partial_span[0])
+
+                partition_spans = None
+                if self.options and 'skip_nodes' in self.options:
+                    skip_nodes = self.options.get('skip_nodes')
+                    for skip_node in skip_nodes:
+                        # NOTE(review): ``and`` binds tighter than ``or``, so this
+                        # reads as (ancestor-check and span-equal) or
+                        # offset-adjusted-span-equal, i.e. the ancestor check
+                        # does not guard the last alternative. Confirm that
+                        # this is intended.
+                        if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
+                            skip_node.span == span or\
+                                skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset):
+                            if partition_spans is None:
+                                partition_spans = _get_split_spans(node, skip_node.span)
+                            else:
+                                # a previous skip node already split this node:
+                                # split each existing partition again and append
+                                # the resulting spans to the list
+                                new_partition_spans = []
+                                for partition_span in partition_spans:
+                                    tmp_node = MatchTree(value, span=partition_span, parent=node)
+                                    tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span)
+                                    new_partition_spans.extend(tmp_partitions_spans)
+                                partition_spans.extend(new_partition_spans)
+
+                if not partition_spans:
+                    # no skip-node partitioning applies: record the guess
+
+                    if isinstance(result, Guess):
+                        guess = result
+                    else:
+                        guess = Guess(result, confidence=self.confidence, input=string, span=span)
+
+                    if not iterative:
+                        found_guess(node, guess, logger=self.logger)
+                    else:
+                        # child spans are compared in absolute coordinates
+                        absolute_span = (span[0] + node.offset, span[1] + node.offset)
+                        node.partition(span)
+                        if node.is_leaf():
+                            found_guess(node, guess, logger=self.logger)
+                        else:
+                            # attach the guess to the child covering exactly the
+                            # matched span, then keep searching the other children
+                            found_child = None
+                            for child in node.children:
+                                if child.span == absolute_span:
+                                    found_guess(child, guess, logger=self.logger)
+                                    found_child = child
+                                    break
+                            for child in node.children:
+                                if child is not found_child:
+                                    self.process_node(child)
+                else:
+                    # retry on each remaining part of the node, skipping the
+                    # spans covered by skip nodes
+                    for partition_span in partition_spans:
+                        self.process_node(node, partial_span=partition_span)