update guessit and subliminal libs. Fixes #678

This commit is contained in:
clinton-hall 2015-01-19 14:22:30 +10:30
commit f716323b76
72 changed files with 9350 additions and 3032 deletions

View file

@ -2,7 +2,8 @@
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2012 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Rémi Alvergnat <toilal.dev@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
@ -18,163 +19,288 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
from guessit import PY3, u, base_text_type
from guessit.matchtree import MatchTree
from guessit.textutils import normalize_unicode, clean_string
from __future__ import absolute_import, division, print_function, \
unicode_literals
import logging
from guessit import PY3, u
from guessit.transfo import TransformerException
from guessit.matchtree import MatchTree
from guessit.textutils import normalize_unicode, clean_default
from guessit.guess import Guess
import inspect
log = logging.getLogger(__name__)
class IterativeMatcher(object):
    """An iterative matcher tries to match different patterns that appear
    in the filename.

    The ``type`` option indicates which type of file you want to match.
    If it is undefined, the matcher will try to see whether it can guess
    that the file corresponds to an episode, or otherwise will assume it is
    a movie.

    The recognized ``type`` values are:
    ``['subtitle', 'info', 'video', 'movie', 'moviesubtitle', 'movieinfo',
    'episode', 'episodesubtitle', 'episodeinfo']``

    ``options`` is a dict of options values to be passed to the transformations used
    by the matcher.

    The IterativeMatcher works mainly in 2 steps:

    First, it splits the filename into a match_tree, which is a tree of groups
    which have a semantic meaning, such as episode number, movie title,
    etc...

    The match_tree created looks like the following::

        0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
        0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
        0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
        __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
        xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
        [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

    The first 3 lines indicate the group index in which a char in the
    filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and
    it corresponds to a video codec, denoted by the letter ``v`` in the 5th line.
    (for more info, see guess.matchtree.to_string)

    Second, it tries to merge all this information into a single object
    containing all the found properties, and does some (basic) conflict
    resolution when they arise.
    """

    def __init__(self, filename, options=None, **kwargs):
        """Build the match tree for ``filename`` and run every transformer on it.

        :param filename: the name to analyse. On Python 2 a non-unicode value
            is decoded as UTF-8 (with a warning).
        :param options: dict of option values; it is copied, never mutated.
        :param kwargs: extra options. A keyword only fills in an option that
            is missing from (or falsy in) ``options``.
        :raises ValueError: if the ``type`` option is set to an unknown value.

        Options read here: ``type``, ``clean_function`` (a callable or a
        dotted ``module.function`` path) and ``disabled_transformers``.
        """
        options = dict(options or {})
        for k, v in kwargs.items():
            if k not in options or not options[k]:
                options[k] = v  # options dict has priority over keyword arguments
        self._validate_options(options)
        if not PY3 and not isinstance(filename, unicode):
            log.warning('Given filename to matcher is not unicode...')
            filename = filename.decode('utf-8')

        filename = normalize_unicode(filename)

        requested_clean_function = options.get('clean_function')
        if requested_clean_function:
            clean_function = self._resolve_clean_function(requested_clean_function)
            if clean_function is None:
                log.error('Can\'t find clean function %s. Default will be used.' % requested_clean_function)
                clean_function = clean_default
        else:
            clean_function = clean_default

        self.match_tree = MatchTree(filename, clean_function=clean_function)
        self.options = options
        self._transfo_calls = []

        # sanity check: make sure we don't process a (mostly) empty string
        if clean_function(filename).strip() == '':
            return

        # imported here rather than at module level, as in the original code
        from guessit.plugins import transformers

        try:
            mtree = self.match_tree
            if 'type' in self.options:
                # confidence 0.0: the requested type is only a hint
                mtree.guess.set('type', self.options['type'], confidence=0.0)

            # Process
            for transformer in transformers.all_transformers():
                disabled = options.get('disabled_transformers')
                if not disabled or transformer.name not in disabled:
                    self._process(transformer, False)

            # Post-process
            for transformer in transformers.all_transformers():
                disabled = options.get('disabled_transformers')
                if not disabled or transformer.name not in disabled:
                    self._process(transformer, True)

            log.debug('Found match tree:\n%s' % u(mtree))
        except TransformerException as e:
            # a failing transformer aborts the remaining ones but keeps
            # whatever has been matched so far
            log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e))

    @staticmethod
    def _resolve_clean_function(clean_function):
        """Turn the ``clean_function`` option into a callable.

        :param clean_function: either a callable (returned unchanged) or a
            dotted ``module.function`` path. A path without a module part is
            looked up in ``guessit.textutils``.
        :return: the callable, or ``None`` when the module cannot be imported
            or does not expose a callable of that name.
        """
        if hasattr(clean_function, '__call__'):
            return clean_function

        import importlib

        # split on the LAST dot only, so that 'pkg.mod.func' -> ('pkg.mod', 'func')
        module_name, _, function_name = clean_function.rpartition('.')
        if not module_name:
            module_name = 'guessit.textutils'
        try:
            # import_module returns the leaf module, unlike __import__ which
            # returns the top-level package for a dotted name
            module = importlib.import_module(module_name)
        except ImportError:
            return None
        resolved = getattr(module, function_name, None)
        if not hasattr(resolved, '__call__'):
            return None
        return resolved

    def _process(self, transformer, post=False):
        """Run one pass of ``transformer`` on the match tree.

        The transformer is skipped when it defines ``should_process`` and
        that returns a falsy value. ``post`` selects ``post_process`` over
        ``process``. Each transformer that ran is appended to
        ``self._transfo_calls`` (once per pass).
        """
        if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options):
            if post:
                transformer.post_process(self.match_tree, self.options)
            else:
                transformer.process(self.match_tree, self.options)
            self._transfo_calls.append(transformer)

    @property
    def second_pass_options(self):
        """Options suggested by the transformers that ran, merged in call order.

        Only transformers exposing ``second_pass_options`` contribute; later
        transformers override keys set by earlier ones.
        """
        second_pass_options = {}
        for transformer in self._transfo_calls:
            if hasattr(transformer, 'second_pass_options'):
                transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options)
                if transformer_second_pass_options:
                    second_pass_options.update(transformer_second_pass_options)

        return second_pass_options

    def _validate_options(self, options):
        """Raise ``ValueError`` if ``options['type']`` is set but not recognized."""
        valid_filetypes = ('subtitle', 'info', 'video',
                           'movie', 'moviesubtitle', 'movieinfo',
                           'episode', 'episodesubtitle', 'episodeinfo')

        type_ = options.get('type')
        if type_ and type_ not in valid_filetypes:
            raise ValueError("filetype needs to be one of %s" % (valid_filetypes,))

    def matched(self):
        """Return the result of ``self.match_tree.matched()``."""
        return self.match_tree.matched()
def build_guess(node, name, value=None, confidence=1.0):
    """Create a single-property ``Guess`` for ``node``.

    When ``value`` is given it is used both as the property value and as the
    guess input, and no span is set. Otherwise the property value is
    ``node.clean_value``, the input is ``node.value``, and the span is the
    node span (made relative by subtracting ``node.offset``) narrowed to the
    first occurrence of the clean value's first character and the last
    occurrence of its last character inside ``node.value``.
    """
    from_node = value is None
    guess = Guess({name: node.clean_value if from_node else value}, confidence=confidence)
    guess.metadata().input = node.value if from_node else value
    if from_node:
        cleaned = node.clean_value
        raw = node.value
        # number of leading / trailing raw characters that precede the first /
        # follow the last character of the clean value (whole length if absent)
        left_offset = next(
            (index for index, char in enumerate(raw) if cleaned[0] == char),
            len(raw))
        right_offset = next(
            (index for index, char in enumerate(reversed(raw)) if cleaned[-1] == char),
            len(raw))
        relative_start = node.span[0] - node.offset + left_offset
        relative_end = node.span[1] - node.offset - right_offset
        guess.metadata().span = (relative_start, relative_end)
    return guess
def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None):
    """Build a guess for property ``name`` and register it on ``node``.

    :param node: match-tree node the property was found in.
    :param name: property name.
    :param value: property value; ``None`` means "use the node's clean value".
    :param confidence: confidence attached to the guess.
    :param update_guess: forwarded to :func:`found_guess`.
    :param logger: logger used to report the property. When omitted, the
        ``log`` attribute of the caller's ``self`` is used if there is one;
        otherwise ``None`` is forwarded and the module logger ends up being
        used by :func:`log_found_guess`.
    :return: whatever :func:`found_guess` returns.
    """
    if not logger:
        # automatically retrieve the log object from the caller frame.
        # Only the direct caller is needed, so walk one frame back instead of
        # materializing the whole stack with inspect.stack().
        current_frame = inspect.currentframe()
        caller_frame = current_frame.f_back if current_frame is not None else None
        try:
            if caller_frame is not None:
                caller_self = caller_frame.f_locals.get('self')
                # a caller without `self` / `self.log` no longer raises
                logger = getattr(caller_self, 'log', None)
        finally:
            # drop the frame references to avoid reference cycles
            del current_frame, caller_frame
    guess = build_guess(node, name, value, confidence)
    return found_guess(node, guess, update_guess=update_guess, logger=logger)
def found_guess(node, guess, update_guess=True, logger=None):
    """Attach ``guess`` to ``node``, log it, and return ``node.guess``.

    - If the node has no (truthy) guess yet, ``guess`` becomes the node guess.
    - Otherwise, with ``update_guess`` true, the existing node guess is
      updated through ``update_highest_confidence``.
    - Otherwise a child node is added for the guess span and receives the
      guess.
    """
    if not node.guess:
        node.guess = guess
    elif update_guess:
        node.guess.update_highest_confidence(guess)
    else:
        new_child = node.add_child(guess.metadata().span)
        new_child.guess = guess

    log_found_guess(guess, logger)
    return node.guess
def log_found_guess(guess, logger=None):
    """Emit one debug line per property of ``guess``.

    Uses ``logger`` when it is truthy, the module-level ``log`` otherwise.
    """
    for prop_name, prop_value in guess.items():
        emit = (logger or log).debug
        details = (prop_name, prop_value, guess.raw(prop_name), guess.confidence(prop_name))
        emit('Property found: %s=%s (%s) (confidence=%.2f)' % details)
def _get_split_spans(node, span):
partition_spans = node.get_partition_spans(span)
for to_remove_span in partition_spans:
if to_remove_span[0] == span[0] and to_remove_span[1] in [span[1], span[1] + 1]:
partition_spans.remove(to_remove_span)
break
return partition_spans
class GuessFinder(object):
    """Run a guessing function over match-tree nodes and record its findings.

    ``guess_func`` is called with a sentinel-padded string and the node (plus
    ``options`` when they are truthy). Its result is turned into a ``Guess``
    that is attached to the node, or to the child node covering the matched
    span.
    """

    def __init__(self, guess_func, confidence=None, logger=None, options=None):
        """Store the guessing function and its settings.

        :param guess_func: callable invoked as ``guess_func(string, node)`` or
            ``guess_func(string, node, options)``.
        :param confidence: confidence given to guesses built from a plain
            (non-``Guess``) result.
        :param logger: logger passed to ``found_guess``; the module-level
            ``log`` is used when it is falsy.
        :param options: optional options; the ``skip_nodes`` entry is read by
            :meth:`process_node`.
        """
        self.guess_func = guess_func
        self.confidence = confidence
        self.logger = logger or log
        self.options = options

    def process_nodes(self, nodes):
        """Call :meth:`process_node` on each node of ``nodes``, in order."""
        for node in nodes:
            self.process_node(node)

    def process_node(self, node, iterative=True, partial_span=None):
        """Apply ``guess_func`` to ``node`` and register what it finds.

        :param node: the match-tree node to examine.
        :param iterative: when true, the node is partitioned around the match
            and the remaining children are processed recursively; when false,
            the guess is attached to ``node`` directly.
        :param partial_span: optional ``(start, end)`` pair; only that slice
            of ``node.value`` is examined.
        """
        if partial_span:
            value = node.value[partial_span[0]:partial_span[1]]
        else:
            value = node.value
        string = ' %s ' % value  # add sentinels

        # options are only passed when truthy, so guess functions taking just
        # (string, node) keep working
        if not self.options:
            matcher_result = self.guess_func(string, node)
        else:
            matcher_result = self.guess_func(string, node, self.options)

        if matcher_result:
            # guess_func returns either a Guess (carrying its own span) or a
            # (result, span) pair
            if not isinstance(matcher_result, Guess):
                result, span = matcher_result
            else:
                result, span = matcher_result, matcher_result.metadata().span

            if result:
                # readjust span to compensate for sentinels
                span = (span[0] - 1, span[1] - 1)

                # readjust span to compensate for partial_span
                if partial_span:
                    span = (span[0] + partial_span[0], span[1] + partial_span[0])

                # spans to re-scan when the match collides with a node listed
                # in options['skip_nodes']; None means "no collision seen"
                partition_spans = None
                if self.options and 'skip_nodes' in self.options:
                    skip_nodes = self.options.get('skip_nodes')
                    for skip_node in skip_nodes:
                        # NOTE(review): `a and b or c` parses as `(a and b) or c`,
                        # so the offset-shifted comparison applies regardless of
                        # the node_idx prefix test -- confirm this is intended.
                        if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\
                            skip_node.span == span or\
                            skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset):
                            if partition_spans is None:
                                partition_spans = _get_split_spans(node, skip_node.span)
                            else:
                                new_partition_spans = []
                                for partition_span in partition_spans:
                                    tmp_node = MatchTree(value, span=partition_span, parent=node)
                                    tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span)
                                    new_partition_spans.extend(tmp_partitions_spans)
                                # NOTE(review): indentation is lost in this view; this
                                # statement is assumed to follow the inner loop (inside
                                # it, the iterated list would grow while being walked).
                                # It appends to, rather than replaces, the existing
                                # spans -- confirm against the original file.
                                partition_spans.extend(new_partition_spans)

                if not partition_spans:
                    # restore sentinels compensation

                    # no skip-node collision: record the guess
                    if isinstance(result, Guess):
                        guess = result
                    else:
                        guess = Guess(result, confidence=self.confidence, input=string, span=span)

                    if not iterative:
                        found_guess(node, guess, logger=self.logger)
                    else:
                        # child spans are compared in absolute coordinates
                        absolute_span = (span[0] + node.offset, span[1] + node.offset)
                        node.partition(span)
                        if node.is_leaf():
                            found_guess(node, guess, logger=self.logger)
                        else:
                            found_child = None
                            for child in node.children:
                                if child.span == absolute_span:
                                    found_guess(child, guess, logger=self.logger)
                                    found_child = child
                                    break
                            # keep looking for further matches in the other children
                            for child in node.children:
                                if child is not found_child:
                                    self.process_node(child)
                else:
                    # the match overlaps a skipped node: retry on each
                    # remaining partition of this node instead
                    for partition_span in partition_spans:
                        self.process_node(node, partial_span=partition_span)