update guessit and subliminal libs. Fixes #678

This commit is contained in:
clinton-hall 2015-01-19 14:22:30 +10:30
commit f716323b76
72 changed files with 9350 additions and 3032 deletions

View file

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
@ -18,10 +18,10 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
from __future__ import absolute_import, division, print_function, unicode_literals
from guessit import UnicodeMixin, s, u, base_text_type
from guessit.language import Language
from guessit.country import Country
from babelfish import Language, Country
import json
import datetime
import logging
@ -29,6 +29,111 @@ import logging
log = logging.getLogger(__name__)
class GuessMetadata(object):
    """Metadata attached to a guess: confidence, input string, span and property.

    When defined on a single property of a Guess object, a value set here
    overrides the one defined on the global (parent) metadata; values left
    unset are looked up on the parent instead.

    :param parent: The parent metadata, consulted for values not set on this object
    :type parent: :class:`GuessMetadata`
    :param confidence: The confidence (from 0.0 to 1.0)
    :type confidence: number
    :param input: The input string
    :type input: string
    :param span: The (start, end) span inside the input string
    :type span: tuple (int, int)
    :param prop: The found property definition
    :type prop: :class:`guessit.containers._Property`
    """

    def __init__(self, parent=None, confidence=None, input=None, span=None, prop=None, *args, **kwargs):
        # Extra positional and keyword arguments are accepted but not used.
        self.parent = parent
        # Without a parent to inherit from, an unspecified confidence is 1.0.
        has_no_source = confidence is None and self.parent is None
        self._confidence = 1.0 if has_no_source else confidence
        self._input = input
        self._span = span
        self._prop = prop

    @property
    def confidence(self):
        """The confidence, taken from the parent when not set here.

        :rtype: number
        :return: confidence value, or None if neither this object nor a
            parent provides one
        """
        if self._confidence is not None:
            return self._confidence
        if self.parent:
            return self.parent.confidence
        return None

    @confidence.setter
    def confidence(self, confidence):
        self._confidence = confidence

    @property
    def input(self):
        """The input, taken from the parent when not set here.

        :rtype: string
        :return: String used to find this guess value
        """
        if self._input is not None:
            return self._input
        if self.parent:
            return self.parent.input
        return None

    @input.setter
    def input(self, input):
        """Set the input string for this object."""
        self._input = input

    @property
    def span(self):
        """The span, taken from the parent when not set here.

        :rtype: tuple (int, int)
        :return: span of input string used to find this guess value
        """
        if self._span is not None:
            return self._span
        if self.parent:
            return self.parent.span
        return None

    @span.setter
    def span(self, span):
        """Set the span of the input string for this object."""
        self._span = span

    @property
    def prop(self):
        """The property definition, taken from the parent when not set here.

        :rtype: :class:`_Property`
        :return: The property
        """
        if self._prop is not None:
            return self._prop
        if self.parent:
            return self.parent.prop
        return None

    @property
    def raw(self):
        """Return the raw information (original match from the string,
        not the cleaned version): the slice of ``input`` delimited by ``span``.

        Returns None when either the input or the span is missing or empty.
        """
        text = self.input
        if not text:
            return None
        bounds = self.span
        if not bounds:
            return None
        return text[bounds[0]:bounds[1]]

    def __repr__(self, *args, **kwargs):
        # Explicitly use the default object representation.
        return object.__repr__(self, *args, **kwargs)
def _split_kwargs(**kwargs):
    """Separate the metadata-related keyword arguments from the others.

    Every keyword whose name is listed by ``dir(GuessMetadata)`` is moved
    out of ``kwargs`` into a dedicated dict.

    :return: tuple ``(metadata_args, remaining_kwargs)``
    """
    metadata_args = {}
    for name in dir(GuessMetadata):
        if name in kwargs:
            metadata_args[name] = kwargs.pop(name)
    return metadata_args, kwargs
class Guess(UnicodeMixin, dict):
"""A Guess is a dictionary which has an associated confidence for each of
its values.
@ -37,91 +142,125 @@ class Guess(UnicodeMixin, dict):
simple dict."""
def __init__(self, *args, **kwargs):
# Build the guess from dict-style arguments and attach per-property
# bookkeeping to every key.
# NOTE(review): this span reads like a rendered diff with removed and added
# lines interleaved. The `confidence`/`raw` pops and the `_confidence`/`_raw`
# dicts look like the older scheme; `_split_kwargs`/`GuessMetadata` look like
# the newer one. Confirm against the repository before relying on either.
# Take `confidence` out of kwargs, defaulting to 0 when absent.
try:
confidence = kwargs.pop('confidence')
except KeyError:
confidence = 0
# Take `raw` out of kwargs, defaulting to None when absent.
try:
raw = kwargs.pop('raw')
except KeyError:
raw = None
# As written, `confidence` has already been popped above, so
# `_split_kwargs` cannot see it here.
metadata_kwargs, kwargs = _split_kwargs(**kwargs)
self._global_metadata = GuessMetadata(**metadata_kwargs)
# Whatever is left in kwargs goes to the dict constructor.
dict.__init__(self, *args, **kwargs)
self._confidence = {}
self._raw = {}
self._metadata = {}
# Every initial key gets the same confidence/raw and a metadata object
# whose parent is the global metadata.
for prop in self:
self._confidence[prop] = confidence
self._raw[prop] = raw
self._metadata[prop] = GuessMetadata(parent=self._global_metadata)
def rename(self, old_name, new_name):
    """Rename a property, moving both its value and its metadata.

    :param old_name: current name of the property
    :param new_name: name the property should have afterwards
    :return: True if a value was stored under ``old_name`` and has been
        moved, False otherwise (metadata, if any, is moved either way)
    """
    if old_name in self._metadata:
        self._metadata[new_name] = self._metadata.pop(old_name)

    if old_name not in self:
        return False

    value = self[old_name]
    del self[old_name]
    self[new_name] = value
    return True
def to_dict(self, advanced=False):
"""Return the guess as a dict containing only base types, ie:
where dates, languages, countries, etc. are converted to strings.
if advanced is True, return the data as a json string containing
also the raw information of the properties."""
# Work on a shallow copy so the guess itself is left untouched.
data = dict(self)
for prop, value in data.items():
# Dates are converted with isoformat().
if isinstance(value, datetime.date):
data[prop] = value.isoformat()
# NOTE(review): two consecutive `elif` headers follow and the first has no
# body of its own here; it looks like the superseded line of a rendered
# diff. Confirm which one is current.
elif isinstance(value, (Language, Country, base_text_type)):
elif isinstance(value, (UnicodeMixin, base_text_type)):
data[prop] = u(value)
# Language/Country values are replaced by their `guessit` attribute.
elif isinstance(value, (Language, Country)):
data[prop] = value.guessit
# Lists are converted element by element with u().
elif isinstance(value, list):
data[prop] = [u(x) for x in value]
if advanced:
# NOTE(review): the next line and the `metadata`/`prop_data` lines after it
# both assign data[prop]; as written the later assignment is the one that
# remains. The first looks like the older form. Confirm.
data[prop] = {"value": data[prop], "raw": self.raw(prop), "confidence": self.confidence(prop)}
metadata = self.metadata(prop)
prop_data = {'value': data[prop]}
# 'raw' and 'confidence' are only included when they are truthy.
if metadata.raw:
prop_data['raw'] = metadata.raw
if metadata.confidence:
prop_data['confidence'] = metadata.confidence
data[prop] = prop_data
return data
def nice_string(self, advanced=False):
"""Return a string with the property names and their values,
that also displays the associated confidence to each property.
FIXME: doc with param"""
# With advanced set, the result is the JSON dump of to_dict(advanced).
if advanced:
data = self.to_dict(advanced)
return json.dumps(data, indent=4)
# NOTE(review): `else:` appears twice in a row here; this looks like the old
# and new side of a whitespace-only change in a rendered diff.
else:
else:
data = self.to_dict()
# Dump as indented JSON, then prefix each matching line with the
# confidence of the property it shows.
parts = json.dumps(data, indent=4).split('\n')
for i, p in enumerate(parts):
# Lines whose first 5 characters do not match the expected prefix are left
# untouched.
# NOTE(review): the literal compared against is shorter than 5 characters as
# shown here; whitespace inside it may have been collapsed. Confirm.
if p[:5] != ' "':
continue
# The property name is the text between the first pair of double quotes.
prop = p.split('"')[1]
parts[i] = (' [%.2f] "' % self.confidence(prop)) + p[5:]
return '\n'.join(parts)
def __unicode__(self):
    """Return ``u()`` applied to the plain-dict form given by ``to_dict()``."""
    plain = self.to_dict()
    return u(plain)
def confidence(self, prop):
# Look up `prop` in the `_confidence` dict; -1 when there is no entry.
# NOTE(review): a second `confidence(self, prop=None)` is defined further
# down in this class; in a single class body the later definition replaces
# this one. This looks like the superseded version. Confirm.
return self._confidence.get(prop, -1)
def metadata(self, prop=None):
    """Return the metadata associated with the given property name.

    If no property name is given, the global metadata is returned. If the
    property has no metadata yet, a new ``GuessMetadata`` whose parent is
    the global metadata is created, stored and returned.
    """
    if prop is None:
        return self._global_metadata
    try:
        return self._metadata[prop]
    except KeyError:
        created = GuessMetadata(parent=self._global_metadata)
        self._metadata[prop] = created
        return created
def confidence(self, prop=None):
    """Return the confidence recorded in the metadata of ``prop``.

    With no property name, the metadata returned by ``metadata(None)`` is
    used.
    """
    meta = self.metadata(prop)
    return meta.confidence
def set_confidence(self, prop, confidence):
    """Store ``confidence`` on the metadata object of the property ``prop``."""
    target = self.metadata(prop)
    target.confidence = confidence
def raw(self, prop):
# NOTE(review): two `return` statements follow; as written the first one
# wins and the second is unreachable. The first looks like the older
# `_raw`-dict lookup and the second like its metadata-based replacement.
# Confirm which is current.
return self._raw.get(prop, None)
return self.metadata(prop).raw
def set(self, prop, value, confidence=None, raw=None):
# Store `value` under `prop`; confidence and raw are only recorded when they
# are not None.
# NOTE(review): another `set(self, prop_name, value, *args, **kwargs)` is
# defined right after this one; in a single class body the later definition
# replaces this one. This looks like the superseded version. Confirm.
self[prop] = value
if confidence is not None:
self._confidence[prop] = confidence
if raw is not None:
self._raw[prop] = raw
def set(self, prop_name, value, *args, **kwargs):
    """Set a property together with its metadata, or remove it.

    :param prop_name: name of the property
    :param value: value to store; ``None`` removes the property and its
        metadata (missing entries are ignored)
    :param metadata: optional keyword; when given, this object is stored
        as the property's metadata as-is. Otherwise a new ``GuessMetadata``
        with the global metadata as parent is built from the remaining
        arguments.
    """
    if value is None:
        # Removal: drop value and metadata if they exist.
        if prop_name in self:
            del self[prop_name]
        self._metadata.pop(prop_name, None)
        return

    self[prop_name] = value
    if 'metadata' in kwargs:
        self._metadata[prop_name] = kwargs['metadata']
    else:
        self._metadata[prop_name] = GuessMetadata(parent=self._global_metadata, *args, **kwargs)
def set_confidence(self, prop, value):
# Record `value` in the `_confidence` dict under `prop`.
# NOTE(review): `set_confidence(self, prop, confidence)` is also defined
# earlier in this class, writing to the property's metadata instead; in a
# single class body this later definition would replace it. This one looks
# like the superseded version. Confirm.
self._confidence[prop] = value
def set_raw(self, prop, value):
# Record `value` in the `_raw` dict under `prop`.
# NOTE(review): `_raw` belongs to the dict-based bookkeeping that appears to
# be superseded by per-property metadata elsewhere in this class. Confirm
# whether this method is still current.
self._raw[prop] = value
def update(self, other, confidence=None, raw=None):
# NOTE(review): two `def update` headers appear back to back and the body
# below mixes `_confidence`/`_raw` dict writes with `_metadata` writes. This
# looks like the removed and added lines of a rendered diff interleaved.
# Confirm which lines are current before editing.
def update(self, other, confidence=None):
# Copy the key/value pairs first, using the plain dict behaviour.
dict.update(self, other)
# When `other` is a Guess, carry its per-property bookkeeping over too.
if isinstance(other, Guess):
for prop in other:
self._confidence[prop] = other.confidence(prop)
self._raw[prop] = other.raw(prop)
# Properties of `other` without a metadata entry are skipped.
try:
self._metadata[prop] = other._metadata[prop]
except KeyError:
pass
# An explicit confidence is applied to every property of `other`.
if confidence is not None:
for prop in other:
self._confidence[prop] = confidence
if raw is not None:
for prop in other:
self._raw[prop] = raw
self.set_confidence(prop, confidence)
def update_highest_confidence(self, other):
"""Update this guess with the values from the given one. In case
@ -131,17 +270,16 @@ class Guess(UnicodeMixin, dict):
raise ValueError('Can only call this function on Guess instances')
for prop in other:
if prop in self and self.confidence(prop) >= other.confidence(prop):
if prop in self and self.metadata(prop).confidence >= other.metadata(prop).confidence:
continue
self[prop] = other[prop]
self._confidence[prop] = other.confidence(prop)
self._raw[prop] = other.raw(prop)
self._metadata[prop] = other.metadata(prop)
def choose_int(g1, g2):
"""Function used by merge_similar_guesses to choose between 2 possible
properties when they are integers."""
v1, c1 = g1 # value, confidence
v1, c1 = g1 # value, confidence
v2, c2 = g2
if (v1 == v2):
return (v1, 1 - (1 - c1) * (1 - c2))
@ -179,7 +317,7 @@ def choose_string(g1, g2):
('The Simpsons', 0.75)
"""
v1, c1 = g1 # value, confidence
v1, c1 = g1 # value, confidence
v2, c2 = g2
if not v1:
@ -193,26 +331,26 @@ def choose_string(g1, g2):
combined_prob = 1 - (1 - c1) * (1 - c2)
if v1l == v2l:
return (v1, combined_prob)
return v1, combined_prob
# check for common patterns
elif v1l == 'the ' + v2l:
return (v1, combined_prob)
return v1, combined_prob
elif v2l == 'the ' + v1l:
return (v2, combined_prob)
return v2, combined_prob
# if one string is contained in the other, return the shortest one
elif v2l in v1l:
return (v2, combined_prob)
return v2, combined_prob
elif v1l in v2l:
return (v1, combined_prob)
return v1, combined_prob
# in case of conflict, return the one with highest confidence
else:
if c1 > c2:
return (v1, c1 - c2)
return v1, c1 - c2
else:
return (v2, c2 - c1)
return v2, c2 - c1
def _merge_similar_guesses_nocheck(guesses, prop, choose):
@ -226,17 +364,7 @@ def _merge_similar_guesses_nocheck(guesses, prop, choose):
g1, g2 = similar[0], similar[1]
other_props = set(g1) & set(g2) - set([prop])
if other_props:
log.debug('guess 1: %s' % g1)
log.debug('guess 2: %s' % g2)
for prop in other_props:
if g1[prop] != g2[prop]:
log.warning('both guesses to be merged have more than one '
'different property in common, bailing out...')
return
# merge all props of s2 into s1, updating the confidence for the
# merge only this prop of s2 into s1, updating the confidence for the
# considered property
v1, v2 = g1[prop], g2[prop]
c1, c2 = g1.confidence(prop), g2.confidence(prop)
@ -248,11 +376,12 @@ def _merge_similar_guesses_nocheck(guesses, prop, choose):
msg = "Updating non-matching property '%s' with confidence %.2f"
log.debug(msg % (prop, new_confidence))
g2[prop] = new_value
g2.set_confidence(prop, new_confidence)
g1.set(prop, new_value, confidence=new_confidence)
g2.pop(prop)
g1.update(g2)
guesses.remove(g2)
# remove g2 if there are no properties left
if not g2.keys():
guesses.remove(g2)
def merge_similar_guesses(guesses, prop, choose):
@ -286,43 +415,53 @@ def merge_all(guesses, append=None):
instead of being merged.
>>> s(merge_all([ Guess({'season': 2}, confidence=0.6),
... Guess({'episodeNumber': 13}, confidence=0.8) ]))
{'season': 2, 'episodeNumber': 13}
... Guess({'episodeNumber': 13}, confidence=0.8) ])
... ) == {'season': 2, 'episodeNumber': 13}
True
>>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02),
... Guess({'season': 1}, confidence=0.2) ]))
{'season': 1}
... Guess({'season': 1}, confidence=0.2) ])
... ) == {'season': 1}
True
>>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8),
... Guess({'releaseGroup': '2HD'}, confidence=0.8) ],
... append=['other']))
{'releaseGroup': '2HD', 'other': ['PROPER']}
... append=['other'])
... ) == {'releaseGroup': '2HD', 'other': ['PROPER']}
True
"""
result = Guess()
if not guesses:
return Guess()
return result
result = guesses[0]
if append is None:
append = []
for g in guesses[1:]:
for g in guesses:
# first append our appendable properties
for prop in append:
if prop in g:
result.set(prop, result.get(prop, []) + [g[prop]],
if isinstance(g[prop], (list, set)):
new_values = result.get(prop, []) + list(g[prop])
else:
new_values = result.get(prop, []) + [g[prop]]
result.set(prop, new_values,
# TODO: what to do with confidence here? maybe an
# arithmetic mean...
confidence=g.confidence(prop),
raw=g.raw(prop))
confidence=g.metadata(prop).confidence,
input=g.metadata(prop).input,
span=g.metadata(prop).span,
prop=g.metadata(prop).prop)
del g[prop]
# then merge the remaining ones
dups = set(result) & set(g)
if dups:
log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] )
log.debug('duplicate properties %s in merged result...' % [(result[p], g[p]) for p in dups])
result.update_highest_confidence(g)
@ -338,8 +477,38 @@ def merge_all(guesses, append=None):
if isinstance(value, list):
result[prop] = list(set(value))
else:
result[prop] = [ value ]
result[prop] = [value]
except KeyError:
pass
return result
def smart_merge(guesses):
    """Merge a list of guesses into a single one.

    Well-known similar properties are merged first, then everything that
    is left is combined with a ``merge_all`` call.

    Should be the function to call in most cases, unless one wants to have
    more control.

    Warning: this function is destructive, ie: it will merge the list
    in-place.
    """
    int_props = ('year', 'season', 'episodeNumber')
    string_props = ('title', 'series', 'container', 'format',
                    'releaseGroup', 'website', 'audioCodec',
                    'videoCodec', 'screenSize', 'episodeFormat',
                    'audioChannels', 'idNumber')
    appendable_props = ['language', 'subtitleLanguage', 'other',
                        'episodeDetails', 'unidentified']

    # 1- try to merge similar information together and give it a higher
    #    confidence (integer properties first, then string properties)
    for names, chooser in ((int_props, choose_int),
                           (string_props, choose_string)):
        for name in names:
            merge_similar_guesses(guesses, name, chooser)

    # 2- merge the rest, potentially discarding information not properly
    #    merged before
    return merge_all(guesses, append=appendable_props)