nzbToMedia/libs/guessit/__init__.py
2015-01-19 14:22:30 +10:30

359 lines
12 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import, division, print_function, unicode_literals
import pkg_resources
from .__version__ import __version__
# Names exported by ``from guessit import *``.
__all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info',
           'default_options']
# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys

if sys.version_info[0] >= 3:  # pragma: no cover
    PY2, PY3 = False, True
    unicode_text_type = str
    native_text_type = str
    base_text_type = str

    def u(x):
        """Coerce *x* to text.

        Bytes are decoded as UTF-8 and lists are converted element by
        element, mirroring what the Python 2 implementation below does;
        any other value goes through ``str()``.
        """
        if isinstance(x, bytes):
            # str(b'abc') would yield the repr "b'abc'", not the text.
            return x.decode('utf-8')
        if isinstance(x, list):
            return [u(item) for item in x]
        return str(x)

    def s(x):
        """Return *x* unchanged: native strings are already text here."""
        return x

    class UnicodeMixin(object):
        """Mixin deriving ``__str__`` from a ``__unicode__`` method."""
        __str__ = lambda x: x.__unicode__()

    import binascii

    def to_hex(x):
        """Return the hexadecimal representation of the bytes *x* as text."""
        return binascii.hexlify(x).decode('utf-8')

else:  # pragma: no cover
    PY2, PY3 = True, False
    __all__ = [str(s) for s in __all__]  # fix imports for python2
    unicode_text_type = unicode
    native_text_type = str
    base_text_type = basestring

    def u(x):
        """Coerce *x* to ``unicode``.

        Byte strings are decoded as UTF-8 and lists are converted element
        by element; any other value goes through ``unicode()``.
        """
        if isinstance(x, str):
            return x.decode('utf-8')
        if isinstance(x, list):
            return [u(item) for item in x]
        return unicode(x)

    def s(x):
        """Coerce *x* to native byte strings.

        ``unicode`` values are encoded as UTF-8; lists, tuples and dicts
        (keys and values) are converted recursively; any other value is
        returned unchanged.
        """
        if isinstance(x, unicode):
            return x.encode('utf-8')
        if isinstance(x, list):
            return [s(y) for y in x]
        if isinstance(x, tuple):
            return tuple(s(y) for y in x)
        if isinstance(x, dict):
            return dict((s(key), s(value)) for key, value in x.items())
        return x

    class UnicodeMixin(object):
        """Mixin deriving a UTF-8 encoded ``__str__`` from ``unicode(self)``."""
        __str__ = lambda x: unicode(x).encode('utf-8')

    def to_hex(x):
        """Return the hexadecimal representation of the byte string *x*."""
        return x.encode('hex')

    range = xrange
from guessit.guess import Guess, smart_merge
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_default, is_camel, from_camel
import babelfish
import os.path
import logging
from copy import deepcopy
log = logging.getLogger(__name__)


class NullHandler(logging.Handler):
    """Logging handler that discards every record it receives."""

    def emit(self, record):
        # Intentionally a no-op: the record is dropped.
        return None


# let's be a nicely behaving library: attach a do-nothing handler to the
# package logger and leave any real logging configuration to the application
h = NullHandler()
log.addHandler(h)
def _guess_filename(filename, options=None, **kwargs):
    """Guess properties from *filename* and return the matched result.

    Builds the match tree for *filename* (``options`` and ``kwargs`` are
    forwarded to ``_build_filename_mtree``). When the ``split_camel`` option
    is set, camel-cased strings are additionally analysed before the result
    of ``mtree.matched()`` is returned.
    """
    mtree = _build_filename_mtree(filename, options=options, **kwargs)
    # ``options`` defaults to None, so it must be checked before calling
    # ``.get`` on it.
    if options and options.get('split_camel'):
        _add_camel_properties(mtree, options=options)
    return mtree.matched()
def _build_filename_mtree(filename, options=None, **kwargs):
    """Build an ``IterativeMatcher`` for *filename*, re-running it if needed.

    A first matcher is built with the given ``options``. If it exposes a
    non-empty ``second_pass_options``, a second matcher is built with those
    options merged on top of the original ones, and that one is returned.
    """
    mtree = IterativeMatcher(filename, options=options, **kwargs)
    second_pass_options = mtree.second_pass_options
    if second_pass_options:
        log.debug("Running 2nd pass")
        # ``options`` may be None (its default); copy so the caller's
        # mapping is never mutated.
        merged_options = dict(options or {})
        merged_options.update(second_pass_options)
        mtree = IterativeMatcher(filename, options=merged_options, **kwargs)
    return mtree
def _add_camel_properties(mtree, options=None, **kwargs):
    """Run camel-case detection on the main name and on unidentified leaves.

    The main name is the ``series`` property when the matched type is
    ``episode`` and the ``title`` property otherwise. It is analysed with
    ``skip_title=False``; every unidentified leaf of the match tree is then
    analysed with ``skip_title=True``.
    """
    if mtree.matched().get('type') == 'episode':
        prop = 'series'
    else:
        prop = 'title'

    _guess_camel_string(mtree, mtree.matched().get(prop),
                        options=options, skip_title=False, **kwargs)

    for leaf in mtree.match_tree.unidentified_leaves():
        _guess_camel_string(mtree, leaf.value,
                            options=options, skip_title=True, **kwargs)
def _guess_camel_string(mtree, string, options=None, skip_title=False, **kwargs):
    """Try to detect more properties from a camel-cased *string*.

    If *string* is non-empty and camel-cased, it is split with
    ``from_camel`` and matched again as a name-only string. Any properties
    found are merged into ``mtree.matched()``.

    Returns True when at least one property was found and merged, False
    otherwise (including when *string* is empty or not camel-cased).
    """
    if not string or not is_camel(string):
        return False

    log.debug('"%s" is camel cased. Try to detect more properties.', string)
    uncameled_value = from_camel(string)

    # ``options`` may be None (its default); copy so that the caller's
    # mapping is not modified by the 'type' override below.
    merged_options = dict(options or {})
    current_type = mtree.match_tree.info.get('type')
    if current_type and current_type != 'unknown':
        # Reuse the type already detected for the whole tree.
        merged_options['type'] = current_type

    camel_tree = _build_filename_mtree(uncameled_value, options=merged_options,
                                       name_only=True, skip_title=skip_title,
                                       **kwargs)
    camel_matched = camel_tree.matched()
    if len(camel_matched) > 0:
        mtree.matched().update(camel_matched)
        return True
    return False
def guess_video_metadata(filename):
    """Gets the video metadata properties out of a given file. The file needs to
    exist on the filesystem to be able to be analyzed. An empty guess is
    returned otherwise.

    You need to have the Enzyme python package installed for this to work.

    The file is parsed with ``enzyme.MKV``. Properties that may be set on the
    returned ``Guess`` are ``fileSize``, ``duration``, ``screenSize``,
    ``videoCodec``, ``audioCodec`` and ``subtitleLanguage``. Errors are
    logged and whatever was gathered so far is returned.
    """
    result = Guess()

    def found(prop, value):
        result[prop] = value
        log.debug('Found with enzyme %s: %s', prop, value)

    # Known Matroska codec ids and the value reported for each of them.
    video_codecs = {
        'V_MPEG4/ISO/AVC': 'h264',
        'V_MPEG4/ISO/SP': 'DivX',
        'V_MPEG4/ISO/ASP': 'XviD',
    }
    audio_codecs = {
        'A_AC3': 'AC3',
        'A_DTS': 'DTS',
        'A_AAC': 'AAC',
    }

    # first get the size of the file, in bytes
    try:
        size = os.stat(filename).st_size
        found('fileSize', size)
    except Exception as e:
        log.error('Cannot get video file size: %s', e)
        # file probably does not exist, we might as well return now
        return result

    # then get additional metadata from the file using enzyme, if available
    try:
        import enzyme

        # The container is binary data: it must be opened in binary mode,
        # text mode would attempt to decode it (Python 3) or translate
        # newlines (Windows).
        with open(filename, 'rb') as f:
            mkv = enzyme.MKV(f)

            duration = mkv.info.duration
            if duration is not None:
                found('duration', duration.total_seconds())

            if mkv.video_tracks:
                video_track = mkv.video_tracks[0]

                # resolution: only the standard heights are reported
                # TODO: do we want to report '<width>x<height>' otherwise?
                if video_track.height in (480, 720, 1080):
                    scan = 'i' if video_track.interlaced else 'p'
                    found('screenSize', '%d%s' % (video_track.height, scan))

                # video codec
                video_codec = video_codecs.get(video_track.codec_id)
                if video_codec:
                    found('videoCodec', video_codec)
            else:
                log.warning('MKV has no video track')

            if mkv.audio_tracks:
                audio_track = mkv.audio_tracks[0]

                # audio codec
                audio_codec = audio_codecs.get(audio_track.codec_id)
                if audio_codec:
                    found('audioCodec', audio_codec)
            else:
                log.warning('MKV has no audio track')

            if mkv.subtitle_tracks:
                embedded_subtitle_languages = set()
                for st in mkv.subtitle_tracks:
                    try:
                        if st.language:
                            lang = babelfish.Language.fromalpha3b(st.language)
                        elif st.name:
                            lang = babelfish.Language.fromname(st.name)
                        else:
                            lang = babelfish.Language('und')
                    except babelfish.Error:
                        # unrecognised language: fall back to 'und'
                        lang = babelfish.Language('und')
                    embedded_subtitle_languages.add(lang)
                found('subtitleLanguage', embedded_subtitle_languages)
            else:
                log.debug('MKV has no subtitle track')

        return result

    except ImportError:
        log.error('Cannot get video file metadata, missing dependency: enzyme')
        log.error('Please install it from PyPI, by doing eg: pip install enzyme')
        return result

    except IOError as e:
        log.error('Could not open file: %s', filename)
        log.error('Make sure it exists and is available for reading on the filesystem')
        log.error('Error: %s', e)
        return result

    except enzyme.Error as e:
        log.error('Cannot guess video file metadata')
        log.error('enzyme.Error while reading file: %s', filename)
        log.error('Error: %s', e)
        return result
# Options applied to every guess_file_info() call; explicit per-call
# options take precedence over these.
default_options = {}


def guess_file_info(filename, info=None, options=None, **kwargs):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt')
    >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1'])
    >>> g['hash_md5'], g['hash_sha1']
    ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c')

    ``info`` may be a single name or a list of names and defaults to
    ``'filename'``. Recognised names are ``'filename'``, ``'video'``,
    ``'hash_mpc'``, ``'hash_ed2k'`` and ``'hash_<name>'`` for any algorithm
    known to ``hashlib``. ``options`` is merged on top of the module-level
    ``default_options``. Failures of individual info types are logged as
    warnings and skipped. The partial guesses are combined with
    ``smart_merge`` and the merged result is returned.
    """
    info = info or 'filename'
    options = options or {}

    if default_options:
        # Copy first so that the module-level defaults are never mutated.
        merged_options = deepcopy(default_options)
        merged_options.update(options)
        options = merged_options

    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, options, **kwargs))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({infotype: hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s', e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({infotype: hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s', e)

        elif infotype.startswith('hash_'):
            import hashlib
            hashname = infotype[5:]
            try:
                # hashlib.new() only ever returns hash objects, whereas an
                # attribute lookup on the module could also hit helpers
                # such as ``new`` or ``pbkdf2_hmac``.
                hasher = hashlib.new(hashname)
                hashers.append((infotype, hasher))
            except (ValueError, TypeError):
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module', hashname)

        elif infotype == 'video':
            g = guess_video_metadata(filename)
            if g:
                result.append(g)

        else:
            log.warning('Invalid infotype: %s', infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(blocksize), b''):
                    # Feed every hasher, including those of info types
                    # that were requested more than once.
                    for _, hasher in hashers:
                        hasher.update(chunk)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            log.warning('Could not compute hash because: %s', e)

    result = smart_merge(result)

    return result
def guess_video_info(filename, info=None, options=None, **kwargs):
    """Call ``guess_file_info`` with ``type='video'``.

    All other arguments are forwarded unchanged.
    """
    return guess_file_info(filename, info, options, type='video', **kwargs)
def guess_movie_info(filename, info=None, options=None, **kwargs):
    """Call ``guess_file_info`` with ``type='movie'``.

    All other arguments are forwarded unchanged.
    """
    return guess_file_info(filename, info, options, type='movie', **kwargs)
def guess_episode_info(filename, info=None, options=None, **kwargs):
    """Call ``guess_file_info`` with ``type='episode'``.

    All other arguments are forwarded unchanged.
    """
    return guess_file_info(filename, info, options, type='episode', **kwargs)