nzbToMedia/libs/subliminal/video.py
2014-06-27 17:04:44 +09:30

411 lines
18 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division
import datetime
import hashlib
import logging
import os
import struct
import babelfish
import enzyme
import guessit
logger = logging.getLogger(__name__)
#: Video extensions
VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.asx', '.avchd', '.avi', '.bik',
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.omf', '.ps',
'.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo', '.vob',
'.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
#: Subtitle extensions
SUBTITLE_EXTENSIONS = ('.srt', '.sub', '.smi', '.txt', '.ssa', '.ass', '.mpl')
class Video(object):
"""Base class for videos
Represent a video, existing or not, with various properties that defines it.
Each property has an associated score based on equations that are described in
subclasses.
:param string name: name or path of the video
:param string format: format of the video (HDTV, WEB-DL, ...)
:param string release_group: release group of the video
:param string resolution: screen size of the video stream (480p, 720p, 1080p or 1080i)
:param string video_codec: codec of the video stream
:param string audio_codec: codec of the main audio stream
:param int imdb_id: IMDb id of the video
:param dict hashes: hashes of the video file by provider names
:param int size: byte size of the video file
:param set subtitle_languages: existing subtitle languages
"""
scores = {}
def __init__(self, name, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None,
imdb_id=None, hashes=None, size=None, subtitle_languages=None):
self.name = name
self.format = format
self.release_group = release_group
self.resolution = resolution
self.video_codec = video_codec
self.audio_codec = audio_codec
self.imdb_id = imdb_id
self.hashes = hashes or {}
self.size = size
self.subtitle_languages = subtitle_languages or set()
@classmethod
def fromguess(cls, name, guess):
if guess['type'] == 'episode':
return Episode.fromguess(name, guess)
if guess['type'] == 'movie':
return Movie.fromguess(name, guess)
raise ValueError('The guess must be an episode or a movie guess')
@classmethod
def fromname(cls, name):
return cls.fromguess(os.path.split(name)[1], guessit.guess_file_info(name))
def __repr__(self):
return '<%s [%r]>' % (self.__class__.__name__, self.name)
def __hash__(self):
return hash(self.name)
class Episode(Video):
"""Episode :class:`Video`
Scores are defined by a set of equations, see :func:`~subliminal.score.get_episode_equations`
:param string series: series of the episode
:param int season: season number of the episode
:param int episode: episode number of the episode
:param string title: title of the episode
:param int year: year of series
:param int tvdb_id: TheTVDB id of the episode
"""
scores = {'format': 3, 'video_codec': 2, 'tvdb_id': 48, 'title': 12, 'imdb_id': 60, 'audio_codec': 1, 'year': 24,
'resolution': 2, 'season': 6, 'release_group': 6, 'series': 24, 'episode': 6, 'hash': 74}
def __init__(self, name, series, season, episode, format=None, release_group=None, resolution=None, video_codec=None,
audio_codec=None, imdb_id=None, hashes=None, size=None, subtitle_languages=None, title=None,
year=None, tvdb_id=None):
super(Episode, self).__init__(name, format, release_group, resolution, video_codec, audio_codec, imdb_id, hashes,
size, subtitle_languages)
self.series = series
self.season = season
self.episode = episode
self.title = title
self.year = year
self.tvdb_id = tvdb_id
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'episode':
raise ValueError('The guess must be an episode guess')
if 'series' not in guess or 'season' not in guess or 'episodeNumber' not in guess:
raise ValueError('Insufficient data to process the guess')
return cls(name, guess['series'], guess['season'], guess['episodeNumber'], format=guess.get('format'),
release_group=guess.get('releaseGroup'), resolution=guess.get('screenSize'),
video_codec=guess.get('videoCodec'), audio_codec=guess.get('audioCodec'),
title=guess.get('title'), year=guess.get('year'))
@classmethod
def fromname(cls, name):
return cls.fromguess(os.path.split(name)[1], guessit.guess_episode_info(name))
def __repr__(self):
if self.year is None:
return '<%s [%r, %dx%d]>' % (self.__class__.__name__, self.series, self.season, self.episode)
return '<%s [%r, %d, %dx%d]>' % (self.__class__.__name__, self.series, self.year, self.season, self.episode)
class Movie(Video):
"""Movie :class:`Video`
Scores are defined by a set of equations, see :func:`~subliminal.score.get_movie_equations`
:param string title: title of the movie
:param int year: year of the movie
"""
scores = {'format': 3, 'video_codec': 2, 'title': 13, 'imdb_id': 34, 'audio_codec': 1, 'year': 7, 'resolution': 2,
'release_group': 6, 'hash': 34}
def __init__(self, name, title, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None,
imdb_id=None, hashes=None, size=None, subtitle_languages=None, year=None):
super(Movie, self).__init__(name, format, release_group, resolution, video_codec, audio_codec, imdb_id, hashes,
size, subtitle_languages)
self.title = title
self.year = year
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'movie':
raise ValueError('The guess must be a movie guess')
if 'title' not in guess:
raise ValueError('Insufficient data to process the guess')
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('releaseGroup'),
resolution=guess.get('screenSize'), video_codec=guess.get('videoCodec'),
audio_codec=guess.get('audioCodec'),year=guess.get('year'))
@classmethod
def fromname(cls, name):
return cls.fromguess(os.path.split(name)[1], guessit.guess_movie_info(name))
def __repr__(self):
if self.year is None:
return '<%s [%r]>' % (self.__class__.__name__, self.title)
return '<%s [%r, %d]>' % (self.__class__.__name__, self.title, self.year)
def scan_subtitle_languages(path):
"""Search for subtitles with alpha2 extension from a video `path` and return their language
:param string path: path to the video
:return: found subtitle languages
:rtype: set
"""
language_extensions = tuple('.' + c for c in babelfish.language_converters['alpha2'].codes)
dirpath, filename = os.path.split(path)
subtitles = set()
for p in os.listdir(dirpath):
if not isinstance(p, bytes) and p.startswith(os.path.splitext(filename)[0]) and p.endswith(SUBTITLE_EXTENSIONS):
if os.path.splitext(p)[0].endswith(language_extensions):
subtitles.add(babelfish.Language.fromalpha2(os.path.splitext(p)[0][-2:]))
else:
subtitles.add(babelfish.Language('und'))
logger.debug('Found subtitles %r', subtitles)
return subtitles
def scan_video(path, subtitles=True, embedded_subtitles=True):
"""Scan a video and its subtitle languages from a video `path`
:param string path: absolute path to the video
:param bool subtitles: scan for subtitles with the same name
:param bool embedded_subtitles: scan for embedded subtitles
:return: the scanned video
:rtype: :class:`Video`
:raise: ValueError if cannot guess enough information from the path
"""
dirpath, filename = os.path.split(path)
logger.info('Scanning video %r in %r', filename, dirpath)
video = Video.fromguess(path, guessit.guess_file_info(path))
video.size = os.path.getsize(path)
if video.size > 10485760:
logger.debug('Size is %d', video.size)
video.hashes['opensubtitles'] = hash_opensubtitles(path)
video.hashes['thesubdb'] = hash_thesubdb(path)
logger.debug('Computed hashes %r', video.hashes)
else:
logger.warning('Size is lower than 10MB: hashes not computed')
if subtitles:
video.subtitle_languages |= scan_subtitle_languages(path)
# enzyme
try:
if filename.endswith('.mkv'):
with open(path, 'rb') as f:
mkv = enzyme.MKV(f)
if mkv.video_tracks:
video_track = mkv.video_tracks[0]
# resolution
if video_track.height in (480, 720, 1080):
if video_track.interlaced:
video.resolution = '%di' % video_track.height
logger.debug('Found resolution %s with enzyme', video.resolution)
else:
video.resolution = '%dp' % video_track.height
logger.debug('Found resolution %s with enzyme', video.resolution)
# video codec
if video_track.codec_id == 'V_MPEG4/ISO/AVC':
video.video_codec = 'h264'
logger.debug('Found video_codec %s with enzyme', video.video_codec)
elif video_track.codec_id == 'V_MPEG4/ISO/SP':
video.video_codec = 'DivX'
logger.debug('Found video_codec %s with enzyme', video.video_codec)
elif video_track.codec_id == 'V_MPEG4/ISO/ASP':
video.video_codec = 'XviD'
logger.debug('Found video_codec %s with enzyme', video.video_codec)
else:
logger.warning('MKV has no video track')
if mkv.audio_tracks:
audio_track = mkv.audio_tracks[0]
# audio codec
if audio_track.codec_id == 'A_AC3':
video.audio_codec = 'AC3'
logger.debug('Found audio_codec %s with enzyme', video.audio_codec)
elif audio_track.codec_id == 'A_DTS':
video.audio_codec = 'DTS'
logger.debug('Found audio_codec %s with enzyme', video.audio_codec)
elif audio_track.codec_id == 'A_AAC':
video.audio_codec = 'AAC'
logger.debug('Found audio_codec %s with enzyme', video.audio_codec)
else:
logger.warning('MKV has no audio track')
if mkv.subtitle_tracks:
# embedded subtitles
if embedded_subtitles:
embedded_subtitle_languages = set()
for st in mkv.subtitle_tracks:
if st.language:
try:
embedded_subtitle_languages.add(babelfish.Language.fromalpha3b(st.language))
except babelfish.Error:
logger.error('Embedded subtitle track language %r is not a valid language', st.language)
embedded_subtitle_languages.add(babelfish.Language('und'))
elif st.name:
try:
embedded_subtitle_languages.add(babelfish.Language.fromname(st.name))
except babelfish.Error:
logger.debug('Embedded subtitle track name %r is not a valid language', st.name)
embedded_subtitle_languages.add(babelfish.Language('und'))
else:
embedded_subtitle_languages.add(babelfish.Language('und'))
logger.debug('Found embedded subtitle %r with enzyme', embedded_subtitle_languages)
video.subtitle_languages |= embedded_subtitle_languages
else:
logger.debug('MKV has no subtitle track')
except enzyme.Error:
logger.exception('Parsing video metadata with enzyme failed')
return video
def scan_videos(paths, subtitles=True, embedded_subtitles=True, age=None):
"""Scan `paths` for videos and their subtitle languages
:params paths: absolute paths to scan for videos
:type paths: list of string
:param bool subtitles: scan for subtitles with the same name
:param bool embedded_subtitles: scan for embedded subtitles
:param age: age of the video, if any
:type age: datetime.timedelta or None
:return: the scanned videos
:rtype: list of :class:`Video`
"""
videos = []
# scan files
for filepath in [p for p in paths if os.path.isfile(p)]:
if age is not None:
try:
video_age = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(filepath))
except ValueError:
logger.exception('Error while getting video age, skipping it')
continue
if video_age > age:
logger.info('Skipping video %r: older than %r', filepath, age)
continue
try:
videos.append(scan_video(filepath, subtitles, embedded_subtitles))
except ValueError as e:
logger.error('Skipping video: %s', e)
continue
# scan directories
for path in [p for p in paths if os.path.isdir(p)]:
logger.info('Scanning directory %r', path)
for dirpath, dirnames, filenames in os.walk(path):
# skip badly encoded directories
if isinstance(dirpath, bytes):
logger.error('Skipping badly encoded directory %r', dirpath.decode('utf-8', errors='replace'))
continue
# skip badly encoded and hidden sub directories
for dirname in list(dirnames):
if isinstance(dirname, bytes):
logger.error('Skipping badly encoded dirname %r in %r', dirname.decode('utf-8', errors='replace'),
dirpath)
dirnames.remove(dirname)
elif dirname.startswith('.'):
logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath)
dirnames.remove(dirname)
# scan for videos
for filename in filenames:
# skip badly encoded files
if isinstance(filename, bytes):
logger.error('Skipping badly encoded filename %r in %r', filename.decode('utf-8', errors='replace'),
dirpath)
continue
# filter videos
if not filename.endswith(VIDEO_EXTENSIONS):
continue
# skip hidden files
if filename.startswith('.'):
logger.debug('Skipping hidden filename %r in %r', filename, dirpath)
continue
filepath = os.path.join(dirpath, filename)
# skip links
if os.path.islink(filepath):
logger.debug('Skipping link %r in %r', filename, dirpath)
continue
if age is not None:
try:
video_age = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(filepath))
except ValueError:
logger.exception('Error while getting video age, skipping it')
continue
if video_age > age:
logger.info('Skipping video %r: older than %r', filepath, age)
continue
try:
video = scan_video(filepath, subtitles, embedded_subtitles)
except ValueError as e:
logger.error('Skipping video: %s', e)
continue
videos.append(video)
return videos
def hash_opensubtitles(video_path):
"""Compute a hash using OpenSubtitles' algorithm
:param string video_path: path of the video
:return: the hash
:rtype: string
"""
bytesize = struct.calcsize(b'<q')
with open(video_path, 'rb') as f:
filesize = os.path.getsize(video_path)
filehash = filesize
if filesize < 65536 * 2:
return None
for _ in range(65536 // bytesize):
filebuffer = f.read(bytesize)
(l_value,) = struct.unpack(b'<q', filebuffer)
filehash += l_value
filehash = filehash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
f.seek(max(0, filesize - 65536), 0)
for _ in range(65536 // bytesize):
filebuffer = f.read(bytesize)
(l_value,) = struct.unpack(b'<q', filebuffer)
filehash += l_value
filehash = filehash & 0xFFFFFFFFFFFFFFFF
returnedhash = '%016x' % filehash
return returnedhash
def hash_thesubdb(video_path):
"""Compute a hash using TheSubDB's algorithm
:param string video_path: path of the video
:return: the hash
:rtype: string
"""
readsize = 64 * 1024
if os.path.getsize(video_path) < readsize:
return None
with open(video_path, 'rb') as f:
data = f.read(readsize)
f.seek(-readsize, os.SEEK_END)
data += f.read(readsize)
return hashlib.md5(data).hexdigest()