diff --git a/.gitignore b/.gitignore index fdd8ce76..c708d315 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.log *.pid *.db +*.dbm /userscripts/ /logs/ /.idea/ diff --git a/core/nzbToMediaUtil.py b/core/nzbToMediaUtil.py index 204766a3..4cfd73fb 100644 --- a/core/nzbToMediaUtil.py +++ b/core/nzbToMediaUtil.py @@ -1124,7 +1124,7 @@ def import_subs(filename): if not core.GETSUBS: return try: - subliminal.cache_region.configure('dogpile.cache.memory') + subliminal.region.configure('dogpile.cache.dbm', arguments={'filename': 'cachefile.dbm'}) except: pass @@ -1139,9 +1139,9 @@ def import_subs(filename): logger.debug("Attempting to download subtitles for {0}".format(filename), 'SUBTITLES') try: - video = subliminal.scan_video(filename, subtitles=True, embedded_subtitles=True) - subtitles = subliminal.download_best_subtitles({video}, languages, hearing_impaired=False) - subliminal.save_subtitles(subtitles) + video = subliminal.scan_video(filename) + subtitles = subliminal.download_best_subtitles({video}, languages) + subliminal.save_subtitles(video, subtitles[video]) except Exception as e: logger.error("Failed to download subtitles for {0} due to: {1}".format(filename, e), 'SUBTITLES') diff --git a/libs/concurrent/__init__.py b/libs/concurrent/__init__.py new file mode 100644 index 00000000..b36383a6 --- /dev/null +++ b/libs/concurrent/__init__.py @@ -0,0 +1,3 @@ +from pkgutil import extend_path + +__path__ = extend_path(__path__, __name__) diff --git a/libs/concurrent/futures/__init__.py b/libs/concurrent/futures/__init__.py new file mode 100644 index 00000000..b5231f8a --- /dev/null +++ b/libs/concurrent/futures/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2009 Brian Quinlan. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. 
+ +"""Execute computations asynchronously using threads or processes.""" + +__author__ = 'Brian Quinlan (brian@sweetapp.com)' + +from concurrent.futures._base import (FIRST_COMPLETED, + FIRST_EXCEPTION, + ALL_COMPLETED, + CancelledError, + TimeoutError, + Future, + Executor, + wait, + as_completed) +from concurrent.futures.process import ProcessPoolExecutor +from concurrent.futures.thread import ThreadPoolExecutor diff --git a/libs/concurrent/futures/_base.py b/libs/concurrent/futures/_base.py new file mode 100644 index 00000000..8ed69b7d --- /dev/null +++ b/libs/concurrent/futures/_base.py @@ -0,0 +1,574 @@ +# Copyright 2009 Brian Quinlan. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +from __future__ import with_statement +import logging +import threading +import time + +try: + from collections import namedtuple +except ImportError: + from concurrent.futures._compat import namedtuple + +__author__ = 'Brian Quinlan (brian@sweetapp.com)' + +FIRST_COMPLETED = 'FIRST_COMPLETED' +FIRST_EXCEPTION = 'FIRST_EXCEPTION' +ALL_COMPLETED = 'ALL_COMPLETED' +_AS_COMPLETED = '_AS_COMPLETED' + +# Possible future states (for internal use by the futures package). +PENDING = 'PENDING' +RUNNING = 'RUNNING' +# The future was cancelled by the user... +CANCELLED = 'CANCELLED' +# ...and _Waiter.add_cancelled() was called by a worker. +CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED' +FINISHED = 'FINISHED' + +_FUTURE_STATES = [ + PENDING, + RUNNING, + CANCELLED, + CANCELLED_AND_NOTIFIED, + FINISHED +] + +_STATE_TO_DESCRIPTION_MAP = { + PENDING: "pending", + RUNNING: "running", + CANCELLED: "cancelled", + CANCELLED_AND_NOTIFIED: "cancelled", + FINISHED: "finished" +} + +# Logger for internal use by the futures package. 
+LOGGER = logging.getLogger("concurrent.futures") + +class Error(Exception): + """Base class for all future-related exceptions.""" + pass + +class CancelledError(Error): + """The Future was cancelled.""" + pass + +class TimeoutError(Error): + """The operation exceeded the given deadline.""" + pass + +class _Waiter(object): + """Provides the event that wait() and as_completed() block on.""" + def __init__(self): + self.event = threading.Event() + self.finished_futures = [] + + def add_result(self, future): + self.finished_futures.append(future) + + def add_exception(self, future): + self.finished_futures.append(future) + + def add_cancelled(self, future): + self.finished_futures.append(future) + +class _AsCompletedWaiter(_Waiter): + """Used by as_completed().""" + + def __init__(self): + super(_AsCompletedWaiter, self).__init__() + self.lock = threading.Lock() + + def add_result(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_result(future) + self.event.set() + + def add_exception(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_exception(future) + self.event.set() + + def add_cancelled(self, future): + with self.lock: + super(_AsCompletedWaiter, self).add_cancelled(future) + self.event.set() + +class _FirstCompletedWaiter(_Waiter): + """Used by wait(return_when=FIRST_COMPLETED).""" + + def add_result(self, future): + super(_FirstCompletedWaiter, self).add_result(future) + self.event.set() + + def add_exception(self, future): + super(_FirstCompletedWaiter, self).add_exception(future) + self.event.set() + + def add_cancelled(self, future): + super(_FirstCompletedWaiter, self).add_cancelled(future) + self.event.set() + +class _AllCompletedWaiter(_Waiter): + """Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED).""" + + def __init__(self, num_pending_calls, stop_on_exception): + self.num_pending_calls = num_pending_calls + self.stop_on_exception = stop_on_exception + self.lock = threading.Lock() + 
def as_completed(fs, timeout=None):
    """An iterator over the given futures that yields each as it completes.

    Args:
        fs: The Futures (possibly created by different Executors) to iterate
            over. Any iterable is accepted; a future that appears more than
            once is yielded only once.
        timeout: The maximum number of seconds to wait. If None, then there
            is no limit on the wait time.

    Returns:
        An iterator that yields the given Futures as they complete (finished or
        cancelled).

    Raises:
        TimeoutError: If the entire result iterator could not be generated
            before the given timeout.
    """
    if timeout is not None:
        end_time = timeout + time.time()

    # Collapse the input into a set up front.  This lets callers pass a
    # one-shot iterable (len() is needed below) and guarantees the waiter is
    # installed exactly once per future, so a duplicated future cannot be
    # reported twice and trip pending.remove() with a KeyError.
    fs = set(fs)
    total_futures = len(fs)

    with _AcquireFutures(fs):
        finished = set(
                f for f in fs
                if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
        pending = fs - finished
        waiter = _create_and_install_waiters(fs, _AS_COMPLETED)

    try:
        for future in finished:
            yield future

        while pending:
            if timeout is None:
                wait_timeout = None
            else:
                wait_timeout = end_time - time.time()
                if wait_timeout < 0:
                    raise TimeoutError(
                            '%d (of %d) futures unfinished' % (
                            len(pending), total_futures))

            waiter.event.wait(wait_timeout)

            # Swap the list out under the waiter's lock so completions that
            # arrive while we are yielding land in the fresh list.
            with waiter.lock:
                finished = waiter.finished_futures
                waiter.finished_futures = []
                waiter.event.clear()

            for future in finished:
                yield future
                pending.remove(future)

    finally:
        # Futures walk their _waiters list while holding _condition, so the
        # list is only mutated under that same lock.
        for f in fs:
            with f._condition:
                f._waiters.remove(waiter)
class Future(object):
    """Represents the result of an asynchronous computation."""

    def __init__(self):
        """Initializes the future. Should not be called by clients."""
        self._condition = threading.Condition()
        self._state = PENDING
        self._result = None
        self._exception = None
        self._waiters = []
        self._done_callbacks = []

    def _invoke_callbacks(self):
        """Call every registered done-callback, logging (not raising) errors."""
        for callback in self._done_callbacks:
            try:
                callback(self)
            except Exception:
                LOGGER.exception('exception calling callback for %r', self)

    def __repr__(self):
        """Describe the future's address, state and (if finished) outcome type."""
        with self._condition:
            if self._state == FINISHED:
                if self._exception:
                    return '<Future at %s state=%s raised %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._exception.__class__.__name__)
                else:
                    return '<Future at %s state=%s returned %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._result.__class__.__name__)
            return '<Future at %s state=%s>' % (
                hex(id(self)),
                _STATE_TO_DESCRIPTION_MAP[self._state])

    def cancel(self):
        """Cancel the future if possible.

        Returns True if the future was cancelled, False otherwise. A future
        cannot be cancelled if it is running or has already completed.
        """
        with self._condition:
            if self._state in [RUNNING, FINISHED]:
                return False

            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                return True

            self._state = CANCELLED
            self._condition.notify_all()

        # Callbacks run outside the lock.
        self._invoke_callbacks()
        return True

    def cancelled(self):
        """Return True if the future was cancelled."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]

    def running(self):
        """Return True if the future is currently executing."""
        with self._condition:
            return self._state == RUNNING

    def done(self):
        """Return True if the future was cancelled or finished executing."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]

    def __get_result(self):
        """Return the stored result, or raise the stored exception."""
        if self._exception:
            raise self._exception
        else:
            return self._result

    def add_done_callback(self, fn):
        """Attaches a callable that will be called when the future finishes.

        Args:
            fn: A callable that will be called with this future as its only
                argument when the future completes or is cancelled. The callable
                will always be called by a thread in the same process in which
                it was added. If the future has already completed or been
                cancelled then the callable will be called immediately. These
                callables are called in the order that they were added.
        """
        with self._condition:
            if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
                self._done_callbacks.append(fn)
                return
        # Already done: invoke right away, outside the lock.
        fn(self)

    def result(self, timeout=None):
        """Return the result of the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the result if the future
                isn't done. If None, then there is no limit on the wait time.

        Returns:
            The result of the call that the future represents.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
            Exception: If the call raised then that exception will be raised.
        """
        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()

            self._condition.wait(timeout)

            # Re-check after waking: either the state changed or we timed out.
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()
            else:
                raise TimeoutError()

    def exception(self, timeout=None):
        """Return the exception raised by the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the exception if the
                future isn't done. If None, then there is no limit on the wait
                time.

        Returns:
            The exception raised by the call that the future represents or None
            if the call completed without raising.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
        """

        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception

            self._condition.wait(timeout)

            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception
            else:
                raise TimeoutError()

    # The following methods should only be used by Executors and in tests.
    def set_running_or_notify_cancel(self):
        """Mark the future as running or process any cancel notifications.

        Should only be used by Executor implementations and unit tests.

        If the future has been cancelled (cancel() was called and returned
        True) then any threads waiting on the future completing (through calls
        to as_completed() or wait()) are notified and False is returned.

        If the future was not cancelled then it is put in the running state
        (future calls to running() will return True) and True is returned.

        This method should be called by Executor implementations before
        executing the work associated with this future. If this method returns
        False then the work should not be executed.

        Returns:
            False if the Future was cancelled, True otherwise.

        Raises:
            RuntimeError: if this method was already called or if set_result()
                or set_exception() was called.
        """
        with self._condition:
            if self._state == CANCELLED:
                self._state = CANCELLED_AND_NOTIFIED
                for waiter in self._waiters:
                    waiter.add_cancelled(self)
                # self._condition.notify_all() is not necessary because
                # self.cancel() triggers a notification.
                return False
            elif self._state == PENDING:
                self._state = RUNNING
                return True
            else:
                # Report on this future itself; a Future has no `.future`
                # attribute, so going through one would mask the intended
                # RuntimeError with an AttributeError.
                LOGGER.critical('Future %s in unexpected state: %s',
                                id(self),
                                self._state)
                raise RuntimeError('Future in unexpected state')

    def set_result(self, result):
        """Sets the return value of work associated with the future.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._result = result
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_result(self)
            self._condition.notify_all()
        self._invoke_callbacks()

    def set_exception(self, exception):
        """Sets the result of the future as being the given exception.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._exception = exception
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_exception(self)
            self._condition.notify_all()
        self._invoke_callbacks()
def namedtuple(typename, field_names):
    """Returns a new subclass of tuple with named fields.

    Args:
        typename: Name of the class to create.
        field_names: Either a sequence of field names or a single string in
            which the names are separated by whitespace and/or commas.

    Returns:
        A new tuple subclass whose items are also reachable by attribute name.

    Raises:
        ValueError: If a name is empty, is a keyword, starts with a digit,
            contains characters other than alphanumerics and underscores, or
            if a field name starts with an underscore or is duplicated.

    >>> Point = namedtuple('Point', 'x y')
    >>> Point.__doc__                   # docstring for the new class
    'Point(x, y)'
    >>> p = Point(11, y=22)             # instantiate with positional args or keywords
    >>> p[0] + p[1]                     # indexable like a plain tuple
    33
    >>> x, y = p                        # unpack like a regular tuple
    >>> x, y
    (11, 22)
    >>> p.x + p.y                       # fields also accessible by name
    33
    >>> d = p._asdict()                 # convert to a dictionary
    >>> d['x']
    11
    >>> Point(**d)                      # convert from a dictionary
    Point(x=11, y=22)
    >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
    Point(x=100, y=22)

    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # the string form of field_names works on every interpreter.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Parse and validate the field names.  Validation serves two purposes,
    # generating informative error messages and preventing template injection attacks.
    if isinstance(field_names, string_types):
        # names separated by whitespace and/or commas
        field_names = field_names.replace(',', ' ').split()
    field_names = tuple(map(str, field_names))
    for name in (typename,) + field_names:
        if not name:
            # Guard before name[0] below, which would raise IndexError.
            raise ValueError('Type names and field names cannot be empty: %r' % name)
        if not all(c.isalnum() or c == '_' for c in name):
            raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
        if _iskeyword(name):
            raise ValueError('Type names and field names cannot be a keyword: %r' % name)
        if name[0].isdigit():
            raise ValueError('Type names and field names cannot start with a number: %r' % name)
    seen_names = set()
    for name in field_names:
        if name.startswith('_'):
            raise ValueError('Field names cannot start with an underscore: %r' % name)
        if name in seen_names:
            raise ValueError('Encountered duplicate field name: %r' % name)
        seen_names.add(name)

    # Create and fill-in the class template
    substitutions = {
        'typename': typename,
        'field_names': field_names,
        'numfields': len(field_names),
        # tuple repr without parens or quotes
        'argtxt': repr(field_names).replace("'", "")[1:-1],
        'reprtxt': ', '.join('%s=%%r' % name for name in field_names),
        'dicttxt': ', '.join('%r: t[%d]' % (name, pos)
                             for pos, name in enumerate(field_names)),
    }
    template = '''class %(typename)s(tuple):
        '%(typename)s(%(argtxt)s)'

        __slots__ = ()

        _fields = %(field_names)r

        def __new__(_cls, %(argtxt)s):
            return _tuple.__new__(_cls, (%(argtxt)s))

        @classmethod
        def _make(cls, iterable, new=tuple.__new__, len=len):
            'Make a new %(typename)s object from a sequence or iterable'
            result = new(cls, iterable)
            if len(result) != %(numfields)d:
                raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
            return result

        def __repr__(self):
            return '%(typename)s(%(reprtxt)s)' %% self

        def _asdict(t):
            'Return a new dict which maps field names to their values'
            return {%(dicttxt)s}

        def _replace(_self, **kwds):
            'Return a new %(typename)s object replacing specified fields with new values'
            result = _self._make(map(kwds.pop, %(field_names)r, _self))
            if kwds:
                raise ValueError('Got unexpected field names: %%r' %% list(kwds))
            return result

        def __getnewargs__(self):
            return tuple(self)

''' % substitutions
    for i, name in enumerate(field_names):
        template += '        %s = _property(_itemgetter(%d))\n' % (name, i)

    # Execute the template string in a temporary namespace and
    # support tracing utilities by setting a value for frame.f_globals['__name__']
    namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
                     _property=property, _tuple=tuple)
    try:
        exec(template, namespace)
    except SyntaxError:
        e = _sys.exc_info()[1]
        # str(e) works on every interpreter; `e.message` is Python 2 only.
        raise SyntaxError(str(e) + ':\n' + template)
    result = namespace[typename]

    # For pickling to work, the __module__ variable needs to be set to the frame
    # where the named tuple is created.  Bypass this step in environments where
    # sys._getframe is not defined (Jython for example).
    if hasattr(_sys, '_getframe'):
        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')

    return result
# Registry mapping each queue management thread to the queue used to wake it.
_threads_queues = weakref.WeakKeyDictionary()
# Set to True by the exit handler below.
_shutdown = False


def _python_exit():
    """Flag interpreter shutdown, wake every registered thread, then join them.

    A None sentinel is put on each registered queue first, and only after all
    sentinels are posted are the threads joined.
    """
    global _shutdown
    _shutdown = True
    # Snapshot the registry so both passes below see the same pairs.
    registered = list(_threads_queues.items())
    for _, wakeup_queue in registered:
        wakeup_queue.put(None)
    for worker_thread, _ in registered:
        worker_thread.join()
EXTRA_QUEUED_CALLS = 1


class _WorkItem(object):
    """A submitted call together with the future that will carry its outcome."""

    def __init__(self, future, fn, args, kwargs):
        self.future, self.fn = future, fn
        self.args, self.kwargs = args, kwargs


class _ResultItem(object):
    """Outcome of one call: its work id plus either an exception or a result."""

    def __init__(self, work_id, exception=None, result=None):
        self.work_id = work_id
        self.exception, self.result = exception, result


class _CallItem(object):
    """A call identified by work id, in the form placed on the call queue."""

    def __init__(self, work_id, fn, args, kwargs):
        self.work_id, self.fn = work_id, fn
        self.args, self.kwargs = args, kwargs


def _process_worker(call_queue, result_queue):
    """Evaluates calls from call_queue and places the results in result_queue.

    This worker is run in a separate process.

    Args:
        call_queue: A multiprocessing.Queue of _CallItems that will be read and
            evaluated by the worker. A None item tells the worker to exit.
        result_queue: A multiprocessing.Queue of _ResultItems that will be
            written to by the worker. None is written just before exiting.
    """
    while True:
        job = call_queue.get(block=True)
        if job is None:
            # Wake up queue management thread
            result_queue.put(None)
            return
        try:
            value = job.fn(*job.args, **job.kwargs)
        except BaseException:
            outcome = _ResultItem(job.work_id, exception=sys.exc_info()[1])
        else:
            outcome = _ResultItem(job.work_id, result=value)
        result_queue.put(outcome)
+ call_queue: A multiprocessing.Queue that will be filled with _CallItems + derived from _WorkItems. + """ + while True: + if call_queue.full(): + return + try: + work_id = work_ids.get(block=False) + except queue.Empty: + return + else: + work_item = pending_work_items[work_id] + + if work_item.future.set_running_or_notify_cancel(): + call_queue.put(_CallItem(work_id, + work_item.fn, + work_item.args, + work_item.kwargs), + block=True) + else: + del pending_work_items[work_id] + continue + +def _queue_management_worker(executor_reference, + processes, + pending_work_items, + work_ids_queue, + call_queue, + result_queue): + """Manages the communication between this process and the worker processes. + + This function is run in a local thread. + + Args: + executor_reference: A weakref.ref to the ProcessPoolExecutor that owns + this thread. Used to determine if the ProcessPoolExecutor has been + garbage collected and that this function can exit. + process: A list of the multiprocessing.Process instances used as + workers. + pending_work_items: A dict mapping work ids to _WorkItems e.g. + {5: <_WorkItem...>, 6: <_WorkItem...>, ...} + work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]). + call_queue: A multiprocessing.Queue that will be filled with _CallItems + derived from _WorkItems for processing by the process workers. + result_queue: A multiprocessing.Queue of _ResultItems generated by the + process workers. 
+ """ + nb_shutdown_processes = [0] + def shutdown_one_process(): + """Tell a worker to terminate, which will in turn wake us again""" + call_queue.put(None) + nb_shutdown_processes[0] += 1 + while True: + _add_call_item_to_queue(pending_work_items, + work_ids_queue, + call_queue) + + result_item = result_queue.get(block=True) + if result_item is not None: + work_item = pending_work_items[result_item.work_id] + del pending_work_items[result_item.work_id] + + if result_item.exception: + work_item.future.set_exception(result_item.exception) + else: + work_item.future.set_result(result_item.result) + # Check whether we should start shutting down. + executor = executor_reference() + # No more work items can be added if: + # - The interpreter is shutting down OR + # - The executor that owns this worker has been collected OR + # - The executor that owns this worker has been shutdown. + if _shutdown or executor is None or executor._shutdown_thread: + # Since no new work items can be added, it is safe to shutdown + # this thread if there are no pending work items. + if not pending_work_items: + while nb_shutdown_processes[0] < len(processes): + shutdown_one_process() + # If .join() is not called on the created processes then + # some multiprocessing.Queue methods may deadlock on Mac OS + # X. 
# Cache for _check_system_limits(): whether the probe has run, and the error
# message if it found the platform unusable (None otherwise).
_system_limits_checked = False
_system_limited = None


def _check_system_limits():
    """Raise NotImplementedError if the platform offers too few semaphores.

    The probe runs once; later calls reuse the cached verdict, either
    re-raising the stored message or returning immediately.

    Raises:
        NotImplementedError: If os.sysconf("SC_SEM_NSEMS_MAX") reports a
            limit that is neither -1 nor at least 256.
    """
    global _system_limits_checked, _system_limited
    if _system_limits_checked:
        if _system_limited:
            raise NotImplementedError(_system_limited)
        # Already probed and found acceptable: nothing more to do.
        return
    _system_limits_checked = True
    try:
        import os
        nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
    except (AttributeError, ValueError):
        # sysconf not available or setting not available
        return
    if nsems_max == -1:
        # indeterminate limit, assume that limit is determined
        # by available memory only
        return
    if nsems_max >= 256:
        # minimum number of semaphores available
        # according to POSIX
        return
    _system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
    raise NotImplementedError(_system_limited)
+ self._shutdown_thread = False + self._shutdown_lock = threading.Lock() + self._queue_count = 0 + self._pending_work_items = {} + + def _start_queue_management_thread(self): + # When the executor gets lost, the weakref callback will wake up + # the queue management thread. + def weakref_cb(_, q=self._result_queue): + q.put(None) + if self._queue_management_thread is None: + self._queue_management_thread = threading.Thread( + target=_queue_management_worker, + args=(weakref.ref(self, weakref_cb), + self._processes, + self._pending_work_items, + self._work_ids, + self._call_queue, + self._result_queue)) + self._queue_management_thread.daemon = True + self._queue_management_thread.start() + _threads_queues[self._queue_management_thread] = self._result_queue + + def _adjust_process_count(self): + for _ in range(len(self._processes), self._max_workers): + p = multiprocessing.Process( + target=_process_worker, + args=(self._call_queue, + self._result_queue)) + p.start() + self._processes.add(p) + + def submit(self, fn, *args, **kwargs): + with self._shutdown_lock: + if self._shutdown_thread: + raise RuntimeError('cannot schedule new futures after shutdown') + + f = _base.Future() + w = _WorkItem(f, fn, args, kwargs) + + self._pending_work_items[self._queue_count] = w + self._work_ids.put(self._queue_count) + self._queue_count += 1 + # Wake up queue management thread + self._result_queue.put(None) + + self._start_queue_management_thread() + self._adjust_process_count() + return f + submit.__doc__ = _base.Executor.submit.__doc__ + + def shutdown(self, wait=True): + with self._shutdown_lock: + self._shutdown_thread = True + if self._queue_management_thread: + # Wake up queue management thread + self._result_queue.put(None) + if wait: + self._queue_management_thread.join() + # To reduce the risk of openning too many files, remove references to + # objects that use file descriptors. 
+ self._queue_management_thread = None + self._call_queue = None + self._result_queue = None + self._processes = None + shutdown.__doc__ = _base.Executor.shutdown.__doc__ + +atexit.register(_python_exit) diff --git a/libs/concurrent/futures/thread.py b/libs/concurrent/futures/thread.py new file mode 100644 index 00000000..a45959d3 --- /dev/null +++ b/libs/concurrent/futures/thread.py @@ -0,0 +1,138 @@ +# Copyright 2009 Brian Quinlan. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. + +"""Implements ThreadPoolExecutor.""" + +from __future__ import with_statement +import atexit +import threading +import weakref +import sys + +from concurrent.futures import _base + +try: + import queue +except ImportError: + import Queue as queue + +__author__ = 'Brian Quinlan (brian@sweetapp.com)' + +# Workers are created as daemon threads. This is done to allow the interpreter +# to exit when there are still idle threads in a ThreadPoolExecutor's thread +# pool (i.e. shutdown() was not called). However, allowing workers to die with +# the interpreter has two undesirable properties: +# - The workers would still be running during interpretor shutdown, +# meaning that they would fail in unpredictable ways. +# - The workers could be killed while evaluating a work item, which could +# be bad if the callable being evaluated has external side-effects e.g. +# writing to a file. +# +# To work around this problem, an exit handler is installed which tells the +# workers to exit when their work queues are empty and then waits until the +# threads finish. 
class _WorkItem(object):
    """A pending call (fn, args, kwargs) paired with the future that
    receives its outcome."""

    def __init__(self, future, fn, args, kwargs):
        self.future, self.fn = future, fn
        self.args, self.kwargs = args, kwargs

    def run(self):
        """Execute the call unless the future was cancelled, then store
        either the return value or the raised exception on the future."""
        future = self.future
        if future.set_running_or_notify_cancel():
            try:
                outcome = self.fn(*self.args, **self.kwargs)
            except BaseException:
                # sys.exc_info() keeps this valid on interpreters that
                # lack the "except ... as" syntax.
                future.set_exception(sys.exc_info()[1])
            else:
                future.set_result(outcome)
+ """ + self._max_workers = max_workers + self._work_queue = queue.Queue() + self._threads = set() + self._shutdown = False + self._shutdown_lock = threading.Lock() + + def submit(self, fn, *args, **kwargs): + with self._shutdown_lock: + if self._shutdown: + raise RuntimeError('cannot schedule new futures after shutdown') + + f = _base.Future() + w = _WorkItem(f, fn, args, kwargs) + + self._work_queue.put(w) + self._adjust_thread_count() + return f + submit.__doc__ = _base.Executor.submit.__doc__ + + def _adjust_thread_count(self): + # When the executor gets lost, the weakref callback will wake up + # the worker threads. + def weakref_cb(_, q=self._work_queue): + q.put(None) + # TODO(bquinlan): Should avoid creating new threads if there are more + # idle threads than items in the work queue. + if len(self._threads) < self._max_workers: + t = threading.Thread(target=_worker, + args=(weakref.ref(self, weakref_cb), + self._work_queue)) + t.daemon = True + t.start() + self._threads.add(t) + _threads_queues[t] = self._work_queue + + def shutdown(self, wait=True): + with self._shutdown_lock: + self._shutdown = True + self._work_queue.put(None) + if wait: + for t in self._threads: + t.join() + shutdown.__doc__ = _base.Executor.shutdown.__doc__ diff --git a/libs/dateutil/__init__.py b/libs/dateutil/__init__.py new file mode 100644 index 00000000..290814cf --- /dev/null +++ b/libs/dateutil/__init__.py @@ -0,0 +1,9 @@ +""" +Copyright (c) 2003-2010 Gustavo Niemeyer + +This module offers extensions to the standard python 2.3+ +datetime module. +""" +__author__ = "Gustavo Niemeyer " +__license__ = "PSF License" +__version__ = "1.5" diff --git a/libs/dateutil/easter.py b/libs/dateutil/easter.py new file mode 100644 index 00000000..d7944104 --- /dev/null +++ b/libs/dateutil/easter.py @@ -0,0 +1,92 @@ +""" +Copyright (c) 2003-2007 Gustavo Niemeyer + +This module offers extensions to the standard python 2.3+ +datetime module. 
EASTER_JULIAN = 1
EASTER_ORTHODOX = 2
EASTER_WESTERN = 3


def easter(year, method=EASTER_WESTERN):
    """
    Return the date of Easter Sunday for ``year`` as a ``datetime.date``.

    This method was ported from the work done by GM Arts,
    on top of the algorithm by Claus Tondering, which was
    based in part on the algorithm of Ouding (1940), as
    quoted in "Explanatory Supplement to the Astronomical
    Almanac", P. Kenneth Seidelmann, editor.

    This algorithm implements three different easter
    calculation methods:

    1 - Original calculation in Julian calendar, valid in
        dates after 326 AD
    2 - Original method, with date converted to Gregorian
        calendar, valid in years 1583 to 4099
    3 - Revised method, in Gregorian calendar, valid in
        years 1583 to 4099 as well

    These methods are represented by the constants:

    EASTER_JULIAN   = 1
    EASTER_ORTHODOX = 2
    EASTER_WESTERN  = 3

    The default method is method 3.

    Raises ValueError if ``method`` is not 1, 2 or 3.

    More about the algorithm may be found at:

    http://users.chariot.net.au/~gmarts/eastalg.htm

    and

    http://www.tondering.dk/claus/calendar.html

    """

    if not (1 <= method <= 3):
        # Call form instead of ``raise ValueError, "..."``: the comma form
        # is a SyntaxError on Python 3; this one works on both.
        raise ValueError("invalid method")

    # g - Golden year - 1
    # c - Century
    # h - (23 - Epact) mod 30
    # i - Number of days from March 21 to Paschal Full Moon
    # j - Weekday for PFM (0=Sunday, etc)
    # p - Number of days from March 21 to Sunday on or before PFM
    #     (-6 to 28 methods 1 & 3, to 56 for method 2)
    # e - Extra days to add for method 2 (converting Julian
    #     date to Gregorian date)

    y = year
    g = y % 19
    e = 0
    if method < 3:
        # Old method
        i = (19*g+15)%30
        j = (y+y//4+i)%7
        if method == 2:
            # Extra dates to convert Julian to Gregorian date
            e = 10
            if y > 1600:
                e = e+y//100-16-(y//100-16)//4
    else:
        # New method
        c = y//100
        h = (c-c//4-(8*c+13)//25+19*g+15)%30
        i = h-(h//28)*(1-(h//28)*(29//(h+1))*((21-g)//11))
        j = (y+y//4+i+2-c+c//4)%7

    # p can be from -6 to 56 corresponding to dates 22 March to 23 May
    # (later dates apply to method 2, although 23 May never actually occurs)
    p = i-j+e
    d = 1+(p+27+(p+6)//40)%31
    m = 3+(p+26)//30
    return datetime.date(int(y),int(m),int(d))
+""" +__author__ = "Gustavo Niemeyer " +__license__ = "PSF License" + +import datetime +import string +import time +import sys +import os + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + +import relativedelta +import tz + + +__all__ = ["parse", "parserinfo"] + + +# Some pointers: +# +# http://www.cl.cam.ac.uk/~mgk25/iso-time.html +# http://www.iso.ch/iso/en/prods-services/popstds/datesandtime.html +# http://www.w3.org/TR/NOTE-datetime +# http://ringmaster.arc.nasa.gov/tools/time_formats.html +# http://search.cpan.org/author/MUIR/Time-modules-2003.0211/lib/Time/ParseDate.pm +# http://stein.cshl.org/jade/distrib/docs/java.text.SimpleDateFormat.html + + +class _timelex(object): + + def __init__(self, instream): + if isinstance(instream, basestring): + instream = StringIO(instream) + self.instream = instream + self.wordchars = ('abcdfeghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_' + 'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ' + 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') + self.numchars = '0123456789' + self.whitespace = ' \t\r\n' + self.charstack = [] + self.tokenstack = [] + self.eof = False + + def get_token(self): + if self.tokenstack: + return self.tokenstack.pop(0) + seenletters = False + token = None + state = None + wordchars = self.wordchars + numchars = self.numchars + whitespace = self.whitespace + while not self.eof: + if self.charstack: + nextchar = self.charstack.pop(0) + else: + nextchar = self.instream.read(1) + while nextchar == '\x00': + nextchar = self.instream.read(1) + if not nextchar: + self.eof = True + break + elif not state: + token = nextchar + if nextchar in wordchars: + state = 'a' + elif nextchar in numchars: + state = '0' + elif nextchar in whitespace: + token = ' ' + break # emit token + else: + break # emit token + elif state == 'a': + seenletters = True + if nextchar in wordchars: + token += nextchar + elif nextchar == '.': + token += nextchar + state = 'a.' 
class _resultbase(object):
    """Base for slot-only result records.

    Subclasses declare ``__slots__``; every slot starts out as None and
    the repr lists only the slots that have been filled in.
    """

    def __init__(self):
        for attr in self.__slots__:
            setattr(self, attr, None)

    def _repr(self, classname):
        """Return ``classname(attr=value, ...)`` for the non-None slots."""
        l = []
        for attr in self.__slots__:
            value = getattr(self, attr)
            if value is not None:
                # repr() replaces the backtick form, which Python 3 removed;
                # the two are equivalent on Python 2.
                l.append("%s=%s" % (attr, repr(value)))
        return "%s(%s)" % (classname, ", ".join(l))

    def __repr__(self):
        return self._repr(self.__class__.__name__)
"February"), + ("Mar", "March"), + ("Apr", "April"), + ("May", "May"), + ("Jun", "June"), + ("Jul", "July"), + ("Aug", "August"), + ("Sep", "September"), + ("Oct", "October"), + ("Nov", "November"), + ("Dec", "December")] + HMS = [("h", "hour", "hours"), + ("m", "minute", "minutes"), + ("s", "second", "seconds")] + AMPM = [("am", "a"), + ("pm", "p")] + UTCZONE = ["UTC", "GMT", "Z"] + PERTAIN = ["of"] + TZOFFSET = {} + + def __init__(self, dayfirst=False, yearfirst=False): + self._jump = self._convert(self.JUMP) + self._weekdays = self._convert(self.WEEKDAYS) + self._months = self._convert(self.MONTHS) + self._hms = self._convert(self.HMS) + self._ampm = self._convert(self.AMPM) + self._utczone = self._convert(self.UTCZONE) + self._pertain = self._convert(self.PERTAIN) + + self.dayfirst = dayfirst + self.yearfirst = yearfirst + + self._year = time.localtime().tm_year + self._century = self._year//100*100 + + def _convert(self, lst): + dct = {} + for i in range(len(lst)): + v = lst[i] + if isinstance(v, tuple): + for v in v: + dct[v.lower()] = i + else: + dct[v.lower()] = i + return dct + + def jump(self, name): + return name.lower() in self._jump + + def weekday(self, name): + if len(name) >= 3: + try: + return self._weekdays[name.lower()] + except KeyError: + pass + return None + + def month(self, name): + if len(name) >= 3: + try: + return self._months[name.lower()]+1 + except KeyError: + pass + return None + + def hms(self, name): + try: + return self._hms[name.lower()] + except KeyError: + return None + + def ampm(self, name): + try: + return self._ampm[name.lower()] + except KeyError: + return None + + def pertain(self, name): + return name.lower() in self._pertain + + def utczone(self, name): + return name.lower() in self._utczone + + def tzoffset(self, name): + if name in self._utczone: + return 0 + return self.TZOFFSET.get(name) + + def convertyear(self, year): + if year < 100: + year += self._century + if abs(year-self._year) >= 50: + if year < self._year: 
+ year += 100 + else: + year -= 100 + return year + + def validate(self, res): + # move to info + if res.year is not None: + res.year = self.convertyear(res.year) + if res.tzoffset == 0 and not res.tzname or res.tzname == 'Z': + res.tzname = "UTC" + res.tzoffset = 0 + elif res.tzoffset != 0 and res.tzname and self.utczone(res.tzname): + res.tzoffset = 0 + return True + + +class parser(object): + + def __init__(self, info=None): + self.info = info or parserinfo() + + def parse(self, timestr, default=None, + ignoretz=False, tzinfos=None, + **kwargs): + if not default: + default = datetime.datetime.now().replace(hour=0, minute=0, + second=0, microsecond=0) + res = self._parse(timestr, **kwargs) + if res is None: + raise ValueError, "unknown string format" + repl = {} + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + value = getattr(res, attr) + if value is not None: + repl[attr] = value + ret = default.replace(**repl) + if res.weekday is not None and not res.day: + ret = ret+relativedelta.relativedelta(weekday=res.weekday) + if not ignoretz: + if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + if callable(tzinfos): + tzdata = tzinfos(res.tzname, res.tzoffset) + else: + tzdata = tzinfos.get(res.tzname) + if isinstance(tzdata, datetime.tzinfo): + tzinfo = tzdata + elif isinstance(tzdata, basestring): + tzinfo = tz.tzstr(tzdata) + elif isinstance(tzdata, int): + tzinfo = tz.tzoffset(res.tzname, tzdata) + else: + raise ValueError, "offset must be tzinfo subclass, " \ + "tz string, or int offset" + ret = ret.replace(tzinfo=tzinfo) + elif res.tzname and res.tzname in time.tzname: + ret = ret.replace(tzinfo=tz.tzlocal()) + elif res.tzoffset == 0: + ret = ret.replace(tzinfo=tz.tzutc()) + elif res.tzoffset: + ret = ret.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset)) + return ret + + class _result(_resultbase): + __slots__ = ["year", "month", "day", "weekday", + "hour", "minute", "second", "microsecond", + "tzname", 
"tzoffset"] + + def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False): + info = self.info + if dayfirst is None: + dayfirst = info.dayfirst + if yearfirst is None: + yearfirst = info.yearfirst + res = self._result() + l = _timelex.split(timestr) + try: + + # year/month/day list + ymd = [] + + # Index of the month string in ymd + mstridx = -1 + + len_l = len(l) + i = 0 + while i < len_l: + + # Check if it's a number + try: + value_repr = l[i] + value = float(value_repr) + except ValueError: + value = None + + if value is not None: + # Token is a number + len_li = len(l[i]) + i += 1 + if (len(ymd) == 3 and len_li in (2, 4) + and (i >= len_l or (l[i] != ':' and + info.hms(l[i]) is None))): + # 19990101T23[59] + s = l[i-1] + res.hour = int(s[:2]) + if len_li == 4: + res.minute = int(s[2:]) + elif len_li == 6 or (len_li > 6 and l[i-1].find('.') == 6): + # YYMMDD or HHMMSS[.ss] + s = l[i-1] + if not ymd and l[i-1].find('.') == -1: + ymd.append(info.convertyear(int(s[:2]))) + ymd.append(int(s[2:4])) + ymd.append(int(s[4:])) + else: + # 19990101T235959[.59] + res.hour = int(s[:2]) + res.minute = int(s[2:4]) + res.second, res.microsecond = _parsems(s[4:]) + elif len_li == 8: + # YYYYMMDD + s = l[i-1] + ymd.append(int(s[:4])) + ymd.append(int(s[4:6])) + ymd.append(int(s[6:])) + elif len_li in (12, 14): + # YYYYMMDDhhmm[ss] + s = l[i-1] + ymd.append(int(s[:4])) + ymd.append(int(s[4:6])) + ymd.append(int(s[6:8])) + res.hour = int(s[8:10]) + res.minute = int(s[10:12]) + if len_li == 14: + res.second = int(s[12:]) + elif ((i < len_l and info.hms(l[i]) is not None) or + (i+1 < len_l and l[i] == ' ' and + info.hms(l[i+1]) is not None)): + # HH[ ]h or MM[ ]m or SS[.ss][ ]s + if l[i] == ' ': + i += 1 + idx = info.hms(l[i]) + while True: + if idx == 0: + res.hour = int(value) + if value%1: + res.minute = int(60*(value%1)) + elif idx == 1: + res.minute = int(value) + if value%1: + res.second = int(60*(value%1)) + elif idx == 2: + res.second, res.microsecond = \ + 
_parsems(value_repr) + i += 1 + if i >= len_l or idx == 2: + break + # 12h00 + try: + value_repr = l[i] + value = float(value_repr) + except ValueError: + break + else: + i += 1 + idx += 1 + if i < len_l: + newidx = info.hms(l[i]) + if newidx is not None: + idx = newidx + elif i+1 < len_l and l[i] == ':': + # HH:MM[:SS[.ss]] + res.hour = int(value) + i += 1 + value = float(l[i]) + res.minute = int(value) + if value%1: + res.second = int(60*(value%1)) + i += 1 + if i < len_l and l[i] == ':': + res.second, res.microsecond = _parsems(l[i+1]) + i += 2 + elif i < len_l and l[i] in ('-', '/', '.'): + sep = l[i] + ymd.append(int(value)) + i += 1 + if i < len_l and not info.jump(l[i]): + try: + # 01-01[-01] + ymd.append(int(l[i])) + except ValueError: + # 01-Jan[-01] + value = info.month(l[i]) + if value is not None: + ymd.append(value) + assert mstridx == -1 + mstridx = len(ymd)-1 + else: + return None + i += 1 + if i < len_l and l[i] == sep: + # We have three members + i += 1 + value = info.month(l[i]) + if value is not None: + ymd.append(value) + mstridx = len(ymd)-1 + assert mstridx == -1 + else: + ymd.append(int(l[i])) + i += 1 + elif i >= len_l or info.jump(l[i]): + if i+1 < len_l and info.ampm(l[i+1]) is not None: + # 12 am + res.hour = int(value) + if res.hour < 12 and info.ampm(l[i+1]) == 1: + res.hour += 12 + elif res.hour == 12 and info.ampm(l[i+1]) == 0: + res.hour = 0 + i += 1 + else: + # Year, month or day + ymd.append(int(value)) + i += 1 + elif info.ampm(l[i]) is not None: + # 12am + res.hour = int(value) + if res.hour < 12 and info.ampm(l[i]) == 1: + res.hour += 12 + elif res.hour == 12 and info.ampm(l[i]) == 0: + res.hour = 0 + i += 1 + elif not fuzzy: + return None + else: + i += 1 + continue + + # Check weekday + value = info.weekday(l[i]) + if value is not None: + res.weekday = value + i += 1 + continue + + # Check month name + value = info.month(l[i]) + if value is not None: + ymd.append(value) + assert mstridx == -1 + mstridx = len(ymd)-1 + i += 1 + 
if i < len_l: + if l[i] in ('-', '/'): + # Jan-01[-99] + sep = l[i] + i += 1 + ymd.append(int(l[i])) + i += 1 + if i < len_l and l[i] == sep: + # Jan-01-99 + i += 1 + ymd.append(int(l[i])) + i += 1 + elif (i+3 < len_l and l[i] == l[i+2] == ' ' + and info.pertain(l[i+1])): + # Jan of 01 + # In this case, 01 is clearly year + try: + value = int(l[i+3]) + except ValueError: + # Wrong guess + pass + else: + # Convert it here to become unambiguous + ymd.append(info.convertyear(value)) + i += 4 + continue + + # Check am/pm + value = info.ampm(l[i]) + if value is not None: + if value == 1 and res.hour < 12: + res.hour += 12 + elif value == 0 and res.hour == 12: + res.hour = 0 + i += 1 + continue + + # Check for a timezone name + if (res.hour is not None and len(l[i]) <= 5 and + res.tzname is None and res.tzoffset is None and + not [x for x in l[i] if x not in string.ascii_uppercase]): + res.tzname = l[i] + res.tzoffset = info.tzoffset(res.tzname) + i += 1 + + # Check for something like GMT+3, or BRST+3. Notice + # that it doesn't mean "I am 3 hours after GMT", but + # "my time +3 is GMT". If found, we reverse the + # logic so that timezone parsing code will get it + # right. + if i < len_l and l[i] in ('+', '-'): + l[i] = ('+', '-')[l[i] == '+'] + res.tzoffset = None + if info.utczone(res.tzname): + # With something like GMT+3, the timezone + # is *not* GMT. 
+ res.tzname = None + + continue + + # Check for a numbered timezone + if res.hour is not None and l[i] in ('+', '-'): + signal = (-1,1)[l[i] == '+'] + i += 1 + len_li = len(l[i]) + if len_li == 4: + # -0300 + res.tzoffset = int(l[i][:2])*3600+int(l[i][2:])*60 + elif i+1 < len_l and l[i+1] == ':': + # -03:00 + res.tzoffset = int(l[i])*3600+int(l[i+2])*60 + i += 2 + elif len_li <= 2: + # -[0]3 + res.tzoffset = int(l[i][:2])*3600 + else: + return None + i += 1 + res.tzoffset *= signal + + # Look for a timezone name between parenthesis + if (i+3 < len_l and + info.jump(l[i]) and l[i+1] == '(' and l[i+3] == ')' and + 3 <= len(l[i+2]) <= 5 and + not [x for x in l[i+2] + if x not in string.ascii_uppercase]): + # -0300 (BRST) + res.tzname = l[i+2] + i += 4 + continue + + # Check jumps + if not (info.jump(l[i]) or fuzzy): + return None + + i += 1 + + # Process year/month/day + len_ymd = len(ymd) + if len_ymd > 3: + # More than three members!? + return None + elif len_ymd == 1 or (mstridx != -1 and len_ymd == 2): + # One member, or two members with a month string + if mstridx != -1: + res.month = ymd[mstridx] + del ymd[mstridx] + if len_ymd > 1 or mstridx == -1: + if ymd[0] > 31: + res.year = ymd[0] + else: + res.day = ymd[0] + elif len_ymd == 2: + # Two members with numbers + if ymd[0] > 31: + # 99-01 + res.year, res.month = ymd + elif ymd[1] > 31: + # 01-99 + res.month, res.year = ymd + elif dayfirst and ymd[1] <= 12: + # 13-01 + res.day, res.month = ymd + else: + # 01-13 + res.month, res.day = ymd + if len_ymd == 3: + # Three members + if mstridx == 0: + res.month, res.day, res.year = ymd + elif mstridx == 1: + if ymd[0] > 31 or (yearfirst and ymd[2] <= 31): + # 99-Jan-01 + res.year, res.month, res.day = ymd + else: + # 01-Jan-01 + # Give precendence to day-first, since + # two-digit years is usually hand-written. + res.day, res.month, res.year = ymd + elif mstridx == 2: + # WTF!? 
+ if ymd[1] > 31: + # 01-99-Jan + res.day, res.year, res.month = ymd + else: + # 99-01-Jan + res.year, res.day, res.month = ymd + else: + if ymd[0] > 31 or \ + (yearfirst and ymd[1] <= 12 and ymd[2] <= 31): + # 99-01-01 + res.year, res.month, res.day = ymd + elif ymd[0] > 12 or (dayfirst and ymd[1] <= 12): + # 13-01-01 + res.day, res.month, res.year = ymd + else: + # 01-13-01 + res.month, res.day, res.year = ymd + + except (IndexError, ValueError, AssertionError): + return None + + if not info.validate(res): + return None + return res + +DEFAULTPARSER = parser() +def parse(timestr, parserinfo=None, **kwargs): + if parserinfo: + return parser(parserinfo).parse(timestr, **kwargs) + else: + return DEFAULTPARSER.parse(timestr, **kwargs) + + +class _tzparser(object): + + class _result(_resultbase): + + __slots__ = ["stdabbr", "stdoffset", "dstabbr", "dstoffset", + "start", "end"] + + class _attr(_resultbase): + __slots__ = ["month", "week", "weekday", + "yday", "jyday", "day", "time"] + + def __repr__(self): + return self._repr("") + + def __init__(self): + _resultbase.__init__(self) + self.start = self._attr() + self.end = self._attr() + + def parse(self, tzstr): + res = self._result() + l = _timelex.split(tzstr) + try: + + len_l = len(l) + + i = 0 + while i < len_l: + # BRST+3[BRDT[+2]] + j = i + while j < len_l and not [x for x in l[j] + if x in "0123456789:,-+"]: + j += 1 + if j != i: + if not res.stdabbr: + offattr = "stdoffset" + res.stdabbr = "".join(l[i:j]) + else: + offattr = "dstoffset" + res.dstabbr = "".join(l[i:j]) + i = j + if (i < len_l and + (l[i] in ('+', '-') or l[i][0] in "0123456789")): + if l[i] in ('+', '-'): + # Yes, that's right. See the TZ variable + # documentation. 
+ signal = (1,-1)[l[i] == '+'] + i += 1 + else: + signal = -1 + len_li = len(l[i]) + if len_li == 4: + # -0300 + setattr(res, offattr, + (int(l[i][:2])*3600+int(l[i][2:])*60)*signal) + elif i+1 < len_l and l[i+1] == ':': + # -03:00 + setattr(res, offattr, + (int(l[i])*3600+int(l[i+2])*60)*signal) + i += 2 + elif len_li <= 2: + # -[0]3 + setattr(res, offattr, + int(l[i][:2])*3600*signal) + else: + return None + i += 1 + if res.dstabbr: + break + else: + break + + if i < len_l: + for j in range(i, len_l): + if l[j] == ';': l[j] = ',' + + assert l[i] == ',' + + i += 1 + + if i >= len_l: + pass + elif (8 <= l.count(',') <= 9 and + not [y for x in l[i:] if x != ',' + for y in x if y not in "0123456789"]): + # GMT0BST,3,0,30,3600,10,0,26,7200[,3600] + for x in (res.start, res.end): + x.month = int(l[i]) + i += 2 + if l[i] == '-': + value = int(l[i+1])*-1 + i += 1 + else: + value = int(l[i]) + i += 2 + if value: + x.week = value + x.weekday = (int(l[i])-1)%7 + else: + x.day = int(l[i]) + i += 2 + x.time = int(l[i]) + i += 2 + if i < len_l: + if l[i] in ('-','+'): + signal = (-1,1)[l[i] == "+"] + i += 1 + else: + signal = 1 + res.dstoffset = (res.stdoffset+int(l[i]))*signal + elif (l.count(',') == 2 and l[i:].count('/') <= 2 and + not [y for x in l[i:] if x not in (',','/','J','M', + '.','-',':') + for y in x if y not in "0123456789"]): + for x in (res.start, res.end): + if l[i] == 'J': + # non-leap year day (1 based) + i += 1 + x.jyday = int(l[i]) + elif l[i] == 'M': + # month[-.]week[-.]weekday + i += 1 + x.month = int(l[i]) + i += 1 + assert l[i] in ('-', '.') + i += 1 + x.week = int(l[i]) + if x.week == 5: + x.week = -1 + i += 1 + assert l[i] in ('-', '.') + i += 1 + x.weekday = (int(l[i])-1)%7 + else: + # year day (zero based) + x.yday = int(l[i])+1 + + i += 1 + + if i < len_l and l[i] == '/': + i += 1 + # start time + len_li = len(l[i]) + if len_li == 4: + # -0300 + x.time = (int(l[i][:2])*3600+int(l[i][2:])*60) + elif i+1 < len_l and l[i+1] == ':': + # -03:00 + 
class weekday(object):
    """A day of the week (0=MO .. 6=SU) with an optional occurrence ``n``.

    Calling an instance, e.g. ``MO(+2)``, returns a weekday for the same
    day with ``n`` set to the argument.
    """
    __slots__ = ["weekday", "n"]

    def __init__(self, weekday, n=None):
        self.weekday = weekday
        self.n = n

    def __call__(self, n):
        if n == self.n:
            return self
        else:
            return self.__class__(self.weekday, n)

    def __eq__(self, other):
        try:
            if self.weekday != other.weekday or self.n != other.n:
                return False
        except AttributeError:
            # Anything without weekday/n attributes is never equal.
            return False
        return True

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; without this, two equal
        # weekdays compared unequal unless they were the same object.
        return not (self == other)

    def __hash__(self):
        # Keep hashing consistent with __eq__ (and keep instances
        # hashable on Python 3, where defining __eq__ alone removes
        # the default hash).
        return hash((self.weekday, self.n))

    def __repr__(self):
        s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday]
        if not self.n:
            return s
        else:
            return "%s(%+d)" % (s, self.n)
However, +notice that this type does *NOT* implement the same algorithm as +his work. Do *NOT* expect it to behave like mx.DateTime's counterpart. + +There's two different ways to build a relativedelta instance. The +first one is passing it two date/datetime classes: + + relativedelta(datetime1, datetime2) + +And the other way is to use the following keyword arguments: + + year, month, day, hour, minute, second, microsecond: + Absolute information. + + years, months, weeks, days, hours, minutes, seconds, microseconds: + Relative information, may be negative. + + weekday: + One of the weekday instances (MO, TU, etc). These instances may + receive a parameter N, specifying the Nth weekday, which could + be positive or negative (like MO(+1) or MO(-2). Not specifying + it is the same as specifying +1. You can also use an integer, + where 0=MO. + + leapdays: + Will add given days to the date found, if year is a leap + year, and the date found is post 28 of february. + + yearday, nlyearday: + Set the yearday or the non-leap year day (jump leap days). + These are converted to day/month/leapdays information. + +Here is the behavior of operations with relativedelta: + +1) Calculate the absolute year, using the 'year' argument, or the + original datetime year, if the argument is not present. + +2) Add the relative 'years' argument to the absolute year. + +3) Do steps 1 and 2 for month/months. + +4) Calculate the absolute day, using the 'day' argument, or the + original datetime day, if the argument is not present. Then, + subtract from the day until it fits in the year and month + found after their operations. + +5) Add the relative 'days' argument to the absolute day. Notice + that the 'weeks' argument is multiplied by 7 and added to + 'days'. + +6) Do steps 1 and 2 for hour/hours, minute/minutes, second/seconds, + microsecond/microseconds. + +7) If the 'weekday' argument is present, calculate the weekday, + with the given (wday, nth) tuple. 
wday is the index of the + weekday (0-6, 0=Mon), and nth is the number of weeks to add + forward or backward, depending on its signal. Notice that if + the calculated date is already Monday, for example, using + (0, 1) or (0, -1) won't change the day. + """ + + def __init__(self, dt1=None, dt2=None, + years=0, months=0, days=0, leapdays=0, weeks=0, + hours=0, minutes=0, seconds=0, microseconds=0, + year=None, month=None, day=None, weekday=None, + yearday=None, nlyearday=None, + hour=None, minute=None, second=None, microsecond=None): + if dt1 and dt2: + if not isinstance(dt1, datetime.date) or \ + not isinstance(dt2, datetime.date): + raise TypeError, "relativedelta only diffs datetime/date" + if type(dt1) is not type(dt2): + if not isinstance(dt1, datetime.datetime): + dt1 = datetime.datetime.fromordinal(dt1.toordinal()) + elif not isinstance(dt2, datetime.datetime): + dt2 = datetime.datetime.fromordinal(dt2.toordinal()) + self.years = 0 + self.months = 0 + self.days = 0 + self.leapdays = 0 + self.hours = 0 + self.minutes = 0 + self.seconds = 0 + self.microseconds = 0 + self.year = None + self.month = None + self.day = None + self.weekday = None + self.hour = None + self.minute = None + self.second = None + self.microsecond = None + self._has_time = 0 + + months = (dt1.year*12+dt1.month)-(dt2.year*12+dt2.month) + self._set_months(months) + dtm = self.__radd__(dt2) + if dt1 < dt2: + while dt1 > dtm: + months += 1 + self._set_months(months) + dtm = self.__radd__(dt2) + else: + while dt1 < dtm: + months -= 1 + self._set_months(months) + dtm = self.__radd__(dt2) + delta = dt1 - dtm + self.seconds = delta.seconds+delta.days*86400 + self.microseconds = delta.microseconds + else: + self.years = years + self.months = months + self.days = days+weeks*7 + self.leapdays = leapdays + self.hours = hours + self.minutes = minutes + self.seconds = seconds + self.microseconds = microseconds + self.year = year + self.month = month + self.day = day + self.hour = hour + self.minute = 
minute + self.second = second + self.microsecond = microsecond + + if type(weekday) is int: + self.weekday = weekdays[weekday] + else: + self.weekday = weekday + + yday = 0 + if nlyearday: + yday = nlyearday + elif yearday: + yday = yearday + if yearday > 59: + self.leapdays = -1 + if yday: + ydayidx = [31,59,90,120,151,181,212,243,273,304,334,366] + for idx, ydays in enumerate(ydayidx): + if yday <= ydays: + self.month = idx+1 + if idx == 0: + self.day = yday + else: + self.day = yday-ydayidx[idx-1] + break + else: + raise ValueError, "invalid year day (%d)" % yday + + self._fix() + + def _fix(self): + if abs(self.microseconds) > 999999: + s = self.microseconds//abs(self.microseconds) + div, mod = divmod(self.microseconds*s, 1000000) + self.microseconds = mod*s + self.seconds += div*s + if abs(self.seconds) > 59: + s = self.seconds//abs(self.seconds) + div, mod = divmod(self.seconds*s, 60) + self.seconds = mod*s + self.minutes += div*s + if abs(self.minutes) > 59: + s = self.minutes//abs(self.minutes) + div, mod = divmod(self.minutes*s, 60) + self.minutes = mod*s + self.hours += div*s + if abs(self.hours) > 23: + s = self.hours//abs(self.hours) + div, mod = divmod(self.hours*s, 24) + self.hours = mod*s + self.days += div*s + if abs(self.months) > 11: + s = self.months//abs(self.months) + div, mod = divmod(self.months*s, 12) + self.months = mod*s + self.years += div*s + if (self.hours or self.minutes or self.seconds or self.microseconds or + self.hour is not None or self.minute is not None or + self.second is not None or self.microsecond is not None): + self._has_time = 1 + else: + self._has_time = 0 + + def _set_months(self, months): + self.months = months + if abs(self.months) > 11: + s = self.months//abs(self.months) + div, mod = divmod(self.months*s, 12) + self.months = mod*s + self.years = div*s + else: + self.years = 0 + + def __radd__(self, other): + if not isinstance(other, datetime.date): + raise TypeError, "unsupported type for add operation" + elif 
self._has_time and not isinstance(other, datetime.datetime): + other = datetime.datetime.fromordinal(other.toordinal()) + year = (self.year or other.year)+self.years + month = self.month or other.month + if self.months: + assert 1 <= abs(self.months) <= 12 + month += self.months + if month > 12: + year += 1 + month -= 12 + elif month < 1: + year -= 1 + month += 12 + day = min(calendar.monthrange(year, month)[1], + self.day or other.day) + repl = {"year": year, "month": month, "day": day} + for attr in ["hour", "minute", "second", "microsecond"]: + value = getattr(self, attr) + if value is not None: + repl[attr] = value + days = self.days + if self.leapdays and month > 2 and calendar.isleap(year): + days += self.leapdays + ret = (other.replace(**repl) + + datetime.timedelta(days=days, + hours=self.hours, + minutes=self.minutes, + seconds=self.seconds, + microseconds=self.microseconds)) + if self.weekday: + weekday, nth = self.weekday.weekday, self.weekday.n or 1 + jumpdays = (abs(nth)-1)*7 + if nth > 0: + jumpdays += (7-ret.weekday()+weekday)%7 + else: + jumpdays += (ret.weekday()-weekday)%7 + jumpdays *= -1 + ret += datetime.timedelta(days=jumpdays) + return ret + + def __rsub__(self, other): + return self.__neg__().__radd__(other) + + def __add__(self, other): + if not isinstance(other, relativedelta): + raise TypeError, "unsupported type for add operation" + return relativedelta(years=other.years+self.years, + months=other.months+self.months, + days=other.days+self.days, + hours=other.hours+self.hours, + minutes=other.minutes+self.minutes, + seconds=other.seconds+self.seconds, + microseconds=other.microseconds+self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond) + + 
def __sub__(self, other): + if not isinstance(other, relativedelta): + raise TypeError, "unsupported type for sub operation" + return relativedelta(years=other.years-self.years, + months=other.months-self.months, + days=other.days-self.days, + hours=other.hours-self.hours, + minutes=other.minutes-self.minutes, + seconds=other.seconds-self.seconds, + microseconds=other.microseconds-self.microseconds, + leapdays=other.leapdays or self.leapdays, + year=other.year or self.year, + month=other.month or self.month, + day=other.day or self.day, + weekday=other.weekday or self.weekday, + hour=other.hour or self.hour, + minute=other.minute or self.minute, + second=other.second or self.second, + microsecond=other.second or self.microsecond) + + def __neg__(self): + return relativedelta(years=-self.years, + months=-self.months, + days=-self.days, + hours=-self.hours, + minutes=-self.minutes, + seconds=-self.seconds, + microseconds=-self.microseconds, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + microsecond=self.microsecond) + + def __nonzero__(self): + return not (not self.years and + not self.months and + not self.days and + not self.hours and + not self.minutes and + not self.seconds and + not self.microseconds and + not self.leapdays and + self.year is None and + self.month is None and + self.day is None and + self.weekday is None and + self.hour is None and + self.minute is None and + self.second is None and + self.microsecond is None) + + def __mul__(self, other): + f = float(other) + return relativedelta(years=self.years*f, + months=self.months*f, + days=self.days*f, + hours=self.hours*f, + minutes=self.minutes*f, + seconds=self.seconds*f, + microseconds=self.microseconds*f, + leapdays=self.leapdays, + year=self.year, + month=self.month, + day=self.day, + weekday=self.weekday, + hour=self.hour, + minute=self.minute, + second=self.second, + 
microsecond=self.microsecond) + + def __eq__(self, other): + if not isinstance(other, relativedelta): + return False + if self.weekday or other.weekday: + if not self.weekday or not other.weekday: + return False + if self.weekday.weekday != other.weekday.weekday: + return False + n1, n2 = self.weekday.n, other.weekday.n + if n1 != n2 and not ((not n1 or n1 == 1) and (not n2 or n2 == 1)): + return False + return (self.years == other.years and + self.months == other.months and + self.days == other.days and + self.hours == other.hours and + self.minutes == other.minutes and + self.seconds == other.seconds and + self.leapdays == other.leapdays and + self.year == other.year and + self.month == other.month and + self.day == other.day and + self.hour == other.hour and + self.minute == other.minute and + self.second == other.second and + self.microsecond == other.microsecond) + + def __ne__(self, other): + return not self.__eq__(other) + + def __div__(self, other): + return self.__mul__(1/float(other)) + + def __repr__(self): + l = [] + for attr in ["years", "months", "days", "leapdays", + "hours", "minutes", "seconds", "microseconds"]: + value = getattr(self, attr) + if value: + l.append("%s=%+d" % (attr, value)) + for attr in ["year", "month", "day", "weekday", + "hour", "minute", "second", "microsecond"]: + value = getattr(self, attr) + if value is not None: + l.append("%s=%s" % (attr, `value`)) + return "%s(%s)" % (self.__class__.__name__, ", ".join(l)) + +# vim:ts=4:sw=4:et diff --git a/libs/dateutil/rrule.py b/libs/dateutil/rrule.py new file mode 100644 index 00000000..6bd83cad --- /dev/null +++ b/libs/dateutil/rrule.py @@ -0,0 +1,1097 @@ +""" +Copyright (c) 2003-2010 Gustavo Niemeyer + +This module offers extensions to the standard python 2.3+ +datetime module. 
# Lookup tables indexed by zero-based day of year. The 366 variants
# describe leap years; the 365 variants are derived from them by deleting
# the Feb 29 entry.
# Every mask is 7 days longer to handle cross-year weekly periods.

# Month number (1-12) for each day of the year.
M366MASK = tuple([1]*31+[2]*29+[3]*31+[4]*30+[5]*31+[6]*30+
                 [7]*31+[8]*31+[9]*30+[10]*31+[11]*30+[12]*31+[1]*7)
M365MASK = list(M366MASK)

# Day of month counted from the start (1..31). list() keeps the "+"
# concatenation valid where range() does not return a list.
M29, M30, M31 = list(range(1, 30)), list(range(1, 31)), list(range(1, 32))
MDAY366MASK = tuple(M31+M29+M31+M30+M31+M30+M31+M31+M30+M31+M30+M31+M31[:7])
MDAY365MASK = list(MDAY366MASK)

# Day of month counted from the end (-31..-1).
M29, M30, M31 = list(range(-29, 0)), list(range(-30, 0)), list(range(-31, 0))
NMDAY366MASK = tuple(M31+M29+M31+M30+M31+M30+M31+M31+M30+M31+M30+M31+M31[:7])
NMDAY365MASK = list(NMDAY366MASK)

# Zero-based offset of the first day of each month, plus the year length.
M366RANGE = (0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366)
M365RANGE = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365)

# Repeating weekday pattern; callers slice it at the weekday of Jan 1.
WDAYMASK = [0, 1, 2, 3, 4, 5, 6]*55

# Drop Feb 29: index 59 in the forward masks, index 31 (the "-29" entry)
# in the negative mask.
del M29, M30, M31, M365MASK[59], MDAY365MASK[59], NMDAY365MASK[31]
MDAY365MASK = tuple(MDAY365MASK)
M365MASK = tuple(M365MASK)
NMDAY365MASK = tuple(NMDAY365MASK)

# Recurrence frequencies, ordered from coarsest to finest.
(YEARLY,
 MONTHLY,
 WEEKLY,
 DAILY,
 HOURLY,
 MINUTELY,
 SECONDLY) = range(7)

# Imported on demand.
# Module-level slots for dateutil submodules that are imported lazily by
# the code that needs them.
easter = None
parser = None


class weekday(object):
    """A weekday constant (0=Monday .. 6=Sunday) with an optional ordinal.

    ``n`` selects the n-th occurrence (negative counts from the end);
    ``n == 0`` is rejected.
    """
    __slots__ = ["weekday", "n"]

    def __init__(self, weekday, n=None):
        if n == 0:
            raise ValueError("Can't create weekday with n == 0")
        self.weekday = weekday
        self.n = n

    def __call__(self, n):
        if n == self.n:
            return self
        return self.__class__(self.weekday, n)

    def __eq__(self, other):
        try:
            return self.weekday == other.weekday and self.n == other.n
        except AttributeError:
            return False

    def __ne__(self, other):
        # Without this, Python 2 falls back to identity for "!=".
        return not self.__eq__(other)

    def __hash__(self):
        # Keep instances hashable even though __eq__ is overridden.
        return hash((self.weekday, self.n))

    def __repr__(self):
        s = ("MO", "TU", "WE", "TH", "FR", "SA", "SU")[self.weekday]
        if not self.n:
            return s
        return "%s(%+d)" % (s, self.n)


MO, TU, WE, TH, FR, SA, SU = weekdays = tuple([weekday(x) for x in range(7)])


class rrulebase:
    """Shared iteration, indexing and search helpers for recurrence sets.

    Subclasses provide ``_iter()``, a generator that yields occurrences in
    ascending order and stores the number produced in ``self._len`` when it
    finishes. With ``cache=True`` results are memoised in ``self._cache``
    and shared between iterators.
    """

    def __init__(self, cache=False):
        if cache:
            self._cache = []
            self._cache_lock = thread.allocate_lock()
            self._cache_gen = self._iter()
            self._cache_complete = False
        else:
            self._cache = None
            self._cache_complete = False
        self._len = None

    def __iter__(self):
        if self._cache_complete:
            return iter(self._cache)
        elif self._cache is None:
            return self._iter()
        else:
            return self._iter_cached()

    def _iter_cached(self):
        """Yield from the shared cache, extending it in batches of ten."""
        i = 0
        gen = self._cache_gen
        cache = self._cache
        acquire = self._cache_lock.acquire
        release = self._cache_lock.release
        while gen:
            if i == len(cache):
                acquire()
                # The lock must be released on every exit path; leaving it
                # held on "break" would deadlock any other iterator that
                # reaches the end of the cache later.
                try:
                    if self._cache_complete:
                        break
                    try:
                        for j in range(10):
                            cache.append(next(gen))
                    except StopIteration:
                        self._cache_gen = gen = None
                        self._cache_complete = True
                        break
                finally:
                    release()
            yield cache[i]
            i += 1
        # The cache is complete: drain what this iterator has not seen yet.
        while i < self._len:
            yield cache[i]
            i += 1

    def __getitem__(self, item):
        if self._cache_complete:
            return self._cache[item]
        elif isinstance(item, slice):
            if item.step and item.step < 0:
                return list(iter(self))[item]
            else:
                # Test against None explicitly so that a stop of 0 yields
                # an empty result instead of meaning "no stop".
                if item.stop is None:
                    stop = sys.maxsize
                else:
                    stop = item.stop
                return list(itertools.islice(self,
                                             item.start or 0,
                                             stop,
                                             item.step or 1))
        elif item >= 0:
            gen = iter(self)
            try:
                for i in range(item+1):
                    res = next(gen)
            except StopIteration:
                raise IndexError
            return res
        else:
            return list(iter(self))[item]

    def __contains__(self, item):
        if self._cache_complete:
            return item in self._cache
        else:
            # Occurrences are ascending, so stop once we are past item.
            for i in self:
                if i == item:
                    return True
                elif i > item:
                    return False
        return False

    # __len__() introduces a large performance penalty.
    def count(self):
        """Return the number of occurrences, iterating fully if needed."""
        if self._len is None:
            for x in self:
                pass
        return self._len

    def before(self, dt, inc=False):
        """Return the last occurrence before dt (or equal, if inc)."""
        if self._cache_complete:
            gen = self._cache
        else:
            gen = self
        last = None
        if inc:
            for i in gen:
                if i > dt:
                    break
                last = i
        else:
            for i in gen:
                if i >= dt:
                    break
                last = i
        return last

    def after(self, dt, inc=False):
        """Return the first occurrence after dt (or equal, if inc)."""
        if self._cache_complete:
            gen = self._cache
        else:
            gen = self
        if inc:
            for i in gen:
                if i >= dt:
                    return i
        else:
            for i in gen:
                if i > dt:
                    return i
        return None

    def between(self, after, before, inc=False):
        """Return occurrences between after and before (inclusive if inc)."""
        if self._cache_complete:
            gen = self._cache
        else:
            gen = self
        started = False
        l = []
        if inc:
            for i in gen:
                if i > before:
                    break
                elif not started:
                    if i >= after:
                        started = True
                        l.append(i)
                else:
                    l.append(i)
        else:
            for i in gen:
                if i >= before:
                    break
                elif not started:
                    if i > after:
                        started = True
                        l.append(i)
                else:
                    l.append(i)
        return l
class rrule(rrulebase):
    """A single recurrence rule that yields datetimes in ascending order.

    ``freq`` is one of YEARLY .. SECONDLY. ``dtstart`` defaults to the
    current time; ``count`` and ``until`` bound the recurrence. The ``by*``
    arguments accept a single int or an iterable and restrict which
    candidates are produced.
    """

    def __init__(self, freq, dtstart=None,
                 interval=1, wkst=None, count=None, until=None, bysetpos=None,
                 bymonth=None, bymonthday=None, byyearday=None, byeaster=None,
                 byweekno=None, byweekday=None,
                 byhour=None, byminute=None, bysecond=None,
                 cache=False):
        rrulebase.__init__(self, cache)
        global easter
        # Normalise dtstart to a datetime with microsecond == 0.
        if not dtstart:
            dtstart = datetime.datetime.now().replace(microsecond=0)
        elif not isinstance(dtstart, datetime.datetime):
            dtstart = datetime.datetime.fromordinal(dtstart.toordinal())
        else:
            dtstart = dtstart.replace(microsecond=0)
        self._dtstart = dtstart
        self._tzinfo = dtstart.tzinfo
        self._freq = freq
        self._interval = interval
        self._count = count
        if until and not isinstance(until, datetime.datetime):
            until = datetime.datetime.fromordinal(until.toordinal())
        self._until = until
        # Week start: an int, a weekday instance, or the calendar default.
        if wkst is None:
            self._wkst = calendar.firstweekday()
        elif type(wkst) is int:
            self._wkst = wkst
        else:
            self._wkst = wkst.weekday
        if bysetpos is None:
            self._bysetpos = None
        elif type(bysetpos) is int:
            if bysetpos == 0 or not (-366 <= bysetpos <= 366):
                raise ValueError("bysetpos must be between 1 and 366, "
                                 "or between -366 and -1")
            self._bysetpos = (bysetpos,)
        else:
            self._bysetpos = tuple(bysetpos)
            for pos in self._bysetpos:
                if pos == 0 or not (-366 <= pos <= 366):
                    raise ValueError("bysetpos must be between 1 and 366, "
                                     "or between -366 and -1")
        # When no day-selecting rule was given, derive one from dtstart so
        # the recurrence repeats on dtstart's own day.
        if not (byweekno or byyearday or bymonthday or
                byweekday is not None or byeaster is not None):
            if freq == YEARLY:
                if not bymonth:
                    bymonth = dtstart.month
                bymonthday = dtstart.day
            elif freq == MONTHLY:
                bymonthday = dtstart.day
            elif freq == WEEKLY:
                byweekday = dtstart.weekday()
        # bymonth
        if not bymonth:
            self._bymonth = None
        elif type(bymonth) is int:
            self._bymonth = (bymonth,)
        else:
            self._bymonth = tuple(bymonth)
        # byyearday
        if not byyearday:
            self._byyearday = None
        elif type(byyearday) is int:
            self._byyearday = (byyearday,)
        else:
            self._byyearday = tuple(byyearday)
        # byeaster
        if byeaster is not None:
            # The easter module is only imported when this rule is used.
            if not easter:
                from dateutil import easter
            if type(byeaster) is int:
                self._byeaster = (byeaster,)
            else:
                self._byeaster = tuple(byeaster)
        else:
            self._byeaster = None
        # bymonthday: positive and negative values are stored separately.
        if not bymonthday:
            self._bymonthday = ()
            self._bynmonthday = ()
        elif type(bymonthday) is int:
            if bymonthday < 0:
                self._bynmonthday = (bymonthday,)
                self._bymonthday = ()
            else:
                self._bymonthday = (bymonthday,)
                self._bynmonthday = ()
        else:
            self._bymonthday = tuple([x for x in bymonthday if x > 0])
            self._bynmonthday = tuple([x for x in bymonthday if x < 0])
        # byweekno
        if byweekno is None:
            self._byweekno = None
        elif type(byweekno) is int:
            self._byweekno = (byweekno,)
        else:
            self._byweekno = tuple(byweekno)
        # byweekday / bynweekday
        # Plain weekdays go to _byweekday; weekdays with an ordinal n go to
        # _bynweekday, but only for YEARLY/MONTHLY (freq <= MONTHLY).
        if byweekday is None:
            self._byweekday = None
            self._bynweekday = None
        elif type(byweekday) is int:
            self._byweekday = (byweekday,)
            self._bynweekday = None
        elif hasattr(byweekday, "n"):
            if not byweekday.n or freq > MONTHLY:
                self._byweekday = (byweekday.weekday,)
                self._bynweekday = None
            else:
                self._bynweekday = ((byweekday.weekday, byweekday.n),)
                self._byweekday = None
        else:
            self._byweekday = []
            self._bynweekday = []
            for wday in byweekday:
                if type(wday) is int:
                    self._byweekday.append(wday)
                elif not wday.n or freq > MONTHLY:
                    self._byweekday.append(wday.weekday)
                else:
                    self._bynweekday.append((wday.weekday, wday.n))
            self._byweekday = tuple(self._byweekday)
            self._bynweekday = tuple(self._bynweekday)
            if not self._byweekday:
                self._byweekday = None
            elif not self._bynweekday:
                self._bynweekday = None
        # byhour: defaults to dtstart's hour for frequencies coarser
        # than HOURLY.
        if byhour is None:
            if freq < HOURLY:
                self._byhour = (dtstart.hour,)
            else:
                self._byhour = None
        elif type(byhour) is int:
            self._byhour = (byhour,)
        else:
            self._byhour = tuple(byhour)
        # byminute
        if byminute is None:
            if freq < MINUTELY:
                self._byminute = (dtstart.minute,)
            else:
                self._byminute = None
        elif type(byminute) is int:
            self._byminute = (byminute,)
        else:
            self._byminute = tuple(byminute)
        # bysecond
        if bysecond is None:
            if freq < SECONDLY:
                self._bysecond = (dtstart.second,)
            else:
                self._bysecond = None
        elif type(bysecond) is int:
            self._bysecond = (bysecond,)
        else:
            self._bysecond = tuple(bysecond)

        # For day-or-coarser frequencies the set of times per day is fixed,
        # so it is computed once here; otherwise _iter builds it per step.
        if self._freq >= HOURLY:
            self._timeset = None
        else:
            self._timeset = []
            for hour in self._byhour:
                for minute in self._byminute:
                    for second in self._bysecond:
                        self._timeset.append(
                                datetime.time(hour, minute, second,
                                                    tzinfo=self._tzinfo))
            self._timeset.sort()
            self._timeset = tuple(self._timeset)

    def _iter(self):
        """Generate the occurrences; stores the total in self._len at exit."""
        year, month, day, hour, minute, second, weekday, yearday, _ = \
            self._dtstart.timetuple()

        # Some local variables to speed things up a bit
        freq = self._freq
        interval = self._interval
        wkst = self._wkst
        until = self._until
        bymonth = self._bymonth
        byweekno = self._byweekno
        byyearday = self._byyearday
        byweekday = self._byweekday
        byeaster = self._byeaster
        bymonthday = self._bymonthday
        bynmonthday = self._bynmonthday
        bysetpos = self._bysetpos
        byhour = self._byhour
        byminute = self._byminute
        bysecond = self._bysecond

        ii = _iterinfo(self)
        ii.rebuild(year, month)

        # Pick the function that produces the candidate days for one period.
        getdayset = {YEARLY:ii.ydayset,
                     MONTHLY:ii.mdayset,
                     WEEKLY:ii.wdayset,
                     DAILY:ii.ddayset,
                     HOURLY:ii.ddayset,
                     MINUTELY:ii.ddayset,
                     SECONDLY:ii.ddayset}[freq]

        if freq < HOURLY:
            timeset = self._timeset
        else:
            gettimeset = {HOURLY:ii.htimeset,
                          MINUTELY:ii.mtimeset,
                          SECONDLY:ii.stimeset}[freq]
            # If dtstart's own time is excluded by the by* rules, the first
            # period contributes no times.
            if ((freq >= HOURLY and
                 self._byhour and hour not in self._byhour) or
                (freq >= MINUTELY and
                 self._byminute and minute not in self._byminute) or
                (freq >= SECONDLY and
                 self._bysecond and second not in self._bysecond)):
                timeset = ()
            else:
                timeset = gettimeset(hour, minute, second)

        total = 0
        count = self._count
        while True:
            # Get dayset with the right frequency
            dayset, start, end = getdayset(year, month, day)

            # Do the "hard" work ;-)
            # Days rejected by any by* rule are blanked out with None.
            filtered = False
            for i in dayset[start:end]:
                if ((bymonth and ii.mmask[i] not in bymonth) or
                    (byweekno and not ii.wnomask[i]) or
                    (byweekday and ii.wdaymask[i] not in byweekday) or
                    (ii.nwdaymask and not ii.nwdaymask[i]) or
                    (byeaster and not ii.eastermask[i]) or
                    ((bymonthday or bynmonthday) and
                     ii.mdaymask[i] not in bymonthday and
                     ii.nmdaymask[i] not in bynmonthday) or
                    (byyearday and
                     ((i < ii.yearlen and i+1 not in byyearday
                                       and -ii.yearlen+i not in byyearday) or
                      (i >= ii.yearlen and i+1-ii.yearlen not in byyearday
                                        and -ii.nextyearlen+i-ii.yearlen
                                            not in byyearday)))):
                    dayset[i] = None
                    filtered = True

            # Output results
            if bysetpos and timeset:
                # Select positions within this period's (day, time) set.
                poslist = []
                for pos in bysetpos:
                    if pos < 0:
                        daypos, timepos = divmod(pos, len(timeset))
                    else:
                        daypos, timepos = divmod(pos-1, len(timeset))
                    try:
                        i = [x for x in dayset[start:end]
                             if x is not None][daypos]
                        time = timeset[timepos]
                    except IndexError:
                        # Position does not exist in this period.
                        pass
                    else:
                        date = datetime.date.fromordinal(ii.yearordinal+i)
                        res = datetime.datetime.combine(date, time)
                        if res not in poslist:
                            poslist.append(res)
                poslist.sort()
                for res in poslist:
                    if until and res > until:
                        self._len = total
                        return
                    elif res >= self._dtstart:
                        total += 1
                        yield res
                        if count:
                            count -= 1
                            if not count:
                                self._len = total
                                return
            else:
                for i in dayset[start:end]:
                    if i is not None:
                        date = datetime.date.fromordinal(ii.yearordinal+i)
                        for time in timeset:
                            res = datetime.datetime.combine(date, time)
                            if until and res > until:
                                self._len = total
                                return
                            elif res >= self._dtstart:
                                total += 1
                                yield res
                                if count:
                                    count -= 1
                                    if not count:
                                        self._len = total
                                        return

            # Handle frequency and interval
            # fixday marks that `day` may have run past the end of the
            # month and must be normalised below.
            fixday = False
            if freq == YEARLY:
                year += interval
                if year > datetime.MAXYEAR:
                    self._len = total
                    return
                ii.rebuild(year, month)
            elif freq == MONTHLY:
                month += interval
                if month > 12:
                    div, mod = divmod(month, 12)
                    month = mod
                    year += div
                    if month == 0:
                        month = 12
                        year -= 1
                    if year > datetime.MAXYEAR:
                        self._len = total
                        return
                ii.rebuild(year, month)
            elif freq == WEEKLY:
                # Step to the week-start day of the next interval.
                if wkst > weekday:
                    day += -(weekday+1+(6-wkst))+self._interval*7
                else:
                    day += -(weekday-wkst)+self._interval*7
                weekday = wkst
                fixday = True
            elif freq == DAILY:
                day += interval
                fixday = True
            elif freq == HOURLY:
                if filtered:
                    # Jump to one iteration before next day
                    hour += ((23-hour)//interval)*interval
                while True:
                    hour += interval
                    div, mod = divmod(hour, 24)
                    if div:
                        hour = mod
                        day += div
                        fixday = True
                    if not byhour or hour in byhour:
                        break
                timeset = gettimeset(hour, minute, second)
            elif freq == MINUTELY:
                if filtered:
                    # Jump to one iteration before next day
                    minute += ((1439-(hour*60+minute))//interval)*interval
                while True:
                    minute += interval
                    div, mod = divmod(minute, 60)
                    if div:
                        minute = mod
                        hour += div
                        div, mod = divmod(hour, 24)
                        if div:
                            hour = mod
                            day += div
                            fixday = True
                            filtered = False
                    if ((not byhour or hour in byhour) and
                        (not byminute or minute in byminute)):
                        break
                timeset = gettimeset(hour, minute, second)
            elif freq == SECONDLY:
                if filtered:
                    # Jump to one iteration before next day
                    second += (((86399-(hour*3600+minute*60+second))
                                //interval)*interval)
                while True:
                    second += self._interval
                    div, mod = divmod(second, 60)
                    if div:
                        second = mod
                        minute += div
                        div, mod = divmod(minute, 60)
                        if div:
                            minute = mod
                            hour += div
                            div, mod = divmod(hour, 24)
                            if div:
                                hour = mod
                                day += div
                                fixday = True
                    if ((not byhour or hour in byhour) and
                        (not byminute or minute in byminute) and
                        (not bysecond or second in bysecond)):
                        break
                timeset = gettimeset(hour, minute, second)

            # Roll an overflowing day number into following months/years.
            if fixday and day > 28:
                daysinmonth = calendar.monthrange(year, month)[1]
                if day > daysinmonth:
                    while day > daysinmonth:
                        day -= daysinmonth
                        month += 1
                        if month == 13:
                            month = 1
                            year += 1
                            if year > datetime.MAXYEAR:
                                self._len = total
                                return
                        daysinmonth = calendar.monthrange(year, month)[1]
                    ii.rebuild(year, month)
month): + # Every mask is 7 days longer to handle cross-year weekly periods. + rr = self.rrule + if year != self.lastyear: + self.yearlen = 365+calendar.isleap(year) + self.nextyearlen = 365+calendar.isleap(year+1) + firstyday = datetime.date(year, 1, 1) + self.yearordinal = firstyday.toordinal() + self.yearweekday = firstyday.weekday() + + wday = datetime.date(year, 1, 1).weekday() + if self.yearlen == 365: + self.mmask = M365MASK + self.mdaymask = MDAY365MASK + self.nmdaymask = NMDAY365MASK + self.wdaymask = WDAYMASK[wday:] + self.mrange = M365RANGE + else: + self.mmask = M366MASK + self.mdaymask = MDAY366MASK + self.nmdaymask = NMDAY366MASK + self.wdaymask = WDAYMASK[wday:] + self.mrange = M366RANGE + + if not rr._byweekno: + self.wnomask = None + else: + self.wnomask = [0]*(self.yearlen+7) + #no1wkst = firstwkst = self.wdaymask.index(rr._wkst) + no1wkst = firstwkst = (7-self.yearweekday+rr._wkst)%7 + if no1wkst >= 4: + no1wkst = 0 + # Number of days in the year, plus the days we got + # from last year. + wyearlen = self.yearlen+(self.yearweekday-rr._wkst)%7 + else: + # Number of days in the year, minus the days we + # left in last year. + wyearlen = self.yearlen-no1wkst + div, mod = divmod(wyearlen, 7) + numweeks = div+mod//4 + for n in rr._byweekno: + if n < 0: + n += numweeks+1 + if not (0 < n <= numweeks): + continue + if n > 1: + i = no1wkst+(n-1)*7 + if no1wkst != firstwkst: + i -= 7-firstwkst + else: + i = no1wkst + for j in range(7): + self.wnomask[i] = 1 + i += 1 + if self.wdaymask[i] == rr._wkst: + break + if 1 in rr._byweekno: + # Check week number 1 of next year as well + # TODO: Check -numweeks for next year. + i = no1wkst+numweeks*7 + if no1wkst != firstwkst: + i -= 7-firstwkst + if i < self.yearlen: + # If week starts in next year, we + # don't care about it. + for j in range(7): + self.wnomask[i] = 1 + i += 1 + if self.wdaymask[i] == rr._wkst: + break + if no1wkst: + # Check last week number of last year as + # well. 
If no1wkst is 0, either the year + # started on week start, or week number 1 + # got days from last year, so there are no + # days from last year's last week number in + # this year. + if -1 not in rr._byweekno: + lyearweekday = datetime.date(year-1,1,1).weekday() + lno1wkst = (7-lyearweekday+rr._wkst)%7 + lyearlen = 365+calendar.isleap(year-1) + if lno1wkst >= 4: + lno1wkst = 0 + lnumweeks = 52+(lyearlen+ + (lyearweekday-rr._wkst)%7)%7//4 + else: + lnumweeks = 52+(self.yearlen-no1wkst)%7//4 + else: + lnumweeks = -1 + if lnumweeks in rr._byweekno: + for i in range(no1wkst): + self.wnomask[i] = 1 + + if (rr._bynweekday and + (month != self.lastmonth or year != self.lastyear)): + ranges = [] + if rr._freq == YEARLY: + if rr._bymonth: + for month in rr._bymonth: + ranges.append(self.mrange[month-1:month+1]) + else: + ranges = [(0, self.yearlen)] + elif rr._freq == MONTHLY: + ranges = [self.mrange[month-1:month+1]] + if ranges: + # Weekly frequency won't get here, so we may not + # care about cross-year weekly periods. + self.nwdaymask = [0]*self.yearlen + for first, last in ranges: + last -= 1 + for wday, n in rr._bynweekday: + if n < 0: + i = last+(n+1)*7 + i -= (self.wdaymask[i]-wday)%7 + else: + i = first+(n-1)*7 + i += (7-self.wdaymask[i]+wday)%7 + if first <= i <= last: + self.nwdaymask[i] = 1 + + if rr._byeaster: + self.eastermask = [0]*(self.yearlen+7) + eyday = easter.easter(year).toordinal()-self.yearordinal + for offset in rr._byeaster: + self.eastermask[eyday+offset] = 1 + + self.lastyear = year + self.lastmonth = month + + def ydayset(self, year, month, day): + return range(self.yearlen), 0, self.yearlen + + def mdayset(self, year, month, day): + set = [None]*self.yearlen + start, end = self.mrange[month-1:month+1] + for i in range(start, end): + set[i] = i + return set, start, end + + def wdayset(self, year, month, day): + # We need to handle cross-year weeks here. 
+ set = [None]*(self.yearlen+7) + i = datetime.date(year, month, day).toordinal()-self.yearordinal + start = i + for j in range(7): + set[i] = i + i += 1 + #if (not (0 <= i < self.yearlen) or + # self.wdaymask[i] == self.rrule._wkst): + # This will cross the year boundary, if necessary. + if self.wdaymask[i] == self.rrule._wkst: + break + return set, start, i + + def ddayset(self, year, month, day): + set = [None]*self.yearlen + i = datetime.date(year, month, day).toordinal()-self.yearordinal + set[i] = i + return set, i, i+1 + + def htimeset(self, hour, minute, second): + set = [] + rr = self.rrule + for minute in rr._byminute: + for second in rr._bysecond: + set.append(datetime.time(hour, minute, second, + tzinfo=rr._tzinfo)) + set.sort() + return set + + def mtimeset(self, hour, minute, second): + set = [] + rr = self.rrule + for second in rr._bysecond: + set.append(datetime.time(hour, minute, second, tzinfo=rr._tzinfo)) + set.sort() + return set + + def stimeset(self, hour, minute, second): + return (datetime.time(hour, minute, second, + tzinfo=self.rrule._tzinfo),) + + +class rruleset(rrulebase): + + class _genitem: + def __init__(self, genlist, gen): + try: + self.dt = gen() + genlist.append(self) + except StopIteration: + pass + self.genlist = genlist + self.gen = gen + + def next(self): + try: + self.dt = self.gen() + except StopIteration: + self.genlist.remove(self) + + def __cmp__(self, other): + return cmp(self.dt, other.dt) + + def __init__(self, cache=False): + rrulebase.__init__(self, cache) + self._rrule = [] + self._rdate = [] + self._exrule = [] + self._exdate = [] + + def rrule(self, rrule): + self._rrule.append(rrule) + + def rdate(self, rdate): + self._rdate.append(rdate) + + def exrule(self, exrule): + self._exrule.append(exrule) + + def exdate(self, exdate): + self._exdate.append(exdate) + + def _iter(self): + rlist = [] + self._rdate.sort() + self._genitem(rlist, iter(self._rdate).next) + for gen in [iter(x).next for x in self._rrule]: + 
def _advancer(iterable):
    """Return a zero-argument callable that yields successive items.

    The callable raises StopIteration when ``iterable`` is exhausted.
    """
    iterator = iter(iterable)
    return lambda: next(iterator)


class rruleset(rrulebase):
    """A merged recurrence built from rules and dates, minus exclusions."""

    class _genitem:
        """One source in the merge: its current value plus how to advance."""

        def __init__(self, genlist, gen):
            try:
                self.dt = gen()
                genlist.append(self)
            except StopIteration:
                # Empty source: never enters the merge list.
                pass
            self.genlist = genlist
            self.gen = gen

        def next(self):
            try:
                self.dt = self.gen()
            except StopIteration:
                self.genlist.remove(self)

        # Items order and compare by their current datetime. Rich
        # comparisons are provided because __cmp__ alone is ignored by
        # Python 3; __cmp__ is kept for Python 2 callers.
        def __lt__(self, other):
            return self.dt < other.dt

        def __le__(self, other):
            return self.dt <= other.dt

        def __gt__(self, other):
            return self.dt > other.dt

        def __ge__(self, other):
            return self.dt >= other.dt

        def __eq__(self, other):
            return self.dt == other.dt

        def __ne__(self, other):
            return self.dt != other.dt

        def __cmp__(self, other):
            return (self.dt > other.dt) - (self.dt < other.dt)

    def __init__(self, cache=False):
        rrulebase.__init__(self, cache)
        self._rrule = []
        self._rdate = []
        self._exrule = []
        self._exdate = []

    def rrule(self, rrule):
        """Include the occurrences of ``rrule``."""
        self._rrule.append(rrule)

    def rdate(self, rdate):
        """Include the single datetime ``rdate``."""
        self._rdate.append(rdate)

    def exrule(self, exrule):
        """Exclude the occurrences of ``exrule``."""
        self._exrule.append(exrule)

    def exdate(self, exdate):
        """Exclude the single datetime ``exdate``."""
        self._exdate.append(exdate)

    def _iter(self):
        # Inclusion sources, kept sorted by their current datetime.
        rlist = []
        self._rdate.sort()
        self._genitem(rlist, _advancer(self._rdate))
        for gen in [_advancer(x) for x in self._rrule]:
            self._genitem(rlist, gen)
        rlist.sort()
        # Exclusion sources, handled the same way.
        exlist = []
        self._exdate.sort()
        self._genitem(exlist, _advancer(self._exdate))
        for gen in [_advancer(x) for x in self._exrule]:
            self._genitem(exlist, gen)
        exlist.sort()
        lastdt = None
        total = 0
        while rlist:
            ritem = rlist[0]
            # Skip duplicates produced by more than one source.
            if not lastdt or lastdt != ritem.dt:
                # Advance exclusions until they catch up with ritem.
                while exlist and exlist[0] < ritem:
                    exlist[0].next()
                    exlist.sort()
                if not exlist or ritem != exlist[0]:
                    total += 1
                    yield ritem.dt
                lastdt = ritem.dt
            ritem.next()
            rlist.sort()
        self._len = total


class _rrulestr:
    """Parser turning RFC-style recurrence strings into rrule/rruleset.

    Each ``NAME=value`` pair of a rule is dispatched to the matching
    ``_handle_NAME`` method, which stores an rrule() keyword argument.
    """

    _freq_map = {"YEARLY": YEARLY,
                 "MONTHLY": MONTHLY,
                 "WEEKLY": WEEKLY,
                 "DAILY": DAILY,
                 "HOURLY": HOURLY,
                 "MINUTELY": MINUTELY,
                 "SECONDLY": SECONDLY}

    _weekday_map = {"MO":0,"TU":1,"WE":2,"TH":3,"FR":4,"SA":5,"SU":6}

    def _handle_int(self, rrkwargs, name, value, **kwargs):
        rrkwargs[name.lower()] = int(value)

    def _handle_int_list(self, rrkwargs, name, value, **kwargs):
        rrkwargs[name.lower()] = [int(x) for x in value.split(',')]

    _handle_INTERVAL   = _handle_int
    _handle_COUNT      = _handle_int
    _handle_BYSETPOS   = _handle_int_list
    _handle_BYMONTH    = _handle_int_list
    _handle_BYMONTHDAY = _handle_int_list
    _handle_BYYEARDAY  = _handle_int_list
    _handle_BYEASTER   = _handle_int_list
    _handle_BYWEEKNO   = _handle_int_list
    _handle_BYHOUR     = _handle_int_list
    _handle_BYMINUTE   = _handle_int_list
    _handle_BYSECOND   = _handle_int_list

    def _handle_FREQ(self, rrkwargs, name, value, **kwargs):
        rrkwargs["freq"] = self._freq_map[value]

    def _handle_UNTIL(self, rrkwargs, name, value, **kwargs):
        global parser
        if not parser:
            from dateutil import parser
        try:
            rrkwargs["until"] = parser.parse(value,
                                           ignoretz=kwargs.get("ignoretz"),
                                           tzinfos=kwargs.get("tzinfos"))
        except ValueError:
            raise ValueError("invalid until date")

    def _handle_WKST(self, rrkwargs, name, value, **kwargs):
        rrkwargs["wkst"] = self._weekday_map[value]

    def _handle_BYWEEKDAY(self, rrkwargs, name, value, **kwargs):
        l = []
        for wday in value.split(','):
            # Split an optional signed ordinal prefix ("+2", "-1") from the
            # weekday code. Starting at 0 also covers an empty item, which
            # then fails the weekday lookup with KeyError.
            i = 0
            while i < len(wday) and wday[i] in '+-0123456789':
                i += 1
            n = wday[:i] or None
            w = wday[i:]
            if n:
                n = int(n)
            l.append(weekdays[self._weekday_map[w]](n))
        rrkwargs["byweekday"] = l

    _handle_BYDAY = _handle_BYWEEKDAY

    def _parse_rfc_rrule(self, line,
                         dtstart=None,
                         cache=False,
                         ignoretz=False,
                         tzinfos=None):
        """Parse one ``[RRULE:]NAME=value;...`` line into an rrule."""
        if line.find(':') != -1:
            name, value = line.split(':')
            if name != "RRULE":
                raise ValueError("unknown parameter name")
        else:
            value = line
        rrkwargs = {}
        for pair in value.split(';'):
            name, value = pair.split('=')
            name = name.upper()
            value = value.upper()
            try:
                getattr(self, "_handle_"+name)(rrkwargs, name, value,
                                               ignoretz=ignoretz,
                                               tzinfos=tzinfos)
            except AttributeError:
                raise ValueError("unknown parameter '%s'" % name)
            except (KeyError, ValueError):
                raise ValueError("invalid '%s': %s" % (name, value))
        return rrule(dtstart=dtstart, cache=cache, **rrkwargs)

    def _parse_rfc(self, s,
                   dtstart=None,
                   cache=False,
                   unfold=False,
                   forceset=False,
                   compatible=False,
                   ignoretz=False,
                   tzinfos=None):
        """Parse a recurrence string into an rrule or an rruleset.

        A single bare rule returns an rrule; several rules, any
        RDATE/EXRULE/EXDATE line, or ``forceset`` returns an rruleset.
        ``compatible`` implies ``forceset`` and ``unfold``.
        """
        global parser
        if compatible:
            forceset = True
            unfold = True
        s = s.upper()
        if not s.strip():
            raise ValueError("empty string")
        if unfold:
            # Join continuation lines (those starting with a space) onto
            # the previous line and drop empty lines.
            lines = s.splitlines()
            i = 0
            while i < len(lines):
                line = lines[i].rstrip()
                if not line:
                    del lines[i]
                elif i > 0 and line[0] == " ":
                    lines[i-1] += line[1:]
                    del lines[i]
                else:
                    i += 1
        else:
            lines = s.split()
        if (not forceset and len(lines) == 1 and
            (s.find(':') == -1 or s.startswith('RRULE:'))):
            return self._parse_rfc_rrule(lines[0], cache=cache,
                                         dtstart=dtstart, ignoretz=ignoretz,
                                         tzinfos=tzinfos)
        else:
            rrulevals = []
            rdatevals = []
            exrulevals = []
            exdatevals = []
            for line in lines:
                if not line:
                    continue
                if line.find(':') == -1:
                    name = "RRULE"
                    value = line
                else:
                    name, value = line.split(':', 1)
                parms = name.split(';')
                if not parms:
                    raise ValueError("empty property name")
                name = parms[0]
                parms = parms[1:]
                if name == "RRULE":
                    for parm in parms:
                        raise ValueError("unsupported RRULE parm: "+parm)
                    rrulevals.append(value)
                elif name == "RDATE":
                    for parm in parms:
                        if parm != "VALUE=DATE-TIME":
                            raise ValueError("unsupported RDATE parm: "+parm)
                    rdatevals.append(value)
                elif name == "EXRULE":
                    for parm in parms:
                        raise ValueError("unsupported EXRULE parm: "+parm)
                    exrulevals.append(value)
                elif name == "EXDATE":
                    for parm in parms:
                        if parm != "VALUE=DATE-TIME":
                            raise ValueError("unsupported EXDATE parm: "+parm)
                    exdatevals.append(value)
                elif name == "DTSTART":
                    for parm in parms:
                        raise ValueError("unsupported DTSTART parm: "+parm)
                    if not parser:
                        from dateutil import parser
                    dtstart = parser.parse(value, ignoretz=ignoretz,
                                           tzinfos=tzinfos)
                else:
                    raise ValueError("unsupported property: "+name)
            if (forceset or len(rrulevals) > 1 or
                rdatevals or exrulevals or exdatevals):
                if not parser and (rdatevals or exdatevals):
                    from dateutil import parser
                rset = rruleset(cache=cache)
                for value in rrulevals:
                    rset.rrule(self._parse_rfc_rrule(value, dtstart=dtstart,
                                                     ignoretz=ignoretz,
                                                     tzinfos=tzinfos))
                for value in rdatevals:
                    for datestr in value.split(','):
                        rset.rdate(parser.parse(datestr,
                                                ignoretz=ignoretz,
                                                tzinfos=tzinfos))
                for value in exrulevals:
                    rset.exrule(self._parse_rfc_rrule(value, dtstart=dtstart,
                                                      ignoretz=ignoretz,
                                                      tzinfos=tzinfos))
                for value in exdatevals:
                    for datestr in value.split(','):
                        rset.exdate(parser.parse(datestr,
                                                 ignoretz=ignoretz,
                                                 tzinfos=tzinfos))
                if compatible and dtstart:
                    rset.rdate(dtstart)
                return rset
            else:
                return self._parse_rfc_rrule(rrulevals[0],
                                             dtstart=dtstart,
                                             cache=cache,
                                             ignoretz=ignoretz,
                                             tzinfos=tzinfos)

    def __call__(self, s, **kwargs):
        return self._parse_rfc(s, **kwargs)


rrulestr = _rrulestr()
class tzlocal(datetime.tzinfo):
    """tzinfo describing the local zone as reported by the ``time`` module.

    The standard and daylight offsets are read from ``time.timezone`` and
    ``time.altzone`` once, when this class body is executed.
    """

    _std_offset = datetime.timedelta(seconds=-time.timezone)
    if time.daylight:
        _dst_offset = datetime.timedelta(seconds=-time.altzone)
    else:
        _dst_offset = _std_offset

    def utcoffset(self, dt):
        """Return the daylight or standard UTC offset in effect at *dt*."""
        if self._isdst(dt):
            return self._dst_offset
        else:
            return self._std_offset

    def dst(self, dt):
        """Return the daylight adjustment at *dt* (``ZERO`` outside DST)."""
        if self._isdst(dt):
            return self._dst_offset-self._std_offset
        else:
            return ZERO

    def tzname(self, dt):
        """Return the entry of ``time.tzname`` selected by the DST flag."""
        return time.tzname[self._isdst(dt)]

    def _isdst(self, dt):
        """Return the ``tm_isdst`` flag ``time.localtime`` gives for *dt*."""
        # time.mktime is deliberately not used here: it is unstable when
        # deciding whether an hour next to a transition is DST, e.g.
        # repeated tzname() calls for 2003-02-15 23:00 in a Brazilian zone
        # alternated between 'BRDT' and 'BRST'.
        #
        # Instead, build the timestamp arithmetically from the wall-clock
        # fields and let time.localtime classify it.
        timestamp = ((dt.toordinal() - EPOCHORDINAL) * 86400
                     + dt.hour * 3600
                     + dt.minute * 60
                     + dt.second)
        return time.localtime(timestamp+time.timezone).tm_isdst

    def __eq__(self, other):
        """Two tzlocal objects are equal when both offsets match."""
        if not isinstance(other, tzlocal):
            return False
        return (self._std_offset == other._std_offset and
                self._dst_offset == other._dst_offset)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s()" % self.__class__.__name__

    __reduce__ = object.__reduce__
class tzfile(datetime.tzinfo):
    """tzinfo built from a compiled time zone information ("TZif") file.

    *fileobj* is either a path or an object with a ``read`` method.  A path
    is opened in binary mode and closed again once parsing is finished; a
    caller-supplied file object is left open.
    """

    # http://www.twinsun.com/tz/tz-link.htm
    # ftp://elsie.nci.nih.gov/pub/tz*.tar.gz

    def __init__(self, fileobj):
        opened_here = False
        if isinstance(fileobj, basestring):
            self._filename = fileobj
            # The file is binary data: text mode would translate line
            # endings on platforms that distinguish the two modes.
            fileobj = open(fileobj, "rb")
            opened_here = True
        elif hasattr(fileobj, "name"):
            self._filename = fileobj.name
        else:
            self._filename = repr(fileobj)

        try:
            self._read_tzfile(fileobj)
        finally:
            # Only close what this constructor opened itself.
            if opened_here:
                fileobj.close()

    def _read_tzfile(self, fileobj):
        """Parse the TZif data in *fileobj* and populate this instance.

        Raises ValueError when the "TZif" magic is missing.
        """
        # From tzfile(5):
        #
        # The time zone information files used by tzset(3)
        # begin with the magic characters "TZif" to identify
        # them as time zone information files, followed by
        # sixteen bytes reserved for future use, followed by
        # six four-byte values of type long, written in a
        # ``standard'' byte order (the high-order byte
        # of the value is written first).

        if fileobj.read(4) != "TZif":
            raise ValueError("magic not found")

        fileobj.read(16)

        (
         # The number of UTC/local indicators stored in the file.
         ttisgmtcnt,

         # The number of standard/wall indicators stored in the file.
         ttisstdcnt,

         # The number of leap seconds for which data is
         # stored in the file.
         leapcnt,

         # The number of "transition times" for which data
         # is stored in the file.
         timecnt,

         # The number of "local time types" for which data
         # is stored in the file (must not be zero).
         typecnt,

         # The number of characters of "time zone
         # abbreviation strings" stored in the file.
         charcnt,

        ) = struct.unpack(">6l", fileobj.read(24))

        # The above header is followed by tzh_timecnt four-byte
        # values of type long, sorted in ascending order.
        # These values are written in ``standard'' byte order.
        # Each is used as a transition time (as returned by
        # time(2)) at which the rules for computing local time
        # change.

        if timecnt:
            self._trans_list = struct.unpack(">%dl" % timecnt,
                                             fileobj.read(timecnt*4))
        else:
            self._trans_list = []

        # Next come tzh_timecnt one-byte values of type unsigned
        # char; each one tells which of the different types of
        # ``local time'' types described in the file is associated
        # with the same-indexed transition time. These values
        # serve as indices into an array of ttinfo structures that
        # appears next in the file.

        if timecnt:
            self._trans_idx = struct.unpack(">%dB" % timecnt,
                                            fileobj.read(timecnt))
        else:
            self._trans_idx = []

        # Each ttinfo structure is written as a four-byte value
        # for tt_gmtoff of type long, in a standard byte
        # order, followed by a one-byte value for tt_isdst
        # and a one-byte value for tt_abbrind. In each
        # structure, tt_gmtoff gives the number of
        # seconds to be added to UTC, tt_isdst tells whether
        # tm_isdst should be set by localtime(3), and
        # tt_abbrind serves as an index into the array of
        # time zone abbreviation characters that follow the
        # ttinfo structure(s) in the file.

        ttinfo = []

        for i in range(typecnt):
            ttinfo.append(struct.unpack(">lbb", fileobj.read(6)))

        abbr = fileobj.read(charcnt)

        # Then there are tzh_leapcnt pairs of four-byte
        # values, written in standard byte order; the
        # first value of each pair gives the time (as
        # returned by time(2)) at which a leap second
        # occurs; the second gives the total number of
        # leap seconds to be applied after the given time.
        # The pairs of values are sorted in ascending order
        # by time.

        # Not used, for now; the read is still needed to advance the
        # file position past the leap second table.
        if leapcnt:
            leap = struct.unpack(">%dl" % (leapcnt*2),
                                 fileobj.read(leapcnt*8))

        # Then there are tzh_ttisstdcnt standard/wall
        # indicators, each stored as a one-byte value;
        # they tell whether the transition times associated
        # with local time types were specified as standard
        # time or wall clock time, and are used when
        # a time zone file is used in handling POSIX-style
        # time zone environment variables.

        if ttisstdcnt:
            isstd = struct.unpack(">%db" % ttisstdcnt,
                                  fileobj.read(ttisstdcnt))

        # Finally, there are tzh_ttisgmtcnt UTC/local
        # indicators, each stored as a one-byte value;
        # they tell whether the transition times associated
        # with local time types were specified as UTC or
        # local time, and are used when a time zone file
        # is used in handling POSIX-style time zone envi-
        # ronment variables.

        if ttisgmtcnt:
            isgmt = struct.unpack(">%db" % ttisgmtcnt,
                                  fileobj.read(ttisgmtcnt))

        # ** Everything has been read **

        # Build ttinfo list
        self._ttinfo_list = []
        for i in range(typecnt):
            gmtoff, isdst, abbrind = ttinfo[i]
            # Round to full-minutes if that's not the case. Python's
            # datetime doesn't accept sub-minute timezones. Check
            # http://python.org/sf/1447945 for some information.
            gmtoff = (gmtoff+30)//60*60
            tti = _ttinfo()
            tti.offset = gmtoff
            tti.delta = datetime.timedelta(seconds=gmtoff)
            tti.isdst = isdst
            tti.abbr = abbr[abbrind:abbr.find('\x00', abbrind)]
            # isstd/isgmt are only bound when their counts are non-zero;
            # the count comparison short-circuits before they are read.
            tti.isstd = (ttisstdcnt > i and isstd[i] != 0)
            tti.isgmt = (ttisgmtcnt > i and isgmt[i] != 0)
            self._ttinfo_list.append(tti)

        # Replace ttinfo indexes for ttinfo objects.
        trans_idx = []
        for idx in self._trans_idx:
            trans_idx.append(self._ttinfo_list[idx])
        self._trans_idx = tuple(trans_idx)

        # Set standard, dst, and before ttinfos. before will be
        # used when a given time is before any transitions,
        # and will be set to the first non-dst ttinfo, or to
        # the first dst, if all of them are dst.
        self._ttinfo_std = None
        self._ttinfo_dst = None
        self._ttinfo_before = None
        if self._ttinfo_list:
            if not self._trans_list:
                self._ttinfo_std = self._ttinfo_first = self._ttinfo_list[0]
            else:
                for i in range(timecnt-1, -1, -1):
                    tti = self._trans_idx[i]
                    if not self._ttinfo_std and not tti.isdst:
                        self._ttinfo_std = tti
                    elif not self._ttinfo_dst and tti.isdst:
                        self._ttinfo_dst = tti
                    if self._ttinfo_std and self._ttinfo_dst:
                        break
                else:
                    if self._ttinfo_dst and not self._ttinfo_std:
                        self._ttinfo_std = self._ttinfo_dst

                for tti in self._ttinfo_list:
                    if not tti.isdst:
                        self._ttinfo_before = tti
                        break
                else:
                    self._ttinfo_before = self._ttinfo_list[0]

        # Now fix transition times to become relative to wall time.
        #
        # I'm not sure about this. In my tests, the tz source file
        # is setup to wall time, and in the binary file isstd and
        # isgmt are off, so it should be in wall time. OTOH, it's
        # always in gmt time. Let me know if you have comments
        # about this.
        laststdoffset = 0
        self._trans_list = list(self._trans_list)
        for i in range(len(self._trans_list)):
            tti = self._trans_idx[i]
            if not tti.isdst:
                # This is std time.
                self._trans_list[i] += tti.offset
                laststdoffset = tti.offset
            else:
                # This is dst time. Convert to std.
                self._trans_list[i] += laststdoffset
        self._trans_list = tuple(self._trans_list)

    def _find_ttinfo(self, dt, laststd=0):
        """Return the ttinfo in effect at *dt*.

        With *laststd* true, return the most recent non-DST ttinfo at or
        before that point instead.
        """
        timestamp = ((dt.toordinal() - EPOCHORDINAL) * 86400
                     + dt.hour * 3600
                     + dt.minute * 60
                     + dt.second)
        idx = 0
        for trans in self._trans_list:
            if timestamp < trans:
                break
            idx += 1
        else:
            # Past the last transition (or no transitions at all).
            return self._ttinfo_std
        if idx == 0:
            return self._ttinfo_before
        if laststd:
            while idx > 0:
                tti = self._trans_idx[idx-1]
                if not tti.isdst:
                    return tti
                idx -= 1
            else:
                return self._ttinfo_std
        else:
            return self._trans_idx[idx-1]

    def utcoffset(self, dt):
        if not self._ttinfo_std:
            return ZERO
        return self._find_ttinfo(dt).delta

    def dst(self, dt):
        if not self._ttinfo_dst:
            return ZERO
        tti = self._find_ttinfo(dt)
        if not tti.isdst:
            return ZERO

        # The documentation says that utcoffset()-dst() must
        # be constant for every dt.
        return tti.delta-self._find_ttinfo(dt, laststd=1).delta

        # An alternative for that would be:
        #
        # return self._ttinfo_dst.offset-self._ttinfo_std.offset
        #
        # However, this class stores historical changes in the
        # dst offset, so I believe that this wouldn't be the right
        # way to implement this.

    def tzname(self, dt):
        if not self._ttinfo_std:
            return None
        return self._find_ttinfo(dt).abbr

    def __eq__(self, other):
        if not isinstance(other, tzfile):
            return False
        return (self._trans_list == other._trans_list and
                self._trans_idx == other._trans_idx and
                self._ttinfo_list == other._ttinfo_list)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, repr(self._filename))

    def __reduce__(self):
        """Pickle by file name; raises ValueError if that file is gone."""
        if not os.path.isfile(self._filename):
            raise ValueError("Unpickable %s class" % self.__class__.__name__)
        return (self.__class__, (self._filename,))
self._start_delta: + return False + year = datetime.datetime(dt.year,1,1) + start = year+self._start_delta + end = year+self._end_delta + dt = dt.replace(tzinfo=None) + if start < end: + return dt >= start and dt < end + else: + return dt >= start or dt < end + + def __eq__(self, other): + if not isinstance(other, tzrange): + return False + return (self._std_abbr == other._std_abbr and + self._dst_abbr == other._dst_abbr and + self._std_offset == other._std_offset and + self._dst_offset == other._dst_offset and + self._start_delta == other._start_delta and + self._end_delta == other._end_delta) + + def __ne__(self, other): + return not self.__eq__(other) + + def __repr__(self): + return "%s(...)" % self.__class__.__name__ + + __reduce__ = object.__reduce__ + +class tzstr(tzrange): + + def __init__(self, s): + global parser + if not parser: + from dateutil import parser + self._s = s + + res = parser._parsetz(s) + if res is None: + raise ValueError, "unknown string format" + + # Here we break the compatibility with the TZ variable handling. + # GMT-3 actually *means* the timezone -3. + if res.stdabbr in ("GMT", "UTC"): + res.stdoffset *= -1 + + # We must initialize it first, since _delta() needs + # _std_offset and _dst_offset set. Use False in start/end + # to avoid building it two times. 
+ tzrange.__init__(self, res.stdabbr, res.stdoffset, + res.dstabbr, res.dstoffset, + start=False, end=False) + + if not res.dstabbr: + self._start_delta = None + self._end_delta = None + else: + self._start_delta = self._delta(res.start) + if self._start_delta: + self._end_delta = self._delta(res.end, isend=1) + + def _delta(self, x, isend=0): + kwargs = {} + if x.month is not None: + kwargs["month"] = x.month + if x.weekday is not None: + kwargs["weekday"] = relativedelta.weekday(x.weekday, x.week) + if x.week > 0: + kwargs["day"] = 1 + else: + kwargs["day"] = 31 + elif x.day: + kwargs["day"] = x.day + elif x.yday is not None: + kwargs["yearday"] = x.yday + elif x.jyday is not None: + kwargs["nlyearday"] = x.jyday + if not kwargs: + # Default is to start on first sunday of april, and end + # on last sunday of october. + if not isend: + kwargs["month"] = 4 + kwargs["day"] = 1 + kwargs["weekday"] = relativedelta.SU(+1) + else: + kwargs["month"] = 10 + kwargs["day"] = 31 + kwargs["weekday"] = relativedelta.SU(-1) + if x.time is not None: + kwargs["seconds"] = x.time + else: + # Default is 2AM. + kwargs["seconds"] = 7200 + if isend: + # Convert to standard time, to follow the documented way + # of working with the extra hour. See the documentation + # of the tzinfo class. 
class _tzicalvtz(datetime.tzinfo):
    """tzinfo backed by the components of one iCalendar VTIMEZONE.

    Each item of *comps* must provide ``isdst``, ``tzname``, ``tzoffsetto``,
    ``tzoffsetdiff`` and an ``rrule`` object with a ``before(dt, inc=...)``
    method; these are the only attributes this class reads.
    """

    def __init__(self, tzid, comps=None):
        self._tzid = tzid
        # A fresh list per instance instead of a shared mutable default.
        if comps is None:
            comps = []
        self._comps = comps
        # Small most-recent-first cache: _cachedate[i] maps to _cachecomp[i].
        self._cachedate = []
        self._cachecomp = []

    def _find_comp(self, dt):
        """Return the component whose latest onset at or before *dt* wins."""
        if len(self._comps) == 1:
            return self._comps[0]
        dt = dt.replace(tzinfo=None)
        try:
            return self._cachecomp[self._cachedate.index(dt)]
        except ValueError:
            pass
        lastcomp = None
        lastcompdt = None
        for comp in self._comps:
            if not comp.isdst:
                # Handle the extra hour in DST -> STD
                compdt = comp.rrule.before(dt-comp.tzoffsetdiff, inc=True)
            else:
                compdt = comp.rrule.before(dt, inc=True)
            if compdt and (not lastcompdt or lastcompdt < compdt):
                lastcompdt = compdt
                lastcomp = comp
        if not lastcomp:
            # RFC says nothing about what to do when a given
            # time is before the first onset date. We'll look for the
            # first standard component, or the first component, if
            # none is found.
            for comp in self._comps:
                if not comp.isdst:
                    lastcomp = comp
                    break
            else:
                # Index the component list, not the loop variable (which
                # is a single component and is not subscriptable).
                lastcomp = self._comps[0]
        self._cachedate.insert(0, dt)
        self._cachecomp.insert(0, lastcomp)
        # Keep at most ten cached lookups.
        if len(self._cachedate) > 10:
            self._cachedate.pop()
            self._cachecomp.pop()
        return lastcomp

    def utcoffset(self, dt):
        return self._find_comp(dt).tzoffsetto

    def dst(self, dt):
        comp = self._find_comp(dt)
        if comp.isdst:
            return comp.tzoffsetdiff
        else:
            return datetime.timedelta(0)

    def tzname(self, dt):
        return self._find_comp(dt).tzname

    def __repr__(self):
        # The format string needs a placeholder for the tzid; without one
        # the % operator raises TypeError.
        return "<tzicalvtz %s>" % repr(self._tzid)

    __reduce__ = object.__reduce__
line in lines: + if not line: + continue + name, value = line.split(':', 1) + parms = name.split(';') + if not parms: + raise ValueError, "empty property name" + name = parms[0].upper() + parms = parms[1:] + if invtz: + if name == "BEGIN": + if value in ("STANDARD", "DAYLIGHT"): + # Process component + pass + else: + raise ValueError, "unknown component: "+value + comptype = value + founddtstart = False + tzoffsetfrom = None + tzoffsetto = None + rrulelines = [] + tzname = None + elif name == "END": + if value == "VTIMEZONE": + if comptype: + raise ValueError, \ + "component not closed: "+comptype + if not tzid: + raise ValueError, \ + "mandatory TZID not found" + if not comps: + raise ValueError, \ + "at least one component is needed" + # Process vtimezone + self._vtz[tzid] = _tzicalvtz(tzid, comps) + invtz = False + elif value == comptype: + if not founddtstart: + raise ValueError, \ + "mandatory DTSTART not found" + if tzoffsetfrom is None: + raise ValueError, \ + "mandatory TZOFFSETFROM not found" + if tzoffsetto is None: + raise ValueError, \ + "mandatory TZOFFSETFROM not found" + # Process component + rr = None + if rrulelines: + rr = rrule.rrulestr("\n".join(rrulelines), + compatible=True, + ignoretz=True, + cache=True) + comp = _tzicalvtzcomp(tzoffsetfrom, tzoffsetto, + (comptype == "DAYLIGHT"), + tzname, rr) + comps.append(comp) + comptype = None + else: + raise ValueError, \ + "invalid component end: "+value + elif comptype: + if name == "DTSTART": + rrulelines.append(line) + founddtstart = True + elif name in ("RRULE", "RDATE", "EXRULE", "EXDATE"): + rrulelines.append(line) + elif name == "TZOFFSETFROM": + if parms: + raise ValueError, \ + "unsupported %s parm: %s "%(name, parms[0]) + tzoffsetfrom = self._parse_offset(value) + elif name == "TZOFFSETTO": + if parms: + raise ValueError, \ + "unsupported TZOFFSETTO parm: "+parms[0] + tzoffsetto = self._parse_offset(value) + elif name == "TZNAME": + if parms: + raise ValueError, \ + "unsupported TZNAME parm: 
"+parms[0] + tzname = value + elif name == "COMMENT": + pass + else: + raise ValueError, "unsupported property: "+name + else: + if name == "TZID": + if parms: + raise ValueError, \ + "unsupported TZID parm: "+parms[0] + tzid = value + elif name in ("TZURL", "LAST-MODIFIED", "COMMENT"): + pass + else: + raise ValueError, "unsupported property: "+name + elif name == "BEGIN" and value == "VTIMEZONE": + tzid = None + comps = [] + invtz = True + + def __repr__(self): + return "%s(%s)" % (self.__class__.__name__, `self._s`) + +if sys.platform != "win32": + TZFILES = ["/etc/localtime", "localtime"] + TZPATHS = ["/usr/share/zoneinfo", "/usr/lib/zoneinfo", "/etc/zoneinfo"] +else: + TZFILES = [] + TZPATHS = [] + +def gettz(name=None): + tz = None + if not name: + try: + name = os.environ["TZ"] + except KeyError: + pass + if name is None or name == ":": + for filepath in TZFILES: + if not os.path.isabs(filepath): + filename = filepath + for path in TZPATHS: + filepath = os.path.join(path, filename) + if os.path.isfile(filepath): + break + else: + continue + if os.path.isfile(filepath): + try: + tz = tzfile(filepath) + break + except (IOError, OSError, ValueError): + pass + else: + tz = tzlocal() + else: + if name.startswith(":"): + name = name[:-1] + if os.path.isabs(name): + if os.path.isfile(name): + tz = tzfile(name) + else: + tz = None + else: + for path in TZPATHS: + filepath = os.path.join(path, name) + if not os.path.isfile(filepath): + filepath = filepath.replace(' ','_') + if not os.path.isfile(filepath): + continue + try: + tz = tzfile(filepath) + break + except (IOError, OSError, ValueError): + pass + else: + tz = None + if tzwin: + try: + tz = tzwin(name) + except OSError: + pass + if not tz: + from dateutil.zoneinfo import gettz + tz = gettz(name) + if not tz: + for c in name: + # name must have at least one offset to be a tzstr + if c in "0123456789": + try: + tz = tzstr(name) + except ValueError: + pass + break + else: + if name in ("GMT", "UTC"): + tz = 
tzutc() + elif name in time.tzname: + tz = tzlocal() + return tz + +# vim:ts=4:sw=4:et diff --git a/libs/dateutil/tzwin.py b/libs/dateutil/tzwin.py new file mode 100644 index 00000000..073e0ff6 --- /dev/null +++ b/libs/dateutil/tzwin.py @@ -0,0 +1,180 @@ +# This code was originally contributed by Jeffrey Harris. +import datetime +import struct +import _winreg + +__author__ = "Jeffrey Harris & Gustavo Niemeyer " + +__all__ = ["tzwin", "tzwinlocal"] + +ONEWEEK = datetime.timedelta(7) + +TZKEYNAMENT = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones" +TZKEYNAME9X = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Time Zones" +TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation" + +def _settzkeyname(): + global TZKEYNAME + handle = _winreg.ConnectRegistry(None, _winreg.HKEY_LOCAL_MACHINE) + try: + _winreg.OpenKey(handle, TZKEYNAMENT).Close() + TZKEYNAME = TZKEYNAMENT + except WindowsError: + TZKEYNAME = TZKEYNAME9X + handle.Close() + +_settzkeyname() + +class tzwinbase(datetime.tzinfo): + """tzinfo class based on win32's timezones available in the registry.""" + + def utcoffset(self, dt): + if self._isdst(dt): + return datetime.timedelta(minutes=self._dstoffset) + else: + return datetime.timedelta(minutes=self._stdoffset) + + def dst(self, dt): + if self._isdst(dt): + minutes = self._dstoffset - self._stdoffset + return datetime.timedelta(minutes=minutes) + else: + return datetime.timedelta(0) + + def tzname(self, dt): + if self._isdst(dt): + return self._dstname + else: + return self._stdname + + def list(): + """Return a list of all time zones known to the system.""" + handle = _winreg.ConnectRegistry(None, _winreg.HKEY_LOCAL_MACHINE) + tzkey = _winreg.OpenKey(handle, TZKEYNAME) + result = [_winreg.EnumKey(tzkey, i) + for i in range(_winreg.QueryInfoKey(tzkey)[0])] + tzkey.Close() + handle.Close() + return result + list = staticmethod(list) + + def display(self): + return self._display + + def _isdst(self, dt): + dston = 
class tzwin(tzwinbase):
    """Time zone read from the registry subkey *name* under TZKEYNAME."""

    def __init__(self, name):
        self._name = name

        handle = _winreg.ConnectRegistry(None, _winreg.HKEY_LOCAL_MACHINE)
        try:
            # Raw string: the backslash is the registry path separator.
            tzkey = _winreg.OpenKey(handle, r"%s\%s" % (TZKEYNAME, name))
            try:
                keydict = valuestodict(tzkey)
            finally:
                tzkey.Close()
        finally:
            # Release the registry connection even when the key is missing
            # or cannot be read.
            handle.Close()

        self._stdname = keydict["Std"].encode("iso-8859-1")
        self._dstname = keydict["Dlt"].encode("iso-8859-1")

        self._display = keydict["Display"]

        # See http://ww_winreg.jsiinc.com/SUBA/tip0300/rh0398.htm
        tup = struct.unpack("=3l16h", keydict["TZI"])
        self._stdoffset = -tup[0]-tup[1]          # Bias + StandardBias * -1
        self._dstoffset = self._stdoffset-tup[2]  # + DaylightBias * -1

        (self._stdmonth,
         self._stddayofweek,   # Sunday = 0
         self._stdweeknumber,  # Last = 5
         self._stdhour,
         self._stdminute) = tup[4:9]

        (self._dstmonth,
         self._dstdayofweek,   # Sunday = 0
         self._dstweeknumber,  # Last = 5
         self._dsthour,
         self._dstminute) = tup[12:17]

    def __repr__(self):
        return "tzwin(%s)" % repr(self._name)

    def __reduce__(self):
        return (self.__class__, (self._name,))
OSError: + self._display = None + + handle.Close() + + self._stdoffset = -keydict["Bias"]-keydict["StandardBias"] + self._dstoffset = self._stdoffset-keydict["DaylightBias"] + + + # See http://ww_winreg.jsiinc.com/SUBA/tip0300/rh0398.htm + tup = struct.unpack("=8h", keydict["StandardStart"]) + + (self._stdmonth, + self._stddayofweek, # Sunday = 0 + self._stdweeknumber, # Last = 5 + self._stdhour, + self._stdminute) = tup[1:6] + + tup = struct.unpack("=8h", keydict["DaylightStart"]) + + (self._dstmonth, + self._dstdayofweek, # Sunday = 0 + self._dstweeknumber, # Last = 5 + self._dsthour, + self._dstminute) = tup[1:6] + + def __reduce__(self): + return (self.__class__, ()) + +def picknthweekday(year, month, dayofweek, hour, minute, whichweek): + """dayofweek == 0 means Sunday, whichweek 5 means last instance""" + first = datetime.datetime(year, month, 1, hour, minute) + weekdayone = first.replace(day=((dayofweek-first.isoweekday())%7+1)) + for n in xrange(whichweek): + dt = weekdayone+(whichweek-n)*ONEWEEK + if dt.month == month: + return dt + +def valuestodict(key): + """Convert a registry key's values to a dictionary.""" + dict = {} + size = _winreg.QueryInfoKey(key)[1] + for i in range(size): + data = _winreg.EnumValue(key, i) + dict[data[0]] = data[1] + return dict diff --git a/libs/dateutil/zoneinfo/__init__.py b/libs/dateutil/zoneinfo/__init__.py new file mode 100644 index 00000000..9bed6264 --- /dev/null +++ b/libs/dateutil/zoneinfo/__init__.py @@ -0,0 +1,87 @@ +""" +Copyright (c) 2003-2005 Gustavo Niemeyer + +This module offers extensions to the standard python 2.3+ +datetime module. 
+""" +from dateutil.tz import tzfile +from tarfile import TarFile +import os + +__author__ = "Gustavo Niemeyer " +__license__ = "PSF License" + +__all__ = ["setcachesize", "gettz", "rebuild"] + +CACHE = [] +CACHESIZE = 10 + +class tzfile(tzfile): + def __reduce__(self): + return (gettz, (self._filename,)) + +def getzoneinfofile(): + filenames = os.listdir(os.path.join(os.path.dirname(__file__))) + filenames.sort() + filenames.reverse() + for entry in filenames: + if entry.startswith("zoneinfo") and ".tar." in entry: + return os.path.join(os.path.dirname(__file__), entry) + return None + +ZONEINFOFILE = getzoneinfofile() + +del getzoneinfofile + +def setcachesize(size): + global CACHESIZE, CACHE + CACHESIZE = size + del CACHE[size:] + +def gettz(name): + tzinfo = None + if ZONEINFOFILE: + for cachedname, tzinfo in CACHE: + if cachedname == name: + break + else: + tf = TarFile.open(ZONEINFOFILE) + try: + zonefile = tf.extractfile(name) + except KeyError: + tzinfo = None + else: + tzinfo = tzfile(zonefile) + tf.close() + CACHE.insert(0, (name, tzinfo)) + del CACHE[CACHESIZE:] + return tzinfo + +def rebuild(filename, tag=None, format="gz"): + import tempfile, shutil + tmpdir = tempfile.mkdtemp() + zonedir = os.path.join(tmpdir, "zoneinfo") + moduledir = os.path.dirname(__file__) + if tag: tag = "-"+tag + targetname = "zoneinfo%s.tar.%s" % (tag, format) + try: + tf = TarFile.open(filename) + for name in tf.getnames(): + if not (name.endswith(".sh") or + name.endswith(".tab") or + name == "leapseconds"): + tf.extract(name, tmpdir) + filepath = os.path.join(tmpdir, name) + os.system("zic -d %s %s" % (zonedir, filepath)) + tf.close() + target = os.path.join(moduledir, targetname) + for entry in os.listdir(moduledir): + if entry.startswith("zoneinfo") and ".tar." 
in entry: + os.unlink(os.path.join(moduledir, entry)) + tf = TarFile.open(target, "w:%s" % format) + for entry in os.listdir(zonedir): + entrypath = os.path.join(zonedir, entry) + tf.add(entrypath, entry) + tf.close() + finally: + shutil.rmtree(tmpdir) diff --git a/libs/dateutil/zoneinfo/zoneinfo-2010g.tar.gz b/libs/dateutil/zoneinfo/zoneinfo-2010g.tar.gz new file mode 100644 index 00000000..8bd4f964 Binary files /dev/null and b/libs/dateutil/zoneinfo/zoneinfo-2010g.tar.gz differ diff --git a/libs/guessit/__init__.py b/libs/guessit/__init__.py index 66bcb3d4..8b5c841b 100644 --- a/libs/guessit/__init__.py +++ b/libs/guessit/__init__.py @@ -1,359 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# +""" +Extracts as much information as possible from a video file. 
+""" +from .api import guessit, GuessItApi -from __future__ import absolute_import, division, print_function, unicode_literals - -import pkg_resources from .__version__ import __version__ - -__all__ = ['Guess', 'Language', - 'guess_file_info', 'guess_video_info', - 'guess_movie_info', 'guess_episode_info', - 'default_options'] - - -# Do python3 detection before importing any other module, to be sure that -# it will then always be available -# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/ -import sys -if sys.version_info[0] >= 3: # pragma: no cover - PY2, PY3 = False, True - unicode_text_type = str - native_text_type = str - base_text_type = str - - def u(x): - return str(x) - - def s(x): - return x - - class UnicodeMixin(object): - __str__ = lambda x: x.__unicode__() - import binascii - - def to_hex(x): - return binascii.hexlify(x).decode('utf-8') - -else: # pragma: no cover - PY2, PY3 = True, False - __all__ = [str(s) for s in __all__] # fix imports for python2 - unicode_text_type = unicode - native_text_type = str - base_text_type = basestring - - def u(x): - if isinstance(x, str): - return x.decode('utf-8') - if isinstance(x, list): - return [u(s) for s in x] - return unicode(x) - - def s(x): - if isinstance(x, unicode): - return x.encode('utf-8') - if isinstance(x, list): - return [s(y) for y in x] - if isinstance(x, tuple): - return tuple(s(y) for y in x) - if isinstance(x, dict): - return dict((s(key), s(value)) for key, value in x.items()) - return x - - class UnicodeMixin(object): - __str__ = lambda x: unicode(x).encode('utf-8') - - def to_hex(x): - return x.encode('hex') - - range = xrange - - -from guessit.guess import Guess, smart_merge -from guessit.language import Language -from guessit.matcher import IterativeMatcher -from guessit.textutils import clean_default, is_camel, from_camel -import babelfish -import os.path -import logging -from copy import deepcopy - -log = logging.getLogger(__name__) - - -class 
NullHandler(logging.Handler): - def emit(self, record): - pass - -# let's be a nicely behaving library -h = NullHandler() -log.addHandler(h) - - -def _guess_filename(filename, options=None, **kwargs): - mtree = _build_filename_mtree(filename, options=options, **kwargs) - if options.get('split_camel'): - _add_camel_properties(mtree, options=options) - return mtree.matched() - - -def _build_filename_mtree(filename, options=None, **kwargs): - mtree = IterativeMatcher(filename, options=options, **kwargs) - second_pass_options = mtree.second_pass_options - if second_pass_options: - log.debug("Running 2nd pass") - merged_options = dict(options) - merged_options.update(second_pass_options) - mtree = IterativeMatcher(filename, options=merged_options, **kwargs) - return mtree - - -def _add_camel_properties(mtree, options=None, **kwargs): - prop = 'title' if mtree.matched().get('type') != 'episode' else 'series' - value = mtree.matched().get(prop) - _guess_camel_string(mtree, value, options=options, skip_title=False, **kwargs) - - for leaf in mtree.match_tree.unidentified_leaves(): - value = leaf.value - _guess_camel_string(mtree, value, options=options, skip_title=True, **kwargs) - - -def _guess_camel_string(mtree, string, options=None, skip_title=False, **kwargs): - if string and is_camel(string): - log.debug('"%s" is camel cased. Try to detect more properties.' % (string,)) - uncameled_value = from_camel(string) - merged_options = dict(options) - if 'type' in mtree.match_tree.info: - current_type = mtree.match_tree.info.get('type') - if current_type and current_type != 'unknown': - merged_options['type'] = current_type - camel_tree = _build_filename_mtree(uncameled_value, options=merged_options, name_only=True, skip_title=skip_title, **kwargs) - if len(camel_tree.matched()) > 0: - mtree.matched().update(camel_tree.matched()) - return True - return False - - -def guess_video_metadata(filename): - """Gets the video metadata properties out of a given file. 
The file needs to - exist on the filesystem to be able to be analyzed. An empty guess is - returned otherwise. - - You need to have the Enzyme python package installed for this to work.""" - result = Guess() - - def found(prop, value): - result[prop] = value - log.debug('Found with enzyme %s: %s' % (prop, value)) - - # first get the size of the file, in bytes - try: - size = os.stat(filename).st_size - found('fileSize', size) - - except Exception as e: - log.error('Cannot get video file size: %s' % e) - # file probably does not exist, we might as well return now - return result - - # then get additional metadata from the file using enzyme, if available - try: - import enzyme - - with open(filename) as f: - mkv = enzyme.MKV(f) - - found('duration', mkv.info.duration.total_seconds()) - - if mkv.video_tracks: - video_track = mkv.video_tracks[0] - - # resolution - if video_track.height in (480, 720, 1080): - if video_track.interlaced: - found('screenSize', '%di' % video_track.height) - else: - found('screenSize', '%dp' % video_track.height) - else: - # TODO: do we want this? 
- #found('screenSize', '%dx%d' % (video_track.width, video_track.height)) - pass - - # video codec - if video_track.codec_id == 'V_MPEG4/ISO/AVC': - found('videoCodec', 'h264') - elif video_track.codec_id == 'V_MPEG4/ISO/SP': - found('videoCodec', 'DivX') - elif video_track.codec_id == 'V_MPEG4/ISO/ASP': - found('videoCodec', 'XviD') - - else: - log.warning('MKV has no video track') - - if mkv.audio_tracks: - audio_track = mkv.audio_tracks[0] - # audio codec - if audio_track.codec_id == 'A_AC3': - found('audioCodec', 'AC3') - elif audio_track.codec_id == 'A_DTS': - found('audioCodec', 'DTS') - elif audio_track.codec_id == 'A_AAC': - found('audioCodec', 'AAC') - else: - log.warning('MKV has no audio track') - - if mkv.subtitle_tracks: - embedded_subtitle_languages = set() - for st in mkv.subtitle_tracks: - try: - if st.language: - lang = babelfish.Language.fromalpha3b(st.language) - elif st.name: - lang = babelfish.Language.fromname(st.name) - else: - lang = babelfish.Language('und') - - except babelfish.Error: - lang = babelfish.Language('und') - - embedded_subtitle_languages.add(lang) - - found('subtitleLanguage', embedded_subtitle_languages) - else: - log.debug('MKV has no subtitle track') - - return result - - except ImportError: - log.error('Cannot get video file metadata, missing dependency: enzyme') - log.error('Please install it from PyPI, by doing eg: pip install enzyme') - return result - - except IOError as e: - log.error('Could not open file: %s' % filename) - log.error('Make sure it exists and is available for reading on the filesystem') - log.error('Error: %s' % e) - return result - - except enzyme.Error as e: - log.error('Cannot guess video file metadata') - log.error('enzyme.Error while reading file: %s' % filename) - log.error('Error: %s' % e) - return result - -default_options = {} - - -def guess_file_info(filename, info=None, options=None, **kwargs): - """info can contain the names of the various plugins, such as 'filename' to - detect filename 
info, or 'hash_md5' to get the md5 hash of the file. - - >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') - >>> g = guess_file_info(testfile, info = ['hash_md5', 'hash_sha1']) - >>> g['hash_md5'], g['hash_sha1'] - ('64de6b5893cac24456c46a935ef9c359', 'a703fc0fa4518080505809bf562c6fc6f7b3c98c') - """ - info = info or 'filename' - options = options or {} - if default_options: - merged_options = deepcopy(default_options) - merged_options.update(options) - options = merged_options - - result = [] - hashers = [] - - # Force unicode as soon as possible - filename = u(filename) - - if isinstance(info, base_text_type): - info = [info] - - for infotype in info: - if infotype == 'filename': - result.append(_guess_filename(filename, options, **kwargs)) - - elif infotype == 'hash_mpc': - from guessit.hash_mpc import hash_file - try: - result.append(Guess({infotype: hash_file(filename)}, - confidence=1.0)) - except Exception as e: - log.warning('Could not compute MPC-style hash because: %s' % e) - - elif infotype == 'hash_ed2k': - from guessit.hash_ed2k import hash_file - try: - result.append(Guess({infotype: hash_file(filename)}, - confidence=1.0)) - except Exception as e: - log.warning('Could not compute ed2k hash because: %s' % e) - - elif infotype.startswith('hash_'): - import hashlib - hashname = infotype[5:] - try: - hasher = getattr(hashlib, hashname)() - hashers.append((infotype, hasher)) - except AttributeError: - log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname) - - elif infotype == 'video': - g = guess_video_metadata(filename) - if g: - result.append(g) - - else: - log.warning('Invalid infotype: %s' % infotype) - - # do all the hashes now, but on a single pass - if hashers: - try: - blocksize = 8192 - hasherobjs = dict(hashers).values() - - with open(filename, 'rb') as f: - chunk = f.read(blocksize) - while chunk: - for hasher in hasherobjs: - hasher.update(chunk) - chunk = 
f.read(blocksize) - - for infotype, hasher in hashers: - result.append(Guess({infotype: hasher.hexdigest()}, - confidence=1.0)) - except Exception as e: - log.warning('Could not compute hash because: %s' % e) - - result = smart_merge(result) - - return result - - -def guess_video_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='video', **kwargs) - - -def guess_movie_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='movie', **kwargs) - - -def guess_episode_info(filename, info=None, options=None, **kwargs): - return guess_file_info(filename, info=info, options=options, type='episode', **kwargs) diff --git a/libs/guessit/__main__.py b/libs/guessit/__main__.py index 759c380b..b2b95cfc 100644 --- a/libs/guessit/__main__.py +++ b/libs/guessit/__main__.py @@ -1,58 +1,48 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# +""" +Entry point module +""" +# pragma: no cover +from __future__ import print_function -from __future__ import absolute_import, division, print_function, unicode_literals -from collections import defaultdict +import json import logging import os +import sys -from guessit import PY2, u, guess_file_info, __version__ -from guessit.options import get_opts +import six +from guessit import api from guessit.__version__ import __version__ +from guessit.jsonutils import GuessitEncoder +from guessit.options import argument_parser +from rebulk.__version__ import __version__ as __rebulk_version__ -def guess_file(filename, info='filename', options=None, **kwargs): - options = options or {} - filename = u(filename) - - if not options.get('yaml') and not options.get('show_property'): +def guess_filename(filename, options): + """ + Guess a single filename using given options + """ + if not options.yaml and not options.json and not options.show_property: print('For:', filename) - guess = guess_file_info(filename, info, options, **kwargs) - if not options.get('unidentified'): - try: - del guess['unidentified'] - except KeyError: - pass + cmd_options = vars(options) + cmd_options['implicit'] = True # Force implicit option in CLI - if options.get('show_property'): - print(guess.get(options.get('show_property'), '')) + guess = api.guessit(filename, vars(options)) + + if options.show_property: + print(guess.get(options.show_property, '')) return - if options.get('yaml'): + if options.json: + print(json.dumps(guess, cls=GuessitEncoder, ensure_ascii=False)) + elif options.yaml: import yaml - for k, v in guess.items(): - if isinstance(v, list) and len(v) == 1: - guess[k] = v[0] - ystr = yaml.safe_dump({filename: dict(guess)}, default_flow_style=False) + from guessit import yamlutils + + ystr = yaml.dump({filename: dict(guess)}, Dumper=yamlutils.CustomDumper, default_flow_style=False, + allow_unicode=True) i = 0 for yline in ystr.splitlines(): if i == 0: @@ -62,222 +52,108 @@ def 
guess_file(filename, info='filename', options=None, **kwargs): else: print(yline) i += 1 - return - print('GuessIt found:', guess.nice_string(options.get('advanced'))) - - -def _supported_properties(): - all_properties = defaultdict(list) - transformers_properties = [] - - from guessit.plugins import transformers - for transformer in transformers.all_transformers(): - supported_properties = transformer.supported_properties() - transformers_properties.append((transformer, supported_properties)) - - if isinstance(supported_properties, dict): - for property_name, possible_values in supported_properties.items(): - all_properties[property_name].extend(possible_values) - else: - for property_name in supported_properties: - all_properties[property_name] # just make sure it exists - - return all_properties, transformers_properties - - -def display_transformers(): - print('GuessIt transformers:') - _, transformers_properties = _supported_properties() - for transformer, _ in transformers_properties: - print('[@] %s (%s)' % (transformer.name, transformer.priority)) + else: + print('GuessIt found:', json.dumps(guess, cls=GuessitEncoder, indent=4, ensure_ascii=False)) def display_properties(options): - values = options.values - transformers = options.transformers - name_only = options.name_only + """ + Display properties + """ + properties = api.properties(options) - print('GuessIt properties:') - all_properties, transformers_properties = _supported_properties() - if name_only: - # the 'container' property does not apply when using the --name-only - # option - del all_properties['container'] - - if transformers: - for transformer, properties_list in transformers_properties: - print('[@] %s (%s)' % (transformer.name, transformer.priority)) - for property_name in properties_list: - property_values = all_properties.get(property_name) - print(' [+] %s' % (property_name,)) - if property_values and values: - _display_property_values(property_name, indent=4) - else: - properties_list 
= sorted(all_properties.keys()) - for property_name in properties_list: - property_values = all_properties.get(property_name) - print(' [+] %s' % (property_name,)) - if property_values and values: - _display_property_values(property_name, indent=4) - - -def _display_property_values(property_name, indent=2): - all_properties, _ = _supported_properties() - property_values = all_properties.get(property_name) - for property_value in property_values: - print(indent * ' ' + '[!] %s' % (property_value,)) - - -def run_demo(episodes=True, movies=True, options=None): - # NOTE: tests should not be added here but rather in the tests/ folder - # this is just intended as a quick example - if episodes: - testeps = ['Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi', - 'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi', - 'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi', - 'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi', - 'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi', - 'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg', - 'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi', - 'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi', - 'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi'] - - for f in testeps: - print('-' * 80) - guess_file(f, options=options, type='episode') - - if movies: - testmovies = ['Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv', - 'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi', - 'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi', - 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv', - 'Movies/Sin City (BluRay) 
(2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv', - 'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', - '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', - 'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi', - 'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt', - 'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv', - 'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv', - 'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi', - 'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi', - 'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi', - 'Movies/Juno (2007)/Juno KLAXXON.avi', - 'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv', - 'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt', - 'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi', - 'testsmewt_bugs/movies/Baraka_Edition_Collector.avi' - ] - - for f in testmovies: - print('-' * 80) - guess_file(f, options=options, type='movie') - - -def submit_bug(filename, options): - import requests # only import when needed - from requests.exceptions import RequestException - - try: - opts = dict((k, v) for k, v in options.__dict__.items() - if v and k != 'submit_bug') - - r = requests.post('http://localhost:5000/bugs', {'filename': filename, - 'version': __version__, - 'options': str(opts)}) - if r.status_code == 200: - print('Successfully submitted file: %s' % r.text) + if options.json: + if options.values: + print(json.dumps(properties, cls=GuessitEncoder, ensure_ascii=False)) else: - print('Could not submit bug at the moment, please try again 
later.') + print(json.dumps(list(properties.keys()), cls=GuessitEncoder, ensure_ascii=False)) + elif options.yaml: + import yaml + from guessit import yamlutils + if options.values: + print(yaml.dump(properties, Dumper=yamlutils.CustomDumper, default_flow_style=False, allow_unicode=True)) + else: + print(yaml.dump(list(properties.keys()), Dumper=yamlutils.CustomDumper, default_flow_style=False, + allow_unicode=True)) + else: + print('GuessIt properties:') - except RequestException as e: - print('Could not submit bug at the moment, please try again later.') + properties_list = list(sorted(properties.keys())) + for property_name in properties_list: + property_values = properties.get(property_name) + print(2 * ' ' + '[+] %s' % (property_name,)) + if property_values and options.values: + for property_value in property_values: + print(4 * ' ' + '[!] %s' % (property_value,)) -def main(args=None, setup_logging=True): - if setup_logging: - from guessit import slogging - slogging.setup_logging() - - if PY2: # pragma: no cover - import codecs - import locale - import sys - +def main(args=None): # pylint:disable=too-many-branches + """ + Main function for entry point + """ + if six.PY2 and os.name == 'nt': # pragma: no cover # see http://bugs.python.org/issue2128 - if os.name == 'nt': - for i, a in enumerate(sys.argv): - sys.argv[i] = a.decode(locale.getpreferredencoding()) + import locale - # see https://github.com/wackou/guessit/issues/43 - # and http://stackoverflow.com/questions/4545661/unicodedecodeerror-when-redirecting-to-file - # Wrap sys.stdout into a StreamWriter to allow writing unicode. 
- sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) + for i, j in enumerate(sys.argv): + sys.argv[i] = j.decode(locale.getpreferredencoding()) - from guessit.plugins import transformers - - if args: - options = get_opts().parse_args(args) - else: # pragma: no cover - options = get_opts().parse_args() + if args is None: # pragma: no cover + options = argument_parser.parse_args() + else: + options = argument_parser.parse_args(args) if options.verbose: + logging.basicConfig(stream=sys.stdout, format='%(message)s') logging.getLogger().setLevel(logging.DEBUG) help_required = True - if options.properties or options.values: - display_properties(options) - help_required = False - elif options.transformers: - display_transformers() - help_required = False - - if options.demo: - run_demo(episodes=True, movies=True, options=vars(options)) - help_required = False if options.version: print('+-------------------------------------------------------+') - print('+ GuessIt ' + __version__ + (28-len(__version__)) * ' ' + '+') + print('+ GuessIt ' + __version__ + (28 - len(__version__)) * ' ' + '+') + print('+-------------------------------------------------------+') + print('+ Rebulk ' + __rebulk_version__ + (29 - len(__rebulk_version__)) * ' ' + '+') print('+-------------------------------------------------------+') print('| Please report any bug or feature request at |') - print('| https://github.com/wackou/guessit/issues. |') + print('| https://github.com/guessit-io/guessit/issues. |') print('+-------------------------------------------------------+') help_required = False if options.yaml: try: - import yaml, babelfish - def default_representer(dumper, data): - return dumper.represent_str(str(data)) - yaml.SafeDumper.add_representer(babelfish.Language, default_representer) - yaml.SafeDumper.add_representer(babelfish.Country, default_representer) + import yaml # pylint:disable=unused-variable except ImportError: # pragma: no cover - print('PyYAML not found. 
Using default output.') + options.yaml = False + print('PyYAML is not installed. \'--yaml\' option will be ignored ...', file=sys.stderr) + + if options.properties or options.values: + display_properties(options) + help_required = False filenames = [] if options.filename: - filenames.extend(options.filename) + for filename in options.filename: + filenames.append(filename) if options.input_file: - input_file = open(options.input_file, 'r') + if six.PY2: + input_file = open(options.input_file, 'r') + else: + input_file = open(options.input_file, 'r', encoding='utf-8') try: filenames.extend([line.strip() for line in input_file.readlines()]) finally: input_file.close() - filenames = filter(lambda f: f, filenames) + filenames = list(filter(lambda f: f, filenames)) if filenames: - help_required = False - if options.submit_bug: - for filename in filenames: - submit_bug(filename, options) - else: - for filename in filenames: - guess_file(filename, - info=options.info.split(','), - options=vars(options)) + for filename in filenames: + help_required = False + guess_filename(filename, options) if help_required: # pragma: no cover - get_opts().print_help() + argument_parser.print_help() -if __name__ == '__main__': + +if __name__ == '__main__': # pragma: no cover main() diff --git a/libs/guessit/__version__.py b/libs/guessit/__version__.py index f8ec056e..cef422c9 100644 --- a/libs/guessit/__version__.py +++ b/libs/guessit/__version__.py @@ -1,20 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# -__version__ = '0.10.2.dev0' +""" +Version module +""" +# pragma: no cover +__version__ = '2.1.1.dev0' diff --git a/libs/guessit/api.py b/libs/guessit/api.py new file mode 100644 index 00000000..900f6965 --- /dev/null +++ b/libs/guessit/api.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +API functions that can be used by external software +""" +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error + +import traceback + +import six + +from rebulk.introspector import introspect + +from .rules import rebulk_builder +from .options import parse_options +from .__version__ import __version__ + + +class GuessitException(Exception): + """ + Exception raised when guessit fails to perform a guess because of an internal error. 
+ """ + def __init__(self, string, options): + super(GuessitException, self).__init__("An internal error has occured in guessit.\n" + "===================== Guessit Exception Report =====================\n" + "version=%s\n" + "string=%s\n" + "options=%s\n" + "--------------------------------------------------------------------\n" + "%s" + "--------------------------------------------------------------------\n" + "Please report at " + "https://github.com/guessit-io/guessit/issues.\n" + "====================================================================" % + (__version__, str(string), str(options), traceback.format_exc())) + + self.string = string + self.options = options + + +def guessit(string, options=None): + """ + Retrieves all matches from string as a dict + :param string: the filename or release name + :type string: str + :param options: the filename or release name + :type options: str|dict + :return: + :rtype: + """ + return default_api.guessit(string, options) + + +def properties(options=None): + """ + Retrieves all properties with possible values that can be guessed + :param options: + :type options: + :return: + :rtype: + """ + return default_api.properties(options) + + +class GuessItApi(object): + """ + An api class that can be configured with custom Rebulk configuration. + """ + + def __init__(self, rebulk): + """ + :param rebulk: Rebulk instance to use. 
+ :type rebulk: Rebulk + :return: + :rtype: + """ + self.rebulk = rebulk + + @staticmethod + def _fix_option_encoding(value): + if isinstance(value, list): + return [GuessItApi._fix_option_encoding(item) for item in value] + if six.PY2 and isinstance(value, six.text_type): + return value.encode("utf-8") + if six.PY3 and isinstance(value, six.binary_type): + return value.decode('ascii') + return value + + def guessit(self, string, options=None): + """ + Retrieves all matches from string as a dict + :param string: the filename or release name + :type string: str + :param options: the filename or release name + :type options: str|dict + :return: + :rtype: + """ + try: + options = parse_options(options) + result_decode = False + result_encode = False + + fixed_options = {} + for (key, value) in options.items(): + key = GuessItApi._fix_option_encoding(key) + value = GuessItApi._fix_option_encoding(value) + fixed_options[key] = value + options = fixed_options + + if six.PY2 and isinstance(string, six.text_type): + string = string.encode("utf-8") + result_decode = True + if six.PY3 and isinstance(string, six.binary_type): + string = string.decode('ascii') + result_encode = True + matches = self.rebulk.matches(string, options) + if result_decode: + for match in matches: + if isinstance(match.value, six.binary_type): + match.value = match.value.decode("utf-8") + if result_encode: + for match in matches: + if isinstance(match.value, six.text_type): + match.value = match.value.encode("ascii") + return matches.to_dict(options.get('advanced', False), options.get('implicit', False)) + except: + raise GuessitException(string, options) + + def properties(self, options=None): + """ + Grab properties and values that can be generated. 
+ :param options: + :type options: + :return: + :rtype: + """ + unordered = introspect(self.rebulk, options).properties + ordered = OrderedDict() + for k in sorted(unordered.keys(), key=six.text_type): + ordered[k] = list(sorted(unordered[k], key=six.text_type)) + if hasattr(self.rebulk, 'customize_properties'): + ordered = self.rebulk.customize_properties(ordered) + return ordered + + +default_api = GuessItApi(rebulk_builder()) diff --git a/libs/guessit/backports.py b/libs/guessit/backports.py new file mode 100644 index 00000000..3e94e27a --- /dev/null +++ b/libs/guessit/backports.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Backports +""" +# pragma: no-cover +# pylint: disabled + +def cmp_to_key(mycmp): + """functools.cmp_to_key backport""" + class KeyClass(object): + """Key class""" + def __init__(self, obj, *args): # pylint: disable=unused-argument + self.obj = obj + def __lt__(self, other): + return mycmp(self.obj, other.obj) < 0 + def __gt__(self, other): + return mycmp(self.obj, other.obj) > 0 + def __eq__(self, other): + return mycmp(self.obj, other.obj) == 0 + def __le__(self, other): + return mycmp(self.obj, other.obj) <= 0 + def __ge__(self, other): + return mycmp(self.obj, other.obj) >= 0 + def __ne__(self, other): + return mycmp(self.obj, other.obj) != 0 + return KeyClass diff --git a/libs/guessit/containers.py b/libs/guessit/containers.py deleted file mode 100644 index 74847008..00000000 --- a/libs/guessit/containers.py +++ /dev/null @@ -1,771 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from .patterns import compile_pattern, sep -from . import base_text_type -from .guess import Guess -import types - - -def _get_span(prop, match): - """Retrieves span for a match""" - if not prop.global_span and match.re.groups: - start = None - end = None - for i in range(1, match.re.groups + 1): - span = match.span(i) - if start is None or span[0] < start: - start = span[0] - if end is None or span[1] > end: - end = span[1] - return start, end - else: - return match.span() - start = span[0] - end = span[1] - - -def _trim_span(span, value, blanks = sep): - start, end = span - - for i in range(0, len(value)): - if value[i] in blanks: - start += 1 - else: - break - - for i in reversed(range(0, len(value))): - if value[i] in blanks: - end -= 1 - else: - break - if end <= start: - return -1, -1 - return start, end - - -def _get_groups(compiled_re): - """ - Retrieves groups from re - - :return: list of group names - """ - if compiled_re.groups: - indexgroup = {} - for k, i in compiled_re.groupindex.items(): - indexgroup[i] = k - ret = [] - for i in range(1, compiled_re.groups + 1): - ret.append(indexgroup.get(i, i)) - return ret - else: - return [None] - - -class NoValidator(object): - def validate(self, prop, string, node, match, entry_start, entry_end): - return True - - -class LeftValidator(object): - """Make sure our match is starting by separator, or by another entry""" - - def validate(self, prop, string, node, match, entry_start, entry_end): - span = _get_span(prop, match) - span = _trim_span(span, 
string[span[0]:span[1]]) - start, end = span - - sep_start = start <= 0 or string[start - 1] in sep - start_by_other = start in entry_end - if not sep_start and not start_by_other: - return False - return True - - -class RightValidator(object): - """Make sure our match is ended by separator, or by another entry""" - - def validate(self, prop, string, node, match, entry_start, entry_end): - span = _get_span(prop, match) - span = _trim_span(span, string[span[0]:span[1]]) - start, end = span - - sep_end = end >= len(string) or string[end] in sep - end_by_other = end in entry_start - if not sep_end and not end_by_other: - return False - return True - - -class ChainedValidator(object): - def __init__(self, *validators): - self._validators = validators - - def validate(self, prop, string, node, match, entry_start, entry_end): - for validator in self._validators: - if not validator.validate(prop, string, node, match, entry_start, entry_end): - return False - return True - - -class SameKeyValidator(object): - def __init__(self, validator_function): - self.validator_function = validator_function - - def validate(self, prop, string, node, match, entry_start, entry_end): - for key in prop.keys: - for same_value_leaf in node.root.leaves_containing(key): - ret = self.validator_function(same_value_leaf, key, prop, string, node, match, entry_start, entry_end) - if ret is not None: - return ret - return True - - -class OnlyOneValidator(SameKeyValidator): - def __init__(self): - super(OnlyOneValidator, self).__init__(lambda same_value_leaf, key, prop, string, node, match, entry_start, entry_end: False) - - -class DefaultValidator(object): - """Make sure our match is surrounded by separators, or by another entry""" - def validate(self, prop, string, node, match, entry_start, entry_end): - span = _get_span(prop, match) - span = _trim_span(span, string[span[0]:span[1]]) - start, end = span - - sep_start = start <= 0 or string[start - 1] in sep - sep_end = end >= len(string) or 
string[end] in sep - start_by_other = start in entry_end - end_by_other = end in entry_start - if (sep_start or start_by_other) and (sep_end or end_by_other): - return True - return False - - -class FunctionValidator(object): - def __init__(self, function): - self.function = function - - def validate(self, prop, string, node, match, entry_start, entry_end): - return self.function(prop, string, node, match, entry_start, entry_end) - - -class FormatterValidator(object): - def __init__(self, group_name=None, formatted_validator=None): - self.group_name = group_name - self.formatted_validator = formatted_validator - - def validate(self, prop, string, node, match, entry_start, entry_end): - if self.group_name: - formatted = prop.format(match.group(self.group_name), self.group_name) - else: - formatted = prop.format(match.group()) - if self.formatted_validator: - return self.formatted_validator(formatted) - else: - return formatted - - -def _get_positions(prop, string, node, match, entry_start, entry_end): - span = match.span() - start = span[0] - end = span[1] - - at_start = True - at_end = True - - while start > 0: - start -= 1 - if string[start] not in sep: - at_start = False - break - while end < len(string) - 1: - end += 1 - if string[end] not in sep: - at_end = False - break - return at_start, at_end - - -class WeakValidator(DefaultValidator): - """Make sure our match is surrounded by separators and is the first or last element in the string""" - def validate(self, prop, string, node, match, entry_start, entry_end): - if super(WeakValidator, self).validate(prop, string, node, match, entry_start, entry_end): - at_start, at_end = _get_positions(prop, string, node, match, entry_start, entry_end) - return at_start or at_end - return False - - -class NeighborValidator(DefaultValidator): - """Make sure the node is next another one""" - def validate(self, prop, string, node, match, entry_start, entry_end): - at_start, at_end = _get_positions(prop, string, node, match, 
entry_start, entry_end) - - if at_start: - previous_leaf = node.root.previous_leaf(node) - if previous_leaf is not None: - return True - - if at_end: - next_leaf = node.root.next_leaf(node) - if next_leaf is not None: - return True - - return False - - -class LeavesValidator(DefaultValidator): - def __init__(self, lambdas=None, previous_lambdas=None, next_lambdas=None, both_side=False, default_=True): - self.previous_lambdas = previous_lambdas if previous_lambdas is not None else [] - self.next_lambdas = next_lambdas if next_lambdas is not None else [] - if lambdas: - self.previous_lambdas.extend(lambdas) - self.next_lambdas.extend(lambdas) - self.both_side = both_side - self.default_ = default_ - - """Make sure our match is surrounded by separators and validates defined lambdas""" - def validate(self, prop, string, node, match, entry_start, entry_end): - if self.default_: - super_ret = super(LeavesValidator, self).validate(prop, string, node, match, entry_start, entry_end) - else: - super_ret = True - if not super_ret: - return False - - previous_ = self._validate_previous(prop, string, node, match, entry_start, entry_end) - next_ = self._validate_next(prop, string, node, match, entry_start, entry_end) - - if previous_ is None and next_ is None: - return super_ret - if self.both_side: - return previous_ and next_ - else: - return previous_ or next_ - - def _validate_previous(self, prop, string, node, match, entry_start, entry_end): - if self.previous_lambdas: - for leaf in node.root.previous_leaves(node): - for lambda_ in self.previous_lambdas: - ret = self._check_rule(lambda_, leaf) - if ret is not None: - return ret - return False - - def _validate_next(self, prop, string, node, match, entry_start, entry_end): - if self.next_lambdas: - for leaf in node.root.next_leaves(node): - for lambda_ in self.next_lambdas: - ret = self._check_rule(lambda_, leaf) - if ret is not None: - return ret - return False - - def _check_rule(self, lambda_, previous_leaf): - return 
lambda_(previous_leaf) - - -class _Property: - """Represents a property configuration.""" - def __init__(self, keys=None, pattern=None, canonical_form=None, canonical_from_pattern=True, confidence=1.0, enhance=True, global_span=False, validator=DefaultValidator(), formatter=None, disabler=None, confidence_lambda=None): - """ - :param keys: Keys of the property (format, screenSize, ...) - :type keys: string - :param canonical_form: Unique value of the property (DVD, 720p, ...) - :type canonical_form: string - :param pattern: Regexp pattern - :type pattern: string - :param confidence: confidence - :type confidence: float - :param enhance: enhance the pattern - :type enhance: boolean - :param global_span: if True, the whole match span will used to create the Guess. - Else, the span from the capturing groups will be used. - :type global_span: boolean - :param validator: Validator to use - :type validator: :class:`DefaultValidator` - :param formatter: Formater to use - :type formatter: function - """ - if isinstance(keys, list): - self.keys = keys - elif isinstance(keys, base_text_type): - self.keys = [keys] - else: - self.keys = [] - self.canonical_form = canonical_form - if pattern is not None: - self.pattern = pattern - else: - self.pattern = canonical_form - if self.canonical_form is None and canonical_from_pattern: - self.canonical_form = self.pattern - self.compiled = compile_pattern(self.pattern, enhance=enhance) - for group_name in _get_groups(self.compiled): - if isinstance(group_name, base_text_type) and not group_name in self.keys: - self.keys.append(group_name) - if not self.keys: - raise ValueError("No property key is defined") - self.confidence = confidence - self.confidence_lambda = confidence_lambda - self.global_span = global_span - self.validator = validator - self.formatter = formatter - self.disabler = disabler - - def disabled(self, options): - if self.disabler: - return self.disabler(options) - return False - - def format(self, value, 
group_name=None): - """Retrieves the final value from re group match value""" - formatter = None - if isinstance(self.formatter, dict): - formatter = self.formatter.get(group_name) - if formatter is None and group_name is not None: - formatter = self.formatter.get(None) - else: - formatter = self.formatter - if isinstance(formatter, types.FunctionType): - return formatter(value) - elif formatter is not None: - return formatter.format(value) - return value - - def __repr__(self): - return "%s: %s" % (self.keys, self.canonical_form if self.canonical_form else self.pattern) - - -class PropertiesContainer(object): - def __init__(self, **kwargs): - self._properties = [] - self.default_property_kwargs = kwargs - - def unregister_property(self, name, *canonical_forms): - """Unregister a property canonical forms - - If canonical_forms are specified, only those values will be unregistered - - :param name: Property name to unregister - :type name: string - :param canonical_forms: Values to unregister - :type canonical_forms: varargs of string - """ - _properties = [prop for prop in self._properties if prop.name == name and (not canonical_forms or prop.canonical_form in canonical_forms)] - - def register_property(self, name, *patterns, **property_params): - """Register property with defined canonical form and patterns. - - :param name: name of the property (format, screenSize, ...) 
- :type name: string - :param patterns: regular expression patterns to register for the property canonical_form - :type patterns: varargs of string - """ - properties = [] - for pattern in patterns: - params = dict(self.default_property_kwargs) - params.update(property_params) - if isinstance(pattern, dict): - params.update(pattern) - prop = _Property(name, **params) - else: - prop = _Property(name, pattern, **params) - self._properties.append(prop) - properties.append(prop) - return properties - - def register_canonical_properties(self, name, *canonical_forms, **property_params): - """Register properties from their canonical forms. - - :param name: name of the property (releaseGroup, ...) - :type name: string - :param canonical_forms: values of the property ('ESiR', 'WAF', 'SEPTiC', ...) - :type canonical_forms: varargs of strings - """ - properties = [] - for canonical_form in canonical_forms: - params = dict(property_params) - params['canonical_form'] = canonical_form - properties.extend(self.register_property(name, canonical_form, **property_params)) - return properties - - def unregister_all_properties(self): - """Unregister all defined properties""" - self._properties.clear() - - def find_properties(self, string, node, options, name=None, validate=True, re_match=False, sort=True, multiple=False): - """Find all distinct properties for given string - - If no capturing group is defined in the property, value will be grabbed from the entire match. - - If one ore more unnamed capturing group is defined in the property, first capturing group will be used. - - If named capturing group are defined in the property, they will be returned as property key. - - If validate, found properties will be validated by their defined validator - - If re_match, re.match will be used instead of re.search. - - if sort, found properties will be sorted from longer match to shorter match. 
- - If multiple is False and multiple values are found for the same property, the more confident one will be returned. - - If multiple is False and multiple values are found for the same property and the same confidence, the longer will be returned. - - :param string: input string - :type string: string - - :param node: current node of the matching tree - :type node: :class:`guessit.matchtree.MatchTree` - - :param name: name of property to find - :type name: string - - :param re_match: use re.match instead of re.search - :type re_match: bool - - :param multiple: Allows multiple property values to be returned - :type multiple: bool - - :return: found properties - :rtype: list of tuples (:class:`_Property`, match, list of tuples (property_name, tuple(value_start, value_end))) - - :see: `_Property` - :see: `register_property` - :see: `register_canonical_properties` - """ - entry_start = {} - entry_end = {} - - entries = [] - duplicate_matches = {} - - ret = [] - - if not string.strip(): - return ret - - # search all properties - for prop in self.get_properties(name): - if not prop.disabled(options): - valid_match = None - if re_match: - match = prop.compiled.match(string) - if match: - entries.append((prop, match)) - else: - matches = list(prop.compiled.finditer(string)) - duplicate_matches[prop] = matches - for match in matches: - entries.append((prop, match)) - - for prop, match in entries: - # compute confidence - if prop.confidence_lambda: - computed_confidence = prop.confidence_lambda(match) - if computed_confidence is not None: - prop.confidence = computed_confidence - - if validate: - # compute entries start and ends - for prop, match in entries: - start, end = _get_span(prop, match) - - if start not in entry_start: - entry_start[start] = [prop] - else: - entry_start[start].append(prop) - - if end not in entry_end: - entry_end[end] = [prop] - else: - entry_end[end].append(prop) - - # remove invalid values - while True: - invalid_entries = [] - for entry in 
entries: - prop, match = entry - if not prop.validator.validate(prop, string, node, match, entry_start, entry_end): - invalid_entries.append(entry) - if not invalid_entries: - break - for entry in invalid_entries: - prop, match = entry - entries.remove(entry) - prop_duplicate_matches = duplicate_matches.get(prop) - if prop_duplicate_matches: - prop_duplicate_matches.remove(match) - invalid_span = _get_span(prop, match) - start = invalid_span[0] - end = invalid_span[1] - entry_start[start].remove(prop) - if not entry_start.get(start): - del entry_start[start] - entry_end[end].remove(prop) - if not entry_end.get(end): - del entry_end[end] - - for prop, prop_duplicate_matches in duplicate_matches.items(): - # Keeping the last valid match. - # Needed for the.100.109.hdtv-lol.mp4 - for duplicate_match in prop_duplicate_matches[:-1]: - entries.remove((prop, duplicate_match)) - - if multiple: - ret = entries - else: - # keep only best match if multiple values where found - entries_dict = {} - for entry in entries: - for key in prop.keys: - if key not in entries_dict: - entries_dict[key] = [] - entries_dict[key].append(entry) - - for key_entries in entries_dict.values(): - if multiple: - for entry in key_entries: - ret.append(entry) - else: - best_ret = {} - - best_prop, best_match = None, None - if len(key_entries) == 1: - best_prop, best_match = key_entries[0] - else: - for prop, match in key_entries: - start, end = _get_span(prop, match) - if not best_prop or \ - best_prop.confidence < best_prop.confidence or \ - best_prop.confidence == best_prop.confidence and \ - best_match.span()[1] - best_match.span()[0] < match.span()[1] - match.span()[0]: - best_prop, best_match = prop, match - - best_ret[best_prop] = best_match - - for prop, match in best_ret.items(): - ret.append((prop, match)) - - if sort: - def _sorting(x): - _, x_match = x - x_start, x_end = x_match.span() - return x_start - x_end - - ret.sort(key=_sorting) - - return ret - - def as_guess(self, 
found_properties, input=None, filter_=None, sep_replacement=None, multiple=False, *args, **kwargs): - if filter_ is None: - filter_ = lambda property, *args, **kwargs: True - guesses = [] if multiple else None - for prop, match in found_properties: - first_key = None - for key in prop.keys: - # First property key will be used as base for effective name - if isinstance(key, base_text_type): - if first_key is None: - first_key = key - break - property_name = first_key if first_key else None - span = _get_span(prop, match) - guess = Guess(confidence=prop.confidence, input=input, span=span, prop=property_name) - groups = _get_groups(match.re) - for group_name in groups: - name = group_name if isinstance(group_name, base_text_type) else property_name if property_name not in groups else None - if name: - value = self._effective_prop_value(prop, group_name, input, match.span(group_name) if group_name else match.span(), sep_replacement) - if not value is None: - is_string = isinstance(value, base_text_type) - if not is_string or is_string and value: # Keep non empty strings and other defined objects - if isinstance(value, dict): - for k, v in value.items(): - if k is None: - k = name - guess[k] = v - else: - if name in guess: - if not isinstance(guess[name], list): - guess[name] = [guess[name]] - guess[name].append(value) - else: - guess[name] = value - if group_name: - guess.metadata(prop).span = match.span(group_name) - if filter_(guess): - if multiple: - guesses.append(guess) - else: - return guess - return guesses - - def _effective_prop_value(self, prop, group_name, input=None, span=None, sep_replacement=None): - if prop.canonical_form: - return prop.canonical_form - if input is None: - return None - value = input - if span is not None: - value = value[span[0]:span[1]] - value = input[span[0]:span[1]] if input else None - if sep_replacement: - for sep_char in sep: - value = value.replace(sep_char, sep_replacement) - if value: - value = prop.format(value, group_name) - 
return value - - def get_properties(self, name=None, canonical_form=None): - """Retrieve properties - - :return: Properties - :rtype: generator - """ - for prop in self._properties: - if (name is None or name in prop.keys) and (canonical_form is None or prop.canonical_form == canonical_form): - yield prop - - def get_supported_properties(self): - supported_properties = {} - for prop in self.get_properties(): - for k in prop.keys: - values = supported_properties.get(k) - if not values: - values = set() - supported_properties[k] = values - if prop.canonical_form: - values.add(prop.canonical_form) - return supported_properties - - -class QualitiesContainer(): - def __init__(self): - self._qualities = {} - - def register_quality(self, name, canonical_form, rating): - """Register a quality rating. - - :param name: Name of the property - :type name: string - :param canonical_form: Value of the property - :type canonical_form: string - :param rating: Estimated quality rating for the property - :type rating: int - """ - property_qualities = self._qualities.get(name) - - if property_qualities is None: - property_qualities = {} - self._qualities[name] = property_qualities - - property_qualities[canonical_form] = rating - - def unregister_quality(self, name, *canonical_forms): - """Unregister quality ratings for given property name. 
- - If canonical_forms are specified, only those values will be unregistered - - :param name: Name of the property - :type name: string - :param canonical_forms: Value of the property - :type canonical_forms: string - """ - if not canonical_forms: - if name in self._qualities: - del self._qualities[name] - else: - property_qualities = self._qualities.get(name) - if property_qualities is not None: - for property_canonical_form in canonical_forms: - if property_canonical_form in property_qualities: - del property_qualities[property_canonical_form] - if not property_qualities: - del self._qualities[name] - - def clear_qualities(self,): - """Unregister all defined quality ratings. - """ - self._qualities.clear() - - def rate_quality(self, guess, *props): - """Rate the quality of guess. - - :param guess: Guess to rate - :type guess: :class:`guessit.guess.Guess` - :param props: Properties to include in the rating. if empty, rating will be performed for all guess properties. - :type props: varargs of string - - :return: Quality of the guess. The higher, the better. 
- :rtype: int - """ - rate = 0 - if not props: - props = guess.keys() - for prop in props: - prop_value = guess.get(prop) - prop_qualities = self._qualities.get(prop) - if prop_value is not None and prop_qualities is not None: - rate += prop_qualities.get(prop_value, 0) - return rate - - def best_quality_properties(self, props, *guesses): - """Retrieve the best quality guess, based on given properties - - :param props: Properties to include in the rating - :type props: list of strings - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - rate = self.rate_quality(guess, *props) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - - def best_quality(self, *guesses): - """Retrieve the best quality guess. - - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - rate = self.rate_quality(guess) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - diff --git a/libs/guessit/date.py b/libs/guessit/date.py deleted file mode 100644 index ed38d1ba..00000000 --- a/libs/guessit/date.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import datetime - -import re - -from dateutil import parser - - -_dsep = r'[-/ \.]' -_dsep_bis = r'[-/ \.x]' - -date_regexps = [ - re.compile('[^\d](\d{8})[^\d]', re.IGNORECASE), - re.compile('[^\d](\d{6})[^\d]', re.IGNORECASE), - re.compile('[^\d](\d{2})%s(\d{1,2})%s(\d{1,2})[^\d]' % (_dsep, _dsep), re.IGNORECASE), - re.compile('[^\d](\d{1,2})%s(\d{1,2})%s(\d{2})[^\d]' % (_dsep, _dsep), re.IGNORECASE), - re.compile('[^\d](\d{4})%s(\d{1,2})%s(\d{1,2})[^\d]' % (_dsep_bis, _dsep), re.IGNORECASE), - re.compile('[^\d](\d{1,2})%s(\d{1,2})%s(\d{4})[^\d]' % (_dsep, _dsep_bis), re.IGNORECASE), - re.compile('[^\d](\d{1,2}(?:st|nd|rd|th)?%s(?:[a-z]{3,10})%s\d{4})[^\d]' % (_dsep, _dsep), re.IGNORECASE)] - - -def valid_year(year, today=None): - """Check if number is a valid year""" - if not today: - today = datetime.date.today() - return 1920 < year < today.year + 5 - - -def search_year(string): - """Looks for year patterns, and if found return the year and group span. - - Assumes there are sentinels at the beginning and end of the string that - always allow matching a non-digit delimiting the date. - - Note this only looks for valid production years, that is between 1920 - and now + 5 years, so for instance 2000 would be returned as a valid - year but 1492 would not. - - >>> search_year(' in the year 2000... ') - (2000, (13, 17)) - - >>> search_year(' they arrived in 1492. 
') - (None, None) - """ - match = re.search(r'[^0-9]([0-9]{4})[^0-9]', string) - if match: - year = int(match.group(1)) - if valid_year(year): - return (year, match.span(1)) - - return (None, None) - - -def search_date(string, year_first=None, day_first=True): - """Looks for date patterns, and if found return the date and group span. - - Assumes there are sentinels at the beginning and end of the string that - always allow matching a non-digit delimiting the date. - - Year can be defined on two digit only. It will return the nearest possible - date from today. - - >>> search_date(' This happened on 2002-04-22. ') - (datetime.date(2002, 4, 22), (18, 28)) - - >>> search_date(' And this on 17-06-1998. ') - (datetime.date(1998, 6, 17), (13, 23)) - - >>> search_date(' no date in here ') - (None, None) - """ - start, end = None, None - match = None - for date_re in date_regexps: - s = date_re.search(string) - if s and (match is None or s.end() - s.start() > len(match)): - start, end = s.start(), s.end() - if date_re.groups: - match = '-'.join(s.groups()) - else: - match = s.group() - - if match is None: - return None, None - - today = datetime.date.today() - - # If day_first/year_first is undefined, parse is made using both possible values. 
- yearfirst_opts = [False, True] - if year_first is not None: - yearfirst_opts = [year_first] - - dayfirst_opts = [True, False] - if day_first is not None: - dayfirst_opts = [day_first] - - kwargs_list = ({'dayfirst': d, 'yearfirst': y} for d in dayfirst_opts for y in yearfirst_opts) - for kwargs in kwargs_list: - try: - date = parser.parse(match, **kwargs) - except (ValueError, TypeError) as e: #see https://bugs.launchpad.net/dateutil/+bug/1247643 - date = None - pass - # check date plausibility - if date and valid_year(date.year, today=today): - return date.date(), (start+1, end-1) #compensate for sentinels - - return None, None diff --git a/libs/guessit/fileutils.py b/libs/guessit/fileutils.py deleted file mode 100644 index 40110485..00000000 --- a/libs/guessit/fileutils.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import s, u -import os.path -import zipfile -import io - - -def split_path(path): - r"""Splits the given path into the list of folders and the filename (or the - last folder if you gave it a folder path. 
- - If the given path was an absolute path, the first element will always be: - - the '/' root folder on Unix systems - - the drive letter on Windows systems (eg: r'C:\') - - the mount point '\\' on Windows systems (eg: r'\\host\share') - - >>> s(split_path('/usr/bin/smewt')) - ['/', 'usr', 'bin', 'smewt'] - - >>> s(split_path('relative_path/to/my_folder/')) - ['relative_path', 'to', 'my_folder'] - - """ - result = [] - while True: - head, tail = os.path.split(path) - - if not head and not tail: - return result - - if not tail and head == path: - # Make sure we won't have an infinite loop. - result = [head] + result - return result - - # we just split a directory ending with '/', so tail is empty - if not tail: - path = head - continue - - # otherwise, add the last path fragment and keep splitting - result = [tail] + result - path = head - - -def file_in_same_dir(ref_file, desired_file): - """Return the path for a file in the same dir as a given reference file. - - >>> s(file_in_same_dir('~/smewt/smewt.db', 'smewt.settings')) == os.path.normpath('~/smewt/smewt.settings') - True - - """ - return os.path.join(*(split_path(ref_file)[:-1] + [desired_file])) - - -def load_file_in_same_dir(ref_file, filename): - """Load a given file. 
Works even when the file is contained inside a zip.""" - path = split_path(ref_file)[:-1] + [filename] - - for i, p in enumerate(path): - if p.endswith('.zip'): - zfilename = os.path.join(*path[:i + 1]) - zfile = zipfile.ZipFile(zfilename) - return u(zfile.read('/'.join(path[i + 1:]))) - - return u(io.open(os.path.join(*path), encoding='utf-8').read()) diff --git a/libs/guessit/guess.py b/libs/guessit/guess.py deleted file mode 100644 index c0f401f2..00000000 --- a/libs/guessit/guess.py +++ /dev/null @@ -1,514 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import UnicodeMixin, s, u, base_text_type -from babelfish import Language, Country -import json -import datetime -import logging - -log = logging.getLogger(__name__) - - -class GuessMetadata(object): - """GuessMetadata contains confidence, an input string, span and related property. - - If defined on a property of Guess object, it overrides the object defined as global. 
- - :param parent: The parent metadata, used for undefined properties in self object - :type parent: :class: `GuessMedata` - :param confidence: The confidence (from 0.0 to 1.0) - :type confidence: number - :param input: The input string - :type input: string - :param span: The input string - :type span: tuple (int, int) - :param prop: The found property definition - :type prop: :class `guessit.containers._Property` - """ - def __init__(self, parent=None, confidence=None, input=None, span=None, prop=None, *args, **kwargs): - self.parent = parent - if confidence is None and self.parent is None: - self._confidence = 1.0 - else: - self._confidence = confidence - self._input = input - self._span = span - self._prop = prop - - @property - def confidence(self): - """The confidence - - :rtype: int - :return: confidence value - """ - return self._confidence if self._confidence is not None else self.parent.confidence if self.parent else None - - @confidence.setter - def confidence(self, confidence): - self._confidence = confidence - - @property - def input(self): - """The input - - :rtype: string - :return: String used to find this guess value - """ - return self._input if self._input is not None else self.parent.input if self.parent else None - - @input.setter - def input(self, input): - """The input - - :rtype: string - """ - self._input = input - - @property - def span(self): - """The span - - :rtype: tuple (int, int) - :return: span of input string used to find this guess value - """ - return self._span if self._span is not None else self.parent.span if self.parent else None - - @span.setter - def span(self, span): - """The span - - :rtype: tuple (int, int) - :return: span of input string used to find this guess value - """ - self._span = span - - @property - def prop(self): - """The property - - :rtype: :class:`_Property` - :return: The property - """ - return self._prop if self._prop is not None else self.parent.prop if self.parent else None - - @property - def 
raw(self): - """Return the raw information (original match from the string, - not the cleaned version) associated with the given property name.""" - if self.input and self.span: - return self.input[self.span[0]:self.span[1]] - return None - - def __repr__(self, *args, **kwargs): - return object.__repr__(self, *args, **kwargs) - - -def _split_kwargs(**kwargs): - metadata_args = {} - for prop in dir(GuessMetadata): - try: - metadata_args[prop] = kwargs.pop(prop) - except KeyError: - pass - return metadata_args, kwargs - - -class Guess(UnicodeMixin, dict): - """A Guess is a dictionary which has an associated confidence for each of - its values. - - As it is a subclass of dict, you can use it everywhere you expect a - simple dict.""" - - def __init__(self, *args, **kwargs): - metadata_kwargs, kwargs = _split_kwargs(**kwargs) - self._global_metadata = GuessMetadata(**metadata_kwargs) - dict.__init__(self, *args, **kwargs) - - self._metadata = {} - for prop in self: - self._metadata[prop] = GuessMetadata(parent=self._global_metadata) - - def rename(self, old_name, new_name): - if old_name in self._metadata: - metadata = self._metadata[old_name] - del self._metadata[old_name] - self._metadata[new_name] = metadata - if old_name in self: - value = self[old_name] - del self[old_name] - self[new_name] = value - return True - return False - - def to_dict(self, advanced=False): - """Return the guess as a dict containing only base types, ie: - where dates, languages, countries, etc. are converted to strings. 
- - if advanced is True, return the data as a json string containing - also the raw information of the properties.""" - data = dict(self) - for prop, value in data.items(): - if isinstance(value, datetime.date): - data[prop] = value.isoformat() - elif isinstance(value, (UnicodeMixin, base_text_type)): - data[prop] = u(value) - elif isinstance(value, (Language, Country)): - data[prop] = value.guessit - elif isinstance(value, list): - data[prop] = [u(x) for x in value] - if advanced: - metadata = self.metadata(prop) - prop_data = {'value': data[prop]} - if metadata.raw: - prop_data['raw'] = metadata.raw - if metadata.confidence: - prop_data['confidence'] = metadata.confidence - data[prop] = prop_data - - return data - - def nice_string(self, advanced=False): - """Return a string with the property names and their values, - that also displays the associated confidence to each property. - - FIXME: doc with param""" - if advanced: - data = self.to_dict(advanced) - return json.dumps(data, indent=4) - else: - data = self.to_dict() - - parts = json.dumps(data, indent=4).split('\n') - for i, p in enumerate(parts): - if p[:5] != ' "': - continue - - prop = p.split('"')[1] - parts[i] = (' [%.2f] "' % self.confidence(prop)) + p[5:] - - return '\n'.join(parts) - - def __unicode__(self): - return u(self.to_dict()) - - def metadata(self, prop=None): - """Return the metadata associated with the given property name - - If no property name is given, get the global_metadata - """ - if prop is None: - return self._global_metadata - if prop not in self._metadata: - self._metadata[prop] = GuessMetadata(parent=self._global_metadata) - return self._metadata[prop] - - def confidence(self, prop=None): - return self.metadata(prop).confidence - - def set_confidence(self, prop, confidence): - self.metadata(prop).confidence = confidence - - def raw(self, prop): - return self.metadata(prop).raw - - def set(self, prop_name, value, *args, **kwargs): - if value is None: - try: - del self[prop_name] 
- except KeyError: - pass - try: - del self._metadata[prop_name] - except KeyError: - pass - else: - self[prop_name] = value - if 'metadata' in kwargs.keys(): - self._metadata[prop_name] = kwargs['metadata'] - else: - self._metadata[prop_name] = GuessMetadata(parent=self._global_metadata, *args, **kwargs) - - def update(self, other, confidence=None): - dict.update(self, other) - if isinstance(other, Guess): - for prop in other: - try: - self._metadata[prop] = other._metadata[prop] - except KeyError: - pass - if confidence is not None: - for prop in other: - self.set_confidence(prop, confidence) - - def update_highest_confidence(self, other): - """Update this guess with the values from the given one. In case - there is property present in both, only the one with the highest one - is kept.""" - if not isinstance(other, Guess): - raise ValueError('Can only call this function on Guess instances') - - for prop in other: - if prop in self and self.metadata(prop).confidence >= other.metadata(prop).confidence: - continue - self[prop] = other[prop] - self._metadata[prop] = other.metadata(prop) - - -def choose_int(g1, g2): - """Function used by merge_similar_guesses to choose between 2 possible - properties when they are integers.""" - v1, c1 = g1 # value, confidence - v2, c2 = g2 - if (v1 == v2): - return (v1, 1 - (1 - c1) * (1 - c2)) - else: - if c1 > c2: - return (v1, c1 - c2) - else: - return (v2, c2 - c1) - - -def choose_string(g1, g2): - """Function used by merge_similar_guesses to choose between 2 possible - properties when they are strings. - - If the 2 strings are similar, or one is contained in the other, the latter is returned - with an increased confidence. - - If the 2 strings are dissimilar, the one with the higher confidence is returned, with - a weaker confidence. - - Note that here, 'similar' means that 2 strings are either equal, or that they - differ very little, such as one string being the other one with the 'the' word - prepended to it. 
- - >>> s(choose_string(('Hello', 0.75), ('World', 0.5))) - ('Hello', 0.25) - - >>> s(choose_string(('Hello', 0.5), ('hello', 0.5))) - ('Hello', 0.75) - - >>> s(choose_string(('Hello', 0.4), ('Hello World', 0.4))) - ('Hello', 0.64) - - >>> s(choose_string(('simpsons', 0.5), ('The Simpsons', 0.5))) - ('The Simpsons', 0.75) - - """ - v1, c1 = g1 # value, confidence - v2, c2 = g2 - - if not v1: - return g2 - elif not v2: - return g1 - - v1, v2 = v1.strip(), v2.strip() - v1l, v2l = v1.lower(), v2.lower() - - combined_prob = 1 - (1 - c1) * (1 - c2) - - if v1l == v2l: - return v1, combined_prob - - # check for common patterns - elif v1l == 'the ' + v2l: - return v1, combined_prob - elif v2l == 'the ' + v1l: - return v2, combined_prob - - # if one string is contained in the other, return the shortest one - elif v2l in v1l: - return v2, combined_prob - elif v1l in v2l: - return v1, combined_prob - - # in case of conflict, return the one with highest confidence - else: - if c1 > c2: - return v1, c1 - c2 - else: - return v2, c2 - c1 - - -def _merge_similar_guesses_nocheck(guesses, prop, choose): - """Take a list of guesses and merge those which have the same properties, - increasing or decreasing the confidence depending on whether their values - are similar. 
- - This function assumes there are at least 2 valid guesses.""" - - similar = [guess for guess in guesses if prop in guess] - - g1, g2 = similar[0], similar[1] - - # merge only this prop of s2 into s1, updating the confidence for the - # considered property - v1, v2 = g1[prop], g2[prop] - c1, c2 = g1.confidence(prop), g2.confidence(prop) - - new_value, new_confidence = choose((v1, c1), (v2, c2)) - if new_confidence >= c1: - msg = "Updating matching property '%s' with confidence %.2f" - else: - msg = "Updating non-matching property '%s' with confidence %.2f" - log.debug(msg % (prop, new_confidence)) - - g1.set(prop, new_value, confidence=new_confidence) - g2.pop(prop) - - # remove g2 if there are no properties left - if not g2.keys(): - guesses.remove(g2) - - -def merge_similar_guesses(guesses, prop, choose): - """Take a list of guesses and merge those which have the same properties, - increasing or decreasing the confidence depending on whether their values - are similar.""" - - similar = [guess for guess in guesses if prop in guess] - if len(similar) < 2: - # nothing to merge - return - - if len(similar) == 2: - _merge_similar_guesses_nocheck(guesses, prop, choose) - - if len(similar) > 2: - log.debug('complex merge, trying our best...') - before = len(guesses) - _merge_similar_guesses_nocheck(guesses, prop, choose) - after = len(guesses) - if after < before: - # recurse only when the previous call actually did something, - # otherwise we end up in an infinite loop - merge_similar_guesses(guesses, prop, choose) - - -def merge_all(guesses, append=None): - """Merge all the guesses in a single result, remove very unlikely values, - and return it. - You can specify a list of properties that should be appended into a list - instead of being merged. - - >>> s(merge_all([ Guess({'season': 2}, confidence=0.6), - ... Guess({'episodeNumber': 13}, confidence=0.8) ]) - ... 
) == {'season': 2, 'episodeNumber': 13} - True - - - >>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02), - ... Guess({'season': 1}, confidence=0.2) ]) - ... ) == {'season': 1} - True - - >>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8), - ... Guess({'releaseGroup': '2HD'}, confidence=0.8) ], - ... append=['other']) - ... ) == {'releaseGroup': '2HD', 'other': ['PROPER']} - True - - """ - result = Guess() - if not guesses: - return result - - if append is None: - append = [] - - for g in guesses: - # first append our appendable properties - for prop in append: - if prop in g: - if isinstance(g[prop], (list, set)): - new_values = result.get(prop, []) + list(g[prop]) - else: - new_values = result.get(prop, []) + [g[prop]] - - result.set(prop, new_values, - # TODO: what to do with confidence here? maybe an - # arithmetic mean... - confidence=g.metadata(prop).confidence, - input=g.metadata(prop).input, - span=g.metadata(prop).span, - prop=g.metadata(prop).prop) - - del g[prop] - - # then merge the remaining ones - dups = set(result) & set(g) - if dups: - log.debug('duplicate properties %s in merged result...' % [(result[p], g[p]) for p in dups]) - - result.update_highest_confidence(g) - - # delete very unlikely values - for p in list(result.keys()): - if result.confidence(p) < 0.05: - del result[p] - - # make sure our appendable properties contain unique values - for prop in append: - try: - value = result[prop] - if isinstance(value, list): - result[prop] = list(set(value)) - else: - result[prop] = [value] - except KeyError: - pass - - return result - - -def smart_merge(guesses): - """First tries to merge well-known similar properties, and then merges - the rest with a merge_all call. - - Should be the function to call in most cases, unless one wants to have more - control. - - Warning: this function is destructive, ie: it will merge the list in-place. 
- """ - - # 1- try to merge similar information together and give it a higher - # confidence - for int_part in ('year', 'season', 'episodeNumber'): - merge_similar_guesses(guesses, int_part, choose_int) - - for string_part in ('title', 'series', 'container', 'format', - 'releaseGroup', 'website', 'audioCodec', - 'videoCodec', 'screenSize', 'episodeFormat', - 'audioChannels', 'idNumber'): - merge_similar_guesses(guesses, string_part, choose_string) - - # 2- merge the rest, potentially discarding information not properly - # merged before - result = merge_all(guesses, - append=['language', 'subtitleLanguage', 'other', - 'episodeDetails', 'unidentified']) - - return result diff --git a/libs/guessit/hash_ed2k.py b/libs/guessit/hash_ed2k.py deleted file mode 100644 index a1ea562f..00000000 --- a/libs/guessit/hash_ed2k.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import s, to_hex -import hashlib -import os.path - -from functools import reduce - - -def hash_file(filename): - """Returns the ed2k hash of a given file. 
- - >>> testfile = os.path.join(os.path.dirname(__file__), 'test/dummy.srt') - >>> s(hash_file(testfile)) - 'ed2k://|file|dummy.srt|59|41F58B913AB3973F593BEBA8B8DF6510|/' - """ - return 'ed2k://|file|%s|%d|%s|/' % (os.path.basename(filename), - os.path.getsize(filename), - hash_filehash(filename).upper()) - - -def hash_filehash(filename): - """Returns the ed2k hash of a given file. - - This function is taken from: - http://www.radicand.org/blog/orz/2010/2/21/edonkey2000-hash-in-python/ - """ - md4 = hashlib.new('md4').copy - - def gen(f): - while True: - x = f.read(9728000) - if x: - yield x - else: - return - - def md4_hash(data): - m = md4() - m.update(data) - return m - - with open(filename, 'rb') as f: - a = gen(f) - hashes = [md4_hash(data).digest() for data in a] - if len(hashes) == 1: - return to_hex(hashes[0]) - else: - return md4_hash(reduce(lambda a, d: a + d, hashes, "")).hexd diff --git a/libs/guessit/hash_mpc.py b/libs/guessit/hash_mpc.py deleted file mode 100644 index fb6c52bd..00000000 --- a/libs/guessit/hash_mpc.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import struct -import os - - -def hash_file(filename): - """This function is taken from: - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes - and is licensed under the GPL.""" - - longlongformat = b'q' # long long - bytesize = struct.calcsize(longlongformat) - - f = open(filename, "rb") - - filesize = os.path.getsize(filename) - hash_value = filesize - - if filesize < 65536 * 2: - raise Exception("SizeError: size is %d, should be > 132K..." % filesize) - - for x in range(int(65536 / bytesize)): - buf = f.read(bytesize) - (l_value,) = struct.unpack(longlongformat, buf) - hash_value += l_value - hash_value &= 0xFFFFFFFFFFFFFFFF # to remain as 64bit number - - f.seek(max(0, filesize - 65536), 0) - for x in range(int(65536 / bytesize)): - buf = f.read(bytesize) - (l_value,) = struct.unpack(longlongformat, buf) - hash_value += l_value - hash_value &= 0xFFFFFFFFFFFFFFFF - - f.close() - - return "%016x" % hash_value diff --git a/libs/guessit/jsonutils.py b/libs/guessit/jsonutils.py new file mode 100644 index 00000000..7d6ff705 --- /dev/null +++ b/libs/guessit/jsonutils.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +JSON Utils +""" +import json +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error + +from rebulk.match import Match + + +class GuessitEncoder(json.JSONEncoder): + """ + JSON Encoder for guessit response + """ + + def default(self, o): # pylint:disable=method-hidden + if isinstance(o, Match): + ret = OrderedDict() + ret['value'] = o.value + if o.raw: + ret['raw'] = o.raw + ret['start'] = o.start + ret['end'] = o.end + return ret + elif hasattr(o, 'name'): # Babelfish languages/countries long name + return str(o.name) + else: # pragma: no cover + return str(o) diff --git a/libs/guessit/language.py b/libs/guessit/language.py 
deleted file mode 100644 index 7e32af3c..00000000 --- a/libs/guessit/language.py +++ /dev/null @@ -1,311 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import UnicodeMixin, base_text_type, u -from guessit.textutils import find_words -from babelfish import Language, Country -import babelfish -import re -import logging -from guessit.guess import Guess - -__all__ = ['Language', 'UNDETERMINED', - 'search_language', 'guess_language'] - -log = logging.getLogger(__name__) - -UNDETERMINED = babelfish.Language('und') - -SYN = {('und', None): ['unknown', 'inconnu', 'unk', 'un'], - ('ell', None): ['gr', 'greek'], - ('spa', None): ['esp', 'español'], - ('fra', None): ['français', 'vf', 'vff', 'vfi'], - ('swe', None): ['se'], - ('por', 'BR'): ['po', 'pb', 'pob', 'br', 'brazilian'], - ('cat', None): ['català'], - ('ces', None): ['cz'], - ('ukr', None): ['ua'], - ('zho', None): ['cn'], - ('jpn', None): ['jp'], - ('hrv', None): ['scr'], - ('mul', None): ['multi', 'dl'], # http://scenelingo.wordpress.com/2009/03/24/what-does-dl-mean/ - } - - -class GuessitConverter(babelfish.LanguageReverseConverter): - - _with_country_regexp = re.compile('(.*)\((.*)\)') 
- _with_country_regexp2 = re.compile('(.*)-(.*)') - - def __init__(self): - self.guessit_exceptions = {} - for (alpha3, country), synlist in SYN.items(): - for syn in synlist: - self.guessit_exceptions[syn.lower()] = (alpha3, country, None) - - @property - def codes(self): - return (babelfish.language_converters['alpha3b'].codes | - babelfish.language_converters['alpha2'].codes | - babelfish.language_converters['name'].codes | - babelfish.language_converters['opensubtitles'].codes | - babelfish.country_converters['name'].codes | - frozenset(self.guessit_exceptions.keys())) - - def convert(self, alpha3, country=None, script=None): - return str(babelfish.Language(alpha3, country, script)) - - def reverse(self, name): - with_country = (GuessitConverter._with_country_regexp.match(name) or - GuessitConverter._with_country_regexp2.match(name)) - - name = u(name.lower()) - if with_country: - lang = Language.fromguessit(with_country.group(1).strip()) - lang.country = babelfish.Country.fromguessit(with_country.group(2).strip()) - return (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script or None) - - # exceptions come first, as they need to override a potential match - # with any of the other guessers - try: - return self.guessit_exceptions[name] - except KeyError: - pass - - for conv in [babelfish.Language, - babelfish.Language.fromalpha3b, - babelfish.Language.fromalpha2, - babelfish.Language.fromname, - babelfish.Language.fromopensubtitles]: - try: - c = conv(name) - return c.alpha3, c.country, c.script - except (ValueError, babelfish.LanguageReverseError): - pass - - raise babelfish.LanguageReverseError(name) - - -babelfish.language_converters['guessit'] = GuessitConverter() - -COUNTRIES_SYN = {'ES': ['españa'], - 'GB': ['UK'], - 'BR': ['brazilian', 'bra'], - # FIXME: this one is a bit of a stretch, not sure how to do - # it properly, though... 
- 'MX': ['Latinoamérica', 'latin america'] - } - - -class GuessitCountryConverter(babelfish.CountryReverseConverter): - def __init__(self): - self.guessit_exceptions = {} - - for alpha2, synlist in COUNTRIES_SYN.items(): - for syn in synlist: - self.guessit_exceptions[syn.lower()] = alpha2 - - @property - def codes(self): - return (babelfish.country_converters['name'].codes | - frozenset(babelfish.COUNTRIES.values()) | - frozenset(self.guessit_exceptions.keys())) - - def convert(self, alpha2): - if alpha2 == 'GB': - return 'UK' - return str(Country(alpha2)) - - def reverse(self, name): - # exceptions come first, as they need to override a potential match - # with any of the other guessers - try: - return self.guessit_exceptions[name.lower()] - except KeyError: - pass - - try: - return babelfish.Country(name.upper()).alpha2 - except ValueError: - pass - - for conv in [babelfish.Country.fromname]: - try: - return conv(name).alpha2 - except babelfish.CountryReverseError: - pass - - raise babelfish.CountryReverseError(name) - - -babelfish.country_converters['guessit'] = GuessitCountryConverter() - - -# list of common words which could be interpreted as languages, but which -# are far too common to be able to say they represent a language in the -# middle of a string (where they most likely carry their commmon meaning) -LNG_COMMON_WORDS = frozenset([ - # english words - 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', - 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', - 'fry', 'cop', 'zen', 'gay', 'fat', 'one', 'cherokee', 'got', 'an', 'as', - 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', 'bb', 'bt', - 'tv', 'aw', 'by', 'md', 'mp', 'cd', 'lt', 'gt', 'in', 'ad', 'ice', 'ay', - # french words - 'bas', 'de', 'le', 'son', 'ne', 'ca', 'ce', 'et', 'que', - 'mal', 'est', 'vol', 'or', 'mon', 'se', 'je', 'tu', 'me', - 'ne', 'ma', 'va', 'au', - # japanese words, - 'wa', 'ga', 'ao', - # spanish words - 'la', 'el', 'del', 
'por', 'mar', - # other - 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', - 'vi', 'ben', 'da', 'lt', 'ch', - # new from babelfish - 'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and', - 'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy', - 'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur', - 'fer', 'fun', 'two', 'big', 'psy', 'air', - # movie title - 'brazil', - # release groups - 'bs', # Bosnian - 'kz', - # countries - 'gt', 'lt', - # part/pt - 'pt' - ]) - -LNG_COMMON_WORDS_STRICT = frozenset(['brazil']) - - -subtitle_prefixes = ['sub', 'subs', 'st', 'vost', 'subforced', 'fansub', 'hardsub'] -subtitle_suffixes = ['subforced', 'fansub', 'hardsub'] -lang_prefixes = ['true'] - - -def find_possible_languages(string, allowed_languages=None): - """Find possible languages in the string - - :return: list of tuple (property, Language, lang_word, word) - """ - - common_words = None - if allowed_languages: - common_words = LNG_COMMON_WORDS_STRICT - else: - common_words = LNG_COMMON_WORDS - - words = find_words(string) - - valid_words = [] - for word in words: - lang_word = word.lower() - key = 'language' - for prefix in subtitle_prefixes: - if lang_word.startswith(prefix): - lang_word = lang_word[len(prefix):] - key = 'subtitleLanguage' - for suffix in subtitle_suffixes: - if lang_word.endswith(suffix): - lang_word = lang_word[:len(suffix)] - key = 'subtitleLanguage' - for prefix in lang_prefixes: - if lang_word.startswith(prefix): - lang_word = lang_word[len(prefix):] - if lang_word not in common_words: - try: - lang = Language.fromguessit(lang_word) - if allowed_languages: - if lang.name.lower() in allowed_languages or lang.alpha2.lower() in allowed_languages or lang.alpha3.lower() in allowed_languages: - valid_words.append((key, lang, lang_word, word)) - # Keep language with alpha2 equivalent. Others are probably - # uncommon languages. 
- elif lang == 'mul' or hasattr(lang, 'alpha2'): - valid_words.append((key, lang, lang_word, word)) - except babelfish.Error: - pass - return valid_words - - -def search_language(string, allowed_languages=None): - """Looks for language patterns, and if found return the language object, - its group span and an associated confidence. - - you can specify a list of allowed languages using the lang_filter argument, - as in lang_filter = [ 'fr', 'eng', 'spanish' ] - - >>> search_language('movie [en].avi')['language'] - - - >>> search_language('the zen fat cat and the gay mad men got a new fan', allowed_languages = ['en', 'fr', 'es']) - - """ - - if allowed_languages: - allowed_languages = set(Language.fromguessit(lang) for lang in allowed_languages) - - confidence = 1.0 # for all of them - - for prop, language, lang, word in find_possible_languages(string, allowed_languages): - pos = string.find(word) - end = pos + len(word) - - # only allow those languages that have a 2-letter code, those that - # don't are too esoteric and probably false matches - # if language.lang not in lng3_to_lng2: - # continue - - # confidence depends on alpha2, alpha3, english name, ... - if len(lang) == 2: - confidence = 0.8 - elif len(lang) == 3: - confidence = 0.9 - elif prop == 'subtitleLanguage': - confidence = 0.6 # Subtitle prefix found with language - else: - # Note: we could either be really confident that we found a - # language or assume that full language names are too - # common words and lower their confidence accordingly - confidence = 0.3 # going with the low-confidence route here - - return Guess({prop: language}, confidence=confidence, input=string, span=(pos, end)) - - return None - - -def guess_language(text): # pragma: no cover - """Guess the language in which a body of text is written. - - This uses the external guess-language python module, and will fail and return - Language(Undetermined) if it is not installed. 
- """ - try: - from guess_language import guessLanguage - return Language.fromguessit(guessLanguage(text)) - - except ImportError: - log.error('Cannot detect the language of the given text body, missing dependency: guess-language') - log.error('Please install it from PyPI, by doing eg: pip install guess-language') - return UNDETERMINED diff --git a/libs/guessit/matcher.py b/libs/guessit/matcher.py deleted file mode 100644 index 2e3bc2af..00000000 --- a/libs/guessit/matcher.py +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, \ - unicode_literals - -import logging - -from guessit import PY3, u -from guessit.transfo import TransformerException -from guessit.matchtree import MatchTree -from guessit.textutils import normalize_unicode, clean_default -from guessit.guess import Guess -import inspect - -log = logging.getLogger(__name__) - - -class IterativeMatcher(object): - """An iterative matcher tries to match different patterns that appear - in the filename. - - The ``filetype`` argument indicates which type of file you want to match. 
- If it is undefined, the matcher will try to see whether it can guess - that the file corresponds to an episode, or otherwise will assume it is - a movie. - - The recognized ``filetype`` values are: - ``['subtitle', 'info', 'movie', 'moviesubtitle', 'movieinfo', 'episode', - 'episodesubtitle', 'episodeinfo']`` - - ``options`` is a dict of options values to be passed to the transformations used - by the matcher. - - The IterativeMatcher works mainly in 2 steps: - - First, it splits the filename into a match_tree, which is a tree of groups - which have a semantic meaning, such as episode number, movie title, - etc... - - The match_tree created looks like the following:: - - 0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111 - 0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000 - 0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000 - __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___ - xxxxxttttttttttttt ffffff vvvv xxxxxx ll lll xx xxx ccc - [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv - - The first 3 lines indicates the group index in which a char in the - filename is located. So for instance, ``x264`` (in the middle) is the group (0, 4, 1), and - it corresponds to a video codec, denoted by the letter ``v`` in the 4th line. - (for more info, see guess.matchtree.to_string) - - Second, it tries to merge all this information into a single object - containing all the found properties, and does some (basic) conflict - resolution when they arise. 
- """ - def __init__(self, filename, options=None, **kwargs): - options = dict(options or {}) - for k, v in kwargs.items(): - if k not in options or not options[k]: - options[k] = v # options dict has priority over keyword arguments - self._validate_options(options) - if not PY3 and not isinstance(filename, unicode): - log.warning('Given filename to matcher is not unicode...') - filename = filename.decode('utf-8') - - filename = normalize_unicode(filename) - if options and options.get('clean_function'): - clean_function = options.get('clean_function') - if not hasattr(clean_function, '__call__'): - module, function = clean_function.rsplit('.') - if not module: - module = 'guessit.textutils' - clean_function = getattr(__import__(module), function) - if not clean_function: - log.error('Can\'t find clean function %s. Default will be used.' % options.get('clean_function')) - clean_function = clean_default - else: - clean_function = clean_default - - self.match_tree = MatchTree(filename, clean_function=clean_function) - self.options = options - self._transfo_calls = [] - - # sanity check: make sure we don't process a (mostly) empty string - if clean_function(filename).strip() == '': - return - - from guessit.plugins import transformers - - try: - mtree = self.match_tree - if 'type' in self.options: - mtree.guess.set('type', self.options['type'], confidence=0.0) - - # Process - for transformer in transformers.all_transformers(): - disabled = options.get('disabled_transformers') - if not disabled or transformer.name not in disabled: - self._process(transformer, False) - - # Post-process - for transformer in transformers.all_transformers(): - disabled = options.get('disabled_transformers') - if not disabled or transformer.name not in disabled: - self._process(transformer, True) - - log.debug('Found match tree:\n%s' % u(mtree)) - except TransformerException as e: - log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e)) - - def _process(self, 
transformer, post=False): - - if not hasattr(transformer, 'should_process') or transformer.should_process(self.match_tree, self.options): - if post: - transformer.post_process(self.match_tree, self.options) - else: - transformer.process(self.match_tree, self.options) - self._transfo_calls.append(transformer) - - @property - def second_pass_options(self): - second_pass_options = {} - for transformer in self._transfo_calls: - if hasattr(transformer, 'second_pass_options'): - transformer_second_pass_options = transformer.second_pass_options(self.match_tree, self.options) - if transformer_second_pass_options: - second_pass_options.update(transformer_second_pass_options) - - return second_pass_options - - def _validate_options(self, options): - valid_filetypes = ('subtitle', 'info', 'video', - 'movie', 'moviesubtitle', 'movieinfo', - 'episode', 'episodesubtitle', 'episodeinfo') - - type_ = options.get('type') - if type_ and type_ not in valid_filetypes: - raise ValueError("filetype needs to be one of %s" % (valid_filetypes,)) - - def matched(self): - return self.match_tree.matched() - - -def build_guess(node, name, value=None, confidence=1.0): - guess = Guess({name: node.clean_value if value is None else value}, confidence=confidence) - guess.metadata().input = node.value if value is None else value - if value is None: - left_offset = 0 - right_offset = 0 - - clean_value = node.clean_value - - for i in range(0, len(node.value)): - if clean_value[0] == node.value[i]: - break - left_offset += 1 - - for i in reversed(range(0, len(node.value))): - if clean_value[-1] == node.value[i]: - break - right_offset += 1 - - guess.metadata().span = (node.span[0] - node.offset + left_offset, node.span[1] - node.offset - right_offset) - return guess - - -def found_property(node, name, value=None, confidence=1.0, update_guess=True, logger=None): - # automatically retrieve the log object from the caller frame - if not logger: - caller_frame = inspect.stack()[1][0] - logger = 
caller_frame.f_locals['self'].log - guess = build_guess(node, name, value, confidence) - return found_guess(node, guess, update_guess=update_guess, logger=logger) - - -def found_guess(node, guess, update_guess=True, logger=None): - if node.guess: - if update_guess: - node.guess.update_highest_confidence(guess) - else: - child = node.add_child(guess.metadata().span) - child.guess = guess - else: - node.guess = guess - log_found_guess(guess, logger) - return node.guess - - -def log_found_guess(guess, logger=None): - for k, v in guess.items(): - (logger or log).debug('Property found: %s=%s (%s) (confidence=%.2f)' % - (k, v, guess.raw(k), guess.confidence(k))) - - -def _get_split_spans(node, span): - partition_spans = node.get_partition_spans(span) - for to_remove_span in partition_spans: - if to_remove_span[0] == span[0] and to_remove_span[1] in [span[1], span[1] + 1]: - partition_spans.remove(to_remove_span) - break - return partition_spans - - -class GuessFinder(object): - def __init__(self, guess_func, confidence=None, logger=None, options=None): - self.guess_func = guess_func - self.confidence = confidence - self.logger = logger or log - self.options = options - - def process_nodes(self, nodes): - for node in nodes: - self.process_node(node) - - def process_node(self, node, iterative=True, partial_span=None): - if partial_span: - value = node.value[partial_span[0]:partial_span[1]] - else: - value = node.value - string = ' %s ' % value # add sentinels - - if not self.options: - matcher_result = self.guess_func(string, node) - else: - matcher_result = self.guess_func(string, node, self.options) - - if matcher_result: - if not isinstance(matcher_result, Guess): - result, span = matcher_result - else: - result, span = matcher_result, matcher_result.metadata().span - - if result: - # readjust span to compensate for sentinels - span = (span[0] - 1, span[1] - 1) - - # readjust span to compensate for partial_span - if partial_span: - span = (span[0] + partial_span[0], 
span[1] + partial_span[0]) - - partition_spans = None - if self.options and 'skip_nodes' in self.options: - skip_nodes = self.options.get('skip_nodes') - for skip_node in skip_nodes: - if skip_node.parent.node_idx == node.node_idx[:len(skip_node.parent.node_idx)] and\ - skip_node.span == span or\ - skip_node.span == (span[0] + skip_node.offset, span[1] + skip_node.offset): - if partition_spans is None: - partition_spans = _get_split_spans(node, skip_node.span) - else: - new_partition_spans = [] - for partition_span in partition_spans: - tmp_node = MatchTree(value, span=partition_span, parent=node) - tmp_partitions_spans = _get_split_spans(tmp_node, skip_node.span) - new_partition_spans.extend(tmp_partitions_spans) - partition_spans.extend(new_partition_spans) - - if not partition_spans: - # restore sentinels compensation - - if isinstance(result, Guess): - guess = result - else: - guess = Guess(result, confidence=self.confidence, input=string, span=span) - - if not iterative: - found_guess(node, guess, logger=self.logger) - else: - absolute_span = (span[0] + node.offset, span[1] + node.offset) - node.partition(span) - if node.is_leaf(): - found_guess(node, guess, logger=self.logger) - else: - found_child = None - for child in node.children: - if child.span == absolute_span: - found_guess(child, guess, logger=self.logger) - found_child = child - break - for child in node.children: - if child is not found_child: - self.process_node(child) - else: - for partition_span in partition_spans: - self.process_node(node, partial_span=partition_span) diff --git a/libs/guessit/matchtree.py b/libs/guessit/matchtree.py deleted file mode 100644 index 19c1e759..00000000 --- a/libs/guessit/matchtree.py +++ /dev/null @@ -1,426 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the 
Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import guessit # @UnusedImport needed for doctests -from guessit import UnicodeMixin, base_text_type -from guessit.textutils import clean_default, str_fill -from guessit.patterns import group_delimiters -from guessit.guess import (merge_similar_guesses, smart_merge, - choose_int, choose_string, Guess) -from itertools import takewhile -import copy -import logging - -log = logging.getLogger(__name__) - - -class BaseMatchTree(UnicodeMixin): - """A BaseMatchTree is a tree covering the filename, where each - node represents a substring in the filename and can have a ``Guess`` - associated with it that contains the information that has been guessed - in this node. Nodes can be further split into subnodes until a proper - split has been found. 
- - Each node has the following attributes: - - string = the original string of which this node represents a region - - span = a pair of (begin, end) indices delimiting the substring - - parent = parent node - - children = list of children nodes - - guess = Guess() - - BaseMatchTrees are displayed in the following way: - - >>> path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' - >>> print(guessit.IterativeMatcher(path).match_tree) - 000000 1111111111111111 2222222222222222222222222222222222222222222 333 - 000000 0000000000111111 0000000000111111222222222222222222222222222 000 - 011112 011112000011111222222222222222222 000 - 011112222222222222 - 0000011112222 - 01112 0111 - Movies/__________(____)/Dark.City.(____).DC._____.____.___.____-___.___ - tttttttttt yyyy yyyy fffff ssss aaa vvvv rrr ccc - Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv - - The last line contains the filename, which you can use a reference. - The previous line contains the type of property that has been found. - The line before that contains the filename, where all the found groups - have been blanked. Basically, what is left on this line are the leftover - groups which could not be identified. - - The lines before that indicate the indices of the groups in the tree. - - For instance, the part of the filename 'BDRip' is the leaf with index - ``(2, 2, 1)`` (read from top to bottom), and its meaning is 'format' - (as shown by the ``f``'s on the last-but-one line). 
- """ - - def __init__(self, string='', span=None, parent=None, clean_function=None): - self.string = string - self.span = span or (0, len(string)) - self.parent = parent - self.children = [] - self.guess = Guess() - self._clean_value = None - self._clean_function = clean_function or clean_default - - @property - def value(self): - """Return the substring that this node matches.""" - return self.string[self.span[0]:self.span[1]] - - @property - def clean_value(self): - """Return a cleaned value of the matched substring, with better - presentation formatting (punctuation marks removed, duplicate - spaces, ...)""" - if self._clean_value is None: - self._clean_value = self.clean_string(self.value) - return self._clean_value - - def clean_string(self, string): - return self._clean_function(string) - - @property - def offset(self): - return self.span[0] - - @property - def info(self): - """Return a dict containing all the info guessed by this node, - subnodes included.""" - result = dict(self.guess) - - for c in self.children: - result.update(c.info) - - return result - - @property - def root(self): - """Return the root node of the tree.""" - if not self.parent: - return self - - return self.parent.root - - @property - def depth(self): - """Return the depth of this node.""" - if self.is_leaf(): - return 0 - - return 1 + max(c.depth for c in self.children) - - def is_leaf(self): - """Return whether this node is a leaf or not.""" - return self.children == [] - - def add_child(self, span): - """Add a new child node to this node with the given span.""" - child = MatchTree(self.string, span=span, parent=self, clean_function=self._clean_function) - self.children.append(child) - return child - - def get_partition_spans(self, indices): - """Return the list of absolute spans for the regions of the original - string defined by splitting this node at the given indices (relative - to this node)""" - indices = sorted(indices) - if indices[0] != 0: - indices.insert(0, 0) - if 
indices[-1] != len(self.value): - indices.append(len(self.value)) - - spans = [] - for start, end in zip(indices[:-1], indices[1:]): - spans.append((self.offset + start, - self.offset + end)) - return spans - - def partition(self, indices): - """Partition this node by splitting it at the given indices, - relative to this node.""" - for partition_span in self.get_partition_spans(indices): - self.add_child(span=partition_span) - - def split_on_components(self, components): - offset = 0 - for c in components: - start = self.value.find(c, offset) - end = start + len(c) - self.add_child(span=(self.offset + start, - self.offset + end)) - offset = end - - def nodes_at_depth(self, depth): - """Return all the nodes at a given depth in the tree""" - if depth == 0: - yield self - - for child in self.children: - for node in child.nodes_at_depth(depth - 1): - yield node - - @property - def node_idx(self): - """Return this node's index in the tree, as a tuple. - If this node is the root of the tree, then return ().""" - if self.parent is None: - return () - return self.parent.node_idx + (self.node_last_idx,) - - @property - def node_last_idx(self): - if self.parent is None: - return None - return self.parent.children.index(self) - - def node_at(self, idx): - """Return the node at the given index in the subtree rooted at - this node.""" - if not idx: - return self - - try: - return self.children[idx[0]].node_at(idx[1:]) - except IndexError: - raise ValueError('Non-existent node index: %s' % (idx,)) - - def nodes(self): - """Return all the nodes and subnodes in this tree.""" - yield self - for child in self.children: - for node in child.nodes(): - yield node - - def leaves(self): - """Return a generator over all the nodes that are leaves.""" - if self.is_leaf(): - yield self - else: - for child in self.children: - # pylint: disable=W0212 - for leaf in child.leaves(): - yield leaf - - def group_node(self): - return self._other_group_node(0) - - def previous_group_node(self): - 
return self._other_group_node(-1) - - def next_group_node(self): - return self._other_group_node(+1) - - def _other_group_node(self, offset): - if len(self.node_idx) > 1: - group_idx = self.node_idx[:2] - if group_idx[1] + offset >= 0: - other_group_idx = (group_idx[0], group_idx[1] + offset) - try: - other_group_node = self.root.node_at(other_group_idx) - return other_group_node - except ValueError: - pass - return None - - def previous_leaf(self, leaf): - """Return previous leaf for this node""" - return self._other_leaf(leaf, -1) - - def next_leaf(self, leaf): - """Return next leaf for this node""" - return self._other_leaf(leaf, +1) - - def _other_leaf(self, leaf, offset): - leaves = list(self.leaves()) - index = leaves.index(leaf) + offset - if index > 0 and index < len(leaves): - return leaves[index] - return None - - def previous_leaves(self, leaf): - """Return previous leaves for this node""" - leaves = list(self.leaves()) - index = leaves.index(leaf) - if index > 0 and index < len(leaves): - previous_leaves = leaves[:index] - previous_leaves.reverse() - return previous_leaves - return [] - - def next_leaves(self, leaf): - """Return next leaves for this node""" - leaves = list(self.leaves()) - index = leaves.index(leaf) - if index > 0 and index < len(leaves): - return leaves[index + 1:len(leaves)] - return [] - - def to_string(self): - """Return a readable string representation of this tree. - - The result is a multi-line string, where the lines are: - - line 1 -> N-2: each line contains the nodes at the given depth in the tree - - line N-2: original string where all the found groups have been blanked - - line N-1: type of property that has been found - - line N: the original string, which you can use a reference. 
- """ - empty_line = ' ' * len(self.string) - - def to_hex(x): - if isinstance(x, int): - return str(x) if x < 10 else chr(55 + x) - return x - - def meaning(result): - mmap = {'episodeNumber': 'E', - 'season': 'S', - 'extension': 'e', - 'format': 'f', - 'language': 'l', - 'country': 'C', - 'videoCodec': 'v', - 'videoProfile': 'v', - 'audioCodec': 'a', - 'audioProfile': 'a', - 'audioChannels': 'a', - 'website': 'w', - 'container': 'c', - 'series': 'T', - 'title': 't', - 'date': 'd', - 'year': 'y', - 'releaseGroup': 'r', - 'screenSize': 's', - 'other': 'o' - } - - if result is None: - return ' ' - - for prop, l in mmap.items(): - if prop in result: - return l - - return 'x' - - lines = [empty_line] * (self.depth + 2) # +2: remaining, meaning - lines[-2] = self.string - - for node in self.nodes(): - if node == self: - continue - - idx = node.node_idx - depth = len(idx) - 1 - if idx: - lines[depth] = str_fill(lines[depth], node.span, - to_hex(idx[-1])) - if node.guess: - lines[-2] = str_fill(lines[-2], node.span, '_') - lines[-1] = str_fill(lines[-1], node.span, meaning(node.guess)) - - lines.append(self.string) - - return '\n'.join(l.rstrip() for l in lines) - - def __unicode__(self): - return self.to_string() - - def __repr__(self): - return '' % self.value - - -class MatchTree(BaseMatchTree): - """The MatchTree contains a few "utility" methods which are not necessary - for the BaseMatchTree, but add a lot of convenience for writing - higher-level rules. 
- """ - - def unidentified_leaves(self, - valid=lambda leaf: len(leaf.clean_value) > 0): - """Return a generator of leaves that are not empty.""" - for leaf in self.leaves(): - if not leaf.guess and valid(leaf): - yield leaf - - def leaves_containing(self, property_name): - """Return a generator of leaves that guessed the given property.""" - if isinstance(property_name, base_text_type): - property_name = [property_name] - - for leaf in self.leaves(): - for prop in property_name: - if prop in leaf.guess: - yield leaf - break - - def first_leaf_containing(self, property_name): - """Return the first leaf containing the given property.""" - try: - return next(self.leaves_containing(property_name)) - except StopIteration: - return None - - def previous_unidentified_leaves(self, node): - """Return a generator of non-empty leaves that are before the given - node (in the string).""" - node_idx = node.node_idx - for leaf in self.unidentified_leaves(): - if leaf.node_idx < node_idx: - yield leaf - - def previous_leaves_containing(self, node, property_name): - """Return a generator of leaves containing the given property that are - before the given node (in the string).""" - node_idx = node.node_idx - for leaf in self.leaves_containing(property_name): - if leaf.node_idx < node_idx: - yield leaf - - def is_explicit(self): - """Return whether the group was explicitly enclosed by - parentheses/square brackets/etc.""" - return (self.value[0] + self.value[-1]) in group_delimiters - - def matched(self): - """Return a single guess that contains all the info found in the - nodes of this tree, trying to merge properties as good as possible. 
- """ - if not getattr(self, '_matched_result', None): - # we need to make a copy here, as the merge functions work in place and - # calling them on the match tree would modify it - parts = [copy.copy(node.guess) for node in self.nodes() if node.guess] - - result = smart_merge(parts) - - log.debug('Final result: ' + result.nice_string()) - self._matched_result = result - - for unidentified_leaves in self.unidentified_leaves(): - if 'unidentified' not in self._matched_result: - self._matched_result['unidentified'] = [] - self._matched_result['unidentified'].append(unidentified_leaves.clean_value) - - return self._matched_result diff --git a/libs/guessit/options.py b/libs/guessit/options.py index 9b8dc0fb..be24af48 100644 --- a/libs/guessit/options.py +++ b/libs/guessit/options.py @@ -1,7 +1,20 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Options +""" from argparse import ArgumentParser +import shlex + +import six -def build_opts(transformers=None): +def build_argument_parser(): + """ + Builds the argument parser + :return: the argument parser + :rtype: ArgumentParser + """ opts = ArgumentParser() opts.add_argument(dest='filename', help='Filename or release name to guess', nargs='*') @@ -9,61 +22,67 @@ def build_opts(transformers=None): naming_opts.add_argument('-t', '--type', dest='type', default=None, help='The suggested file type: movie, episode. If undefined, type will be guessed.') naming_opts.add_argument('-n', '--name-only', dest='name_only', action='store_true', default=False, - help='Parse files as name only. 
Disable folder parsing, extension parsing, and file content analysis.') - naming_opts.add_argument('-c', '--split-camel', dest='split_camel', action='store_true', default=False, - help='Split camel case part of filename.') + help='Parse files as name only, considering "/" and "\\" like other separators.') + naming_opts.add_argument('-Y', '--date-year-first', action='store_true', dest='date_year_first', default=None, + help='If short date is found, consider the first digits as the year.') + naming_opts.add_argument('-D', '--date-day-first', action='store_true', dest='date_day_first', default=None, + help='If short date is found, consider the second digits as the day.') + naming_opts.add_argument('-L', '--allowed-languages', action='append', dest='allowed_languages', + help='Allowed language (can be used multiple times)') + naming_opts.add_argument('-C', '--allowed-countries', action='append', dest='allowed_countries', + help='Allowed country (can be used multiple times)') + naming_opts.add_argument('-E', '--episode-prefer-number', action='store_true', dest='episode_prefer_number', + default=False, + help='Guess "serie.213.avi" as the episode 213. Without this option, ' + 'it will be guessed as season 2, episode 13') + naming_opts.add_argument('-T', '--expected-title', action='append', dest='expected_title', + help='Expected title to parse (can be used multiple times)') + naming_opts.add_argument('-G', '--expected-group', action='append', dest='expected_group', + help='Expected release group (can be used multiple times)') - naming_opts.add_argument('-X', '--disabled-transformer', action='append', dest='disabled_transformers', - help='Transformer to disable (can be used multiple time)') + input_opts = opts.add_argument_group("Input") + input_opts.add_argument('-f', '--input-file', dest='input_file', default=False, + help='Read filenames from an input text file. 
File should use UTF-8 charset.') output_opts = opts.add_argument_group("Output") output_opts.add_argument('-v', '--verbose', action='store_true', dest='verbose', default=False, help='Display debug output') output_opts.add_argument('-P', '--show-property', dest='show_property', default=None, - help='Display the value of a single property (title, series, videoCodec, year, type ...)'), - output_opts.add_argument('-u', '--unidentified', dest='unidentified', action='store_true', default=False, - help='Display the unidentified parts.'), + help='Display the value of a single property (title, series, video_codec, year, ...)') output_opts.add_argument('-a', '--advanced', dest='advanced', action='store_true', default=False, help='Display advanced information for filename guesses, as json output') + output_opts.add_argument('-j', '--json', dest='json', action='store_true', default=False, + help='Display information for filename guesses as json output') output_opts.add_argument('-y', '--yaml', dest='yaml', action='store_true', default=False, - help='Display information for filename guesses as yaml output (like unit-test)') - output_opts.add_argument('-f', '--input-file', dest='input_file', default=False, - help='Read filenames from an input file.') - output_opts.add_argument('-d', '--demo', action='store_true', dest='demo', default=False, - help='Run a few builtin tests instead of analyzing a file') + help='Display information for filename guesses as yaml output') + + information_opts = opts.add_argument_group("Information") information_opts.add_argument('-p', '--properties', dest='properties', action='store_true', default=False, help='Display properties that can be guessed.') information_opts.add_argument('-V', '--values', dest='values', action='store_true', default=False, help='Display property values that can be guessed.') - information_opts.add_argument('-s', '--transformers', dest='transformers', action='store_true', default=False, - help='Display transformers that can be 
used.') information_opts.add_argument('--version', dest='version', action='store_true', default=False, help='Display the guessit version.') - webservice_opts = opts.add_argument_group("guessit.io") - webservice_opts.add_argument('-b', '--bug', action='store_true', dest='submit_bug', default=False, - help='Submit a wrong detection to the guessit.io service') - - other_opts = opts.add_argument_group("Other features") - other_opts.add_argument('-i', '--info', dest='info', default='filename', - help='The desired information type: filename, video, hash_mpc or a hash from python\'s ' - 'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of ' - 'them, comma-separated') - - if transformers: - for transformer in transformers: - transformer.register_arguments(opts, naming_opts, output_opts, information_opts, webservice_opts, other_opts) - - return opts, naming_opts, output_opts, information_opts, webservice_opts, other_opts -_opts, _naming_opts, _output_opts, _information_opts, _webservice_opts, _other_opts = None, None, None, None, None, None + return opts -def reload(transformers=None): - global _opts, _naming_opts, _output_opts, _information_opts, _webservice_opts, _other_opts - _opts, _naming_opts, _output_opts, _information_opts, _webservice_opts, _other_opts = build_opts(transformers) +def parse_options(options): + """ + Parse given option string + :param options: + :type options: + :return: + :rtype: + """ + if isinstance(options, six.string_types): + args = shlex.split(options) + options = vars(argument_parser.parse_args(args)) + if options is None: + options = {} + return options -def get_opts(): - return _opts +argument_parser = build_argument_parser() diff --git a/libs/guessit/patterns/__init__.py b/libs/guessit/patterns/__init__.py deleted file mode 100755 index 1816d494..00000000 --- a/libs/guessit/patterns/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information 
from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import re - -from guessit import base_text_type - -group_delimiters = ['()', '[]', '{}'] - -# separator character regexp -sep = r'[][,)(}:{+ /~/\._-]' # regexp art, hehe :D - -_dash = '-' -_psep = '[\W_]?' - - -def build_or_pattern(patterns, escape=False): - """Build a or pattern string from a list of possible patterns - """ - or_pattern = [] - for pattern in patterns: - if not or_pattern: - or_pattern.append('(?:') - else: - or_pattern.append('|') - or_pattern.append('(?:%s)' % re.escape(pattern) if escape else pattern) - or_pattern.append(')') - return ''.join(or_pattern) - - -def compile_pattern(pattern, enhance=True): - """Compile and enhance a pattern - - :param pattern: Pattern to compile (regexp). - :type pattern: string - - :param pattern: Enhance pattern before compiling. - :type pattern: string - - :return: The compiled pattern - :rtype: regular expression object - """ - return re.compile(enhance_pattern(pattern) if enhance else pattern, re.IGNORECASE) - - -def enhance_pattern(pattern): - """Enhance pattern to match more equivalent values. 
- - '-' are replaced by '[\W_]?', which matches more types of separators (or none) - - :param pattern: Pattern to enhance (regexp). - :type pattern: string - - :return: The enhanced pattern - :rtype: string - """ - return pattern.replace(_dash, _psep) diff --git a/libs/guessit/patterns/extension.py b/libs/guessit/patterns/extension.py deleted file mode 100644 index 40a576b6..00000000 --- a/libs/guessit/patterns/extension.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# Copyright (c) 2013 Rémi Alvergnat -# Copyright (c) 2011 Ricard Marxer -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -subtitle_exts = ['srt', 'idx', 'sub', 'ssa', 'ass'] - -info_exts = ['nfo'] - -video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', - 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', - 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv', - 'iso'] diff --git a/libs/guessit/patterns/numeral.py b/libs/guessit/patterns/numeral.py deleted file mode 100644 index f254c6b8..00000000 --- a/libs/guessit/patterns/numeral.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import re - -digital_numeral = '\d{1,4}' - -roman_numeral = "(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})" - -english_word_numeral_list = [ - 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', - 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty' -] - -french_word_numeral_list = [ - 'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', - 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt' -] - -french_alt_word_numeral_list = [ - 'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', - 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt' -] - - -def __build_word_numeral(*args, **kwargs): - re_ = None - for word_list in args: - for word in word_list: - if not re_: - re_ = '(?:(?=\w+)' - else: - re_ += '|' - re_ += word - re_ += ')' - return re_ - -word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list) - -numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')' - -__romanNumeralMap = ( - ('M', 1000), - ('CM', 900), - ('D', 500), - ('CD', 400), - ('C', 100), - ('XC', 90), - ('L', 50), - ('XL', 40), - ('X', 10), - ('IX', 9), - ('V', 5), - ('IV', 4), - ('I', 1) - ) - -__romanNumeralPattern = re.compile('^' + roman_numeral + '$') - - -def __parse_roman(value): - """convert Roman numeral to integer""" - if not __romanNumeralPattern.search(value): - raise ValueError('Invalid Roman numeral: %s' % value) - - result = 0 - index = 0 - for num, integer in __romanNumeralMap: - while value[index:index + len(num)] == num: - result += integer - index += len(num) - return result - - -def __parse_word(value): - """Convert 
Word numeral to integer""" - for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]: - try: - return word_list.index(value.lower()) - except ValueError: - pass - raise ValueError - - -_clean_re = re.compile('[^\d]*(\d+)[^\d]*') - - -def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True): - """Parse a numeric value into integer. - - input can be an integer as a string, a roman numeral or a word - - :param value: Value to parse. Can be an integer, roman numeral or word. - :type value: string - - :return: Numeric value, or None if value can't be parsed - :rtype: int - """ - if int_enabled: - try: - if clean: - match = _clean_re.match(value) - if match: - clean_value = match.group(1) - return int(clean_value) - return int(value) - except ValueError: - pass - if roman_enabled: - try: - if clean: - for word in value.split(): - try: - return __parse_roman(word.upper()) - except ValueError: - pass - return __parse_roman(value) - except ValueError: - pass - if word_enabled: - try: - if clean: - for word in value.split(): - try: - return __parse_word(word) - except ValueError: - pass - return __parse_word(value) - except ValueError: - pass - raise ValueError('Invalid numeral: ' + value) diff --git a/libs/guessit/plugins/__init__.py b/libs/guessit/plugins/__init__.py deleted file mode 100644 index 6a63e4e1..00000000 --- a/libs/guessit/plugins/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals diff --git a/libs/guessit/plugins/transformers.py b/libs/guessit/plugins/transformers.py deleted file mode 100644 index f2f746c0..00000000 --- a/libs/guessit/plugins/transformers.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals -from guessit.options import reload as reload_options - -from stevedore import ExtensionManager -from pkg_resources import EntryPoint - -from stevedore.extension import Extension -from logging import getLogger - -log = getLogger(__name__) - - -class Transformer(object): # pragma: no cover - def __init__(self, priority=0): - self.priority = priority - self.log = getLogger(self.name) - - @property - def name(self): - return self.__class__.__name__ - - def supported_properties(self): - return {} - - def second_pass_options(self, mtree, options=None): - return None - - def should_process(self, mtree, options=None): - return True - - def process(self, mtree, options=None): - pass - - def post_process(self, mtree, options=None): - pass - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - pass - - def rate_quality(self, guess, *props): - return 0 - - -class CustomTransformerExtensionManager(ExtensionManager): - def __init__(self, namespace='guessit.transformer', invoke_on_load=True, - invoke_args=(), invoke_kwds={}, propagate_map_exceptions=True, on_load_failure_callback=None, - verify_requirements=False): - super(CustomTransformerExtensionManager, self).__init__(namespace=namespace, - invoke_on_load=invoke_on_load, - invoke_args=invoke_args, - invoke_kwds=invoke_kwds, - propagate_map_exceptions=propagate_map_exceptions, - on_load_failure_callback=on_load_failure_callback, - verify_requirements=verify_requirements) - - def order_extensions(self, extensions): - """Order the loaded transformers - - It should follow those rules - - website before language (eg: tvu.org.ru vs russian) - - language before episodes_rexps - - properties before language (eg: he-aac vs hebrew) - - release_group before properties (eg: XviD-?? 
vs xvid) - """ - extensions.sort(key=lambda ext: -ext.obj.priority) - return extensions - - def _load_one_plugin(self, ep, invoke_on_load, invoke_args, invoke_kwds, verify_requirements=True): - if not ep.dist: - # `require` argument of ep.load() is deprecated in newer versions of setuptools - if hasattr(ep, 'resolve'): - plugin = ep.resolve() - elif hasattr(ep, '_load'): - plugin = ep._load() - else: - plugin = ep.load(require=False) - else: - plugin = ep.load() - if invoke_on_load: - obj = plugin(*invoke_args, **invoke_kwds) - else: - obj = None - return Extension(ep.name, ep, plugin, obj) - - def _load_plugins(self, invoke_on_load, invoke_args, invoke_kwds, verify_requirements): - return self.order_extensions(super(CustomTransformerExtensionManager, self)._load_plugins(invoke_on_load, invoke_args, invoke_kwds, verify_requirements)) - - def objects(self): - return self.map(self._get_obj) - - def _get_obj(self, ext): - return ext.obj - - def object(self, name): - try: - return self[name].obj - except KeyError: - return None - - def register_module(self, name=None, module_name=None, attrs=(), entry_point=None): - if entry_point: - ep = EntryPoint.parse(entry_point) - else: - ep = EntryPoint(name, module_name, attrs) - loaded = self._load_one_plugin(ep, invoke_on_load=True, invoke_args=(), invoke_kwds={}) - if loaded: - self.extensions.append(loaded) - self.extensions = self.order_extensions(self.extensions) - self._extensions_by_name = None - - -class DefaultTransformerExtensionManager(CustomTransformerExtensionManager): - @property - def _internal_entry_points(self): - return ['split_path_components = guessit.transfo.split_path_components:SplitPathComponents', - 'guess_filetype = guessit.transfo.guess_filetype:GuessFiletype', - 'split_explicit_groups = guessit.transfo.split_explicit_groups:SplitExplicitGroups', - 'guess_date = guessit.transfo.guess_date:GuessDate', - 'guess_website = guessit.transfo.guess_website:GuessWebsite', - 'guess_release_group = 
guessit.transfo.guess_release_group:GuessReleaseGroup', - 'guess_properties = guessit.transfo.guess_properties:GuessProperties', - 'guess_language = guessit.transfo.guess_language:GuessLanguage', - 'guess_video_rexps = guessit.transfo.guess_video_rexps:GuessVideoRexps', - 'guess_episodes_rexps = guessit.transfo.guess_episodes_rexps:GuessEpisodesRexps', - 'guess_weak_episodes_rexps = guessit.transfo.guess_weak_episodes_rexps:GuessWeakEpisodesRexps', - 'guess_bonus_features = guessit.transfo.guess_bonus_features:GuessBonusFeatures', - 'guess_year = guessit.transfo.guess_year:GuessYear', - 'guess_country = guessit.transfo.guess_country:GuessCountry', - 'guess_idnumber = guessit.transfo.guess_idnumber:GuessIdnumber', - 'split_on_dash = guessit.transfo.split_on_dash:SplitOnDash', - 'guess_episode_info_from_position = guessit.transfo.guess_episode_info_from_position:GuessEpisodeInfoFromPosition', - 'guess_movie_title_from_position = guessit.transfo.guess_movie_title_from_position:GuessMovieTitleFromPosition', - 'guess_episode_details = guessit.transfo.guess_episode_details:GuessEpisodeDetails', - 'expected_series = guessit.transfo.expected_series:ExpectedSeries', - 'expected_title = guessit.transfo.expected_title:ExpectedTitle',] - - def _find_entry_points(self, namespace): - entry_points = {} - # Internal entry points - if namespace == self.namespace: - for internal_entry_point_str in self._internal_entry_points: - internal_entry_point = EntryPoint.parse(internal_entry_point_str) - entry_points[internal_entry_point.name] = internal_entry_point - - # Package entry points - setuptools_entrypoints = super(DefaultTransformerExtensionManager, self)._find_entry_points(namespace) - for setuptools_entrypoint in setuptools_entrypoints: - entry_points[setuptools_entrypoint.name] = setuptools_entrypoint - - return list(entry_points.values()) - -_extensions = None - - -def all_transformers(): - return _extensions.objects() - - -def get_transformer(name): - return 
_extensions.object(name) - - -def add_transformer(name, module_name, class_name): - """ - Add a transformer - - :param name: the name of the transformer. ie: 'guess_regexp_id' - :param name: the module name. ie: 'flexget.utils.parsers.transformers.guess_regexp_id' - :param class_name: the class name. ie: 'GuessRegexpId' - """ - - _extensions.register_module(name, module_name, (class_name,)) - - -def add_transformer(entry_point): - """ - Add a transformer - - :param entry_point: entry point spec format. ie: 'guess_regexp_id = flexget.utils.parsers.transformers.guess_regexp_id:GuessRegexpId' - """ - _extensions.register_module(entry_point = entry_point) - - -def reload(custom=False): - """ - Reload extension manager with default or custom one. - :param custom: if True, custom manager will be used, else default one. - Default manager will load default extensions from guessit and setuptools packaging extensions - Custom manager will not load default extensions from guessit, using only setuptools packaging extensions. - :type custom: boolean - """ - global _extensions - if custom: - _extensions = CustomTransformerExtensionManager() - else: - _extensions = DefaultTransformerExtensionManager() - reload_options(all_transformers()) - -reload() diff --git a/libs/guessit/quality.py b/libs/guessit/quality.py deleted file mode 100644 index 870bbdbb..00000000 --- a/libs/guessit/quality.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import all_transformers - - -def best_quality_properties(props, *guesses): - """Retrieve the best quality guess, based on given properties - - :param props: Properties to include in the rating - :type props: list of strings - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - for transformer in all_transformers(): - rate = transformer.rate_quality(guess, *props) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess - - -def best_quality(*guesses): - """Retrieve the best quality guess. 
- - :param guesses: Guesses to rate - :type guesses: :class:`guessit.guess.Guess` - - :return: Best quality guess from all passed guesses - :rtype: :class:`guessit.guess.Guess` - """ - best_guess = None - best_rate = None - for guess in guesses: - for transformer in all_transformers(): - rate = transformer.rate_quality(guess) - if best_rate is None or best_rate < rate: - best_rate = rate - best_guess = guess - return best_guess diff --git a/libs/guessit/reutils.py b/libs/guessit/reutils.py new file mode 100644 index 00000000..0b654d27 --- /dev/null +++ b/libs/guessit/reutils.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Utils for re module +""" + +from rebulk.remodule import re + + +def build_or_pattern(patterns, name=None, escape=False): + """ + Build a or pattern string from a list of possible patterns + + :param patterns: + :type patterns: + :param name: + :type name: + :param escape: + :type escape: + :return: + :rtype: + """ + or_pattern = [] + for pattern in patterns: + if not or_pattern: + or_pattern.append('(?') + if name: + or_pattern.append('P<' + name + '>') + else: + or_pattern.append(':') + else: + or_pattern.append('|') + or_pattern.append('(?:%s)' % re.escape(pattern) if escape else pattern) + or_pattern.append(')') + return ''.join(or_pattern) diff --git a/libs/guessit/rules/__init__.py b/libs/guessit/rules/__init__.py new file mode 100644 index 00000000..f9dc4557 --- /dev/null +++ b/libs/guessit/rules/__init__.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Rebulk object default builder +""" +from rebulk import Rebulk + +from .markers.path import path +from .markers.groups import groups + +from .properties.episodes import episodes +from .properties.container import container +from .properties.format import format_ +from .properties.video_codec import video_codec +from .properties.audio_codec import audio_codec +from .properties.screen_size import screen_size +from .properties.website import 
website +from .properties.date import date +from .properties.title import title +from .properties.episode_title import episode_title +from .properties.language import language +from .properties.country import country +from .properties.release_group import release_group +from .properties.other import other +from .properties.edition import edition +from .properties.cds import cds +from .properties.bonus import bonus +from .properties.film import film +from .properties.part import part +from .properties.crc import crc +from .properties.mimetype import mimetype +from .properties.type import type_ + +from .processors import processors + + +def rebulk_builder(): + """ + Default builder for main Rebulk object used by api. + :return: Main Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + + rebulk.rebulk(path()) + rebulk.rebulk(groups()) + + rebulk.rebulk(episodes()) + rebulk.rebulk(container()) + rebulk.rebulk(format_()) + rebulk.rebulk(video_codec()) + rebulk.rebulk(audio_codec()) + rebulk.rebulk(screen_size()) + rebulk.rebulk(website()) + rebulk.rebulk(date()) + rebulk.rebulk(title()) + rebulk.rebulk(episode_title()) + rebulk.rebulk(language()) + rebulk.rebulk(country()) + rebulk.rebulk(release_group()) + rebulk.rebulk(other()) + rebulk.rebulk(edition()) + rebulk.rebulk(cds()) + rebulk.rebulk(bonus()) + rebulk.rebulk(film()) + rebulk.rebulk(part()) + rebulk.rebulk(crc()) + + rebulk.rebulk(processors()) + + rebulk.rebulk(mimetype()) + rebulk.rebulk(type_()) + + def customize_properties(properties): + """ + Customize default rebulk properties + """ + count = properties['count'] + del properties['count'] + + properties['season_count'] = count + properties['episode_count'] = count + + return properties + + rebulk.customize_properties = customize_properties + + return rebulk diff --git a/libs/guessit/rules/common/__init__.py b/libs/guessit/rules/common/__init__.py new file mode 100644 index 00000000..e9da2aa0 --- /dev/null +++ b/libs/guessit/rules/common/__init__.py 
@@ -0,0 +1,14 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Common module +""" +import re + +seps = r' [](){}+*|=-_~#/\\.,;:' # list of tags/words separators +seps_no_fs = seps.replace('/', '').replace('\\', '') + +title_seps = r'-+/\|' # separators for title + +dash = (r'-', r'['+re.escape(seps_no_fs)+']') # abbreviation used by many rebulk objects. +alt_dash = (r'@', r'['+re.escape(seps_no_fs)+']') # abbreviation used by many rebulk objects. diff --git a/libs/guessit/rules/common/comparators.py b/libs/guessit/rules/common/comparators.py new file mode 100644 index 00000000..f9db1d3f --- /dev/null +++ b/libs/guessit/rules/common/comparators.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Comparators +""" +try: + from functools import cmp_to_key +except ImportError: + from ...backports import cmp_to_key + + +def marker_comparator_predicate(match): + """ + Match predicate used in comparator + """ + return not match.private and \ + match.name not in ['proper_count', 'title', 'episode_title', 'alternative_title'] and \ + not (match.name == 'container' and 'extension' in match.tags) + + +def marker_weight(matches, marker): + """ + Compute the comparator weight of a marker + :param matches: + :param marker: + :return: + """ + return len(set(match.name for match in matches.range(*marker.span, predicate=marker_comparator_predicate))) + + +def marker_comparator(matches, markers): + """ + Builds a comparator that returns markers sorted from the most valuable to the less. + + Take the parts where matches count is higher, then when length is higher, then when position is at left. + + :param matches: + :type matches: + :return: + :rtype: + """ + def comparator(marker1, marker2): + """ + The actual comparator function. 
+ """ + matches_count = marker_weight(matches, marker2) - marker_weight(matches, marker1) + if matches_count: + return matches_count + len_diff = len(marker2) - len(marker1) + if len_diff: + return len_diff + return markers.index(marker2) - markers.index(marker1) + + return comparator + + +def marker_sorted(markers, matches): + """ + Sort markers from matches, from the most valuable to the less. + + :param fileparts: + :type fileparts: + :param matches: + :type matches: + :return: + :rtype: + """ + return sorted(markers, key=cmp_to_key(marker_comparator(matches, markers))) diff --git a/libs/guessit/rules/common/date.py b/libs/guessit/rules/common/date.py new file mode 100644 index 00000000..779e4b93 --- /dev/null +++ b/libs/guessit/rules/common/date.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Date +""" +from dateutil import parser + +from rebulk.remodule import re + +_dsep = r'[-/ \.]' +_dsep_bis = r'[-/ \.x]' + +date_regexps = [ + re.compile(r'%s((\d{8}))%s' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'%s((\d{6}))%s' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{2})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{2}))(?:$|[^\d])' % (_dsep, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{4})%s(\d{1,2})%s(\d{1,2}))(?:$|[^\d])' % (_dsep_bis, _dsep), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2})%s(\d{1,2})%s(\d{4}))(?:$|[^\d])' % (_dsep, _dsep_bis), re.IGNORECASE), + re.compile(r'(?:^|[^\d])((\d{1,2}(?:st|nd|rd|th)?%s(?:[a-z]{3,10})%s\d{4}))(?:$|[^\d])' % (_dsep, _dsep), + re.IGNORECASE)] + + +def valid_year(year): + """Check if number is a valid year""" + return 1920 <= year < 2030 + + +def _is_int(string): + """ + Check if the input string is an integer + + :param string: + :type string: + :return: + :rtype: + """ + try: + int(string) + return True + except ValueError: + return False + + +def 
_guess_day_first_parameter(groups): + """ + If day_first is not defined, use some heuristic to fix it. + It helps to solve issues with python dateutils 2.5.3 parser changes. + + :param groups: match groups found for the date + :type groups: list of match objects + :return: day_first option guessed value + :rtype: bool + """ + + # If match starts with a long year, then day_first is force to false. + if _is_int(groups[0]) and valid_year(int(groups[0][:4])): + return False + # If match ends with a long year, the day_first is forced to true. + elif _is_int(groups[-1]) and valid_year(int(groups[-1][-4:])): + return True + # If match starts with a short year, then day_first is force to false. + elif _is_int(groups[0]) and int(groups[0][:2]) > 31: + return False + # If match ends with a short year, then day_first is force to true. + elif _is_int(groups[-1]) and int(groups[-1][-2:]) > 31: + return True + + +def search_date(string, year_first=None, day_first=None): + """Looks for date patterns, and if found return the date and group span. + + Assumes there are sentinels at the beginning and end of the string that + always allow matching a non-digit delimiting the date. + + Year can be defined on two digit only. It will return the nearest possible + date from today. + + >>> search_date(' This happened on 2002-04-22. ') + (18, 28, datetime.date(2002, 4, 22)) + + >>> search_date(' And this on 17-06-1998. 
') + (13, 23, datetime.date(1998, 6, 17)) + + >>> search_date(' no date in here ') + """ + start, end = None, None + match = None + groups = None + for date_re in date_regexps: + search_match = date_re.search(string) + if search_match and (match is None or search_match.end() - search_match.start() > len(match)): + start, end = search_match.start(1), search_match.end(1) + groups = search_match.groups()[1:] + match = '-'.join(groups) + + if match is None: + return + + if year_first and day_first is None: + day_first = False + + if day_first is None: + day_first = _guess_day_first_parameter(groups) + + # If day_first/year_first is undefined, parse is made using both possible values. + yearfirst_opts = [False, True] + if year_first is not None: + yearfirst_opts = [year_first] + + dayfirst_opts = [True, False] + if day_first is not None: + dayfirst_opts = [day_first] + + kwargs_list = ({'dayfirst': d, 'yearfirst': y} for d in dayfirst_opts for y in yearfirst_opts) + for kwargs in kwargs_list: + try: + date = parser.parse(match, **kwargs) + except (ValueError, TypeError): # pragma: no cover + # see https://bugs.launchpad.net/dateutil/+bug/1247643 + date = None + + # check date plausibility + if date and valid_year(date.year): # pylint:disable=no-member + return start, end, date.date() # pylint:disable=no-member diff --git a/libs/guessit/rules/common/formatters.py b/libs/guessit/rules/common/formatters.py new file mode 100644 index 00000000..6bd09b15 --- /dev/null +++ b/libs/guessit/rules/common/formatters.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Formatters +""" +from rebulk.formatters import formatters +from rebulk.remodule import re +from . 
import seps + +_excluded_clean_chars = ',:;-/\\' +clean_chars = "" +for sep in seps: + if sep not in _excluded_clean_chars: + clean_chars += sep + + +def _potential_before(i, input_string): + """ + Check if the character at position i can be a potential single char separator considering what's before it. + + :param i: + :type i: int + :param input_string: + :type input_string: str + :return: + :rtype: bool + """ + return i - 2 >= 0 and input_string[i] == input_string[i - 2] and input_string[i - 1] not in seps + + +def _potential_after(i, input_string): + """ + Check if the character at position i can be a potential single char separator considering what's after it. + + :param i: + :type i: int + :param input_string: + :type input_string: str + :return: + :rtype: bool + """ + return i + 2 >= len(input_string) or \ + input_string[i + 2] == input_string[i] and input_string[i + 1] not in seps + + +def cleanup(input_string): + """ + Removes and strip separators from input_string (but keep ',;' characters) + + It also keep separators for single characters (Mavels Agents of S.H.I.E.L.D.) + + :param input_string: + :type input_string: str + :return: + :rtype: + """ + clean_string = input_string + for char in clean_chars: + clean_string = clean_string.replace(char, ' ') + + # Restore input separator if they separate single characters. + # Useful for Mavels Agents of S.H.I.E.L.D. 
+ # https://github.com/guessit-io/guessit/issues/278 + + indices = [i for i, letter in enumerate(clean_string) if letter in seps] + + dots = set() + if indices: + clean_list = list(clean_string) + + potential_indices = [] + + for i in indices: + if _potential_before(i, input_string) and _potential_after(i, input_string): + potential_indices.append(i) + + replace_indices = [] + + for potential_index in potential_indices: + if potential_index - 2 in potential_indices or potential_index + 2 in potential_indices: + replace_indices.append(potential_index) + + if replace_indices: + for replace_index in replace_indices: + dots.add(input_string[replace_index]) + clean_list[replace_index] = input_string[replace_index] + clean_string = ''.join(clean_list) + + clean_string = strip(clean_string, ''.join([c for c in seps if c not in dots])) + + clean_string = re.sub(' +', ' ', clean_string) + return clean_string + + +def strip(input_string, chars=seps): + """ + Strip separators from input_string + :param input_string: + :param chars: + :type input_string: + :return: + :rtype: + """ + return input_string.strip(chars) + + +def raw_cleanup(raw): + """ + Cleanup a raw value to perform raw comparison + :param raw: + :type raw: + :return: + :rtype: + """ + return formatters(cleanup, strip)(raw.lower()) + + +def reorder_title(title, articles=('the',), separators=(',', ', ')): + """ + Reorder the title + :param title: + :type title: + :param articles: + :type articles: + :param separators: + :type separators: + :return: + :rtype: + """ + ltitle = title.lower() + for article in articles: + for separator in separators: + suffix = separator + article + if ltitle[-len(suffix):] == suffix: + return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)] + return title diff --git a/libs/guessit/rules/common/numeral.py b/libs/guessit/rules/common/numeral.py new file mode 100644 index 00000000..7c064fdb --- /dev/null +++ b/libs/guessit/rules/common/numeral.py @@ -0,0 +1,165 @@ 
+#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +parse numeral from various formats +""" +from rebulk.remodule import re + +digital_numeral = r'\d{1,4}' + +roman_numeral = r'(?=[MCDLXVI]+)M{0,4}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})' + +english_word_numeral_list = [ + 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', + 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty' +] + +french_word_numeral_list = [ + 'zéro', 'un', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dix-sept', 'dix-huit', 'dix-neuf', 'vingt' +] + +french_alt_word_numeral_list = [ + 'zero', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix', + 'onze', 'douze', 'treize', 'quatorze', 'quinze', 'seize', 'dixsept', 'dixhuit', 'dixneuf', 'vingt' +] + + +def __build_word_numeral(*args): + """ + Build word numeral regexp from list. 
+ + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + re_ = None + for word_list in args: + for word in word_list: + if not re_: + re_ = r'(?:(?=\w+)' + else: + re_ += '|' + re_ += word + re_ += ')' + return re_ + + +word_numeral = __build_word_numeral(english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list) + +numeral = '(?:' + digital_numeral + '|' + roman_numeral + '|' + word_numeral + ')' + +__romanNumeralMap = ( + ('M', 1000), + ('CM', 900), + ('D', 500), + ('CD', 400), + ('C', 100), + ('XC', 90), + ('L', 50), + ('XL', 40), + ('X', 10), + ('IX', 9), + ('V', 5), + ('IV', 4), + ('I', 1) +) + +__romanNumeralPattern = re.compile('^' + roman_numeral + '$') + + +def __parse_roman(value): + """ + convert Roman numeral to integer + + :param value: Value to parse + :type value: string + :return: + :rtype: + """ + if not __romanNumeralPattern.search(value): + raise ValueError('Invalid Roman numeral: %s' % value) + + result = 0 + index = 0 + for num, integer in __romanNumeralMap: + while value[index:index + len(num)] == num: + result += integer + index += len(num) + return result + + +def __parse_word(value): + """ + Convert Word numeral to integer + + :param value: Value to parse + :type value: string + :return: + :rtype: + """ + for word_list in [english_word_numeral_list, french_word_numeral_list, french_alt_word_numeral_list]: + try: + return word_list.index(value.lower()) + except ValueError: + pass + raise ValueError # pragma: no cover + + +_clean_re = re.compile(r'[^\d]*(\d+)[^\d]*') + + +def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True, clean=True): + """ + Parse a numeric value into integer. + + :param value: Value to parse. Can be an integer, roman numeral or word. 
+ :type value: string + :param int_enabled: + :type int_enabled: + :param roman_enabled: + :type roman_enabled: + :param word_enabled: + :type word_enabled: + :param clean: + :type clean: + :return: Numeric value, or None if value can't be parsed + :rtype: int + """ + # pylint: disable=too-many-branches + if int_enabled: + try: + if clean: + match = _clean_re.match(value) + if match: + clean_value = match.group(1) + return int(clean_value) + return int(value) + except ValueError: + pass + if roman_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_roman(word.upper()) + except ValueError: + pass + return __parse_roman(value) + except ValueError: + pass + if word_enabled: + try: + if clean: + for word in value.split(): + try: + return __parse_word(word) + except ValueError: # pragma: no cover + pass + return __parse_word(value) # pragma: no cover + except ValueError: # pragma: no cover + pass + raise ValueError('Invalid numeral: ' + value) # pragma: no cover diff --git a/libs/guessit/rules/common/validators.py b/libs/guessit/rules/common/validators.py new file mode 100644 index 00000000..0e79b989 --- /dev/null +++ b/libs/guessit/rules/common/validators.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Validators +""" +from functools import partial + +from rebulk.validators import chars_before, chars_after, chars_surround +from . 
import seps + +seps_before = partial(chars_before, seps) +seps_after = partial(chars_after, seps) +seps_surround = partial(chars_surround, seps) + + +def int_coercable(string): + """ + Check if string can be coerced to int + :param string: + :type string: + :return: + :rtype: + """ + try: + int(string) + return True + except ValueError: + return False + + +def compose(*validators): + """ + Compose validators functions + :param validators: + :type validators: + :return: + :rtype: + """ + def composed(string): + """ + Composed validators function + :param string: + :type string: + :return: + :rtype: + """ + for validator in validators: + if not validator(string): + return False + return True + return composed diff --git a/libs/guessit/rules/common/words.py b/libs/guessit/rules/common/words.py new file mode 100644 index 00000000..b73b1eef --- /dev/null +++ b/libs/guessit/rules/common/words.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Words utils +""" +from collections import namedtuple + +from guessit.rules.common import seps + +_Word = namedtuple('_Word', ['span', 'value']) + + +def iter_words(string): + """ + Iterate on all words in a string + :param string: + :type string: + :return: + :rtype: iterable[str] + """ + i = 0 + last_sep_index = -1 + inside_word = False + for char in string: + if ord(char) < 128 and char in seps: # Make sure we don't exclude unicode characters. 
+ if inside_word: + yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i]) + inside_word = False + last_sep_index = i + else: + inside_word = True + i += 1 + if inside_word: + yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i]) + + +# list of common words which could be interpreted as properties, but which +# are far too common to be able to say they represent a property in the +# middle of a string (where they most likely carry their commmon meaning) +COMMON_WORDS = frozenset([ + # english words + 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', + 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', + 'fry', 'cop', 'zen', 'gay', 'fat', 'one', 'cherokee', 'got', 'an', 'as', + 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi', 'bb', + 'bt', 'tv', 'aw', 'by', 'md', 'mp', 'cd', 'lt', 'gt', 'in', 'ad', 'ice', + 'ay', 'at', 'star', 'so', 'he', 'do', 'ax', 'mx', + # french words + 'bas', 'de', 'le', 'son', 'ne', 'ca', 'ce', 'et', 'que', + 'mal', 'est', 'vol', 'or', 'mon', 'se', 'je', 'tu', 'me', + 'ne', 'ma', 'va', 'au', 'lu', + # japanese words, + 'wa', 'ga', 'ao', + # spanish words + 'la', 'el', 'del', 'por', 'mar', 'al', + # other + 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', + 'vi', 'ben', 'da', 'lt', 'ch', 'sr', 'ps', 'cx', 'vo', + # new from babelfish + 'mkv', 'avi', 'dmd', 'the', 'dis', 'cut', 'stv', 'des', 'dia', 'and', + 'cab', 'sub', 'mia', 'rim', 'las', 'une', 'par', 'srt', 'ano', 'toy', + 'job', 'gag', 'reel', 'www', 'for', 'ayu', 'csi', 'ren', 'moi', 'sur', + 'fer', 'fun', 'two', 'big', 'psy', 'air', + # movie title + 'brazil', 'jordan', + # release groups + 'bs', # Bosnian + 'kz', + # countries + 'gt', 'lt', 'im', + # part/pt + 'pt', + # screener + 'scr', + # quality + 'sd', 'hr' +]) diff --git a/libs/guessit/rules/markers/__init__.py b/libs/guessit/rules/markers/__init__.py new file mode 100644 index 00000000..6a48a13b --- /dev/null +++ 
b/libs/guessit/rules/markers/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Markers +""" diff --git a/libs/guessit/rules/markers/groups.py b/libs/guessit/rules/markers/groups.py new file mode 100644 index 00000000..bbe69d1c --- /dev/null +++ b/libs/guessit/rules/markers/groups.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Groups markers (...), [...] and {...} +""" +from rebulk import Rebulk + + +def groups(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + rebulk.defaults(name="group", marker=True) + + starting = '([{' + ending = ')]}' + + def mark_groups(input_string): + """ + Functional pattern to mark groups (...), [...] and {...}. + + :param input_string: + :return: + """ + openings = ([], [], []) + i = 0 + + ret = [] + for char in input_string: + start_type = starting.find(char) + if start_type > -1: + openings[start_type].append(i) + + i += 1 + + end_type = ending.find(char) + if end_type > -1: + try: + start_index = openings[end_type].pop() + ret.append((start_index, i)) + except IndexError: + pass + return ret + + rebulk.functional(mark_groups) + return rebulk diff --git a/libs/guessit/rules/markers/path.py b/libs/guessit/rules/markers/path.py new file mode 100644 index 00000000..5e487ea6 --- /dev/null +++ b/libs/guessit/rules/markers/path.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Path markers +""" +from rebulk import Rebulk + +from rebulk.utils import find_all + + +def path(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk() + rebulk.defaults(name="path", marker=True) + + def mark_path(input_string, context): + """ + Functional pattern to mark path elements. 
+ + :param input_string: + :return: + """ + ret = [] + if context.get('name_only', False): + ret.append((0, len(input_string))) + else: + indices = list(find_all(input_string, '/')) + indices += list(find_all(input_string, '\\')) + indices += [-1, len(input_string)] + + indices.sort() + + for i in range(0, len(indices) - 1): + ret.append((indices[i] + 1, indices[i + 1])) + + return ret + + rebulk.functional(mark_path) + return rebulk diff --git a/libs/guessit/rules/processors.py b/libs/guessit/rules/processors.py new file mode 100644 index 00000000..3480a9d1 --- /dev/null +++ b/libs/guessit/rules/processors.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Processors +""" +from collections import defaultdict +import copy + +import six + +from rebulk import Rebulk, Rule, CustomRule, POST_PROCESS, PRE_PROCESS, AppendMatch, RemoveMatch +from guessit.rules.common.words import iter_words +from .common.formatters import cleanup +from .common.comparators import marker_sorted +from .common.date import valid_year + + +class EnlargeGroupMatches(CustomRule): + """ + Enlarge matches that are starting and/or ending group to include brackets in their span. 
+ :param matches: + :type matches: + :return: + :rtype: + """ + priority = PRE_PROCESS + + def when(self, matches, context): + starting = [] + ending = [] + + for group in matches.markers.named('group'): + for match in matches.starting(group.start + 1): + starting.append(match) + + for match in matches.ending(group.end - 1): + ending.append(match) + + if starting or ending: + return starting, ending + + def then(self, matches, when_response, context): + starting, ending = when_response + for match in starting: + matches.remove(match) + match.start -= 1 + match.raw_start += 1 + matches.append(match) + + for match in ending: + matches.remove(match) + match.end += 1 + match.raw_end -= 1 + matches.append(match) + + +class EquivalentHoles(Rule): + """ + Creates equivalent matches for holes that have same values than existing (case insensitive) + """ + priority = POST_PROCESS + consequence = AppendMatch + + def when(self, matches, context): + new_matches = [] + + for filepath in marker_sorted(matches.markers.named('path'), matches): + holes = matches.holes(start=filepath.start, end=filepath.end, formatter=cleanup) + for name in matches.names: + for hole in list(holes): + for current_match in matches.named(name): + if isinstance(current_match.value, six.string_types) and \ + hole.value.lower() == current_match.value.lower(): + if 'equivalent-ignore' in current_match.tags: + continue + new_value = _preferred_string(hole.value, current_match.value) + if hole.value != new_value: + hole.value = new_value + if current_match.value != new_value: + current_match.value = new_value + hole.name = name + hole.tags = ['equivalent'] + new_matches.append(hole) + if hole in holes: + holes.remove(hole) + + return new_matches + + +class RemoveAmbiguous(Rule): + """ + If multiple match are found with same name and different values, keep the one in the most valuable filepart. + Also keep others match with same name and values than those kept ones. 
+ """ + priority = POST_PROCESS + consequence = RemoveMatch + + def when(self, matches, context): + fileparts = marker_sorted(matches.markers.named('path'), matches) + + previous_fileparts_names = set() + values = defaultdict(list) + + to_remove = [] + for filepart in fileparts: + filepart_matches = matches.range(filepart.start, filepart.end) + + filepart_names = set() + for match in filepart_matches: + filepart_names.add(match.name) + if match.name in previous_fileparts_names: + if match.value not in values[match.name]: + to_remove.append(match) + else: + if match.value not in values[match.name]: + values[match.name].append(match.value) + + previous_fileparts_names.update(filepart_names) + + return to_remove + + +def _preferred_string(value1, value2): # pylint:disable=too-many-return-statements + """ + Retrieves preferred title from both values. + :param value1: + :type value1: str + :param value2: + :type value2: str + :return: The preferred title + :rtype: str + """ + if value1 == value2: + return value1 + if value1.istitle() and not value2.istitle(): + return value1 + if not value1.isupper() and value2.isupper(): + return value1 + if not value1.isupper() and value1[0].isupper() and not value2[0].isupper(): + return value1 + if _count_title_words(value1) > _count_title_words(value2): + return value1 + return value2 + + +def _count_title_words(value): + """ + Count only many words are titles in value. + :param value: + :type value: + :return: + :rtype: + """ + ret = 0 + for word in iter_words(value): + if word.value.istitle(): + ret += 1 + return ret + + +class SeasonYear(Rule): + """ + If a season is a valid year and no year was found, create an match with year. 
+ """ + priority = POST_PROCESS + consequence = AppendMatch + + def when(self, matches, context): + ret = [] + if not matches.named('year'): + for season in matches.named('season'): + if valid_year(season.value): + year = copy.copy(season) + year.name = 'year' + ret.append(year) + return ret + + +class Processors(CustomRule): + """ + Empty rule for ordering post_processing properly. + """ + priority = POST_PROCESS + + def when(self, matches, context): + pass + + def then(self, matches, when_response, context): # pragma: no cover + pass + + +def processors(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + return Rebulk().rules(EnlargeGroupMatches, EquivalentHoles, RemoveAmbiguous, SeasonYear, Processors) diff --git a/libs/guessit/rules/properties/__init__.py b/libs/guessit/rules/properties/__init__.py new file mode 100644 index 00000000..e0a24eaf --- /dev/null +++ b/libs/guessit/rules/properties/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Properties +""" diff --git a/libs/guessit/rules/properties/audio_codec.py b/libs/guessit/rules/properties/audio_codec.py new file mode 100644 index 00000000..c88a6e7e --- /dev/null +++ b/libs/guessit/rules/properties/audio_codec.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +audio_codec, audio_profile and audio_channels property +""" +from rebulk.remodule import re + +from rebulk import Rebulk, Rule, RemoveMatch +from ..common import dash +from ..common.validators import seps_before, seps_after + +audio_properties = ['audio_codec', 'audio_profile', 'audio_channels'] + + +def audio_codec(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True) + + def audio_codec_priority(match1, match2): + """ + Gives priority to audio_codec + :param match1: + :type match1: + :param match2: + :type match2: + :return: + :rtype: + """ + if match1.name == 'audio_codec' and match2.name in ['audio_profile', 'audio_channels']: + return match2 + if match1.name in ['audio_profile', 'audio_channels'] and match2.name == 'audio_codec': + return match1 + return '__default__' + + rebulk.defaults(name="audio_codec", conflict_solver=audio_codec_priority) + + rebulk.regex("MP3", "LAME", r"LAME(?:\d)+-?(?:\d)+", value="MP3") + rebulk.regex("Dolby", "DolbyDigital", "Dolby-Digital", "DD", value="DolbyDigital") + rebulk.regex("DolbyAtmos", "Dolby-Atmos", "Atmos", value="DolbyAtmos") + rebulk.regex("AAC", value="AAC") + rebulk.regex("AC3D?", value="AC3") + rebulk.regex("Flac", value="FLAC") + rebulk.regex("DTS", value="DTS") + rebulk.regex("True-?HD", value="TrueHD") + + rebulk.defaults(name="audio_profile") + rebulk.string("HD", value="HD", tags="DTS") + rebulk.regex("HD-?MA", value="HDMA", tags="DTS") + rebulk.string("HE", value="HE", tags="AAC") + rebulk.string("LC", value="LC", tags="AAC") + rebulk.string("HQ", value="HQ", tags="AC3") + + rebulk.defaults(name="audio_channels") + rebulk.regex(r'(7[\W_][01](?:ch)?)(?:[^\d]|$)', value='7.1', children=True) + rebulk.regex(r'(5[\W_][01](?:ch)?)(?:[^\d]|$)', value='5.1', children=True) + rebulk.regex(r'(2[\W_]0(?:ch)?)(?:[^\d]|$)', value='2.0', children=True) + rebulk.string('7ch', '8ch', value='7.1') + rebulk.string('5ch', '6ch', value='5.1') + rebulk.string('2ch', 'stereo', value='2.0') + rebulk.string('1ch', 'mono', value='1.0') + + rebulk.rules(DtsRule, AacRule, Ac3Rule, AudioValidatorRule, HqConflictRule) + + return rebulk + + +class AudioValidatorRule(Rule): + """ + Remove audio properties if not surrounded by separators 
and not next each others + """ + priority = 64 + consequence = RemoveMatch + + def when(self, matches, context): + ret = [] + + audio_list = matches.range(predicate=lambda match: match.name in audio_properties) + for audio in audio_list: + if not seps_before(audio): + valid_before = matches.range(audio.start - 1, audio.start, + lambda match: match.name in audio_properties) + if not valid_before: + ret.append(audio) + continue + if not seps_after(audio): + valid_after = matches.range(audio.end, audio.end + 1, + lambda match: match.name in audio_properties) + if not valid_after: + ret.append(audio) + continue + + return ret + + +class AudioProfileRule(Rule): + """ + Abstract rule to validate audio profiles + """ + priority = 64 + dependency = AudioValidatorRule + consequence = RemoveMatch + + def __init__(self, codec): + super(AudioProfileRule, self).__init__() + self.codec = codec + + def when(self, matches, context): + profile_list = matches.named('audio_profile', lambda match: self.codec in match.tags) + ret = [] + for profile in profile_list: + codec = matches.previous(profile, lambda match: match.name == 'audio_codec' and match.value == self.codec) + if not codec: + codec = matches.next(profile, lambda match: match.name == 'audio_codec' and match.value == self.codec) + if not codec: + ret.append(profile) + return ret + + +class DtsRule(AudioProfileRule): + """ + Rule to validate DTS profile + """ + + def __init__(self): + super(DtsRule, self).__init__("DTS") + + +class AacRule(AudioProfileRule): + """ + Rule to validate AAC profile + """ + + def __init__(self): + super(AacRule, self).__init__("AAC") + + +class Ac3Rule(AudioProfileRule): + """ + Rule to validate AC3 profile + """ + + def __init__(self): + super(Ac3Rule, self).__init__("AC3") + + +class HqConflictRule(Rule): + """ + Solve conflict between HQ from other property and from audio_profile. 
+ """ + + dependency = [DtsRule, AacRule, Ac3Rule] + consequence = RemoveMatch + + def when(self, matches, context): + hq_audio = matches.named('audio_profile', lambda match: match.value == 'HQ') + hq_audio_spans = [match.span for match in hq_audio] + hq_other = matches.named('other', lambda match: match.span in hq_audio_spans) + + if hq_other: + return hq_other diff --git a/libs/guessit/rules/properties/bonus.py b/libs/guessit/rules/properties/bonus.py new file mode 100644 index 00000000..e37613e9 --- /dev/null +++ b/libs/guessit/rules/properties/bonus.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +bonus property +""" +from rebulk.remodule import re + +from rebulk import Rebulk, AppendMatch, Rule + +from .title import TitleFromPosition +from ..common.formatters import cleanup +from ..common.validators import seps_surround + + +def bonus(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE) + + rebulk.regex(r'x(\d+)', name='bonus', private_parent=True, children=True, formatter=int, + validator={'__parent__': lambda match: seps_surround}, + conflict_solver=lambda match, conflicting: match + if conflicting.name in ['video_codec', 'episode'] and 'bonus-conflict' not in conflicting.tags + else '__default__') + + rebulk.rules(BonusTitleRule) + + return rebulk + + +class BonusTitleRule(Rule): + """ + Find bonus title after bonus. 
+ """ + dependency = TitleFromPosition + consequence = AppendMatch + + properties = {'bonus_title': [None]} + + def when(self, matches, context): + bonus_number = matches.named('bonus', lambda match: not match.private, index=0) + if bonus_number: + filepath = matches.markers.at_match(bonus_number, lambda marker: marker.name == 'path', 0) + hole = matches.holes(bonus_number.end, filepath.end + 1, formatter=cleanup, index=0) + if hole and hole.value: + hole.name = 'bonus_title' + return hole diff --git a/libs/guessit/rules/properties/cds.py b/libs/guessit/rules/properties/cds.py new file mode 100644 index 00000000..db1407d6 --- /dev/null +++ b/libs/guessit/rules/properties/cds.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +cd and cd_count properties +""" +from rebulk.remodule import re + +from rebulk import Rebulk +from ..common import dash + + +def cds(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]) + + rebulk.regex(r'cd-?(?P\d+)(?:-?of-?(?P\d+))?', + validator={'cd': lambda match: 0 < match.value < 100, + 'cd_count': lambda match: 0 < match.value < 100}, + formatter={'cd': int, 'cd_count': int}, + children=True, + private_parent=True, + properties={'cd': [None], 'cd_count': [None]}) + rebulk.regex(r'(?P\d+)-?cds?', + validator={'cd': lambda match: 0 < match.value < 100, + 'cd_count': lambda match: 0 < match.value < 100}, + formatter={'cd_count': int}, + children=True, + private_parent=True, + properties={'cd': [None], 'cd_count': [None]}) + + return rebulk diff --git a/libs/guessit/rules/properties/container.py b/libs/guessit/rules/properties/container.py new file mode 100644 index 00000000..747a3ebc --- /dev/null +++ b/libs/guessit/rules/properties/container.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +container property +""" +from rebulk.remodule import re + +from rebulk import Rebulk 
+from ..common.validators import seps_surround +from ...reutils import build_or_pattern + + +def container(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True) + rebulk.defaults(name='container', + formatter=lambda value: value[1:], + tags=['extension'], + conflict_solver=lambda match, other: other + if other.name in ['format', 'video_codec'] or + other.name == 'container' and 'extension' not in other.tags + else '__default__') + + subtitles = ['srt', 'idx', 'sub', 'ssa', 'ass'] + info = ['nfo'] + videos = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', + 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', + 'ogv', 'qt', 'ra', 'ram', 'rm', 'ts', 'wav', 'webm', 'wma', 'wmv', + 'iso', 'vob'] + torrent = ['torrent'] + + rebulk.regex(r'\.'+build_or_pattern(subtitles)+'$', exts=subtitles, tags=['extension', 'subtitle']) + rebulk.regex(r'\.'+build_or_pattern(info)+'$', exts=info, tags=['extension', 'info']) + rebulk.regex(r'\.'+build_or_pattern(videos)+'$', exts=videos, tags=['extension', 'video']) + rebulk.regex(r'\.'+build_or_pattern(torrent)+'$', exts=torrent, tags=['extension', 'torrent']) + + rebulk.defaults(name='container', + validator=seps_surround, + formatter=lambda s: s.upper(), + conflict_solver=lambda match, other: match + if other.name in ['format', + 'video_codec'] or other.name == 'container' and 'extension' in other.tags + else '__default__') + + rebulk.string(*[sub for sub in subtitles if sub not in ['sub']], tags=['subtitle']) + rebulk.string(*videos, tags=['video']) + rebulk.string(*torrent, tags=['torrent']) + + return rebulk diff --git a/libs/guessit/rules/properties/country.py b/libs/guessit/rules/properties/country.py new file mode 100644 index 00000000..8f03b498 --- /dev/null +++ b/libs/guessit/rules/properties/country.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- 
+""" +country property +""" +# pylint: disable=no-member +import babelfish + +from rebulk import Rebulk +from ..common.words import COMMON_WORDS, iter_words + + +def country(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().defaults(name='country') + + rebulk.functional(find_countries, + #  Prefer language and any other property over country if not US or GB. + conflict_solver=lambda match, other: match + if other.name != 'language' or match.value not in [babelfish.Country('US'), + babelfish.Country('GB')] + else other, + properties={'country': [None]}) + + return rebulk + + +COUNTRIES_SYN = {'ES': ['españa'], + 'GB': ['UK'], + 'BR': ['brazilian', 'bra'], + 'CA': ['québec', 'quebec', 'qc'], + # FIXME: this one is a bit of a stretch, not sure how to do it properly, though... + 'MX': ['Latinoamérica', 'latin america']} + + +class GuessitCountryConverter(babelfish.CountryReverseConverter): # pylint: disable=missing-docstring + def __init__(self): + self.guessit_exceptions = {} + + for alpha2, synlist in COUNTRIES_SYN.items(): + for syn in synlist: + self.guessit_exceptions[syn.lower()] = alpha2 + + @property + def codes(self): # pylint: disable=missing-docstring + return (babelfish.country_converters['name'].codes | + frozenset(babelfish.COUNTRIES.values()) | + frozenset(self.guessit_exceptions.keys())) + + def convert(self, alpha2): + if alpha2 == 'GB': + return 'UK' + return str(babelfish.Country(alpha2)) + + def reverse(self, name): + # exceptions come first, as they need to override a potential match + # with any of the other guessers + try: + return self.guessit_exceptions[name.lower()] + except KeyError: + pass + + try: + return babelfish.Country(name.upper()).alpha2 + except ValueError: + pass + + for conv in [babelfish.Country.fromname]: + try: + return conv(name).alpha2 + except babelfish.CountryReverseError: + pass + + raise babelfish.CountryReverseError(name) + + 
+babelfish.country_converters['guessit'] = GuessitCountryConverter() + + +def is_allowed_country(country_object, context=None): + """ + Check if country is allowed. + """ + if context and context.get('allowed_countries'): + allowed_countries = context.get('allowed_countries') + return country_object.name.lower() in allowed_countries or country_object.alpha2.lower() in allowed_countries + return True + + +def find_countries(string, context=None): + """ + Find countries in given string. + """ + ret = [] + for word_match in iter_words(string.strip().lower()): + word = word_match.value + if word.lower() in COMMON_WORDS: + continue + try: + country_object = babelfish.Country.fromguessit(word) + if is_allowed_country(country_object, context): + ret.append((word_match.span[0], word_match.span[1], {'value': country_object})) + except babelfish.Error: + continue + return ret diff --git a/libs/guessit/rules/properties/crc.py b/libs/guessit/rules/properties/crc.py new file mode 100644 index 00000000..f655bc13 --- /dev/null +++ b/libs/guessit/rules/properties/crc.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +crc and uuid properties +""" +from rebulk.remodule import re + +from rebulk import Rebulk +from ..common.validators import seps_surround + + +def crc(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(validator=seps_surround) + + rebulk.regex('(?:[a-fA-F]|[0-9]){8}', name='crc32', + conflict_solver=lambda match, other: match + if other.name in ['episode', 'season'] + else '__default__') + + rebulk.functional(guess_idnumber, name='uuid', + conflict_solver=lambda match, other: match + if other.name in ['episode', 'season'] + else '__default__') + return rebulk + + +_DIGIT = 0 +_LETTER = 1 +_OTHER = 2 + +_idnum = re.compile(r'(?P[a-zA-Z0-9-]{20,})') # 1.0, (0, 0)) + + +def guess_idnumber(string): + """ + Guess id number function + :param string: + :type string: + :return: + :rtype: + """ + # pylint:disable=invalid-name + ret = [] + + matches = list(_idnum.finditer(string)) + for match in matches: + result = match.groupdict() + switch_count = 0 + switch_letter_count = 0 + letter_count = 0 + last_letter = None + + last = _LETTER + for c in result['uuid']: + if c in '0123456789': + ci = _DIGIT + elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + ci = _LETTER + if c != last_letter: + switch_letter_count += 1 + last_letter = c + letter_count += 1 + else: + ci = _OTHER + + if ci != last: + switch_count += 1 + + last = ci + + # only return the result as probable if we alternate often between + # char type (more likely for hash values than for common words) + switch_ratio = float(switch_count) / len(result['uuid']) + letters_ratio = (float(switch_letter_count) / letter_count) if letter_count > 0 else 1 + + if switch_ratio > 0.4 and letters_ratio > 0.4: + ret.append(match.span()) + + return ret diff --git a/libs/guessit/rules/properties/date.py b/libs/guessit/rules/properties/date.py new file mode 100644 index 00000000..0b6083bd --- /dev/null +++ b/libs/guessit/rules/properties/date.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +date and year properties +""" +from rebulk import Rebulk, RemoveMatch, Rule 
+ +from ..common.date import search_date, valid_year +from ..common.validators import seps_surround + + +def date(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().defaults(validator=seps_surround) + + rebulk.regex(r"\d{4}", name="year", formatter=int, + validator=lambda match: seps_surround(match) and valid_year(match.value)) + + def date_functional(string, context): + """ + Search for date in the string and retrieves match + + :param string: + :return: + """ + + ret = search_date(string, context.get('date_year_first'), context.get('date_day_first')) + if ret: + return ret[0], ret[1], {'value': ret[2]} + + rebulk.functional(date_functional, name="date", properties={'date': [None]}, + conflict_solver=lambda match, other: other + if other.name in ['episode', 'season'] + else '__default__') + + rebulk.rules(KeepMarkedYearInFilepart) + + return rebulk + + +class KeepMarkedYearInFilepart(Rule): + """ + Keep first years marked with [](){} in filepart, or if no year is marked, ensure it won't override titles. + """ + priority = 64 + consequence = RemoveMatch + + def when(self, matches, context): + ret = [] + if len(matches.named('year')) > 1: + for filepart in matches.markers.named('path'): + years = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year') + if len(years) > 1: + group_years = [] + ungroup_years = [] + for year in years: + if matches.markers.at_match(year, lambda marker: marker.name == 'group'): + group_years.append(year) + else: + ungroup_years.append(year) + if group_years and ungroup_years: + ret.extend(ungroup_years) + ret.extend(group_years[1:]) # Keep the first year in marker. + elif not group_years: + ret.append(ungroup_years[0]) # Keep first year for title. 
+ if len(ungroup_years) > 2: + ret.extend(ungroup_years[2:]) + return ret diff --git a/libs/guessit/rules/properties/edition.py b/libs/guessit/rules/properties/edition.py new file mode 100644 index 00000000..429ba8d3 --- /dev/null +++ b/libs/guessit/rules/properties/edition.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +edition property +""" +from rebulk.remodule import re + +from rebulk import Rebulk +from ..common import dash +from ..common.validators import seps_surround + + +def edition(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True) + rebulk.defaults(name='edition', validator=seps_surround) + + rebulk.regex('collector', 'collector-edition', 'edition-collector', value='Collector Edition') + rebulk.regex('special-edition', 'edition-special', value='Special Edition', + conflict_solver=lambda match, other: other + if other.name == 'episode_details' and other.value == 'Special' + else '__default__') + rebulk.regex('criterion-edition', 'edition-criterion', value='Criterion Edition') + rebulk.regex('deluxe', 'deluxe-edition', 'edition-deluxe', value='Deluxe Edition') + rebulk.regex('director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', value='Director\'s cut') + + return rebulk diff --git a/libs/guessit/rules/properties/episode_title.py b/libs/guessit/rules/properties/episode_title.py new file mode 100644 index 00000000..9d6e4abf --- /dev/null +++ b/libs/guessit/rules/properties/episode_title.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Episode title +""" +from collections import defaultdict + +from rebulk import Rebulk, Rule, AppendMatch, RenameMatch +from ..common import seps, title_seps +from ..properties.title import TitleFromPosition, TitleBaseRule +from ..common.formatters import cleanup + + +def episode_title(): + """ + 
Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().rules(EpisodeTitleFromPosition, + AlternativeTitleReplace, + TitleToEpisodeTitle, + Filepart3EpisodeTitle, + Filepart2EpisodeTitle) + return rebulk + + +class TitleToEpisodeTitle(Rule): + """ + If multiple different title are found, convert the one following episode number to episode_title. + """ + dependency = TitleFromPosition + + def when(self, matches, context): + titles = matches.named('title') + + if len(titles) < 2: + return + + title_groups = defaultdict(list) + for title in titles: + title_groups[title.value].append(title) + + episode_titles = [] + main_titles = [] + for title in titles: + if matches.previous(title, lambda match: match.name == 'episode'): + episode_titles.append(title) + else: + main_titles.append(title) + + if episode_titles: + return episode_titles + + def then(self, matches, when_response, context): + for title in when_response: + matches.remove(title) + title.name = 'episode_title' + matches.append(title) + + +class EpisodeTitleFromPosition(TitleBaseRule): + """ + Add episode title match in existing matches + Must run after TitleFromPosition rule. + """ + dependency = TitleToEpisodeTitle + + def hole_filter(self, hole, matches): + episode = matches.previous(hole, + lambda previous: any(name in previous.names + for name in ['episode', 'episode_details', + 'episode_count', 'season', 'season_count', + 'date', 'title', 'year']), + 0) + + crc32 = matches.named('crc32') + + return episode or crc32 + + def filepart_filter(self, filepart, matches): + # Filepart where title was found. 
+ if matches.range(filepart.start, filepart.end, lambda match: match.name == 'title'): + return True + return False + + def should_remove(self, match, matches, filepart, hole, context): + if match.name == 'episode_details': + return False + return super(EpisodeTitleFromPosition, self).should_remove(match, matches, filepart, hole, context) + + def __init__(self): + super(EpisodeTitleFromPosition, self).__init__('episode_title', ['title']) + + def when(self, matches, context): + if matches.named('episode_title'): + return + return super(EpisodeTitleFromPosition, self).when(matches, context) + + +class AlternativeTitleReplace(Rule): + """ + If alternateTitle was found and title is next to episode, season or date, replace it with episode_title. + """ + dependency = EpisodeTitleFromPosition + consequence = RenameMatch + + def when(self, matches, context): + if matches.named('episode_title'): + return + + alternative_title = matches.range(predicate=lambda match: match.name == 'alternative_title', index=0) + if alternative_title: + main_title = matches.chain_before(alternative_title.start, seps=seps, + predicate=lambda match: 'title' in match.tags, index=0) + if main_title: + episode = matches.previous(main_title, + lambda previous: any(name in previous.names + for name in ['episode', 'episode_details', + 'episode_count', 'season', + 'season_count', + 'date', 'title', 'year']), + 0) + + crc32 = matches.named('crc32') + + if episode or crc32: + return alternative_title + + def then(self, matches, when_response, context): + matches.remove(when_response) + when_response.name = 'episode_title' + matches.append(when_response) + + +class Filepart3EpisodeTitle(Rule): + """ + If we have at least 3 filepart structured like this: + + Serie name/SO1/E01-episode_title.mkv + AAAAAAAAAA/BBB/CCCCCCCCCCCCCCCCCCCC + + If CCCC contains episode and BBB contains seasonNumber + Then title is to be found in AAAA. 
+ """ + consequence = AppendMatch('title') + + def when(self, matches, context): + fileparts = matches.markers.named('path') + if len(fileparts) < 3: + return + + filename = fileparts[-1] + directory = fileparts[-2] + subdirectory = fileparts[-3] + + episode_number = matches.range(filename.start, filename.end, lambda match: match.name == 'episode', 0) + if episode_number: + season = matches.range(directory.start, directory.end, lambda match: match.name == 'season', 0) + + if season: + hole = matches.holes(subdirectory.start, subdirectory.end, + formatter=cleanup, seps=title_seps, predicate=lambda match: match.value, + index=0) + if hole: + return hole + + +class Filepart2EpisodeTitle(Rule): + """ + If we have at least 2 filepart structured like this: + + Serie name SO1/E01-episode_title.mkv + AAAAAAAAAAAAA/BBBBBBBBBBBBBBBBBBBBB + + If BBBB contains episode and AAA contains a hole followed by seasonNumber + Then title is to be found in AAAA. + """ + consequence = AppendMatch('title') + + def when(self, matches, context): + fileparts = matches.markers.named('path') + if len(fileparts) < 2: + return + + filename = fileparts[-1] + directory = fileparts[-2] + + episode_number = matches.range(filename.start, filename.end, lambda match: match.name == 'episode', 0) + if episode_number: + season = matches.range(directory.start, directory.end, lambda match: match.name == 'season', 0) + if season: + hole = matches.holes(directory.start, directory.end, formatter=cleanup, seps=title_seps, + predicate=lambda match: match.value, index=0) + if hole: + return hole diff --git a/libs/guessit/rules/properties/episodes.py b/libs/guessit/rules/properties/episodes.py new file mode 100644 index 00000000..65722835 --- /dev/null +++ b/libs/guessit/rules/properties/episodes.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +episode, season, episode_count, season_count and episode_details properties +""" +import copy +from collections import defaultdict + +from rebulk 
def episodes():
    """
    Builder for rebulk object.

    Registers every pattern that produces ``season``, ``episode``, ``count``,
    ``version``, ``episode_details`` and ``episode_format`` matches, plus the
    post-processing rules declared further down in this module.

    Every regex below relies on *named* groups: the group name is what becomes
    the child match name, and it is the key used by the ``formatter`` and
    ``validator`` dictionaries and by ``private_names``.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    # pylint: disable=too-many-branches,too-many-statements,too-many-locals
    rebulk = Rebulk()
    rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
    rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'])

    def season_episode_conflict_solver(match, other):
        """
        Conflict solver for episode/season patterns

        :param match: season/episode match being examined
        :param other: the conflicting match
        :return: the match to drop, or ``'__default__'`` to defer to the default solver
        """
        if match.name in ['season', 'episode'] and other.name in ['screen_size', 'video_codec',
                                                                  'audio_codec', 'audio_channels',
                                                                  'container', 'date']:
            return match
        elif match.name in ['season', 'episode'] and other.name in ['season', 'episode'] \
                and match.initiator != other.initiator:
            if 'weak-episode' in match.tags:
                return match
            if 'weak-episode' in other.tags:
                return other
            if 'x' in match.initiator.raw.lower():
                return match
            if 'x' in other.initiator.raw.lower():
                return other
        return '__default__'

    season_episode_seps = []
    season_episode_seps.extend(seps)
    season_episode_seps.extend(['x', 'X', 'e', 'E'])

    season_words = ['season', 'saison', 'serie', 'seasons', 'saisons', 'series']
    episode_words = ['episode', 'episodes', 'ep']
    of_words = ['of', 'sur']
    all_words = ['All']
    season_markers = ["S"]
    season_ep_markers = ["x"]
    episode_markers = ["xE", "Ex", "EP", "E", "x"]
    range_separators = ['-', '~', 'to', 'a']
    weak_discrete_separators = list(sep for sep in seps if sep not in range_separators)
    strong_discrete_separators = ['+', '&', 'and', 'et']
    discrete_separators = strong_discrete_separators + weak_discrete_separators

    def ordering_validator(match):
        """
        Validator for season list. They should be in natural order to be validated.

        episode/season separated by a weak discrete separator should be consecutive, unless a strong discrete separator
        or a range separator is present in the chain (1.3&5 is valid, but 1.3-5 is not valid and 1.3.5 is not valid)
        """
        values = match.children.to_dict(implicit=True)
        if 'season' in values and is_iterable(values['season']):
            # Season numbers must be in natural order to be validated.
            if not list(sorted(values['season'])) == values['season']:
                return False
        if 'episode' in values and is_iterable(values['episode']):
            # Episode numbers must be in natural order to be validated.
            if not list(sorted(values['episode'])) == values['episode']:
                return False

        def is_consecutive(property_name):
            """
            Check if the property season or episode has valid consecutive values.

            :param property_name: 'season' or 'episode'
            :type property_name: str
            :return: False when two values joined by a weak separator are not consecutive
            :rtype: bool
            """
            previous_match = None
            valid = True
            for current_match in match.children.named(property_name):
                if previous_match:
                    separator = match.children.previous(current_match,
                                                        lambda m: m.name == property_name + 'Separator', 0)
                    if separator.raw not in range_separators and separator.raw in weak_discrete_separators:
                        if not current_match.value - previous_match.value == 1:
                            valid = False
                    if separator.raw in strong_discrete_separators:
                        # A strong separator anywhere in the chain validates the whole chain.
                        valid = True
                        break
                previous_match = current_match
            return valid

        return is_consecutive('episode') and is_consecutive('season')

    # S01E02, 01x02, S01S02S03
    rebulk.chain(formatter={'season': int, 'episode': int},
                 tags=['SxxExx'],
                 abbreviations=[alt_dash],
                 children=True,
                 private_parent=True,
                 validate_all=True,
                 validator={'__parent__': ordering_validator},
                 conflict_solver=season_episode_conflict_solver) \
        .regex(build_or_pattern(season_markers) + r'(?P<season>\d+)@?' +
               build_or_pattern(episode_markers) + r'@?(?P<episode>\d+)',
               validate_all=True,
               validator={'__parent__': seps_before}).repeater('+') \
        .regex(build_or_pattern(episode_markers + discrete_separators + range_separators,
                                name='episodeSeparator',
                                escape=True) +
               r'(?P<episode>\d+)').repeater('*') \
        .chain() \
        .regex(r'(?P<season>\d+)@?' +
               build_or_pattern(season_ep_markers) +
               r'@?(?P<episode>\d+)',
               validate_all=True,
               validator={'__parent__': seps_before}) \
        .chain() \
        .regex(r'(?P<season>\d+)@?' +
               build_or_pattern(season_ep_markers) +
               r'@?(?P<episode>\d+)',
               validate_all=True,
               validator={'__parent__': seps_before}) \
        .regex(build_or_pattern(season_ep_markers + discrete_separators + range_separators,
                                name='episodeSeparator',
                                escape=True) +
               r'(?P<episode>\d+)').repeater('*') \
        .chain() \
        .regex(build_or_pattern(season_markers) + r'(?P<season>\d+)',
               validate_all=True,
               validator={'__parent__': seps_before}) \
        .regex(build_or_pattern(season_markers + discrete_separators + range_separators,
                                name='seasonSeparator',
                                escape=True) +
               r'(?P<season>\d+)').repeater('*')

    # episode_details property
    for episode_detail in ('Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired'):
        rebulk.string(episode_detail, value=episode_detail, name='episode_details')
    rebulk.regex(r'Extras?', name='episode_details', value='Extras')

    rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'],
                    validate_all=True, validator={'__parent__': seps_surround}, children=True, private_parent=True)

    def validate_roman(match):
        """
        Validate a roman match if surrounded by separators

        :param match: season/episode/count child match
        :type match: Match
        :return: True for plain integers, otherwise the result of seps_surround
        :rtype: bool
        """
        if int_coercable(match.raw):
            return True
        return seps_surround(match)

    rebulk.chain(abbreviations=[alt_dash],
                 formatter={'season': parse_numeral, 'count': parse_numeral},
                 validator={'__parent__': compose(seps_surround, ordering_validator),
                            'season': validate_roman,
                            'count': validate_roman}) \
        .defaults(validator=None) \
        .regex(build_or_pattern(season_words) + '@?(?P<season>' + numeral + ')') \
        .regex(r'' + build_or_pattern(of_words) + '@?(?P<count>' + numeral + ')').repeater('?') \
        .regex(r'@?(?P<seasonSeparator>' +
               build_or_pattern(range_separators + discrete_separators + ['@'], escape=True) +
               r')@?(?P<season>\d+)').repeater('*')

    rebulk.regex(build_or_pattern(episode_words) + r'-?(?P<episode>\d+)' +
                 r'(?:v(?P<version>\d+))?' +
                 r'(?:-?' + build_or_pattern(of_words) + r'-?(?P<count>\d+))?',  # Episode 4
                 abbreviations=[dash], formatter=int,
                 disabled=lambda context: context.get('type') == 'episode')

    rebulk.regex(build_or_pattern(episode_words) + r'-?(?P<episode>' + numeral + ')' +
                 r'(?:v(?P<version>\d+))?' +
                 r'(?:-?' + build_or_pattern(of_words) + r'-?(?P<count>\d+))?',  # Episode 4
                 abbreviations=[dash],
                 validator={'episode': validate_roman},
                 formatter={'episode': parse_numeral, 'version': int, 'count': int},
                 disabled=lambda context: context.get('type') != 'episode')

    rebulk.regex(r'S?(?P<season>\d+)-?(?:xE|Ex|E|x)-?(?P<other>' + build_or_pattern(all_words) + ')',
                 tags=['SxxExx'],
                 abbreviations=[dash],
                 validator=None,
                 formatter={'season': int, 'other': lambda match: 'Complete'})

    rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True,
                    validator={'__parent__': seps_surround}, children=True, private_parent=True)

    # 12, 13
    rebulk.chain(tags=['bonus-conflict', 'weak-movie', 'weak-episode'], formatter={'episode': int, 'version': int}) \
        .defaults(validator=None) \
        .regex(r'(?P<episode>\d{2})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>[x-])(?P<episode>\d{2})').repeater('*')

    # 012, 013
    rebulk.chain(tags=['bonus-conflict', 'weak-movie', 'weak-episode'], formatter={'episode': int, 'version': int}) \
        .defaults(validator=None) \
        .regex(r'0(?P<episode>\d{1,2})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>[x-])0(?P<episode>\d{1,2})').repeater('*')

    # 112, 113
    rebulk.chain(tags=['bonus-conflict', 'weak-movie', 'weak-episode'], formatter={'episode': int, 'version': int},
                 disabled=lambda context: not context.get('episode_prefer_number', False)) \
        .defaults(validator=None) \
        .regex(r'(?P<episode>\d{3,4})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>[x-])(?P<episode>\d{3,4})').repeater('*')

    # 1, 2, 3
    rebulk.chain(tags=['bonus-conflict', 'weak-movie', 'weak-episode'], formatter={'episode': int, 'version': int},
                 disabled=lambda context: context.get('type') != 'episode') \
        .defaults(validator=None) \
        .regex(r'(?P<episode>\d)') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>[x-])(?P<episode>\d{1,2})').repeater('*')

    # e112, e113
    # TODO: Enhance rebulk for validator to be used globally (season_episode_validator)
    rebulk.chain(formatter={'episode': int, 'version': int}) \
        .defaults(validator=None) \
        .regex(r'e(?P<episode>\d{1,4})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>e|x|-)(?P<episode>\d{1,4})').repeater('*')

    # ep 112, ep113, ep112, ep113
    rebulk.chain(abbreviations=[dash], formatter={'episode': int, 'version': int}) \
        .defaults(validator=None) \
        .regex(r'ep-?(?P<episode>\d{1,4})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>ep|e|x|-)(?P<episode>\d{1,4})').repeater('*')

    # 102, 0102
    rebulk.chain(tags=['bonus-conflict', 'weak-movie', 'weak-episode', 'weak-duplicate'],
                 formatter={'season': int, 'episode': int, 'version': int},
                 conflict_solver=lambda match, other: match if other.name == 'year' else '__default__',
                 disabled=lambda context: context.get('episode_prefer_number', False)) \
        .defaults(validator=None) \
        .regex(r'(?P<season>\d{1,2})(?P<episode>\d{2})') \
        .regex(r'v(?P<version>\d+)').repeater('?') \
        .regex(r'(?P<episodeSeparator>x|-)(?P<episode>\d{2})').repeater('*')

    rebulk.regex(r'v(?P<version>\d+)', children=True, private_parent=True, formatter=int)

    rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'])

    # TODO: List of words
    # detached of X count (season/episode)
    rebulk.regex(r'(?P<episode>\d+)?-?' + build_or_pattern(of_words) +
                 r'-?(?P<count>\d+)-?' + build_or_pattern(episode_words) + '?',
                 abbreviations=[dash], children=True, private_parent=True, formatter=int)

    rebulk.regex(r'Minisodes?', name='episode_format', value="Minisode")

    # Hardcoded movie to disable weak season/episodes
    rebulk.regex('OSS-?117',
                 abbreviations=[dash], name="hardcoded-movies", marker=True,
                 conflict_solver=lambda match, other: None)

    rebulk.rules(EpisodeNumberSeparatorRange(range_separators),
                 SeasonSeparatorRange(range_separators), RemoveWeakIfMovie, RemoveWeakIfSxxExx,
                 RemoveWeakDuplicate, EpisodeDetailValidator, RemoveDetachedEpisodeNumber, VersionValidator,
                 CountValidator, EpisodeSingleDigitValidator)

    return rebulk
class AbstractSeparatorRange(Rule):
    """
    Drop separator matches and materialise the values a range separator implies.

    Subclasses choose the property ('season' or 'episode') the rule works on.
    """
    priority = 128
    consequence = [RemoveMatch, AppendMatch]

    def __init__(self, range_separators, property_name):
        super(AbstractSeparatorRange, self).__init__()
        self.range_separators = range_separators
        self.property_name = property_name

    def when(self, matches, context):
        prop = self.property_name
        separator_name = prop + 'Separator'
        to_remove = []
        to_append = []

        def fill_gap(lower, upper):
            """Queue a copy of *upper* for each value strictly between the two matches."""
            for value in range(lower.value + 1, upper.value):
                filler = copy.copy(upper)
                filler.value = value
                to_append.append(filler)

        # First pass: explicit separator matches produced by the patterns.
        for separator in matches.named(separator_name):
            before = matches.previous(separator, lambda match: match.name == prop, 0)
            after = matches.next(separator, lambda match: match.name == prop, 0)
            if before and after and separator.value in self.range_separators:
                fill_gap(before, after)
            to_remove.append(separator)

        # Second pass: raw text standing between two consecutive property matches.
        before = None
        for after in matches.named(prop):
            if before:
                between = matches.input_string[before.initiator.end:after.initiator.start]
                if between not in self.range_separators:
                    between = strip(between)
                if between in self.range_separators:
                    fill_gap(before, after)
                    to_append.append(Match(before.end, after.start - 1,
                                           name=separator_name,
                                           private=True,
                                           input_string=matches.input_string))
                # Remove and append match to support proper ordering
                to_remove.append(after)
                to_append.append(after)
            before = after

        return to_remove, to_append
class RemoveWeakDuplicate(Rule):
    """
    Within each filepart, keep a single weak-duplicate match per (name, pattern) pair,
    for example The 100.109

    Matches are walked from the end of the filepart, so the last occurrence is the one kept.
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        duplicates = []
        for part in matches.markers.named('path'):
            seen_by_name = {}
            weak = matches.range(part.start, part.end,
                                 predicate=lambda match: 'weak-duplicate' in match.tags)
            for candidate in reversed(weak):
                known_patterns = seen_by_name.setdefault(candidate.name, [])
                if candidate.pattern in known_patterns:
                    duplicates.append(candidate)
                    continue
                known_patterns.append(candidate.pattern)
        return duplicates
class RemoveDetachedEpisodeNumber(Rule):
    """
    When several weak episode numbers are found, drop the smallest one if it is
    below 10 and not directly followed by the next value.

    Fairy Tail 2 - 16-20, 2 should be removed.
    """
    priority = 64
    consequence = RemoveMatch
    dependency = [RemoveWeakIfSxxExx, RemoveWeakDuplicate]

    def when(self, matches, context):
        # Keep the first match seen for each distinct episode value.
        distinct = []
        known_values = set()
        for candidate in matches.named('episode',
                                       lambda match: not match.private and 'weak-movie' in match.tags):
            if candidate.value in known_values:
                continue
            known_values.add(candidate.value)
            distinct.append(candidate)

        distinct.sort(key=lambda match: match.value)

        to_remove = []
        if len(distinct) <= 1:
            return to_remove

        lowest, second = distinct[0], distinct[1]
        if lowest.value < 10 and second.value - lowest.value != 1:
            node = lowest
            while node:  # TODO: Add a feature in rebulk to avoid this ...
                to_remove.append(node)
                node = node.parent
        return to_remove
class EpisodeSingleDigitValidator(Rule):
    """
    Discard one-character episode matches sitting in a group marker that holds no title match.
    """
    dependency = [TitleFromPosition]

    consequence = RemoveMatch

    def when(self, matches, context):
        rejected = []
        single_digits = matches.named('episode', lambda match: len(match.initiator) == 1)
        for candidate in single_digits:
            enclosing = matches.markers.at_match(candidate, lambda marker: marker.name == 'group', index=0)
            if not enclosing:
                continue
            has_title = matches.range(*enclosing.span, predicate=lambda match: match.name == 'title')
            if not has_title:
                rejected.append(candidate)
        return rejected
class FilmTitleRule(Rule):
    """
    Rule to find out film_title: the first hole between the start of the path
    marker holding the film match and the film match itself.
    """
    consequence = AppendMatch

    properties = {'film_title': [None]}

    def when(self, matches, context):
        film_match = matches.named('film', lambda match: not match.private, index=0)
        if not film_match:
            return None

        path_marker = matches.markers.at_match(film_match, lambda marker: marker.name == 'path', 0)
        candidate = matches.holes(path_marker.start, film_match.start + 1, formatter=cleanup, index=0)
        if not (candidate and candidate.value):
            return None

        candidate.name = 'film_title'
        return candidate
class ValidateFormat(Rule):
    """
    Validate format with screener property, with video_codec property or separated
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        def is_screener(match):
            """True for an 'other' match whose value is Screener."""
            return match.name == 'other' and match.value == 'Screener'

        def codec_or_screener(match):
            """True for a video_codec match or a Screener match."""
            return match.name == 'video_codec' or is_screener(match)

        def invalid_before(candidate):
            """No separator before the format and no Screener match in the queried range."""
            return not seps_before(candidate) and \
                not matches.range(candidate.start - 1, candidate.start - 2, is_screener)

        def invalid_after(candidate):
            """No separator after the format and no codec/Screener match in the queried range."""
            return not seps_after(candidate) and \
                not matches.range(candidate.end, candidate.end + 1, codec_or_screener)

        rejected = []
        for candidate in matches.named('format'):
            # The "after" side is only evaluated when the "before" side is acceptable.
            if invalid_before(candidate) or invalid_after(candidate):
                rejected.append(candidate)
        return rejected
class GuessitConverter(babelfish.LanguageReverseConverter):  # pylint: disable=missing-docstring
    # "name(country)" and "name-country" spellings
    _with_country_regexp = re.compile(r'(.*)\((.*)\)')
    _with_country_regexp2 = re.compile(r'(.*)-(.*)')

    def __init__(self):
        # Lower-cased synonym -> (alpha3, country, script) built from the SYN table.
        self.guessit_exceptions = {
            synonym.lower(): (alpha3, country, None)
            for (alpha3, country), synonyms in SYN.items()
            for synonym in synonyms
        }

    @property
    def codes(self):  # pylint: disable=missing-docstring
        known = babelfish.language_converters['alpha3b'].codes
        known = known | babelfish.language_converters['alpha2'].codes
        known = known | babelfish.language_converters['name'].codes
        known = known | babelfish.language_converters['opensubtitles'].codes
        known = known | babelfish.country_converters['name'].codes
        return known | frozenset(self.guessit_exceptions.keys())

    def convert(self, alpha3, country=None, script=None):
        return str(babelfish.Language(alpha3, country, script))

    def reverse(self, name):
        country_match = GuessitConverter._with_country_regexp.match(name)
        if not country_match:
            country_match = GuessitConverter._with_country_regexp2.match(name)

        name = name.lower()
        if country_match:
            lang = babelfish.Language.fromguessit(country_match.group(1).strip())
            lang.country = babelfish.Country.fromguessit(country_match.group(2).strip())
            country_code = lang.country.alpha2 if lang.country else None
            return lang.alpha3, country_code, lang.script or None

        # exceptions come first, as they need to override a potential match
        # with any of the other guessers
        if name in self.guessit_exceptions:
            return self.guessit_exceptions[name]

        constructors = [babelfish.Language,
                        babelfish.Language.fromalpha3b,
                        babelfish.Language.fromalpha2,
                        babelfish.Language.fromname,
                        babelfish.Language.fromopensubtitles]
        for build in constructors:
            try:
                found = build(name)
                return found.alpha3, found.country, found.script
            except (ValueError, babelfish.LanguageReverseError):
                continue

        raise babelfish.LanguageReverseError(name)
def find_languages(string, context=None):
    """Find languages in the string

    :param string: input string to scan word by word
    :param context: optional options dict; only 'allowed_languages' is read here.
        ``None`` (the default) is treated as an empty dict.
    :return: list of tuple (start, end, {'name': property, 'value': Language})
    """
    if context is None:
        # The default used to be dereferenced directly, raising AttributeError.
        context = {}
    allowed_languages = context.get('allowed_languages')
    common_words = COMMON_WORDS_STRICT if allowed_languages else COMMON_WORDS

    matches = []
    for word_match in iter_words(string):
        word = word_match.value
        start, end = word_match.span

        lang_word = word.lower()
        key = 'language'
        # A subtitle prefix/suffix glued to the word turns the guess into a subtitle language.
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitle_language'
        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                lang_word = lang_word[:len(lang_word) - len(suffix)]
                key = 'subtitle_language'
        for prefix in lang_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
        if lang_word not in common_words and word.lower() not in common_words:
            try:
                lang = babelfish.Language.fromguessit(lang_word)
                match = (start, end, {'name': key, 'value': lang})
                if allowed_languages:
                    if lang.name.lower() in allowed_languages \
                            or lang.alpha2.lower() in allowed_languages \
                            or lang.alpha3.lower() in allowed_languages:
                        matches.append(match)
                # Keep language with alpha2 equivalent. Others are probably
                # uncommon languages.
                elif lang == 'mul' or hasattr(lang, 'alpha2'):
                    matches.append(match)
            except babelfish.Error:
                # Not a language word (or a failed conversion): best-effort, skip it.
                pass
    return matches
class SubtitleSuffixLanguageRule(Rule):
    """
    Turn a language match into subtitle_language when a subtitle language suffix follows it.

    Suffix matches are removed, except the ones that were used to convert a language.
    """
    dependency = SubtitlePrefixLanguageRule
    consequence = RemoveMatch

    properties = {'subtitle_language': [None]}

    def when(self, matches, context):
        languages_to_convert = []
        unused_suffixes = matches.named('subtitle_language.suffix')
        for language_match in matches.named('language'):
            following = matches.next(language_match,
                                     lambda match: match.name == 'subtitle_language.suffix', 0)
            if not following:
                continue
            languages_to_convert.append(language_match)
            if following in unused_suffixes:
                unused_suffixes.remove(following)
        return languages_to_convert, unused_suffixes

    def then(self, matches, when_response, context):
        languages_to_convert, unused_suffixes = when_response
        super(SubtitleSuffixLanguageRule, self).then(matches, unused_suffixes, context)
        for language_match in languages_to_convert:
            # Take the match out before renaming it, then put it back under its new name.
            matches.remove(language_match)
            language_match.name = 'subtitle_language'
            matches.append(language_match)
class Mimetype(CustomRule):
    """
    Mimetype post processor

    Guesses a mime type from the input string and appends it as a zero-length
    'mimetype' match positioned at the end of the input.
    """
    priority = POST_PROCESS

    dependency = Processors

    def when(self, matches, context):
        guessed = mimetypes.guess_type(matches.input_string, strict=False)
        return guessed[0]

    def then(self, matches, when_response, context):
        end_of_input = len(matches.input_string)
        matches.append(Match(end_of_input, end_of_input, name='mimetype', value=when_response))

    @property
    def properties(self):
        """
        Properties for this rule.
        """
        return {'mimetype': [None]}
def other():
    """
    Builder for rebulk object.

    Registers the patterns producing ``other`` matches and the validation rules
    declared further down in this module.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True)
    rebulk.defaults(name="other", validator=seps_surround)

    rebulk.regex('Audio-?Fix', 'Audio-?Fixed', value='AudioFix')
    rebulk.regex('Sync-?Fix', 'Sync-?Fixed', value='SyncFix')
    rebulk.regex('Dual-?Audio', value='DualAudio')
    rebulk.regex('ws', 'wide-?screen', value='WideScreen')
    rebulk.string('Netflix', 'NF', value='Netflix')

    rebulk.string('Real', 'Fix', 'Fixed', value='Proper', tags=['has-neighbor-before', 'has-neighbor-after'])
    rebulk.string('Proper', 'Repack', 'Rerip', value='Proper')
    rebulk.string('Fansub', value='Fansub', tags='has-neighbor')
    rebulk.string('Fastsub', value='Fastsub', tags='has-neighbor')

    season_words = build_or_pattern(["seasons?", "series?"])
    complete_articles = build_or_pattern(["The"])

    def validate_complete(match):
        """
        Make sure a season word is defined before or after 'Complete'.

        :param match: parent match of the 'Complete' pattern
        :type match: Match
        :return: False when neither completeWordsBefore nor completeWordsAfter matched
        :rtype: bool
        """
        children = match.children
        if not children.named('completeWordsBefore') and not children.named('completeWordsAfter'):
            return False
        return True

    # The group names must line up with private_names and with validate_complete above.
    rebulk.regex('(?P<completeArticle>' + complete_articles + '-)?' +
                 '(?P<completeWordsBefore>' + season_words + '-)?' +
                 'Complete' + '(?P<completeWordsAfter>-' + season_words + ')?',
                 private_names=['completeArticle', 'completeWordsBefore', 'completeWordsAfter'],
                 value={'other': 'Complete'},
                 tags=['release-group-prefix'],
                 validator={'__parent__': compose(seps_surround, validate_complete)})
    rebulk.string('R5', 'RC', value='R5')
    rebulk.regex('Pre-?Air', value='Preair')

    for value in (
            'Screener', 'Remux', 'Remastered', '3D', 'HD', 'mHD', 'HDLight', 'HQ', 'DDC', 'HR', 'PAL', 'SECAM', 'NTSC',
            'CC', 'LD', 'MD', 'XXX'):
        rebulk.string(value, value=value)

    for value in ('Limited', 'Complete', 'Classic', 'Unrated', 'LiNE', 'Bonus', 'Trailer', 'FINAL', 'Retail', 'Uncut',
                  'Extended', 'Extended Cut'):
        rebulk.string(value, value=value, tags=['has-neighbor', 'release-group-prefix'])

    rebulk.string('VO', 'OV', value='OV', tags='has-neighbor')

    rebulk.regex('Scr(?:eener)?', value='Screener', validator=None, tags='other.validate.screener')

    rebulk.rules(ValidateHasNeighbor, ValidateHasNeighborAfter, ValidateHasNeighborBefore, ValidateScreenerRule,
                 ProperCountRule)

    return rebulk
lambda marker: marker.name == 'group', 0) + if previous_group and (not previous_match or previous_group.end > previous_match.end): + previous_match = previous_group + if previous_match and not matches.input_string[previous_match.end:to_check.start].strip(seps): + break + next_match = matches.next(to_check, index=0) + next_group = matches.markers.next(to_check, lambda marker: marker.name == 'group', 0) + if next_group and (not next_match or next_group.start < next_match.start): + next_match = next_group + if next_match and not matches.input_string[to_check.end:next_match.start].strip(seps): + break + ret.append(to_check) + return ret + + +class ValidateHasNeighborBefore(Rule): + """ + Validate tag has-neighbor-before that previous match exists. + """ + consequence = RemoveMatch + + def when(self, matches, context): + ret = [] + for to_check in matches.range(predicate=lambda match: 'has-neighbor-before' in match.tags): + next_match = matches.next(to_check, index=0) + next_group = matches.markers.next(to_check, lambda marker: marker.name == 'group', 0) + if next_group and (not next_match or next_group.start < next_match.start): + next_match = next_group + if next_match and not matches.input_string[to_check.end:next_match.start].strip(seps): + break + ret.append(to_check) + return ret + + +class ValidateHasNeighborAfter(Rule): + """ + Validate tag has-neighbor-after that next match exists. 
+ """ + consequence = RemoveMatch + + def when(self, matches, context): + ret = [] + for to_check in matches.range(predicate=lambda match: 'has-neighbor-after' in match.tags): + previous_match = matches.previous(to_check, index=0) + previous_group = matches.markers.previous(to_check, lambda marker: marker.name == 'group', 0) + if previous_group and (not previous_match or previous_group.end > previous_match.end): + previous_match = previous_group + if previous_match and not matches.input_string[previous_match.end:to_check.start].strip(seps): + break + ret.append(to_check) + return ret + + +class ValidateScreenerRule(Rule): + """ + Validate tag other.validate.screener + """ + consequence = RemoveMatch + priority = 64 + + def when(self, matches, context): + ret = [] + for screener in matches.named('other', lambda match: 'other.validate.screener' in match.tags): + format_match = matches.previous(screener, lambda match: match.name == 'format', 0) + if not format_match or matches.input_string[format_match.end:screener.start].strip(seps): + ret.append(screener) + return ret diff --git a/libs/guessit/rules/properties/part.py b/libs/guessit/rules/properties/part.py new file mode 100644 index 00000000..d274f7fb --- /dev/null +++ b/libs/guessit/rules/properties/part.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +part property +""" +from rebulk.remodule import re + +from rebulk import Rebulk +from ..common import dash +from ..common.validators import seps_surround, int_coercable, compose +from ..common.numeral import numeral, parse_numeral +from ...reutils import build_or_pattern + + +def part(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash], validator={'__parent__': seps_surround}) + + prefixes = ['pt', 'part'] + + def validate_roman(match): + """ + Validate a roman match if surrounded by separators + :param match: + :type match: + :return: + :rtype: + """ + if int_coercable(match.raw): + return True + return seps_surround(match) + + rebulk.regex(build_or_pattern(prefixes) + r'-?(?P' + numeral + r')', + prefixes=prefixes, validate_all=True, private_parent=True, children=True, formatter=parse_numeral, + validator={'part': compose(validate_roman, lambda m: 0 < m.value < 100)}) + + return rebulk diff --git a/libs/guessit/rules/properties/release_group.py b/libs/guessit/rules/properties/release_group.py new file mode 100644 index 00000000..b92ad168 --- /dev/null +++ b/libs/guessit/rules/properties/release_group.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +release_group property +""" +import copy + +from rebulk.remodule import re + +from rebulk import Rebulk, Rule, AppendMatch +from ..common.validators import int_coercable +from ..properties.title import TitleFromPosition +from ..common.formatters import cleanup +from ..common import seps, dash +from ..common.comparators import marker_sorted + + +def release_group(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + return Rebulk().rules(SceneReleaseGroup, AnimeReleaseGroup, ExpectedReleaseGroup) + + +forbidden_groupnames = ['rip', 'by', 'for', 'par', 'pour', 'bonus'] + +groupname_ignore_seps = '[]{}()' +groupname_seps = ''.join([c for c in seps if c not in groupname_ignore_seps]) + + +def clean_groupname(string): + """ + Removes and strip separators from input_string + :param input_string: + :type input_string: + :return: + :rtype: + """ + string = string.strip(groupname_seps) + if not (string.endswith(tuple(groupname_ignore_seps)) and string.startswith(tuple(groupname_ignore_seps)))\ + and not any(i in string.strip(groupname_ignore_seps) for i in groupname_ignore_seps): + string = string.strip(groupname_ignore_seps) + for forbidden in forbidden_groupnames: + if string.lower().startswith(forbidden): + string = string[len(forbidden):] + string = string.strip(groupname_seps) + if string.lower().endswith(forbidden): + string = string[:len(forbidden)] + string = string.strip(groupname_seps) + return string + + +_scene_previous_names = ['video_codec', 'format', 'video_api', 'audio_codec', 'audio_profile', 'video_profile', + 'audio_channels', 'screen_size', 'other', 'container', 'language', 'subtitle_language', + 'subtitle_language.suffix', 'subtitle_language.prefix'] + +_scene_previous_tags = ['release-group-prefix'] + + +class ExpectedReleaseGroup(Rule): + """ + Add release_group match from expected_group option + """ + consequence = AppendMatch + + properties = {'release_group': [None]} + + def enabled(self, context): + return context.get('expected_group') + + def when(self, matches, context): + expected_rebulk = Rebulk().defaults(name='release_group') + + for expected_group in context.get('expected_group'): + if expected_group.startswith('re:'): + expected_group = expected_group[3:] + expected_group = expected_group.replace(' ', '-') + expected_rebulk.regex(expected_group, abbreviations=[dash], flags=re.IGNORECASE) + 
else: + expected_rebulk.string(expected_group, ignore_case=True) + + matches = expected_rebulk.matches(matches.input_string, context) + return matches + + +class SceneReleaseGroup(Rule): + """ + Add release_group match in existing matches (scene format). + + Something.XViD-ReleaseGroup.mkv + """ + dependency = [TitleFromPosition, ExpectedReleaseGroup] + consequence = AppendMatch + + properties = {'release_group': [None]} + + def when(self, matches, context): + # If a release_group is found before, ignore this kind of release_group rule. + + ret = [] + + for filepart in marker_sorted(matches.markers.named('path'), matches): + start, end = filepart.span + + last_hole = matches.holes(start, end + 1, formatter=clean_groupname, + predicate=lambda hole: cleanup(hole.value), index=-1) + + if last_hole: + previous_match = matches.previous(last_hole, + lambda match: not match.private or + match.name in _scene_previous_names, + index=0) + if previous_match and (previous_match.name in _scene_previous_names or + any(tag in previous_match.tags for tag in _scene_previous_tags)) and \ + not matches.input_string[previous_match.end:last_hole.start].strip(seps) \ + and not int_coercable(last_hole.value.strip(seps)): + + last_hole.name = 'release_group' + last_hole.tags = ['scene'] + + # if hole is inside a group marker with same value, remove [](){} ... 
+ group = matches.markers.at_match(last_hole, lambda marker: marker.name == 'group', 0) + if group: + group.formatter = clean_groupname + if group.value == last_hole.value: + last_hole.start = group.start + 1 + last_hole.end = group.end - 1 + last_hole.tags = ['anime'] + + ret.append(last_hole) + return ret + + +class AnimeReleaseGroup(Rule): + """ + Add release_group match in existing matches (anime format) + ...[ReleaseGroup] Something.mkv + """ + dependency = [SceneReleaseGroup, TitleFromPosition] + consequence = AppendMatch + + properties = {'release_group': [None]} + + def when(self, matches, context): + ret = [] + + # If a release_group is found before, ignore this kind of release_group rule. + if not matches.named('episode') and not matches.named('season') and matches.named('release_group'): + # This doesn't seems to be an anime + return + + for filepart in marker_sorted(matches.markers.named('path'), matches): + + # pylint:disable=bad-continuation + empty_group_marker = matches.markers \ + .range(filepart.start, filepart.end, lambda marker: marker.name == 'group' + and not matches.range(marker.start, marker.end) + and not int_coercable(marker.value.strip(seps)), + 0) + + if empty_group_marker: + group = copy.copy(empty_group_marker) + group.marker = False + group.raw_start += 1 + group.raw_end -= 1 + group.tags = ['anime'] + group.name = 'release_group' + ret.append(group) + return ret diff --git a/libs/guessit/rules/properties/screen_size.py b/libs/guessit/rules/properties/screen_size.py new file mode 100644 index 00000000..80d68c29 --- /dev/null +++ b/libs/guessit/rules/properties/screen_size.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +screen_size property +""" +from rebulk.remodule import re + +from rebulk import Rebulk, Rule, RemoveMatch +from ..common.validators import seps_surround +from ..common import dash + + +def screen_size(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + def conflict_solver(match, other): + """ + Conflict solver for most screen_size. + """ + if other.name == 'screen_size': + if 'resolution' in other.tags: + # The chtouile to solve conflict in "720 x 432" string matching both 720p pattern + int_value = _digits_re.findall(match.raw)[-1] + if other.value.startswith(int_value): + return match + return other + return '__default__' + + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(name="screen_size", validator=seps_surround, conflict_solver=conflict_solver) + + rebulk.regex(r"(?:\d{3,}(?:x|\*))?360(?:i|p?x?)", value="360p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?368(?:i|p?x?)", value="368p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?480(?:i|p?x?)", value="480p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?576(?:i|p?x?)", value="576p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?720(?:i|p?(?:50|60)?x?)", value="720p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?720(?:p(?:50|60)?x?)", value="720p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?720p?hd", value="720p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?900(?:i|p?x?)", value="900p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?1080i", value="1080i") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?1080p?x?", value="1080p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?1080(?:p(?:50|60)?x?)", value="1080p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?1080p?hd", value="1080p") + rebulk.regex(r"(?:\d{3,}(?:x|\*))?2160(?:i|p?x?)", value="4K") + + _digits_re = re.compile(r'\d+') + + rebulk.defaults(name="screen_size", validator=seps_surround) + rebulk.regex(r'\d{3,}-?(?:x|\*)-?\d{3,}', + formatter=lambda value: 'x'.join(_digits_re.findall(value)), + abbreviations=[dash], + tags=['resolution'], + conflict_solver=lambda match, other: '__default__' if other.name == 'screen_size' else other) + + rebulk.rules(ScreenSizeOnlyOne) + + return rebulk + + +class ScreenSizeOnlyOne(Rule): + """ + Keep a single screen_size pet filepath part. 
+ """ + consequence = RemoveMatch + + def when(self, matches, context): + to_remove = [] + for filepart in matches.markers.named('path'): + screensize = list(reversed(matches.range(filepart.start, filepart.end, + lambda match: match.name == 'screen_size'))) + if len(screensize) > 1: + to_remove.extend(screensize[1:]) + + return to_remove diff --git a/libs/guessit/rules/properties/title.py b/libs/guessit/rules/properties/title.py new file mode 100644 index 00000000..067d432d --- /dev/null +++ b/libs/guessit/rules/properties/title.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +title property +""" +import re + +from rebulk import Rebulk, Rule, AppendMatch, RemoveMatch, AppendTags +from rebulk.formatters import formatters +from rebulk.pattern import RePattern +from rebulk.utils import find_all + +from .film import FilmTitleRule +from .language import SubtitlePrefixLanguageRule, SubtitleSuffixLanguageRule, SubtitleExtensionRule +from ..common.formatters import cleanup, reorder_title +from ..common.comparators import marker_sorted +from ..common import seps, title_seps, dash + + +def title(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().rules(TitleFromPosition, PreferTitleWithYear) + + def expected_title(input_string, context): + """ + Expected title functional pattern. + :param input_string: + :type input_string: + :param context: + :type context: + :return: + :rtype: + """ + ret = [] + for search in context.get('expected_title'): + if search.startswith('re:'): + search = search[3:] + search = search.replace(' ', '-') + matches = RePattern(search, abbreviations=[dash], flags=re.IGNORECASE).matches(input_string, context) + for match in matches: + # Instance of 'list' has no 'span' member (no-member). Seems to be a pylint bug. 
+ # pylint: disable=no-member + ret.append(match.span) + else: + for start in find_all(input_string, search, ignore_case=True): + ret.append((start, start+len(search))) + return ret + + rebulk.functional(expected_title, name='title', tags=['expected'], + conflict_solver=lambda match, other: other, + disabled=lambda context: not context.get('expected_title')) + + return rebulk + + +class TitleBaseRule(Rule): + """ + Add title match in existing matches + """ + # pylint:disable=no-self-use,unused-argument + consequence = [AppendMatch, RemoveMatch] + + def __init__(self, match_name, match_tags=None, alternative_match_name=None): + super(TitleBaseRule, self).__init__() + self.match_name = match_name + self.match_tags = match_tags + self.alternative_match_name = alternative_match_name + + def hole_filter(self, hole, matches): + """ + Filter holes for titles. + :param hole: + :type hole: + :param matches: + :type matches: + :return: + :rtype: + """ + return True + + def filepart_filter(self, filepart, matches): + """ + Filter filepart for titles. + :param filepart: + :type filepart: + :param matches: + :type matches: + :return: + :rtype: + """ + return True + + def holes_process(self, holes, matches): + """ + process holes + :param holes: + :type holes: + :param matches: + :type matches: + :return: + :rtype: + """ + cropped_holes = [] + for hole in holes: + group_markers = matches.markers.named('group') + cropped_holes.extend(hole.crop(group_markers)) + return cropped_holes + + def is_ignored(self, match): + """ + Ignore matches when scanning for title (hole). + + Full word language and countries won't be ignored if they are uppercase. + """ + return not (len(match) > 3 and match.raw.isupper()) and match.name in ['language', 'country', 'episode_details'] + + def should_keep(self, match, to_keep, matches, filepart, hole, starting): + """ + Check if this match should be accepted when ending or starting a hole. 
+ :param match: + :type match: + :param to_keep: + :type to_keep: list[Match] + :param matches: + :type matches: Matches + :param hole: the filepart match + :type hole: Match + :param hole: the hole match + :type hole: Match + :param starting: true if match is starting the hole + :type starting: bool + :return: + :rtype: + """ + if match.name in ['language', 'country']: + # Keep language if exactly matching the hole. + if len(hole.value) == len(match.raw): + return True + + # Keep language if other languages exists in the filepart. + outside_matches = filepart.crop(hole) + other_languages = [] + for outside in outside_matches: + other_languages.extend(matches.range(outside.start, outside.end, + lambda c_match: c_match.name == match.name and + c_match not in to_keep)) + + if not other_languages: + return True + + return False + + def should_remove(self, match, matches, filepart, hole, context): + """ + Check if this match should be removed after beeing ignored. + :param match: + :param matches: + :param filepart: + :param hole: + :return: + """ + if context.get('type') == 'episode' and match.name == 'episode_details': + return False + return True + + def check_titles_in_filepart(self, filepart, matches, context): + """ + Find title in filepart (ignoring language) + """ + # pylint:disable=too-many-locals,too-many-branches,too-many-statements + start, end = filepart.span + + holes = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title), + ignore=self.is_ignored, + predicate=lambda hole: hole.value) + + holes = self.holes_process(holes, matches) + + for hole in holes: + # pylint:disable=cell-var-from-loop + if not hole or (self.hole_filter and not self.hole_filter(hole, matches)): + continue + + to_remove = [] + to_keep = [] + + ignored_matches = matches.range(hole.start, hole.end, self.is_ignored) + + if ignored_matches: + for ignored_match in reversed(ignored_matches): + # pylint:disable=undefined-loop-variable + trailing = 
matches.chain_before(hole.end, seps, predicate=lambda match: match == ignored_match) + if trailing: + should_keep = self.should_keep(ignored_match, to_keep, matches, filepart, hole, False) + if should_keep: + # pylint:disable=unpacking-non-sequence + try: + append, crop = should_keep + except TypeError: + append, crop = should_keep, should_keep + if append: + to_keep.append(ignored_match) + if crop: + hole.end = ignored_match.start + + for ignored_match in ignored_matches: + if ignored_match not in to_keep: + starting = matches.chain_after(hole.start, seps, + predicate=lambda match: match == ignored_match) + if starting: + should_keep = self.should_keep(ignored_match, to_keep, matches, filepart, hole, True) + if should_keep: + # pylint:disable=unpacking-non-sequence + try: + append, crop = should_keep + except TypeError: + append, crop = should_keep, should_keep + if append: + to_keep.append(ignored_match) + if crop: + hole.start = ignored_match.end + + for match in ignored_matches: + if self.should_remove(match, matches, filepart, hole, context): + to_remove.append(match) + for keep_match in to_keep: + if keep_match in to_remove: + to_remove.remove(keep_match) + + if hole and hole.value: + hole.name = self.match_name + hole.tags = self.match_tags + if self.alternative_match_name: + # Split and keep values that can be a title + titles = hole.split(title_seps, lambda match: match.value) + for title_match in list(titles[1:]): + previous_title = titles[titles.index(title_match) - 1] + separator = matches.input_string[previous_title.end:title_match.start] + if len(separator) == 1 and separator == '-' \ + and previous_title.raw[-1] not in seps \ + and title_match.raw[0] not in seps: + titles[titles.index(title_match) - 1].end = title_match.end + titles.remove(title_match) + else: + title_match.name = self.alternative_match_name + + else: + titles = [hole] + return titles, to_remove + + def when(self, matches, context): + if matches.named(self.match_name, lambda match: 
'expected' in match.tags): + return + + fileparts = [filepart for filepart in list(marker_sorted(matches.markers.named('path'), matches)) + if not self.filepart_filter or self.filepart_filter(filepart, matches)] + + to_remove = [] + + # Priorize fileparts containing the year + years_fileparts = [] + for filepart in fileparts: + year_match = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year', 0) + if year_match: + years_fileparts.append(filepart) + + ret = [] + for filepart in fileparts: + try: + years_fileparts.remove(filepart) + except ValueError: + pass + titles = self.check_titles_in_filepart(filepart, matches, context) + if titles: + titles, to_remove_c = titles + ret.extend(titles) + to_remove.extend(to_remove_c) + break + + # Add title match in all fileparts containing the year. + for filepart in years_fileparts: + titles = self.check_titles_in_filepart(filepart, matches, context) + if titles: + # pylint:disable=unbalanced-tuple-unpacking + titles, to_remove_c = titles + ret.extend(titles) + to_remove.extend(to_remove_c) + + return ret, to_remove + + +class TitleFromPosition(TitleBaseRule): + """ + Add title match in existing matches + """ + dependency = [FilmTitleRule, SubtitlePrefixLanguageRule, SubtitleSuffixLanguageRule, SubtitleExtensionRule] + + properties = {'title': [None], 'alternative_title': [None]} + + def __init__(self): + super(TitleFromPosition, self).__init__('title', ['title'], 'alternative_title') + + +class PreferTitleWithYear(Rule): + """ + Prefer title where filepart contains year. 
+ """ + dependency = TitleFromPosition + consequence = [RemoveMatch, AppendTags(['equivalent-ignore'])] + + properties = {'title': [None]} + + def when(self, matches, context): + with_year_in_group = [] + with_year = [] + titles = matches.named('title') + + for title_match in titles: + filepart = matches.markers.at_match(title_match, lambda marker: marker.name == 'path', 0) + if filepart: + year_match = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year', 0) + if year_match: + group = matches.markers.at_match(year_match, lambda group: group.name == 'group') + if group: + with_year_in_group.append(title_match) + else: + with_year.append(title_match) + + to_tag = [] + if with_year_in_group: + title_values = set([title_match.value for title_match in with_year_in_group]) + to_tag.extend(with_year_in_group) + elif with_year: + title_values = set([title_match.value for title_match in with_year]) + to_tag.extend(with_year) + else: + title_values = set([title_match.value for title_match in titles]) + + to_remove = [] + for title_match in titles: + if title_match.value not in title_values: + to_remove.append(title_match) + return to_remove, to_tag diff --git a/libs/guessit/rules/properties/type.py b/libs/guessit/rules/properties/type.py new file mode 100644 index 00000000..6d798b64 --- /dev/null +++ b/libs/guessit/rules/properties/type.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +type property +""" +from rebulk import CustomRule, Rebulk, POST_PROCESS +from rebulk.match import Match + +from ...rules.processors import Processors + + +def _type(matches, value): + """ + Define type match with given value. + :param matches: + :param value: + :return: + """ + matches.append(Match(len(matches.input_string), len(matches.input_string), name='type', value=value)) + + +def type_(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + return Rebulk().rules(TypeProcessor) + + +class TypeProcessor(CustomRule): + """ + Post processor to find file type based on all others found matches. + """ + priority = POST_PROCESS + + dependency = Processors + + properties = {'type': ['episode', 'movie']} + + def when(self, matches, context): # pylint:disable=too-many-return-statements + option_type = context.get('type', None) + if option_type: + return option_type + + episode = matches.named('episode') + season = matches.named('season') + episode_details = matches.named('episode_details') + + if episode or season or episode_details: + return 'episode' + + film = matches.named('film') + if film: + return 'movie' + + year = matches.named('year') + date = matches.named('date') + + if date and not year: + return 'episode' + + bonus = matches.named('bonus') + if bonus and not year: + return 'episode' + + crc32 = matches.named('crc32') + anime_release_group = matches.named('release_group', lambda match: 'anime' in match.tags) + if crc32 and anime_release_group: + return 'episode' + + return 'movie' + + def then(self, matches, when_response, context): + _type(matches, when_response) diff --git a/libs/guessit/rules/properties/video_codec.py b/libs/guessit/rules/properties/video_codec.py new file mode 100644 index 00000000..2ab1cfaf --- /dev/null +++ b/libs/guessit/rules/properties/video_codec.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +video_codec and video_profile property +""" +from rebulk.remodule import re + +from rebulk import Rebulk, Rule, RemoveMatch + +from guessit.rules.common.validators import seps_after, seps_before +from ..common import dash +from ..common.validators import seps_surround + + +def video_codec(): + """ + Builder for rebulk object. 
+ :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True) + rebulk.defaults(name="video_codec") + + rebulk.regex(r"Rv\d{2}", value="Real") + rebulk.regex("Mpeg2", value="Mpeg2") + rebulk.regex("DVDivX", "DivX", value="DivX") + rebulk.regex("XviD", value="XviD") + rebulk.regex("[hx]-?264(?:-?AVC(HD)?)?", "MPEG-?4(?:-?AVC(HD)?)", "AVCHD", value="h264") + rebulk.regex("[hx]-?265(?:-?HEVC)?", "HEVC", value="h265") + + # http://blog.mediacoderhq.com/h264-profiles-and-levels/ + # http://fr.wikipedia.org/wiki/H.264 + rebulk.defaults(name="video_profile", validator=seps_surround) + + rebulk.regex('10.?bit', 'Hi10P', value='10bit') + rebulk.regex('8.?bit', value='8bit') + + rebulk.string('BP', value='BP', tags='video_profile.rule') + rebulk.string('XP', 'EP', value='XP', tags='video_profile.rule') + rebulk.string('MP', value='MP', tags='video_profile.rule') + rebulk.string('HP', 'HiP', value='HP', tags='video_profile.rule') + rebulk.regex('Hi422P', value='Hi422P', tags='video_profile.rule') + rebulk.regex('Hi444PP', value='Hi444PP', tags='video_profile.rule') + + rebulk.string('DXVA', value='DXVA', name='video_api') + + rebulk.rules(ValidateVideoCodec, VideoProfileRule) + + return rebulk + + +class ValidateVideoCodec(Rule): + """ + Validate video_codec with format property or separated + """ + priority = 64 + consequence = RemoveMatch + + def when(self, matches, context): + ret = [] + for codec in matches.named('video_codec'): + if not seps_before(codec) and \ + not matches.at_index(codec.start - 1, lambda match: match.name == 'format'): + ret.append(codec) + continue + if not seps_after(codec): + ret.append(codec) + continue + return ret + + +class VideoProfileRule(Rule): + """ + Rule to validate video_profile + """ + consequence = RemoveMatch + + def when(self, matches, context): + profile_list = matches.named('video_profile', lambda match: 'video_profile.rule' in 
match.tags) + ret = [] + for profile in profile_list: + codec = matches.previous(profile, lambda match: match.name == 'video_codec') + if not codec: + codec = matches.next(profile, lambda match: match.name == 'video_codec') + if not codec: + ret.append(profile) + return ret diff --git a/libs/guessit/rules/properties/website.py b/libs/guessit/rules/properties/website.py new file mode 100644 index 00000000..8563ea16 --- /dev/null +++ b/libs/guessit/rules/properties/website.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Website property. +""" +from pkg_resources import resource_stream # @UnresolvedImport +from rebulk.remodule import re + +from rebulk import Rebulk, Rule, RemoveMatch +from ...reutils import build_or_pattern + + +def website(): + """ + Builder for rebulk object. + :return: Created Rebulk object + :rtype: Rebulk + """ + rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(name="website") + + tlds = [l.strip().decode('utf-8') + for l in resource_stream('guessit', 'tlds-alpha-by-domain.txt').readlines() + if b'--' not in l][1:] # All registered domain extension + + safe_tlds = ['com', 'org', 'net'] # For sure a website extension + safe_subdomains = ['www'] # For sure a website subdomain + safe_prefix = ['co', 'com', 'org', 'net'] # Those words before a tlds are sure + + rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) + + r'\.)+(?:[a-z-]+\.)+(?:'+build_or_pattern(tlds) + + r'))(?:[^a-z0-9]|$)', + children=True) + rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) + + r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_tlds) + + r'))(?:[^a-z0-9]|$)', + safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True) + rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) + + r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_prefix) + + r'\.)+(?:'+build_or_pattern(tlds) + + r'))(?:[^a-z0-9]|$)', + safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, 
children=True) + + class PreferTitleOverWebsite(Rule): + """ + If found match is more likely a title, remove website. + """ + consequence = RemoveMatch + + @staticmethod + def valid_followers(match): + """ + Validator for next website matches + """ + return any(name in ['season', 'episode', 'year'] for name in match.names) + + def when(self, matches, context): + to_remove = [] + for website_match in matches.named('website'): + suffix = matches.next(website_match, PreferTitleOverWebsite.valid_followers, 0) + if suffix: + to_remove.append(website_match) + return to_remove + + rebulk.rules(PreferTitleOverWebsite) + + return rebulk diff --git a/libs/guessit/slogging.py b/libs/guessit/slogging.py deleted file mode 100644 index 00fb80f7..00000000 --- a/libs/guessit/slogging.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import logging -import sys -import os - -GREEN_FONT = "\x1B[0;32m" -YELLOW_FONT = "\x1B[0;33m" -BLUE_FONT = "\x1B[0;34m" -RED_FONT = "\x1B[0;31m" -RESET_FONT = "\x1B[0m" - - -def setup_logging(colored=True, with_time=False, with_thread=False, filename=None, with_lineno=False): # pragma: no cover - """Set up a nice colored logger as the main application logger.""" - - class SimpleFormatter(logging.Formatter): - def __init__(self, with_time, with_thread): - self.fmt = (('%(asctime)s ' if with_time else '') + - '%(levelname)-8s ' + - '[%(name)s:%(funcName)s' + - (':%(lineno)s' if with_lineno else '') + ']' + - ('[%(threadName)s]' if with_thread else '') + - ' -- %(message)s') - logging.Formatter.__init__(self, self.fmt) - - class ColoredFormatter(logging.Formatter): - def __init__(self, with_time, with_thread): - self.fmt = (('%(asctime)s ' if with_time else '') + - '-CC-%(levelname)-8s ' + - BLUE_FONT + '[%(name)s:%(funcName)s' + - (':%(lineno)s' if with_lineno else '') + ']' + - RESET_FONT + ('[%(threadName)s]' if with_thread else '') + - ' -- %(message)s') - - logging.Formatter.__init__(self, self.fmt) - - def format(self, record): - modpath = record.name.split('.') - record.mname = modpath[0] - record.mmodule = '.'.join(modpath[1:]) - result = logging.Formatter.format(self, record) - if record.levelno == logging.DEBUG: - color = BLUE_FONT - elif record.levelno == logging.INFO: - color = GREEN_FONT - elif record.levelno == logging.WARNING: - color = YELLOW_FONT - else: - color = RED_FONT - - result = result.replace('-CC-', color) - return result - - if filename is not None: - # make sure we can write to our log file - logdir = os.path.dirname(filename) - if not os.path.exists(logdir): - os.makedirs(logdir) - ch = logging.FileHandler(filename, mode='w') - ch.setFormatter(SimpleFormatter(with_time, with_thread)) - else: - ch = logging.StreamHandler() - if colored and 
sys.platform != 'win32': - ch.setFormatter(ColoredFormatter(with_time, with_thread)) - else: - ch.setFormatter(SimpleFormatter(with_time, with_thread)) - - logging.getLogger().addHandler(ch) diff --git a/libs/guessit/test/1MB b/libs/guessit/test/1MB deleted file mode 100644 index 66d50a84..00000000 Binary files a/libs/guessit/test/1MB and /dev/null differ diff --git a/libs/guessit/test/__init__.py b/libs/guessit/test/__init__.py index 7ce54945..e5be370e 100644 --- a/libs/guessit/test/__init__.py +++ b/libs/guessit/test/__init__.py @@ -1,26 +1,3 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import logging -from guessit.slogging import setup_logging -setup_logging() -logging.disable(logging.INFO) +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name diff --git a/libs/guessit/test/__main__.py b/libs/guessit/test/__main__.py deleted file mode 100644 index 32b8dd10..00000000 --- a/libs/guessit/test/__main__.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals -from guessit.test import (test_api, test_autodetect, test_autodetect_all, test_doctests, - test_episode, test_hashes, test_language, test_main, - test_matchtree, test_movie, test_quality, test_utils) -from unittest import TextTestRunner - - -import logging - -def main(): - for suite in [test_api.suite, test_autodetect.suite, - test_autodetect_all.suite, test_doctests.suite, - test_episode.suite, test_hashes.suite, test_language.suite, - test_main.suite, test_matchtree.suite, test_movie.suite, - test_quality.suite, test_utils.suite]: - TextTestRunner(verbosity=2).run(suite) - - -if __name__ == '__main__': - main() diff --git a/libs/guessit/test/autodetect.yaml b/libs/guessit/test/autodetect.yaml deleted file mode 100644 index 864b8827..00000000 --- a/libs/guessit/test/autodetect.yaml +++ /dev/null @@ -1,489 +0,0 @@ -? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv -: type: movie - title: Fear and Loathing in Las Vegas - year: 1998 - screenSize: 720p - format: HD-DVD - audioCodec: DTS - videoCodec: h264 - releaseGroup: ESiR - -? Leopard.dmg -: type: unknown - extension: dmg - -? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi -: type: episode - series: Duckman - season: 1 - episodeNumber: 1 - title: I, Duckman - date: 2002-11-07 - -? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi -: type: episode - series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? Neverwhere.05.Down.Street.[tvu.org.ru].avi -: type: episode - series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi -: type: episode - series: Breaking Bad - episodeFormat: Minisode - episodeNumber: 1 - title: Good Cop Bad Cop - format: WEBRip - videoCodec: XviD - -? 
Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi -: type: episode - series: Kaamelott - episodeNumber: 23 - title: Le Forfait - -? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv -: type: movie - title: The Doors - year: 1991 - date: 2008-03-09 - format: BluRay - screenSize: 720p - audioCodec: AC3 - videoCodec: h264 - releaseGroup: HiS@SiLUHD - language: english - website: sharethefiles.com - -? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm -: type: movie - title: M.A.S.H. - year: 1970 - videoCodec: DivX - format: DVD - -? the.mentalist.501.hdtv-lol.mp4 -: type: episode - series: The Mentalist - season: 5 - episodeNumber: 1 - format: HDTV - releaseGroup: LOL - -? the.simpsons.2401.hdtv-lol.mp4 -: type: episode - series: The Simpsons - season: 24 - episodeNumber: 1 - format: HDTV - releaseGroup: LOL - -? Homeland.S02E01.HDTV.x264-EVOLVE.mp4 -: type: episode - series: Homeland - season: 2 - episodeNumber: 1 - format: HDTV - videoCodec: h264 - releaseGroup: EVOLVE - -? /media/Band_of_Brothers-e01-Currahee.mkv -: type: episode - series: Band of Brothers - episodeNumber: 1 - title: Currahee - -? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv -: type: episode - series: Band of Brothers - bonusNumber: 2 - bonusTitle: We Stand Alone Together - -? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv -: type: movie - title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 2 - bonusTitle: Stunts - -? /TV Shows/new.girl.117.hdtv-lol.mp4 -: type: episode - series: New Girl - season: 1 - episodeNumber: 17 - format: HDTV - releaseGroup: LOL - -? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi -: type: episode - series: The Office (US) - country: US - season: 1 - episodeNumber: 3 - title: Health Care - format: HDTV - videoCodec: XviD - releaseGroup: LOL - -? 
The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 -: type: movie - title: The Insider - year: 1999 - bonusNumber: 2 - bonusTitle: 60 Minutes Interview-1996 - -? OSS_117--Cairo,_Nest_of_Spies.mkv -: type: movie - title: OSS 117--Cairo, Nest of Spies - -? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv -: type: movie - title: Rush Beyond The Lighted Stage - bonusNumber: 9 - bonusTitle: Between Sun and Moon-2002 Hartford - -? House.Hunters.International.S56E06.720p.hdtv.x264.mp4 -: type: episode - series: House Hunters International - season: 56 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - -? White.House.Down.2013.1080p.BluRay.DTS-HD.MA.5.1.x264-PublicHD.mkv -: type: movie - title: White House Down - year: 2013 - screenSize: 1080p - format: BluRay - audioCodec: DTS - audioProfile: HDMA - videoCodec: h264 - releaseGroup: PublicHD - audioChannels: "5.1" - -? White.House.Down.2013.1080p.BluRay.DTSHD.MA.5.1.x264-PublicHD.mkv -: type: movie - title: White House Down - year: 2013 - screenSize: 1080p - format: BluRay - audioCodec: DTS - audioProfile: HDMA - videoCodec: h264 - releaseGroup: PublicHD - audioChannels: "5.1" - -? Hostages.S01E01.Pilot.for.Air.720p.WEB-DL.DD5.1.H.264-NTb.nfo -: type: episodeinfo - series: Hostages - title: Pilot for Air - season: 1 - episodeNumber: 1 - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - videoCodec: h264 - audioCodec: DolbyDigital - releaseGroup: NTb - -? Despicable.Me.2.2013.1080p.BluRay.x264-VeDeTT.nfo -: type: movieinfo - title: Despicable Me 2 - year: 2013 - screenSize: 1080p - format: BluRay - videoCodec: h264 - releaseGroup: VeDeTT - -? Le Cinquieme Commando 1971 SUBFORCED FRENCH DVDRiP XViD AC3 Bandix.mkv -: type: movie - audioCodec: AC3 - format: DVD - releaseGroup: Bandix - subtitleLanguage: French - title: Le Cinquieme Commando - videoCodec: XviD - year: 1971 - -? 
Le Seigneur des Anneaux - La Communauté de l'Anneau - Version Longue - BDRip.mkv -: type: movie - format: BluRay - title: Le Seigneur des Anneaux - -? La petite bande (Michel Deville - 1983) VF PAL MP4 x264 AAC.mkv -: type: movie - audioCodec: AAC - language: French - title: La petite bande - videoCodec: h264 - year: 1983 - -? Retour de Flammes (Gregor Schnitzler 2003) FULL DVD.iso -: type: movie - format: DVD - title: Retour de Flammes - type: movie - year: 2003 - -? A.Common.Title.Special.2014.avi -: type: movie - year: 2014 - title: A Common Title Special - -? A.Common.Title.2014.Special.avi -: type: episode - year: 2014 - series: A Common Title - title: Special - episodeDetails: Special - -? A.Common.Title.2014.Special.Edition.avi -: type: movie - year: 2014 - title: A Common Title - edition: Special Edition - -? Downton.Abbey.2013.Christmas.Special.HDTV.x264-FoV.mp4 -: type: episode - year: 2013 - series: Downton Abbey - title: Christmas Special - videoCodec: h264 - releaseGroup: FoV - format: HDTV - episodeDetails: Special - -? Doctor_Who_2013_Christmas_Special.The_Time_of_The_Doctor.HD -: options: -n - type: episode - series: Doctor Who - other: HD - episodeDetails: Special - title: Christmas Special The Time of The Doctor - year: 2013 - -? Doctor Who 2005 50th Anniversary Special The Day of the Doctor 3.avi -: type: episode - series: Doctor Who - episodeDetails: Special - title: 50th Anniversary Special The Day of the Doctor 3 - year: 2005 - -? Robot Chicken S06-Born Again Virgin Christmas Special HDTV x264.avi -: type: episode - series: Robot Chicken - format: HDTV - season: 6 - title: Born Again Virgin Christmas Special - videoCodec: h264 - episodeDetails: Special - -? Wicked.Tuna.S03E00.Head.To.Tail.Special.HDTV.x264-YesTV -: options: -n - type: episode - series: Wicked Tuna - title: Head To Tail Special - releaseGroup: YesTV - season: 3 - episodeNumber: 0 - videoCodec: h264 - format: HDTV - episodeDetails: Special - -? 
The.Voice.UK.S03E12.HDTV.x264-C4TV -: options: -n - episodeNumber: 12 - videoCodec: h264 - format: HDTV - series: The Voice (UK) - releaseGroup: C4TV - season: 3 - country: United Kingdom - type: episode - -? /tmp/star.trek.9/star.trek.9.mkv -: type: movie - title: star trek 9 - -? star.trek.9.mkv -: type: movie - title: star trek 9 - -? FlexGet.S01E02.TheName.HDTV.xvid -: options: -n - episodeNumber: 2 - format: HDTV - season: 1 - series: FlexGet - title: TheName - type: episode - videoCodec: XviD - -? FlexGet.S01E02.TheName.HDTV.xvid -: options: -n - episodeNumber: 2 - format: HDTV - season: 1 - series: FlexGet - title: TheName - type: episode - videoCodec: XviD - -? some.series.S03E14.Title.Here.720p -: options: -n - episodeNumber: 14 - screenSize: 720p - season: 3 - series: some series - title: Title Here - type: episode - -? '[the.group] Some.Series.S03E15.Title.Two.720p' -: options: -n - episodeNumber: 15 - releaseGroup: the.group - screenSize: 720p - season: 3 - series: Some Series - title: Title Two - type: episode - -? 'HD 720p: Some series.S03E16.Title.Three' -: options: -n - episodeNumber: 16 - other: HD - screenSize: 720p - season: 3 - series: Some series - title: Title Three - type: episode - -? Something.Season.2.1of4.Ep.Title.HDTV.torrent -: episodeCount: 4 - episodeNumber: 1 - format: HDTV - season: 2 - series: Something - title: Title - type: episode - -? Show-A (US) - Episode Title S02E09 hdtv -: options: -n - country: US - episodeNumber: 9 - format: HDTV - season: 2 - series: Show-A (US) - type: episode - -? Jack's.Show.S03E01.blah.1080p -: options: -n - episodeNumber: 1 - screenSize: 1080p - season: 3 - series: Jack's Show - title: blah - type: episode - -? FlexGet.epic -: options: -n - title: FlexGet epic - type: movie - -? FlexGet.Apt.1 -: options: -n - title: FlexGet Apt 1 - type: movie - -? FlexGet.aptitude -: options: -n - title: FlexGet aptitude - type: movie - -? FlexGet.Step1 -: options: -n - title: FlexGet Step1 - type: movie - -? 
Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720 * 432].avi -: format: DVD - screenSize: 720x432 - title: El Bosque Animado - videoCodec: XviD - year: 1987 - type: movie - -? Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi -: format: DVD - screenSize: 720x432 - title: El Bosque Animado - videoCodec: XviD - year: 1987 - type: movie - -? 2009.shoot.fruit.chan.multi.dvd9.pal -: options: -n - format: DVD - language: mul - other: PAL - title: shoot fruit chan - type: movie - year: 2009 - -? 2009.shoot.fruit.chan.multi.dvd5.pal -: options: -n - format: DVD - language: mul - other: PAL - title: shoot fruit chan - type: movie - year: 2009 - -? The.Flash.2014.S01E01.PREAIR.WEBRip.XviD-EVO.avi -: episodeNumber: 1 - format: WEBRip - other: Preair - releaseGroup: EVO - season: 1 - series: The Flash - type: episode - videoCodec: XviD - year: 2014 - -? Ice.Lake.Rebels.S01E06.Ice.Lake.Games.720p.HDTV.x264-DHD -: options: -n - episodeNumber: 6 - format: HDTV - releaseGroup: DHD - screenSize: 720p - season: 1 - series: Ice Lake Rebels - title: Ice Lake Games - type: episode - videoCodec: h264 - -? The League - S06E10 - Epi Sexy.mkv -: episodeNumber: 10 - season: 6 - series: The League - title: Epi Sexy - type: episode - -? Stay (2005) [1080p]/Stay.2005.1080p.BluRay.x264.YIFY.mp4 -: format: BluRay - releaseGroup: YIFY - screenSize: 1080p - title: Stay - type: movie - videoCodec: h264 - year: 2005 \ No newline at end of file diff --git a/libs/guessit/test/dummy.srt b/libs/guessit/test/dummy.srt deleted file mode 100644 index ca4cf8b8..00000000 --- a/libs/guessit/test/dummy.srt +++ /dev/null @@ -1 +0,0 @@ -Just a dummy srt file (used for unittests: do not remove!) diff --git a/libs/guessit/test/episodes.yaml b/libs/guessit/test/episodes.yaml deleted file mode 100644 index afba6e74..00000000 --- a/libs/guessit/test/episodes.yaml +++ /dev/null @@ -1,1174 +0,0 @@ -# Dubious tests -# -#? 
"finale " -#: releaseGroup: FiNaLe -# extension: "" - - -? Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.avi -: series: Californication - season: 2 - episodeNumber: 5 - title: Vaginatown - format: HDTV - videoCodec: XviD - releaseGroup: 0TV - -? Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi -: series: Dexter - season: 5 - episodeNumber: 2 - title: Hello, Bandit - language: English - subtitleLanguage: French - format: HDTV - videoCodec: XviD - releaseGroup: AlFleNi-TeaM - website: tvu.org.ru - -? Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi -: series: Treme - season: 1 - episodeNumber: 3 - title: Right Place, Wrong Time - format: HDTV - videoCodec: XviD - releaseGroup: NoTV - -? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi -: series: Duckman - season: 1 - episodeNumber: 1 - title: I, Duckman - date: 2002-11-07 - -? Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi -: series: Duckman - season: 1 - episodeNumber: 13 - title: Joking The Chicken - -? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi -: series: The Simpsons - season: 12 - episodeNumber: 8 - title: A Bas Le Sergent Skinner - language: French - -? Series/Futurama/Season 3 (mkv)/[â„¢] Futurama - S03E22 - Le chef de fer à 30% ( 30 Percent Iron Chef ).mkv -: series: Futurama - season: 3 - episodeNumber: 22 - title: Le chef de fer à 30% - -? Series/The Office/Season 6/The Office - S06xE01.avi -: series: The Office - season: 6 - episodeNumber: 1 - -? series/The Office/Season 4/The Office [401] Fun Run.avi -: series: The Office - season: 4 - episodeNumber: 1 - title: Fun Run - -? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi -: series: Mad Men - season: 1 - episodeNumber: 1 - other: complete - -? 
series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E02.65.Million.Years.Off.avi -: series: Psych - season: 2 - episodeNumber: 2 - title: 65 Million Years Off - language: english - format: DVD - other: complete - -? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E03.Psy.Vs.Psy.Français.srt -: series: Psych - season: 2 - episodeNumber: 3 - title: Psy Vs Psy - format: DVD - language: English - subtitleLanguage: French - other: complete - -? Series/Pure Laine/Pure.Laine.1x01.Toutes.Couleurs.Unies.FR.(Québec).DVB-Kceb.[tvu.org.ru].avi -: series: Pure Laine - season: 1 - episodeNumber: 1 - title: Toutes Couleurs Unies - format: DVB - releaseGroup: Kceb - language: french - website: tvu.org.ru - -? Series/Pure Laine/2x05 - Pure Laine - Je Me Souviens.avi -: series: Pure Laine - season: 2 - episodeNumber: 5 - title: Je Me Souviens - -? Series/Tout sur moi/Tout sur moi - S02E02 - Ménage à trois (14-01-2008) [Rip by Ampli].avi -: series: Tout sur moi - season: 2 - episodeNumber: 2 - title: Ménage à trois - date: 2008-01-14 - -? The.Mentalist.2x21.18-5-4.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi -: series: The Mentalist - season: 2 - episodeNumber: 21 - title: 18-5-4 - language: english - subtitleLanguage: french - format: HDTV - videoCodec: Xvid - releaseGroup: AlFleNi-TeaM - website: tvu.org.ru - -? series/__ Incomplete __/Dr Slump (Catalan)/Dr._Slump_-_003_DVB-Rip_Catalan_by_kelf.avi -: series: Dr Slump - episodeNumber: 3 - format: DVB - language: catalan - -? series/Ren and Stimpy - Black_hole_[DivX].avi -: series: Ren and Stimpy - title: Black hole - videoCodec: DivX - -? Series/Walt Disney/Donald.Duck.-.Good.Scouts.[www.bigernie.jump.to].avi -: series: Donald Duck - title: Good Scouts - website: www.bigernie.jump.to - -? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi -: series: Neverwhere - episodeNumber: 5 - title: Down Street - website: tvu.org.ru - -? 
Series/South Park/Season 4/South.Park.4x07.Cherokee.Hair.Tampons.DVDRip.[tvu.org.ru].avi -: series: South Park - season: 4 - episodeNumber: 7 - title: Cherokee Hair Tampons - format: DVD - website: tvu.org.ru - -? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi -: series: Kaamelott - episodeNumber: 23 - title: Le Forfait - -? Series/Duckman/Duckman - 110 (10) - 20021218 - Cellar Beware.avi -: series: Duckman - season: 1 - episodeNumber: 10 - date: 2002-12-18 - title: Cellar Beware - -? Series/Ren & Stimpy/Ren And Stimpy - Onward & Upward-Adult Party Cartoon.avi -: series: Ren And Stimpy - title: Onward & Upward-Adult Party Cartoon - -? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi -: series: Breaking Bad - episodeFormat: Minisode - episodeNumber: 1 - title: Good Cop Bad Cop - format: WEBRip - videoCodec: XviD - -? Series/My Name Is Earl/My.Name.Is.Earl.S01Extras.-.Bad.Karma.DVDRip.XviD.avi -: series: My Name Is Earl - season: 1 - title: Bad Karma - format: DVD - episodeDetails: Extras - videoCodec: XviD - -? series/Freaks And Geeks/Season 1/Episode 4 - Kim Kelly Is My Friend-eng(1).srt -: series: Freaks And Geeks - season: 1 - episodeNumber: 4 - title: Kim Kelly Is My Friend - language: English - -? /mnt/series/The Big Bang Theory/S01/The.Big.Bang.Theory.S01E01.mkv -: series: The Big Bang Theory - season: 1 - episodeNumber: 1 - -? /media/Parks_and_Recreation-s03-e01.mkv -: series: Parks and Recreation - season: 3 - episodeNumber: 1 - -? /media/Parks_and_Recreation-s03-e02-Flu_Season.mkv -: series: Parks and Recreation - season: 3 - title: Flu Season - episodeNumber: 2 - -? /media/Parks_and_Recreation-s03-x01.mkv -: series: Parks and Recreation - season: 3 - bonusNumber: 1 - -? /media/Parks_and_Recreation-s03-x02-Gag_Reel.mkv -: series: Parks and Recreation - season: 3 - bonusNumber: 2 - bonusTitle: Gag Reel - -? 
/media/Band_of_Brothers-e01-Currahee.mkv -: series: Band of Brothers - episodeNumber: 1 - title: Currahee - -? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv -: series: Band of Brothers - bonusNumber: 2 - bonusTitle: We Stand Alone Together - -? /TV Shows/Mad.M-5x9.mkv -: series: Mad M - season: 5 - episodeNumber: 9 - -? /TV Shows/new.girl.117.hdtv-lol.mp4 -: series: New Girl - season: 1 - episodeNumber: 17 - format: HDTV - releaseGroup: LOL - -? Kaamelott - 5x44x45x46x47x48x49x50.avi -: series: Kaamelott - season: 5 - episodeNumber: 44 - episodeList: [44, 45, 46, 47, 48, 49, 50] - -? Example S01E01-02.avi -: series: Example - season: 1 - episodeNumber: 1 - episodeList: [1, 2] - -? Example S01E01E02.avi -: series: Example - season: 1 - episodeNumber: 1 - episodeList: [1, 2] - -? Series/Baccano!/Baccano!_-_T1_-_Trailer_-_[Ayu](dae8173e).mkv -: series: Baccano! - other: Trailer - releaseGroup: Ayu - title: T1 - crc32: dae8173e - -? Series/Doctor Who (2005)/Season 06/Doctor Who (2005) - S06E01 - The Impossible Astronaut (1).avi -: series: Doctor Who - year: 2005 - season: 6 - episodeNumber: 1 - title: The Impossible Astronaut - -? Parks and Recreation - [04x12] - Ad Campaign.avi -: series: Parks and Recreation - season: 4 - episodeNumber: 12 - title: Ad Campaign - -? The Sopranos - [05x07] - In Camelot.mp4 -: series: The Sopranos - season: 5 - episodeNumber: 7 - title: In Camelot - -? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi -: series: The Office (US) - country: US - season: 1 - episodeNumber: 3 - title: Health Care - format: HDTV - videoCodec: XviD - releaseGroup: LOL - -? /Volumes/data-1/Series/Futurama/Season 3/Futurama_-_S03_DVD_Bonus_-_Deleted_Scenes_Part_3.ogm -: series: Futurama - season: 3 - part: 3 - other: Bonus - title: Deleted Scenes - format: DVD - -? Ben.and.Kate.S01E02.720p.HDTV.X264-DIMENSION.mkv -: series: Ben and Kate - season: 1 - episodeNumber: 2 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? 
/volume1/TV Series/Drawn Together/Season 1/Drawn Together 1x04 Requiem for a Reality Show.avi -: series: Drawn Together - season: 1 - episodeNumber: 4 - title: Requiem for a Reality Show - -? Sons.of.Anarchy.S05E06.720p.WEB.DL.DD5.1.H.264-CtrlHD.mkv -: series: Sons of Anarchy - season: 5 - episodeNumber: 6 - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - releaseGroup: CtrlHD - -? /media/bdc64bfe-e36f-4af8-b550-e6fd2dfaa507/TV_Shows/Doctor Who (2005)/Saison 6/Doctor Who (2005) - S06E13 - The Wedding of River Song.mkv -: series: Doctor Who - season: 6 - episodeNumber: 13 - year: 2005 - title: The Wedding of River Song - idNumber: bdc64bfe-e36f-4af8-b550-e6fd2dfaa507 - -? /mnt/videos/tvshows/Doctor Who/Season 06/E13 - The Wedding of River Song.mkv -: series: Doctor Who - season: 6 - episodeNumber: 13 - title: The Wedding of River Song - -? The.Simpsons.S24E03.Adventures.in.Baby-Getting.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv -: series: The Simpsons - season: 24 - episodeNumber: 3 - title: Adventures in Baby-Getting - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - releaseGroup: CtrlHD - -? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv -: series: Merlin - season: 5 - episodeNumber: 2 - part: 2 - title: Arthurs bane - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: Fov - year: 2008 - other: Proper - -? "Da Vinci's Demons - 1x04 - The Magician.mkv" -: series: "Da Vinci's Demons" - season: 1 - episodeNumber: 4 - title: The Magician - -? CSI.S013E18.Sheltered.720p.WEB-DL.DD5.1.H.264.mkv -: series: CSI - season: 13 - episodeNumber: 18 - title: Sheltered - screenSize: 720p - format: WEB-DL - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - -? 
Game of Thrones S03E06 1080i HDTV DD5.1 MPEG2-TrollHD.ts -: series: Game of Thrones - season: 3 - episodeNumber: 6 - screenSize: 1080i - format: HDTV - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: MPEG2 - releaseGroup: TrollHD - -? gossip.girl.s01e18.hdtv.xvid-2hd.eng.srt -: series: gossip girl - season: 1 - episodeNumber: 18 - format: HDTV - videoCodec: XviD - releaseGroup: 2HD - subtitleLanguage: english - -? Wheels.S03E01E02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-E02.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Wheels.S03E01-03.720p.HDTV.x264-IMMERSE.mkv -: series: Wheels - season: 3 - episodeNumber: 1 - episodeList: [1, 2, 3] - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: IMMERSE - -? Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.HDTV.X264-DIMENSION.mkv -: series: Marvels Agents of S.H.I.E.L.D. - season: 1 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? Marvels.Agents.of.S.H.I.E.L.D..S01E06.720p.HDTV.X264-DIMENSION.mkv -: series: Marvels Agents of S.H.I.E.L.D. - season: 1 - episodeNumber: 6 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: DIMENSION - -? Series/Friday Night Lights/Season 1/Friday Night Lights S01E19 - Ch-Ch-Ch-Ch-Changes.avi -: series: Friday Night Lights - season: 1 - episodeNumber: 19 - title: Ch-Ch-Ch-Ch-Changes - -? 
Dexter Saison VII FRENCH.BDRip.XviD-MiND.nfo -: series: Dexter - season: 7 - videoCodec: XviD - language: French - format: BluRay - releaseGroup: MiND - -? Dexter Saison sept FRENCH.BDRip.XviD-MiND.nfo -: series: Dexter - season: 7 - videoCodec: XviD - language: French - format: BluRay - releaseGroup: MiND - -? "Pokémon S16 - E29 - 1280*720 HDTV VF.mkv" -: series: Pokémon - format: HDTV - language: French - season: 16 - episodeNumber: 29 - screenSize: 720p - -? One.Piece.E576.VOSTFR.720p.HDTV.x264-MARINE-FORD.mkv -: episodeNumber: 576 - videoCodec: h264 - format: HDTV - series: One Piece - releaseGroup: MARINE-FORD - subtitleLanguage: French - screenSize: 720p - -? Dexter.S08E12.FINAL.MULTi.1080p.BluRay.x264-MiND.mkv -: videoCodec: h264 - episodeNumber: 12 - season: 8 - format: BluRay - series: Dexter - other: final - language: Multiple languages - releaseGroup: MiND - screenSize: 1080p - -? One Piece - E623 VOSTFR HD [www.manga-ddl-free.com].mkv -: website: www.manga-ddl-free.com - episodeNumber: 623 - subtitleLanguage: French - series: One Piece - other: HD - -? Falling Skies Saison 1.HDLight.720p.x264.VFF.mkv -: language: French - screenSize: 720p - season: 1 - series: Falling Skies - videoCodec: h264 - other: HDLight - -? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BP.mkv -: episodeNumber: 9 - videoCodec: h264 - format: WEB-DL - series: Sleepy Hollow - audioChannels: "5.1" - screenSize: 720p - season: 1 - videoProfile: BP - audioCodec: DolbyDigital - -? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BS.mkv -: episodeNumber: 9 - videoCodec: h264 - format: WEB-DL - series: Sleepy Hollow - audioChannels: "5.1" - screenSize: 720p - season: 1 - releaseGroup: BS - audioCodec: DolbyDigital - -? Battlestar.Galactica.S00.Pilot.FRENCH.DVDRip.XviD-NOTAG.avi -: series: Battlestar Galactica - season: 0 - title: Pilot - episodeDetails: Pilot - language: French - format: DVD - videoCodec: XviD - releaseGroup: NOTAG - -? 
The Big Bang Theory S00E00 Unaired Pilot VOSTFR TVRip XviD-VioCs -: options: -n - series: The Big Bang Theory - season: 0 - episodeNumber: 0 - subtitleLanguage: French - format: TV - videoCodec: XviD - releaseGroup: VioCs - episodeDetails: [Unaired, Pilot] - title: Unaired Pilot - -? The Big Bang Theory S01E00 PROPER Unaired Pilot TVRip XviD-GIGGITY -: options: -n - series: The Big Bang Theory - season: 1 - episodeNumber: 0 - format: TV - videoCodec: XviD - releaseGroup: GIGGITY - other: proper - episodeDetails: [Unaired, Pilot] - title: Unaired Pilot - -? Pawn.Stars.S2014E18.720p.HDTV.x264-KILLERS -: options: -n - series: Pawn Stars - season: 2014 - year: 2014 - episodeNumber: 18 - screenSize: 720p - format: HDTV - videoCodec: h264 - releaseGroup: KILLERS - -? 2.Broke.Girls.S03E10.480p.HDTV.x264-mSD.mkv -: series: 2 Broke Girls - season: 3 - episodeNumber: 10 - screenSize: 480p - format: HDTV - videoCodec: h264 - releaseGroup: mSD - -? House.of.Cards.2013.S02E03.1080p.NF.WEBRip.DD5.1.x264-NTb.mkv -: series: House of Cards - year: 2013 - season: 2 - episodeNumber: 3 - screenSize: 1080p - other: Netflix - format: Webrip - audioChannels: "5.1" - audioCodec: DolbyDigital - videoCodec: h264 - releaseGroup: NTb - -? the.100.109.hdtv-lol.mp4 -: series: the 100 - season: 1 - episodeNumber: 9 - format: HDTV - releaseGroup: lol - -? 03-Criminal.Minds.5x03.Reckoner.ENG.-.sub.FR.HDTV.XviD-STi.[tvu.org.ru].avi -: series: Criminal Minds - language: English - subtitleLanguage: French - season: 5 - episodeNumber: 3 - videoCodec: XviD - format: HDTV - website: tvu.org.ru - releaseGroup: STi - title: Reckoner - -? 03-Criminal.Minds.avi -: series: Criminal Minds - episodeNumber: 3 - -? '[Evil-Saizen]_Laughing_Salesman_14_[DVD][1C98686A].mkv' -: crc32: 1C98686A - episodeNumber: 14 - format: DVD - releaseGroup: Evil-Saizen - series: Laughing Salesman - -? 
'[Kaylith] Zankyou no Terror - 04 [480p][B4D4514E].mp4' -: crc32: B4D4514E - episodeNumber: 4 - releaseGroup: Kaylith - screenSize: 480p - series: Zankyou no Terror - -? '[PuyaSubs!] Seirei Tsukai no Blade Dance - 05 [720p][32DD560E].mkv' -: crc32: 32DD560E - episodeNumber: 5 - releaseGroup: PuyaSubs! - screenSize: 720p - series: Seirei Tsukai no Blade Dance - -? '[Doremi].Happiness.Charge.Precure.27.[1280x720].[DC91581A].mkv' -: crc32: DC91581A - episodeNumber: 27 - releaseGroup: Doremi - screenSize: 720p - series: Happiness Charge Precure - -? "[Daisei] Free!:Iwatobi Swim Club - 01 ~ (BD 720p 10-bit AAC) [99E8E009].mkv" -: audioCodec: AAC - crc32: 99E8E009 - episodeNumber: 1 - format: BluRay - releaseGroup: Daisei - screenSize: 720p - series: Free!:Iwatobi Swim Club - videoProfile: 10bit - -? '[Tsundere] Boku wa Tomodachi ga Sukunai - 03 [BDRip h264 1920x1080 10bit FLAC][AF0C22CC].mkv' -: audioCodec: Flac - crc32: AF0C22CC - episodeNumber: 3 - format: BluRay - releaseGroup: Tsundere - screenSize: 1080p - series: Boku wa Tomodachi ga Sukunai - videoCodec: h264 - videoProfile: 10bit - -? '[t.3.3.d]_Mikakunin_de_Shinkoukei_-_12_[720p][5DDC1352].mkv' -: crc32: 5DDC1352 - episodeNumber: 12 - screenSize: 720p - series: Mikakunin de Shinkoukei - releaseGroup: t.3.3.d - -? '[Anime-Koi] Sabagebu! - 06 [h264-720p][ABB3728A].mkv' -: crc32: ABB3728A - episodeNumber: 6 - releaseGroup: Anime-Koi - screenSize: 720p - series: Sabagebu! - videoCodec: h264 - -? '[aprm-Diogo4D] [BD][1080p] Nagi no Asukara 08 [4D102B7C].mkv' -: crc32: 4D102B7C - episodeNumber: 8 - format: BluRay - releaseGroup: aprm-Diogo4D - screenSize: 1080p - series: Nagi no Asukara - -? '[Akindo-SSK] Zankyou no Terror - 05 [720P][Sub_ITA][F5CCE87C].mkv' -: crc32: F5CCE87C - episodeNumber: 5 - releaseGroup: Akindo-SSK - screenSize: 720p - series: Zankyou no Terror - subtitleLanguage: it - -? Naruto Shippuden Episode 366 VOSTFR.avi -: episodeNumber: 366 - series: Naruto Shippuden - subtitleLanguage: fr - -? 
Naruto Shippuden Episode 366v2 VOSTFR.avi -: episodeNumber: 366 - version: 2 - series: Naruto Shippuden - subtitleLanguage: fr - -? '[HorribleSubs] Ao Haru Ride - 06 [480p].mkv' -: episodeNumber: 6 - releaseGroup: HorribleSubs - screenSize: 480p - series: Ao Haru Ride - -? '[DeadFish] Tari Tari - 01 [BD][720p][AAC].mp4' -: audioCodec: AAC - episodeNumber: 1 - format: BluRay - releaseGroup: DeadFish - screenSize: 720p - series: Tari Tari - -? '[NoobSubs] Sword Art Online II 06 (720p 8bit AAC).mp4' -: audioCodec: AAC - episodeNumber: 6 - releaseGroup: NoobSubs - screenSize: 720p - series: Sword Art Online II - videoProfile: 8bit - -? '[DeadFish] 01 - Tari Tari [BD][720p][AAC].mp4' -: audioCodec: AAC - episodeNumber: 1 - format: BluRay - releaseGroup: DeadFish - screenSize: 720p - series: Tari Tari - -? '[NoobSubs] 06 Sword Art Online II (720p 8bit AAC).mp4' -: audioCodec: AAC - episodeNumber: 6 - releaseGroup: NoobSubs - screenSize: 720p - series: Sword Art Online II - videoProfile: 8bit - -? '[DeadFish] 12 - Tari Tari [BD][720p][AAC].mp4' -: audioCodec: AAC - episodeNumber: 12 - format: BluRay - releaseGroup: DeadFish - screenSize: 720p - series: Tari Tari - -? Something.Season.2.1of4.Ep.Title.HDTV.torrent -: episodeCount: 4 - episodeNumber: 1 - format: HDTV - season: 2 - series: Something - title: Title - extension: torrent - -? Something.Season.2of5.3of9.Ep.Title.HDTV.torrent -: episodeCount: 9 - episodeNumber: 3 - format: HDTV - season: 2 - seasonCount: 5 - series: Something - title: Title - extension: torrent - -? Something.Other.Season.3of5.Complete.HDTV.torrent -: format: HDTV - other: Complete - season: 3 - seasonCount: 5 - series: Something Other - extension: torrent - -? Something.Other.Season.1-3.avi -: season: 1 - seasonList: - - 1 - - 2 - - 3 - series: Something Other - -? Something.Other.Season.1&3.avi -: season: 1 - seasonList: - - 1 - - 3 - series: Something Other - -? 
Something.Other.Season.1&3-1to12ep.avi -: season: 1 - seasonList: - - 1 - - 3 - series: Something Other - -? Something.Other.saison 1 2 & 4 a 7.avi -: season: 1 - seasonList: - - 1 - - 2 - - 4 - - 5 - - 6 - - 7 - series: Something Other - -? W2Test.123.HDTV.XViD-FlexGet -: options: -n - episodeNumber: 23 - season: 1 - format: HDTV - releaseGroup: FlexGet - series: W2Test - videoCodec: XviD - -? W2Test.123.HDTV.XViD-FlexGet -: options: -n --episode-prefer-number - episodeNumber: 123 - format: HDTV - releaseGroup: FlexGet - series: W2Test - videoCodec: XviD - -? FooBar.0307.PDTV-FlexGet -: options: -n --episode-prefer-number - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - season: 3 - series: FooBar - -? FooBar.307.PDTV-FlexGet -: options: -n --episode-prefer-number - episodeNumber: 307 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? FooBar.07.PDTV-FlexGet -: options: -n --episode-prefer-number - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? FooBar.7.PDTV-FlexGet -: options: -n -t episode --episode-prefer-number - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? FooBar.0307.PDTV-FlexGet -: options: -n - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - season: 3 - series: FooBar - -? FooBar.307.PDTV-FlexGet -: options: -n - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - season: 3 - series: FooBar - -? FooBar.07.PDTV-FlexGet -: options: -n - episodeNumber: 7 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? FooBar.07v4.PDTV-FlexGet -: options: -n - episodeNumber: 7 - version: 4 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? FooBar.7.PDTV-FlexGet -: options: -n -t episode - format: DVB - releaseGroup: FlexGet - series: FooBar 7 - -? FooBar.7v3.PDTV-FlexGet -: options: -n -t episode - episodeNumber: 7 - version: 3 - format: DVB - releaseGroup: FlexGet - series: FooBar - -? 
Test.S02E01.hdtv.real.proper -: options: -n - episodeNumber: 1 - format: HDTV - other: Proper - properCount: 2 - season: 2 - series: Test - -? Real.Test.S02E01.hdtv.proper -: options: -n - episodeNumber: 1 - format: HDTV - other: Proper - properCount: 1 - season: 2 - series: Real Test - -? Test.Real.S02E01.hdtv.proper -: options: -n - episodeNumber: 1 - format: HDTV - other: Proper - properCount: 1 - season: 2 - series: Test Real - -? Test.S02E01.hdtv.proper -: options: -n - episodeNumber: 1 - format: HDTV - other: Proper - properCount: 1 - season: 2 - series: Test - -? Test.S02E01.hdtv.real.repack.proper -: options: -n - episodeNumber: 1 - format: HDTV - other: Proper - properCount: 3 - season: 2 - series: Test - -? Date.Show.03-29-2012.HDTV.XViD-FlexGet -: options: -n - date: 2012-03-29 - format: HDTV - releaseGroup: FlexGet - series: Date Show - videoCodec: XviD - -? Something.1x5.Season.Complete-FlexGet -: options: -n - episodeNumber: 5 - other: Complete - season: 1 - series: Something - releaseGroup: FlexGet - -? Something Seasons 1 & 2 - Complete -: options: -n - other: Complete - season: 1 - seasonList: - - 1 - - 2 - series: Something - -? Something Seasons 4 Complete -: options: -n - other: Complete - season: 4 - series: Something - -? Something.1xAll.Season.Complete-FlexGet -: options: -n - other: Complete - season: 1 - series: Something - releaseGroup: FlexGet - -? Something.1xAll-FlexGet -: options: -n - other: Complete - season: 1 - series: Something - releaseGroup: FlexGet - -? FlexGet.US.S2013E14.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP -: options: -n - audioChannels: '5.1' - audioCodec: AAC - country: US - episodeNumber: 14 - format: HDTV - releaseGroup: NOGRP - screenSize: 720p - season: 2013 - series: FlexGet (US) - title: Title Here - videoCodec: h264 - year: 2013 - -? 
FlexGet.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP -: options: -n - audioChannels: '5.1' - audioCodec: AAC - episodeCount: 21 - episodeNumber: 14 - format: HDTV - releaseGroup: NOGRP - screenSize: 720p - series: FlexGet - title: Title Here - videoCodec: h264 - -? FlexGet.Series.2013.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP -: options: -n - audioChannels: '5.1' - audioCodec: AAC - episodeCount: 21 - episodeNumber: 14 - format: HDTV - releaseGroup: NOGRP - screenSize: 720p - season: 2013 - series: FlexGet - title: Title Here - videoCodec: h264 - year: 2013 - -? Something.S04E05E09 -: options: -n - episodeList: - - 5 - - 6 - - 7 - - 8 - - 9 - episodeNumber: 5 - season: 4 - series: Something - -? FooBar 360 1080i -: options: -n -t episode --episode-prefer-number - episodeNumber: 360 - screenSize: 1080i - series: FooBar - -? FooBar 360 1080i -: options: -n -t episode - episodeNumber: 60 - season: 3 - screenSize: 1080i - series: FooBar - -? FooBar 360 -: options: -n -t episode - screenSize: 360p - series: FooBar - -? BarFood christmas special HDTV -: options: -n -t episode --expected-series BarFood - format: HDTV - series: BarFood - title: christmas special - episodeDetails: Special - -? Something.2008x12.13-FlexGet -: options: -n -t episode - series: Something - date: 2008-12-13 - title: FlexGet - -? '[Ignored] Test 12' -: options: -n - episodeNumber: 12 - releaseGroup: Ignored - series: Test - -? '[FlexGet] Test 12' -: options: -n - episodeNumber: 12 - releaseGroup: FlexGet - series: Test - -? Test.13.HDTV-Ignored -: options: -n - episodeNumber: 13 - format: HDTV - releaseGroup: Ignored - series: Test - -? Test.13.HDTV-Ignored -: options: -n --expected-series test - episodeNumber: 13 - format: HDTV - releaseGroup: Ignored - series: Test - -? Test.13.HDTV-Ignored -: series: Test - episodeNumber: 13 - format: HDTV - releaseGroup: Ignored - -? 
Test.13.HDTV-Ignored -: options: -n --expected-group "Name;FlexGet" - episodeNumber: 13 - format: HDTV - releaseGroup: Ignored - series: Test - -? Test.13.HDTV-FlexGet -: options: -n - episodeNumber: 13 - format: HDTV - releaseGroup: FlexGet - series: Test - -? Test.14.HDTV-Name -: options: -n - episodeNumber: 14 - format: HDTV - releaseGroup: Name - series: Test - -? Real.Time.With.Bill.Maher.2014.10.31.HDTV.XviD-AFG.avi -: date: 2014-10-31 - format: HDTV - releaseGroup: AFG - series: Real Time With Bill Maher - videoCodec: XviD diff --git a/libs/guessit/test/episodes.yml b/libs/guessit/test/episodes.yml new file mode 100644 index 00000000..adc4755e --- /dev/null +++ b/libs/guessit/test/episodes.yml @@ -0,0 +1,2048 @@ +? __default__ +: type: episode + +? Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.avi +: title: Californication + season: 2 + episode: 5 + episode_title: Vaginatown + format: HDTV + video_codec: XviD + release_group: 0TV + container: avi + +? Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi +: title: Dexter + season: 5 + episode: 2 + episode_title: Hello, Bandit + language: English + subtitle_language: French + format: HDTV + video_codec: XviD + release_group: AlFleNi-TeaM + website: tvu.org.ru + container: avi + +? Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi +: title: Treme + season: 1 + episode: 3 + episode_title: Right Place, Wrong Time + format: HDTV + video_codec: XviD + release_group: NoTV + +? Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi +: title: Duckman + season: 1 + episode: 13 + episode_title: Joking The Chicken + +? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi +: title: The Simpsons + season: 12 + episode: 8 + episode_title: A Bas Le Sergent Skinner + language: French + +? 
Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: title: Duckman + season: 1 + episode: 1 + episode_title: I, Duckman + date: 2002-11-07 + +? Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.avi +: title: The Simpsons + season: 12 + episode: 8 + episode_title: A Bas Le Sergent Skinner + language: French + +? Series/Futurama/Season 3 (mkv)/[â„¢] Futurama - S03E22 - Le chef de fer à 30% ( 30 Percent Iron Chef ).mkv +: title: Futurama + season: 3 + episode: 22 + episode_title: Le chef de fer à 30% + +? Series/The Office/Season 6/The Office - S06xE01.avi +: title: The Office + season: 6 + episode: 1 + +? series/The Office/Season 4/The Office [401] Fun Run.avi +: title: The Office + season: 4 + episode: 1 + episode_title: Fun Run + +? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi +: title: Mad Men + season: 1 + episode: 1 + other: Complete + +? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E02.65.Million.Years.Off.avi +: title: Psych + season: 2 + episode: 2 + episode_title: 65 Million Years Off + language: english + format: DVD + other: Complete + +? series/Psych/Psych S02 Season 2 Complete English DVD/Psych.S02E03.Psy.Vs.Psy.Français.srt +: title: Psych + season: 2 + episode: 3 + episode_title: Psy Vs Psy + format: DVD + language: English + subtitle_language: French + other: Complete + +? Series/Pure Laine/Pure.Laine.1x01.Toutes.Couleurs.Unies.FR.(Québec).DVB-Kceb.[tvu.org.ru].avi +: title: Pure Laine + season: 1 + episode: 1 + episode_title: Toutes Couleurs Unies + format: DVB + release_group: Kceb + language: french + website: tvu.org.ru + +? Series/Pure Laine/2x05 - Pure Laine - Je Me Souviens.avi +: title: Pure Laine + season: 2 + episode: 5 + episode_title: Je Me Souviens + +? Series/Tout sur moi/Tout sur moi - S02E02 - Ménage à trois (14-01-2008) [Rip by Ampli].avi +: title: Tout sur moi + season: 2 + episode: 2 + episode_title: Ménage à trois + date: 2008-01-14 + +? 
The.Mentalist.2x21.18-5-4.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi +: title: The Mentalist + season: 2 + episode: 21 + episode_title: 18-5-4 + language: english + subtitle_language: french + format: HDTV + video_codec: XviD + release_group: AlFleNi-TeaM + website: tvu.org.ru + +? series/__ Incomplete __/Dr Slump (Catalan)/Dr._Slump_-_003_DVB-Rip_Catalan_by_kelf.avi +: title: Dr Slump + episode: 3 + format: DVB + language: catalan + +# Disabling this test because it just doesn't looks like a serie ... +#? series/Ren and Stimpy - Black_hole_[DivX].avi +#: title: Ren and Stimpy +# episode_title: Black hole +# video_codec: DivX + +# Disabling this test because it just doesn't looks like a serie ... +# ? Series/Walt Disney/Donald.Duck.-.Good.Scouts.[www.bigernie.jump.to].avi +#: title: Donald Duck +# episode_title: Good Scouts +# website: www.bigernie.jump.to + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Series/South Park/Season 4/South.Park.4x07.Cherokee.Hair.Tampons.DVDRip.[tvu.org.ru].avi +: title: South Park + season: 4 + episode: 7 + episode_title: Cherokee Hair Tampons + format: DVD + website: tvu.org.ru + +? Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: title: Kaamelott + alternative_title: Livre V + episode: 23 + episode_title: Le Forfait + +? Series/Duckman/Duckman - 110 (10) - 20021218 - Cellar Beware.avi +: title: Duckman + season: 1 + episode: 10 + date: 2002-12-18 + episode_title: Cellar Beware + +# Removing this test because it doesn't look like a series +# ? Series/Ren & Stimpy/Ren And Stimpy - Onward & Upward-Adult Party Cartoon.avi +# : title: Ren And Stimpy +# episode_title: Onward & Upward-Adult Party Cartoon + +? 
Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: title: Breaking Bad + episode_format: Minisode + episode: 1 + episode_title: Good Cop Bad Cop + format: WEBRip + video_codec: XviD + +? Series/My Name Is Earl/My.Name.Is.Earl.S01Extras.-.Bad.Karma.DVDRip.XviD.avi +: title: My Name Is Earl + season: 1 + episode_title: Extras - Bad Karma + format: DVD + episode_details: Extras + video_codec: XviD + +? series/Freaks And Geeks/Season 1/Episode 4 - Kim Kelly Is My Friend-eng(1).srt +: title: Freaks And Geeks + season: 1 + episode: 4 + episode_title: Kim Kelly Is My Friend + subtitle_language: English # This is really a subtitle_language, despite guessit 1.x assert for language. + +? /mnt/series/The Big Bang Theory/S01/The.Big.Bang.Theory.S01E01.mkv +: title: The Big Bang Theory + season: 1 + episode: 1 + +? /media/Parks_and_Recreation-s03-e01.mkv +: title: Parks and Recreation + season: 3 + episode: 1 + +? /media/Parks_and_Recreation-s03-e02-Flu_Season.mkv +: title: Parks and Recreation + season: 3 + episode_title: Flu Season + episode: 2 + +? /media/Parks_and_Recreation-s03-x01.mkv +: title: Parks and Recreation + season: 3 + episode: 1 + +? /media/Parks_and_Recreation-s03-x02-Gag_Reel.mkv +: title: Parks and Recreation + season: 3 + episode: 2 + episode_title: Gag Reel + +? /media/Band_of_Brothers-e01-Currahee.mkv +: title: Band of Brothers + episode: 1 + episode_title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: title: Band of Brothers + bonus: 2 + bonus_title: We Stand Alone Together + +? /TV Shows/Mad.M-5x9.mkv +: title: Mad M + season: 5 + episode: 9 + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: title: new girl + season: 1 + episode: 17 + format: HDTV + release_group: lol + +? Kaamelott - 5x44x45x46x47x48x49x50.avi +: title: Kaamelott + season: 5 + episode: [44, 45, 46, 47, 48, 49, 50] + +? Example S01E01-02.avi +? Example S01E01E02.avi +: title: Example + season: 1 + episode: [1, 2] + +? 
Series/Baccano!/Baccano!_-_T1_-_Trailer_-_[Ayu](dae8173e).mkv +: title: Baccano! + other: Trailer + release_group: Ayu + episode_title: T1 + crc32: dae8173e + +? Series/Doctor Who (2005)/Season 06/Doctor Who (2005) - S06E01 - The Impossible Astronaut (1).avi +: title: Doctor Who + year: 2005 + season: 6 + episode: 1 + episode_title: The Impossible Astronaut + +? The Sopranos - [05x07] - In Camelot.mp4 +: title: The Sopranos + season: 5 + episode: 7 + episode_title: In Camelot + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: title: The Office + country: US + season: 1 + episode: 3 + episode_title: Health Care + format: HDTV + video_codec: XviD + release_group: LOL + +? /Volumes/data-1/Series/Futurama/Season 3/Futurama_-_S03_DVD_Bonus_-_Deleted_Scenes_Part_3.ogm +: title: Futurama + season: 3 + part: 3 + other: Bonus + episode_title: Deleted Scenes + format: DVD + +? Ben.and.Kate.S01E02.720p.HDTV.X264-DIMENSION.mkv +: title: Ben and Kate + season: 1 + episode: 2 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? /volume1/TV Series/Drawn Together/Season 1/Drawn Together 1x04 Requiem for a Reality Show.avi +: title: Drawn Together + season: 1 + episode: 4 + episode_title: Requiem for a Reality Show + +? Sons.of.Anarchy.S05E06.720p.WEB.DL.DD5.1.H.264-CtrlHD.mkv +: title: Sons of Anarchy + season: 5 + episode: 6 + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: CtrlHD + +? /media/bdc64bfe-e36f-4af8-b550-e6fd2dfaa507/TV_Shows/Doctor Who (2005)/Saison 6/Doctor Who (2005) - S06E13 - The Wedding of River Song.mkv +: title: Doctor Who + season: 6 + episode: 13 + year: 2005 + episode_title: The Wedding of River Song + uuid: bdc64bfe-e36f-4af8-b550-e6fd2dfaa507 + +? /mnt/videos/tvshows/Doctor Who/Season 06/E13 - The Wedding of River Song.mkv +: title: Doctor Who + season: 6 + episode: 13 + episode_title: The Wedding of River Song + +? 
The.Simpsons.S24E03.Adventures.in.Baby-Getting.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv +: title: The Simpsons + season: 24 + episode: 3 + episode_title: Adventures in Baby-Getting + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: CtrlHD + +? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv +: title: merlin + season: 5 + episode: 2 + part: 2 + episode_title: arthurs bane + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: fov + year: 2008 + other: Proper + proper_count: 1 + +? "Da Vinci's Demons - 1x04 - The Magician.mkv" +: title: "Da Vinci's Demons" + season: 1 + episode: 4 + episode_title: The Magician + +? CSI.S013E18.Sheltered.720p.WEB-DL.DD5.1.H.264.mkv +: title: CSI + season: 13 + episode: 18 + episode_title: Sheltered + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + +? Game of Thrones S03E06 1080i HDTV DD5.1 MPEG2-TrollHD.ts +: title: Game of Thrones + season: 3 + episode: 6 + screen_size: 1080i + format: HDTV + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: Mpeg2 + release_group: TrollHD + +? gossip.girl.s01e18.hdtv.xvid-2hd.eng.srt +: title: gossip girl + season: 1 + episode: 18 + format: HDTV + video_codec: XviD + release_group: 2hd + subtitle_language: english + +? Wheels.S03E01E02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Wheels.S03E01-02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Wheels.S03E01-E02.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? 
Wheels.S03E01-04.720p.HDTV.x264-IMMERSE.mkv +: title: Wheels + season: 3 + episode: [1, 2, 3, 4] + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: IMMERSE + +? Marvels.Agents.of.S.H.I.E.L.D-S01E06.720p.HDTV.X264-DIMENSION.mkv +: title: Marvels Agents of S.H.I.E.L.D + season: 1 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Marvels.Agents.of.S.H.I.E.L.D.S01E06.720p.HDTV.X264-DIMENSION.mkv +: title: Marvels Agents of S.H.I.E.L.D. + season: 1 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Marvels.Agents.of.S.H.I.E.L.D..S01E06.720p.HDTV.X264-DIMENSION.mkv +: title: Marvels Agents of S.H.I.E.L.D. + season: 1 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Series/Friday Night Lights/Season 1/Friday Night Lights S01E19 - Ch-Ch-Ch-Ch-Changes.avi +: title: Friday Night Lights + season: 1 + episode: 19 + episode_title: Ch-Ch-Ch-Ch-Changes + +? Dexter Saison VII FRENCH.BDRip.XviD-MiND.nfo +: title: Dexter + season: 7 + video_codec: XviD + language: French + format: BluRay + release_group: MiND + +? Dexter Saison sept FRENCH.BDRip.XviD-MiND.nfo +: title: Dexter + season: 7 + video_codec: XviD + language: French + format: BluRay + release_group: MiND + +? "Pokémon S16 - E29 - 1280*720 HDTV VF.mkv" +: title: Pokémon + format: HDTV + language: French + season: 16 + episode: 29 + screen_size: 720p + +? One.Piece.E576.VOSTFR.720p.HDTV.x264-MARINE-FORD.mkv +: episode: 576 + video_codec: h264 + format: HDTV + title: One Piece + release_group: MARINE-FORD + subtitle_language: French + screen_size: 720p + +? Dexter.S08E12.FINAL.MULTi.1080p.BluRay.x264-MiND.mkv +: video_codec: h264 + episode: 12 + season: 8 + format: BluRay + title: Dexter + other: FINAL + language: Multiple languages + release_group: MiND + screen_size: 1080p + +? 
One Piece - E623 VOSTFR HD [www.manga-ddl-free.com].mkv +: website: www.manga-ddl-free.com + episode: 623 + subtitle_language: French + title: One Piece + other: HD + +? Falling Skies Saison 1.HDLight.720p.x264.VFF.mkv +: language: French + screen_size: 720p + season: 1 + title: Falling Skies + video_codec: h264 + other: HDLight + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BP.mkv +: episode: 9 + video_codec: h264 + format: WEB-DL + title: Sleepy Hollow + audio_channels: "5.1" + screen_size: 720p + season: 1 + video_profile: BP + audio_codec: DolbyDigital + +? Sleepy.Hollow.S01E09.720p.WEB-DL.DD5.1.H.264-BS.mkv +: episode: 9 + video_codec: h264 + format: WEB-DL + title: Sleepy Hollow + audio_channels: "5.1" + screen_size: 720p + season: 1 + release_group: BS + audio_codec: DolbyDigital + +? Battlestar.Galactica.S00.Pilot.FRENCH.DVDRip.XviD-NOTAG.avi +: title: Battlestar Galactica + season: 0 + episode_details: Pilot + episode_title: Pilot + language: French + format: DVD + video_codec: XviD + release_group: NOTAG + +? The Big Bang Theory S00E00 Unaired Pilot VOSTFR TVRip XviD-VioCs +: title: The Big Bang Theory + season: 0 + episode: 0 + subtitle_language: French + format: TV + video_codec: XviD + release_group: VioCs + episode_details: [Unaired, Pilot] + +? The Big Bang Theory S01E00 PROPER Unaired Pilot TVRip XviD-GIGGITY +: title: The Big Bang Theory + season: 1 + episode: 0 + format: TV + video_codec: XviD + release_group: GIGGITY + other: Proper + proper_count: 1 + episode_details: [Unaired, Pilot] + +? Pawn.Stars.S2014E18.720p.HDTV.x264-KILLERS +: title: Pawn Stars + season: 2014 + year: 2014 + episode: 18 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: KILLERS + +? 2.Broke.Girls.S03E10.480p.HDTV.x264-mSD.mkv +: title: 2 Broke Girls + season: 3 + episode: 10 + screen_size: 480p + format: HDTV + video_codec: h264 + release_group: mSD + +? 
House.of.Cards.2013.S02E03.1080p.NF.WEBRip.DD5.1.x264-NTb.mkv +: title: House of Cards + year: 2013 + season: 2 + episode: 3 + screen_size: 1080p + other: Netflix + format: WEBRip + audio_channels: "5.1" + audio_codec: DolbyDigital + video_codec: h264 + release_group: NTb + +? the.100.109.hdtv-lol.mp4 +: title: the 100 + season: 1 + episode: 9 + format: HDTV + release_group: lol + +? Criminal.Minds.5x03.Reckoner.ENG.-.sub.FR.HDTV.XviD-STi.[tvu.org.ru].avi +: title: Criminal Minds + language: English + subtitle_language: French + season: 5 + episode: 3 + video_codec: XviD + format: HDTV + website: tvu.org.ru + release_group: STi + episode_title: Reckoner + +? 03-Criminal.Minds.avi +: title: Criminal Minds + episode: 3 + +? '[Evil-Saizen]_Laughing_Salesman_14_[DVD][1C98686A].mkv' +: crc32: 1C98686A + episode: 14 + format: DVD + release_group: Evil-Saizen + title: Laughing Salesman + +? '[Kaylith] Zankyou no Terror - 04 [480p][B4D4514E].mp4' +: crc32: B4D4514E + episode: 4 + release_group: Kaylith + screen_size: 480p + title: Zankyou no Terror + +? '[PuyaSubs!] Seirei Tsukai no Blade Dance - 05 [720p][32DD560E].mkv' +: crc32: 32DD560E + episode: 5 + release_group: PuyaSubs! + screen_size: 720p + title: Seirei Tsukai no Blade Dance + +? '[Doremi].Happiness.Charge.Precure.27.[1280x720].[DC91581A].mkv' +: crc32: DC91581A + episode: 27 + release_group: Doremi + screen_size: 720p + title: Happiness Charge Precure + +? "[Daisei] Free!:Iwatobi Swim Club - 01 ~ (BD 720p 10-bit AAC) [99E8E009].mkv" +: audio_codec: AAC + crc32: 99E8E009 + episode: 1 + format: BluRay + release_group: Daisei + screen_size: 720p + title: Free!:Iwatobi Swim Club + video_profile: 10bit + +? '[Tsundere] Boku wa Tomodachi ga Sukunai - 03 [BDRip h264 1920x1080 10bit FLAC][AF0C22CC].mkv' +: audio_codec: FLAC + crc32: AF0C22CC + episode: 3 + format: BluRay + release_group: Tsundere + screen_size: 1080p + title: Boku wa Tomodachi ga Sukunai + video_codec: h264 + video_profile: 10bit + +? 
'[t.3.3.d]_Mikakunin_de_Shinkoukei_-_12_[720p][5DDC1352].mkv' +: crc32: 5DDC1352 + episode: 12 + screen_size: 720p + title: Mikakunin de Shinkoukei + release_group: t.3.3.d + +? '[Anime-Koi] Sabagebu! - 06 [h264-720p][ABB3728A].mkv' +: crc32: ABB3728A + episode: 6 + release_group: Anime-Koi + screen_size: 720p + title: Sabagebu! + video_codec: h264 + +? '[aprm-Diogo4D] [BD][1080p] Nagi no Asukara 08 [4D102B7C].mkv' +: crc32: 4D102B7C + episode: 8 + format: BluRay + release_group: aprm-Diogo4D + screen_size: 1080p + title: Nagi no Asukara + +? '[Akindo-SSK] Zankyou no Terror - 05 [720P][Sub_ITA][F5CCE87C].mkv' +: crc32: F5CCE87C + episode: 5 + release_group: Akindo-SSK + screen_size: 720p + title: Zankyou no Terror + subtitle_language: it + +? Naruto Shippuden Episode 366 VOSTFR.avi +: episode: 366 + title: Naruto Shippuden + subtitle_language: fr + +? Naruto Shippuden Episode 366v2 VOSTFR.avi +: episode: 366 + version: 2 + title: Naruto Shippuden + subtitle_language: fr + +? '[HorribleSubs] Ao Haru Ride - 06 [480p].mkv' +: episode: 6 + release_group: HorribleSubs + screen_size: 480p + title: Ao Haru Ride + +? '[DeadFish] Tari Tari - 01 [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 1 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? '[NoobSubs] Sword Art Online II 06 (720p 8bit AAC).mp4' +: audio_codec: AAC + episode: 6 + release_group: NoobSubs + screen_size: 720p + title: Sword Art Online II + video_profile: 8bit + +? '[DeadFish] 01 - Tari Tari [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 1 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? '[NoobSubs] 06 Sword Art Online II (720p 8bit AAC).mp4' +: audio_codec: AAC + episode: 6 + release_group: NoobSubs + screen_size: 720p + title: Sword Art Online II + video_profile: 8bit + +? 
'[DeadFish] 12 - Tari Tari [BD][720p][AAC].mp4' +: audio_codec: AAC + episode: 12 + format: BluRay + release_group: DeadFish + screen_size: 720p + title: Tari Tari + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episode_count: 4 + episode: 1 + format: HDTV + season: 2 + title: Something + episode_title: Title + container: torrent + +? Something.Season.2of5.3of9.Ep.Title.HDTV.torrent +: episode_count: 9 + episode: 3 + format: HDTV + season: 2 + season_count: 5 + title: Something + episode_title: Title + container: torrent + +? Something.Other.Season.3of5.Complete.HDTV.torrent +: format: HDTV + other: Complete + season: 3 + season_count: 5 + title: Something Other + container: torrent + +? Something.Other.Season.1-3.avi +: season: [1, 2, 3] + title: Something Other + +? Something.Other.Season.1&3.avi +: season: [1, 3] + title: Something Other + +? Something.Other.Season.1&3-1to12ep.avi +: season: [1, 3] + title: Something Other + +? W2Test.123.HDTV.XViD-FlexGet +: episode: 23 + season: 1 + format: HDTV + release_group: FlexGet + title: W2Test + video_codec: XviD + +? W2Test.123.HDTV.XViD-FlexGet +: options: --episode-prefer-number + episode: 123 + format: HDTV + release_group: FlexGet + title: W2Test + video_codec: XviD + +? FooBar.0307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? FooBar.0307.PDTV-FlexGet +? FooBar.307.PDTV-FlexGet +: options: --episode-prefer-number + episode: 307 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.07.PDTV-FlexGet +: options: --episode-prefer-number + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7.PDTV-FlexGet +: options: --episode-prefer-number + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.0307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? 
FooBar.307.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + season: 3 + title: FooBar + +? FooBar.07.PDTV-FlexGet +: episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.07v4.PDTV-FlexGet +: episode: 7 + version: 4 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7.PDTV-FlexGet +: format: DVB + release_group: FlexGet + title: FooBar 7 + type: movie + +? FooBar.7.PDTV-FlexGet +: options: -t episode + episode: 7 + format: DVB + release_group: FlexGet + title: FooBar + +? FooBar.7v3.PDTV-FlexGet +: options: -t episode + episode: 7 + version: 3 + format: DVB + release_group: FlexGet + title: FooBar + +? Test.S02E01.hdtv.real.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 2 + season: 2 + title: Test + +? Real.Test.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Real Test + +? Test.Real.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Test Real + +? Test.S02E01.hdtv.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 1 + season: 2 + title: Test + +? Test.S02E01.hdtv.real.repack.proper +: episode: 1 + format: HDTV + other: Proper + proper_count: 3 + season: 2 + title: Test + +? Date.Show.03-29-2012.HDTV.XViD-FlexGet +: date: 2012-03-29 + format: HDTV + release_group: FlexGet + title: Date Show + video_codec: XviD + +? Something.1x5.Season.Complete-FlexGet +: episode: 5 + other: Complete + season: 1 + title: Something + release_group: FlexGet + +? Something Seasons 1 & 2 - Complete +: other: Complete + season: + - 1 + - 2 + title: Something + +? Something Seasons 4 Complete +: other: Complete + season: 4 + title: Something + +? Something.1xAll.Season.Complete-FlexGet +: other: Complete + season: 1 + title: Something + release_group: FlexGet + +? Something.1xAll-FlexGet +: other: Complete + season: 1 + title: Something + release_group: FlexGet + +? 
FlexGet.US.S2013E14.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + country: US + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + season: 2013 + title: FlexGet + episode_title: Title Here + video_codec: h264 + year: 2013 + +? FlexGet.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + episode_count: 21 + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + title: FlexGet + episode_title: Title Here + video_codec: h264 + +? FlexGet.Series.2013.14.of.21.Title.Here.720p.HDTV.AAC5.1.x264-NOGRP +: audio_channels: '5.1' + audio_codec: AAC + episode_count: 21 + episode: 14 + format: HDTV + release_group: NOGRP + screen_size: 720p + season: 2013 + title: FlexGet + episode_title: Title Here + video_codec: h264 + year: 2013 + +? Something.S04E05E09 +: episode: # 1.x guessit this as a range from 5 to 9. But not sure if it should ... + - 5 + - 9 + season: 4 + title: Something + +? FooBar 360 1080i +: options: --episode-prefer-number + episode: 360 + screen_size: 1080i + title: FooBar + +? FooBar 360 1080i +: episode: 60 + season: 3 + screen_size: 1080i + title: FooBar + +? FooBar 360 +: screen_size: 360p + title: FooBar + +? BarFood christmas special HDTV +: options: --expected-title BarFood + format: HDTV + title: BarFood + episode_title: christmas special + episode_details: Special + +? Something.2008x12.13-FlexGet +: title: Something + date: 2008-12-13 + episode_title: FlexGet + +? '[Ignored] Test 12' +: episode: 12 + release_group: Ignored + title: Test + +? '[FlexGet] Test 12' +: episode: 12 + release_group: FlexGet + title: Test + +? Test.13.HDTV-Ignored +: episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? Test.13.HDTV-Ignored +: options: --expected-series test + episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? Test.13.HDTV-Ignored +: title: Test + episode: 13 + format: HDTV + release_group: Ignored + +? 
Test.13.HDTV-Ignored +: episode: 13 + format: HDTV + release_group: Ignored + title: Test + +? Test.13.HDTV-FlexGet +: episode: 13 + format: HDTV + release_group: FlexGet + title: Test + +? Test.14.HDTV-Name +: episode: 14 + format: HDTV + release_group: Name + title: Test + +? Real.Time.With.Bill.Maher.2014.10.31.HDTV.XviD-AFG.avi +: date: 2014-10-31 + format: HDTV + release_group: AFG + title: Real Time With Bill Maher + video_codec: XviD + +? Arrow.S03E21.Al.Sah-Him.1080p.WEB-DL.DD5.1.H.264-BS.mkv +: title: Arrow + season: 3 + episode: 21 + episode_title: Al Sah-Him + screen_size: 1080p + audio_codec: DolbyDigital + audio_channels: "5.1" + video_codec: h264 + release_group: BS + format: WEB-DL + +? How to Make It in America - S02E06 - I'm Sorry, Who's Yosi?.mkv +: title: How to Make It in America + season: 2 + episode: 6 + episode_title: I'm Sorry, Who's Yosi? + +? 24.S05E07.FRENCH.DVDRip.XviD-FiXi0N.avi +: episode: 7 + format: DVD + language: fr + season: 5 + title: '24' + video_codec: XviD + release_group: FiXi0N + +? 12.Monkeys.S01E12.FRENCH.BDRip.x264-VENUE.mkv +: episode: 12 + format: BluRay + language: fr + release_group: VENUE + season: 1 + title: 12 Monkeys + video_codec: h264 + +? The.Daily.Show.2015.07.01.Kirsten.Gillibrand.Extended.720p.CC.WEBRip.AAC2.0.x264-BTW.mkv +: audio_channels: '2.0' + audio_codec: AAC + date: 2015-07-01 + format: WEBRip + other: [Extended, CC] + release_group: BTW + screen_size: 720p + title: The Daily Show + episode_title: Kirsten Gillibrand + video_codec: h264 + +? The.Daily.Show.2015.07.01.Kirsten.Gillibrand.Extended.Interview.720p.CC.WEBRip.AAC2.0.x264-BTW.mkv +: audio_channels: '2.0' + audio_codec: AAC + date: 2015-07-01 + format: WEBRip + other: CC + release_group: BTW + screen_size: 720p + title: The Daily Show + episode_title: Kirsten Gillibrand Extended Interview + video_codec: h264 + +? 
The.Daily.Show.2015.07.02.Sarah.Vowell.CC.WEBRip.AAC2.0.x264-BTW.mkv +: audio_channels: '2.0' + audio_codec: AAC + date: 2015-07-02 + format: WEBRip + other: CC + release_group: BTW + title: The Daily Show + episode_title: Sarah Vowell + video_codec: h264 + +? 90.Day.Fiance.S02E07.I.Have.To.Tell.You.Something.720p.HDTV.x264-W4F +: episode: 7 + format: HDTV + screen_size: 720p + season: 2 + title: 90 Day Fiance + episode_title: I Have To Tell You Something + release_group: W4F + +? Doctor.Who.2005.S04E06.FRENCH.LD.DVDRip.XviD-TRACKS.avi +: episode: 6 + format: DVD + language: fr + release_group: TRACKS + season: 4 + title: Doctor Who + other: LD + video_codec: XviD + year: 2005 + +? Astro.Le.Petit.Robot.S01E01+02.FRENCH.DVDRiP.X264.INT-BOOLZ.mkv +: episode: [1, 2] + format: DVD + language: fr + release_group: INT-BOOLZ + season: 1 + title: Astro Le Petit Robot + video_codec: h264 + +? Annika.Bengtzon.2012.E01.Le.Testament.De.Nobel.FRENCH.DVDRiP.XViD-STVFRV.avi +: episode: 1 + format: DVD + language: fr + release_group: STVFRV + title: Annika Bengtzon + episode_title: Le Testament De Nobel + video_codec: XviD + year: 2012 + +? Dead.Set.02.FRENCH.LD.DVDRip.XviD-EPZ.avi +: episode: 2 + format: DVD + language: fr + other: LD + release_group: EPZ + title: Dead Set + video_codec: XviD + +? Phineas and Ferb S01E00 & S01E01 & S01E02 +: episode: [0, 1, 2] + season: 1 + title: Phineas and Ferb + +? Show.Name.S01E02.S01E03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02 - S01E03 - S01E04 - Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: Ep Name + +? Show.Name.1x02.1x03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02 - 1x03 - 1x04 - Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: Ep Name + +? 
Show.Name.S01E02.HDTV.XViD.Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02 - My Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show Name - S01.E03 - My Ep Name +: episode: 3 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show.Name.S01E02E03.HDTV.XViD.Etc-Group +: episode: [2, 3] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - S01E02-03 - My Ep Name +: episode: [2, 3] + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show.Name.S01.E02.E03 +: episode: [2, 3] + season: 1 + title: Show Name + +? Show_Name.1x02.HDTV_XViD_Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02 - My Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: My Ep Name + +? Show_Name.1x02x03x04.HDTV_XViD_Etc-Group +: episode: [2, 3, 4] + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show Name - 1x02-03-04 - My Ep Name +: episode: [2, 3, 4] + season: 1 + title: Show Name + episode_title: My Ep Name + +# 1x guess this as episode 100 but 101 as episode 1 season 1. +? Show.Name.100.Event.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + season: 1 + episode: 0 + format: HDTV + release_group: Etc-Group + title: Show Name + episode_title: Event + video_codec: XviD + +? Show.Name.101.Event.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + season: 1 + episode: 1 + format: HDTV + release_group: Etc-Group + title: Show Name + episode_title: Event + video_codec: XviD + +? Show.Name.2010.11.23.HDTV.XViD.Etc-Group +: date: 2010-11-23 + format: HDTV + release_group: Etc-Group + title: Show Name + +? Show Name - 2010-11-23 - Ep Name +: date: 2010-11-23 + title: Show Name + episode_title: Ep Name + +? 
Show Name Season 1 Episode 2 Ep Name +: episode: 2 + season: 1 + title: Show Name + episode_title: Ep Name + +? Show.Name.S01.HDTV.XViD.Etc-Group +: format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? Show.Name.E02-03 +: episode: [2, 3] + title: Show Name + +? Show.Name.E02.2010 +: episode: 2 + year: 2010 + title: Show Name + +? Show.Name.E23.Test +: episode: 23 + title: Show Name + episode_title: Test + +? Show.Name.Part.3.HDTV.XViD.Etc-Group +: part: 3 + title: Show Name + format: HDTV + video_codec: XviD + release_group: Etc-Group + type: movie + # Fallback to movie type because we can't tell it's a series ... + +? Show.Name.Part.1.and.Part.2.Blah-Group +: part: [1, 2] + title: Show Name + type: movie + # Fallback to movie type because we can't tell it's a series ... + +? Show Name - 01 - Ep Name +: episode: 1 + title: Show Name + episode_title: Ep Name + +? 01 - Ep Name +: episode: 1 + title: Ep Name + +? Show.Name.102.HDTV.XViD.Etc-Group +: episode: 2 + format: HDTV + release_group: Etc-Group + season: 1 + title: Show Name + video_codec: XviD + +? '[HorribleSubs] Maria the Virgin Witch - 01 [720p].mkv' +: episode: 1 + release_group: HorribleSubs + screen_size: 720p + title: Maria the Virgin Witch + +? '[ISLAND]One_Piece_679_[VOSTFR]_[V1]_[8bit]_[720p]_[EB7838FC].mp4' +: options: -E + crc32: EB7838FC + episode: 679 + release_group: ISLAND + screen_size: 720p + title: One Piece + subtitle_language: fr + video_profile: 8bit + version: 1 + +? '[ISLAND]One_Piece_679_[VOSTFR]_[8bit]_[720p]_[EB7838FC].mp4' +: options: -E + crc32: EB7838FC + episode: 679 + release_group: ISLAND + screen_size: 720p + title: One Piece + subtitle_language: fr + video_profile: 8bit + +? '[Kaerizaki-Fansub]_One_Piece_679_[VOSTFR][HD_1280x720].mp4' +: options: -E + episode: 679 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + +? 
'[Kaerizaki-Fansub]_One_Piece_679_[VOSTFR][FANSUB][HD_1280x720].mp4' +: options: -E + episode: 679 + other: + - Fansub + - HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + +? '[Kaerizaki-Fansub]_One_Piece_681_[VOSTFR][HD_1280x720]_V2.mp4' +: options: -E + episode: 681 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: One Piece + subtitle_language: fr + version: 2 + +? '[Kaerizaki-Fansub] High School DxD New 04 VOSTFR HD (1280x720) V2.mp4' +: options: -E + episode: 4 + other: HD + release_group: Kaerizaki-Fansub + screen_size: 720p + title: High School DxD New + subtitle_language: fr + version: 2 + +? '[Kaerizaki-Fansub] One Piece 603 VOSTFR PS VITA (960x544) V2.mp4' +: options: -E + episode: 603 + release_group: + - Kaerizaki-Fansub + - PS VITA + screen_size: 960x544 + title: One Piece + subtitle_language: fr + version: 2 + +? '[Group Name] Show Name.13' +: episode: 13 + release_group: Group Name + title: Show Name + +? '[Group Name] Show Name - 13' +: episode: 13 + release_group: Group Name + title: Show Name + +? '[Group Name] Show Name 13' +: episode: 13 + release_group: Group Name + title: Show Name + +# [Group Name] Show Name.13-14 +# [Group Name] Show Name - 13-14 +# Show Name 13-14 + +? '[Stratos-Subs]_Infinite_Stratos_-_12_(1280x720_H.264_AAC)_[379759DB]' +: audio_codec: AAC + crc32: 379759DB + episode: 12 + release_group: Stratos-Subs + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + +# [ShinBunBu-Subs] Bleach - 02-03 (CX 1280x720 x264 AAC) + +? '[SGKK] Bleach 312v1 [720p/MKV]' +: options: -E # guessit 1.x for episode only when version is guessed, but it's doesn't make it consistent. + episode: 312 + release_group: SGKK + screen_size: 720p + title: Bleach + version: 1 + +? '[Ayako]_Infinite_Stratos_-_IS_-_07_[H264][720p][EB7838FC]' +: crc32: EB7838FC + episode: 7 + release_group: Ayako + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + +? 
'[Ayako] Infinite Stratos - IS - 07v2 [H264][720p][44419534]' +: crc32: '44419534' + episode: 7 + release_group: Ayako + screen_size: 720p + title: Infinite Stratos + video_codec: h264 + version: 2 + +? '[Ayako-Shikkaku] Oniichan no Koto Nanka Zenzen Suki Janain Dakara ne - 10 [LQ][h264][720p] [8853B21C]' +: crc32: 8853B21C + episode: 10 + release_group: Ayako-Shikkaku + screen_size: 720p + title: Oniichan no Koto Nanka Zenzen Suki Janain Dakara ne + video_codec: h264 + +# TODO: Add support for absolute episodes +? Bleach - s16e03-04 - 313-314 +? Bleach.s16e03-04.313-314 +? Bleach.s16e03-04.313-314 +? Bleach - s16e03-04 - 313-314 +? Bleach.s16e03-04.313-314 +? Bleach s16e03e04 313-314 +: episode: [3, 4] + season: 16 + title: Bleach + +? Bleach - 313-314 +: options: -E + episode: [313, 314] + title: Bleach + +? '[ShinBunBu-Subs] Bleach - 02-03 (CX 1280x720 x264 AAC)' +: audio_codec: AAC + episode: [2, 3] + release_group: ShinBunBu-Subs + screen_size: 720p + title: Bleach + video_codec: h264 + +? 003. Show Name - Ep Name.avi +: episode: 3 + title: Show Name + episode_title: Ep Name + +? 003-004. Show Name - Ep Name.avi +: episode: [3, 4] + title: Show Name + episode_title: Ep Name + +? One Piece - 102 +: episode: 2 + season: 1 + title: One Piece + +? "[ACX]_Wolf's_Spirit_001.mkv" +: episode: 1 + release_group: ACX + title: "Wolf's Spirit" + +? Project.Runway.S14E00.and.S14E01.(Eng.Subs).SDTV.x264-[2Maverick].mp4 +: episode: [0, 1] + format: TV + release_group: 2Maverick + season: 14 + title: Project Runway + subtitle_language: en + video_codec: h264 + +? '[Hatsuyuki-Kaitou]_Fairy_Tail_2_-_16-20_[720p][10bit].torrent' +: episode: [16, 17, 18, 19, 20] + release_group: Hatsuyuki-Kaitou + screen_size: 720p + title: Fairy Tail 2 + video_profile: 10bit + +? 
'[Hatsuyuki-Kaitou]_Fairy_Tail_2_-_16-20_(191-195)_[720p][10bit].torrent' +: options: -E + episode: [16, 17, 18, 19, 20, 191, 192, 193, 194, 195] + release_group: Hatsuyuki-Kaitou + screen_size: 720p + title: Fairy Tail 2 + +? "Looney Tunes 1940x01 Porky's Last Stand.mkv" +: episode: 1 + season: 1940 + title: Looney Tunes + episode_title: Porky's Last Stand + year: 1940 + +? The.Good.Wife.S06E01.E10.720p.WEB-DL.DD5.1.H.264-CtrlHD/The.Good.Wife.S06E09.Trust.Issues.720p.WEB-DL.DD5.1.H.264-CtrlHD.mkv +: audio_channels: '5.1' + audio_codec: DolbyDigital + episode: 9 + format: WEB-DL + release_group: CtrlHD + screen_size: 720p + season: 6 + title: The Good Wife + episode_title: Trust Issues + video_codec: h264 + +? Fear the Walking Dead - 01x02 - So Close, Yet So Far.REPACK-KILLERS.French.C.updated.Addic7ed.com.mkv +: episode: 2 + language: fr + other: Proper + proper_count: 1 + season: 1 + title: Fear the Walking Dead + episode_title: So Close, Yet So Far + +? Fear the Walking Dead - 01x02 - En Close, Yet En Far.REPACK-KILLERS.French.C.updated.Addic7ed.com.mkv +: episode: 2 + language: fr + other: Proper + proper_count: 1 + season: 1 + title: Fear the Walking Dead + episode_title: En Close, Yet En Far + +? /av/unsorted/The.Daily.Show.2015.07.22.Jake.Gyllenhaal.720p.HDTV.x264-BATV.mkv +: date: 2015-07-22 + format: HDTV + release_group: BATV + screen_size: 720p + title: The Daily Show + episode_title: Jake Gyllenhaal + video_codec: h264 + +? "[7.1.7.8.5] Foo Bar - 11 (H.264) [5235532D].mkv" +: options: -E + episode: 11 + +? my 720p show S01E02 +: options: -T "my 720p show" + title: my 720p show + season: 1 + episode: 2 + +? my 720p show S01E02 720p +: options: -T "my 720p show" + title: my 720p show + season: 1 + episode: 2 + screen_size: 720p + +? -my 720p show S01E02 +: options: -T "re:my \d+p show" + screen_size: 720p + +? Show S01E02 +: options: -T "The Show" + title: Show + season: 1 + episode: 2 + +? 
Foo's & Bars (2009) S01E01 720p XviD-2HD[AOEU] +: episode: 1 + release_group: 2HD[AOEU] + screen_size: 720p + season: 1 + title: Foo's & Bars + video_codec: XviD + year: 2009 + +? Date.Series.10-11-2008.XViD +: date: 2008-11-10 + title: Date + video_codec: XviD + +? Scrubs/SEASON-06/Scrubs.S06E09.My.Perspective.DVDRip.XviD-WAT/scrubs.s06e09.dvdrip.xvid-wat.avi +: container: avi + episode: 9 + episode_title: My Perspective + format: DVD + mimetype: video/x-msvideo + release_group: WAT + season: 6 + title: Scrubs + video_codec: XviD + +? '[PuyaSubs!] Digimon Adventure tri - 01 [720p][F9967949].mkv' +: container: mkv + crc32: F9967949 + episode: 1 + mimetype: video/x-matroska + release_group: PuyaSubs! + screen_size: 720p + title: Digimon Adventure tri + +? Sherlock.S01.720p.BluRay.x264-AVCHD +: format: BluRay + screen_size: 720p + season: 1 + title: Sherlock + video_codec: h264 + +? Running.Wild.With.Bear.Grylls.S02E07.Michael.B.Jordan.PROPER.HDTV.x264-W4F.avi +: container: avi + episode: 7 + episode_title: Michael B Jordan + format: HDTV + mimetype: video/x-msvideo + other: Proper + proper_count: 1 + release_group: W4F + season: 2 + title: Running Wild With Bear Grylls + video_codec: h264 + +? Homeland.S05E11.Our.Man.in.Damascus.German.Sub.720p.HDTV.x264.iNTERNAL-BaCKToRG +: episode: 11 + episode_title: Our Man in Damascus + format: HDTV + release_group: iNTERNAL-BaCKToRG + screen_size: 720p + season: 5 + subtitle_language: de + title: Homeland + type: episode + video_codec: h264 + +? Breaking.Bad.S01E01.2008.BluRay.VC1.1080P.5.1.WMV-NOVO +: title: Breaking Bad + season: 1 + episode: 1 + year: 2008 + format: BluRay + screen_size: 1080p + audio_channels: '5.1' + container: WMV + release_group: NOVO + type: episode + +? Cosmos.A.Space.Time.Odyssey.S01E02.HDTV.x264.PROPER-LOL +: title: Cosmos A Space Time Odyssey + season: 1 + episode: 2 + format: HDTV + video_codec: h264 + other: Proper + proper_count: 1 + release_group: LOL + type: episode + +? 
Fear.The.Walking.Dead.S02E01.HDTV.x264.AAC.MP4-k3n +: title: Fear The Walking Dead + season: 2 + episode: 1 + format: HDTV + video_codec: h264 + audio_codec: AAC + container: MP4 + release_group: k3n + type: episode + +? Elementary.S01E01.Pilot.DVDSCR.x264.PREAiR-NoGRP +: title: Elementary + season: 1 + episode: 1 + episode_details: Pilot + episode_title: Pilot + format: DVD + video_codec: h264 + other: [Screener, Preair] + release_group: NoGRP + type: episode + +? Once.Upon.a.Time.S05E19.HDTV.x264.REPACK-LOL[ettv] +: title: Once Upon a Time + season: 5 + episode: 19 + format: HDTV + video_codec: h264 + other: Proper + proper_count: 1 + release_group: LOL[ettv] + type: episode + +? Show.Name.S01E03.WEB-DL.x264.HUN-nIk +: title: Show Name + season: 1 + episode: 3 + format: WEB-DL + video_codec: h264 + language: hu + release_group: nIk + type: episode + +? Game.of.Thrones.S6.Ep5.X265.Dolby.2.0.KTM3.mp4 +: audio_channels: '2.0' + audio_codec: DolbyDigital + container: mp4 + episode: 5 + release_group: KTM3 + season: 6 + title: Game of Thrones + type: episode + video_codec: h265 + +? Fargo.-.Season.1.-.720p.BluRay.-.x264.-.ShAaNiG +: format: BluRay + release_group: ShAaNiG + screen_size: 720p + season: 1 + title: Fargo + type: episode + video_codec: h264 + +? Show.Name.S02E02.Episode.Title.1080p.WEB-DL.x264.5.1Ch.-.Group +: audio_channels: '5.1' + episode: 2 + episode_title: Episode Title + format: WEB-DL + release_group: Group + screen_size: 1080p + season: 2 + title: Show Name + type: episode + video_codec: h264 + +? Breaking.Bad.S01E01.2008.BluRay.VC1.1080P.5.1.WMV-NOVO +: audio_channels: '5.1' + container: WMV + episode: 1 + format: BluRay + release_group: NOVO + screen_size: 1080p + season: 1 + title: Breaking Bad + type: episode + year: 2008 + +? 
Cosmos.A.Space.Time.Odyssey.S01E02.HDTV.x264.PROPER-LOL +: episode: 2 + format: HDTV + other: Proper + proper_count: 1 + release_group: LOL + season: 1 + title: Cosmos A Space Time Odyssey + type: episode + video_codec: h264 + +? Elementary.S01E01.Pilot.DVDSCR.x264.PREAiR-NoGRP +: episode: 1 + episode_details: Pilot + episode_title: Pilot + format: DVD + other: + - Screener + - Preair + release_group: NoGRP + season: 1 + title: Elementary + type: episode + video_codec: h264 + +? Fear.The.Walking.Dead.S02E01.HDTV.x264.AAC.MP4-k3n.mp4 +: audio_codec: AAC + container: + - MP4 + - mp4 + episode: 1 + format: HDTV + mimetype: video/mp4 + release_group: k3n + season: 2 + title: Fear The Walking Dead + type: episode + video_codec: h264 + +? Game.of.Thrones.S03.1080p.BluRay.DTS-HD.MA.5.1.AVC.REMUX-FraMeSToR +: audio_channels: '5.1' + audio_codec: DTS + audio_profile: HDMA + format: BluRay + other: Remux + release_group: FraMeSToR + screen_size: 1080p + season: 3 + title: Game of Thrones + type: episode + +? Show.Name.S01E02.HDTV.x264.NL-subs-ABC +: episode: 2 + format: HDTV + release_group: ABC + season: 1 + subtitle_language: nl + title: Show Name + type: episode + video_codec: h264 + +? Friends.S01-S10.COMPLETE.720p.BluRay.x264-PtM +: format: BluRay + other: Complete + release_group: PtM + screen_size: 720p + season: # Should it be [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] ? + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + title: Friends + type: episode + video_codec: h264 + +? Duck.Dynasty.S02E07.Streik.German.DOKU.DL.WS.DVDRiP.x264-CDP +: episode: 7 + episode_title: Streik German DOKU + format: DVD + language: mul + other: WideScreen + release_group: CDP + season: 2 + title: Duck Dynasty + type: episode + video_codec: h264 + +? 
Family.Guy.S13E14.JOLO.German.AC3D.DL.720p.WebHD.x264-CDD +: audio_codec: AC3 + episode: 14 + episode_title: JOLO German + format: WEB-DL + language: mul + release_group: CDD + screen_size: 720p + season: 13 + title: Family Guy + type: episode + video_codec: h264 + +? How.I.Met.Your.Mother.COMPLETE.SERIES.DVDRip.XviD-AR +: options: -L en -C us + format: DVD + other: Complete + release_group: AR + title: How I Met Your Mother + type: movie + video_codec: XviD + +? Show Name The Complete Seasons 1 to 5 720p BluRay x265 HEVC-SUJAIDR[UTR] +: format: BluRay + other: Complete + release_group: SUJAIDR[UTR] + screen_size: 720p + season: + - 1 + - 2 + - 3 + - 4 + - 5 + title: Show Name + type: episode + video_codec: h265 + +? Fear.the.Walking.Dead.-.Season.2.epi.02.XviD.Eng.Ac3-5.1.sub.ita.eng.iCV-MIRCrew +: options: -t episode + audio_channels: '5.1' + audio_codec: AC3 + episode: 2 + episode_title: epi + language: en + release_group: iCV-MIRCrew + season: 2 + subtitle_language: it + title: Fear the Walking Dead + type: episode + video_codec: XviD + +? Game.Of.Thrones.S06E04.720p.PROPER.HDTV.x264-HDD +: episode: 4 + format: HDTV + other: Proper + proper_count: 1 + release_group: HDD + screen_size: 720p + season: 6 + title: Game Of Thrones + type: episode + video_codec: h264 \ No newline at end of file diff --git a/libs/guessit/test/guessittest.py b/libs/guessit/test/guessittest.py deleted file mode 100644 index 1e9374f0..00000000 --- a/libs/guessit/test/guessittest.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import base_text_type, u -from collections import defaultdict -from unittest import TestCase, TestLoader, TextTestRunner -import shlex -import babelfish -import yaml, logging, sys, os -from os.path import * - - -def currentPath(): - '''Returns the path in which the calling file is located.''' - return dirname(join(os.getcwd(), sys._getframe(1).f_globals['__file__'])) - - -def addImportPath(path): - '''Function that adds the specified path to the import path. The path can be - absolute or relative to the calling file.''' - importPath = abspath(join(currentPath(), path)) - sys.path = [importPath] + sys.path - -log = logging.getLogger(__name__) - -from guessit.plugins import transformers -from guessit.options import get_opts -import guessit -from guessit import * -from guessit.matcher import * -from guessit.fileutils import * - - -def allTests(testClass): - return TestLoader().loadTestsFromTestCase(testClass) - - -class TestGuessit(TestCase): - - def checkMinimumFieldsCorrect(self, filename, filetype=None, remove_type=True, - exclude_files=None): - groundTruth = yaml.load(load_file_in_same_dir(__file__, filename)) - - def guess_func(string, options=None): - return guess_file_info(string, options=options, type=filetype) - - return self.checkFields(groundTruth, guess_func, remove_type, exclude_files) - - def checkFields(self, groundTruth, guess_func, remove_type=True, - exclude_files=None): - total = 0 - exclude_files = exclude_files or [] - - fails = defaultdict(list) - additionals = defaultdict(list) 
- - for filename, required_fields in groundTruth.items(): - filename = u(filename) - if filename in exclude_files: - continue - - log.debug('\n' + '-' * 120) - log.info('Guessing information for file: %s' % filename) - - options = required_fields.pop('options') if 'options' in required_fields else None - - if options: - args = shlex.split(options) - options = get_opts().parse_args(args) - options = vars(options) - try: - found = guess_func(filename, options) - except Exception as e: - fails[filename].append("An exception has occured in %s: %s" % (filename, e)) - log.exception("An exception has occured in %s: %s" % (filename, e)) - continue - - total = total + 1 - - # no need for these in the unittests - if remove_type: - try: - del found['type'] - except: - pass - for prop in ('container', 'mimetype', 'unidentified'): - if prop in found: - del found[prop] - - # props which are list of just 1 elem should be opened for easier writing of the tests - for prop in ('language', 'subtitleLanguage', 'other', 'episodeDetails', 'unidentified'): - value = found.get(prop, None) - if isinstance(value, list) and len(value) == 1: - found[prop] = value[0] - - # look for missing properties - for prop, value in required_fields.items(): - if prop not in found: - log.debug("Prop '%s' not found in: %s" % (prop, filename)) - fails[filename].append("'%s' not found in: %s" % (prop, filename)) - continue - - # if both properties are strings, do a case-insensitive comparison - if (isinstance(value, base_text_type) and - isinstance(found[prop], base_text_type)): - if value.lower() != found[prop].lower(): - log.debug("Wrong prop value [str] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - - elif isinstance(value, list) and isinstance(found[prop], list): - if found[prop] and isinstance(found[prop][0], babelfish.Language): - # list of languages - s1 = 
set(Language.fromguessit(s) for s in value) - s2 = set(found[prop]) - else: - # by default we assume list of strings and do a case-insensitive - # comparison on their elements - s1 = set(u(s).lower() for s in value) - s2 = set(u(s).lower() for s in found[prop]) - - if s1 != s2: - log.debug("Wrong prop value [list] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - - elif isinstance(found[prop], babelfish.Language): - try: - if babelfish.Language.fromguessit(value) != found[prop]: - raise ValueError - except: - log.debug("Wrong prop value [Language] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - - elif isinstance(found[prop], babelfish.Country): - try: - if babelfish.Country.fromguessit(value) != found[prop]: - raise ValueError - except: - log.debug("Wrong prop value [Country] for '%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - fails[filename].append("'%s': expected = '%s' - received = '%s'" % (prop, u(value), u(found[prop]))) - - - # otherwise, just compare their values directly - else: - if found[prop] != value: - log.debug("Wrong prop value for '%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) - fails[filename].append("'%s': expected = '%s' [%s] - received = '%s' [%s]" % (prop, u(value), type(value), u(found[prop]), type(found[prop]))) - - # look for additional properties - for prop, value in found.items(): - if prop not in required_fields: - log.debug("Found additional info for prop = '%s': '%s'" % (prop, u(value))) - additionals[filename].append("'%s': '%s'" % (prop, u(value))) - - correct = total - len(fails) - log.info('SUMMARY: Guessed correctly %d out of %d filenames' % (correct, total)) - 
- for failed_entry, failed_properties in fails.items(): - log.error('---- ' + failed_entry + ' ----') - for failed_property in failed_properties: - log.error("FAILED: " + failed_property) - - for additional_entry, additional_properties in additionals.items(): - log.warning('---- ' + additional_entry + ' ----') - for additional_property in additional_properties: - log.warning("ADDITIONAL: " + additional_property) - - self.assertTrue(correct == total, - msg='Correct: %d < Total: %d' % (correct, total)) diff --git a/libs/guessit/test/movies.yaml b/libs/guessit/test/movies.yml similarity index 60% rename from libs/guessit/test/movies.yaml rename to libs/guessit/test/movies.yml index 7894ef69..a132b116 100644 --- a/libs/guessit/test/movies.yaml +++ b/libs/guessit/test/movies.yml @@ -1,91 +1,93 @@ +? __default__ +: type: movie ? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv : title: Fear and Loathing in Las Vegas year: 1998 - screenSize: 720p + screen_size: 720p format: HD-DVD - audioCodec: DTS - videoCodec: h264 - releaseGroup: ESiR + audio_codec: DTS + video_codec: h264 + container: mkv + release_group: ESiR ? Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi : title: El Dia de la Bestia year: 1995 format: DVD language: spanish - videoCodec: DivX - releaseGroup: Artik[SEDG] + video_codec: DivX + release_group: Artik[SEDG] + container: avi ? Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv : title: Dark City year: 1998 format: BluRay - screenSize: 720p - audioCodec: DTS - videoCodec: h264 - releaseGroup: CHD + screen_size: 720p + audio_codec: DTS + video_codec: h264 + release_group: CHD ? 
Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv : title: Sin City year: 2005 format: BluRay - screenSize: 720p - videoCodec: h264 - audioCodec: AC3 - releaseGroup: SEPTiC - + screen_size: 720p + video_codec: h264 + audio_codec: AC3 + release_group: SEPTiC ? Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi : title: Borat year: 2006 - other: PROPER + proper_count: 2 format: DVD other: [ R5, Proper ] - videoCodec: XviD - releaseGroup: PUKKA - + video_codec: XviD + release_group: PUKKA ? "[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" : title: Le Prestige format: DVD - videoCodec: h264 - videoProfile: HP - audioCodec: AAC - audioProfile: HE + video_codec: h264 + video_profile: HP + audio_codec: AAC + audio_profile: HE language: [ french, english ] - subtitleLanguage: [ french, english ] - releaseGroup: XCT + subtitle_language: [ french, english ] + release_group: Chaps ? Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi : title: Battle Royale year: 2000 - edition: special edition - cdNumber: 1 - cdNumberTotal: 2 + edition: Special Edition + cd: 1 + cd_count: 2 format: DVD - videoCodec: XviD - releaseGroup: ZeaL + video_codec: XviD + release_group: ZeaL ? Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.avi : title: Brazil edition: Criterion Edition year: 1985 - cdNumber: 2 + cd: 2 ? Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv : title: Persepolis year: 2007 - videoCodec: h264 - audioCodec: AAC + video_codec: h264 + audio_codec: AAC language: [ French, English ] - subtitleLanguage: [ French, English ] - releaseGroup: XCT + subtitle_language: [ French, English ] + release_group: Ind ? Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv : title: Toy Story year: 1995 format: HDTV - screenSize: 720p + screen_size: 720p language: [ english, spanish ] ? 
Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi @@ -93,84 +95,85 @@ year: 1999 format: DVD language: [ english, spanish ] - videoCodec: XviD - audioCodec: AC3 + video_codec: XviD + audio_codec: AC3 ? Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.avi : title: Wild Zero year: 2000 - videoCodec: DivX - releaseGroup: EPiC + video_codec: DivX + release_group: EPiC ? movies/Baraka_Edition_Collector.avi : title: Baraka - edition: collector edition + edition: Collector Edition ? Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director's.Cut).CD1.DVDRip.XviD.AC3-WAF.avi : title: Blade Runner year: 1982 - edition: Director's Cut - cdNumber: 1 + edition: Director's cut + cd: 1 format: DVD - videoCodec: XviD - audioCodec: AC3 - releaseGroup: WAF + video_codec: XviD + audio_codec: AC3 + release_group: WAF ? movies/American.The.Bill.Hicks.Story.2009.DVDRip.XviD-EPiSODE.[UsaBit.com]/UsaBit.com_esd-americanbh.avi : title: American The Bill Hicks Story year: 2009 format: DVD - videoCodec: XviD - releaseGroup: EPiSODE + video_codec: XviD + release_group: EPiSODE website: UsaBit.com ? movies/Charlie.And.Boots.DVDRip.XviD-TheWretched/wthd-cab.avi : title: Charlie And Boots format: DVD - videoCodec: XviD - releaseGroup: TheWretched + video_codec: XviD + release_group: TheWretched ? movies/Steig Larsson Millenium Trilogy (2009) BRrip 720 AAC x264/(1)The Girl With The Dragon Tattoo (2009) BRrip 720 AAC x264.mkv : title: The Girl With The Dragon Tattoo - filmSeries: Steig Larsson Millenium Trilogy - filmNumber: 1 + #film_title: Steig Larsson Millenium Trilogy + #film: 1 year: 2009 format: BluRay - audioCodec: AAC - videoCodec: h264 - screenSize: 720p + audio_codec: AAC + video_codec: h264 + screen_size: 720p ? 
movies/Greenberg.REPACK.LiMiTED.DVDRip.XviD-ARROW/arw-repack-greenberg.dvdrip.xvid.avi : title: Greenberg format: DVD - videoCodec: XviD - releaseGroup: ARROW + video_codec: XviD + release_group: ARROW other: ['Proper', 'Limited'] + proper_count: 1 ? Movies/Fr - Paris 2054, Renaissance (2005) - De Christian Volckman - (Film Divx Science Fiction Fantastique Thriller Policier N&B).avi : title: Paris 2054, Renaissance year: 2005 language: french - videoCodec: DivX + video_codec: DivX ? Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi : title: Avida year: 2006 language: french format: DVD - videoCodec: XviD - releaseGroup: PROD + video_codec: XviD + release_group: PROD ? Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi : title: Alice in Wonderland format: DVD - videoCodec: XviD - releaseGroup: DiAMOND + video_codec: XviD + release_group: DiAMOND ? Movies/Ne.Le.Dis.A.Personne.Fr 2 cd/personnea_mp.avi : title: Ne Le Dis A Personne language: french - cdNumberTotal: 2 + cd_count: 2 ? Movies/Bunker Palace Hôtel (Enki Bilal) (1989)/Enki Bilal - Bunker Palace Hotel (Fr Vhs Rip).avi : title: Bunker Palace Hôtel @@ -182,33 +185,33 @@ : title: "21" year: 2008 format: DVD - videoCodec: h264 - audioCodec: AC3 - releaseGroup: FtS + video_codec: h264 + audio_codec: AC3 + release_group: FtS website: sharethefiles.com ? Movies/9 (2009)/9.2009.Blu-ray.DTS.720p.x264.HDBRiSe.[sharethefiles.com].mkv : title: "9" year: 2009 format: BluRay - audioCodec: DTS - screenSize: 720p - videoCodec: h264 - releaseGroup: HDBRiSe + audio_codec: DTS + screen_size: 720p + video_codec: h264 + release_group: HDBRiSe website: sharethefiles.com ? Movies/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam/Mamma.Mia.2008.DVDRip.AC3.XviD-CrazyTeam.avi : title: Mamma Mia year: 2008 format: DVD - audioCodec: AC3 - videoCodec: XviD - releaseGroup: CrazyTeam + audio_codec: AC3 + video_codec: XviD + release_group: CrazyTeam ? Movies/M.A.S.H. 
(1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm -: title: M.A.S.H. +: title: MASH year: 1970 - videoCodec: DivX + video_codec: DivX format: DVD ? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv @@ -216,10 +219,10 @@ year: 1991 date: 2008-03-09 format: BluRay - screenSize: 720p - audioCodec: AC3 - videoCodec: h264 - releaseGroup: HiS@SiLUHD + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD language: english website: sharethefiles.com @@ -229,10 +232,10 @@ year: 1991 date: 2008-03-09 format: BluRay - screenSize: 720p - audioCodec: AC3 - videoCodec: h264 - releaseGroup: HiS@SiLUHD + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD language: english website: sharethefiles.com @@ -240,79 +243,78 @@ : title: Ratatouille format: DVD -? Movies/001 __ A classer/Fantomas se déchaine - Louis de Funès.avi -: title: Fantomas se déchaine +# Removing this one because 001 is guessed as an episode number. +# ? Movies/001 __ A classer/Fantomas se déchaine - Louis de Funès.avi +# : title: Fantomas se déchaine ? Movies/Comme une Image (2004)/Comme.Une.Image.FRENCH.DVDRiP.XViD-NTK.par-www.divx-overnet.com.avi : title: Comme une Image year: 2004 language: french format: DVD - videoCodec: XviD - releaseGroup: NTK + video_codec: XviD + release_group: NTK website: www.divx-overnet.com ? Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-â„¢.[sharethefiles.com].mkv : title: Fantastic Mr Fox year: 2009 format: DVD - videoCodec: h264 - audioCodec: AAC - audioProfile: LC - audioChannels: "5.1" + video_codec: h264 + audio_codec: AAC + audio_profile: LC + audio_channels: "5.1" language: [ french, english ] - subtitleLanguage: [ french, english ] + subtitle_language: [ french, english ] website: sharethefiles.com ? 
Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi : title: Somewhere year: 2010 format: DVD - videoCodec: XviD - releaseGroup: iLG + video_codec: XviD + release_group: iLG ? Movies/Moon_(2009).mkv : title: Moon year: 2009 -? Movies/Moon_(2009)-x01.mkv -: title: Moon - year: 2009 - bonusNumber: 1 - ? Movies/Moon_(2009)-x02-Making_Of.mkv : title: Moon year: 2009 - bonusNumber: 2 - bonusTitle: Making Of + bonus: 2 + bonus_title: Making Of ? movies/James_Bond-f17-Goldeneye.mkv : title: Goldeneye - filmSeries: James Bond - filmNumber: 17 + film_title: James Bond + film: 17 + ? /movies/James_Bond-f21-Casino_Royale.mkv : title: Casino Royale - filmSeries: James Bond - filmNumber: 21 + film_title: James Bond + film: 21 ? /movies/James_Bond-f21-Casino_Royale-x01-Becoming_Bond.mkv : title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 1 - bonusTitle: Becoming Bond + film_title: James Bond + film: 21 + bonus: 1 + bonus_title: Becoming Bond ? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv : title: Casino Royale - filmSeries: James Bond - filmNumber: 21 - bonusNumber: 2 - bonusTitle: Stunts + film_title: James Bond + film: 21 + bonus: 2 + bonus_title: Stunts ? OSS_117--Cairo,_Nest_of_Spies.mkv -: title: OSS 117--Cairo, Nest of Spies +: title: OSS 117 +# TODO: Implement subTitle for movies. +? The Godfather Part 3.mkv ? The Godfather Part III.mkv : title: The Godfather part: 3 @@ -324,50 +326,52 @@ ? The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 : title: The Insider year: 1999 - bonusNumber: 2 - bonusTitle: 60 Minutes Interview-1996 + bonus: 2 + bonus_title: 60 Minutes Interview-1996 ? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv : title: Rush Beyond The Lighted Stage - bonusNumber: 9 - bonusTitle: Between Sun and Moon-2002 Hartford + bonus: 9 + bonus_title: Between Sun and Moon + year: 2002 ? 
/public/uTorrent/Downloads Finished/Movies/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX/Indiana.Jones.and.the.Temple.of.Doom.1984.HDTV.720p.x264.AC3.5.1-REDµX.mkv : title: Indiana Jones and the Temple of Doom year: 1984 format: HDTV - screenSize: 720p - videoCodec: h264 - audioCodec: AC3 - audioChannels: "5.1" - releaseGroup: REDµX + screen_size: 720p + video_codec: h264 + audio_codec: AC3 + audio_channels: "5.1" + release_group: REDµX ? The.Director’s.Notebook.2006.Blu-Ray.x264.DXVA.720p.AC3-de[42].mkv : title: The Director’s Notebook year: 2006 format: BluRay - videoCodec: h264 - videoApi: DXVA - screenSize: 720p - audioCodec: AC3 - releaseGroup: de[42] + video_codec: h264 + video_api: DXVA + screen_size: 720p + audio_codec: AC3 + release_group: de[42] + ? Movies/Cosmopolis.2012.LiMiTED.720p.BluRay.x264-AN0NYM0US[bb]/ano-cosmo.720p.mkv : title: Cosmopolis year: 2012 - screenSize: 720p - videoCodec: h264 - releaseGroup: AN0NYM0US[bb] + screen_size: 720p + video_codec: h264 + release_group: AN0NYM0US[bb] format: BluRay - other: LIMITED + other: Limited -? movies/La Science des ReÌ‚ves (2006)/La.Science.Des.Reves.FRENCH.DVDRip.XviD-MP-AceBot.avi +? movies/La Science des Rêves (2006)/La.Science.Des.Reves.FRENCH.DVDRip.XviD-MP-AceBot.avi : title: La Science des Rêves year: 2006 format: DVD - videoCodec: XviD - videoProfile: MP - releaseGroup: AceBot + video_codec: XviD + video_profile: MP + release_group: AceBot language: French ? The_Italian_Job.mkv @@ -376,76 +380,76 @@ ? The.Rum.Diary.2011.1080p.BluRay.DTS.x264.D-Z0N3.mkv : title: The Rum Diary year: 2011 - screenSize: 1080p + screen_size: 1080p format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: D-Z0N3 + video_codec: h264 + audio_codec: DTS + release_group: D-Z0N3 ? 
Life.Of.Pi.2012.1080p.BluRay.DTS.x264.D-Z0N3.mkv : title: Life Of Pi year: 2012 - screenSize: 1080p + screen_size: 1080p format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: D-Z0N3 + video_codec: h264 + audio_codec: DTS + release_group: D-Z0N3 ? The.Kings.Speech.2010.1080p.BluRay.DTS.x264.D Z0N3.mkv : title: The Kings Speech year: 2010 - screenSize: 1080p + screen_size: 1080p format: BluRay - audioCodec: DTS - videoCodec: h264 - releaseGroup: D Z0N3 + audio_codec: DTS + video_codec: h264 + release_group: D Z0N3 ? Street.Kings.2008.BluRay.1080p.DTS.x264.dxva EuReKA.mkv : title: Street Kings year: 2008 format: BluRay - screenSize: 1080p - audioCodec: DTS - videoCodec: h264 - videoApi: DXVA - releaseGroup: EuReKa + screen_size: 1080p + audio_codec: DTS + video_codec: h264 + video_api: DXVA + release_group: EuReKA ? 2001.A.Space.Odyssey.1968.HDDVD.1080p.DTS.x264.dxva EuReKA.mkv : title: 2001 A Space Odyssey year: 1968 format: HD-DVD - screenSize: 1080p - audioCodec: DTS - videoCodec: h264 - videoApi: DXVA - releaseGroup: EuReKa + screen_size: 1080p + audio_codec: DTS + video_codec: h264 + video_api: DXVA + release_group: EuReKA ? 2012.2009.720p.BluRay.x264.DTS WiKi.mkv : title: "2012" year: 2009 - screenSize: 720p + screen_size: 720p format: BluRay - videoCodec: h264 - audioCodec: DTS - releaseGroup: WiKi + video_codec: h264 + audio_codec: DTS + release_group: WiKi ? /share/Download/movie/Dead Man Down (2013) BRRiP XViD DD5_1 Custom NLSubs =-_lt Q_o_Q gt-=_/XD607ebb-BRc59935-5155473f-1c5f49/XD607ebb-BRc59935-5155473f-1c5f49.avi : title: Dead Man Down year: 2013 format: BluRay - videoCodec: XviD - audioChannels: "5.1" - audioCodec: DolbyDigital - idNumber: XD607ebb-BRc59935-5155473f-1c5f49 + video_codec: XviD + audio_channels: "5.1" + audio_codec: DolbyDigital + uuid: XD607ebb-BRc59935-5155473f-1c5f49 ? 
Pacific.Rim.3D.2013.COMPLETE.BLURAY-PCH.avi : title: Pacific Rim year: 2013 format: BluRay other: - - complete + - Complete - 3D - releaseGroup: PCH + release_group: PCH ? Immersion.French.2011.STV.READNFO.QC.FRENCH.ENGLISH.NTSC.DVDR.nfo : title: Immersion French @@ -454,64 +458,69 @@ - French - English format: DVD + other: NTSC ? Immersion.French.2011.STV.READNFO.QC.FRENCH.NTSC.DVDR.nfo : title: Immersion French year: 2011 language: French format: DVD + other: NTSC ? Immersion.French.2011.STV.READNFO.QC.NTSC.DVDR.nfo -: title: Immersion French +: title: Immersion + language: French year: 2011 format: DVD + other: NTSC ? French.Immersion.2011.STV.READNFO.QC.ENGLISH.NTSC.DVDR.nfo : title: French Immersion year: 2011 language: ENGLISH format: DVD + other: NTSC ? Howl's_Moving_Castle_(2004)_[720p,HDTV,x264,DTS]-FlexGet.avi -: videoCodec: h264 +: video_codec: h264 format: HDTV title: Howl's Moving Castle - screenSize: 720p + screen_size: 720p year: 2004 - audioCodec: DTS - releaseGroup: FlexGet + audio_codec: DTS + release_group: FlexGet ? Pirates de langkasuka.2008.FRENCH.1920X1080.h264.AVC.AsiaRa.mkv -: screenSize: 1080p +: screen_size: 1080p year: 2008 language: French - videoCodec: h264 + video_codec: h264 title: Pirates de langkasuka - releaseGroup: AsiaRa + release_group: AsiaRa ? Masala (2013) Telugu Movie HD DVDScr XviD - Exclusive.avi : year: 2013 - videoCodec: XviD + video_codec: XviD title: Masala format: HD-DVD - other: screener + other: Screener language: Telugu - releaseGroup: Exclusive + release_group: Exclusive ? Django Unchained 2012 DVDSCR X264 AAC-P2P.nfo : year: 2012 - other: screener - videoCodec: h264 + other: Screener + video_codec: h264 title: Django Unchained - audioCodec: AAC + audio_codec: AAC format: DVD - releaseGroup: P2P + release_group: P2P ? 
Ejecutiva.En.Apuros(2009).BLURAY.SCR.Xvid.Spanish.LanzamientosD.nfo : year: 2009 - other: screener + other: Screener format: BluRay - videoCodec: XviD + video_codec: XviD language: Spanish title: Ejecutiva En Apuros @@ -521,26 +530,26 @@ language: - Multiple languages - German - videoCodec: h264 - releaseGroup: EXQUiSiTE - screenSize: 1080p + video_codec: h264 + release_group: EXQUiSiTE + screen_size: 1080p ? Rocky 1976 French SubForced BRRip x264 AC3-FUNKY.mkv : title: Rocky year: 1976 - subtitleLanguage: French + subtitle_language: French format: BluRay - videoCodec: h264 - audioCodec: AC3 - releaseGroup: FUNKY + video_codec: h264 + audio_codec: AC3 + release_group: FUNKY ? REDLINE (BD 1080p H264 10bit FLAC) [3xR].mkv : title: REDLINE format: BluRay - videoCodec: h264 - videoProfile: 10bit - audioCodec: Flac - screenSize: 1080p + video_codec: h264 + video_profile: 10bit + audio_codec: FLAC + screen_size: 1080p ? The.Lizzie.McGuire.Movie.(2003).HR.DVDRiP.avi : title: The Lizzie McGuire Movie @@ -550,205 +559,279 @@ ? Hua.Mulan.BRRIP.MP4.x264.720p-HR.avi : title: Hua Mulan - videoCodec: h264 + video_codec: h264 format: BluRay - screenSize: 720p + screen_size: 720p other: HR ? Dr.Seuss.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 -: videoCodec: XviD +: video_codec: XviD title: Dr Seuss The Lorax format: DVD other: LiNE year: 2012 - audioCodec: AC3 - audioProfile: HQ - releaseGroup: Hive-CM8 - + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 ? "Star Wars: Episode IV - A New Hope (2004) Special Edition.MKV" -: title: Star Wars Episode IV +: title: "Star Wars: Episode IV" + alternative_title: A New Hope year: 2004 edition: Special Edition - + ? Dr.LiNE.The.Lorax.2012.DVDRip.LiNE.XviD.AC3.HQ.Hive-CM8.mp4 -: videoCodec: XviD +: video_codec: XviD title: Dr LiNE The Lorax format: DVD other: LiNE year: 2012 - audioCodec: AC3 - audioProfile: HQ - releaseGroup: Hive-CM8 + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 + +? 
Dr.LiNE.The.Lorax.2012.DVDRip.XviD.AC3.HQ.Hive-CM8.mp4 +: video_codec: XviD + title: Dr LiNE The Lorax + format: DVD + year: 2012 + audio_codec: AC3 + audio_profile: HQ + release_group: Hive-CM8 ? Perfect Child-2007-TRUEFRENCH-TVRip.Xvid-h@mster.avi -: releaseGroup: h@mster +: release_group: h@mster title: Perfect Child - videoCodec: XviD + video_codec: XviD language: French format: TV year: 2007 - + ? entre.ciel.et.terre.(1994).dvdrip.h264.aac-psypeon.avi -: audioCodec: AAC +: audio_codec: AAC format: DVD - releaseGroup: psypeon + release_group: psypeon title: entre ciel et terre - videoCodec: h264 + video_codec: h264 year: 1994 - + ? Yves.Saint.Laurent.2013.FRENCH.DVDSCR.MD.XviD-ViVARiUM.avi : format: DVD language: French - other: Screener - releaseGroup: ViVARiUM + other: + - MD + - Screener + release_group: ViVARiUM title: Yves Saint Laurent - videoCodec: XviD + video_codec: XviD year: 2013 - + ? Echec et Mort - Hard to Kill - Steven Seagal Multi 1080p BluRay x264 CCATS.avi : format: BluRay language: Multiple languages - releaseGroup: CCATS - screenSize: 1080p + release_group: CCATS + screen_size: 1080p title: Echec et Mort - videoCodec: h264 + alternative_title: + - Hard to Kill + - Steven Seagal + video_codec: h264 ? Paparazzi - Timsit/Lindon (MKV 1080p tvripHD) : options: -n title: Paparazzi - screenSize: 1080p + alternative_title: + - Timsit + - Lindon + screen_size: 1080p + container: MKV format: HDTV - + ? some.movie.720p.bluray.x264-mind -: options: -n - title: some movie - screenSize: 720p - videoCodec: h264 - releaseGroup: mind +: title: some movie + screen_size: 720p + video_codec: h264 + release_group: mind format: BluRay - + ? Dr LiNE The Lorax 720p h264 BluRay -: options: -n - title: Dr LiNE The Lorax - screenSize: 720p - videoCodec: h264 +: title: Dr LiNE The Lorax + screen_size: 720p + video_codec: h264 format: BluRay -? BeatdownFrenchDVDRip.mkv -: options: -c - title: Beatdown - language: French - format: DVD +#TODO: Camelcase implementation +#? 
BeatdownFrenchDVDRip.mkv +#: options: -c +# title: Beatdown +# language: French +# format: DVD + +#? YvesSaintLaurent2013FrenchDVDScrXvid.avi +#: options: -c +# format: DVD +# language: French +# other: Screener +# title: Yves saint laurent +# video_codec: XviD +# year: 2013 -? YvesSaintLaurent2013FrenchDVDScrXvid.avi -: options: -c - format: DVD - language: French - other: Screener - title: Yves saint laurent - videoCodec: XviD - year: 2013 ? Elle.s.en.va.720p.mkv -: screenSize: 720p +: screen_size: 720p title: Elle s en va ? FooBar.7.PDTV-FlexGet -: options: -n - format: DVB - releaseGroup: FlexGet +: format: DVB + release_group: FlexGet title: FooBar 7 ? h265 - HEVC Riddick Unrated Director Cut French 1080p DTS.mkv -: audioCodec: DTS +: audio_codec: DTS edition: Director's cut language: fr - screenSize: 1080p - title: Riddick Unrated - videoCodec: h265 + screen_size: 1080p + title: Riddick + other: Unrated + video_codec: h265 ? "[h265 - HEVC] Riddick Unrated Director Cut French [1080p DTS].mkv" -: audioCodec: DTS +: audio_codec: DTS edition: Director's cut language: fr - screenSize: 1080p - title: Riddick Unrated - videoCodec: h265 + screen_size: 1080p + title: Riddick + other: Unrated + video_codec: h265 ? Barbecue-2014-French-mHD-1080p -: options: -n - language: fr +: language: fr other: mHD - screenSize: 1080p + screen_size: 1080p title: Barbecue year: 2014 ? Underworld Quadrilogie VO+VFF+VFQ 1080p HDlight.x264~Tonyk~Monde Infernal -: options: -n - language: - - fr - - vo - other: HDLight - screenSize: 1080p +: language: fr + other: + - HDLight + - OV + screen_size: 1080p title: Underworld Quadrilogie - videoCodec: h264 + video_codec: h264 ? A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ -: options: -n - format: DVD +: format: DVD language: mul - releaseGroup: KZ + release_group: KZ title: A Bout Portant ? "Mise à Sac (Alain Cavalier, 1967) [Vhs.Rip.Vff]" -: options: -n - format: VHS +: format: VHS language: fr title: "Mise à Sac" year: 1967 ? 
A Bout Portant (The Killers).PAL.Multi.DVD-R-KZ -: options: -n - format: DVD +: format: DVD + other: PAL language: mul - releaseGroup: KZ + release_group: KZ title: A Bout Portant ? Youth.In.Revolt.(Be.Bad).2009.MULTI.1080p.LAME3*92-MEDIOZZ -: options: -n - audioCodec: MP3 +: audio_codec: MP3 language: mul - releaseGroup: MEDIOZZ - screenSize: 1080p + release_group: MEDIOZZ + screen_size: 1080p title: Youth In Revolt year: 2009 ? La Defense Lincoln (The Lincoln Lawyer) 2011 [DVDRIP][Vostfr] -: options: -n - format: DVD - subtitleLanguage: fr +: format: DVD + subtitle_language: fr title: La Defense Lincoln year: 2011 ? '[h265 - HEVC] Fight Club French 1080p DTS.' -: options: -n - audioCodec: DTS +: audio_codec: DTS language: fr - screenSize: 1080p + screen_size: 1080p title: Fight Club - videoCodec: h265 + video_codec: h265 ? Love Gourou (Mike Myers) - FR -: options: -n - language: fr +: language: fr title: Love Gourou ? '[h265 - hevc] transformers 2 1080p french ac3 6ch.' -: options: -n - audioChannels: '5.1' - audioCodec: AC3 +: audio_channels: '5.1' + audio_codec: AC3 language: fr - screenSize: 1080p + screen_size: 1080p title: transformers 2 - videoCodec: h265 + video_codec: h265 + +? 1.Angry.Man.1957.mkv +: title: 1 Angry Man + year: 1957 + +? 12.Angry.Men.1957.mkv +: title: 12 Angry Men + year: 1957 + +? 123.Angry.Men.1957.mkv +: title: 123 Angry Men + year: 1957 + +? "Looney Tunes 1444x866 Porky's Last Stand.mkv" +: screen_size: 1444x866 + title: Looney Tunes + +? Das.Appartement.German.AC3D.DL.720p.BluRay.x264-TVP +: audio_codec: AC3 + format: BluRay + language: mul + release_group: TVP + screen_size: 720p + title: Das Appartement German + type: movie + video_codec: h264 + +? Das.Appartement.GERMAN.AC3D.DL.720p.BluRay.x264-TVP +: audio_codec: AC3 + format: BluRay + language: + - de + - mul + release_group: TVP + screen_size: 720p + title: Das Appartement + video_codec: h264 + +? 
Hyena.Road.2015.German.1080p.DL.DTSHD.Bluray.x264-pmHD +: audio_codec: DTS + audio_profile: HD + format: BluRay + language: + - de + - mul + release_group: pmHD + screen_size: 1080p + title: Hyena Road + type: movie + video_codec: h264 + year: 2015 + +? Hyena.Road.2015.German.Ep.Title.1080p.DL.DTSHD.Bluray.x264-pmHD +: audio_codec: DTS + audio_profile: HD + episode_title: German Ep Title + format: BluRay + language: mul + release_group: pmHD + screen_size: 1080p + title: Hyena Road + type: movie + video_codec: h264 + year: 2015 diff --git a/libs/guessit/test/opensubtitles_languages_2012_05_09.txt b/libs/guessit/test/opensubtitles_languages_2012_05_09.txt deleted file mode 100644 index 4a08d9b5..00000000 --- a/libs/guessit/test/opensubtitles_languages_2012_05_09.txt +++ /dev/null @@ -1,473 +0,0 @@ -IdSubLanguage ISO639 LanguageName UploadEnabled WebEnabled -aar aa Afar, afar 0 0 -abk ab Abkhazian 0 0 -ace Achinese 0 0 -ach Acoli 0 0 -ada Adangme 0 0 -ady adyghé 0 0 -afa Afro-Asiatic (Other) 0 0 -afh Afrihili 0 0 -afr af Afrikaans 0 0 -ain Ainu 0 0 -aka ak Akan 0 0 -akk Akkadian 0 0 -alb sq Albanian 1 1 -ale Aleut 0 0 -alg Algonquian languages 0 0 -alt Southern Altai 0 0 -amh am Amharic 0 0 -ang English, Old (ca.450-1100) 0 0 -apa Apache languages 0 0 -ara ar Arabic 1 1 -arc Aramaic 0 0 -arg an Aragonese 0 0 -arm hy Armenian 1 0 -arn Araucanian 0 0 -arp Arapaho 0 0 -art Artificial (Other) 0 0 -arw Arawak 0 0 -asm as Assamese 0 0 -ast Asturian, Bable 0 0 -ath Athapascan languages 0 0 -aus Australian languages 0 0 -ava av Avaric 0 0 -ave ae Avestan 0 0 -awa Awadhi 0 0 -aym ay Aymara 0 0 -aze az Azerbaijani 0 0 -bad Banda 0 0 -bai Bamileke languages 0 0 -bak ba Bashkir 0 0 -bal Baluchi 0 0 -bam bm Bambara 0 0 -ban Balinese 0 0 -baq eu Basque 1 1 -bas Basa 0 0 -bat Baltic (Other) 0 0 -bej Beja 0 0 -bel be Belarusian 0 0 -bem Bemba 0 0 -ben bn Bengali 1 0 -ber Berber (Other) 0 0 -bho Bhojpuri 0 0 -bih bh Bihari 0 0 -bik Bikol 0 0 -bin Bini 0 0 -bis bi Bislama 0 0 -bla 
Siksika 0 0 -bnt Bantu (Other) 0 0 -bos bs Bosnian 1 0 -bra Braj 0 0 -bre br Breton 1 0 -btk Batak (Indonesia) 0 0 -bua Buriat 0 0 -bug Buginese 0 0 -bul bg Bulgarian 1 1 -bur my Burmese 0 0 -byn Blin 0 0 -cad Caddo 0 0 -cai Central American Indian (Other) 0 0 -car Carib 0 0 -cat ca Catalan 1 1 -cau Caucasian (Other) 0 0 -ceb Cebuano 0 0 -cel Celtic (Other) 0 0 -cha ch Chamorro 0 0 -chb Chibcha 0 0 -che ce Chechen 0 0 -chg Chagatai 0 0 -chi zh Chinese 1 1 -chk Chuukese 0 0 -chm Mari 0 0 -chn Chinook jargon 0 0 -cho Choctaw 0 0 -chp Chipewyan 0 0 -chr Cherokee 0 0 -chu cu Church Slavic 0 0 -chv cv Chuvash 0 0 -chy Cheyenne 0 0 -cmc Chamic languages 0 0 -cop Coptic 0 0 -cor kw Cornish 0 0 -cos co Corsican 0 0 -cpe Creoles and pidgins, English based (Other) 0 0 -cpf Creoles and pidgins, French-based (Other) 0 0 -cpp Creoles and pidgins, Portuguese-based (Other) 0 0 -cre cr Cree 0 0 -crh Crimean Tatar 0 0 -crp Creoles and pidgins (Other) 0 0 -csb Kashubian 0 0 -cus Cushitic (Other)' couchitiques, autres langues 0 0 -cze cs Czech 1 1 -dak Dakota 0 0 -dan da Danish 1 1 -dar Dargwa 0 0 -day Dayak 0 0 -del Delaware 0 0 -den Slave (Athapascan) 0 0 -dgr Dogrib 0 0 -din Dinka 0 0 -div dv Divehi 0 0 -doi Dogri 0 0 -dra Dravidian (Other) 0 0 -dua Duala 0 0 -dum Dutch, Middle (ca.1050-1350) 0 0 -dut nl Dutch 1 1 -dyu Dyula 0 0 -dzo dz Dzongkha 0 0 -efi Efik 0 0 -egy Egyptian (Ancient) 0 0 -eka Ekajuk 0 0 -elx Elamite 0 0 -eng en English 1 1 -enm English, Middle (1100-1500) 0 0 -epo eo Esperanto 1 0 -est et Estonian 1 1 -ewe ee Ewe 0 0 -ewo Ewondo 0 0 -fan Fang 0 0 -fao fo Faroese 0 0 -fat Fanti 0 0 -fij fj Fijian 0 0 -fil Filipino 0 0 -fin fi Finnish 1 1 -fiu Finno-Ugrian (Other) 0 0 -fon Fon 0 0 -fre fr French 1 1 -frm French, Middle (ca.1400-1600) 0 0 -fro French, Old (842-ca.1400) 0 0 -fry fy Frisian 0 0 -ful ff Fulah 0 0 -fur Friulian 0 0 -gaa Ga 0 0 -gay Gayo 0 0 -gba Gbaya 0 0 -gem Germanic (Other) 0 0 -geo ka Georgian 1 1 -ger de German 1 1 -gez Geez 0 0 -gil Gilbertese 0 
0 -gla gd Gaelic 0 0 -gle ga Irish 0 0 -glg gl Galician 1 1 -glv gv Manx 0 0 -gmh German, Middle High (ca.1050-1500) 0 0 -goh German, Old High (ca.750-1050) 0 0 -gon Gondi 0 0 -gor Gorontalo 0 0 -got Gothic 0 0 -grb Grebo 0 0 -grc Greek, Ancient (to 1453) 0 0 -ell el Greek 1 1 -grn gn Guarani 0 0 -guj gu Gujarati 0 0 -gwi Gwich´in 0 0 -hai Haida 0 0 -hat ht Haitian 0 0 -hau ha Hausa 0 0 -haw Hawaiian 0 0 -heb he Hebrew 1 1 -her hz Herero 0 0 -hil Hiligaynon 0 0 -him Himachali 0 0 -hin hi Hindi 1 1 -hit Hittite 0 0 -hmn Hmong 0 0 -hmo ho Hiri Motu 0 0 -hrv hr Croatian 1 1 -hun hu Hungarian 1 1 -hup Hupa 0 0 -iba Iban 0 0 -ibo ig Igbo 0 0 -ice is Icelandic 1 1 -ido io Ido 0 0 -iii ii Sichuan Yi 0 0 -ijo Ijo 0 0 -iku iu Inuktitut 0 0 -ile ie Interlingue 0 0 -ilo Iloko 0 0 -ina ia Interlingua (International Auxiliary Language Asso 0 0 -inc Indic (Other) 0 0 -ind id Indonesian 1 1 -ine Indo-European (Other) 0 0 -inh Ingush 0 0 -ipk ik Inupiaq 0 0 -ira Iranian (Other) 0 0 -iro Iroquoian languages 0 0 -ita it Italian 1 1 -jav jv Javanese 0 0 -jpn ja Japanese 1 1 -jpr Judeo-Persian 0 0 -jrb Judeo-Arabic 0 0 -kaa Kara-Kalpak 0 0 -kab Kabyle 0 0 -kac Kachin 0 0 -kal kl Kalaallisut 0 0 -kam Kamba 0 0 -kan kn Kannada 0 0 -kar Karen 0 0 -kas ks Kashmiri 0 0 -kau kr Kanuri 0 0 -kaw Kawi 0 0 -kaz kk Kazakh 1 0 -kbd Kabardian 0 0 -kha Khasi 0 0 -khi Khoisan (Other) 0 0 -khm km Khmer 1 1 -kho Khotanese 0 0 -kik ki Kikuyu 0 0 -kin rw Kinyarwanda 0 0 -kir ky Kirghiz 0 0 -kmb Kimbundu 0 0 -kok Konkani 0 0 -kom kv Komi 0 0 -kon kg Kongo 0 0 -kor ko Korean 1 1 -kos Kosraean 0 0 -kpe Kpelle 0 0 -krc Karachay-Balkar 0 0 -kro Kru 0 0 -kru Kurukh 0 0 -kua kj Kuanyama 0 0 -kum Kumyk 0 0 -kur ku Kurdish 0 0 -kut Kutenai 0 0 -lad Ladino 0 0 -lah Lahnda 0 0 -lam Lamba 0 0 -lao lo Lao 0 0 -lat la Latin 0 0 -lav lv Latvian 1 0 -lez Lezghian 0 0 -lim li Limburgan 0 0 -lin ln Lingala 0 0 -lit lt Lithuanian 1 0 -lol Mongo 0 0 -loz Lozi 0 0 -ltz lb Luxembourgish 1 0 -lua Luba-Lulua 0 0 -lub lu 
Luba-Katanga 0 0 -lug lg Ganda 0 0 -lui Luiseno 0 0 -lun Lunda 0 0 -luo Luo (Kenya and Tanzania) 0 0 -lus lushai 0 0 -mac mk Macedonian 1 1 -mad Madurese 0 0 -mag Magahi 0 0 -mah mh Marshallese 0 0 -mai Maithili 0 0 -mak Makasar 0 0 -mal ml Malayalam 0 0 -man Mandingo 0 0 -mao mi Maori 0 0 -map Austronesian (Other) 0 0 -mar mr Marathi 0 0 -mas Masai 0 0 -may ms Malay 1 1 -mdf Moksha 0 0 -mdr Mandar 0 0 -men Mende 0 0 -mga Irish, Middle (900-1200) 0 0 -mic Mi'kmaq 0 0 -min Minangkabau 0 0 -mis Miscellaneous languages 0 0 -mkh Mon-Khmer (Other) 0 0 -mlg mg Malagasy 0 0 -mlt mt Maltese 0 0 -mnc Manchu 0 0 -mni Manipuri 0 0 -mno Manobo languages 0 0 -moh Mohawk 0 0 -mol mo Moldavian 0 0 -mon mn Mongolian 1 0 -mos Mossi 0 0 -mwl Mirandese 0 0 -mul Multiple languages 0 0 -mun Munda languages 0 0 -mus Creek 0 0 -mwr Marwari 0 0 -myn Mayan languages 0 0 -myv Erzya 0 0 -nah Nahuatl 0 0 -nai North American Indian 0 0 -nap Neapolitan 0 0 -nau na Nauru 0 0 -nav nv Navajo 0 0 -nbl nr Ndebele, South 0 0 -nde nd Ndebele, North 0 0 -ndo ng Ndonga 0 0 -nds Low German 0 0 -nep ne Nepali 0 0 -new Nepal Bhasa 0 0 -nia Nias 0 0 -nic Niger-Kordofanian (Other) 0 0 -niu Niuean 0 0 -nno nn Norwegian Nynorsk 0 0 -nob nb Norwegian Bokmal 0 0 -nog Nogai 0 0 -non Norse, Old 0 0 -nor no Norwegian 1 1 -nso Northern Sotho 0 0 -nub Nubian languages 0 0 -nwc Classical Newari 0 0 -nya ny Chichewa 0 0 -nym Nyamwezi 0 0 -nyn Nyankole 0 0 -nyo Nyoro 0 0 -nzi Nzima 0 0 -oci oc Occitan 1 1 -oji oj Ojibwa 0 0 -ori or Oriya 0 0 -orm om Oromo 0 0 -osa Osage 0 0 -oss os Ossetian 0 0 -ota Turkish, Ottoman (1500-1928) 0 0 -oto Otomian languages 0 0 -paa Papuan (Other) 0 0 -pag Pangasinan 0 0 -pal Pahlavi 0 0 -pam Pampanga 0 0 -pan pa Panjabi 0 0 -pap Papiamento 0 0 -pau Palauan 0 0 -peo Persian, Old (ca.600-400 B.C.) 
0 0 -per fa Persian 1 1 -phi Philippine (Other) 0 0 -phn Phoenician 0 0 -pli pi Pali 0 0 -pol pl Polish 1 1 -pon Pohnpeian 0 0 -por pt Portuguese 1 1 -pra Prakrit languages 0 0 -pro Provençal, Old (to 1500) 0 0 -pus ps Pushto 0 0 -que qu Quechua 0 0 -raj Rajasthani 0 0 -rap Rapanui 0 0 -rar Rarotongan 0 0 -roa Romance (Other) 0 0 -roh rm Raeto-Romance 0 0 -rom Romany 0 0 -run rn Rundi 0 0 -rup Aromanian 0 0 -rus ru Russian 1 1 -sad Sandawe 0 0 -sag sg Sango 0 0 -sah Yakut 0 0 -sai South American Indian (Other) 0 0 -sal Salishan languages 0 0 -sam Samaritan Aramaic 0 0 -san sa Sanskrit 0 0 -sas Sasak 0 0 -sat Santali 0 0 -scc sr Serbian 1 1 -scn Sicilian 0 0 -sco Scots 0 0 -sel Selkup 0 0 -sem Semitic (Other) 0 0 -sga Irish, Old (to 900) 0 0 -sgn Sign Languages 0 0 -shn Shan 0 0 -sid Sidamo 0 0 -sin si Sinhalese 1 1 -sio Siouan languages 0 0 -sit Sino-Tibetan (Other) 0 0 -sla Slavic (Other) 0 0 -slo sk Slovak 1 1 -slv sl Slovenian 1 1 -sma Southern Sami 0 0 -sme se Northern Sami 0 0 -smi Sami languages (Other) 0 0 -smj Lule Sami 0 0 -smn Inari Sami 0 0 -smo sm Samoan 0 0 -sms Skolt Sami 0 0 -sna sn Shona 0 0 -snd sd Sindhi 0 0 -snk Soninke 0 0 -sog Sogdian 0 0 -som so Somali 0 0 -son Songhai 0 0 -sot st Sotho, Southern 0 0 -spa es Spanish 1 1 -srd sc Sardinian 0 0 -srr Serer 0 0 -ssa Nilo-Saharan (Other) 0 0 -ssw ss Swati 0 0 -suk Sukuma 0 0 -sun su Sundanese 0 0 -sus Susu 0 0 -sux Sumerian 0 0 -swa sw Swahili 1 0 -swe sv Swedish 1 1 -syr Syriac 1 0 -tah ty Tahitian 0 0 -tai Tai (Other) 0 0 -tam ta Tamil 0 0 -tat tt Tatar 0 0 -tel te Telugu 0 0 -tem Timne 0 0 -ter Tereno 0 0 -tet Tetum 0 0 -tgk tg Tajik 0 0 -tgl tl Tagalog 1 1 -tha th Thai 1 1 -tib bo Tibetan 0 0 -tig Tigre 0 0 -tir ti Tigrinya 0 0 -tiv Tiv 0 0 -tkl Tokelau 0 0 -tlh Klingon 0 0 -tli Tlingit 0 0 -tmh Tamashek 0 0 -tog Tonga (Nyasa) 0 0 -ton to Tonga (Tonga Islands) 0 0 -tpi Tok Pisin 0 0 -tsi Tsimshian 0 0 -tsn tn Tswana 0 0 -tso ts Tsonga 0 0 -tuk tk Turkmen 0 0 -tum Tumbuka 0 0 -tup Tupi languages 
0 0 -tur tr Turkish 1 1 -tut Altaic (Other) 0 0 -tvl Tuvalu 0 0 -twi tw Twi 0 0 -tyv Tuvinian 0 0 -udm Udmurt 0 0 -uga Ugaritic 0 0 -uig ug Uighur 0 0 -ukr uk Ukrainian 1 1 -umb Umbundu 0 0 -und Undetermined 0 0 -urd ur Urdu 1 0 -uzb uz Uzbek 0 0 -vai Vai 0 0 -ven ve Venda 0 0 -vie vi Vietnamese 1 1 -vol vo Volapük 0 0 -vot Votic 0 0 -wak Wakashan languages 0 0 -wal Walamo 0 0 -war Waray 0 0 -was Washo 0 0 -wel cy Welsh 0 0 -wen Sorbian languages 0 0 -wln wa Walloon 0 0 -wol wo Wolof 0 0 -xal Kalmyk 0 0 -xho xh Xhosa 0 0 -yao Yao 0 0 -yap Yapese 0 0 -yid yi Yiddish 0 0 -yor yo Yoruba 0 0 -ypk Yupik languages 0 0 -zap Zapotec 0 0 -zen Zenaga 0 0 -zha za Zhuang 0 0 -znd Zande 0 0 -zul zu Zulu 0 0 -zun Zuni 0 0 -rum ro Romanian 1 1 -pob pb Brazilian 1 1 diff --git a/libs/guessit/test/rules/__init__.py b/libs/guessit/test/rules/__init__.py new file mode 100644 index 00000000..e5be370e --- /dev/null +++ b/libs/guessit/test/rules/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name diff --git a/libs/guessit/test/rules/audio_codec.yml b/libs/guessit/test/rules/audio_codec.yml new file mode 100644 index 00000000..b744d7bf --- /dev/null +++ b/libs/guessit/test/rules/audio_codec.yml @@ -0,0 +1,83 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. + + +? +MP3 +? +lame +? +lame3.12 +? +lame3.100 +: audio_codec: MP3 + +? +DolbyDigital +? +DD +? +Dolby Digital +: audio_codec: DolbyDigital + +? +DolbyAtmos +? +Dolby Atmos +? +Atmos +? -Atmosphere +: audio_codec: DolbyAtmos + +? +AAC +: audio_codec: AAC + +? +AC3 +: audio_codec: AC3 + +? +Flac +: audio_codec: FLAC + +? +DTS +: audio_codec: DTS + +? +True-HD +? +trueHD +: audio_codec: TrueHD + +? +DTS-HD +: audio_codec: DTS + audio_profile: HD + +? +DTS-HDma +: audio_codec: DTS + audio_profile: HDMA + +? 
+AC3-hq +: audio_codec: AC3 + audio_profile: HQ + +? +AAC-HE +: audio_codec: AAC + audio_profile: HE + +? +AAC-LC +: audio_codec: AAC + audio_profile: LC + +? +AAC2.0 +: audio_codec: AAC + audio_channels: '2.0' + +? +7.1 +? +7ch +? +8ch +: audio_channels: '7.1' + +? +5.1 +? +5ch +? +6ch +: audio_channels: '5.1' + +? +2ch +? +2.0 +? +stereo +: audio_channels: '2.0' + +? +1ch +? +mono +: audio_channels: '1.0' + +? DD5.1 +: audio_codec: DolbyDigital + audio_channels: '5.1' diff --git a/libs/guessit/test/rules/bonus.yml b/libs/guessit/test/rules/bonus.yml new file mode 100644 index 00000000..6ef6f5b2 --- /dev/null +++ b/libs/guessit/test/rules/bonus.yml @@ -0,0 +1,9 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Movie Title-x01-Other Title.mkv +? Movie Title-x01-Other Title +? directory/Movie Title-x01-Other Title/file.mkv +: title: Movie Title + bonus_title: Other Title + bonus: 1 + diff --git a/libs/guessit/test/rules/cds.yml b/libs/guessit/test/rules/cds.yml new file mode 100644 index 00000000..cc63765e --- /dev/null +++ b/libs/guessit/test/rules/cds.yml @@ -0,0 +1,10 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? cd 1of3 +: cd: 1 + cd_count: 3 + +? Some.Title-DVDRIP-x264-CDP +: cd: !!null + release_group: CDP + video_codec: h264 diff --git a/libs/guessit/test/rules/country.yml b/libs/guessit/test/rules/country.yml new file mode 100644 index 00000000..f2da1b20 --- /dev/null +++ b/libs/guessit/test/rules/country.yml @@ -0,0 +1,10 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. +? Us.this.is.title +? this.is.title.US +: country: US + title: this is title + +? 
This.is.us.title +: title: This is us title + diff --git a/libs/guessit/test/rules/date.yml b/libs/guessit/test/rules/date.yml new file mode 100644 index 00000000..d7379f03 --- /dev/null +++ b/libs/guessit/test/rules/date.yml @@ -0,0 +1,50 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +09.03.08 +? +09.03.2008 +? +2008.03.09 +: date: 2008-03-09 + +? +31.01.15 +? +31.01.2015 +? +15.01.31 +? +2015.01.31 +: date: 2015-01-31 + +? +01.02.03 +: date: 2003-02-01 + +? +01.02.03 +: options: --date-year-first + date: 2001-02-03 + +? +01.02.03 +: options: --date-day-first + date: 2003-02-01 + +? 1919 +? 2030 +: !!map {} + +? 2029 +: year: 2029 + +? (1920) +: year: 1920 + +? 2012 +: year: 2012 + +? 2011 2013 (2012) (2015) # first marked year is guessed. +: title: "2011 2013" + year: 2012 + +? 2012 2009 S01E02 2015 # If no year is marked, the second one is guessed. +: title: "2012" + year: 2009 + episode_title: "2015" + +? Something 2 mar 2013) +: title: Something + date: 2013-03-02 + type: episode diff --git a/libs/guessit/test/rules/edition.yml b/libs/guessit/test/rules/edition.yml new file mode 100644 index 00000000..bc35b85e --- /dev/null +++ b/libs/guessit/test/rules/edition.yml @@ -0,0 +1,25 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Director's cut +? Edition Director's cut +: edition: Director's cut + +? Collector +? Collector Edition +? Edition Collector +: edition: Collector Edition + +? Special Edition +? Edition Special +? -Special +: edition: Special Edition + +? Criterion Edition +? Edition Criterion +? -Criterion +: edition: Criterion Edition + +? Deluxe +? Deluxe Edition +? 
Edition Deluxe +: edition: Deluxe Edition diff --git a/libs/guessit/test/rules/episodes.yml b/libs/guessit/test/rules/episodes.yml new file mode 100644 index 00000000..a75e6702 --- /dev/null +++ b/libs/guessit/test/rules/episodes.yml @@ -0,0 +1,247 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. +? +2x5 +? +2X5 +? +02x05 +? +2X05 +? +02x5 +? S02E05 +? s02e05 +? s02e5 +? s2e05 +? s02ep05 +? s2EP5 +? -s03e05 +? -s02e06 +? -3x05 +? -2x06 +: season: 2 + episode: 5 + +? "+0102" +? "+102" +: season: 1 + episode: 2 + +? "0102 S03E04" +? "S03E04 102" +: season: 3 + episode: 4 + +? +serie Saison 2 other +? +serie Season 2 other +? +serie Saisons 2 other +? +serie Seasons 2 other +? +serie Serie 2 other +? +serie Series 2 other +? +serie Season Two other +? +serie Season II other +: season: 2 + +? Some Series.S02E01.Episode.title.mkv +? Some Series/Season 02/E01-Episode title.mkv +? Some Series/Season 02/Some Series-E01-Episode title.mkv +? Some Dummy Directory/Season 02/Some Series-E01-Episode title.mkv +? -Some Dummy Directory/Season 02/E01-Episode title.mkv +? Some Series/Unsafe Season 02/Some Series-E01-Episode title.mkv +? -Some Series/Unsafe Season 02/E01-Episode title.mkv +? Some Series/Season 02/E01-Episode title.mkv +? Some Series/ Season 02/E01-Episode title.mkv +? Some Dummy Directory/Some Series S02/E01-Episode title.mkv +? Some Dummy Directory/S02 Some Series/E01-Episode title.mkv +: title: Some Series + episode_title: Episode title + season: 2 + episode: 1 + +? Some Series.S02E01.mkv +? Some Series/Season 02/E01.mkv +? Some Series/Season 02/Some Series-E01.mkv +? Some Dummy Directory/Season 02/Some Series-E01.mkv +? -Some Dummy Directory/Season 02/E01.mkv +? Some Series/Unsafe Season 02/Some Series-E01.mkv +? -Some Series/Unsafe Season 02/E01.mkv +? Some Series/Season 02/E01.mkv +? Some Series/ Season 02/E01.mkv +? 
Some Dummy Directory/Some Series S02/E01-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA.mkv +: title: Some Series + season: 2 + episode: 1 + +? Some Series S03E01E02 +: title: Some Series + season: 3 + episode: [1, 2] + +? Some Series S01S02S03 +? Some Series S01-02-03 +? Some Series S01 S02 S03 +? Some Series S01 02 03 +: title: Some Series + season: [1, 2, 3] + +? Some Series E01E02E03 +? Some Series E01-02-03 +? Some Series E01-03 +? Some Series E01 E02 E03 +? Some Series E01 02 03 +: title: Some Series + episode: [1, 2, 3] + +? Some Series E01E02E04 +? Some Series E01 E02 E04 +? Some Series E01 02 04 +: title: Some Series + episode: [1, 2, 4] + +? Some Series E01-02-04 +? Some Series E01-04 +? Some Series E01-04 +: title: Some Series + episode: [1, 2, 3, 4] + +? Some Series E01-02-E04 +: title: Some Series + episode: [1, 2, 3, 4] + +? Episode 3 +? -Episode III +: episode: 3 + +? Episode 3 +? Episode III +: options: -t episode + episode: 3 + +? -A very special movie +: episode_details: Special + +? A very special episode +: options: -t episode + episode_details: Special + +? 12 Monkeys\Season 01\Episode 05\12 Monkeys - S01E05 - The Night Room.mkv +: container: mkv + title: 12 Monkeys + episode: 5 + season: 1 + +? S03E02.X.1080p +: episode: 2 + screen_size: 1080p + season: 3 + +? Something 1 x 2-FlexGet +: options: -t episode + title: Something + season: 1 + episode: 2 + episode_title: FlexGet + +? Show.Name.-.Season.1.to.3.-.Mp4.1080p +? Show.Name.-.Season.1~3.-.Mp4.1080p +? Show.Name.-.Saison.1.a.3.-.Mp4.1080p +: container: MP4 + screen_size: 1080p + season: + - 1 + - 2 + - 3 + title: Show Name + +? Show.Name.Season.1.3&5.HDTV.XviD-GoodGroup[SomeTrash] +? Show.Name.Season.1.3 and 5.HDTV.XviD-GoodGroup[SomeTrash] +: format: HDTV + release_group: GoodGroup[SomeTrash] + season: + - 1 + - 3 + - 5 + title: Show Name + type: episode + video_codec: XviD + +? Show.Name.Season.1.2.3-5.HDTV.XviD-GoodGroup[SomeTrash] +? 
Show.Name.Season.1.2.3~5.HDTV.XviD-GoodGroup[SomeTrash] +? Show.Name.Season.1.2.3 to 5.HDTV.XviD-GoodGroup[SomeTrash] +: format: HDTV + release_group: GoodGroup[SomeTrash] + season: + - 1 + - 2 + - 3 + - 4 + - 5 + title: Show Name + type: episode + video_codec: XviD + +? The.Get.Down.S01EP01.FRENCH.720p.WEBRIP.XVID-STR +: episode: 1 + format: WEBRip + language: fr + release_group: STR + screen_size: 720p + season: 1 + title: The Get Down + type: episode + video_codec: XviD + +? My.Name.Is.Earl.S01E01-S01E21.SWE-SUB +: episode: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 + season: 1 + subtitle_language: sv + title: My Name Is Earl + type: episode + +? Show.Name.Season.4.Episodes.1-12 +: episode: + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + season: 4 + title: Show Name + type: episode + +? show name s01.to.s04 +: season: + - 1 + - 2 + - 3 + - 4 + title: show name + type: episode + +? epi +: options: -t episode + title: epi \ No newline at end of file diff --git a/libs/guessit/test/rules/film.yml b/libs/guessit/test/rules/film.yml new file mode 100644 index 00000000..1f774331 --- /dev/null +++ b/libs/guessit/test/rules/film.yml @@ -0,0 +1,9 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Film Title-f01-Series Title.mkv +? Film Title-f01-Series Title +? directory/Film Title-f01-Series Title/file.mkv +: title: Series Title + film_title: Film Title + film: 1 + diff --git a/libs/guessit/test/rules/format.yml b/libs/guessit/test/rules/format.yml new file mode 100644 index 00000000..cf3dea92 --- /dev/null +++ b/libs/guessit/test/rules/format.yml @@ -0,0 +1,112 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +VHS +? +VHSRip +? +VHS-Rip +? +VhS_rip +? +VHS.RIP +? 
-VHSAnythingElse +? -SomeVHS stuff +? -VH +? -VHx +? -VHxRip +: format: VHS + +? +Cam +? +CamRip +? +CaM Rip +? +Cam_Rip +? +cam.rip +: format: Cam + +? +Telesync +? +TS +? +HD TS +? -Hd.Ts # ts file extension +? -HD.TS # ts file extension +? +Hd-Ts +: format: Telesync + +? +Workprint +? +workPrint +? +WorkPrint +? +WP +? -Work Print +: format: Workprint + +? +Telecine +? +teleCine +? +TC +? -Tele Cine +: format: Telecine + +? +PPV +? +ppv-rip +: format: PPV + +? -TV +? +SDTV +? +SDTVRIP +? +Rip sd tv +? +TvRip +? +Rip TV +: format: TV + +? +DVB +? +DVB-Rip +? +DvBRiP +? +pdTV +? +Pd Tv +: format: DVB + +? +DVD +? +DVD-RIP +? +video ts +? +DVDR +? +DVD 9 +? +dvd 5 +? -dvd ts +: format: DVD + -format: ts + +? +HDTV +? +tv rip hd +? +HDtv Rip +? +HdRip +: format: HDTV + +? +VOD +? +VodRip +? +vod rip +: format: VOD + +? +webrip +? +Web Rip +: format: WEBRip + +? +webdl +? +Web DL +? +webHD +? +WEB hd +? +web +: format: WEB-DL + +? +HDDVD +? +hd dvd +? +hdDvdRip +: format: HD-DVD + +? +BluRay +? +BluRay rip +? +BD +? +BR +? +BDRip +? +BR rip +? +BD5 +? +BD9 +? +BD25 +? +bd50 +: format: BluRay + +? XVID.NTSC.DVDR.nfo +: format: DVD diff --git a/libs/guessit/test/rules/language.yml b/libs/guessit/test/rules/language.yml new file mode 100644 index 00000000..51bbd8da --- /dev/null +++ b/libs/guessit/test/rules/language.yml @@ -0,0 +1,39 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +English +? .ENG. +: language: English + +? +French +: language: French + +? +SubFrench +? +SubFr +? +STFr +? ST.FR +: subtitle_language: French + +? +ENG.-.sub.FR +? ENG.-.FR Sub +? +ENG.-.SubFR +? +ENG.-.FRSUB +? +ENG.-.FRSUBS +? +ENG.-.FR-SUBS +: language: English + subtitle_language: French + +? "{Fr-Eng}.St{Fr-Eng}" +? "Le.Prestige[x264.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv" +: language: [French, English] + subtitle_language: [French, English] + +? +ENG.-.sub.SWE +? ENG.-.SWE Sub +? +ENG.-.SubSWE +? 
+ENG.-.SWESUB +? +ENG.-.sub.SV +? ENG.-.SV Sub +? +ENG.-.SubSV +? +ENG.-.SVSUB +: language: English + subtitle_language: Swedish \ No newline at end of file diff --git a/libs/guessit/test/rules/other.yml b/libs/guessit/test/rules/other.yml new file mode 100644 index 00000000..cce8cbd0 --- /dev/null +++ b/libs/guessit/test/rules/other.yml @@ -0,0 +1,137 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +DVDSCR +? +DVDScreener +? +DVD-SCR +? +DVD Screener +? +DVD AnythingElse Screener +? -DVD AnythingElse SCR +: other: Screener + +? +AudioFix +? +AudioFixed +? +Audio Fix +? +Audio Fixed +: other: AudioFix + +? +SyncFix +? +SyncFixed +? +Sync Fix +? +Sync Fixed +: other: SyncFix + +? +DualAudio +? +Dual Audio +: other: DualAudio + +? +ws +? +WideScreen +? +Wide Screen +: other: WideScreen + +? +NF +? +Netflix +: other: Netflix + +# Fix and Real must be surround by others properties to be matched. +? DVD.Real.XViD +? DVD.fix.XViD +? -DVD.Real +? -DVD.Fix +? -Real.XViD +? -Fix.XViD +: other: Proper + proper_count: 1 + +? -DVD.BlablaBla.Fix.Blablabla.XVID +? -DVD.BlablaBla.Fix.XVID +? -DVD.Fix.Blablabla.XVID +: other: Proper + proper_count: 1 + + +? DVD.Real.PROPER.REPACK +: other: Proper + proper_count: 3 + + +? Proper +? +Repack +? +Rerip +: other: Proper + proper_count: 1 + +? XViD.Fansub +: other: Fansub + +? XViD.Fastsub +: other: Fastsub + +? +Season Complete +? -Complete +: other: Complete + +? R5 +? RC +: other: R5 + +? PreAir +? Pre Air +: other: Preair + +? Screener +: other: Screener + +? Remux +: other: Remux + +? 3D +: other: 3D + +? HD +: other: HD + +? mHD # ?? +: other: mHD + +? HDLight +: other: HDLight + +? HQ +: other: HQ + +? ddc +: other: DDC + +? hr +: other: HR + +? PAL +: other: PAL + +? SECAM +: other: SECAM + +? NTSC +: other: NTSC + +? CC +: other: CC + +? LD +: other: LD + +? MD +: other: MD + +? -The complete movie +: other: Complete + +? 
+The complete movie +: title: The complete movie + +? +AC3-HQ +: audio_profile: HQ + +? Other-HQ +: other: HQ diff --git a/libs/guessit/test/rules/part.yml b/libs/guessit/test/rules/part.yml new file mode 100644 index 00000000..72f3d98a --- /dev/null +++ b/libs/guessit/test/rules/part.yml @@ -0,0 +1,18 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Filename Part 3.mkv +? Filename Part III.mkv +? Filename Part Three.mkv +? Filename Part Trois.mkv +: title: Filename + part: 3 + +? Part 3 +? Part III +? Part Three +? Part Trois +? Part3 +: part: 3 + +? -Something.Apt.1 +: part: 1 \ No newline at end of file diff --git a/libs/guessit/test/rules/processors.yml b/libs/guessit/test/rules/processors.yml new file mode 100644 index 00000000..ee906b2c --- /dev/null +++ b/libs/guessit/test/rules/processors.yml @@ -0,0 +1,8 @@ +# Multiple input strings having same expected results can be chained. +# Use $ marker to check inputs that should not match results. + +# Prefer information for last path. +? Some movie (2000)/Some movie (2001).mkv +? Some movie (2001)/Some movie.mkv +: year: 2001 + container: mkv diff --git a/libs/guessit/test/rules/release_group.yml b/libs/guessit/test/rules/release_group.yml new file mode 100644 index 00000000..d048ff71 --- /dev/null +++ b/libs/guessit/test/rules/release_group.yml @@ -0,0 +1,41 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Some.Title.XViD-ReleaseGroup +? Some.Title.XViD-ReleaseGroup.mkv +: release_group: ReleaseGroup + +? Some.Title.XViD-by.Artik[SEDG].avi +: release_group: Artik[SEDG] + +? "[ABC] Some.Title.avi" +? some/folder/[ABC]Some.Title.avi +: release_group: ABC + +? "[ABC] Some.Title.XViD-GRP.avi" +? some/folder/[ABC]Some.Title.XViD-GRP.avi +: release_group: GRP + +? "[ABC] Some.Title.S01E02.avi" +? 
some/folder/[ABC]Some.Title.S01E02.avi +: release_group: ABC + +? Some.Title.XViD-S2E02.NoReleaseGroup.avi +: release_group: !!null + +? Test.S01E01-FooBar-Group +: options: -G group -G xxxx + episode: 1 + episode_title: FooBar + release_group: Group + season: 1 + title: Test + type: episode + +? Test.S01E01-FooBar-Group +: options: -G re:gr.?up -G xxxx + episode: 1 + episode_title: FooBar + release_group: Group + season: 1 + title: Test + type: episode diff --git a/libs/guessit/test/rules/screen_size.yml b/libs/guessit/test/rules/screen_size.yml new file mode 100644 index 00000000..1145dd7e --- /dev/null +++ b/libs/guessit/test/rules/screen_size.yml @@ -0,0 +1,69 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +360p +? +360px +? +360i +? "+360" +? +500x360 +: screen_size: 360p + +? +368p +? +368px +? +368i +? "+368" +? +500x368 +: screen_size: 368p + +? +480p +? +480px +? +480i +? "+480" +? +500x480 +: screen_size: 480p + +? +576p +? +576px +? +576i +? "+576" +? +500x576 +: screen_size: 576p + +? +720p +? +720px +? 720hd +? 720pHD +? +720i +? "+720" +? +500x720 +: screen_size: 720p + +? +900p +? +900px +? +900i +? "+900" +? +500x900 +: screen_size: 900p + +? +1080p +? +1080px +? +1080hd +? +1080pHD +? -1080i +? "+1080" +? +500x1080 +: screen_size: 1080p + +? +1080i +? -1080p +: screen_size: 1080i + +? +2160p +? +2160px +? +2160i +? "+2160" +? +4096x2160 +: screen_size: 4K + +? Test.File.720hd.bluray +? Test.File.720p50 +: screen_size: 720p diff --git a/libs/guessit/test/rules/title.yml b/libs/guessit/test/rules/title.yml new file mode 100644 index 00000000..fffaf8a2 --- /dev/null +++ b/libs/guessit/test/rules/title.yml @@ -0,0 +1,32 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? Title Only +? -Title XViD 720p Only +? sub/folder/Title Only +? -sub/folder/Title XViD 720p Only +? 
Title Only.mkv +? Title Only.avi +: title: Title Only + +? Title Only/title_only.mkv +: title: Title Only + +? title_only.mkv +: title: title only + +? Some Title/some.title.mkv +? some.title/Some.Title.mkv +: title: Some Title + +? SOME TITLE/Some.title.mkv +? Some.title/SOME TITLE.mkv +: title: Some title + +? some title/Some.title.mkv +? Some.title/some title.mkv +: title: Some title + +? Some other title/Some.Other.title.mkv +? Some.Other title/Some other title.mkv +: title: Some Other title + diff --git a/libs/guessit/test/rules/video_codec.yml b/libs/guessit/test/rules/video_codec.yml new file mode 100644 index 00000000..d195eaaf --- /dev/null +++ b/libs/guessit/test/rules/video_codec.yml @@ -0,0 +1,54 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? rv10 +? rv13 +? RV20 +? Rv30 +? rv40 +? -xrv40 +: video_codec: Real + +? mpeg2 +? MPEG2 +? -mpeg +? -mpeg 2 # Not sure if we should ignore this one ... +? -xmpeg2 +? -mpeg2x +: video_codec: Mpeg2 + +? DivX +? -div X +? divx +? dvdivx +? DVDivX +: video_codec: DivX + +? XviD +? xvid +? -x vid +: video_codec: XviD + +? h264 +? x264 +? h.264 +? x.264 +? mpeg4-AVC +? -MPEG-4 +? -mpeg4 +? -mpeg +? -h 265 +? -x265 +: video_codec: h264 + +? h265 +? x265 +? h.265 +? x.265 +? hevc +? -h 264 +? -x264 +: video_codec: h265 + +? h265-HP +: video_codec: h265 + video_profile: HP \ No newline at end of file diff --git a/libs/guessit/test/rules/website.yml b/libs/guessit/test/rules/website.yml new file mode 100644 index 00000000..11d434d2 --- /dev/null +++ b/libs/guessit/test/rules/website.yml @@ -0,0 +1,23 @@ +# Multiple input strings having same expected results can be chained. +# Use - marker to check inputs that should not match results. +? +tvu.org.ru +? -tvu.unsafe.ru +: website: tvu.org.ru + +? +www.nimp.na +? -somewww.nimp.na +? -www.nimp.nawouak +? -nimp.na +: website: www.nimp.na + +? +wawa.co.uk +? 
-wawa.uk +: website: wawa.co.uk + +? -Dark.Net.S01E06.720p.HDTV.x264-BATV + -Dark.Net.2015.720p.HDTV.x264-BATV +: website: Dark.Net + +? Dark.Net.S01E06.720p.HDTV.x264-BATV + Dark.Net.2015.720p.HDTV.x264-BATV +: title: Dark Net diff --git a/libs/guessit/test/test-input-file.txt b/libs/guessit/test/test-input-file.txt new file mode 100644 index 00000000..656bc931 --- /dev/null +++ b/libs/guessit/test/test-input-file.txt @@ -0,0 +1,2 @@ +Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv +SecondFile.avi \ No newline at end of file diff --git a/libs/guessit/test/test_api.py b/libs/guessit/test/test_api.py index 92cef41b..ca33df04 100644 --- a/libs/guessit/test/test_api.py +++ b/libs/guessit/test/test_api.py @@ -1,54 +1,63 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name, pointless-string-statement -from __future__ import absolute_import, division, print_function, unicode_literals +import os -from guessit.test.guessittest import * +import pytest +import six + +from ..api import guessit, properties, GuessitException + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -class TestApi(TestGuessit): - def test_api(self): - movie_path = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv' +def test_default(): + ret = guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret - movie_info = guessit.guess_movie_info(movie_path) - video_info = guessit.guess_video_info(movie_path) - episode_info = guessit.guess_episode_info(movie_path) - file_info = guessit.guess_file_info(movie_path) - self.assertEqual(guessit.guess_file_info(movie_path, type='movie'), movie_info) - self.assertEqual(guessit.guess_file_info(movie_path, type='video'), video_info) - self.assertEqual(guessit.guess_file_info(movie_path, type='episode'), episode_info) +def test_forced_unicode(): + ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret and isinstance(ret['title'], six.text_type) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'movie'}), movie_info) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'video'}), video_info) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}), episode_info) - self.assertEqual(guessit.guess_file_info(movie_path, options={'type': 'episode'}, type='movie'), episode_info) # kwargs priority other options +def test_forced_binary(): + ret = guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret and isinstance(ret['title'], 
six.binary_type) - movie_path_name_only = 'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD' - file_info_name_only = guessit.guess_file_info(movie_path_name_only, options={"name_only": True}) - self.assertFalse('container' in file_info_name_only) - self.assertTrue('container' in file_info) +def test_unicode_japanese(): + ret = guessit('[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi') + assert ret and 'title' in ret -suite = allTests(TestApi) -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) +def test_unicode_japanese_options(): + ret = guessit("[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": ["阿维达"]}) + assert ret and 'title' in ret and ret['title'] == "阿维达" + + +def test_forced_unicode_japanese_options(): + ret = guessit(u"[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": [u"阿维达"]}) + assert ret and 'title' in ret and ret['title'] == u"阿维达" + +# TODO: This doesn't compile on python 3, but should be tested on python 2. 
+""" +if six.PY2: + def test_forced_binary_japanese_options(): + ret = guessit(b"[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": [b"阿维达"]}) + assert ret and 'title' in ret and ret['title'] == b"阿维达" +""" + + +def test_properties(): + props = properties() + assert 'video_codec' in props.keys() + + +def test_exception(): + with pytest.raises(GuessitException) as excinfo: + guessit(object()) + assert "An internal error has occured in guessit" in str(excinfo.value) + assert "Guessit Exception Report" in str(excinfo.value) + assert "Please report at https://github.com/guessit-io/guessit/issues" in str(excinfo.value) diff --git a/libs/guessit/test/test_api_unicode_literals.py b/libs/guessit/test/test_api_unicode_literals.py new file mode 100644 index 00000000..3347a7d8 --- /dev/null +++ b/libs/guessit/test/test_api_unicode_literals.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name, pointless-string-statement + + +from __future__ import unicode_literals + +import os + +import pytest +import six + +from ..api import guessit, properties, GuessitException + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + +def test_default(): + ret = guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret + + +def test_forced_unicode(): + ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret and isinstance(ret['title'], six.text_type) + + +def test_forced_binary(): + ret = guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + assert ret and 'title' in ret and isinstance(ret['title'], six.binary_type) + + +def test_unicode_japanese(): + ret = guessit('[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi') + assert ret and 'title' in ret + + +def 
test_unicode_japanese_options(): + ret = guessit("[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": ["阿维达"]}) + assert ret and 'title' in ret and ret['title'] == "阿维达" + + +def test_forced_unicode_japanese_options(): + ret = guessit(u"[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": [u"阿维达"]}) + assert ret and 'title' in ret and ret['title'] == u"阿维达" + +# TODO: This doesn't compile on python 3, but should be tested on python 2. +""" +if six.PY2: + def test_forced_binary_japanese_options(): + ret = guessit(b"[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi", options={"expected_title": [b"阿维达"]}) + assert ret and 'title' in ret and ret['title'] == b"阿维达" +""" + + +def test_properties(): + props = properties() + assert 'video_codec' in props.keys() + + +def test_exception(): + with pytest.raises(GuessitException) as excinfo: + guessit(object()) + assert "An internal error has occured in guessit" in str(excinfo.value) + assert "Guessit Exception Report" in str(excinfo.value) + assert "Please report at https://github.com/guessit-io/guessit/issues" in str(excinfo.value) diff --git a/libs/guessit/test/test_autodetect.py b/libs/guessit/test/test_autodetect.py deleted file mode 100644 index 229b491f..00000000 --- a/libs/guessit/test/test_autodetect.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. 
-# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestAutoDetect(TestGuessit): - def testEmpty(self): - result = guessit.guess_file_info('') - self.assertEqual(result, {}) - - result = guessit.guess_file_info('___-__') - self.assertEqual(result, {}) - - result = guessit.guess_file_info('__-.avc') - self.assertEqual(result, {'type': 'unknown', 'extension': 'avc'}) - - def testAutoDetect(self): - self.checkMinimumFieldsCorrect(filename='autodetect.yaml', - remove_type=False) - - -suite = allTests(TestAutoDetect) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_autodetect_all.py b/libs/guessit/test/test_autodetect_all.py deleted file mode 100644 index 033e1571..00000000 --- a/libs/guessit/test/test_autodetect_all.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -IGNORE_EPISODES = [] -IGNORE_MOVIES = [] - - -class TestAutoDetectAll(TestGuessit): - def testAutoMatcher(self): - self.checkMinimumFieldsCorrect(filename='autodetect.yaml', - remove_type=False) - - def testAutoMatcherMovies(self): - self.checkMinimumFieldsCorrect(filename='movies.yaml', - exclude_files=IGNORE_MOVIES) - - def testAutoMatcherEpisodes(self): - self.checkMinimumFieldsCorrect(filename='episodes.yaml', - exclude_files=IGNORE_EPISODES) - - -suite = allTests(TestAutoDetectAll) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_benchmark.py b/libs/guessit/test/test_benchmark.py new file mode 100644 index 00000000..34386e30 --- /dev/null +++ b/libs/guessit/test/test_benchmark.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use,pointless-statement,missing-docstring,invalid-name,line-too-long +import time + +import pytest + +from ..api import guessit + + +def case1(): + return guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv') + + +def case2(): + return guessit('Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-â„¢.[sharethefiles.com].mkv') + + +def case3(): + return guessit('Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi') + + +def case4(): + return guessit('Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv') + + +@pytest.mark.benchmark( + group="Performance Tests", + min_time=1, + max_time=2, + min_rounds=5, + timer=time.time, + disable_gc=True, + warmup=False +) +@pytest.mark.skipif(True, reason="Disabled") +class TestBenchmark(object): + def test_case1(self, benchmark): + ret = benchmark(case1) + assert ret + + def test_case2(self, 
benchmark): + ret = benchmark(case2) + assert ret + + def test_case3(self, benchmark): + ret = benchmark(case3) + assert ret + + def test_case4(self, benchmark): + ret = benchmark(case4) + assert ret diff --git a/libs/guessit/test/test_doctests.py b/libs/guessit/test/test_doctests.py deleted file mode 100644 index 9fedeb0f..00000000 --- a/libs/guessit/test/test_doctests.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * -import guessit -import guessit.hash_ed2k -import unittest -import doctest - - -def load_tests(loader, tests, ignore): - tests.addTests(doctest.DocTestSuite(guessit)) - tests.addTests(doctest.DocTestSuite(guessit.date)) - tests.addTests(doctest.DocTestSuite(guessit.fileutils)) - tests.addTests(doctest.DocTestSuite(guessit.guess)) - tests.addTests(doctest.DocTestSuite(guessit.hash_ed2k)) - tests.addTests(doctest.DocTestSuite(guessit.language)) - tests.addTests(doctest.DocTestSuite(guessit.matchtree)) - tests.addTests(doctest.DocTestSuite(guessit.textutils)) - return tests - -suite = unittest.TestSuite() -load_tests(None, suite, None) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_episode.py b/libs/guessit/test/test_episode.py deleted file mode 100644 index 03abf6b0..00000000 --- a/libs/guessit/test/test_episode.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestEpisode(TestGuessit): - def testEpisodes(self): - self.checkMinimumFieldsCorrect(filetype='episode', - filename='episodes.yaml') - - -suite = allTests(TestEpisode) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_hashes.py b/libs/guessit/test/test_hashes.py deleted file mode 100644 index a8bc763c..00000000 --- a/libs/guessit/test/test_hashes.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestHashes(TestGuessit): - def test_hashes(self): - hashes = ( - ('hash_mpc', '1MB', u'8542ad406c15c8bd'), # TODO: Check if this value is valid - ('hash_ed2k', '1MB', u'ed2k://|file|1MB|1048576|AA3CC5552A9931A76B61A41D306735F7|/'), # TODO: Check if this value is valid - ('hash_md5', '1MB', u'5d8dcbca8d8ac21766f28797d6c3954c'), - ('hash_sha1', '1MB', u'51d2b8f3248d7ee495b7750c8da5aa3b3819de9d'), - ('hash_md5', 'dummy.srt', u'64de6b5893cac24456c46a935ef9c359'), - ('hash_sha1', 'dummy.srt', u'a703fc0fa4518080505809bf562c6fc6f7b3c98c') - ) - - for hash_type, filename, expected_value in hashes: - guess = guess_file_info(file_in_same_dir(__file__, filename), hash_type) - computed_value = guess.get(hash_type) - self.assertEqual(expected_value, guess.get(hash_type), "Invalid %s for %s: %s != %s" % (hash_type, filename, computed_value, expected_value)) - - -suite = allTests(TestHashes) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_language.py b/libs/guessit/test/test_language.py deleted file mode 100644 index 99578fe7..00000000 --- a/libs/guessit/test/test_language.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. 
-# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -import io - - -class TestLanguage(TestGuessit): - - def check_languages(self, languages): - for lang1, lang2 in languages.items(): - self.assertEqual(Language.fromguessit(lang1), - Language.fromguessit(lang2)) - - def test_addic7ed(self): - languages = {'English': 'en', - 'English (US)': 'en-US', - 'English (UK)': 'en-UK', - 'Italian': 'it', - 'Portuguese': 'pt', - 'Portuguese (Brazilian)': 'pt-BR', - 'Romanian': 'ro', - 'Español (Latinoamérica)': 'es-MX', - 'Español (España)': 'es-ES', - 'Spanish (Latin America)': 'es-MX', - 'Español': 'es', - 'Spanish': 'es', - 'Spanish (Spain)': 'es-ES', - 'French': 'fr', - 'Greek': 'el', - 'Arabic': 'ar', - 'German': 'de', - 'Croatian': 'hr', - 'Indonesian': 'id', - 'Hebrew': 'he', - 'Russian': 'ru', - 'Turkish': 'tr', - 'Swedish': 'se', - 'Czech': 'cs', - 'Dutch': 'nl', - 'Hungarian': 'hu', - 'Norwegian': 'no', - 'Polish': 'pl', - 'Persian': 'fa'} - - self.check_languages(languages) - - def test_subswiki(self): - languages = {'English (US)': 'en-US', 'English (UK)': 'en-UK', 'English': 'en', - 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', - 'Español (Latinoamérica)': 'es-MX', 'Español (España)': 'es-ES', - 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} - - self.check_languages(languages) - - def test_tvsubtitles(self): - languages = {'English': 'en', 'Español': 'es', 'French': 'fr', 'German': 'de', - 'Brazilian': 'br', 'Russian': 'ru', 'Ukrainian': 'ua', 'Italian': 'it', - 'Greek': 'gr', 'Arabic': 'ar', 'Hungarian': 'hu', 'Polish': 'pl', - 'Turkish': 'tr', 'Dutch': 'nl', 'Portuguese': 'pt', 'Swedish': 'sv', - 'Danish': 'da', 'Finnish': 'fi', 'Korean': 'ko', 'Chinese': 'cn', - 'Japanese': 'jp', 'Bulgarian': 'bg', 'Czech': 'cz', 'Romanian': 'ro'} - - 
self.check_languages(languages) - - def test_opensubtitles(self): - opensubtitles_langfile = file_in_same_dir(__file__, 'opensubtitles_languages_2012_05_09.txt') - for l in [u(l).strip() for l in io.open(opensubtitles_langfile, encoding='utf-8')][1:]: - idlang, alpha2, _, upload_enabled, web_enabled = l.strip().split('\t') - # do not test languages that are too esoteric / not widely available - if int(upload_enabled) and int(web_enabled): - # check that we recognize the opensubtitles language code correctly - # and that we are able to output this code from a language - self.assertEqual(idlang, Language.fromguessit(idlang).opensubtitles) - if alpha2: - # check we recognize the opensubtitles 2-letter code correctly - self.check_languages({idlang: alpha2}) - - def test_tmdb(self): - # examples from http://api.themoviedb.org/2.1/language-tags - for lang in ['en-US', 'en-CA', 'es-MX', 'fr-PF']: - self.assertEqual(lang, str(Language.fromguessit(lang))) - - def test_subtitulos(self): - languages = {'English (US)': 'en-US', 'English (UK)': 'en-UK', 'English': 'en', - 'French': 'fr', 'Brazilian': 'po', 'Portuguese': 'pt', - 'Español (Latinoamérica)': 'es-MX', 'Español (España)': 'es-ES', - 'Español': 'es', 'Italian': 'it', 'Català': 'ca'} - - self.check_languages(languages) - - def test_thesubdb(self): - languages = {'af': 'af', 'cs': 'cs', 'da': 'da', 'de': 'de', 'en': 'en', 'es': 'es', 'fi': 'fi', - 'fr': 'fr', 'hu': 'hu', 'id': 'id', 'it': 'it', 'la': 'la', 'nl': 'nl', 'no': 'no', - 'oc': 'oc', 'pl': 'pl', 'pt': 'pt', 'ro': 'ro', 'ru': 'ru', 'sl': 'sl', 'sr': 'sr', - 'sv': 'sv', 'tr': 'tr'} - - self.check_languages(languages) - - def test_exceptions(self): - self.assertEqual(Language.fromguessit('br'), Language.fromguessit('pt(br)')) - - self.assertEqual(Language.fromguessit('unknown'), - Language.fromguessit('und')) - - -suite = allTests(TestLanguage) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_main.py 
b/libs/guessit/test/test_main.py index 1140654a..cbdba7aa 100644 --- a/libs/guessit/test/test_main.py +++ b/libs/guessit/test/test_main.py @@ -1,69 +1,72 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2014 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name -from __future__ import absolute_import, division, print_function, unicode_literals +import os -from guessit.test.guessittest import * -from guessit.fileutils import split_path, file_in_same_dir -from guessit.textutils import strip_brackets, str_replace, str_fill -from guessit import PY2 -from guessit import __main__ +import pytest -if PY2: - from StringIO import StringIO -else: - from io import StringIO +from ..__main__ import main + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -class TestMain(TestGuessit): - def setUp(self): - self._stdout = sys.stdout - string_out = StringIO() - sys.stdout = string_out +def test_main_no_args(): + main([]) - def tearDown(self): - sys.stdout = self._stdout - def test_list_properties(self): - __main__.main(["-p"], False) - __main__.main(["-V"], False) +def test_main(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv']) - def test_list_transformers(self): 
- __main__.main(["--transformers"], False) - __main__.main(["-V", "--transformers"], False) - def test_demo(self): - __main__.main(["-d"], False) +def test_main_unicode(): + main(['[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi']) - def test_filename(self): - __main__.main(["A.Movie.2014.avi"], False) - __main__.main(["A.Movie.2014.avi", "A.2nd.Movie.2014.avi"], False) - __main__.main(["-y", "A.Movie.2014.avi"], False) - __main__.main(["-a", "A.Movie.2014.avi"], False) - __main__.main(["-v", "A.Movie.2014.avi"], False) - __main__.main(["-t", "movie", "A.Movie.2014.avi"], False) - __main__.main(["-t", "episode", "A.Serie.S02E06.avi"], False) - __main__.main(["-i", "hash_mpc", file_in_same_dir(__file__, "1MB")], False) - __main__.main(["-i", "hash_md5", file_in_same_dir(__file__, "1MB")], False) -suite = allTests(TestMain) +def test_main_forced_unicode(): + main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv']) -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) + +def test_main_verbose(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose']) + + +def test_main_yaml(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml']) + + +def test_main_json(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json']) + + +def test_main_show_property(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title']) + + +def test_main_advanced(): + main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a']) + + +def test_main_input(): + main(['--input', os.path.join(__location__, 'test-input-file.txt')]) + + +def test_main_properties(): + main(['-p']) + main(['-p', '--json']) + main(['-p', '--yaml']) + + +def test_main_values(): + main(['-V']) + main(['-V', '--json']) + main(['-V', '--yaml']) + + +def test_main_help(): + with 
pytest.raises(SystemExit): + main(['--help']) + + +def test_main_version(): + main(['--version']) diff --git a/libs/guessit/test/test_matchtree.py b/libs/guessit/test/test_matchtree.py deleted file mode 100644 index 8712d78f..00000000 --- a/libs/guessit/test/test_matchtree.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - -from guessit.transfo.guess_release_group import GuessReleaseGroup -from guessit.transfo.guess_properties import GuessProperties -from guessit.matchtree import BaseMatchTree - -keywords = yaml.load(""" - -? Xvid PROPER -: videoCodec: Xvid - other: PROPER - -? 
PROPER-Xvid -: videoCodec: Xvid - other: PROPER - -""") - - -def guess_info(string, options=None): - mtree = MatchTree(string) - GuessReleaseGroup().process(mtree, options) - GuessProperties().process(mtree, options) - return mtree.matched() - - -class TestMatchTree(TestGuessit): - def test_base_tree(self): - t = BaseMatchTree('One Two Three(Three) Four') - t.partition((3, 7, 20)) - leaves = list(t.leaves()) - - self.assertEqual(leaves[0].span, (0, 3)) - - self.assertEqual('One', leaves[0].value) - self.assertEqual(' Two', leaves[1].value) - self.assertEqual(' Three(Three)', leaves[2].value) - self.assertEqual(' Four', leaves[3].value) - - leaves[2].partition((1, 6, 7, 12)) - three_leaves = list(leaves[2].leaves()) - - self.assertEqual('Three', three_leaves[1].value) - self.assertEqual('Three', three_leaves[3].value) - - leaves = list(t.leaves()) - - self.assertEqual(len(leaves), 8) - - self.assertEqual(leaves[5], three_leaves[3]) - - self.assertEqual(t.previous_leaf(leaves[5]), leaves[4]) - self.assertEqual(t.next_leaf(leaves[5]), leaves[6]) - - self.assertEqual(t.next_leaves(leaves[5]), [leaves[6], leaves[7]]) - self.assertEqual(t.previous_leaves(leaves[5]), [leaves[4], leaves[3], leaves[2], leaves[1], leaves[0]]) - - self.assertEqual(t.next_leaf(leaves[7]), None) - self.assertEqual(t.previous_leaf(leaves[0]), None) - - self.assertEqual(t.next_leaves(leaves[7]), []) - self.assertEqual(t.previous_leaves(leaves[0]), []) - - def test_match(self): - self.checkFields(keywords, guess_info) - - -suite = allTests(TestMatchTree) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_movie.py b/libs/guessit/test/test_movie.py deleted file mode 100644 index eecbf49d..00000000 --- a/libs/guessit/test/test_movie.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; 
you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * - - -class TestMovie(TestGuessit): - def testMovies(self): - self.checkMinimumFieldsCorrect(filetype='movie', - filename='movies.yaml') - - -suite = allTests(TestMovie) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_quality.py b/libs/guessit/test/test_quality.py deleted file mode 100644 index 52e21791..00000000 --- a/libs/guessit/test/test_quality.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.quality import best_quality, best_quality_properties -from guessit.containers import QualitiesContainer -from guessit.test.guessittest import * - - -class TestQuality(TestGuessit): - def test_container(self): - container = QualitiesContainer() - - container.register_quality('color', 'red', 10) - container.register_quality('color', 'orange', 20) - container.register_quality('color', 'green', 30) - - container.register_quality('context', 'sun', 100) - container.register_quality('context', 'sea', 200) - container.register_quality('context', 'sex', 300) - - g1 = Guess() - g1['color'] = 'red' - - g2 = Guess() - g2['color'] = 'green' - - g3 = Guess() - g3['color'] = 'orange' - - q3 = container.rate_quality(g3) - self.assertEqual(q3, 20, "ORANGE should be rated 20. Don't ask why!") - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q2 > q1, "GREEN should be greater than RED. Don't ask why!") - - g1['context'] = 'sex' - g2['context'] = 'sun' - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q1 > q2, "SEX should be greater than SUN. Don't ask why!") - - self.assertEqual(container.best_quality(g1, g2), g1, "RED&SEX should be better than GREEN&SUN. Don't ask why!") - - self.assertEqual(container.best_quality_properties(['color'], g1, g2), g2, "GREEN should be better than RED. Don't ask why!") - - self.assertEqual(container.best_quality_properties(['context'], g1, g2), g1, "SEX should be better than SUN. Don't ask why!") - - q1 = container.rate_quality(g1, 'color') - q2 = container.rate_quality(g2, 'color') - - self.assertTrue(q2 > q1, "GREEN should be greater than RED. 
Don't ask why!") - - container.unregister_quality('context', 'sex') - container.unregister_quality('context', 'sun') - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q2 > q1, "GREEN&SUN should be greater than RED&SEX. Don't ask why!") - - g3['context'] = 'sea' - container.unregister_quality('context', 'sea') - - q3 = container.rate_quality(g3, 'context') - self.assertEqual(q3, 0, "Context should be unregistered.") - - container.unregister_quality('color') - q3 = container.rate_quality(g3, 'color') - - self.assertEqual(q3, 0, "Color should be unregistered.") - - container.clear_qualities() - - q1 = container.rate_quality(g1) - q2 = container.rate_quality(g2) - - self.assertTrue(q1 == q2 == 0, "Empty quality container should rate each guess to 0") - - def test_quality_transformers(self): - guess_720p = guessit.guess_file_info("2012.2009.720p.BluRay.x264.DTS WiKi.mkv") - guess_1080p = guessit.guess_file_info("2012.2009.1080p.BluRay.x264.MP3 WiKi.mkv") - - self.assertTrue('audioCodec' in guess_720p, "audioCodec should be present") - self.assertTrue('audioCodec' in guess_1080p, "audioCodec should be present") - self.assertTrue('screenSize' in guess_720p, "screenSize should be present") - self.assertTrue('screenSize' in guess_1080p, "screenSize should be present") - - best_quality_guess = best_quality(guess_720p, guess_1080p) - - self.assertTrue(guess_1080p == best_quality_guess, "1080p+MP3 is not the best global quality") - - best_quality_guess = best_quality_properties(['screenSize'], guess_720p, guess_1080p) - - self.assertTrue(guess_1080p == best_quality_guess, "1080p is not the best screenSize") - - best_quality_guess = best_quality_properties(['audioCodec'], guess_720p, guess_1080p) - - self.assertTrue(guess_720p == best_quality_guess, "DTS is not the best audioCodec") - -suite = allTests(TestQuality) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_utils.py 
b/libs/guessit/test/test_utils.py deleted file mode 100644 index 87eecb98..00000000 --- a/libs/guessit/test/test_utils.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.test.guessittest import * -from guessit.fileutils import split_path -from guessit.textutils import strip_brackets, str_replace, str_fill, from_camel, is_camel,\ - levenshtein, reorder_title -from guessit import PY2 -from guessit.date import search_date, search_year -from datetime import datetime, date, timedelta - - -class TestUtils(TestGuessit): - def test_splitpath(self): - alltests = {False: {'/usr/bin/smewt': ['/', 'usr', 'bin', 'smewt'], - 'relative_path/to/my_folder/': ['relative_path', 'to', 'my_folder'], - '//some/path': ['//', 'some', 'path'], - '//some//path': ['//', 'some', 'path'], - '///some////path': ['///', 'some', 'path'] - - }, - True: {'C:\\Program Files\\Smewt\\smewt.exe': ['C:\\', 'Program Files', 'Smewt', 'smewt.exe'], - 'Documents and Settings\\User\\config': ['Documents and Settings', 'User', 'config'], - 'C:\\Documents and Settings\\User\\config': ['C:\\', 'Documents and Settings', 'User', 'config'], - # 
http://bugs.python.org/issue19945 - '\\\\netdrive\\share': ['\\\\', 'netdrive', 'share'] if PY2 else ['\\\\netdrive\\share'], - '\\\\netdrive\\share\\folder': ['\\\\', 'netdrive', 'share', 'folder'] if PY2 else ['\\\\netdrive\\share\\', 'folder'], - } - } - tests = alltests[sys.platform == 'win32'] - for path, split in tests.items(): - self.assertEqual(split, split_path(path)) - - def test_strip_brackets(self): - allTests = (('', ''), - ('[test]', 'test'), - ('{test2}', 'test2'), - ('(test3)', 'test3'), - ('(test4]', '(test4]'), - ) - - for i, e in allTests: - self.assertEqual(e, strip_brackets(i)) - - def test_levenshtein(self): - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmno"), 0) - self.assertEqual(levenshtein("abcdef ghijk lmnop", "abcdef ghijk lmno"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmn"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnp"), 1) - self.assertEqual(levenshtein("abcdef ghijk lmno", "abcdef ghijk lmnq"), 1) - self.assertEqual(levenshtein("cbcdef ghijk lmno", "abcdef ghijk lmnq"), 2) - self.assertEqual(levenshtein("cbcdef ghihk lmno", "abcdef ghijk lmnq"), 3) - - def test_reorder_title(self): - self.assertEqual(reorder_title("Simpsons, The"), "The Simpsons") - self.assertEqual(reorder_title("Simpsons,The"), "The Simpsons") - self.assertEqual(reorder_title("Simpsons,Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons") - self.assertEqual(reorder_title("Simpsons, Les", articles=('the', 'le', 'la', 'les')), "Les Simpsons") - - def test_camel(self): - self.assertEqual("", from_camel("")) - - self.assertEqual("Hello world", str_replace("Hello World", 6, 'w')) - self.assertEqual("Hello *****", str_fill("Hello World", (6, 11), '*')) - - self.assertTrue("This is camel", from_camel("ThisIsCamel")) - - self.assertEqual('camel case', from_camel('camelCase')) - self.assertEqual('A case', from_camel('ACase')) - self.assertEqual('MiXedCaSe is not camel case', 
from_camel('MiXedCaSe is not camelCase')) - - self.assertEqual("This is camel cased title", from_camel("ThisIsCamelCasedTitle")) - self.assertEqual("This is camel CASED title", from_camel("ThisIsCamelCASEDTitle")) - - self.assertEqual("These are camel CASED title", from_camel("TheseAreCamelCASEDTitle")) - - self.assertEqual("Give a camel case string", from_camel("GiveACamelCaseString")) - - self.assertEqual("Death TO camel case", from_camel("DeathTOCamelCase")) - self.assertEqual("But i like java too:)", from_camel("ButILikeJavaToo:)")) - - self.assertEqual("Beatdown french DVD rip.mkv", from_camel("BeatdownFrenchDVDRip.mkv")) - self.assertEqual("DO NOTHING ON UPPER CASE", from_camel("DO NOTHING ON UPPER CASE")) - - self.assertFalse(is_camel("this_is_not_camel")) - self.assertTrue(is_camel("ThisIsCamel")) - - self.assertEqual("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv", from_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv")) - self.assertFalse(is_camel("Dark.City.(1998).DC.BDRIP.720p.DTS.X264-CHD.mkv")) - - self.assertEqual("A2LiNE", from_camel("A2LiNE")) - - def test_date(self): - self.assertEqual(search_year(' in the year 2000... '), (2000, (13, 17))) - self.assertEqual(search_year(' they arrived in 1492. '), (None, None)) - - today = date.today() - today_year_2 = int(str(today.year)[2:]) - - future = today + timedelta(days=1000) - future_year_2 = int(str(future.year)[2:]) - - past = today - timedelta(days=10000) - past_year_2 = int(str(past.year)[2:]) - - self.assertEqual(search_date(' Something before 2002-04-22 '), (date(2002, 4, 22), (18, 28))) - self.assertEqual(search_date(' 2002-04-22 Something after '), (date(2002, 4, 22), (1, 11))) - - self.assertEqual(search_date(' This happened on 2002-04-22. '), (date(2002, 4, 22), (18, 28))) - self.assertEqual(search_date(' This happened on 22-04-2002. '), (date(2002, 4, 22), (18, 28))) - - self.assertEqual(search_date(' This happened on 13-04-%s. 
' % (today_year_2,)), (date(today.year, 4, 13), (18, 26))) - self.assertEqual(search_date(' This happened on 22-04-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26))) - self.assertEqual(search_date(' This happened on 20-04-%s. ' % (past_year_2)), (date(past.year, 4, 20), (18, 26))) - - self.assertEqual(search_date(' This happened on 13-06-14. ', year_first=True), (date(2013, 6, 14), (18, 26))) - self.assertEqual(search_date(' This happened on 13-05-14. ', year_first=False), (date(2014, 5, 13), (18, 26))) - - self.assertEqual(search_date(' This happened on 04-13-%s. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26))) - self.assertEqual(search_date(' This happened on 04-22-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26))) - self.assertEqual(search_date(' This happened on 04-20-%s. ' % (past_year_2)), (date(past.year, 4, 20), (18, 26))) - - self.assertEqual(search_date(' This happened on 35-12-%s. ' % (today_year_2,)), (None, None)) - self.assertEqual(search_date(' This happened on 37-18-%s. ' % (future_year_2,)), (None, None)) - self.assertEqual(search_date(' This happened on 44-42-%s. ' % (past_year_2)), (None, None)) - - self.assertEqual(search_date(' This happened on %s. ' % (today, )), (today, (18, 28))) - self.assertEqual(search_date(' This happened on %s. ' % (future, )), (future, (18, 28))) - self.assertEqual(search_date(' This happened on %s. ' % (past, )), (past, (18, 28))) - - self.assertEqual(search_date(' released date: 04-03-1901? '), (None, None)) - - self.assertEqual(search_date(' There\'s no date in here. 
'), (None, None)) - - self.assertEqual(search_date(' Something 01-02-03 '), (date(2003, 2, 1), (11, 19))) - self.assertEqual(search_date(' Something 01-02-03 ', year_first=False, day_first=True), (date(2003, 2, 1), (11, 19))) - self.assertEqual(search_date(' Something 01-02-03 ', year_first=True), (date(2001, 2, 3), (11, 19))) - self.assertEqual(search_date(' Something 01-02-03 ', day_first=False), (date(2003, 1, 2), (11, 19))) - - -suite = allTests(TestUtils) - -if __name__ == '__main__': - TextTestRunner(verbosity=2).run(suite) diff --git a/libs/guessit/test/test_yml.py b/libs/guessit/test/test_yml.py new file mode 100644 index 00000000..c8e3d193 --- /dev/null +++ b/libs/guessit/test/test_yml.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +import logging + +# io.open supports encoding= in python 2.7 +from io import open # pylint: disable=redefined-builtin +import os +import yaml + +import six + +import babelfish +import pytest + +from rebulk.remodule import re +from rebulk.utils import is_iterable + +from guessit.options import parse_options +from ..yamlutils import OrderedDictYAMLLoader +from .. 
import guessit + + +logger = logging.getLogger(__name__) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +filename_predicate = None +string_predicate = None + + +# filename_predicate = lambda filename: 'episode_title' in filename +# string_predicate = lambda string: '-DVD.BlablaBla.Fix.Blablabla.XVID' in string + + +class EntryResult(object): + def __init__(self, string, negates=False): + self.string = string + self.negates = negates + self.valid = [] + self.missing = [] + self.different = [] + self.extra = [] + self.others = [] + + @property + def ok(self): + if self.negates: + return self.missing or self.different + return not self.missing and not self.different and not self.extra and not self.others + + @property + def warning(self): + if self.negates: + return False + return not self.missing and not self.different and self.extra + + @property + def error(self): + if self.negates: + return not self.missing and not self.different and not self.others + return self.missing or self.different or self.others + + def __repr__(self): + if self.ok: + return self.string + ': OK!' + elif self.warning: + return '%s%s: WARNING! (valid=%i, extra=%i)' % ('-' if self.negates else '', self.string, len(self.valid), + len(self.extra)) + elif self.error: + return '%s%s: ERROR! (valid=%i, missing=%i, different=%i, extra=%i, others=%i)' % \ + ('-' if self.negates else '', self.string, len(self.valid), len(self.missing), len(self.different), + len(self.extra), len(self.others)) + else: + return '%s%s: UNKOWN! 
(valid=%i, missing=%i, different=%i, extra=%i, others=%i)' % \ + ('-' if self.negates else '', self.string, len(self.valid), len(self.missing), len(self.different), + len(self.extra), len(self.others)) + + @property + def details(self): + ret = [] + if self.valid: + ret.append('valid=' + str(len(self.valid))) + for valid in self.valid: + ret.append(' ' * 4 + str(valid)) + if self.missing: + ret.append('missing=' + str(len(self.missing))) + for missing in self.missing: + ret.append(' ' * 4 + str(missing)) + if self.different: + ret.append('different=' + str(len(self.different))) + for different in self.different: + ret.append(' ' * 4 + str(different)) + if self.extra: + ret.append('extra=' + str(len(self.extra))) + for extra in self.extra: + ret.append(' ' * 4 + str(extra)) + if self.others: + ret.append('others=' + str(len(self.others))) + for other in self.others: + ret.append(' ' * 4 + str(other)) + return ret + + +class Results(list): + def assert_ok(self): + errors = [entry for entry in self if entry.error] + assert not errors + + +def files_and_ids(predicate=None): + files = [] + ids = [] + + for (dirpath, _, filenames) in os.walk(__location__): + if dirpath == __location__: + dirpath_rel = '' + else: + dirpath_rel = os.path.relpath(dirpath, __location__) + for filename in filenames: + name, ext = os.path.splitext(filename) + filepath = os.path.join(dirpath_rel, filename) + if ext == '.yml' and (not predicate or predicate(filepath)): + files.append(filepath) + ids.append(os.path.join(dirpath_rel, name)) + + return files, ids + + +class TestYml(object): + """ + Run tests from yaml files. + Multiple input strings having same expected results can be chained. + Prefix an input or key with '-' to check it should not match results ('+' requires a global match). 
+ """ + + options_re = re.compile(r'^([ \+-]+)(.*)') + + files, ids = files_and_ids(filename_predicate) + + @staticmethod + def set_default(expected, default): + if default: + for k, v in default.items(): + if k not in expected: + expected[k] = v + + @pytest.mark.parametrize('filename', files, ids=ids) + def test(self, filename, caplog): + caplog.setLevel(logging.INFO) + with open(os.path.join(__location__, filename), 'r', encoding='utf-8') as infile: + data = yaml.load(infile, OrderedDictYAMLLoader) + entries = Results() + + last_expected = None + for string, expected in reversed(list(data.items())): + if expected is None: + data[string] = last_expected + else: + last_expected = expected + + default = None + try: + default = data['__default__'] + del data['__default__'] + except KeyError: + pass + + for string, expected in data.items(): + TestYml.set_default(expected, default) + entry = self.check_data(filename, string, expected) + entries.append(entry) + entries.assert_ok() + + def check_data(self, filename, string, expected): + if six.PY2 and isinstance(string, six.text_type): + string = string.encode('utf-8') + converts = [] + for k, v in expected.items(): + if isinstance(v, six.text_type): + v = v.encode('utf-8') + converts.append((k, v)) + for k, v in converts: + expected[k] = v + if not isinstance(string, str): + string = str(string) + if not string_predicate or string_predicate(string): # pylint: disable=not-callable + entry = self.check(string, expected) + if entry.ok: + logger.debug('[' + filename + '] ' + str(entry)) + elif entry.warning: + logger.warning('[' + filename + '] ' + str(entry)) + elif entry.error: + logger.error('[' + filename + '] ' + str(entry)) + for line in entry.details: + logger.error('[' + filename + '] ' + ' ' * 4 + line) + return entry + + def check(self, string, expected): + negates, global_, string = self.parse_token_options(string) + + options = expected.get('options') + if options is None: + options = {} + if not 
isinstance(options, dict): + options = parse_options(options) + if 'implicit' not in options: + options['implicit'] = True + try: + result = guessit(string, options) + except Exception as exc: + logger.error('[' + string + '] Exception: ' + str(exc)) + raise exc + + entry = EntryResult(string, negates) + + if global_: + self.check_global(string, result, entry) + + self.check_expected(result, expected, entry) + + return entry + + def parse_token_options(self, string): + matches = self.options_re.search(string) + negates = False + global_ = False + if matches: + string = matches.group(2) + for opt in matches.group(1): + if '-' in opt: + negates = True + if '+' in opt: + global_ = True + return negates, global_, string + + def check_global(self, string, result, entry): + global_span = [] + for result_matches in result.matches.values(): + for result_match in result_matches: + if not global_span: + global_span = list(result_match.span) + else: + if global_span[0] > result_match.span[0]: + global_span[0] = result_match.span[0] + if global_span[1] < result_match.span[1]: + global_span[1] = result_match.span[1] + if global_span and global_span[1] - global_span[0] < len(string): + entry.others.append("Match is not global") + + def is_same(self, value, expected): + values = set(value) if is_iterable(value) else set((value,)) + expecteds = set(expected) if is_iterable(expected) else set((expected,)) + if len(values) != len(expecteds): + return False + if isinstance(next(iter(values)), babelfish.Language): + # pylint: disable=no-member + expecteds = set([babelfish.Language.fromguessit(expected) for expected in expecteds]) + elif isinstance(next(iter(values)), babelfish.Country): + # pylint: disable=no-member + expecteds = set([babelfish.Country.fromguessit(expected) for expected in expecteds]) + return values == expecteds + + def check_expected(self, result, expected, entry): + if expected: + for expected_key, expected_value in expected.items(): + if expected_key and 
expected_key != 'options' and expected_value is not None: + negates_key, _, result_key = self.parse_token_options(expected_key) + if result_key in result.keys(): + if not self.is_same(result[result_key], expected_value): + if negates_key: + entry.valid.append((expected_key, expected_value)) + else: + entry.different.append((expected_key, expected_value, result[result_key])) + else: + if negates_key: + entry.different.append((expected_key, expected_value, result[result_key])) + else: + entry.valid.append((expected_key, expected_value)) + elif not negates_key: + entry.missing.append((expected_key, expected_value)) + + for result_key, result_value in result.items(): + if result_key not in expected.keys(): + entry.extra.append((result_key, result_value)) diff --git a/libs/guessit/test/various.yml b/libs/guessit/test/various.yml new file mode 100644 index 00000000..72e2f602 --- /dev/null +++ b/libs/guessit/test/various.yml @@ -0,0 +1,800 @@ +? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv +: type: movie + title: Fear and Loathing in Las Vegas + year: 1998 + screen_size: 720p + format: HD-DVD + audio_codec: DTS + video_codec: h264 + release_group: ESiR + +? Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi +: type: episode + title: Duckman + season: 1 + episode: 1 + episode_title: I, Duckman + date: 2002-11-07 + +? Series/Neverwhere/Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Neverwhere.05.Down.Street.[tvu.org.ru].avi +: type: episode + title: Neverwhere + episode: 5 + episode_title: Down Street + website: tvu.org.ru + +? Series/Breaking Bad/Minisodes/Breaking.Bad.(Minisodes).01.Good.Cop.Bad.Cop.WEBRip.XviD.avi +: type: episode + title: Breaking Bad + episode_format: Minisode + episode: 1 + episode_title: Good Cop Bad Cop + format: WEBRip + video_codec: XviD + +? 
Series/Kaamelott/Kaamelott - Livre V - Ep 23 - Le Forfait.avi +: type: episode + title: Kaamelott + episode: 23 + episode_title: Le Forfait + +? Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv +: type: movie + title: The Doors + year: 1991 + date: 2008-03-09 + format: BluRay + screen_size: 720p + audio_codec: AC3 + video_codec: h264 + release_group: HiS@SiLUHD + language: english + website: sharethefiles.com + +? Movies/M.A.S.H. (1970)/MASH.(1970).[Divx.5.02][Dual-Subtitulos][DVDRip].ogm +: type: movie + title: MASH + year: 1970 + video_codec: DivX + format: DVD + +? the.mentalist.501.hdtv-lol.mp4 +: type: episode + title: the mentalist + season: 5 + episode: 1 + format: HDTV + release_group: lol + +? the.simpsons.2401.hdtv-lol.mp4 +: type: episode + title: the simpsons + season: 24 + episode: 1 + format: HDTV + release_group: lol + +? Homeland.S02E01.HDTV.x264-EVOLVE.mp4 +: type: episode + title: Homeland + season: 2 + episode: 1 + format: HDTV + video_codec: h264 + release_group: EVOLVE + +? /media/Band_of_Brothers-e01-Currahee.mkv +: type: episode + title: Band of Brothers + episode: 1 + episode_title: Currahee + +? /media/Band_of_Brothers-x02-We_Stand_Alone_Together.mkv +: type: episode + title: Band of Brothers + bonus: 2 + bonus_title: We Stand Alone Together + +? /movies/James_Bond-f21-Casino_Royale-x02-Stunts.mkv +: type: movie + title: Casino Royale + film_title: James Bond + film: 21 + bonus: 2 + bonus_title: Stunts + +? /TV Shows/new.girl.117.hdtv-lol.mp4 +: type: episode + title: new girl + season: 1 + episode: 17 + format: HDTV + release_group: lol + +? The.Office.(US).1x03.Health.Care.HDTV.XviD-LOL.avi +: type: episode + title: The Office + country: US + season: 1 + episode: 3 + episode_title: Health Care + format: HDTV + video_codec: XviD + release_group: LOL + +? 
The_Insider-(1999)-x02-60_Minutes_Interview-1996.mp4 +: type: movie + title: The Insider + year: 1999 + bonus: 2 + bonus_title: 60 Minutes Interview-1996 + +? OSS_117--Cairo,_Nest_of_Spies.mkv +: type: movie + title: OSS 117 + alternative_title: Cairo, Nest of Spies + +? Rush.._Beyond_The_Lighted_Stage-x09-Between_Sun_and_Moon-2002_Hartford.mkv +: type: movie + title: Rush Beyond The Lighted Stage + bonus: 9 + bonus_title: Between Sun and Moon + year: 2002 + +? House.Hunters.International.S56E06.720p.hdtv.x264.mp4 +: type: episode + title: House Hunters International + season: 56 + episode: 6 + screen_size: 720p + format: HDTV + video_codec: h264 + +? White.House.Down.2013.1080p.BluRay.DTS-HD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screen_size: 1080p + format: BluRay + audio_codec: DTS + audio_profile: HDMA + video_codec: h264 + release_group: PublicHD + audio_channels: "5.1" + +? White.House.Down.2013.1080p.BluRay.DTSHD.MA.5.1.x264-PublicHD.mkv +: type: movie + title: White House Down + year: 2013 + screen_size: 1080p + format: BluRay + audio_codec: DTS + audio_profile: HDMA + video_codec: h264 + release_group: PublicHD + audio_channels: "5.1" + +? Hostages.S01E01.Pilot.for.Air.720p.WEB-DL.DD5.1.H.264-NTb.nfo +: type: episode + title: Hostages + episode_title: Pilot for Air + season: 1 + episode: 1 + screen_size: 720p + format: WEB-DL + audio_channels: "5.1" + video_codec: h264 + audio_codec: DolbyDigital + release_group: NTb + +? Despicable.Me.2.2013.1080p.BluRay.x264-VeDeTT.nfo +: type: movie + title: Despicable Me 2 + year: 2013 + screen_size: 1080p + format: BluRay + video_codec: h264 + release_group: VeDeTT + +? Le Cinquieme Commando 1971 SUBFORCED FRENCH DVDRiP XViD AC3 Bandix.mkv +: type: movie + audio_codec: AC3 + format: DVD + release_group: Bandix + subtitle_language: French + title: Le Cinquieme Commando + video_codec: XviD + year: 1971 + +? 
Le Seigneur des Anneaux - La Communauté de l'Anneau - Version Longue - BDRip.mkv +: type: movie + format: BluRay + title: Le Seigneur des Anneaux + +? La petite bande (Michel Deville - 1983) VF PAL MP4 x264 AAC.mkv +: type: movie + audio_codec: AAC + language: French + title: La petite bande + video_codec: h264 + year: 1983 + other: PAL + +? Retour de Flammes (Gregor Schnitzler 2003) FULL DVD.iso +: type: movie + format: DVD + title: Retour de Flammes + type: movie + year: 2003 + +? A.Common.Title.Special.2014.avi +: type: movie + year: 2014 + title: A Common Title Special + +? A.Common.Title.2014.Special.avi +: type: episode + year: 2014 + title: A Common Title + episode_title: Special + episode_details: Special + +? A.Common.Title.2014.Special.Edition.avi +: type: movie + year: 2014 + title: A Common Title + edition: Special Edition + +? Downton.Abbey.2013.Christmas.Special.HDTV.x264-FoV.mp4 +: type: episode + year: 2013 + title: Downton Abbey + episode_title: Christmas Special + video_codec: h264 + release_group: FoV + format: HDTV + episode_details: Special + +? Doctor_Who_2013_Christmas_Special.The_Time_of_The_Doctor.HD +: type: episode + title: Doctor Who + other: HD + episode_details: Special + episode_title: Christmas Special The Time of The Doctor + year: 2013 + +? Doctor Who 2005 50th Anniversary Special The Day of the Doctor 3.avi +: type: episode + title: Doctor Who + episode_details: Special + episode_title: 50th Anniversary Special The Day of the Doctor 3 + year: 2005 + +? Robot Chicken S06-Born Again Virgin Christmas Special HDTV x264.avi +: type: episode + title: Robot Chicken + format: HDTV + season: 6 + episode_title: Born Again Virgin Christmas Special + video_codec: h264 + episode_details: Special + +? Wicked.Tuna.S03E00.Head.To.Tail.Special.HDTV.x264-YesTV +: type: episode + title: Wicked Tuna + episode_title: Head To Tail Special + release_group: YesTV + season: 3 + episode: 0 + video_codec: h264 + format: HDTV + episode_details: Special + +? 
The.Voice.UK.S03E12.HDTV.x264-C4TV +: episode: 12 + video_codec: h264 + format: HDTV + title: The Voice + release_group: C4TV + season: 3 + country: United Kingdom + type: episode + +? /tmp/star.trek.9/star.trek.9.mkv +: type: movie + title: star trek 9 + +? star.trek.9.mkv +: type: movie + title: star trek 9 + +? FlexGet.S01E02.TheName.HDTV.xvid +: episode: 2 + format: HDTV + season: 1 + title: FlexGet + episode_title: TheName + type: episode + video_codec: XviD + +? FlexGet.S01E02.TheName.HDTV.xvid +: episode: 2 + format: HDTV + season: 1 + title: FlexGet + episode_title: TheName + type: episode + video_codec: XviD + +? some.series.S03E14.Title.Here.720p +: episode: 14 + screen_size: 720p + season: 3 + title: some series + episode_title: Title Here + type: episode + +? '[the.group] Some.Series.S03E15.Title.Two.720p' +: episode: 15 + release_group: the.group + screen_size: 720p + season: 3 + title: Some Series + episode_title: Title Two + type: episode + +? 'HD 720p: Some series.S03E16.Title.Three' +: episode: 16 + other: HD + screen_size: 720p + season: 3 + title: Some series + episode_title: Title Three + type: episode + +? Something.Season.2.1of4.Ep.Title.HDTV.torrent +: episode_count: 4 + episode: 1 + format: HDTV + season: 2 + title: Something + episode_title: Title + type: episode + container: torrent + +? Show-A (US) - Episode Title S02E09 hdtv +: country: US + episode: 9 + format: HDTV + season: 2 + title: Show-A + type: episode + +? Jack's.Show.S03E01.blah.1080p +: episode: 1 + screen_size: 1080p + season: 3 + title: Jack's Show + episode_title: blah + type: episode + +? FlexGet.epic +: title: FlexGet epic + type: movie + +? FlexGet.Apt.1 +: title: FlexGet Apt 1 + type: movie + +? FlexGet.aptitude +: title: FlexGet aptitude + type: movie + +? FlexGet.Step1 +: title: FlexGet Step1 + type: movie + +? 
Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720 * 432].avi +: format: DVD + screen_size: 720x432 + title: El Bosque Animado + video_codec: XviD + year: 1987 + type: movie + +? Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi +: format: DVD + screen_size: 720x432 + title: El Bosque Animado + video_codec: XviD + year: 1987 + type: movie + +? 2009.shoot.fruit.chan.multi.dvd9.pal +: format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? 2009.shoot.fruit.chan.multi.dvd5.pal +: format: DVD + language: mul + other: PAL + title: shoot fruit chan + type: movie + year: 2009 + +? The.Flash.2014.S01E01.PREAIR.WEBRip.XviD-EVO.avi +: episode: 1 + format: WEBRip + other: Preair + release_group: EVO + season: 1 + title: The Flash + type: episode + video_codec: XviD + year: 2014 + +? Ice.Lake.Rebels.S01E06.Ice.Lake.Games.720p.HDTV.x264-DHD +: episode: 6 + format: HDTV + release_group: DHD + screen_size: 720p + season: 1 + title: Ice Lake Rebels + episode_title: Ice Lake Games + type: episode + video_codec: h264 + +? The League - S06E10 - Epi Sexy.mkv +: episode: 10 + season: 6 + title: The League + episode_title: Epi Sexy + type: episode + +? Stay (2005) [1080p]/Stay.2005.1080p.BluRay.x264.YIFY.mp4 +: format: BluRay + release_group: YIFY + screen_size: 1080p + title: Stay + type: movie + video_codec: h264 + year: 2005 + +? /media/live/A/Anger.Management.S02E82.720p.HDTV.X264-DIMENSION.mkv +: format: HDTV + release_group: DIMENSION + screen_size: 720p + title: Anger Management + type: episode + season: 2 + episode: 82 + video_codec: h264 + +? "[Figmentos] Monster 34 - At the End of Darkness [781219F1].mkv" +: type: episode + release_group: Figmentos + title: Monster + episode: 34 + episode_title: At the End of Darkness + crc32: 781219F1 + +? 
Game.of.Thrones.S05E07.720p.HDTV-KILLERS.mkv +: type: episode + episode: 7 + format: HDTV + release_group: KILLERS + screen_size: 720p + season: 5 + title: Game of Thrones + +? Game.of.Thrones.S05E07.HDTV.720p-KILLERS.mkv +: type: episode + episode: 7 + format: HDTV + release_group: KILLERS + screen_size: 720p + season: 5 + title: Game of Thrones + +? Parks and Recreation - [04x12] - Ad Campaign.avi +: type: episode + title: Parks and Recreation + season: 4 + episode: 12 + episode_title: Ad Campaign + +? Star Trek Into Darkness (2013)/star.trek.into.darkness.2013.720p.web-dl.h264-publichd.mkv +: type: movie + title: Star Trek Into Darkness + year: 2013 + screen_size: 720p + format: WEB-DL + video_codec: h264 + release_group: publichd + +? /var/medias/series/The Originals/Season 02/The.Originals.S02E15.720p.HDTV.X264-DIMENSION.mkv +: type: episode + title: The Originals + season: 2 + episode: 15 + screen_size: 720p + format: HDTV + video_codec: h264 + release_group: DIMENSION + +? Test.S01E01E07-FooBar-Group.avi +: container: avi + episode: + - 1 + - 7 + episode_title: FooBar-Group # Make sure it doesn't conflict with uuid + mimetype: video/x-msvideo + season: 1 + title: Test + type: episode + +? TEST.S01E02.2160p.NF.WEBRip.x264.DD5.1-ABC +: audio_channels: '5.1' + audio_codec: DolbyDigital + episode: 2 + format: WEBRip + other: Netflix + release_group: ABC + screen_size: 4K + season: 1 + title: TEST + type: episode + video_codec: h264 + +? TEST.2015.12.30.720p.WEBRip.h264-ABC +: date: 2015-12-30 + format: WEBRip + release_group: ABC + screen_size: 720p + title: TEST + type: episode + video_codec: h264 + +? TEST.S01E10.24.1080p.NF.WEBRip.AAC2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 10 + episode_title: '24' + format: WEBRip + other: Netflix + release_group: ABC + screen_size: 1080p + season: 1 + title: TEST + type: episode + video_codec: h264 + +? 
TEST.S01E10.24.1080p.NF.WEBRip.AAC2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 10 + episode_title: '24' + format: WEBRip + other: Netflix + release_group: ABC + screen_size: 1080p + season: 1 + title: TEST + type: episode + video_codec: h264 + +? TEST.S01E10.24.1080p.NF.WEBRip.AAC.2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 10 + episode_title: '24' + format: WEBRip + other: Netflix + release_group: ABC + screen_size: 1080p + season: 1 + title: TEST + type: episode + video_codec: h264 + +? TEST.S05E02.720p.iP.WEBRip.AAC2.0.H264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 2 + format: WEBRip + release_group: ABC + screen_size: 720p + season: 5 + title: TEST + type: episode + video_codec: h264 + +? TEST.S03E07.720p.WEBRip.AAC2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 7 + format: WEBRip + release_group: ABC + screen_size: 720p + season: 3 + title: TEST + type: episode + video_codec: h264 + +? TEST.S15E15.24.1080p.FREE.WEBRip.AAC2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 15 + episode_title: '24' + format: WEBRip + release_group: ABC + screen_size: 1080p + season: 15 + title: TEST + type: episode + video_codec: h264 + +? TEST.S11E11.24.720p.ETV.WEBRip.AAC2.0.x264-ABC +: audio_channels: '2.0' + audio_codec: AAC + episode: 11 + episode_title: '24' + format: WEBRip + release_group: ABC + screen_size: 720p + season: 11 + title: TEST + type: episode + video_codec: h264 + +? TEST.2015.1080p.HC.WEBRip.x264.AAC2.0-ABC +: audio_channels: '2.0' + audio_codec: AAC + format: WEBRip + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + video_codec: h264 + year: 2015 + +? TEST.2015.1080p.3D.BluRay.Half-SBS.x264.DTS-HD.MA.7.1-ABC +: audio_channels: '7.1' + audio_codec: DTS + audio_profile: HDMA + format: BluRay + other: 3D + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + video_codec: h264 + year: 2015 + +? 
TEST.2015.1080p.3D.BluRay.Half-OU.x264.DTS-HD.MA.7.1-ABC +: audio_channels: '7.1' + audio_codec: DTS + audio_profile: HDMA + format: BluRay + other: 3D + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + video_codec: h264 + year: 2015 + +? TEST.2015.1080p.3D.BluRay.Half-OU.x264.DTS-HD.MA.TrueHD.7.1.Atmos-ABC +: audio_channels: '7.1' + audio_codec: + - DTS + - TrueHD + - DolbyAtmos + audio_profile: HDMA + format: BluRay + other: 3D + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + video_codec: h264 + year: 2015 + +? TEST.2015.1080p.3D.BluRay.Half-SBS.x264.DTS-HD.MA.TrueHD.7.1.Atmos-ABC +: audio_channels: '7.1' + audio_codec: + - DTS + - TrueHD + - DolbyAtmos + audio_profile: HDMA + format: BluRay + other: 3D + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + video_codec: h264 + year: 2015 + +? TEST.2015.1080p.BluRay.REMUX.AVC.DTS-HD.MA.TrueHD.7.1.Atmos-ABC +: audio_channels: '7.1' + audio_codec: + - DTS + - TrueHD + - DolbyAtmos + audio_profile: HDMA + format: BluRay + other: Remux + release_group: ABC + screen_size: 1080p + title: TEST + type: movie + year: 2015 + +? Gangs of New York 2002 REMASTERED 1080p BluRay x264-AVCHD +: format: BluRay + other: Remastered + screen_size: 1080p + title: Gangs of New York + type: movie + video_codec: h264 + year: 2002 + +? Peep.Show.S06E02.DVDrip.x264-faks86.mkv +: container: mkv + episode: 2 + format: DVD + release_group: faks86 + season: 6 + title: Peep Show + type: episode + video_codec: h264 + +? The Soup - 11x41 - October 8, 2014.mp4 +: container: mp4 + episode: 41 + episode_title: October 8 + season: 11 + title: The Soup + type: episode + year: 2014 + +? Red.Rock.S02E59.WEB-DLx264-JIVE +: episode: 59 + season: 2 + format: WEB-DL + release_group: JIVE + title: Red Rock + type: episode + video_codec: h264 + +? 
Pawn.Stars.S12E31.Deals.On.Wheels.PDTVx264-JIVE +: episode: 31 + episode_title: Deals On Wheels + season: 12 + format: DVB + release_group: JIVE + title: Pawn Stars + type: episode + video_codec: h264 + +? Duck.Dynasty.S09E09.Van.He-llsing.HDTVx264-JIVE +: episode: 9 + episode_title: Van He-llsing + season: 9 + format: HDTV + release_group: JIVE + title: Duck Dynasty + type: episode + video_codec: h264 \ No newline at end of file diff --git a/libs/guessit/textutils.py b/libs/guessit/textutils.py deleted file mode 100644 index 3537aa3b..00000000 --- a/libs/guessit/textutils.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit import s -from guessit.patterns import sep -import functools -import unicodedata -import re - -# string-related functions - - -def normalize_unicode(s): - return unicodedata.normalize('NFC', s) - - -def strip_brackets(s): - if not s: - return s - - if ((s[0] == '[' and s[-1] == ']') or - (s[0] == '(' and s[-1] == ')') or - (s[0] == '{' and s[-1] == '}')): - return s[1:-1] - - return s - - -_dotted_rexp = re.compile(r'(?:\W|^)(([A-Za-z]\.){2,}[A-Za-z]\.?)') - - -def clean_default(st): - for c in sep: - # do not remove certain chars - if c in ['-', ',']: - continue - - if c == '.': - # we should not remove the dots for acronyms and such - dotted = _dotted_rexp.search(st) - if dotted: - s = dotted.group(1) - exclude_begin, exclude_end = dotted.span(1) - - st = (st[:exclude_begin].replace(c, ' ') + - st[exclude_begin:exclude_end] + - st[exclude_end:].replace(c, ' ')) - continue - - st = st.replace(c, ' ') - - parts = st.split() - result = ' '.join(p for p in parts if p != '') - - # now also remove dashes on the outer part of the string - while result and result[0] in '-': - result = result[1:] - while result and result[-1] in '-': - result = result[:-1] - - return result - -_words_rexp = re.compile('\w+', re.UNICODE) - - -def find_words(s): - return _words_rexp.findall(s.replace('_', ' ')) - - -def iter_words(s): - return _words_rexp.finditer(s.replace('_', ' ')) - - -def reorder_title(title, articles=('the',), separators=(',', ', ')): - ltitle = title.lower() - for article in articles: - for separator in separators: - suffix = separator + article - if ltitle[-len(suffix):] == suffix: - return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)] - return title - - -def str_replace(string, pos, c): - return string[:pos] + c + string[pos + 1:] - - -def str_fill(string, region, c): - start, end = region - return string[:start] + c * (end - start) + 
string[end:] - - -def levenshtein(a, b): - if not a: - return len(b) - if not b: - return len(a) - - m = len(a) - n = len(b) - d = [] - for i in range(m + 1): - d.append([0] * (n + 1)) - - for i in range(m + 1): - d[i][0] = i - - for j in range(n + 1): - d[0][j] = j - - for i in range(1, m + 1): - for j in range(1, n + 1): - if a[i - 1] == b[j - 1]: - cost = 0 - else: - cost = 1 - - d[i][j] = min(d[i - 1][j] + 1, # deletion - d[i][j - 1] + 1, # insertion - d[i - 1][j - 1] + cost # substitution - ) - - return d[m][n] - - -# group-related functions - -def find_first_level_groups_span(string, enclosing): - """Return a list of pairs (start, end) for the groups delimited by the given - enclosing characters. - This does not return nested groups, ie: '(ab(c)(d))' will return a single group - containing the whole string. - - >>> find_first_level_groups_span('abcd', '()') - [] - - >>> find_first_level_groups_span('abc(de)fgh', '()') - [(3, 7)] - - >>> find_first_level_groups_span('(ab(c)(d))', '()') - [(0, 10)] - - >>> find_first_level_groups_span('ab[c]de[f]gh(i)', '[]') - [(2, 5), (7, 10)] - """ - opening, closing = enclosing - depth = [] # depth is a stack of indices where we opened a group - result = [] - for i, c, in enumerate(string): - if c == opening: - depth.append(i) - elif c == closing: - try: - start = depth.pop() - end = i - if not depth: - # we emptied our stack, so we have a 1st level group - result.append((start, end + 1)) - except IndexError: - # we closed a group which was not opened before - pass - - return result - - -def split_on_groups(string, groups): - """Split the given string using the different known groups for boundaries. 
- >>> s(split_on_groups('0123456789', [ (2, 4) ])) - ['01', '23', '456789'] - - >>> s(split_on_groups('0123456789', [ (2, 4), (4, 6) ])) - ['01', '23', '45', '6789'] - - >>> s(split_on_groups('0123456789', [ (5, 7), (2, 4) ])) - ['01', '23', '4', '56', '789'] - - """ - if not groups: - return [string] - - boundaries = sorted(set(functools.reduce(lambda l, x: l + list(x), groups, []))) - if boundaries[0] != 0: - boundaries.insert(0, 0) - if boundaries[-1] != len(string): - boundaries.append(len(string)) - - groups = [string[start:end] for start, end in zip(boundaries[:-1], - boundaries[1:])] - - return [g for g in groups if g] # return only non-empty groups - - -def find_first_level_groups(string, enclosing, blank_sep=None): - """Return a list of groups that could be split because of explicit grouping. - The groups are delimited by the given enclosing characters. - - You can also specify if you want to blank the separator chars in the returned - list of groups by specifying a character for it. None means it won't be replaced. - - This does not return nested groups, ie: '(ab(c)(d))' will return a single group - containing the whole string. 
- - >>> s(find_first_level_groups('', '()')) - [''] - - >>> s(find_first_level_groups('abcd', '()')) - ['abcd'] - - >>> s(find_first_level_groups('abc(de)fgh', '()')) - ['abc', '(de)', 'fgh'] - - >>> s(find_first_level_groups('(ab(c)(d))', '()', blank_sep = '_')) - ['_ab(c)(d)_'] - - >>> s(find_first_level_groups('ab[c]de[f]gh(i)', '[]')) - ['ab', '[c]', 'de', '[f]', 'gh(i)'] - - >>> s(find_first_level_groups('()[]()', '()', blank_sep = '-')) - ['--', '[]', '--'] - - """ - groups = find_first_level_groups_span(string, enclosing) - if blank_sep: - for start, end in groups: - string = str_replace(string, start, blank_sep) - string = str_replace(string, end - 1, blank_sep) - - return split_on_groups(string, groups) - - -_camel_word2_set = set(('is', 'to',)) -_camel_word3_set = set(('the',)) - - -def _camel_split_and_lower(string, i): - """Retrieves a tuple (need_split, need_lower) - - need_split is True if this char is a first letter in a camelCasedString. - need_lower is True if this char should be lowercased. 
- """ - - def islower(c): - return c.isalpha() and not c.isupper() - - previous_char2 = string[i - 2] if i > 1 else None - previous_char = string[i - 1] if i > 0 else None - char = string[i] - next_char = string[i + 1] if i + 1 < len(string) else None - next_char2 = string[i + 2] if i + 2 < len(string) else None - - char_upper = char.isupper() - char_lower = islower(char) - - # previous_char2_lower = islower(previous_char2) if previous_char2 else False - previous_char2_upper = previous_char2.isupper() if previous_char2 else False - - previous_char_lower = islower(previous_char) if previous_char else False - previous_char_upper = previous_char.isupper() if previous_char else False - - next_char_upper = next_char.isupper() if next_char else False - next_char_lower = islower(next_char) if next_char else False - - next_char2_upper = next_char2.isupper() if next_char2 else False - # next_char2_lower = islower(next_char2) if next_char2 else False - - mixedcase_word = (previous_char_upper and char_lower and next_char_upper) or \ - (previous_char_lower and char_upper and next_char_lower and next_char2_upper) or \ - (previous_char2_upper and previous_char_lower and char_upper) - if mixedcase_word: - word2 = (char + next_char).lower() if next_char else None - word3 = (char + next_char + next_char2).lower() if next_char and next_char2 else None - word2b = (previous_char2 + previous_char).lower() if previous_char2 and previous_char else None - if word2 in _camel_word2_set or word2b in _camel_word2_set or word3 in _camel_word3_set: - mixedcase_word = False - - uppercase_word = previous_char_upper and char_upper and next_char_upper or (char_upper and next_char_upper and next_char2_upper) - - need_split = char_upper and previous_char_lower and not mixedcase_word - - if not need_split: - previous_char_upper = string[i - 1].isupper() if i > 0 else False - next_char_lower = (string[i + 1].isalpha() and not string[i + 1].isupper()) if i + 1 < len(string) else False - need_split = 
char_upper and previous_char_upper and next_char_lower - uppercase_word = previous_char_upper and not next_char_lower - - need_lower = not uppercase_word and not mixedcase_word and need_split - - return (need_split, need_lower) - - -def is_camel(string): - """ - >>> is_camel('dogEATDog') - True - >>> is_camel('DeathToCamelCase') - True - >>> is_camel('death_to_camel_case') - False - >>> is_camel('TheBest') - True - >>> is_camel('The Best') - False - """ - for i in range(0, len(string)): - need_split, _ = _camel_split_and_lower(string, i) - if need_split: - return True - return False - - -def from_camel(string): - """ - >>> from_camel('dogEATDog') == 'dog EAT dog' - True - >>> from_camel('DeathToCamelCase') == 'Death to camel case' - True - >>> from_camel('TheBest') == 'The best' - True - >>> from_camel('MiXedCaSe is not camelCase') == 'MiXedCaSe is not camel case' - True - """ - if not string: - return string - pieces = [] - - for i in range(0, len(string)): - char = string[i] - need_split, need_lower = _camel_split_and_lower(string, i) - if need_split: - pieces.append(' ') - - if need_lower: - pieces.append(char.lower()) - else: - pieces.append(char) - return ''.join(pieces) diff --git a/libs/guessit/transfo/__init__.py b/libs/guessit/transfo/__init__.py deleted file mode 100644 index cce2dfda..00000000 --- a/libs/guessit/transfo/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - - -class TransformerException(Exception): - def __init__(self, transformer, message): - - # Call the base class constructor with the parameters it needs - Exception.__init__(self, message) - - self.transformer = transformer \ No newline at end of file diff --git a/libs/guessit/transfo/expected_series.py b/libs/guessit/transfo/expected_series.py deleted file mode 100644 index edbd46d4..00000000 --- a/libs/guessit/transfo/expected_series.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals -from guessit.containers import PropertiesContainer -from guessit.matcher import GuessFinder - -from guessit.plugins.transformers import Transformer - -import re - - -class ExpectedSeries(Transformer): - def __init__(self): - Transformer.__init__(self, 230) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-S', '--expected-series', action='append', dest='expected_series', - help='Expected series to parse (can be used multiple times)') - - def should_process(self, mtree, options=None): - return options and options.get('expected_series') - - def expected_series(self, string, node=None, options=None): - container = PropertiesContainer(enhance=True, canonical_from_pattern=False) - - for expected_serie in options.get('expected_series'): - if expected_serie.startswith('re:'): - expected_serie = expected_serie[3:] - expected_serie = expected_serie.replace(' ', '-') - container.register_property('series', expected_serie, enhance=True) - else: - expected_serie = re.escape(expected_serie) - container.register_property('series', expected_serie, enhance=False) - - found = container.find_properties(string, node, options) - return container.as_guess(found, string) - - def supported_properties(self): - return ['series'] - - def process(self, mtree, options=None): - GuessFinder(self.expected_series, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/expected_title.py b/libs/guessit/transfo/expected_title.py deleted file mode 100644 index 2fe3d20e..00000000 --- a/libs/guessit/transfo/expected_title.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it 
under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.containers import PropertiesContainer -from guessit.matcher import GuessFinder - -from guessit.plugins.transformers import Transformer - -import re - - -class ExpectedTitle(Transformer): - def __init__(self): - Transformer.__init__(self, 225) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-T', '--expected-title', action='append', dest='expected_title', - help='Expected title (can be used multiple times)') - - def should_process(self, mtree, options=None): - return options and options.get('expected_title') - - def expected_titles(self, string, node=None, options=None): - container = PropertiesContainer(enhance=True, canonical_from_pattern=False) - - for expected_title in options.get('expected_title'): - if expected_title.startswith('re:'): - expected_title = expected_title[3:] - expected_title = expected_title.replace(' ', '-') - container.register_property('title', expected_title, enhance=True) - else: - expected_title = re.escape(expected_title) - container.register_property('title', expected_title, enhance=False) - - found = container.find_properties(string, node, options) - return container.as_guess(found, string) - - def supported_properties(self): - return ['title'] - - def process(self, mtree, options=None): - 
GuessFinder(self.expected_titles, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_bonus_features.py b/libs/guessit/transfo/guess_bonus_features.py deleted file mode 100644 index c70b31e5..00000000 --- a/libs/guessit/transfo/guess_bonus_features.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_property - - -class GuessBonusFeatures(Transformer): - def __init__(self): - Transformer.__init__(self, -150) - - def supported_properties(self): - return ['bonusNumber', 'bonusTitle', 'filmNumber', 'filmSeries', 'title', 'series'] - - def process(self, mtree, options=None): - def previous_group(g): - for leaf in reversed(list(mtree.unidentified_leaves())): - if leaf.node_idx < g.node_idx: - return leaf - - def next_group(g): - for leaf in mtree.unidentified_leaves(): - if leaf.node_idx > g.node_idx: - return leaf - - def same_group(g1, g2): - return g1.node_idx[:2] == g2.node_idx[:2] - - bonus = [node for node in mtree.leaves() if 'bonusNumber' in node.guess] - if bonus: - bonus_title = next_group(bonus[0]) - if bonus_title and same_group(bonus_title, bonus[0]): - found_property(bonus_title, 'bonusTitle', confidence=0.8) - - film_number = [node for node in mtree.leaves() - if 'filmNumber' in node.guess] - if film_number: - film_series = previous_group(film_number[0]) - found_property(film_series, 'filmSeries', confidence=0.9) - - title = next_group(film_number[0]) - found_property(title, 'title', confidence=0.9) - - season = [node for node in mtree.leaves() if 'season' in node.guess] - if season and 'bonusNumber' in mtree.info: - series = previous_group(season[0]) - if same_group(series, season[0]): - found_property(series, 'series', confidence=0.9) diff --git a/libs/guessit/transfo/guess_country.py b/libs/guessit/transfo/guess_country.py deleted file mode 100644 index c08cac7b..00000000 --- a/libs/guessit/transfo/guess_country.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under 
-# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from babelfish import Country -from guessit import Guess -from guessit.textutils import iter_words -from guessit.matcher import GuessFinder, found_guess -from guessit.language import LNG_COMMON_WORDS -import babelfish -import logging - -log = logging.getLogger(__name__) - - -class GuessCountry(Transformer): - def __init__(self): - Transformer.__init__(self, -170) - self.replace_language = frozenset(['uk']) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-C', '--allowed-country', action='append', dest='allowed_countries', - help='Allowed country (can be used multiple times)') - - def supported_properties(self): - return ['country'] - - def should_process(self, mtree, options=None): - options = options or {} - return options.get('country', True) - - def _scan_country(self, country, strict=False): - """ - Find a country if it is at the start or end of country string - """ - words_match = list(iter_words(country.lower())) - s = "" - start = None - - for word_match in words_match: - if not start: - start = word_match.start(0) - s += word_match.group(0) - try: - return Country.fromguessit(s), (start, word_match.end(0)) - except babelfish.Error: - continue - - words_match.reverse() - s = "" - end 
= None - for word_match in words_match: - if not end: - end = word_match.end(0) - s = word_match.group(0) + s - try: - return Country.fromguessit(s), (word_match.start(0), end) - except babelfish.Error: - continue - - return Country.fromguessit(country), (start, end) - - def is_valid_country(self, country, options=None): - if options and options.get('allowed_countries'): - allowed_countries = options.get('allowed_countries') - return country.name.lower() in allowed_countries or country.alpha2.lower() in allowed_countries - else: - return (country.name.lower() not in LNG_COMMON_WORDS and - country.alpha2.lower() not in LNG_COMMON_WORDS) - - def guess_country(self, string, node=None, options=None): - c = string.strip().lower() - if c not in LNG_COMMON_WORDS: - try: - country, country_span = self._scan_country(c, True) - if self.is_valid_country(country, options): - guess = Guess(country=country, confidence=1.0, input=node.value, span=(country_span[0] + 1, country_span[1] + 1)) - return guess - except babelfish.Error: - pass - return None, None - - def process(self, mtree, options=None): - GuessFinder(self.guess_country, None, self.log, options).process_nodes(mtree.unidentified_leaves()) - for node in mtree.leaves_containing('language'): - c = node.clean_value.lower() - if c in self.replace_language: - node.guess.set('language', None) - try: - country = Country.fromguessit(c) - if self.is_valid_country(country, options): - guess = Guess(country=country, confidence=0.9, input=node.value, span=node.span) - found_guess(node, guess, logger=log) - except babelfish.Error: - pass - - def post_process(self, mtree, options=None, *args, **kwargs): - # if country is in the guessed properties, make it part of the series name - series_leaves = list(mtree.leaves_containing('series')) - country_leaves = list(mtree.leaves_containing('country')) - - if series_leaves and country_leaves: - country_leaf = country_leaves[0] - for serie_leaf in series_leaves: - serie_leaf.guess['series'] 
+= ' (%s)' % str(country_leaf.guess['country'].guessit) diff --git a/libs/guessit/transfo/guess_date.py b/libs/guessit/transfo/guess_date.py deleted file mode 100644 index 73fa246d..00000000 --- a/libs/guessit/transfo/guess_date.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.date import search_date - - -class GuessDate(Transformer): - def __init__(self): - Transformer.__init__(self, 50) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-Y', '--date-year-first', action='store_true', dest='date_year_first', default=None, - help='If short date is found, consider the first digits as the year.') - naming_opts.add_argument('-D', '--date-day-first', action='store_true', dest='date_day_first', default=None, - help='If short date is found, consider the second digits as the day.') - - def supported_properties(self): - return ['date'] - - def guess_date(self, string, node=None, options=None): - date, span = search_date(string, options.get('date_year_first') if options else False, options.get('date_day_first') if options else False) - if date: - return {'date': date}, span - else: - return None, None - - def process(self, mtree, options=None): - GuessFinder(self.guess_date, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_episode_details.py b/libs/guessit/transfo/guess_episode_details.py deleted file mode 100644 index ba7ff298..00000000 --- a/libs/guessit/transfo/guess_episode_details.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_guess -from guessit.containers import PropertiesContainer -import itertools - - -class GuessEpisodeDetails(Transformer): - def __init__(self): - Transformer.__init__(self, -205) - self.container = PropertiesContainer() - self.container.register_property('episodeDetails', 'Special', 'Bonus', 'Omake', 'Ova', 'Oav', 'Pilot', 'Unaired') - self.container.register_property('episodeDetails', 'Extras?', canonical_form='Extras') - - def guess_details(self, string, node=None, options=None): - properties = self.container.find_properties(string, node, options, 'episodeDetails', multiple=True) - guesses = self.container.as_guess(properties, multiple=True) - return guesses - - def second_pass_options(self, mtree, options=None): - if not mtree.guess.get('type', '').startswith('episode'): - for unidentified_leaf in mtree.unidentified_leaves(): - properties = self.container.find_properties(unidentified_leaf.value, unidentified_leaf, options, 'episodeDetails') - guess = self.container.as_guess(properties) - if guess: - return {'type': 'episode'} - return None - - def supported_properties(self): - return self.container.get_supported_properties() - - def process(self, mtree, options=None): - if (mtree.guess.get('type', '').startswith('episode') and - (not mtree.info.get('episodeNumber') or - mtree.info.get('season') == 0)): - - for leaf in itertools.chain(mtree.leaves_containing('title'), - mtree.unidentified_leaves()): - guesses = 
self.guess_details(leaf.value, leaf, options) - for guess in guesses: - found_guess(leaf, guess, update_guess=False) - - return None diff --git a/libs/guessit/transfo/guess_episode_info_from_position.py b/libs/guessit/transfo/guess_episode_info_from_position.py deleted file mode 100644 index ad8973dd..00000000 --- a/libs/guessit/transfo/guess_episode_info_from_position.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer, get_transformer -from guessit.textutils import reorder_title - -from guessit.matcher import found_property - - -class GuessEpisodeInfoFromPosition(Transformer): - def __init__(self): - Transformer.__init__(self, -200) - - def supported_properties(self): - return ['title', 'series'] - - def match_from_epnum_position(self, mtree, node, options): - epnum_idx = node.node_idx - - # a few helper functions to be able to filter using high-level semantics - def before_epnum_in_same_pathgroup(): - return [leaf for leaf in mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] < epnum_idx[1:])] - - def after_epnum_in_same_pathgroup(): - return [leaf for leaf in mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) - if (leaf.node_idx[0] == epnum_idx[0] and - leaf.node_idx[1:] > epnum_idx[1:])] - - def after_epnum_in_same_explicitgroup(): - return [leaf for leaf in mtree.unidentified_leaves(lambda x: len(x.clean_value) > 1) - if (leaf.node_idx[:2] == epnum_idx[:2] and - leaf.node_idx[2:] > epnum_idx[2:])] - - # epnumber is the first group and there are only 2 after it in same - # path group - # -> series title - episode title - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) - - if ('title' not in mtree.info and # no title - 'series' in mtree.info and # series present - before_epnum_in_same_pathgroup() == [] and # no groups before - len(title_candidates) == 1): # only 1 group after - - found_property(title_candidates[0], 'title', confidence=0.4) - return - - if ('title' not in mtree.info and # no title - before_epnum_in_same_pathgroup() == [] and # no groups before - len(title_candidates) == 2): # only 2 groups after - - found_property(title_candidates[0], 'series', confidence=0.4) - found_property(title_candidates[1], 
'title', confidence=0.4) - return - - # if we have at least 1 valid group before the episodeNumber, then it's - # probably the series name - series_candidates = before_epnum_in_same_pathgroup() - if len(series_candidates) >= 1: - found_property(series_candidates[0], 'series', confidence=0.7) - - # only 1 group after (in the same path group) and it's probably the - # episode title. - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) - if len(title_candidates) == 1: - found_property(title_candidates[0], 'title', confidence=0.5) - return - else: - # try in the same explicit group, with lower confidence - title_candidates = self._filter_candidates(after_epnum_in_same_explicitgroup(), options) - if len(title_candidates) == 1: - found_property(title_candidates[0], 'title', confidence=0.4) - return - elif len(title_candidates) > 1: - found_property(title_candidates[0], 'title', confidence=0.3) - return - - # get the one with the longest value - title_candidates = self._filter_candidates(after_epnum_in_same_pathgroup(), options) - if title_candidates: - maxidx = -1 - maxv = -1 - for i, c in enumerate(title_candidates): - if len(c.clean_value) > maxv: - maxidx = i - maxv = len(c.clean_value) - found_property(title_candidates[maxidx], 'title', confidence=0.3) - - def should_process(self, mtree, options=None): - options = options or {} - return not options.get('skip_title') and mtree.guess.get('type', '').startswith('episode') - - def _filter_candidates(self, candidates, options): - episode_details_transformer = get_transformer('guess_episode_details') - if episode_details_transformer: - return [n for n in candidates if not episode_details_transformer.container.find_properties(n.value, n, options, re_match=True)] - else: - return candidates - - def process(self, mtree, options=None): - """ - try to identify the remaining unknown groups by looking at their - position relative to other known elements - """ - eps = [node for node in 
mtree.leaves() if 'episodeNumber' in node.guess] - - if not eps: - eps = [node for node in mtree.leaves() if 'date' in node.guess] - - if eps: - self.match_from_epnum_position(mtree, eps[0], options) - - else: - # if we don't have the episode number, but at least 2 groups in the - # basename, then it's probably series - eptitle - basename = mtree.node_at((-2,)) - - title_candidates = self._filter_candidates(basename.unidentified_leaves(), options) - - if len(title_candidates) >= 2 and 'series' not in mtree.info: - found_property(title_candidates[0], 'series', confidence=0.4) - found_property(title_candidates[1], 'title', confidence=0.4) - elif len(title_candidates) == 1: - # but if there's only one candidate, it's probably the series name - found_property(title_candidates[0], 'series' if 'series' not in mtree.info else 'title', confidence=0.4) - - # if we only have 1 remaining valid group in the folder containing the - # file, then it's likely that it is the series name - try: - series_candidates = list(mtree.node_at((-3,)).unidentified_leaves()) - except ValueError: - series_candidates = [] - - if len(series_candidates) == 1: - found_property(series_candidates[0], 'series', confidence=0.3) - - # if there's a path group that only contains the season info, then the - # previous one is most likely the series title (ie: ../series/season X/..) - eps = [node for node in mtree.nodes() - if 'season' in node.guess and 'episodeNumber' not in node.guess] - - if eps: - previous = [node for node in mtree.unidentified_leaves() - if node.node_idx[0] == eps[0].node_idx[0] - 1] - if len(previous) == 1: - found_property(previous[0], 'series', confidence=0.5) - - # If we have found title without any serie name, replace it by the serie name. 
- if 'series' not in mtree.info and 'title' in mtree.info: - title_leaf = mtree.first_leaf_containing('title') - metadata = title_leaf.guess.metadata('title') - value = title_leaf.guess['title'] - del title_leaf.guess['title'] - title_leaf.guess.set('series', value, metadata=metadata) - - def post_process(self, mtree, options=None): - for node in mtree.nodes(): - if 'series' not in node.guess: - continue - - node.guess['series'] = reorder_title(node.guess['series']) diff --git a/libs/guessit/transfo/guess_episodes_rexps.py b/libs/guessit/transfo/guess_episodes_rexps.py deleted file mode 100644 index 927c9890..00000000 --- a/libs/guessit/transfo/guess_episodes_rexps.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns import sep, build_or_pattern -from guessit.containers import PropertiesContainer, WeakValidator, NoValidator, ChainedValidator, DefaultValidator, \ - FormatterValidator -from guessit.patterns.numeral import numeral, digital_numeral, parse_numeral -import re - - -class GuessEpisodesRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 20) - - range_separators = ['-', 'to', 'a'] - discrete_separators = ['&', 'and', 'et'] - of_separators = ['of', 'sur', '/', '\\'] - - season_words = ['seasons?', 'saisons?', 'series?'] - episode_words = ['episodes?'] - - season_markers = ['s'] - episode_markers = ['e', 'ep'] - - discrete_sep = sep - for range_separator in range_separators: - discrete_sep = discrete_sep.replace(range_separator, '') - discrete_separators.append(discrete_sep) - all_separators = list(range_separators) - all_separators.extend(discrete_separators) - - self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - - range_separators_re = re.compile(build_or_pattern(range_separators), re.IGNORECASE) - discrete_separators_re = re.compile(build_or_pattern(discrete_separators), re.IGNORECASE) - all_separators_re = re.compile(build_or_pattern(all_separators), re.IGNORECASE) - of_separators_re = re.compile(build_or_pattern(of_separators, escape=True), re.IGNORECASE) - - season_words_re = re.compile(build_or_pattern(season_words), re.IGNORECASE) - episode_words_re = re.compile(build_or_pattern(episode_words), re.IGNORECASE) - - season_markers_re = re.compile(build_or_pattern(season_markers), re.IGNORECASE) - episode_markers_re = re.compile(build_or_pattern(episode_markers), re.IGNORECASE) - - def list_parser(value, property_list_name, discrete_separators_re=discrete_separators_re, range_separators_re=range_separators_re, 
allow_discrete=False, fill_gaps=False): - discrete_elements = filter(lambda x: x != '', discrete_separators_re.split(value)) - discrete_elements = [x.strip() for x in discrete_elements] - - proper_discrete_elements = [] - i = 0 - while i < len(discrete_elements): - if i < len(discrete_elements) - 2 and range_separators_re.match(discrete_elements[i+1]): - proper_discrete_elements.append(discrete_elements[i] + discrete_elements[i+1] + discrete_elements[i+2]) - i += 3 - else: - match = range_separators_re.search(discrete_elements[i]) - if match and match.start() == 0: - proper_discrete_elements[i-1] = proper_discrete_elements[i-1] + discrete_elements[i] - elif match and match.end() == len(discrete_elements[i]): - proper_discrete_elements.append(discrete_elements[i] + discrete_elements[i + 1]) - else: - proper_discrete_elements.append(discrete_elements[i]) - i += 1 - - discrete_elements = proper_discrete_elements - - ret = [] - - for discrete_element in discrete_elements: - range_values = filter(lambda x: x != '', range_separators_re.split(discrete_element)) - range_values = [x.strip() for x in range_values] - if len(range_values) > 1: - for x in range(0, len(range_values) - 1): - start_range_ep = parse_numeral(range_values[x]) - end_range_ep = parse_numeral(range_values[x+1]) - for range_ep in range(start_range_ep, end_range_ep + 1): - if range_ep not in ret: - ret.append(range_ep) - else: - discrete_value = parse_numeral(discrete_element) - if discrete_value not in ret: - ret.append(discrete_value) - - if len(ret) > 1: - if not allow_discrete: - valid_ret = list() - # replace discrete elements by ranges - valid_ret.append(ret[0]) - for i in range(0, len(ret) - 1): - previous = valid_ret[len(valid_ret) - 1] - if ret[i+1] < previous: - pass - else: - valid_ret.append(ret[i+1]) - ret = valid_ret - if fill_gaps: - ret = list(range(min(ret), max(ret) + 1)) - if len(ret) > 1: - return {None: ret[0], property_list_name: ret} - if len(ret) > 0: - return ret[0] - return None 
- - def episode_parser_x(value): - return list_parser(value, 'episodeList', discrete_separators_re=re.compile('x', re.IGNORECASE)) - - def episode_parser_e(value): - return list_parser(value, 'episodeList', discrete_separators_re=re.compile('e', re.IGNORECASE), fill_gaps=True) - - def episode_parser(value): - return list_parser(value, 'episodeList') - - def season_parser(value): - return list_parser(value, 'seasonList') - - class ResolutionCollisionValidator(object): - def validate(self, prop, string, node, match, entry_start, entry_end): - return len(match.group(2)) < 3 # limit - - self.container.register_property(None, r'(' + season_words_re.pattern + sep + '?(?P' + numeral + ')' + sep + '?' + season_words_re.pattern + '?)', confidence=1.0, formatter=parse_numeral) - self.container.register_property(None, r'(' + season_words_re.pattern + sep + '?(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*)' + sep + '?' + season_words_re.pattern + '?)' + sep, confidence=1.0, formatter={None: parse_numeral, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), FormatterValidator('season', lambda x: len(x) > 1 if hasattr(x, '__len__') else False))) - - self.container.register_property(None, r'(' + season_markers_re.pattern + '(?P' + digital_numeral + ')[^0-9]?' 
+ sep + '?(?P(?:e' + digital_numeral + '(?:' + sep + '?[e-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_e, 'season': season_parser}, validator=NoValidator()) - # self.container.register_property(None, r'[^0-9]((?P' + digital_numeral + ')[^0-9 .-]?-?(?P(?:x' + digital_numeral + '(?:' + sep + '?[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) - self.container.register_property(None, sep + r'((?P' + digital_numeral + ')' + sep + '' + '(?P(?:x' + sep + digital_numeral + '(?:' + sep + '[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) - self.container.register_property(None, r'((?P' + digital_numeral + ')' + '(?P(?:x' + digital_numeral + '(?:[x-]' + digital_numeral + ')*)))', confidence=1.0, formatter={None: parse_numeral, 'episodeNumber': episode_parser_x, 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) - self.container.register_property(None, r'(' + season_markers_re.pattern + '(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' 
+ digital_numeral + ')*))', confidence=0.6, formatter={None: parse_numeral, 'season': season_parser}, validator=NoValidator()) - - self.container.register_property(None, r'((?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.6, formatter=parse_numeral) - self.container.register_property(None, r'(ep' + sep + r'?(?P' + digital_numeral + ')' + sep + '?)', confidence=0.7, formatter=parse_numeral) - self.container.register_property(None, r'(ep' + sep + r'?(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.7, formatter=parse_numeral) - - - self.container.register_property(None, r'(' + episode_markers_re.pattern + '(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*))', confidence=0.6, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) - self.container.register_property(None, r'(' + episode_words_re.pattern + sep + '?(?P' + digital_numeral + '(?:' + sep + '?' + all_separators_re.pattern + sep + '?' + digital_numeral + ')*)' + sep + '?' 
+ episode_words_re.pattern + '?)', confidence=0.8, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) - - self.container.register_property(None, r'(' + episode_markers_re.pattern + '(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.6, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) - self.container.register_property(None, r'(' + episode_words_re.pattern + sep + '?(?P' + digital_numeral + ')' + sep + '?v(?P\d+))', confidence=0.8, formatter={None: parse_numeral, 'episodeNumber': episode_parser}) - - - self.container.register_property('episodeNumber', r'^ ?(\d{2})' + sep, confidence=0.4, formatter=parse_numeral) - self.container.register_property('episodeNumber', r'^ ?(\d{2})' + sep, confidence=0.4, formatter=parse_numeral) - self.container.register_property('episodeNumber', r'^ ?0(\d{1,2})' + sep, confidence=0.4, formatter=parse_numeral) - self.container.register_property('episodeNumber', sep + r'(\d{2}) ?$', confidence=0.4, formatter=parse_numeral) - self.container.register_property('episodeNumber', sep + r'0(\d{1,2}) ?$', confidence=0.4, formatter=parse_numeral) - - self.container.register_property(None, r'((?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + ')(?:' + sep + '?(?:episodes?|eps?))?)', confidence=0.7, formatter=parse_numeral) - self.container.register_property(None, r'((?:episodes?|eps?)' + sep + '?(?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + '))', confidence=0.7, formatter=parse_numeral) - self.container.register_property(None, r'((?:seasons?|saisons?|s)' + sep + '?(?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral + '))', confidence=0.7, formatter=parse_numeral) - self.container.register_property(None, r'((?P' + numeral + ')' + sep + '?' 
+ of_separators_re.pattern + sep + '?(?P' + numeral + ')' + sep + '?(?:seasons?|saisons?|s))', confidence=0.7, formatter=parse_numeral) - - self.container.register_canonical_properties('other', 'FiNAL', 'Complete', validator=WeakValidator()) - - self.container.register_property(None, r'[^0-9]((?P' + digital_numeral + ')[^0-9 .-]?-?(?PxAll))', confidence=1.0, formatter={None: parse_numeral, 'other': lambda x: 'Complete', 'season': season_parser}, validator=ChainedValidator(DefaultValidator(), ResolutionCollisionValidator())) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-E', '--episode-prefer-number', action='store_true', dest='episode_prefer_number', default=False, - help='Guess "serie.213.avi" as the episodeNumber 213. Without this option, ' - 'it will be guessed as season 2, episodeNumber 13') - - def supported_properties(self): - return ['episodeNumber', 'season', 'episodeList', 'seasonList', 'episodeCount', 'seasonCount', 'version', 'other'] - - def guess_episodes_rexps(self, string, node=None, options=None): - found = self.container.find_properties(string, node, options) - return self.container.as_guess(found, string) - - def should_process(self, mtree, options=None): - return mtree.guess.get('type', '').startswith('episode') - - def process(self, mtree, options=None): - GuessFinder(self.guess_episodes_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_filetype.py b/libs/guessit/transfo/guess_filetype.py deleted file mode 100644 index 0eb3475f..00000000 --- a/libs/guessit/transfo/guess_filetype.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public 
License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -import mimetypes -import os.path -import re - -from guessit.guess import Guess -from guessit.patterns.extension import subtitle_exts, info_exts, video_exts -from guessit.transfo import TransformerException -from guessit.plugins.transformers import Transformer, get_transformer -from guessit.matcher import log_found_guess, found_guess, found_property - - -class GuessFiletype(Transformer): - def __init__(self): - Transformer.__init__(self, 200) - - # List of well known movies and series, hardcoded because they cannot be - # guessed appropriately otherwise - MOVIES = ['OSS 117'] - SERIES = ['Band of Brothers'] - - MOVIES = [m.lower() for m in MOVIES] - SERIES = [s.lower() for s in SERIES] - - def guess_filetype(self, mtree, options=None): - options = options or {} - - # put the filetype inside a dummy container to be able to have the - # following functions work correctly as closures - # this is a workaround for python 2 which doesn't have the - # 'nonlocal' keyword which we could use here in the upgrade_* functions - # (python 3 does have it) - filetype_container = [mtree.guess.get('type')] - other = {} - filename = mtree.string - - def upgrade_episode(): - if filetype_container[0] == 'subtitle': - filetype_container[0] = 'episodesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'episodeinfo' - elif (not filetype_container[0] or - filetype_container[0] == 
'video'): - filetype_container[0] = 'episode' - - def upgrade_movie(): - if filetype_container[0] == 'subtitle': - filetype_container[0] = 'moviesubtitle' - elif filetype_container[0] == 'info': - filetype_container[0] = 'movieinfo' - elif (not filetype_container[0] or - filetype_container[0] == 'video'): - filetype_container[0] = 'movie' - - def upgrade_subtitle(): - if filetype_container[0] == 'movie': - filetype_container[0] = 'moviesubtitle' - elif filetype_container[0] == 'episode': - filetype_container[0] = 'episodesubtitle' - elif not filetype_container[0]: - filetype_container[0] = 'subtitle' - - def upgrade_info(): - if filetype_container[0] == 'movie': - filetype_container[0] = 'movieinfo' - elif filetype_container[0] == 'episode': - filetype_container[0] = 'episodeinfo' - elif not filetype_container[0]: - filetype_container[0] = 'info' - - # look at the extension first - fileext = os.path.splitext(filename)[1][1:].lower() - if fileext in subtitle_exts: - upgrade_subtitle() - other = {'container': fileext} - elif fileext in info_exts: - upgrade_info() - other = {'container': fileext} - elif fileext in video_exts: - other = {'container': fileext} - else: - if fileext and not options.get('name_only'): - other = {'extension': fileext} - list(mtree.unidentified_leaves())[-1].guess = Guess(other) - - # check whether we are in a 'Movies', 'Tv Shows', ... 
folder - folder_rexps = [(r'Movies?', upgrade_movie), - (r'Films?', upgrade_movie), - (r'Tv[ _-]?Shows?', upgrade_episode), - (r'Series?', upgrade_episode), - (r'Episodes?', upgrade_episode)] - for frexp, upgrade_func in folder_rexps: - frexp = re.compile(frexp, re.IGNORECASE) - for pathgroup in mtree.children: - if frexp.match(pathgroup.value): - upgrade_func() - return filetype_container[0], other - - # check for a few specific cases which will unintentionally make the - # following heuristics confused (eg: OSS 117 will look like an episode, - # season 1, epnum 17, when it is in fact a movie) - fname = mtree.clean_string(filename).lower() - for m in self.MOVIES: - if m in fname: - self.log.debug('Found in exception list of movies -> type = movie') - upgrade_movie() - return filetype_container[0], other - for s in self.SERIES: - if s in fname: - self.log.debug('Found in exception list of series -> type = episode') - upgrade_episode() - return filetype_container[0], other - - # if we have an episode_rexp (eg: s02e13), it is an episode - episode_transformer = get_transformer('guess_episodes_rexps') - if episode_transformer: - filename_parts = list(x.value for x in mtree.unidentified_leaves()); - filename_parts.append(filename) - for filename_part in filename_parts: - guess = episode_transformer.guess_episodes_rexps(filename_part) - if guess: - self.log.debug('Found guess_episodes_rexps: %s -> type = episode', guess) - upgrade_episode() - return filetype_container[0], other - - properties_transformer = get_transformer('guess_properties') - if properties_transformer: - # if we have certain properties characteristic of episodes, it is an ep - found = properties_transformer.container.find_properties(filename, mtree, options, 'episodeFormat') - guess = properties_transformer.container.as_guess(found, filename) - if guess: - self.log.debug('Found characteristic property of episodes: %s"', guess) - upgrade_episode() - return filetype_container[0], other - - 
weak_episode_transformer = get_transformer('guess_weak_episodes_rexps') - if weak_episode_transformer: - found = properties_transformer.container.find_properties(filename, mtree, options, 'crc32') - guess = properties_transformer.container.as_guess(found, filename) - if guess: - found = weak_episode_transformer.container.find_properties(filename, mtree, options) - guess = weak_episode_transformer.container.as_guess(found, filename) - if guess: - self.log.debug('Found characteristic property of episodes: %s"', guess) - upgrade_episode() - return filetype_container[0], other - - found = properties_transformer.container.find_properties(filename, mtree, options, 'format') - guess = properties_transformer.container.as_guess(found, filename) - if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'): - # Use weak episodes only if TV or WEB source - weak_episode_transformer = get_transformer('guess_weak_episodes_rexps') - if weak_episode_transformer: - guess = weak_episode_transformer.guess_weak_episodes_rexps(filename) - if guess: - self.log.debug('Found guess_weak_episodes_rexps: %s -> type = episode', guess) - upgrade_episode() - return filetype_container[0], other - - website_transformer = get_transformer('guess_website') - if website_transformer: - found = website_transformer.container.find_properties(filename, mtree, options, 'website') - guess = website_transformer.container.as_guess(found, filename) - if guess: - for namepart in ('tv', 'serie', 'episode'): - if namepart in guess['website']: - # origin-specific type - self.log.debug('Found characteristic property of episodes: %s', guess) - upgrade_episode() - return filetype_container[0], other - - if filetype_container[0] in ('subtitle', 'info') or (not filetype_container[0] and fileext in video_exts): - # if no episode info found, assume it's a movie - self.log.debug('Nothing characteristic found, assuming type = movie') - upgrade_movie() - - if not filetype_container[0]: - self.log.debug('Nothing 
characteristic found, assuming type = unknown') - filetype_container[0] = 'unknown' - - return filetype_container[0], other - - def process(self, mtree, options=None): - """guess the file type now (will be useful later) - """ - filetype, other = self.guess_filetype(mtree, options) - - mtree.guess.set('type', filetype, confidence=1.0) - log_found_guess(mtree.guess) - - filetype_info = Guess(other, confidence=1.0) - # guess the mimetype of the filename - # TODO: handle other mimetypes not found on the default type_maps - # mimetypes.types_map['.srt']='text/subtitle' - mime, _ = mimetypes.guess_type(mtree.string, strict=False) - if mime is not None: - filetype_info.update({'mimetype': mime}, confidence=1.0) - - node_ext = mtree.node_at((-1,)) - found_guess(node_ext, filetype_info) - - if mtree.guess.get('type') in [None, 'unknown']: - if options.get('name_only'): - mtree.guess.set('type', 'movie', confidence=0.6) - else: - raise TransformerException(__name__, 'Unknown file type') - - def post_process(self, mtree, options=None): - # now look whether there are some specific hints for episode vs movie - # If we have a date and no year, this is a TV Show. 
- if 'date' in mtree.info and 'year' not in mtree.info and mtree.info.get('type') != 'episode': - mtree.guess['type'] = 'episode' - for type_leaves in mtree.leaves_containing('type'): - type_leaves.guess['type'] = 'episode' - for title_leaves in mtree.leaves_containing('title'): - title_leaves.guess.rename('title', 'series') \ No newline at end of file diff --git a/libs/guessit/transfo/guess_idnumber.py b/libs/guessit/transfo/guess_idnumber.py deleted file mode 100644 index 30b63cbd..00000000 --- a/libs/guessit/transfo/guess_idnumber.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -import re - -_DIGIT = 0 -_LETTER = 1 -_OTHER = 2 - - -class GuessIdnumber(Transformer): - def __init__(self): - Transformer.__init__(self, 220) - - def supported_properties(self): - return ['idNumber'] - - _idnum = re.compile(r'(?P[a-zA-Z0-9-]{20,})') # 1.0, (0, 0)) - - def guess_idnumber(self, string, node=None, options=None): - match = self._idnum.search(string) - if match is not None: - result = match.groupdict() - switch_count = 0 - switch_letter_count = 0; - letter_count = 0; - last_letter = None - - last = _LETTER - for c in result['idNumber']: - if c in '0123456789': - ci = _DIGIT - elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': - ci = _LETTER - if c != last_letter: - switch_letter_count += 1 - last_letter = c - letter_count += 1 - else: - ci = _OTHER - - if ci != last: - switch_count += 1 - - last = ci - - switch_ratio = float(switch_count) / len(result['idNumber']) - letters_ratio = (float(switch_letter_count) / letter_count) if letter_count > 0 else 1 - - # only return the result as probable if we alternate often between - # char type (more likely for hash values than for common words) - if switch_ratio > 0.4 and letters_ratio > 0.4: - return result, match.span() - - return None, None - - def process(self, mtree, options=None): - GuessFinder(self.guess_idnumber, 0.4, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_language.py b/libs/guessit/transfo/guess_language.py deleted file mode 100644 index cb9787d3..00000000 --- a/libs/guessit/transfo/guess_language.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or 
modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.language import search_language, subtitle_prefixes, subtitle_suffixes -from guessit.patterns.extension import subtitle_exts -from guessit.textutils import find_words -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder - - -class GuessLanguage(Transformer): - def __init__(self): - Transformer.__init__(self, 30) - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-L', '--allowed-languages', action='append', dest='allowed_languages', - help='Allowed language (can be used multiple times)') - - def supported_properties(self): - return ['language', 'subtitleLanguage'] - - def guess_language(self, string, node=None, options=None): - allowed_languages = None - if options and 'allowed_languages' in options: - allowed_languages = options.get('allowed_languages') - guess = search_language(string, allowed_languages) - return guess - - def _skip_language_on_second_pass(self, mtree, node): - """Check if found node is a valid language node, or if it's a false positive. - - :param mtree: Tree detected on first pass. 
- :type mtree: :class:`guessit.matchtree.MatchTree` - :param node: Node that contains a language Guess - :type node: :class:`guessit.matchtree.MatchTree` - - :return: True if a second pass skipping this node is required - :rtype: bool - """ - unidentified_starts = {} - unidentified_ends = {} - - property_starts = {} - property_ends = {} - - title_starts = {} - title_ends = {} - - for unidentified_node in mtree.unidentified_leaves(): - unidentified_starts[unidentified_node.span[0]] = unidentified_node - unidentified_ends[unidentified_node.span[1]] = unidentified_node - - for property_node in mtree.leaves_containing('year'): - property_starts[property_node.span[0]] = property_node - property_ends[property_node.span[1]] = property_node - - for title_node in mtree.leaves_containing(['title', 'series']): - title_starts[title_node.span[0]] = title_node - title_ends[title_node.span[1]] = title_node - - return node.span[0] in title_ends.keys() and (node.span[1] in unidentified_starts.keys() or node.span[1] + 1 in property_starts.keys()) or\ - node.span[1] in title_starts.keys() and (node.span[0] == node.group_node().span[0] or node.span[0] in unidentified_ends.keys() or node.span[0] in property_ends.keys()) - - def second_pass_options(self, mtree, options=None): - m = mtree.matched() - to_skip_language_nodes = [] - - for lang_key in ('language', 'subtitleLanguage'): - langs = {} - lang_nodes = set(mtree.leaves_containing(lang_key)) - - for lang_node in lang_nodes: - lang = lang_node.guess.get(lang_key, None) - if self._skip_language_on_second_pass(mtree, lang_node): - # Language probably split the title. Add to skip for 2nd pass. 
- - # if filetype is subtitle and the language appears last, just before - # the extension, then it is likely a subtitle language - parts = mtree.clean_string(lang_node.root.value).split() - if m.get('type') in ['moviesubtitle', 'episodesubtitle']: - if lang_node.value in parts and \ - (parts.index(lang_node.value) == len(parts) - 2): - continue - to_skip_language_nodes.append(lang_node) - elif lang not in langs: - langs[lang] = lang_node - else: - # The same language was found. Keep the more confident one, - # and add others to skip for 2nd pass. - existing_lang_node = langs[lang] - to_skip = None - if (existing_lang_node.guess.confidence('language') >= - lang_node.guess.confidence('language')): - # lang_node is to remove - to_skip = lang_node - else: - # existing_lang_node is to remove - langs[lang] = lang_node - to_skip = existing_lang_node - to_skip_language_nodes.append(to_skip) - - if to_skip_language_nodes: - # Also skip same value nodes - skipped_values = [skip_node.value for skip_node in to_skip_language_nodes] - - for lang_key in ('language', 'subtitleLanguage'): - lang_nodes = set(mtree.leaves_containing(lang_key)) - - for lang_node in lang_nodes: - if lang_node not in to_skip_language_nodes and lang_node.value in skipped_values: - to_skip_language_nodes.append(lang_node) - return {'skip_nodes': to_skip_language_nodes} - return None - - def should_process(self, mtree, options=None): - options = options or {} - return options.get('language', True) - - def process(self, mtree, options=None): - GuessFinder(self.guess_language, None, self.log, options).process_nodes(mtree.unidentified_leaves()) - - def promote_subtitle(self, node): - if 'language' in node.guess: - node.guess.set('subtitleLanguage', node.guess['language'], - confidence=node.guess.confidence('language')) - del node.guess['language'] - - def post_process(self, mtree, options=None): - # 1- try to promote language to subtitle language where it makes sense - for node in mtree.nodes(): - if 
'language' not in node.guess: - continue - - # - if we matched a language in a file with a sub extension and that - # the group is the last group of the filename, it is probably the - # language of the subtitle - # (eg: 'xxx.english.srt') - if (mtree.node_at((-1,)).value.lower() in subtitle_exts and - node == list(mtree.leaves())[-2]): - self.promote_subtitle(node) - - # - if we find in the same explicit group - # a subtitle prefix before the language, - # or a subtitle suffix after the language, - # then upgrade the language - explicit_group = mtree.node_at(node.node_idx[:2]) - group_str = explicit_group.value.lower() - - for sub_prefix in subtitle_prefixes: - if (sub_prefix in find_words(group_str) and - 0 <= group_str.find(sub_prefix) < (node.span[0] - explicit_group.span[0])): - self.promote_subtitle(node) - - for sub_suffix in subtitle_suffixes: - if (sub_suffix in find_words(group_str) and - (node.span[0] - explicit_group.span[0]) < group_str.find(sub_suffix)): - self.promote_subtitle(node) - - # - if a language is in an explicit group just preceded by "st", - # it is a subtitle language (eg: '...st[fr-eng]...') - try: - idx = node.node_idx - previous = list(mtree.node_at((idx[0], idx[1] - 1)).leaves())[-1] - if previous.value.lower()[-2:] == 'st': - self.promote_subtitle(node) - except IndexError: - pass diff --git a/libs/guessit/transfo/guess_movie_title_from_position.py b/libs/guessit/transfo/guess_movie_title_from_position.py deleted file mode 100644 index 671e4cb5..00000000 --- a/libs/guessit/transfo/guess_movie_title_from_position.py +++ /dev/null @@ -1,173 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any 
later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import found_property -from guessit import u - - -class GuessMovieTitleFromPosition(Transformer): - def __init__(self): - Transformer.__init__(self, -200) - - def supported_properties(self): - return ['title'] - - def should_process(self, mtree, options=None): - options = options or {} - return not options.get('skip_title') and not mtree.guess.get('type', '').startswith('episode') - - def process(self, mtree, options=None): - """ - try to identify the remaining unknown groups by looking at their - position relative to other known elements - """ - if 'title' in mtree.info: - return - - basename = mtree.node_at((-2,)) - all_valid = lambda leaf: len(leaf.clean_value) > 0 - basename_leftover = list(basename.unidentified_leaves(valid=all_valid)) - - try: - folder = mtree.node_at((-3,)) - folder_leftover = list(folder.unidentified_leaves()) - except ValueError: - folder = None - folder_leftover = [] - - self.log.debug('folder: %s' % u(folder_leftover)) - self.log.debug('basename: %s' % u(basename_leftover)) - - # specific cases: - # if we find the same group both in the folder name and the filename, - # it's a good candidate for title - if folder_leftover and basename_leftover and folder_leftover[0].clean_value == basename_leftover[0].clean_value: - found_property(folder_leftover[0], 'title', confidence=0.8) - return - - # specific cases: - # if the basename contains a number first followed by an unidentified - # group, and the 
folder only contains 1 unidentified one, then we have - # a series - # ex: Millenium Trilogy (2009)/(1)The Girl With The Dragon Tattoo(2009).mkv - if len(folder_leftover) > 0 and len(basename_leftover) > 1: - series = folder_leftover[0] - film_number = basename_leftover[0] - title = basename_leftover[1] - - basename_leaves = list(basename.leaves()) - - num = None - try: - num = int(film_number.clean_value) - except ValueError: - pass - - if num: - self.log.debug('series: %s' % series.clean_value) - self.log.debug('title: %s' % title.clean_value) - if (series.clean_value != title.clean_value and - series.clean_value != film_number.clean_value and - basename_leaves.index(film_number) == 0 and - basename_leaves.index(title) == 1): - - found_property(title, 'title', confidence=0.6) - found_property(series, 'filmSeries', confidence=0.6) - found_property(film_number, 'filmNumber', num, confidence=0.6) - return - - if folder: - year_group = folder.first_leaf_containing('year') - if year_group: - groups_before = folder.previous_unidentified_leaves(year_group) - if groups_before: - try: - node = next(groups_before) - found_property(node, 'title', confidence=0.8) - return - except StopIteration: - pass - - # if we have either format or videoCodec in the folder containing the - # file or one of its parents, then we should probably look for the title - # in there rather than in the basename - try: - props = list(mtree.previous_leaves_containing(mtree.children[-2], - ['videoCodec', - 'format', - 'language'])) - except IndexError: - props = [] - - if props: - group_idx = props[0].node_idx[0] - if all(g.node_idx[0] == group_idx for g in props): - # if they're all in the same group, take leftover info from there - leftover = mtree.node_at((group_idx,)).unidentified_leaves() - try: - found_property(next(leftover), 'title', confidence=0.7) - return - except StopIteration: - pass - - # look for title in basename if there are some remaining unidentified - # groups there - if 
basename_leftover: - # if basename is only one word and the containing folder has at least - # 3 words in it, we should take the title from the folder name - # ex: Movies/Alice in Wonderland DVDRip.XviD-DiAMOND/dmd-aw.avi - # ex: Movies/Somewhere.2010.DVDRip.XviD-iLG/i-smwhr.avi <-- TODO: gets caught here? - if (basename_leftover[0].clean_value.count(' ') == 0 and - folder_leftover and folder_leftover[0].clean_value.count(' ') >= 2): - - found_property(folder_leftover[0], 'title', confidence=0.7) - return - - # if there are only many unidentified groups, take the first of which is - # not inside brackets or parentheses. - # ex: Movies/[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi - if basename_leftover[0].is_explicit(): - for basename_leftover_elt in basename_leftover: - if not basename_leftover_elt.is_explicit(): - found_property(basename_leftover_elt, 'title', confidence=0.8) - return - - # if all else fails, take the first remaining unidentified group in the - # basename as title - found_property(basename_leftover[0], 'title', confidence=0.6) - return - - # if there are no leftover groups in the basename, look in the folder name - if folder_leftover: - found_property(folder_leftover[0], 'title', confidence=0.5) - return - - # if nothing worked, look if we have a very small group at the beginning - # of the basename - basename = mtree.node_at((-2,)) - basename_leftover = basename.unidentified_leaves(valid=lambda leaf: True) - try: - found_property(next(basename_leftover), 'title', confidence=0.4) - return - except StopIteration: - pass diff --git a/libs/guessit/transfo/guess_properties.py b/libs/guessit/transfo/guess_properties.py deleted file mode 100644 index 01aecddc..00000000 --- a/libs/guessit/transfo/guess_properties.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it 
and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.containers import PropertiesContainer, WeakValidator, LeavesValidator, QualitiesContainer, NoValidator, \ - ChainedValidator, DefaultValidator, OnlyOneValidator, LeftValidator, NeighborValidator -from guessit.patterns import sep, build_or_pattern -from guessit.patterns.extension import subtitle_exts, video_exts, info_exts -from guessit.patterns.numeral import numeral, parse_numeral -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder, found_property -import re - - -class GuessProperties(Transformer): - def __init__(self): - Transformer.__init__(self, 35) - - self.container = PropertiesContainer() - self.qualities = QualitiesContainer() - - def register_property(propname, props, **kwargs): - """props a dict of {value: [patterns]}""" - for canonical_form, patterns in props.items(): - if isinstance(patterns, tuple): - patterns2, pattern_kwarg = patterns - if kwargs: - current_kwarg = dict(kwargs) - current_kwarg.update(pattern_kwarg) - else: - current_kwarg = dict(pattern_kwarg) - current_kwarg['canonical_form'] = canonical_form - self.container.register_property(propname, *patterns2, **current_kwarg) - elif kwargs: - current_kwarg = dict(kwargs) - current_kwarg['canonical_form'] = canonical_form - self.container.register_property(propname, *patterns, 
**current_kwarg) - else: - self.container.register_property(propname, *patterns, canonical_form=canonical_form) - - def register_quality(propname, quality_dict): - """props a dict of {canonical_form: quality}""" - for canonical_form, quality in quality_dict.items(): - self.qualities.register_quality(propname, canonical_form, quality) - - register_property('container', {'mp4': ['MP4']}) - - # http://en.wikipedia.org/wiki/Pirated_movie_release_types - register_property('format', {'VHS': ['VHS', 'VHS-Rip'], - 'Cam': ['CAM', 'CAMRip', 'HD-CAM'], - #'Telesync': ['TELESYNC', 'PDVD'], - 'Telesync': (['TS', 'HD-TS'], {'confidence': 0.4}), - 'Workprint': ['WORKPRINT', 'WP'], - 'Telecine': ['TELECINE', 'TC'], - 'PPV': ['PPV', 'PPV-Rip'], # Pay Per View - 'TV': ['SD-TV', 'SD-TV-Rip', 'Rip-SD-TV', 'TV-Rip', 'Rip-TV'], - 'DVB': ['DVB-Rip', 'DVB', 'PD-TV'], - 'DVD': ['DVD', 'DVD-Rip', 'VIDEO-TS', 'DVD-R', 'DVD-9', 'DVD-5'], - 'HDTV': ['HD-TV', 'TV-RIP-HD', 'HD-TV-RIP'], - 'VOD': ['VOD', 'VOD-Rip'], - 'WEBRip': ['WEB-Rip'], - 'WEB-DL': ['WEB-DL', 'WEB-HD', 'WEB'], - 'HD-DVD': ['HD-(?:DVD)?-Rip', 'HD-DVD'], - 'BluRay': ['Blu-ray(?:-Rip)?', 'B[DR]', 'B[DR]-Rip', 'BD[59]', 'BD25', 'BD50'] - }) - - register_quality('format', {'VHS': -100, - 'Cam': -90, - 'Telesync': -80, - 'Workprint': -70, - 'Telecine': -60, - 'PPV': -50, - 'TV': -30, - 'DVB': -20, - 'DVD': 0, - 'HDTV': 20, - 'VOD': 40, - 'WEBRip': 50, - 'WEB-DL': 60, - 'HD-DVD': 80, - 'BluRay': 100 - }) - - register_property('screenSize', {'360p': ['(?:\d{3,}(?:\\|\/|x|\*))?360(?:i|p?x?)'], - '368p': ['(?:\d{3,}(?:\\|\/|x|\*))?368(?:i|p?x?)'], - '480p': ['(?:\d{3,}(?:\\|\/|x|\*))?480(?:i|p?x?)'], - #'480p': (['hr'], {'confidence': 0.2}), # duplicate dict key - '576p': ['(?:\d{3,}(?:\\|\/|x|\*))?576(?:i|p?x?)'], - '720p': ['(?:\d{3,}(?:\\|\/|x|\*))?720(?:i|p?x?)'], - '900p': ['(?:\d{3,}(?:\\|\/|x|\*))?900(?:i|p?x?)'], - '1080i': ['(?:\d{3,}(?:\\|\/|x|\*))?1080i'], - '1080p': ['(?:\d{3,}(?:\\|\/|x|\*))?1080p?x?'], - '4K': 
['(?:\d{3,}(?:\\|\/|x|\*))?2160(?:i|p?x?)'] - }, - validator=ChainedValidator(DefaultValidator(), OnlyOneValidator())) - - class ResolutionValidator(object): - """Make sure our match is surrounded by separators, or by another entry""" - def validate(self, prop, string, node, match, entry_start, entry_end): - """ - span = _get_span(prop, match) - span = _trim_span(span, string[span[0]:span[1]]) - start, end = span - - sep_start = start <= 0 or string[start - 1] in sep - sep_end = end >= len(string) or string[end] in sep - start_by_other = start in entry_end - end_by_other = end in entry_start - if (sep_start or start_by_other) and (sep_end or end_by_other): - return True - return False - """ - return True - - _digits_re = re.compile('\d+') - - def resolution_formatter(value): - digits = _digits_re.findall(value) - return 'x'.join(digits) - - self.container.register_property('screenSize', '\d{3,4}-?[x\*]-?\d{3,4}', canonical_from_pattern=False, formatter=resolution_formatter, validator=ChainedValidator(DefaultValidator(), ResolutionValidator())) - - register_quality('screenSize', {'360p': -300, - '368p': -200, - '480p': -100, - '576p': 0, - '720p': 100, - '900p': 130, - '1080i': 180, - '1080p': 200, - '4K': 400 - }) - - _videoCodecProperty = {'Real': ['Rv\d{2}'], # http://en.wikipedia.org/wiki/RealVideo - 'Mpeg2': ['Mpeg2'], - 'DivX': ['DVDivX', 'DivX'], - 'XviD': ['XviD'], - 'h264': ['[hx]-264(?:-AVC)?', 'MPEG-4(?:-AVC)'], - 'h265': ['[hx]-265(?:-HEVC)?', 'HEVC'] - } - - register_property('videoCodec', _videoCodecProperty) - - register_quality('videoCodec', {'Real': -50, - 'Mpeg2': -30, - 'DivX': -10, - 'XviD': 0, - 'h264': 100, - 'h265': 150 - }) - - # http://blog.mediacoderhq.com/h264-profiles-and-levels/ - # http://fr.wikipedia.org/wiki/H.264 - self.container.register_property('videoProfile', 'BP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'XP', 'EP', canonical_form='XP', 
validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'MP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'HP', 'HiP', canonical_form='HP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', '10.?bit', 'Hi10P', canonical_form='10bit') - self.container.register_property('videoProfile', '8.?bit', canonical_form='8bit') - self.container.register_property('videoProfile', 'Hi422P', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - self.container.register_property('videoProfile', 'Hi444PP', validator=LeavesValidator(lambdas=[lambda node: 'videoCodec' in node.guess])) - - register_quality('videoProfile', {'BP': -20, - 'XP': -10, - 'MP': 0, - 'HP': 10, - '10bit': 15, - 'Hi422P': 25, - 'Hi444PP': 35 - }) - - # has nothing to do here (or on filenames for that matter), but some - # releases use it and it helps to identify release groups, so we adapt - register_property('videoApi', {'DXVA': ['DXVA']}) - - register_property('audioCodec', {'MP3': ['MP3', 'LAME', 'LAME(?:\d)+-(?:\d)+'], - 'DolbyDigital': ['DD'], - 'AAC': ['AAC'], - 'AC3': ['AC3'], - 'Flac': ['FLAC'], - 'DTS': (['DTS'], {'validator': LeftValidator()}), - 'TrueHD': ['True-HD'] - }) - - register_quality('audioCodec', {'MP3': 10, - 'DolbyDigital': 30, - 'AAC': 35, - 'AC3': 40, - 'Flac': 45, - 'DTS': 60, - 'TrueHD': 70 - }) - - self.container.register_property('audioProfile', 'HD', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) - self.container.register_property('audioProfile', 'HD-MA', canonical_form='HDMA', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'DTS'])) - self.container.register_property('audioProfile', 'HE', validator=LeavesValidator(lambdas=[lambda node: 
node.guess.get('audioCodec') == 'AAC'])) - self.container.register_property('audioProfile', 'LC', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AAC'])) - self.container.register_property('audioProfile', 'HQ', validator=LeavesValidator(lambdas=[lambda node: node.guess.get('audioCodec') == 'AC3'])) - - register_quality('audioProfile', {'HD': 20, - 'HDMA': 50, - 'LC': 0, - 'HQ': 0, - 'HE': 20 - }) - - register_property('audioChannels', {'7.1': ['7[\W_]1', '7ch', '8ch'], - '5.1': ['5[\W_]1', '5ch', '6ch'], - '2.0': ['2[\W_]0', '2ch', 'stereo'], - '1.0': ['1[\W_]0', '1ch', 'mono'] - }) - - register_quality('audioChannels', {'7.1': 200, - '5.1': 100, - '2.0': 0, - '1.0': -100 - }) - - self.container.register_property('episodeFormat', r'Minisodes?', canonical_form='Minisode') - - self.container.register_property('crc32', '(?:[a-fA-F]|[0-9]){8}', enhance=False, canonical_from_pattern=False) - - weak_episode_words = ['pt', 'part'] - self.container.register_property(None, '(' + build_or_pattern(weak_episode_words) + sep + '?(?P' + numeral + '))[^0-9]', enhance=False, canonical_from_pattern=False, confidence=0.4, formatter=parse_numeral) - - register_property('other', {'AudioFix': ['Audio-Fix', 'Audio-Fixed'], - 'SyncFix': ['Sync-Fix', 'Sync-Fixed'], - 'DualAudio': ['Dual-Audio'], - 'WideScreen': ['ws', 'wide-screen'], - 'Netflix': ['Netflix', 'NF'] - }) - - self.container.register_property('other', 'Real', 'Fix', canonical_form='Proper', validator=NeighborValidator()) - self.container.register_property('other', 'Proper', 'Repack', 'Rerip', canonical_form='Proper') - self.container.register_property('other', 'Fansub', canonical_form='Fansub') - self.container.register_property('other', 'Fastsub', canonical_form='Fastsub') - self.container.register_property('other', '(?:Seasons?' 
+ sep + '?)?Complete', canonical_form='Complete') - self.container.register_property('other', 'R5', 'RC', canonical_form='R5') - self.container.register_property('other', 'Pre-Air', 'Preair', canonical_form='Preair') - - self.container.register_canonical_properties('other', 'Screener', 'Remux', '3D', 'HD', 'mHD', 'HDLight', 'HQ', - 'DDC', - 'HR', 'PAL', 'SECAM', 'NTSC') - self.container.register_canonical_properties('other', 'Limited', 'Complete', 'Classic', 'Unrated', 'LiNE', 'Bonus', 'Trailer', validator=WeakValidator()) - - for prop in self.container.get_properties('format'): - self.container.register_property('other', prop.pattern + '(-?Scr(?:eener)?)', canonical_form='Screener') - - for exts in (subtitle_exts, info_exts, video_exts): - for container in exts: - self.container.register_property('container', container, confidence=0.3) - - def guess_properties(self, string, node=None, options=None): - found = self.container.find_properties(string, node, options) - return self.container.as_guess(found, string) - - def supported_properties(self): - return self.container.get_supported_properties() - - def process(self, mtree, options=None): - GuessFinder(self.guess_properties, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) - proper_count = 0 - for other_leaf in mtree.leaves_containing('other'): - if 'other' in other_leaf.info and 'Proper' in other_leaf.info['other']: - proper_count += 1 - if proper_count: - found_property(mtree, 'properCount', proper_count) - - def rate_quality(self, guess, *props): - return self.qualities.rate_quality(guess, *props) diff --git a/libs/guessit/transfo/guess_release_group.py b/libs/guessit/transfo/guess_release_group.py deleted file mode 100644 index 646c7128..00000000 --- a/libs/guessit/transfo/guess_release_group.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is 
free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder, build_guess -from guessit.containers import PropertiesContainer -from guessit.patterns import sep -from guessit.guess import Guess -from guessit.textutils import strip_brackets -import re - - -class GuessReleaseGroup(Transformer): - def __init__(self): - Transformer.__init__(self, -190) - - self.container = PropertiesContainer(canonical_from_pattern=False) - self._allowed_groupname_pattern = '[\w@#€£$&!\?]' - self._forbidden_groupname_lambda = [lambda elt: elt in ['rip', 'by', 'for', 'par', 'pour', 'bonus'], - lambda elt: self._is_number(elt)] - # If the previous property in this list, the match will be considered as safe - # and group name can contain a separator. 
- self.previous_safe_properties = ['videoCodec', 'format', 'videoApi', 'audioCodec', 'audioProfile', 'videoProfile', 'audioChannels', 'other'] - self.previous_safe_values = {'other': ['Complete']} - self.next_safe_properties = ['extension', 'website'] - self.next_safe_values = {'format': ['Telesync']} - self.container.sep_replace_char = '-' - self.container.canonical_from_pattern = False - self.container.enhance = True - self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+') - self.container.register_property('releaseGroup', self._allowed_groupname_pattern + '+-' + self._allowed_groupname_pattern + '+') - self.re_sep = re.compile('(' + sep + ')') - - def register_arguments(self, opts, naming_opts, output_opts, information_opts, webservice_opts, other_options): - naming_opts.add_argument('-G', '--expected-group', action='append', dest='expected_group', - help='Expected release group (can be used multiple times)') - - def supported_properties(self): - return self.container.get_supported_properties() - - def _is_number(self, s): - try: - int(s) - return True - except ValueError: - return False - - def validate_group_name(self, guess): - val = guess['releaseGroup'] - if len(val) > 1: - checked_val = "" - forbidden = False - for elt in self.re_sep.split(val): # separators are in the list because of capturing group - if forbidden: - # Previous was forbidden, don't had separator - forbidden = False - continue - for forbidden_lambda in self._forbidden_groupname_lambda: - forbidden = forbidden_lambda(elt.lower()) - if forbidden: - if checked_val: - # Removing previous separator - checked_val = checked_val[0:len(checked_val) - 1] - break - if not forbidden: - checked_val += elt - - val = checked_val - if not val: - return False - if self.re_sep.match(val[-1]): - val = val[:len(val)-1] - if self.re_sep.match(val[0]): - val = val[1:] - guess['releaseGroup'] = val - forbidden = False - for forbidden_lambda in self._forbidden_groupname_lambda: - 
forbidden = forbidden_lambda(val.lower()) - if forbidden: - break - if not forbidden: - return True - return False - - def is_leaf_previous(self, leaf, node): - if leaf.span[1] <= node.span[0]: - for idx in range(leaf.span[1], node.span[0]): - if leaf.root.value[idx] not in sep: - return False - return True - return False - - def validate_next_leaves(self, node): - if 'series' in node.root.info or 'title' in node.root.info: - # --expected-series or --expected-title is used. - return True - - # Make sure to avoid collision with 'series' or 'title' guessed later. Should be more precise. - leaves = node.root.unidentified_leaves() - return len(list(leaves)) > 1 - - def validate_node(self, leaf, node, safe=False): - if not self.is_leaf_previous(leaf, node): - return False - if not self.validate_next_leaves(node): - return False - if safe: - for k, v in leaf.guess.items(): - if k in self.previous_safe_values and not v in self.previous_safe_values[k]: - return False - return True - - def guess_release_group(self, string, node=None, options=None): - if options and options.get('expected_group'): - expected_container = PropertiesContainer(enhance=True, canonical_from_pattern=False) - for expected_group in options.get('expected_group'): - if expected_group.startswith('re:'): - expected_group = expected_group[3:] - expected_group = expected_group.replace(' ', '-') - expected_container.register_property('releaseGroup', expected_group, enhance=True) - else: - expected_group = re.escape(expected_group) - expected_container.register_property('releaseGroup', expected_group, enhance=False) - - found = expected_container.find_properties(string, node, options, 'releaseGroup') - guess = expected_container.as_guess(found, string, self.validate_group_name) - if guess: - return guess - - found = self.container.find_properties(string, node, options, 'releaseGroup') - guess = self.container.as_guess(found, string, self.validate_group_name) - validated_guess = None - if guess: - group_node = 
node.group_node() - if group_node: - for leaf in group_node.leaves_containing(self.previous_safe_properties): - if self.validate_node(leaf, node, True): - if leaf.root.value[leaf.span[1]] == '-': - guess.metadata().confidence = 1 - else: - guess.metadata().confidence = 0.7 - validated_guess = guess - - if not validated_guess: - # If previous group last leaf is identified as a safe property, - # consider the raw value as a releaseGroup - previous_group_node = node.previous_group_node() - if previous_group_node: - for leaf in previous_group_node.leaves_containing(self.previous_safe_properties): - if self.validate_node(leaf, node, False): - guess = Guess({'releaseGroup': node.value}, confidence=1, input=node.value, span=(0, len(node.value))) - if self.validate_group_name(guess): - node.guess = guess - validated_guess = guess - - if validated_guess: - # If following group nodes have only one unidentified leaf, it belongs to the release group - next_group_node = node - - while True: - next_group_node = next_group_node.next_group_node() - if next_group_node: - leaves = list(next_group_node.leaves()) - if len(leaves) == 1 and not leaves[0].guess: - validated_guess['releaseGroup'] = validated_guess['releaseGroup'] + leaves[0].value - leaves[0].guess = validated_guess - else: - break - else: - break - - if not validated_guess and node.is_explicit() and node.node_last_idx == 0: # first node from group - validated_guess = build_guess(node, 'releaseGroup', value=node.value[1:len(node.value)-1]) - validated_guess.metadata().confidence = 0.4 - validated_guess.metadata().span = 1, len(node.value) - node.guess = validated_guess - - if validated_guess: - # Strip brackets - validated_guess['releaseGroup'] = strip_brackets(validated_guess['releaseGroup']) - - return validated_guess - - def process(self, mtree, options=None): - GuessFinder(self.guess_release_group, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git 
a/libs/guessit/transfo/guess_video_rexps.py b/libs/guessit/transfo/guess_video_rexps.py deleted file mode 100644 index b1dca8ee..00000000 --- a/libs/guessit/transfo/guess_video_rexps.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, \ - unicode_literals - -from guessit.patterns import _psep -from guessit.containers import PropertiesContainer -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns.numeral import parse_numeral - - -class GuessVideoRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 25) - - self.container = PropertiesContainer(canonical_from_pattern=False) - - self.container.register_property(None, 'cd' + _psep + '(?P[0-9])(?:' + _psep + 'of' + _psep + '(?P[0-9]))?', confidence=1.0, enhance=False, global_span=True, formatter=parse_numeral) - self.container.register_property('cdNumberTotal', '([1-9])' + _psep + 'cds?', confidence=0.9, enhance=False, formatter=parse_numeral) - - self.container.register_property('bonusNumber', 'x([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) - - self.container.register_property('filmNumber', 
'f([0-9]{1,2})', enhance=False, global_span=True, formatter=parse_numeral) - - self.container.register_property('edition', 'collector', 'collector-edition', 'edition-collector', canonical_form='Collector Edition') - self.container.register_property('edition', 'special-edition', 'edition-special', canonical_form='Special Edition') - self.container.register_property('edition', 'criterion', 'criterion-edition', 'edition-criterion', canonical_form='Criterion Edition') - self.container.register_property('edition', 'deluxe', 'cdeluxe-edition', 'edition-deluxe', canonical_form='Deluxe Edition') - self.container.register_property('edition', 'director\'?s?-cut', 'director\'?s?-cut-edition', 'edition-director\'?s?-cut', canonical_form='Director\'s cut') - - def supported_properties(self): - return self.container.get_supported_properties() - - def guess_video_rexps(self, string, node=None, options=None): - found = self.container.find_properties(string, node, options) - return self.container.as_guess(found, string) - - def process(self, mtree, options=None): - GuessFinder(self.guess_video_rexps, None, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_weak_episodes_rexps.py b/libs/guessit/transfo/guess_weak_episodes_rexps.py deleted file mode 100644 index 93d7a7bb..00000000 --- a/libs/guessit/transfo/guess_weak_episodes_rexps.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.patterns import sep, build_or_pattern -from guessit.containers import PropertiesContainer, LeavesValidator, NoValidator, WeakValidator -from guessit.patterns.numeral import numeral, parse_numeral -from guessit.date import valid_year - -import re - - -class GuessWeakEpisodesRexps(Transformer): - def __init__(self): - Transformer.__init__(self, 15) - - of_separators = ['of', 'sur', '/', '\\'] - of_separators_re = re.compile(build_or_pattern(of_separators, escape=True), re.IGNORECASE) - - self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - - episode_words = ['episodes?'] - - def _formater(episode_number): - epnum = parse_numeral(episode_number) - if not valid_year(epnum): - if epnum > 100: - season, epnum = epnum // 100, epnum % 100 - # episodes which have a season > 50 are most likely errors - # (Simpson is at 25!) 
- if season > 50: - return None - return {'season': season, 'episodeNumber': epnum} - else: - return epnum - - self.container.register_property(['episodeNumber', 'season'], '[0-9]{2,4}', confidence=0.6, formatter=_formater, disabler=lambda options: options.get('episode_prefer_number') if options else False) - self.container.register_property(['episodeNumber', 'season'], '[0-9]{4}', confidence=0.6, formatter=_formater) - self.container.register_property('episodeNumber', '[^0-9](\d{1,3})', confidence=0.6, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) - self.container.register_property(None, '(' + build_or_pattern(episode_words) + sep + '?(?P' + numeral + '))[^0-9]', confidence=0.4, formatter=parse_numeral) - self.container.register_property(None, r'(?P' + numeral + ')' + sep + '?' + of_separators_re.pattern + sep + '?(?P' + numeral +')', confidence=0.6, formatter=parse_numeral) - self.container.register_property('episodeNumber', r'^' + sep + '?(\d{1,3})' + sep, confidence=0.4, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) - self.container.register_property('episodeNumber', sep + r'(\d{1,3})' + sep + '?$', confidence=0.4, formatter=parse_numeral, disabler=lambda options: not options.get('episode_prefer_number') if options else True) - - def supported_properties(self): - return self.container.get_supported_properties() - - def guess_weak_episodes_rexps(self, string, node=None, options=None): - if node and 'episodeNumber' in node.root.info: - return None - - properties = self.container.find_properties(string, node, options) - guess = self.container.as_guess(properties, string) - - return guess - - def should_process(self, mtree, options=None): - return mtree.guess.get('type', '').startswith('episode') - - def process(self, mtree, options=None): - GuessFinder(self.guess_weak_episodes_rexps, 0.6, self.log, 
options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_website.py b/libs/guessit/transfo/guess_website.py deleted file mode 100644 index aa33226b..00000000 --- a/libs/guessit/transfo/guess_website.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Rémi Alvergnat -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals -from guessit.patterns import build_or_pattern -from guessit.containers import PropertiesContainer -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from pkg_resources import resource_stream # @UnresolvedImport - -TLDS = [l.strip().decode('utf-8') - for l in resource_stream('guessit', 'tlds-alpha-by-domain.txt').readlines() - if b'--' not in l][1:] - - -class GuessWebsite(Transformer): - def __init__(self): - Transformer.__init__(self, 45) - - self.container = PropertiesContainer(enhance=False, canonical_from_pattern=False) - - tlds_pattern = build_or_pattern(TLDS) # All registered domain extension - safe_tlds_pattern = build_or_pattern(['com', 'org', 'net']) # For sure a website extension - safe_subdomains_pattern = build_or_pattern(['www']) # For sure a website subdomain - safe_prefix_tlds_pattern = build_or_pattern(['co', 'com', 'org', 'net']) # Those words before a tlds are sure - - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)+' + r'(?:[a-z-]+\.)+' + r'(?:' + tlds_pattern + r')+') - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' + r'(?:' + safe_tlds_pattern + r')+') - self.container.register_property('website', '(?:' + safe_subdomains_pattern + '\.)*' + r'[a-z-]+\.' 
+ r'(?:' + safe_prefix_tlds_pattern + r'\.)+' + r'(?:' + tlds_pattern + r')+') - - def supported_properties(self): - return self.container.get_supported_properties() - - def guess_website(self, string, node=None, options=None): - found = self.container.find_properties(string, node, options, 'website') - return self.container.as_guess(found, string) - - def process(self, mtree, options=None): - GuessFinder(self.guess_website, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) diff --git a/libs/guessit/transfo/guess_year.py b/libs/guessit/transfo/guess_year.py deleted file mode 100644 index 61363da5..00000000 --- a/libs/guessit/transfo/guess_year.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . 
-# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.matcher import GuessFinder -from guessit.date import search_year, valid_year - - -class GuessYear(Transformer): - def __init__(self): - Transformer.__init__(self, -160) - - def supported_properties(self): - return ['year'] - - def guess_year(self, string, node=None, options=None): - year, span = search_year(string) - if year: - return {'year': year}, span - else: - return None, None - - def second_pass_options(self, mtree, options=None): - year_nodes = list(mtree.leaves_containing('year')) - if len(year_nodes) > 1: - return {'skip_nodes': year_nodes[:len(year_nodes) - 1]} - return None - - def process(self, mtree, options=None): - GuessFinder(self.guess_year, 1.0, self.log, options).process_nodes(mtree.unidentified_leaves()) - - # if we found a season number that is a valid year, it is usually safe to assume - # we can also set the year property to that value - for n in mtree.leaves_containing('season'): - g = n.guess - season = g['season'] - if valid_year(season): - g['year'] = season diff --git a/libs/guessit/transfo/split_explicit_groups.py b/libs/guessit/transfo/split_explicit_groups.py deleted file mode 100644 index 67d54cfb..00000000 --- a/libs/guessit/transfo/split_explicit_groups.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.textutils import find_first_level_groups -from guessit.patterns import group_delimiters -from functools import reduce - - -class SplitExplicitGroups(Transformer): - def __init__(self): - Transformer.__init__(self, 250) - - def process(self, mtree, options=None): - """split each of those into explicit groups (separated by parentheses or square brackets) - - :return: return the string split into explicit groups, that is, those either - between parenthese, square brackets or curly braces, and those separated - by a dash.""" - for c in mtree.children: - groups = find_first_level_groups(c.value, group_delimiters[0]) - for delimiters in group_delimiters: - flatten = lambda l, x: l + find_first_level_groups(x, delimiters) - groups = reduce(flatten, groups, []) - - # do not do this at this moment, it is not strong enough and can break other - # patterns, such as dates, etc... - # groups = functools.reduce(lambda l, x: l + x.split('-'), groups, []) - - c.split_on_components(groups) diff --git a/libs/guessit/transfo/split_on_dash.py b/libs/guessit/transfo/split_on_dash.py deleted file mode 100644 index e86c6a3f..00000000 --- a/libs/guessit/transfo/split_on_dash.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. 
-# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. -# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit.patterns import sep -import re - - -class SplitOnDash(Transformer): - def __init__(self): - Transformer.__init__(self, 245) - - def process(self, mtree, options=None): - """split into '-' separated subgroups (with required separator chars - around the dash) - """ - for node in mtree.unidentified_leaves(): - indices = [] - - pattern = re.compile(sep + '-' + sep) - match = pattern.search(node.value) - while match: - span = match.span() - indices.extend([span[0], span[1]]) - match = pattern.search(node.value, span[1]) - - if indices: - node.partition(indices) diff --git a/libs/guessit/transfo/split_path_components.py b/libs/guessit/transfo/split_path_components.py deleted file mode 100644 index c630a30c..00000000 --- a/libs/guessit/transfo/split_path_components.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GuessIt - A library for guessing information from filenames -# Copyright (c) 2013 Nicolas Wack -# -# GuessIt is free software; you can redistribute it and/or modify it under -# the terms of the Lesser GNU General Public License as published by -# the Free Software Foundation; either version 3 of the License, or -# (at your option) any later version. -# -# GuessIt is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# Lesser GNU General Public License for more details. 
-# -# You should have received a copy of the Lesser GNU General Public License -# along with this program. If not, see . -# - -from __future__ import absolute_import, division, print_function, unicode_literals - -from guessit.plugins.transformers import Transformer -from guessit import fileutils -from os.path import splitext - - -class SplitPathComponents(Transformer): - def __init__(self): - Transformer.__init__(self, 255) - - def process(self, mtree, options=None): - """first split our path into dirs + basename + ext - - :return: the filename split into [ dir*, basename, ext ] - """ - if not options.get('name_only'): - components = fileutils.split_path(mtree.value) - basename = components.pop(-1) - components += list(splitext(basename)) - components[-1] = components[-1][1:] # remove the '.' from the extension - - mtree.split_on_components(components) - else: - mtree.split_on_components([mtree.value, '']) diff --git a/libs/guessit/yamlutils.py b/libs/guessit/yamlutils.py new file mode 100644 index 00000000..2824575d --- /dev/null +++ b/libs/guessit/yamlutils.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Options +""" +try: + from collections import OrderedDict +except ImportError: # pragma: no-cover + from ordereddict import OrderedDict # pylint:disable=import-error +import babelfish + +import yaml + + +class OrderedDictYAMLLoader(yaml.Loader): + """ + A YAML loader that loads mappings into ordered dictionaries. 
+ From https://gist.github.com/enaeseth/844388 + """ + + def __init__(self, *args, **kwargs): + yaml.Loader.__init__(self, *args, **kwargs) + + self.add_constructor(u'tag:yaml.org,2002:map', type(self).construct_yaml_map) + self.add_constructor(u'tag:yaml.org,2002:omap', type(self).construct_yaml_map) + + def construct_yaml_map(self, node): + data = OrderedDict() + yield data + value = self.construct_mapping(node) + data.update(value) + + def construct_mapping(self, node, deep=False): + if isinstance(node, yaml.MappingNode): + self.flatten_mapping(node) + else: # pragma: no cover + raise yaml.constructor.ConstructorError(None, None, + 'expected a mapping node, but found %s' % node.id, node.start_mark) + + mapping = OrderedDict() + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + try: + hash(key) + except TypeError as exc: # pragma: no cover + raise yaml.constructor.ConstructorError('while constructing a mapping', + node.start_mark, 'found unacceptable key (%s)' + % exc, key_node.start_mark) + value = self.construct_object(value_node, deep=deep) + mapping[key] = value + return mapping + + +class CustomDumper(yaml.SafeDumper): + """ + Custom YAML Dumper. 
+ """ + pass + + +def default_representer(dumper, data): + """Default representer""" + return dumper.represent_str(str(data)) +CustomDumper.add_representer(babelfish.Language, default_representer) +CustomDumper.add_representer(babelfish.Country, default_representer) + + +def ordered_dict_representer(dumper, data): + """OrderedDict representer""" + return dumper.represent_dict(data) +CustomDumper.add_representer(OrderedDict, ordered_dict_representer) diff --git a/libs/rarfile.py b/libs/rarfile.py new file mode 100644 index 00000000..25b61196 --- /dev/null +++ b/libs/rarfile.py @@ -0,0 +1,2002 @@ +# rarfile.py +# +# Copyright (c) 2005-2016 Marko Kreen +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +r"""RAR archive reader. + +This is Python module for Rar archive reading. The interface +is made as :mod:`zipfile`-like as possible. + +Basic logic: + - Parse archive structure with Python. + - Extract non-compressed files with Python + - Extract compressed files with unrar. + - Optionally write compressed data to temp file to speed up unrar, + otherwise it needs to scan whole archive on each execution. 
+ +Example:: + + import rarfile + + rf = rarfile.RarFile('myarchive.rar') + for f in rf.infolist(): + print f.filename, f.file_size + if f.filename == 'README': + print(rf.read(f)) + +Archive files can also be accessed via file-like object returned +by :meth:`RarFile.open`:: + + import rarfile + + with rarfile.RarFile('archive.rar') as rf: + with rf.open('README') as f: + for ln in f: + print(ln.strip()) + +There are few module-level parameters to tune behaviour, +here they are with defaults, and reason to change it:: + + import rarfile + + # Set to full path of unrar.exe if it is not in PATH + rarfile.UNRAR_TOOL = "unrar" + + # Set to 0 if you don't look at comments and want to + # avoid wasting time for parsing them + rarfile.NEED_COMMENTS = 1 + + # Set up to 1 if you don't want to deal with decoding comments + # from unknown encoding. rarfile will try couple of common + # encodings in sequence. + rarfile.UNICODE_COMMENTS = 0 + + # Set to 1 if you prefer timestamps to be datetime objects + # instead tuples + rarfile.USE_DATETIME = 0 + + # Set to '/' to be more compatible with zipfile + rarfile.PATH_SEP = '\\' + +For more details, refer to source. 
+ +""" + +__version__ = '2.8' + +# export only interesting items +__all__ = ['is_rarfile', 'RarInfo', 'RarFile', 'RarExtFile'] + +## +## Imports and compat - support both Python 2.x and 3.x +## + +import sys, os, struct, errno +from struct import pack, unpack, Struct +from binascii import crc32 +from tempfile import mkstemp +from subprocess import Popen, PIPE, STDOUT +from datetime import datetime +from io import RawIOBase +from hashlib import sha1 + +# only needed for encryped headers +try: + try: + from cryptography.hazmat.primitives.ciphers import algorithms, modes, Cipher + from cryptography.hazmat.backends import default_backend + class AES_CBC_Decrypt(object): + block_size = 16 + def __init__(self, key, iv): + ciph = Cipher(algorithms.AES(key), modes.CBC(iv), default_backend()) + self.dec = ciph.decryptor() + def decrypt(self, data): + return self.dec.update(data) + except ImportError: + from Crypto.Cipher import AES + class AES_CBC_Decrypt(object): + block_size = 16 + def __init__(self, key, iv): + self.dec = AES.new(key, AES.MODE_CBC, iv) + def decrypt(self, data): + return self.dec.decrypt(data) + _have_crypto = 1 +except ImportError: + _have_crypto = 0 + +# compat with 2.x +if sys.hexversion < 0x3000000: + # prefer 3.x behaviour + range = xrange +else: + unicode = str + +## +## Module configuration. Can be tuned after importing. +## + +#: default fallback charset +DEFAULT_CHARSET = "windows-1252" + +#: list of encodings to try, with fallback to DEFAULT_CHARSET if none succeed +TRY_ENCODINGS = ('utf8', 'utf-16le') + +#: 'unrar', 'rar' or full path to either one +UNRAR_TOOL = "unrar" + +#: Command line args to use for opening file for reading. +OPEN_ARGS = ('p', '-inul') + +#: Command line args to use for extracting file to disk. +EXTRACT_ARGS = ('x', '-y', '-idq') + +#: args for testrar() +TEST_ARGS = ('t', '-idq') + +# +# Allow use of tool that is not compatible with unrar. 
+# +# By default use 'bsdtar' which is 'tar' program that +# sits on top of libarchive. +# +# Problems with libarchive RAR backend: +# - Does not support solid archives. +# - Does not support password-protected archives. +# + +ALT_TOOL = 'bsdtar' +ALT_OPEN_ARGS = ('-x', '--to-stdout', '-f') +ALT_EXTRACT_ARGS = ('-x', '-f') +ALT_TEST_ARGS = ('-t', '-f') +ALT_CHECK_ARGS = ('--help',) + +#: whether to speed up decompression by using tmp archive +USE_EXTRACT_HACK = 1 + +#: limit the filesize for tmp archive usage +HACK_SIZE_LIMIT = 20*1024*1024 + +#: whether to parse file/archive comments. +NEED_COMMENTS = 1 + +#: whether to convert comments to unicode strings +UNICODE_COMMENTS = 0 + +#: Convert RAR time tuple into datetime() object +USE_DATETIME = 0 + +#: Separator for path name components. RAR internally uses '\\'. +#: Use '/' to be similar with zipfile. +PATH_SEP = '\\' + +## +## rar constants +## + +# block types +RAR_BLOCK_MARK = 0x72 # r +RAR_BLOCK_MAIN = 0x73 # s +RAR_BLOCK_FILE = 0x74 # t +RAR_BLOCK_OLD_COMMENT = 0x75 # u +RAR_BLOCK_OLD_EXTRA = 0x76 # v +RAR_BLOCK_OLD_SUB = 0x77 # w +RAR_BLOCK_OLD_RECOVERY = 0x78 # x +RAR_BLOCK_OLD_AUTH = 0x79 # y +RAR_BLOCK_SUB = 0x7a # z +RAR_BLOCK_ENDARC = 0x7b # { + +# flags for RAR_BLOCK_MAIN +RAR_MAIN_VOLUME = 0x0001 +RAR_MAIN_COMMENT = 0x0002 +RAR_MAIN_LOCK = 0x0004 +RAR_MAIN_SOLID = 0x0008 +RAR_MAIN_NEWNUMBERING = 0x0010 +RAR_MAIN_AUTH = 0x0020 +RAR_MAIN_RECOVERY = 0x0040 +RAR_MAIN_PASSWORD = 0x0080 +RAR_MAIN_FIRSTVOLUME = 0x0100 +RAR_MAIN_ENCRYPTVER = 0x0200 + +# flags for RAR_BLOCK_FILE +RAR_FILE_SPLIT_BEFORE = 0x0001 +RAR_FILE_SPLIT_AFTER = 0x0002 +RAR_FILE_PASSWORD = 0x0004 +RAR_FILE_COMMENT = 0x0008 +RAR_FILE_SOLID = 0x0010 +RAR_FILE_DICTMASK = 0x00e0 +RAR_FILE_DICT64 = 0x0000 +RAR_FILE_DICT128 = 0x0020 +RAR_FILE_DICT256 = 0x0040 +RAR_FILE_DICT512 = 0x0060 +RAR_FILE_DICT1024 = 0x0080 +RAR_FILE_DICT2048 = 0x00a0 +RAR_FILE_DICT4096 = 0x00c0 +RAR_FILE_DIRECTORY = 0x00e0 +RAR_FILE_LARGE = 0x0100 +RAR_FILE_UNICODE = 
0x0200 +RAR_FILE_SALT = 0x0400 +RAR_FILE_VERSION = 0x0800 +RAR_FILE_EXTTIME = 0x1000 +RAR_FILE_EXTFLAGS = 0x2000 + +# flags for RAR_BLOCK_ENDARC +RAR_ENDARC_NEXT_VOLUME = 0x0001 +RAR_ENDARC_DATACRC = 0x0002 +RAR_ENDARC_REVSPACE = 0x0004 +RAR_ENDARC_VOLNR = 0x0008 + +# flags common to all blocks +RAR_SKIP_IF_UNKNOWN = 0x4000 +RAR_LONG_BLOCK = 0x8000 + +# Host OS types +RAR_OS_MSDOS = 0 +RAR_OS_OS2 = 1 +RAR_OS_WIN32 = 2 +RAR_OS_UNIX = 3 +RAR_OS_MACOS = 4 +RAR_OS_BEOS = 5 + +# Compression methods - '0'..'5' +RAR_M0 = 0x30 +RAR_M1 = 0x31 +RAR_M2 = 0x32 +RAR_M3 = 0x33 +RAR_M4 = 0x34 +RAR_M5 = 0x35 + +## +## internal constants +## + +RAR_ID = b"Rar!\x1a\x07\x00" +ZERO = b"\0" +EMPTY = b"" + +S_BLK_HDR = Struct(' 0 + + +class RarFile(object): + '''Parse RAR structure, provide access to files in archive. + ''' + + #: Archive comment. Byte string or None. Use :data:`UNICODE_COMMENTS` + #: to get automatic decoding to unicode. + comment = None + + def __init__(self, rarfile, mode="r", charset=None, info_callback=None, + crc_check = True, errors = "stop"): + """Open and parse a RAR archive. + + Parameters: + + rarfile + archive file name + mode + only 'r' is supported. + charset + fallback charset to use, if filenames are not already Unicode-enabled. + info_callback + debug callback, gets to see all archive entries. + crc_check + set to False to disable CRC checks + errors + Either "stop" to quietly stop parsing on errors, + or "strict" to raise errors. Default is "stop". 
+ """ + self.rarfile = rarfile + self.comment = None + self._charset = charset or DEFAULT_CHARSET + self._info_callback = info_callback + + self._info_list = [] + self._info_map = {} + self._parse_error = None + self._needs_password = False + self._password = None + self._crc_check = crc_check + self._vol_list = [] + + if errors == "stop": + self._strict = False + elif errors == "strict": + self._strict = True + else: + raise ValueError("Invalid value for 'errors' parameter.") + + self._main = None + + if mode != "r": + raise NotImplementedError("RarFile supports only mode=r") + + self._parse() + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def setpassword(self, password): + '''Sets the password to use when extracting.''' + self._password = password + if not self._main: + self._parse() + + def needs_password(self): + '''Returns True if any archive entries require password for extraction.''' + return self._needs_password + + def namelist(self): + '''Return list of filenames in archive.''' + return [f.filename for f in self.infolist()] + + def infolist(self): + '''Return RarInfo objects for all files/directories in archive.''' + return self._info_list + + def volumelist(self): + '''Returns filenames of archive volumes. + + In case of single-volume archive, the list contains + just the name of main archive file. + ''' + return self._vol_list + + def getinfo(self, fname): + '''Return RarInfo for file.''' + + if isinstance(fname, RarInfo): + return fname + + # accept both ways here + if PATH_SEP == '/': + fname2 = fname.replace("\\", "/") + else: + fname2 = fname.replace("/", "\\") + + try: + return self._info_map[fname] + except KeyError: + try: + return self._info_map[fname2] + except KeyError: + raise NoRarEntry("No such file: "+fname) + + def open(self, fname, mode = 'r', psw = None): + '''Returns file-like object (:class:`RarExtFile`), + from where the data can be read. 
+ + The object implements :class:`io.RawIOBase` interface, so it can + be further wrapped with :class:`io.BufferedReader` + and :class:`io.TextIOWrapper`. + + On older Python where io module is not available, it implements + only .read(), .seek(), .tell() and .close() methods. + + The object is seekable, although the seeking is fast only on + uncompressed files, on compressed files the seeking is implemented + by reading ahead and/or restarting the decompression. + + Parameters: + + fname + file name or RarInfo instance. + mode + must be 'r' + psw + password to use for extracting. + ''' + + if mode != 'r': + raise NotImplementedError("RarFile.open() supports only mode=r") + + # entry lookup + inf = self.getinfo(fname) + if inf.isdir(): + raise TypeError("Directory does not have any data: " + inf.filename) + + if inf.flags & RAR_FILE_SPLIT_BEFORE: + raise NeedFirstVolume("Partial file, please start from first volume: " + inf.filename) + + # check password + if inf.needs_password(): + psw = psw or self._password + if psw is None: + raise PasswordRequired("File %s requires password" % inf.filename) + else: + psw = None + + # is temp write usable? + use_hack = 1 + if not self._main: + use_hack = 0 + elif self._main.flags & (RAR_MAIN_SOLID | RAR_MAIN_PASSWORD): + use_hack = 0 + elif inf.flags & (RAR_FILE_SPLIT_BEFORE | RAR_FILE_SPLIT_AFTER): + use_hack = 0 + elif is_filelike(self.rarfile): + pass + elif inf.file_size > HACK_SIZE_LIMIT: + use_hack = 0 + elif not USE_EXTRACT_HACK: + use_hack = 0 + + # now extract + if inf.compress_type == RAR_M0 and (inf.flags & RAR_FILE_PASSWORD) == 0: + return self._open_clear(inf) + elif use_hack: + return self._open_hack(inf, psw) + elif is_filelike(self.rarfile): + return self._open_unrar_membuf(self.rarfile, inf, psw) + else: + return self._open_unrar(self.rarfile, inf, psw) + + def read(self, fname, psw = None): + """Return uncompressed data for archive entry. + + For longer files using :meth:`RarFile.open` may be better idea. 
+ + Parameters: + + fname + filename or RarInfo instance + psw + password to use for extracting. + """ + + f = self.open(fname, 'r', psw) + try: + return f.read() + finally: + f.close() + + def close(self): + """Release open resources.""" + pass + + def printdir(self): + """Print archive file list to stdout.""" + for f in self.infolist(): + print(f.filename) + + def extract(self, member, path=None, pwd=None): + """Extract single file into current directory. + + Parameters: + + member + filename or :class:`RarInfo` instance + path + optional destination path + pwd + optional password to use + """ + if isinstance(member, RarInfo): + fname = member.filename + else: + fname = member + self._extract([fname], path, pwd) + + def extractall(self, path=None, members=None, pwd=None): + """Extract all files into current directory. + + Parameters: + + path + optional destination path + members + optional filename or :class:`RarInfo` instance list to extract + pwd + optional password to use + """ + fnlist = [] + if members is not None: + for m in members: + if isinstance(m, RarInfo): + fnlist.append(m.filename) + else: + fnlist.append(m) + self._extract(fnlist, path, pwd) + + def testrar(self): + """Let 'unrar' test the archive. + """ + cmd = [UNRAR_TOOL] + list(TEST_ARGS) + add_password_arg(cmd, self._password) + cmd.append('--') + + if is_filelike(self.rarfile): + tmpname = membuf_tempfile(self.rarfile) + cmd.append(tmpname) + else: + tmpname = None + cmd.append(self.rarfile) + + try: + p = custom_popen(cmd) + output = p.communicate()[0] + check_returncode(p, output) + finally: + if tmpname: + os.unlink(tmpname) + + def strerror(self): + """Return error string if parsing failed, + or None if no problems. 
+ """ + return self._parse_error + + ## + ## private methods + ## + + def _set_error(self, msg, *args): + if args: + msg = msg % args + self._parse_error = msg + if self._strict: + raise BadRarFile(msg) + + # store entry + def _process_entry(self, item): + if item.type == RAR_BLOCK_FILE: + # use only first part + if (item.flags & RAR_FILE_SPLIT_BEFORE) == 0: + self._info_map[item.filename] = item + self._info_list.append(item) + # remember if any items require password + if item.needs_password(): + self._needs_password = True + elif len(self._info_list) > 0: + # final crc is in last block + old = self._info_list[-1] + old.CRC = item.CRC + old.compress_size += item.compress_size + + # parse new-style comment + if item.type == RAR_BLOCK_SUB and item.filename == 'CMT': + if not NEED_COMMENTS: + pass + elif item.flags & (RAR_FILE_SPLIT_BEFORE | RAR_FILE_SPLIT_AFTER): + pass + elif item.flags & RAR_FILE_SOLID: + # file comment + cmt = self._read_comment_v3(item, self._password) + if len(self._info_list) > 0: + old = self._info_list[-1] + old.comment = cmt + else: + # archive comment + cmt = self._read_comment_v3(item, self._password) + self.comment = cmt + + if self._info_callback: + self._info_callback(item) + + # read rar + def _parse(self): + self._fd = None + try: + self._parse_real() + finally: + if self._fd: + self._fd.close() + self._fd = None + + def _parse_real(self): + fd = XFile(self.rarfile) + self._fd = fd + id = fd.read(len(RAR_ID)) + if id != RAR_ID: + if isinstance(self.rarfile, (str, unicode)): + raise NotRarFile("Not a Rar archive: {}".format(self.rarfile)) + raise NotRarFile("Not a Rar archive") + + volume = 0 # first vol (.rar) is 0 + more_vols = 0 + endarc = 0 + volfile = self.rarfile + self._vol_list = [self.rarfile] + while 1: + if endarc: + h = None # don't read past ENDARC + else: + h = self._parse_header(fd) + if not h: + if more_vols: + volume += 1 + fd.close() + try: + volfile = self._next_volname(volfile) + fd = XFile(volfile) + except 
IOError: + self._set_error("Cannot open next volume: %s", volfile) + break + self._fd = fd + more_vols = 0 + endarc = 0 + self._vol_list.append(volfile) + continue + break + h.volume = volume + h.volume_file = volfile + + if h.type == RAR_BLOCK_MAIN and not self._main: + self._main = h + if h.flags & RAR_MAIN_NEWNUMBERING: + # RAR 2.x does not set FIRSTVOLUME, + # so check it only if NEWNUMBERING is used + if (h.flags & RAR_MAIN_FIRSTVOLUME) == 0: + raise NeedFirstVolume("Need to start from first volume") + if h.flags & RAR_MAIN_PASSWORD: + self._needs_password = True + if not self._password: + self._main = None + break + elif h.type == RAR_BLOCK_ENDARC: + more_vols = h.flags & RAR_ENDARC_NEXT_VOLUME + endarc = 1 + elif h.type == RAR_BLOCK_FILE: + # RAR 2.x does not write RAR_BLOCK_ENDARC + if h.flags & RAR_FILE_SPLIT_AFTER: + more_vols = 1 + # RAR 2.x does not set RAR_MAIN_FIRSTVOLUME + if volume == 0 and h.flags & RAR_FILE_SPLIT_BEFORE: + raise NeedFirstVolume("Need to start from first volume") + + # store it + self._process_entry(h) + + # go to next header + if h.add_size > 0: + fd.seek(h.file_offset + h.add_size, 0) + + # AES encrypted headers + _last_aes_key = (None, None, None) # (salt, key, iv) + def _decrypt_header(self, fd): + if not _have_crypto: + raise NoCrypto('Cannot parse encrypted headers - no crypto') + salt = fd.read(8) + if self._last_aes_key[0] == salt: + key, iv = self._last_aes_key[1:] + else: + key, iv = rar3_s2k(self._password, salt) + self._last_aes_key = (salt, key, iv) + return HeaderDecrypt(fd, key, iv) + + # read single header + def _parse_header(self, fd): + try: + # handle encrypted headers + if self._main and self._main.flags & RAR_MAIN_PASSWORD: + if not self._password: + return + fd = self._decrypt_header(fd) + + # now read actual header + return self._parse_block_header(fd) + except struct.error: + self._set_error('Broken header in RAR file') + return None + + # common header + def _parse_block_header(self, fd): + h = RarInfo() + 
h.header_offset = fd.tell() + h.comment = None + + # read and parse base header + buf = fd.read(S_BLK_HDR.size) + if not buf: + return None + t = S_BLK_HDR.unpack_from(buf) + h.header_crc, h.type, h.flags, h.header_size = t + h.header_base = S_BLK_HDR.size + pos = S_BLK_HDR.size + + # read full header + if h.header_size > S_BLK_HDR.size: + h.header_data = buf + fd.read(h.header_size - S_BLK_HDR.size) + else: + h.header_data = buf + h.file_offset = fd.tell() + + # unexpected EOF? + if len(h.header_data) != h.header_size: + self._set_error('Unexpected EOF when reading header') + return None + + # block has data assiciated with it? + if h.flags & RAR_LONG_BLOCK: + h.add_size = S_LONG.unpack_from(h.header_data, pos)[0] + else: + h.add_size = 0 + + # parse interesting ones, decide header boundaries for crc + if h.type == RAR_BLOCK_MARK: + return h + elif h.type == RAR_BLOCK_MAIN: + h.header_base += 6 + if h.flags & RAR_MAIN_ENCRYPTVER: + h.header_base += 1 + if h.flags & RAR_MAIN_COMMENT: + self._parse_subblocks(h, h.header_base) + self.comment = h.comment + elif h.type == RAR_BLOCK_FILE: + self._parse_file_header(h, pos) + elif h.type == RAR_BLOCK_SUB: + self._parse_file_header(h, pos) + h.header_base = h.header_size + elif h.type == RAR_BLOCK_OLD_AUTH: + h.header_base += 8 + elif h.type == RAR_BLOCK_OLD_EXTRA: + h.header_base += 7 + else: + h.header_base = h.header_size + + # check crc + if h.type == RAR_BLOCK_OLD_SUB: + crcdat = h.header_data[2:] + fd.read(h.add_size) + else: + crcdat = h.header_data[2:h.header_base] + + calc_crc = crc32(crcdat) & 0xFFFF + + # return good header + if h.header_crc == calc_crc: + return h + + # header parsing failed. 
+ self._set_error('Header CRC error (%02x): exp=%x got=%x (xlen = %d)', + h.type, h.header_crc, calc_crc, len(crcdat)) + + # instead panicing, send eof + return None + + # read file-specific header + def _parse_file_header(self, h, pos): + fld = S_FILE_HDR.unpack_from(h.header_data, pos) + h.compress_size = fld[0] + h.file_size = fld[1] + h.host_os = fld[2] + h.CRC = fld[3] + h.date_time = parse_dos_time(fld[4]) + h.extract_version = fld[5] + h.compress_type = fld[6] + h.name_size = fld[7] + h.mode = fld[8] + pos += S_FILE_HDR.size + + if h.flags & RAR_FILE_LARGE: + h1 = S_LONG.unpack_from(h.header_data, pos)[0] + h2 = S_LONG.unpack_from(h.header_data, pos + 4)[0] + h.compress_size |= h1 << 32 + h.file_size |= h2 << 32 + pos += 8 + h.add_size = h.compress_size + + name = h.header_data[pos : pos + h.name_size ] + pos += h.name_size + if h.flags & RAR_FILE_UNICODE: + nul = name.find(ZERO) + h.orig_filename = name[:nul] + u = UnicodeFilename(h.orig_filename, name[nul + 1 : ]) + h.filename = u.decode() + + # if parsing failed fall back to simple name + if u.failed: + h.filename = self._decode(h.orig_filename) + else: + h.orig_filename = name + h.filename = self._decode(name) + + # change separator, if requested + if PATH_SEP != '\\': + h.filename = h.filename.replace('\\', PATH_SEP) + + if h.flags & RAR_FILE_SALT: + h.salt = h.header_data[pos : pos + 8] + pos += 8 + else: + h.salt = None + + # optional extended time stamps + if h.flags & RAR_FILE_EXTTIME: + pos = self._parse_ext_time(h, pos) + else: + h.mtime = h.atime = h.ctime = h.arctime = None + + # base header end + h.header_base = pos + + if h.flags & RAR_FILE_COMMENT: + self._parse_subblocks(h, pos) + + # convert timestamps + if USE_DATETIME: + h.date_time = to_datetime(h.date_time) + h.mtime = to_datetime(h.mtime) + h.atime = to_datetime(h.atime) + h.ctime = to_datetime(h.ctime) + h.arctime = to_datetime(h.arctime) + + # .mtime is .date_time with more precision + if h.mtime: + if USE_DATETIME: + h.date_time = 
h.mtime + else: + # keep seconds int + h.date_time = h.mtime[:5] + (int(h.mtime[5]),) + + return pos + + # find old-style comment subblock + def _parse_subblocks(self, h, pos): + hdata = h.header_data + while pos < len(hdata): + # ordinary block header + t = S_BLK_HDR.unpack_from(hdata, pos) + scrc, stype, sflags, slen = t + pos_next = pos + slen + pos += S_BLK_HDR.size + + # corrupt header + if pos_next < pos: + break + + # followed by block-specific header + if stype == RAR_BLOCK_OLD_COMMENT and pos + S_COMMENT_HDR.size <= pos_next: + declen, ver, meth, crc = S_COMMENT_HDR.unpack_from(hdata, pos) + pos += S_COMMENT_HDR.size + data = hdata[pos : pos_next] + cmt = rar_decompress(ver, meth, data, declen, sflags, + crc, self._password) + if not self._crc_check: + h.comment = self._decode_comment(cmt) + elif crc32(cmt) & 0xFFFF == crc: + h.comment = self._decode_comment(cmt) + + pos = pos_next + + def _parse_ext_time(self, h, pos): + data = h.header_data + + # flags and rest of data can be missing + flags = 0 + if pos + 2 <= len(data): + flags = S_SHORT.unpack_from(data, pos)[0] + pos += 2 + + h.mtime, pos = self._parse_xtime(flags >> 3*4, data, pos, h.date_time) + h.ctime, pos = self._parse_xtime(flags >> 2*4, data, pos) + h.atime, pos = self._parse_xtime(flags >> 1*4, data, pos) + h.arctime, pos = self._parse_xtime(flags >> 0*4, data, pos) + return pos + + def _parse_xtime(self, flag, data, pos, dostime = None): + unit = 10000000.0 # 100 ns units + if flag & 8: + if not dostime: + t = S_LONG.unpack_from(data, pos)[0] + dostime = parse_dos_time(t) + pos += 4 + rem = 0 + cnt = flag & 3 + for i in range(cnt): + b = S_BYTE.unpack_from(data, pos)[0] + rem = (b << 16) | (rem >> 8) + pos += 1 + sec = dostime[5] + rem / unit + if flag & 4: + sec += 1 + dostime = dostime[:5] + (sec,) + return dostime, pos + + # given current vol name, construct next one + def _next_volname(self, volfile): + if is_filelike(volfile): + raise IOError("Working on single FD") + if 
self._main.flags & RAR_MAIN_NEWNUMBERING: + return self._next_newvol(volfile) + return self._next_oldvol(volfile) + + # new-style next volume + def _next_newvol(self, volfile): + i = len(volfile) - 1 + while i >= 0: + if volfile[i] >= '0' and volfile[i] <= '9': + return self._inc_volname(volfile, i) + i -= 1 + raise BadRarName("Cannot construct volume name: "+volfile) + + # old-style next volume + def _next_oldvol(self, volfile): + # rar -> r00 + if volfile[-4:].lower() == '.rar': + return volfile[:-2] + '00' + return self._inc_volname(volfile, len(volfile) - 1) + + # increase digits with carry, otherwise just increment char + def _inc_volname(self, volfile, i): + fn = list(volfile) + while i >= 0: + if fn[i] != '9': + fn[i] = chr(ord(fn[i]) + 1) + break + fn[i] = '0' + i -= 1 + return ''.join(fn) + + def _open_clear(self, inf): + return DirectReader(self, inf) + + # put file compressed data into temporary .rar archive, and run + # unrar on that, thus avoiding unrar going over whole archive + def _open_hack(self, inf, psw = None): + BSIZE = 32*1024 + + size = inf.compress_size + inf.header_size + rf = XFile(inf.volume_file, 0) + rf.seek(inf.header_offset) + + tmpfd, tmpname = mkstemp(suffix='.rar') + tmpf = os.fdopen(tmpfd, "wb") + + try: + # create main header: crc, type, flags, size, res1, res2 + mh = S_BLK_HDR.pack(0x90CF, 0x73, 0, 13) + ZERO * (2+4) + tmpf.write(RAR_ID + mh) + while size > 0: + if size > BSIZE: + buf = rf.read(BSIZE) + else: + buf = rf.read(size) + if not buf: + raise BadRarFile('read failed: ' + inf.filename) + tmpf.write(buf) + size -= len(buf) + tmpf.close() + rf.close() + except: + rf.close() + tmpf.close() + os.unlink(tmpname) + raise + + return self._open_unrar(tmpname, inf, psw, tmpname) + + def _read_comment_v3(self, inf, psw=None): + + # read data + rf = XFile(inf.volume_file) + rf.seek(inf.file_offset) + data = rf.read(inf.compress_size) + rf.close() + + # decompress + cmt = rar_decompress(inf.extract_version, inf.compress_type, data, 
+ inf.file_size, inf.flags, inf.CRC, psw, inf.salt) + + # check crc + if self._crc_check: + crc = crc32(cmt) + if crc < 0: + crc += (1 << 32) + if crc != inf.CRC: + return None + + return self._decode_comment(cmt) + + # write in-memory archive to temp file - needed for solid archives + def _open_unrar_membuf(self, memfile, inf, psw): + tmpname = membuf_tempfile(memfile) + return self._open_unrar(tmpname, inf, psw, tmpname) + + # extract using unrar + def _open_unrar(self, rarfile, inf, psw = None, tmpfile = None): + if is_filelike(rarfile): + raise ValueError("Cannot use unrar directly on memory buffer") + cmd = [UNRAR_TOOL] + list(OPEN_ARGS) + add_password_arg(cmd, psw) + cmd.append("--") + cmd.append(rarfile) + + # not giving filename avoids encoding related problems + if not tmpfile: + fn = inf.filename + if PATH_SEP != os.sep: + fn = fn.replace(PATH_SEP, os.sep) + cmd.append(fn) + + # read from unrar pipe + return PipeReader(self, inf, cmd, tmpfile) + + def _decode(self, val): + for c in TRY_ENCODINGS: + try: + return val.decode(c) + except UnicodeError: + pass + return val.decode(self._charset, 'replace') + + def _decode_comment(self, val): + if UNICODE_COMMENTS: + return self._decode(val) + return val + + # call unrar to extract a file + def _extract(self, fnlist, path=None, psw=None): + cmd = [UNRAR_TOOL] + list(EXTRACT_ARGS) + + # pasoword + psw = psw or self._password + add_password_arg(cmd, psw) + cmd.append('--') + + # rar file + if is_filelike(self.rarfile): + tmpname = membuf_tempfile(self.rarfile) + cmd.append(tmpname) + else: + tmpname = None + cmd.append(self.rarfile) + + # file list + for fn in fnlist: + if os.sep != PATH_SEP: + fn = fn.replace(PATH_SEP, os.sep) + cmd.append(fn) + + # destination path + if path is not None: + cmd.append(path + os.sep) + + # call + try: + p = custom_popen(cmd) + output = p.communicate()[0] + check_returncode(p, output) + finally: + if tmpname: + os.unlink(tmpname) + +## +## Utility classes +## + +class 
UnicodeFilename(object): + """Handle unicode filename decompression""" + + def __init__(self, name, encdata): + self.std_name = bytearray(name) + self.encdata = bytearray(encdata) + self.pos = self.encpos = 0 + self.buf = bytearray() + self.failed = 0 + + def enc_byte(self): + try: + c = self.encdata[self.encpos] + self.encpos += 1 + return c + except IndexError: + self.failed = 1 + return 0 + + def std_byte(self): + try: + return self.std_name[self.pos] + except IndexError: + self.failed = 1 + return ord('?') + + def put(self, lo, hi): + self.buf.append(lo) + self.buf.append(hi) + self.pos += 1 + + def decode(self): + hi = self.enc_byte() + flagbits = 0 + while self.encpos < len(self.encdata): + if flagbits == 0: + flags = self.enc_byte() + flagbits = 8 + flagbits -= 2 + t = (flags >> flagbits) & 3 + if t == 0: + self.put(self.enc_byte(), 0) + elif t == 1: + self.put(self.enc_byte(), hi) + elif t == 2: + self.put(self.enc_byte(), self.enc_byte()) + else: + n = self.enc_byte() + if n & 0x80: + c = self.enc_byte() + for i in range((n & 0x7f) + 2): + lo = (self.std_byte() + c) & 0xFF + self.put(lo, hi) + else: + for i in range(n + 2): + self.put(self.std_byte(), 0) + return self.buf.decode("utf-16le", "replace") + + +class RarExtFile(RawIOBase): + """Base class for file-like object that :meth:`RarFile.open` returns. + + Provides public methods and common crc checking. + + Behaviour: + - no short reads - .read() and .readinfo() read as much as requested. + - no internal buffer, use io.BufferedReader for that. + + If :mod:`io` module is available (Python 2.6+, 3.x), then this calls + will inherit from :class:`io.RawIOBase` class. This makes line-based + access available: :meth:`RarExtFile.readline` and ``for ln in f``. 
+ """ + + #: Filename of the archive entry + name = None + + def __init__(self, rf, inf): + super(RarExtFile, self).__init__() + + # standard io.* properties + self.name = inf.filename + self.mode = 'rb' + + self.rf = rf + self.inf = inf + self.crc_check = rf._crc_check + self.fd = None + self.CRC = 0 + self.remain = 0 + self.returncode = 0 + + self._open() + + def _open(self): + if self.fd: + self.fd.close() + self.fd = None + self.CRC = 0 + self.remain = self.inf.file_size + + def read(self, cnt = None): + """Read all or specified amount of data from archive entry.""" + + # sanitize cnt + if cnt is None or cnt < 0: + cnt = self.remain + elif cnt > self.remain: + cnt = self.remain + if cnt == 0: + return EMPTY + + # actual read + data = self._read(cnt) + if data: + self.CRC = crc32(data, self.CRC) + self.remain -= len(data) + if len(data) != cnt: + raise BadRarFile("Failed the read enough data") + + # done? + if not data or self.remain == 0: + #self.close() + self._check() + return data + + def _check(self): + """Check final CRC.""" + if not self.crc_check: + return + if self.returncode: + check_returncode(self, '') + if self.remain != 0: + raise BadRarFile("Failed the read enough data") + crc = self.CRC + if crc < 0: + crc += (1 << 32) + if crc != self.inf.CRC: + raise BadRarFile("Corrupt file - CRC check failed: " + self.inf.filename) + + def _read(self, cnt): + """Actual read that gets sanitized cnt.""" + + def close(self): + """Close open resources.""" + + super(RarExtFile, self).close() + + if self.fd: + self.fd.close() + self.fd = None + + def __del__(self): + """Hook delete to make sure tempfile is removed.""" + self.close() + + def readinto(self, buf): + """Zero-copy read directly into buffer. + + Returns bytes read. 
+ """ + + data = self.read(len(buf)) + n = len(data) + try: + buf[:n] = data + except TypeError: + import array + if not isinstance(buf, array.array): + raise + buf[:n] = array.array(buf.typecode, data) + return n + + def tell(self): + """Return current reading position in uncompressed data.""" + return self.inf.file_size - self.remain + + def seek(self, ofs, whence = 0): + """Seek in data. + + On uncompressed files, the seeking works by actual + seeks so it's fast. On compresses files its slow + - forward seeking happends by reading ahead, + backwards by re-opening and decompressing from the start. + """ + + # disable crc check when seeking + self.crc_check = 0 + + fsize = self.inf.file_size + cur_ofs = self.tell() + + if whence == 0: # seek from beginning of file + new_ofs = ofs + elif whence == 1: # seek from current position + new_ofs = cur_ofs + ofs + elif whence == 2: # seek from end of file + new_ofs = fsize + ofs + else: + raise ValueError('Invalid value for whence') + + # sanity check + if new_ofs < 0: + new_ofs = 0 + elif new_ofs > fsize: + new_ofs = fsize + + # do the actual seek + if new_ofs >= cur_ofs: + self._skip(new_ofs - cur_ofs) + else: + # process old data ? + #self._skip(fsize - cur_ofs) + # reopen and seek + self._open() + self._skip(new_ofs) + return self.tell() + + def _skip(self, cnt): + """Read and discard data""" + while cnt > 0: + if cnt > 8192: + buf = self.read(8192) + else: + buf = self.read(cnt) + if not buf: + break + cnt -= len(buf) + + def readable(self): + """Returns True""" + return True + + def writable(self): + """Returns False. + + Writing is not supported.""" + return False + + def seekable(self): + """Returns True. + + Seeking is supported, although it's slow on compressed files. 
+ """ + return True + + def readall(self): + """Read all remaining data""" + # avoid RawIOBase default impl + return self.read() + + +class PipeReader(RarExtFile): + """Read data from pipe, handle tempfile cleanup.""" + + def __init__(self, rf, inf, cmd, tempfile=None): + self.cmd = cmd + self.proc = None + self.tempfile = tempfile + super(PipeReader, self).__init__(rf, inf) + + def _close_proc(self): + if not self.proc: + return + if self.proc.stdout: + self.proc.stdout.close() + if self.proc.stdin: + self.proc.stdin.close() + if self.proc.stderr: + self.proc.stderr.close() + self.proc.wait() + self.returncode = self.proc.returncode + self.proc = None + + def _open(self): + super(PipeReader, self)._open() + + # stop old process + self._close_proc() + + # launch new process + self.returncode = 0 + self.proc = custom_popen(self.cmd) + self.fd = self.proc.stdout + + # avoid situation where unrar waits on stdin + if self.proc.stdin: + self.proc.stdin.close() + + def _read(self, cnt): + """Read from pipe.""" + + # normal read is usually enough + data = self.fd.read(cnt) + if len(data) == cnt or not data: + return data + + # short read, try looping + buf = [data] + cnt -= len(data) + while cnt > 0: + data = self.fd.read(cnt) + if not data: + break + cnt -= len(data) + buf.append(data) + return EMPTY.join(buf) + + def close(self): + """Close open resources.""" + + self._close_proc() + super(PipeReader, self).close() + + if self.tempfile: + try: + os.unlink(self.tempfile) + except OSError: + pass + self.tempfile = None + + def readinto(self, buf): + """Zero-copy read directly into buffer.""" + cnt = len(buf) + if cnt > self.remain: + cnt = self.remain + vbuf = memoryview(buf) + res = got = 0 + while got < cnt: + res = self.fd.readinto(vbuf[got : cnt]) + if not res: + break + if self.crc_check: + self.CRC = crc32(vbuf[got : got + res], self.CRC) + self.remain -= res + got += res + return got + + +class DirectReader(RarExtFile): + """Read uncompressed data directly from 
archive.""" + + def _open(self): + super(DirectReader, self)._open() + + self.volfile = self.inf.volume_file + self.fd = XFile(self.volfile, 0) + self.fd.seek(self.inf.header_offset, 0) + self.cur = self.rf._parse_header(self.fd) + self.cur_avail = self.cur.add_size + + def _skip(self, cnt): + """RAR Seek, skipping through rar files to get to correct position + """ + + while cnt > 0: + # next vol needed? + if self.cur_avail == 0: + if not self._open_next(): + break + + # fd is in read pos, do the read + if cnt > self.cur_avail: + cnt -= self.cur_avail + self.remain -= self.cur_avail + self.cur_avail = 0 + else: + self.fd.seek(cnt, 1) + self.cur_avail -= cnt + self.remain -= cnt + cnt = 0 + + def _read(self, cnt): + """Read from potentially multi-volume archive.""" + + buf = [] + while cnt > 0: + # next vol needed? + if self.cur_avail == 0: + if not self._open_next(): + break + + # fd is in read pos, do the read + if cnt > self.cur_avail: + data = self.fd.read(self.cur_avail) + else: + data = self.fd.read(cnt) + if not data: + break + + # got some data + cnt -= len(data) + self.cur_avail -= len(data) + buf.append(data) + + if len(buf) == 1: + return buf[0] + return EMPTY.join(buf) + + def _open_next(self): + """Proceed to next volume.""" + + # is the file split over archives? 
+ if (self.cur.flags & RAR_FILE_SPLIT_AFTER) == 0: + return False + + if self.fd: + self.fd.close() + self.fd = None + + # open next part + self.volfile = self.rf._next_volname(self.volfile) + fd = open(self.volfile, "rb", 0) + self.fd = fd + + # loop until first file header + while 1: + cur = self.rf._parse_header(fd) + if not cur: + raise BadRarFile("Unexpected EOF") + if cur.type in (RAR_BLOCK_MARK, RAR_BLOCK_MAIN): + if cur.add_size: + fd.seek(cur.add_size, 1) + continue + if cur.orig_filename != self.inf.orig_filename: + raise BadRarFile("Did not found file entry") + self.cur = cur + self.cur_avail = cur.add_size + return True + + def readinto(self, buf): + """Zero-copy read directly into buffer.""" + got = 0 + vbuf = memoryview(buf) + while got < len(buf): + # next vol needed? + if self.cur_avail == 0: + if not self._open_next(): + break + + # length for next read + cnt = len(buf) - got + if cnt > self.cur_avail: + cnt = self.cur_avail + + # read into temp view + res = self.fd.readinto(vbuf[got : got + cnt]) + if not res: + break + if self.crc_check: + self.CRC = crc32(vbuf[got : got + res], self.CRC) + self.cur_avail -= res + self.remain -= res + got += res + return got + + +class HeaderDecrypt(object): + """File-like object that decrypts from another file""" + def __init__(self, f, key, iv): + self.f = f + self.ciph = AES_CBC_Decrypt(key, iv) + self.buf = EMPTY + + def tell(self): + return self.f.tell() + + def read(self, cnt=None): + if cnt > 8*1024: + raise BadRarFile('Bad count to header decrypt - wrong password?') + + # consume old data + if cnt <= len(self.buf): + res = self.buf[:cnt] + self.buf = self.buf[cnt:] + return res + res = self.buf + self.buf = EMPTY + cnt -= len(res) + + # decrypt new data + BLK = self.ciph.block_size + while cnt > 0: + enc = self.f.read(BLK) + if len(enc) < BLK: + break + dec = self.ciph.decrypt(enc) + if cnt >= len(dec): + res += dec + cnt -= len(dec) + else: + res += dec[:cnt] + self.buf = dec[cnt:] + cnt = 0 + + return 
res + +# handle (filename|filelike) object +class XFile(object): + __slots__ = ('_fd', '_need_close') + def __init__(self, xfile, bufsize = 1024): + if is_filelike(xfile): + self._need_close = False + self._fd = xfile + self._fd.seek(0) + else: + self._need_close = True + self._fd = open(xfile, 'rb', bufsize) + def read(self, n=None): + return self._fd.read(n) + def tell(self): + return self._fd.tell() + def seek(self, ofs, whence=0): + return self._fd.seek(ofs, whence) + def readinto(self, dst): + return self._fd.readinto(dst) + def close(self): + if self._need_close: + self._fd.close() + def __enter__(self): + return self + def __exit__(self, typ, val, tb): + self.close() + +## +## Utility functions +## + +def is_filelike(obj): + if isinstance(obj, str) or isinstance(obj, unicode): + return False + res = True + for a in ('read', 'tell', 'seek'): + res = res and hasattr(obj, a) + if not res: + raise ValueError("Invalid object passed as file") + return True + +def rar3_s2k(psw, salt): + """String-to-key hash for RAR3.""" + + seed = psw.encode('utf-16le') + salt + iv = EMPTY + h = sha1() + for i in range(16): + for j in range(0x4000): + cnt = S_LONG.pack(i*0x4000 + j) + h.update(seed + cnt[:3]) + if j == 0: + iv += h.digest()[19:20] + key_be = h.digest()[:16] + key_le = pack("<LLLL", *unpack(">LLLL", key_be)) + return key_le, iv + +def rar_decompress(vers, meth, data, declen=0, flags=0, crc=0, psw=None, salt=None): + """Decompress blob of compressed data. + + Used for data with non-standard header - eg. comments. + """ + + # already uncompressed? 
+ if meth == RAR_M0 and (flags & RAR_FILE_PASSWORD) == 0: + return data + + # take only necessary flags + flags = flags & (RAR_FILE_PASSWORD | RAR_FILE_SALT | RAR_FILE_DICTMASK) + flags |= RAR_LONG_BLOCK + + # file header + fname = b'data' + date = 0 + mode = 0x20 + fhdr = S_FILE_HDR.pack(len(data), declen, RAR_OS_MSDOS, crc, + date, vers, meth, len(fname), mode) + fhdr += fname + if flags & RAR_FILE_SALT: + if not salt: + return EMPTY + fhdr += salt + + # full header + hlen = S_BLK_HDR.size + len(fhdr) + hdr = S_BLK_HDR.pack(0, RAR_BLOCK_FILE, flags, hlen) + fhdr + hcrc = crc32(hdr[2:]) & 0xFFFF + hdr = S_BLK_HDR.pack(hcrc, RAR_BLOCK_FILE, flags, hlen) + fhdr + + # archive main header + mh = S_BLK_HDR.pack(0x90CF, RAR_BLOCK_MAIN, 0, 13) + ZERO * (2+4) + + # decompress via temp rar + tmpfd, tmpname = mkstemp(suffix='.rar') + tmpf = os.fdopen(tmpfd, "wb") + try: + tmpf.write(RAR_ID + mh + hdr + data) + tmpf.close() + + cmd = [UNRAR_TOOL] + list(OPEN_ARGS) + add_password_arg(cmd, psw, (flags & RAR_FILE_PASSWORD)) + cmd.append(tmpname) + + p = custom_popen(cmd) + return p.communicate()[0] + finally: + tmpf.close() + os.unlink(tmpname) + +def to_datetime(t): + """Convert 6-part time tuple into datetime object.""" + + if t is None: + return None + + # extract values + year, mon, day, h, m, xs = t + s = int(xs) + us = int(1000000 * (xs - s)) + + # assume the values are valid + try: + return datetime(year, mon, day, h, m, s, us) + except ValueError: + pass + + # sanitize invalid values + MDAY = (0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) + if mon < 1: mon = 1 + if mon > 12: mon = 12 + if day < 1: day = 1 + if day > MDAY[mon]: day = MDAY[mon] + if h > 23: h = 23 + if m > 59: m = 59 + if s > 59: s = 59 + if mon == 2 and day == 29: + try: + return datetime(year, mon, day, h, m, s, us) + except ValueError: + day = 28 + return datetime(year, mon, day, h, m, s, us) + +def parse_dos_time(stamp): + """Parse standard 32-bit DOS timestamp.""" + + sec = stamp & 0x1F; stamp = 
stamp >> 5 + min = stamp & 0x3F; stamp = stamp >> 6 + hr = stamp & 0x1F; stamp = stamp >> 5 + day = stamp & 0x1F; stamp = stamp >> 5 + mon = stamp & 0x0F; stamp = stamp >> 4 + yr = (stamp & 0x7F) + 1980 + return (yr, mon, day, hr, min, sec * 2) + +def custom_popen(cmd): + """Disconnect cmd from parent fds, read only from stdout.""" + + # needed for py2exe + creationflags = 0 + if sys.platform == 'win32': + creationflags = 0x08000000 # CREATE_NO_WINDOW + + # run command + try: + p = Popen(cmd, bufsize = 0, + stdout = PIPE, stdin = PIPE, stderr = STDOUT, + creationflags = creationflags) + except OSError as ex: + if ex.errno == errno.ENOENT: + raise RarCannotExec("Unrar not installed? (rarfile.UNRAR_TOOL=%r)" % UNRAR_TOOL) + raise + return p + +def custom_check(cmd, ignore_retcode=False): + """Run command, collect output, raise error if needed.""" + p = custom_popen(cmd) + out, err = p.communicate() + if p.returncode and not ignore_retcode: + raise RarExecError("Check-run failed") + return out + +def add_password_arg(cmd, psw, required=False): + """Append password switch to commandline.""" + if UNRAR_TOOL == ALT_TOOL: + return + if psw is not None: + cmd.append('-p' + psw) + else: + cmd.append('-p-') + +def check_returncode(p, out): + """Raise exception according to unrar exit code""" + + code = p.returncode + if code == 0: + return + + # map return code to exception class + errmap = [None, + RarWarning, RarFatalError, RarCRCError, RarLockedArchiveError, + RarWriteError, RarOpenError, RarUserError, RarMemoryError, + RarCreateError, RarNoFilesError] # codes from rar.txt + if UNRAR_TOOL == ALT_TOOL: + errmap = [None] + if code > 0 and code < len(errmap): + exc = errmap[code] + elif code == 255: + exc = RarUserBreak + elif code < 0: + exc = RarSignalExit + else: + exc = RarUnknownError + + # format message + if out: + msg = "%s [%d]: %s" % (exc.__doc__, p.returncode, out) + else: + msg = "%s [%d]" % (exc.__doc__, p.returncode) + + raise exc(msg) + +def 
membuf_tempfile(memfile): + memfile.seek(0, 0) + + tmpfd, tmpname = mkstemp(suffix='.rar') + tmpf = os.fdopen(tmpfd, "wb") + + try: + BSIZE = 32*1024 + while True: + buf = memfile.read(BSIZE) + if not buf: + break + tmpf.write(buf) + tmpf.close() + return tmpname + except: + tmpf.close() + os.unlink(tmpname) + raise + +# +# Check if unrar works +# + +try: + # does UNRAR_TOOL work? + custom_check([UNRAR_TOOL], True) +except RarCannotExec: + try: + # does ALT_TOOL work? + custom_check([ALT_TOOL] + list(ALT_CHECK_ARGS), True) + # replace config + UNRAR_TOOL = ALT_TOOL + OPEN_ARGS = ALT_OPEN_ARGS + EXTRACT_ARGS = ALT_EXTRACT_ARGS + TEST_ARGS = ALT_TEST_ARGS + except RarCannotExec: + # no usable tool, only uncompressed archives work + pass + diff --git a/libs/rarfile1/LICENSE b/libs/rarfile1/LICENSE new file mode 100644 index 00000000..cd53af08 --- /dev/null +++ b/libs/rarfile1/LICENSE @@ -0,0 +1,15 @@ + +Copyright (c) 2005-2016 Marko Kreen + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ diff --git a/libs/rarfile1/MANIFEST.in b/libs/rarfile1/MANIFEST.in new file mode 100644 index 00000000..6d1f1f6b --- /dev/null +++ b/libs/rarfile1/MANIFEST.in @@ -0,0 +1,3 @@ +include README.rst Makefile MANIFEST.in LICENSE dumprar.py +include doc/*.rst doc/Makefile doc/conf.py doc/make.bat +include test/Makefile test/*.sh test/files/*.rar test/files/*.exp diff --git a/libs/rarfile1/Makefile b/libs/rarfile1/Makefile new file mode 100644 index 00000000..45e3c2b7 --- /dev/null +++ b/libs/rarfile1/Makefile @@ -0,0 +1,31 @@ + +prefix = /usr/local + +all: + python setup.py build + +install: + python setup.py install --prefix=$(prefix) + +tgz: clean + python setup.py sdist + +clean: + rm -rf __pycache__ build dist + rm -f *.pyc MANIFEST *.orig *.rej *.html *.class + rm -rf doc/_build doc/_static doc/_templates + make -C test clean + +html: + rst2html README.rst > README.html + make -C doc html + +lint: + pylint -E rarfile.py + +rbuild: + curl -X POST https://readthedocs.org/build/6715 + +upload: + python setup.py sdist upload + diff --git a/libs/rarfile1/PKG-INFO b/libs/rarfile1/PKG-INFO new file mode 100644 index 00000000..282b56d3 --- /dev/null +++ b/libs/rarfile1/PKG-INFO @@ -0,0 +1,56 @@ +Metadata-Version: 1.1 +Name: rarfile +Version: 2.8 +Summary: RAR archive reader for Python +Home-page: https://github.com/markokr/rarfile +Author: Marko Kreen +Author-email: markokr@gmail.com +License: ISC +Description: rarfile - RAR archive reader for Python + ======================================= + + This is Python module for RAR_ archive reading. The interface + is made as zipfile_ like as possible. Licensed under ISC_ + license. + + Features: + + - Supports both RAR2 and RAR3 archives (WinRAR 2.x .. WinRAR 4.x). + - Supports multi volume archives. + - Supports Unicode filenames. + - Supports password-protected archives. + - Supports archive and file comments. + - Archive parsing and non-compressed files are handled in pure Python code. 
+ - Compressed files are extracted by executing external tool: either ``unrar`` + from RARLAB_ or ``bsdtar`` from libarchive_. + - Works with both Python 2.7 and 3.x. + + Notes: + + - Does not support the RAR5 format introduced in WinRAR 5.0. + - ``bsdtar`` does not support all RAR3 features. + + Links: + + - `Documentation`_ + - `Downloads`_ + - `Git`_ repo + + .. _RAR: https://en.wikipedia.org/wiki/RAR_%28file_format%29 + .. _zipfile: https://docs.python.org/2/library/zipfile.html + .. _ISC: https://en.wikipedia.org/wiki/ISC_license + .. _Git: https://github.com/markokr/rarfile + .. _Downloads: https://pypi.python.org/pypi/rarfile + .. _Documentation: https://rarfile.readthedocs.io/ + .. _libarchive: https://github.com/libarchive/libarchive + .. _RARLAB: http://www.rarlab.com/ +Keywords: rar,unrar,archive +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: ISC License (ISCL) +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: System :: Archiving :: Compression diff --git a/libs/rarfile1/README.rst b/libs/rarfile1/README.rst new file mode 100644 index 00000000..596ca917 --- /dev/null +++ b/libs/rarfile1/README.rst @@ -0,0 +1,39 @@ + +rarfile - RAR archive reader for Python +======================================= + +This is Python module for RAR_ archive reading. The interface +is made as zipfile_ like as possible. Licensed under ISC_ +license. + +Features: + +- Supports both RAR2 and RAR3 archives (WinRAR 2.x .. WinRAR 4.x). +- Supports multi volume archives. +- Supports Unicode filenames. +- Supports password-protected archives. +- Supports archive and file comments. +- Archive parsing and non-compressed files are handled in pure Python code. 
+- Compressed files are extracted by executing external tool: either ``unrar`` + from RARLAB_ or ``bsdtar`` from libarchive_. +- Works with both Python 2.7 and 3.x. + +Notes: + +- Does not support the RAR5 format introduced in WinRAR 5.0. +- ``bsdtar`` does not support all RAR3 features. + +Links: + +- `Documentation`_ +- `Downloads`_ +- `Git`_ repo + +.. _RAR: https://en.wikipedia.org/wiki/RAR_%28file_format%29 +.. _zipfile: https://docs.python.org/2/library/zipfile.html +.. _ISC: https://en.wikipedia.org/wiki/ISC_license +.. _Git: https://github.com/markokr/rarfile +.. _Downloads: https://pypi.python.org/pypi/rarfile +.. _Documentation: https://rarfile.readthedocs.io/ +.. _libarchive: https://github.com/libarchive/libarchive +.. _RARLAB: http://www.rarlab.com/ diff --git a/libs/rarfile1/doc/Makefile b/libs/rarfile1/doc/Makefile new file mode 100644 index 00000000..d257cf0b --- /dev/null +++ b/libs/rarfile1/doc/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/RarFile.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/RarFile.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/RarFile" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/RarFile" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
+ +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/libs/rarfile1/doc/api.rst b/libs/rarfile1/doc/api.rst new file mode 100644 index 00000000..9892e8b4 --- /dev/null +++ b/libs/rarfile1/doc/api.rst @@ -0,0 +1,111 @@ + +rarfile API documentation +========================= + +.. contents:: Table Of Contents + +Introduction +------------ + +.. automodule:: rarfile + +RarFile class +------------- + +.. autoclass:: RarFile + :members: + :inherited-members: + +RarInfo class +------------- + +.. autoclass:: RarInfo + :members: + :inherited-members: + +RarExtFile class +---------------- + +.. autoclass:: RarExtFile + :members: + :inherited-members: + +Functions +--------- + +.. autofunction:: is_rarfile + +Module Configuration +-------------------- + +.. autodata:: UNRAR_TOOL +.. autodata:: DEFAULT_CHARSET +.. 
autodata:: TRY_ENCODINGS +.. autodata:: USE_DATETIME +.. autodata:: PATH_SEP +.. autodata:: NEED_COMMENTS +.. autodata:: UNICODE_COMMENTS +.. autodata:: USE_EXTRACT_HACK +.. autodata:: HACK_SIZE_LIMIT + +Constants +--------- + +.. py:data:: RAR_M0 + + No compression. + +.. py:data:: RAR_M1 + + Compression level `-m1` - Fastest compression. + +.. py:data:: RAR_M2 + + Compression level `-m2`. + +.. py:data:: RAR_M3 + + Compression level `-m3`. + +.. py:data:: RAR_M4 + + Compression level `-m4`. + +.. py:data:: RAR_M5 + + Compression level `-m5` - Maximum compression. + +.. py:data:: RAR_OS_MSDOS +.. py:data:: RAR_OS_OS2 +.. py:data:: RAR_OS_WIN32 +.. py:data:: RAR_OS_UNIX +.. py:data:: RAR_OS_MACOS +.. py:data:: RAR_OS_BEOS + +Exceptions +---------- + +.. autoclass:: Error +.. autoclass:: BadRarFile +.. autoclass:: NotRarFile +.. autoclass:: BadRarName +.. autoclass:: NoRarEntry +.. autoclass:: PasswordRequired +.. autoclass:: NeedFirstVolume +.. autoclass:: NoCrypto +.. autoclass:: RarExecError +.. autoclass:: RarWarning +.. autoclass:: RarFatalError +.. autoclass:: RarCRCError +.. autoclass:: RarLockedArchiveError +.. autoclass:: RarWriteError +.. autoclass:: RarOpenError +.. autoclass:: RarUserError +.. autoclass:: RarMemoryError +.. autoclass:: RarCreateError +.. autoclass:: RarNoFilesError +.. autoclass:: RarUserBreak +.. autoclass:: RarUnknownError +.. autoclass:: RarSignalExit + + diff --git a/libs/rarfile1/doc/conf.py b/libs/rarfile1/doc/conf.py new file mode 100644 index 00000000..47094733 --- /dev/null +++ b/libs/rarfile1/doc/conf.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +# +# RarFile documentation build configuration file, created by +# sphinx-quickstart on Sun Mar 24 13:29:46 2013. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. 
+# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os, os.path + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +import rarfile + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx'] + +autodoc_member_order = 'bysource' +autoclass_content = 'both' +autodoc_default_flags = ['show-inheritance'] + +intersphinx_mapping = {'python': ('http://docs.python.org/2', None)} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'RarFile' +copyright = u'2005-2016, Marko Kreen' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = rarfile.__version__ +# The full version, including alpha/beta/rc tags. +release = rarfile.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. 
+#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +html_show_sphinx = False + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. 
+#htmlhelp_basename = 'RarFiledoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'RarFile.tex', u'RarFile Documentation', + u'Marko Kreen', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +#man_pages = [ +# ('index', 'rarfile', u'RarFile Documentation', +# [u'Marko Kreen'], 1) +#] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'RarFile', u'RarFile Documentation', + u'Marko Kreen', 'RarFile', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/libs/rarfile1/doc/faq.rst b/libs/rarfile1/doc/faq.rst new file mode 100644 index 00000000..488b42a4 --- /dev/null +++ b/libs/rarfile1/doc/faq.rst @@ -0,0 +1,87 @@ + +rarfile FAQ +=========== + +.. contents:: Table of Contents + +What are the dependencies? +-------------------------- + +It depends on ``unrar`` command-line utility to do the actual decompression. +Note that by default it expect it to be in ``PATH``. If unrar +launching fails, you need to fix this. + +Alternatively, :mod:`rarfile` can use bsdtar_ from libarchive_ as +decompression backend, but that is a bit problematic as bsdtar_ does not support +all RAR features. + +.. _bsdtar: https://github.com/libarchive/libarchive/wiki/ManPageBsdtar1 +.. _libarchive: http://www.libarchive.org/ + +It depends on cryptography_ or PyCrypto_ modules to process +archives with password-protected headers. + +.. _cryptography: https://pypi.python.org/pypi/cryptography +.. _PyCrypto: https://pypi.python.org/pypi/pycrypto + +Does it parse ``unrar`` output to get archive contents? +------------------------------------------------------- + +No, :mod:`rarfile` parses RAR structure in Python code. Also it can +read uncompressed files from archive without external utility. + +Will rarfile support wrapping unrarlib/unrar.dll/unrar.so in the future? +------------------------------------------------------------------------ + +No. 
The current architecture - parsing in Python and decompression with +command line tools works well across all interesting operating systems +(Windows/Linux/MacOS), wrapping a library does not bring any advantages. + +Simple execution of command-line tools is also a legally simpler situation +than linking with external library. + +How can I get it work on Windows? +--------------------------------- + +On Windows the ``unrar.exe`` is not in ``PATH`` so simple ``Popen("unrar ..")`` does not work. +It can be solved in several ways: + +1. Add location of ``unrar.exe`` to PATH. +2. Set :data:`rarfile.UNRAR_TOOL` to full path of ``unrar.exe``. +3. Copy ``unrar.exe`` to your program directory. +4. Copy ``unrar.exe`` to system directory that is in PATH, eg. ``C:\Windows``. + +How to avoid the need for user to manually install rarfile/unrar? +----------------------------------------------------------------- + +Include ``rarfile.py`` and/or ``unrar`` with your application. + +Will it support creating RAR archives? +-------------------------------------- + +No. RARLAB_ is not interested in RAR becoming open format +and specifically discourages writing RAR creation software. + +In the meantime use either Zip_ (better compatibility) or 7z_ (better compression) +format for your own archives. + +.. _RARLAB: http://www.rarlab.com/ +.. _Zip: https://en.wikipedia.org/wiki/ZIP_%28file_format%29 +.. _7z: https://en.wikipedia.org/wiki/7z + +What is the USE_EXTRACT_HACK? +----------------------------- + +RarFile uses ``unrar`` to extract compressed files. But when extracting +single file from archive containing many entries, ``unrar`` needs to parse +whole archive until it finds the right entry. This makes random-access +to entries slow. To avoid that, RarFile remembers location of compressed +data for each entry and on read it copies it to temporary archive containing +only data for that one file, thus making ``unrar`` fast.
+ +The logic is only activated for entries smaller than :data:`rarfile.HACK_SIZE_LIMIT` +(20M by default). Bigger files are accessed directly from RAR. + +Note - it only works for non-solid archives. So if you care about +random access to files in your archive, do not create solid archives. + diff --git a/libs/rarfile1/doc/index.rst b/libs/rarfile1/doc/index.rst new file mode 100644 index 00000000..bbd4a51b --- /dev/null +++ b/libs/rarfile1/doc/index.rst @@ -0,0 +1,42 @@ + +rarfile - RAR archive reader for Python +======================================= + +This is Python module for RAR_ archive reading. The interface +is made as zipfile_ like as possible. Licensed under ISC_ +license. + +.. _RAR: http://en.wikipedia.org/wiki/RAR +.. _zipfile: http://docs.python.org/library/zipfile.html +.. _ISC: http://en.wikipedia.org/wiki/ISC_license + +Features: + +- Supports both RAR 2.x and 3.x archives. +- Supports multi volume archives. +- Supports Unicode filenames. +- Supports password-protected archives. +- Supports archive and file comments. +- Archive parsing and non-compressed files are handled in pure Python code. +- For compressed files runs ``unrar`` utility. +- Works with both Python 2.x and 3.x. + + + +Documentation: + +.. toctree:: + :maxdepth: 1 + + Module Documentation + FAQs + Release News + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/libs/rarfile1/doc/make.bat b/libs/rarfile1/doc/make.bat new file mode 100644 index 00000000..5a239c33 --- /dev/null +++ b/libs/rarfile1/doc/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . 
+if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\RarFile.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\RarFile.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end diff --git a/libs/rarfile1/doc/news.rst b/libs/rarfile1/doc/news.rst new file mode 100644 index 00000000..85d00f19 --- /dev/null +++ b/libs/rarfile1/doc/news.rst @@ -0,0 +1,243 @@ + +rarfile history +=============== + +.. py:currentmodule:: rarfile + +Version 2.8 (2016-06-07) +------------------------ + +* Fix: support solid archives from in-memory file object. + Full archive will be written out to temp file. + [`#21 `_] + +* Fix: ask unrar stop switches scanning, + to handle archive names starting with "-". + (Alexander Shadchin) + [`#12 `_] + +* Fix: add missing _parse_error variable to RarFile object. + (Gregory Mazzola) + [`#20 `_] + +* Fix: return proper boolean from :meth:`RarInfo.needs_password`. + [`#22 `_] + +* Fix: do not insert non-string rarfile into exception string. + (Tim Muller) + [`#23 `_] + +* Fix: make :meth:`RarFile.extract` and :meth:`RarFile.testrar` + support in-memory archives. + +* Use cryptography_ module as preferred crypto backend. + PyCrypto_ will be used as fallback. 
+ +* Cleanup: remove compat code for Python 2.4/2.5/2.6. + +.. _cryptography: https://pypi.python.org/pypi/cryptography +.. _PyCrypto: https://pypi.python.org/pypi/pycrypto + +Version 2.7 (2014-11-23) +------------------------ + +* Allow use of bsdtar_ as decompression backend. It sits + on top of libarchive_, which has support for reading RAR archives. + + Limitations of ``libarchive`` RAR backend: + + - Does not support solid archives. + - Does not support password-protected archives. + - Does not support "parsing filters" used for audio/image/executable data, + so few non-solid, non-encrypted archives also fail. + + Now :mod:`rarfile` checks if ``unrar`` works and if not then tries ``bsdtar``. + If that works, then keeps using it. If not then configuration + stays with ``unrar`` which will then appear in error messages. + +.. _bsdtar: https://github.com/libarchive/libarchive/wiki/ManPageBsdtar1 +.. _libarchive: http://www.libarchive.org/ + +* Both :class:`RarFile` and :func:`is_rarfile` now accept file-like + object. Eg. :class:`io.BytesIO`. Only requirement is that the object + must be seekable. This mirrors similar functionality in zipfile. + + Based on patch by Chase Zhang. + +* Uniform error handling. :class:`RarFile` accepts ``errors="strict"`` + argument. + + Allow user to tune whether parsing and missing file errors will raise + exception. If error is not raised, the error string can be queried + with :meth:`RarFile.strerror` method. + +Version 2.6 (2013-04-10) +------------------------ + +* Add context manager support for :class:`RarFile` class. + Both :class:`RarFile` and :class:`RarExtFile` support + :keyword:`with` statement now. + (Wentao Han) +* :meth:`RarFile.volumelist` method, returns filenames of archive volumes. +* Re-throw clearer error in case ``unrar`` is not found in ``PATH``. +* Sync new unrar4.x error code from ``rar.txt``. +* Use Sphinx for documentation, push docs to rtfd.org_ + +..
_rtfd.org: https://rarfile.readthedocs.org/ + +Version 2.5 (2012-01-19) +------------------------ + +Fixes: + +* :meth:`RarExtFile.read` and :meth:`RarExtFile.readinto` now do looping read + to work properly on short reads. Important for Python 3.2+ where read from pipe + can return short result even on blocking file descriptor. +* Proper error reporting in :meth:`RarFile.extract`, :meth:`RarFile.extractall` + and :meth:`RarFile.testrar`. +* :meth:`RarExtFile.read` from unrar pipe: prefer to return unrar error code, + if that's not available, do own error checks. +* Avoid string addition in :meth:`RarExtFile.read`, instead use always list+join to + merge multi-part reads. +* dumprar: don't re-encode byte strings (Python 2.x). This avoids + unnecessary failure when printing invalid unicode. + +Version 2.4 (2011-11-05) +------------------------ + +Fixes: + +* :data:`USE_DATETIME`: survive bad values from RAR +* Fix bug in corrupt unicode filename handling +* dumprar: make unicode chars work with both pipe and console + +Version 2.3 (2011-07-03) +------------------------ + +Features: + +* Support .seek() method on file streams. (Kristian Larsson) +* Support .readinto() method on file streams. Optimized implementation + is available on Python 2.6+ where :class:`memoryview` is available. +* Support file comments - :attr:`RarInfo.comment` contains decompressed data if available. +* File objects returned by :meth:`RarFile.open()` are :class:`io.RawIOBase`-compatible. + They can be further wrapped with :class:`io.BufferedReader` and :class:`io.TextIOWrapper`. +* Now .getinfo() uses dict lookup instead of sequential scan when + searching archive entry. This speeds up processing for archives that + have many entries. +* Option :data:`UNICODE_COMMENTS` to decode both archive and file comments to unicode. + It uses :data:`TRY_ENCODINGS` for list of encodings to try. If off, comments are + left as byte strings. Default: 0 +* Option :data:`PATH_SEP` to change path separator.
Default: ``r'\'``, + set ``rarfile.PATH_SEP='/'`` to be compatible with zipfile. +* Option :data:`USE_DATETIME` to convert timestamps to datetime objects. + Default: 0, timestamps are tuples. +* Option :data:`TRY_ENCODINGS` to allow tuning attempted encoding list. +* Reorder :class:`RarInfo` fields to better show zipfile-compatible fields. +* Standard regtests to make sure various features work + +Compatibility: + +* Drop :attr:`RarInfo.unicode_filename`, plain :attr:`RarInfo.filename` is already unicode since 2.0. +* .read(-1) reads now until EOF. Previously it returned empty buffer. + +Fixes: + +* Make encrypted headers work with Python 3.x bytes() and with old 2.x 'sha' module. +* Simplify :class:`subprocess.Popen` usage when launching ``unrar``. Previously + it tried to optimize and work around OS/Python bugs, but this is not + maintainable. +* Use temp rar file hack on multi-volume archives too. +* Always .wait() on unrar, to avoid zombies +* Convert struct.error to BadRarFile +* Plug some fd leaks. Affected: Jython, PyPy. +* Broken archives are handled more robustly. + +Version 2.2 (2010-08-19) +------------------------ + +Fixes: + +* Relaxed volume naming. Now it just calculates new volume name by finding number + in old one and increasing it, without any expectations what that number should be. +* Files with 4G of compressed data in one volume were handled wrong. Fix. +* DOS timestamp seconds need to be multiplied with 2. +* Correct EXTTIME parsing. + +Cleanups: + +* Compressed size is per-volume, sum them together, so that user sees complete + compressed size for files split over several volumes. +* dumprar: Show unknown bits. +* Use :class:`struct.Struct` to cache unpack formats. +* Support missing :data:`os.devnull`. (Python 2.3) + +Version 2.1 (2010-07-31) +------------------------ + +Features: + +* Minimal implementation for :meth:`RarFile.extract`, :meth:`RarFile.extractall`, :meth:`RarFile.testrar`. + They are simple shortcuts to ``unrar`` invocation.
+* Accept :class:`RarInfo` object where filename is expected. +* Include ``dumprar.py`` in .tgz. It can be used to visualize RAR structure + and test module. +* Support for encrypted file headers. + +Fixes: + +* Don't read past ENDARC, there could be non-RAR data there. +* RAR 2.x: It does not write ENDARC, but our volume code expected it. Fix that. +* RAR 2.x: Support more than 200 old-style volumes. + +Cleanups: + +* Load comment only when requested. +* Cleanup of internal config variables. They should have now final names. +* :meth:`RarFile.open`: Add mode=r argument to match zipfile. +* Doc and comments cleanup, minimize duplication. +* Common wrappers for both compressed and uncompressed files, + now :meth:`RarFile.open` also does CRC-checking. + +Version 2.0 (2010-04-29) +------------------------ + +Features: + +* Python 3 support. Still works with 2.x. +* Parses extended time fields. (.mtime, .ctime, .atime) +* :meth:`RarFile.open` method. This makes possible to process large + entries that do not fit into memory. +* Supports password-protected archives. +* Supports archive comments. + +Cleanups: + +* Uses :mod:`subprocess` module to launch ``unrar``. +* .filename is always Unicode string, .unicode_filename is now deprecated. +* .CRC is unsigned again, as python3 crc32() is unsigned. + +Version 1.1 (2008-08-31) +------------------------ + +Fixes: + +* Replace :func:`os.tempnam` with :func:`tempfile.mkstemp`. (Jason Moiron) +* Fix infinite loop in _extract_hack on unexpected EOF +* :attr:`RarInfo.CRC` is now signed value to match crc32() +* :meth:`RarFile.read` now checks file crc + +Cleanups: + +* more docstrings +* throw proper exceptions (subclasses of :exc:`rarfile.Error`) +* RarInfo has fields pre-initialized, so they appear in help() +* rename RarInfo.data to RarInfo.header_data +* dont use "print" when header parsing fails +* use try/finally to delete temp rar + +Version 1.0 (2005-08-08) +------------------------ + +* First release. 
+ diff --git a/libs/rarfile1/dumprar.py b/libs/rarfile1/dumprar.py new file mode 100755 index 00000000..f7ab062b --- /dev/null +++ b/libs/rarfile1/dumprar.py @@ -0,0 +1,361 @@ +#! /usr/bin/env python + +"""Dump archive contents, test extraction.""" + +import io +import sys +import rarfile as rf +from binascii import crc32, hexlify +from datetime import datetime + +try: + bytearray +except NameError: + import array + def bytearray(v): + return array.array('B', v) + +rf.UNICODE_COMMENTS = 1 +rf.USE_DATETIME = 1 + +usage = """ +dumprar [switches] [ARC1 ARC2 ...] [@ARCLIST] +switches: + @file read archive names from file + -pPSW set password + -Ccharset set fallback charset + -v increase verbosity + -t attempt to read all files + -x write read files out + -c show archive comment + -h show usage + -- stop switch parsing +""".strip() + +os_list = ['DOS', 'OS2', 'WIN', 'UNIX', 'MACOS', 'BEOS'] + +block_strs = ['MARK', 'MAIN', 'FILE', 'OLD_COMMENT', 'OLD_EXTRA', + 'OLD_SUB', 'OLD_RECOVERY', 'OLD_AUTH', 'SUB', 'ENDARC'] + +def rarType(type): + if type < rf.RAR_BLOCK_MARK or type > rf.RAR_BLOCK_ENDARC: + return "*UNKNOWN*" + return block_strs[type - rf.RAR_BLOCK_MARK] + +main_bits = ( + (rf.RAR_MAIN_VOLUME, "VOL"), + (rf.RAR_MAIN_COMMENT, "COMMENT"), + (rf.RAR_MAIN_LOCK, "LOCK"), + (rf.RAR_MAIN_SOLID, "SOLID"), + (rf.RAR_MAIN_NEWNUMBERING, "NEWNR"), + (rf.RAR_MAIN_AUTH, "AUTH"), + (rf.RAR_MAIN_RECOVERY, "RECOVERY"), + (rf.RAR_MAIN_PASSWORD, "PASSWORD"), + (rf.RAR_MAIN_FIRSTVOLUME, "FIRSTVOL"), + (rf.RAR_SKIP_IF_UNKNOWN, "SKIP"), + (rf.RAR_LONG_BLOCK, "LONG"), +) + +endarc_bits = ( + (rf.RAR_ENDARC_NEXT_VOLUME, "NEXTVOL"), + (rf.RAR_ENDARC_DATACRC, "DATACRC"), + (rf.RAR_ENDARC_REVSPACE, "REVSPACE"), + (rf.RAR_ENDARC_VOLNR, "VOLNR"), + (rf.RAR_SKIP_IF_UNKNOWN, "SKIP"), + (rf.RAR_LONG_BLOCK, "LONG"), +) + +file_bits = ( + (rf.RAR_FILE_SPLIT_BEFORE, "SPLIT_BEFORE"), + (rf.RAR_FILE_SPLIT_AFTER, "SPLIT_AFTER"), + (rf.RAR_FILE_PASSWORD, "PASSWORD"), + (rf.RAR_FILE_COMMENT, 
"COMMENT"), + (rf.RAR_FILE_SOLID, "SOLID"), + (rf.RAR_FILE_LARGE, "LARGE"), + (rf.RAR_FILE_UNICODE, "UNICODE"), + (rf.RAR_FILE_SALT, "SALT"), + (rf.RAR_FILE_VERSION, "VERSION"), + (rf.RAR_FILE_EXTTIME, "EXTTIME"), + (rf.RAR_FILE_EXTFLAGS, "EXTFLAGS"), + (rf.RAR_SKIP_IF_UNKNOWN, "SKIP"), + (rf.RAR_LONG_BLOCK, "LONG"), +) + +generic_bits = ( + (rf.RAR_SKIP_IF_UNKNOWN, "SKIP"), + (rf.RAR_LONG_BLOCK, "LONG"), +) + +file_parms = ("D64", "D128", "D256", "D512", + "D1024", "D2048", "D4096", "DIR") + +def xprint(m, *args): + if sys.hexversion < 0x3000000: + m = m.decode('utf8') + if args: + m = m % args + if sys.hexversion < 0x3000000: + m = m.encode('utf8') + sys.stdout.write(m) + sys.stdout.write('\n') + +def render_flags(flags, bit_list): + res = [] + known = 0 + for bit in bit_list: + known = known | bit[0] + if flags & bit[0]: + res.append(bit[1]) + unknown = flags & ~known + n = 0 + while unknown: + if unknown & 1: + res.append("UNK_%04x" % (1 << n)) + unknown = unknown >> 1 + n += 1 + + return ",".join(res) + +def get_file_flags(flags): + res = render_flags(flags & ~rf.RAR_FILE_DICTMASK, file_bits) + + xf = (flags & rf.RAR_FILE_DICTMASK) >> 5 + res += "," + file_parms[xf] + return res + +def get_main_flags(flags): + return render_flags(flags, main_bits) + +def get_endarc_flags(flags): + return render_flags(flags, endarc_bits) + +def get_generic_flags(flags): + return render_flags(flags, generic_bits) + +def fmt_time(t): + if isinstance(t, datetime): + return t.isoformat(' ') + return "%04d-%02d-%02d %02d:%02d:%02d" % t + +def show_item(h): + st = rarType(h.type) + unknown = h.header_size - h.header_base + xprint("%s: hdrlen=%d datlen=%d hdr_unknown=%d", st, h.header_size, + h.add_size, unknown) + if unknown > 0 and cf_verbose > 1: + dat = h.header_data[h.header_base : ] + xprint(" unknown: %s", hexlify(dat)) + if h.type in (rf.RAR_BLOCK_FILE, rf.RAR_BLOCK_SUB): + if h.host_os == rf.RAR_OS_UNIX: + s_mode = "0%o" % h.mode + else: + s_mode = "0x%x" % h.mode + xprint(" 
flags=0x%04x:%s", h.flags, get_file_flags(h.flags)) + if h.host_os >= 0 and h.host_os < len(os_list): + s_os = os_list[h.host_os] + else: + s_os = "?" + xprint(" os=%d:%s ver=%d mode=%s meth=%c cmp=%d dec=%d vol=%d", + h.host_os, s_os, + h.extract_version, s_mode, h.compress_type, + h.compress_size, h.file_size, h.volume) + ucrc = (h.CRC + (1 << 32)) & ((1 << 32) - 1) + xprint(" crc=0x%08x (%d) time=%s", ucrc, h.CRC, fmt_time(h.date_time)) + xprint(" name=%s", h.filename) + if h.mtime: + xprint(" mtime=%s", fmt_time(h.mtime)) + if h.ctime: + xprint(" ctime=%s", fmt_time(h.ctime)) + if h.atime: + xprint(" atime=%s", fmt_time(h.atime)) + if h.arctime: + xprint(" arctime=%s", fmt_time(h.arctime)) + elif h.type == rf.RAR_BLOCK_MAIN: + xprint(" flags=0x%04x:%s", h.flags, get_main_flags(h.flags)) + elif h.type == rf.RAR_BLOCK_ENDARC: + xprint(" flags=0x%04x:%s", h.flags, get_endarc_flags(h.flags)) + elif h.type == rf.RAR_BLOCK_MARK: + xprint(" flags=0x%04x:", h.flags) + else: + xprint(" flags=0x%04x:%s", h.flags, get_generic_flags(h.flags)) + + if h.comment is not None: + cm = repr(h.comment) + if cm[0] == 'u': + cm = cm[1:] + xprint(" comment=%s", cm) + +cf_show_comment = 0 +cf_verbose = 0 +cf_charset = None +cf_extract = 0 +cf_test_read = 0 +cf_test_unrar = 0 +cf_test_memory = 0 + +def check_crc(f, inf): + ucrc = f.CRC + if ucrc < 0: + ucrc += (long(1) << 32) + if ucrc != inf.CRC: + print ('crc error') + +def test_read_long(r, inf): + f = r.open(inf.filename) + total = 0 + while 1: + data = f.read(8192) + if not data: + break + total += len(data) + if total != inf.file_size: + xprint("\n *** %s has corrupt file: %s ***", r.rarfile, inf.filename) + xprint(" *** short read: got=%d, need=%d ***\n", total, inf.file_size) + check_crc(f, inf) + + # test .seek() & .readinto() + if cf_test_read > 1: + f.seek(0,0) + + # hack: re-enable crc calc + f.crc_check = 1 + f.CRC = 0 + + total = 0 + buf = bytearray(rf.ZERO*4096) + while 1: + res = f.readinto(buf) + if not res: + break + 
total += res + if inf.file_size != total: + xprint(" *** readinto failed: got=%d, need=%d ***\n", total, inf.file_size) + check_crc(f, inf) + f.close() + +def test_read(r, inf): + test_read_long(r, inf) + + +def test_real(fn, psw): + xprint("Archive: %s", fn) + + cb = None + if cf_verbose > 1: + cb = show_item + + rfarg = fn + if cf_test_memory: + rfarg = io.BytesIO(open(fn, 'rb').read()) + + # check if rar + if not rf.is_rarfile(rfarg): + xprint(" --- %s is not a RAR file ---", fn) + return + + # open + r = rf.RarFile(rfarg, charset = cf_charset, info_callback = cb) + # set password + if r.needs_password(): + if psw: + r.setpassword(psw) + else: + xprint(" --- %s requires password ---", fn) + return + + # show comment + if cf_show_comment and r.comment: + for ln in r.comment.split('\n'): + xprint(" %s", ln) + elif cf_verbose == 1 and r.comment: + cm = repr(r.comment) + if cm[0] == 'u': + cm = cm[1:] + xprint(" comment=%s", cm) + + # process + for n in r.namelist(): + inf = r.getinfo(n) + if inf.isdir(): + continue + if cf_verbose == 1: + show_item(inf) + if cf_test_read: + test_read(r, inf) + + if cf_extract: + r.extractall() + for inf in r.infolist(): + r.extract(inf) + + if cf_test_unrar: + r.testrar() + +def test(fn, psw): + try: + test_real(fn, psw) + except rf.NeedFirstVolume: + xprint(" --- %s is middle part of multi-vol archive ---", fn) + except rf.Error: + exc, msg, tb = sys.exc_info() + xprint("\n *** %s: %s ***\n", exc.__name__, msg) + del tb + except IOError: + exc, msg, tb = sys.exc_info() + xprint("\n *** %s: %s ***\n", exc.__name__, msg) + del tb + +def main(): + global cf_verbose, cf_show_comment, cf_charset + global cf_extract, cf_test_read, cf_test_unrar + global cf_test_memory + + # parse args + args = [] + psw = None + noswitch = False + for a in sys.argv[1:]: + if noswitch: + args.append(a) + elif a[0] == "@": + for ln in open(a[1:], 'r'): + fn = ln[:-1] + args.append(fn) + elif a[0] != '-': + args.append(a) + elif a[1] == 'p': + psw = a[2:] + 
elif a == '--': + noswitch = True + elif a == '-h': + xprint(usage) + return + elif a == '-v': + cf_verbose += 1 + elif a == '-c': + cf_show_comment = 1 + elif a == '-x': + cf_extract = 1 + elif a == '-t': + cf_test_read += 1 + elif a == '-T': + cf_test_unrar = 1 + elif a == '-M': + cf_test_memory = 1 + elif a[1] == 'C': + cf_charset = a[2:] + else: + raise Exception("unknown switch: "+a) + if not args: + xprint(usage) + + for fn in args: + test(fn, psw) + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + pass + diff --git a/libs/rarfile1/setup.py b/libs/rarfile1/setup.py new file mode 100644 index 00000000..e1b412c6 --- /dev/null +++ b/libs/rarfile1/setup.py @@ -0,0 +1,33 @@ +#! /usr/bin/env python + +from distutils.core import setup + +import rarfile + +ver = rarfile.__version__ +ldesc = open("README.rst").read().strip() +sdesc = ldesc.split('\n')[0].split(' - ')[1].strip() + +setup( + name = "rarfile", + version = ver, + description = sdesc, + long_description = ldesc, + author = "Marko Kreen", + license = "ISC", + author_email = "markokr@gmail.com", + url = "https://github.com/markokr/rarfile", + py_modules = ['rarfile'], + keywords = ['rar', 'unrar', 'archive'], + classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: ISC License (ISCL)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: System :: Archiving :: Compression", + ] +) + diff --git a/libs/rarfile1/test/Makefile b/libs/rarfile1/test/Makefile new file mode 100644 index 00000000..5383db3f --- /dev/null +++ b/libs/rarfile1/test/Makefile @@ -0,0 +1,9 @@ +test: + ./test1.sh + ./test2.sh + +clean: + rm -rf __pycache__ + rm -f files/*.rar.[pj]* *.pyc *.class *.diffs + rm -f rarfile.py + diff --git a/libs/rarfile1/test/files/ctime0.rar 
b/libs/rarfile1/test/files/ctime0.rar new file mode 100644 index 00000000..d72c62dd Binary files /dev/null and b/libs/rarfile1/test/files/ctime0.rar differ diff --git a/libs/rarfile1/test/files/ctime0.rar.exp b/libs/rarfile1/test/files/ctime0.rar.exp new file mode 100644 index 00000000..2d6d0527 --- /dev/null +++ b/libs/rarfile1/test/files/ctime0.rar.exp @@ -0,0 +1,7 @@ +Archive: files/ctime0.rar +FILE: hdrlen=46 datlen=0 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=2:WIN ver=29 mode=0x20 meth=0 cmp=0 dec=0 vol=0 + crc=0x00000000 (0) time=2011-05-10 21:28:47.899345 + name=afile.txt + mtime=2011-05-10 21:28:47.899345 diff --git a/libs/rarfile1/test/files/ctime1.rar b/libs/rarfile1/test/files/ctime1.rar new file mode 100644 index 00000000..89d82557 Binary files /dev/null and b/libs/rarfile1/test/files/ctime1.rar differ diff --git a/libs/rarfile1/test/files/ctime1.rar.exp b/libs/rarfile1/test/files/ctime1.rar.exp new file mode 100644 index 00000000..acab0250 --- /dev/null +++ b/libs/rarfile1/test/files/ctime1.rar.exp @@ -0,0 +1,8 @@ +Archive: files/ctime1.rar +FILE: hdrlen=50 datlen=0 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=2:WIN ver=29 mode=0x20 meth=0 cmp=0 dec=0 vol=0 + crc=0x00000000 (0) time=2011-05-10 21:28:47.899345 + name=afile.txt + mtime=2011-05-10 21:28:47.899345 + ctime=2011-05-10 21:28:47 diff --git a/libs/rarfile1/test/files/ctime2.rar b/libs/rarfile1/test/files/ctime2.rar new file mode 100644 index 00000000..09c91371 Binary files /dev/null and b/libs/rarfile1/test/files/ctime2.rar differ diff --git a/libs/rarfile1/test/files/ctime2.rar.exp b/libs/rarfile1/test/files/ctime2.rar.exp new file mode 100644 index 00000000..0b45e28d --- /dev/null +++ b/libs/rarfile1/test/files/ctime2.rar.exp @@ -0,0 +1,8 @@ +Archive: files/ctime2.rar +FILE: hdrlen=51 datlen=0 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=2:WIN ver=29 mode=0x20 meth=0 cmp=0 dec=0 vol=0 + crc=0x00000000 (0) time=2011-05-10 21:28:47.899345 + name=afile.txt + 
mtime=2011-05-10 21:28:47.899345 + ctime=2011-05-10 21:28:47.897843 diff --git a/libs/rarfile1/test/files/ctime3.rar b/libs/rarfile1/test/files/ctime3.rar new file mode 100644 index 00000000..a32fa14f Binary files /dev/null and b/libs/rarfile1/test/files/ctime3.rar differ diff --git a/libs/rarfile1/test/files/ctime3.rar.exp b/libs/rarfile1/test/files/ctime3.rar.exp new file mode 100644 index 00000000..7a185b5d --- /dev/null +++ b/libs/rarfile1/test/files/ctime3.rar.exp @@ -0,0 +1,8 @@ +Archive: files/ctime3.rar +FILE: hdrlen=52 datlen=0 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=2:WIN ver=29 mode=0x20 meth=0 cmp=0 dec=0 vol=0 + crc=0x00000000 (0) time=2011-05-10 21:28:47.899345 + name=afile.txt + mtime=2011-05-10 21:28:47.899345 + ctime=2011-05-10 21:28:47.899327 diff --git a/libs/rarfile1/test/files/ctime4.rar b/libs/rarfile1/test/files/ctime4.rar new file mode 100644 index 00000000..921e0da6 Binary files /dev/null and b/libs/rarfile1/test/files/ctime4.rar differ diff --git a/libs/rarfile1/test/files/ctime4.rar.exp b/libs/rarfile1/test/files/ctime4.rar.exp new file mode 100644 index 00000000..7ce30c0d --- /dev/null +++ b/libs/rarfile1/test/files/ctime4.rar.exp @@ -0,0 +1,8 @@ +Archive: files/ctime4.rar +FILE: hdrlen=53 datlen=0 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=2:WIN ver=29 mode=0x20 meth=0 cmp=0 dec=0 vol=0 + crc=0x00000000 (0) time=2011-05-10 21:28:47.899345 + name=afile.txt + mtime=2011-05-10 21:28:47.899345 + ctime=2011-05-10 21:28:47.899345 diff --git a/libs/rarfile1/test/files/rar15-comment-lock.rar b/libs/rarfile1/test/files/rar15-comment-lock.rar new file mode 100644 index 00000000..462f2625 Binary files /dev/null and b/libs/rarfile1/test/files/rar15-comment-lock.rar differ diff --git a/libs/rarfile1/test/files/rar15-comment-lock.rar.exp b/libs/rarfile1/test/files/rar15-comment-lock.rar.exp new file mode 100644 index 00000000..4a4af276 --- /dev/null +++ b/libs/rarfile1/test/files/rar15-comment-lock.rar.exp @@ -0,0 +1,14 @@ 
+Archive: files/rar15-comment-lock.rar + comment='RARcomment -----' +FILE: hdrlen=72 datlen=7 hdr_unknown=31 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=15 mode=0x20 meth=3 cmp=7 dec=7 vol=0 + crc=0xe27f07a9 (3799975849) time=2010-11-03 19:49:32 + name=FILE1.TXT + comment='file1comment -----' +FILE: hdrlen=72 datlen=8 hdr_unknown=31 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=15 mode=0x20 meth=0 cmp=8 dec=8 vol=0 + crc=0x3c4306f7 (1011025655) time=2010-11-03 19:49:38 + name=FILE2.TXT + comment='file2comment -----' diff --git a/libs/rarfile1/test/files/rar15-comment.rar b/libs/rarfile1/test/files/rar15-comment.rar new file mode 100644 index 00000000..f193bb0f Binary files /dev/null and b/libs/rarfile1/test/files/rar15-comment.rar differ diff --git a/libs/rarfile1/test/files/rar15-comment.rar.exp b/libs/rarfile1/test/files/rar15-comment.rar.exp new file mode 100644 index 00000000..05e5a928 --- /dev/null +++ b/libs/rarfile1/test/files/rar15-comment.rar.exp @@ -0,0 +1,14 @@ +Archive: files/rar15-comment.rar + comment='RARcomment -----' +FILE: hdrlen=72 datlen=7 hdr_unknown=31 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=15 mode=0x20 meth=3 cmp=7 dec=7 vol=0 + crc=0xe27f07a9 (3799975849) time=2010-11-03 19:49:32 + name=FILE1.TXT + comment='file1comment -----' +FILE: hdrlen=72 datlen=8 hdr_unknown=31 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=15 mode=0x20 meth=0 cmp=8 dec=8 vol=0 + crc=0x3c4306f7 (1011025655) time=2010-11-03 19:49:38 + name=FILE2.TXT + comment='file2comment -----' diff --git a/libs/rarfile1/test/files/rar202-comment-nopsw.rar b/libs/rarfile1/test/files/rar202-comment-nopsw.rar new file mode 100644 index 00000000..329dc72a Binary files /dev/null and b/libs/rarfile1/test/files/rar202-comment-nopsw.rar differ diff --git a/libs/rarfile1/test/files/rar202-comment-nopsw.rar.exp b/libs/rarfile1/test/files/rar202-comment-nopsw.rar.exp new file mode 100644 index 00000000..b20cb577 --- /dev/null +++ 
b/libs/rarfile1/test/files/rar202-comment-nopsw.rar.exp @@ -0,0 +1,14 @@ +Archive: files/rar202-comment-nopsw.rar + comment='RARcomment' +FILE: hdrlen=66 datlen=7 hdr_unknown=25 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=20 mode=0x20 meth=0 cmp=7 dec=7 vol=0 + crc=0x7a197dba (2048490938) time=2010-11-03 00:27:28 + name=FILE1.TXT + comment='file1comment' +FILE: hdrlen=66 datlen=7 hdr_unknown=25 + flags=0x8008:COMMENT,LONG,D64 + os=0:DOS ver=20 mode=0x20 meth=0 cmp=7 dec=7 vol=0 + crc=0x785fc3e3 (2019541987) time=2010-11-03 00:27:34 + name=FILE2.TXT + comment='file2comment' diff --git a/libs/rarfile1/test/files/rar202-comment-psw.rar b/libs/rarfile1/test/files/rar202-comment-psw.rar new file mode 100644 index 00000000..60fb14f4 Binary files /dev/null and b/libs/rarfile1/test/files/rar202-comment-psw.rar differ diff --git a/libs/rarfile1/test/files/rar202-comment-psw.rar.exp b/libs/rarfile1/test/files/rar202-comment-psw.rar.exp new file mode 100644 index 00000000..a54ac4b6 --- /dev/null +++ b/libs/rarfile1/test/files/rar202-comment-psw.rar.exp @@ -0,0 +1,14 @@ +Archive: files/rar202-comment-psw.rar + comment='RARcomment' +FILE: hdrlen=66 datlen=32 hdr_unknown=25 + flags=0x800c:PASSWORD,COMMENT,LONG,D64 + os=0:DOS ver=20 mode=0x20 meth=3 cmp=32 dec=7 vol=0 + crc=0x7a197dba (2048490938) time=2010-11-03 00:27:28 + name=FILE1.TXT + comment='file1comment' +FILE: hdrlen=66 datlen=32 hdr_unknown=25 + flags=0x800c:PASSWORD,COMMENT,LONG,D64 + os=0:DOS ver=20 mode=0x20 meth=3 cmp=32 dec=7 vol=0 + crc=0x785fc3e3 (2019541987) time=2010-11-03 00:27:34 + name=FILE2.TXT + comment='file2comment' diff --git a/libs/rarfile1/test/files/rar3-comment-hpsw.rar b/libs/rarfile1/test/files/rar3-comment-hpsw.rar new file mode 100644 index 00000000..37210ad6 Binary files /dev/null and b/libs/rarfile1/test/files/rar3-comment-hpsw.rar differ diff --git a/libs/rarfile1/test/files/rar3-comment-hpsw.rar.exp b/libs/rarfile1/test/files/rar3-comment-hpsw.rar.exp new file mode 100644 index 
00000000..d861704e --- /dev/null +++ b/libs/rarfile1/test/files/rar3-comment-hpsw.rar.exp @@ -0,0 +1,16 @@ +Archive: files/rar3-comment-hpsw.rar + comment='RARcomment\n' +FILE: hdrlen=51 datlen=16 hdr_unknown=0 + flags=0x9424:PASSWORD,SALT,EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=16 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file1.txt + mtime=2010-11-02 10:03:25 + comment='Comment1v2\n' +FILE: hdrlen=51 datlen=16 hdr_unknown=0 + flags=0x9424:PASSWORD,SALT,EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=16 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file2.txt + mtime=2010-11-02 10:03:25 + comment='Comment2v2\n' diff --git a/libs/rarfile1/test/files/rar3-comment-plain.rar b/libs/rarfile1/test/files/rar3-comment-plain.rar new file mode 100644 index 00000000..29d8cb00 Binary files /dev/null and b/libs/rarfile1/test/files/rar3-comment-plain.rar differ diff --git a/libs/rarfile1/test/files/rar3-comment-plain.rar.exp b/libs/rarfile1/test/files/rar3-comment-plain.rar.exp new file mode 100644 index 00000000..0ad21471 --- /dev/null +++ b/libs/rarfile1/test/files/rar3-comment-plain.rar.exp @@ -0,0 +1,16 @@ +Archive: files/rar3-comment-plain.rar + comment='RARcomment\n' +FILE: hdrlen=43 datlen=8 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=8 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file1.txt + mtime=2010-11-02 10:03:25 + comment='Comment1v2\n' +FILE: hdrlen=43 datlen=8 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=8 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file2.txt + mtime=2010-11-02 10:03:25 + comment='Comment2v2\n' diff --git a/libs/rarfile1/test/files/rar3-comment-psw.rar b/libs/rarfile1/test/files/rar3-comment-psw.rar new file mode 100644 index 00000000..dd1beabf Binary files /dev/null and b/libs/rarfile1/test/files/rar3-comment-psw.rar differ diff --git 
a/libs/rarfile1/test/files/rar3-comment-psw.rar.exp b/libs/rarfile1/test/files/rar3-comment-psw.rar.exp new file mode 100644 index 00000000..a817bda9 --- /dev/null +++ b/libs/rarfile1/test/files/rar3-comment-psw.rar.exp @@ -0,0 +1,16 @@ +Archive: files/rar3-comment-psw.rar + comment='RARcomment\n' +FILE: hdrlen=51 datlen=16 hdr_unknown=0 + flags=0x9424:PASSWORD,SALT,EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=16 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file1.txt + mtime=2010-11-02 10:03:25 + comment='Comment1v2\n' +FILE: hdrlen=51 datlen=16 hdr_unknown=0 + flags=0x9424:PASSWORD,SALT,EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=3 cmp=16 dec=0 vol=0 + crc=0x00000000 (0) time=2010-11-02 10:03:25 + name=file2.txt + mtime=2010-11-02 10:03:25 + comment='Comment2v2\n' diff --git a/libs/rarfile1/test/files/seektest.rar b/libs/rarfile1/test/files/seektest.rar new file mode 100644 index 00000000..b1d72bb7 Binary files /dev/null and b/libs/rarfile1/test/files/seektest.rar differ diff --git a/libs/rarfile1/test/files/seektest.rar.exp b/libs/rarfile1/test/files/seektest.rar.exp new file mode 100644 index 00000000..cb61124a --- /dev/null +++ b/libs/rarfile1/test/files/seektest.rar.exp @@ -0,0 +1,13 @@ +Archive: files/seektest.rar +FILE: hdrlen=44 datlen=90 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=3:UNIX ver=29 mode=0100644 meth=5 cmp=90 dec=2048 vol=0 + crc=0xc5b7e6a2 (3317163682) time=2011-06-12 12:53:33 + name=stest1.txt + mtime=2011-06-12 12:53:33 +FILE: hdrlen=44 datlen=2048 hdr_unknown=0 + flags=0x9020:EXTTIME,LONG,D128 + os=3:UNIX ver=20 mode=0100644 meth=0 cmp=2048 dec=2048 vol=0 + crc=0xc5b7e6a2 (3317163682) time=2011-06-12 12:53:33 + name=stest2.txt + mtime=2011-06-12 12:53:33 diff --git a/libs/rarfile1/test/files/unicode.rar b/libs/rarfile1/test/files/unicode.rar new file mode 100644 index 00000000..7453ac0f Binary files /dev/null and b/libs/rarfile1/test/files/unicode.rar differ diff --git 
a/libs/rarfile1/test/files/unicode.rar.exp b/libs/rarfile1/test/files/unicode.rar.exp new file mode 100644 index 00000000..5044f7b3 --- /dev/null +++ b/libs/rarfile1/test/files/unicode.rar.exp @@ -0,0 +1,11 @@ +Archive: files/unicode.rar +FILE: hdrlen=54 datlen=17 hdr_unknown=0 + flags=0x8080:LONG,D1024 + os=3:UNIX ver=29 mode=0100644 meth=5 cmp=17 dec=2 vol=0 + crc=0x6751fc53 (1733426259) time=2011-07-06 16:48:04 + name=уииоотивл.txt +FILE: hdrlen=52 datlen=13 hdr_unknown=0 + flags=0x8090:SOLID,LONG,D1024 + os=3:UNIX ver=29 mode=0100644 meth=5 cmp=13 dec=2 vol=0 + crc=0x6751fc53 (1733426259) time=2011-07-06 16:48:04 + name=ð€ððð‚.txt diff --git a/libs/rarfile1/test/test1.sh b/libs/rarfile1/test/test1.sh new file mode 100755 index 00000000..5b0f86aa --- /dev/null +++ b/libs/rarfile1/test/test1.sh @@ -0,0 +1,32 @@ +#! /bin/sh + +PYTHONPATH=..:$PYTHONPATH +export PYTHONPATH + +JAVA_OPTIONS="-Dpython.path=`pwd`/.." +export JAVA_OPTIONS + +plist="python2.7 python3.2 python3.3 python3.4 python3.5 python3.6 pypy jython jython2.7" + +rm -f test.diffs + +for py in $plist; do + if which $py > /dev/null; then + for f in files/*.rar; do + printf "%s -> %-30s .. " $py $f + $py ../dumprar.py -t -t -v -ppassword $f > $f.$py + if diff -uw $f.exp $f.$py > /dev/null; then + echo "ok" + else + echo "FAIL" + echo "#### $py ####" >> test.diffs + diff -uw $f.exp $f.$py >> test.diffs + fi + done + echo "" + else + echo $py not available + echo "" + fi +done + diff --git a/libs/rarfile1/test/test2.sh b/libs/rarfile1/test/test2.sh new file mode 100755 index 00000000..328e3ea0 --- /dev/null +++ b/libs/rarfile1/test/test2.sh @@ -0,0 +1,19 @@ +#! /bin/sh + +cp ../rarfile.py . 
+ +#ulimit -n 16 + +plist="python2.7 python3.2 python3.3 python3.4 python3.5 python3.6 pypy jython jython2.7" + +for py in $plist; do + if which $py > /dev/null; then + echo "== $py ==" + $py ./testseek.py + $py ./testio.py + $py ./testcorrupt.py --quick + fi +done + +rm -f rarfile.py + diff --git a/libs/rarfile1/test/testcorrupt.py b/libs/rarfile1/test/testcorrupt.py new file mode 100755 index 00000000..91fc3d80 --- /dev/null +++ b/libs/rarfile1/test/testcorrupt.py @@ -0,0 +1,85 @@ +#! /usr/bin/env python + +import rarfile +import sys, os, time +import tempfile + +def progress(): + sys.stdout.write('.') + sys.stdout.flush() + +def try_read(tmpfn): + #progress() + try: + rf = rarfile.RarFile(tmpfn) + if rf.needs_password(): + rf.setpassword('password') + except rarfile.Error: + return + for fn in rf.namelist(): + try: + data = rf.read(fn) + pass + except rarfile.Error: + pass + +def test_rar(rarfn): + data = open(rarfn, "rb").read() + + fd, tmpfn = tempfile.mkstemp('.rar') + os.close(fd) + + print('testcorrupt 1') + for n in range(len(data)): + bad = data[:n] + f = open(tmpfn, 'wb') + f.write(bad) + f.close() + + try_read(tmpfn) + + print('testcorrupt 2') + crap = rarfile.RAR_ID + for n in range(1, len(data)): + for i in range(len(crap)): + c = crap[i:i+1] + bad = data[:n - 1] + c + data[n:] + f = open(tmpfn, 'wb') + f.write(bad) + f.close() + try_read(tmpfn) + + os.unlink(tmpfn) + +test_rar_list = [ + "files/ctime0.rar", + "files/ctime1.rar", + "files/ctime2.rar", + "files/ctime3.rar", + "files/ctime4.rar", + "files/seektest.rar", + "files/rar15-comment-lock.rar", + "files/rar15-comment.rar", + "files/rar202-comment-nopsw.rar", + "files/rar202-comment-psw.rar", + "files/rar3-comment-hpsw.rar", + "files/rar3-comment-plain.rar", + "files/rar3-comment-psw.rar", + "files/unicode.rar", +] + +def main(): + if sys.argv[-1] == '--quick': + test_rar("files/rar3-comment-plain.rar") + return + for rar in test_rar_list: + print(rar) + test_rar(rar) + +if __name__ == 
'__main__': + try: + main() + except OSError: + print('OSError: pid = %d' % os.getpid()) + time.sleep(80000) + diff --git a/libs/rarfile1/test/testio.py b/libs/rarfile1/test/testio.py new file mode 100755 index 00000000..ee008b95 --- /dev/null +++ b/libs/rarfile1/test/testio.py @@ -0,0 +1,35 @@ +#! /usr/bin/env python + +import rarfile, os, os.path, time, sys + +try: + from io import BufferedReader, TextIOWrapper +except ImportError: + print('no io module') + sys.exit(0) + def BufferedReader(x): return x + def TextIOWrapper(x): return x + +def test_readline(rf, fn): + f = rf.open(fn) + tr = TextIOWrapper(BufferedReader(f)) + while 1: + ln = tr.readline() + if not ln: + break + tr.close() + +def main(): + files = ['stest1.txt', 'stest2.txt'] + arc = 'files/seektest.rar' + + rf = rarfile.RarFile(arc, crc_check=0) + for fn in files: + sys.stdout.write('test/readline: %s .. ' % fn) + sys.stdout.flush() + test_readline(rf, fn) + print('ok') + +if __name__ == '__main__': + main() + diff --git a/libs/rarfile1/test/testseek.py b/libs/rarfile1/test/testseek.py new file mode 100755 index 00000000..e6925ebf --- /dev/null +++ b/libs/rarfile1/test/testseek.py @@ -0,0 +1,103 @@ +#! 
/usr/bin/env python + +import rarfile, os, os.path, time, sys + +def show_fds(): + fdir = "/proc/%d/fd" % os.getpid() + if os.path.isdir(fdir): + os.system('printf "fds = "; ls -l %s | wc -l' % fdir) + +def do_seek(f, pos, lim): + ofs = pos*4 + fsize = lim*4 + + if ofs < 0: + exp = 0 + elif ofs > fsize: + exp = fsize + else: + exp = ofs + + f.seek(ofs) + + got = f.tell() + + if got != exp: + raise Exception('seek failed (got=%d, exp=%d)' % (got, exp)) + ln = f.read(4) + if got == fsize and ln: + raise Exception('unexpected read') + if not ln and got < fsize: + raise Exception('unexpected read failure') + if ln: + spos = int(ln) + if spos*4 != got: + raise Exception('unexpected pos: spos=%d pos=%d' % (spos, pos)) + +def test_seek(rf, fn): + inf = rf.getinfo(fn) + cnt = int(inf.file_size / 4) + f = rf.open(fn) + + do_seek(f, int(cnt/2), cnt) + do_seek(f, 0, cnt) + + for i in range(int(cnt/2)): + do_seek(f, i*2, cnt) + + for i in range(cnt): + do_seek(f, i*2 - int(cnt / 2), cnt) + + for i in range(cnt + 10): + do_seek(f, cnt - i - 5, cnt) + + f.close() + + print('OK') + +def test_arc(arc, desc): + files = ['stest1.txt', 'stest2.txt'] + rf = rarfile.RarFile(arc, crc_check=0) + for fn in files: + sys.stdout.write('%s | test/seek %s .. 
' % (desc, fn)) + sys.stdout.flush() + test_seek(rf, fn) + +def main(): + arc = 'files/seektest.rar' + data = open(arc, 'rb').read() + + # filename + test_arc(arc, "fn") + + # filelike: cStringIO + try: + import cStringIO + test_arc(cStringIO.StringIO(data), "cStringIO") + except ImportError: + pass + + # filelike: io.BytesIO, io.open() + try: + import io + test_arc(io.BytesIO(data), "io.BytesIO") + test_arc(io.open(arc, 'rb'), "io.open") + except ImportError: + pass + + # filelike: StringIO + try: + import StringIO + test_arc(StringIO.StringIO(data), "StringIO") + except ImportError: + pass + + # filelike: file() + test_arc(open(arc, 'rb'), "file") + + time.sleep(1) + show_fds() + +if __name__ == '__main__': + main() + diff --git a/libs/rebulk/__init__.py b/libs/rebulk/__init__.py new file mode 100644 index 00000000..93d5e477 --- /dev/null +++ b/libs/rebulk/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Define simple search patterns in bulk to perform advanced matching on any string. 
+""" +# pylint:disable=import-self +from .rebulk import Rebulk +from .rules import Rule, CustomRule, AppendMatch, RemoveMatch, RenameMatch, AppendTags, RemoveTags +from .processors import ConflictSolver, PrivateRemover, POST_PROCESS, PRE_PROCESS +from .pattern import REGEX_AVAILABLE diff --git a/libs/rebulk/__version__.py b/libs/rebulk/__version__.py new file mode 100644 index 00000000..6b0a83ec --- /dev/null +++ b/libs/rebulk/__version__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Version module +""" +# pragma: no cover +__version__ = '0.7.7.dev0' diff --git a/libs/rebulk/chain.py b/libs/rebulk/chain.py new file mode 100644 index 00000000..7817e8c0 --- /dev/null +++ b/libs/rebulk/chain.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Chain patterns and handle repetiting capture group +""" +# pylint: disable=super-init-not-called +import itertools + +from .loose import call, set_defaults +from .match import Match, Matches +from .pattern import Pattern, filter_match_kwargs +from .remodule import re + + +class _InvalidChainException(Exception): + """ + Internal exception raised when a chain is not valid + """ + pass + + +class Chain(Pattern): + """ + Definition of a pattern chain to search for. + """ + + def __init__(self, rebulk, **kwargs): + call(super(Chain, self).__init__, **kwargs) + self._kwargs = kwargs + self._match_kwargs = filter_match_kwargs(kwargs) + self._defaults = {} + self._regex_defaults = {} + self._string_defaults = {} + self._functional_defaults = {} + self.rebulk = rebulk + self.parts = [] + + def defaults(self, **kwargs): + """ + Define default keyword arguments for all patterns + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._defaults = kwargs + return self + + def regex_defaults(self, **kwargs): + """ + Define default keyword arguments for functional patterns. 
+ :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._regex_defaults = kwargs + return self + + def string_defaults(self, **kwargs): + """ + Define default keyword arguments for string patterns. + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._string_defaults = kwargs + return self + + def functional_defaults(self, **kwargs): + """ + Define default keyword arguments for functional patterns. + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._functional_defaults = kwargs + return self + + def chain(self): + """ + Add patterns chain, using configuration from this chain + + :return: + :rtype: + """ + # pylint: disable=protected-access + chain = self.rebulk.chain(**self._kwargs) + chain._defaults = dict(self._defaults) + chain._regex_defaults = dict(self._regex_defaults) + chain._functional_defaults = dict(self._functional_defaults) + chain._string_defaults = dict(self._string_defaults) + return chain + + def regex(self, *pattern, **kwargs): + """ + Add re pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._kwargs, kwargs) + set_defaults(self._regex_defaults, kwargs) + set_defaults(self._defaults, kwargs) + pattern = self.rebulk.build_re(*pattern, **kwargs) + part = ChainPart(self, pattern) + self.parts.append(part) + return part + + def functional(self, *pattern, **kwargs): + """ + Add functional pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._kwargs, kwargs) + set_defaults(self._functional_defaults, kwargs) + set_defaults(self._defaults, kwargs) + pattern = self.rebulk.build_functional(*pattern, **kwargs) + part = ChainPart(self, pattern) + self.parts.append(part) + return part + + def string(self, *pattern, **kwargs): + """ + Add string pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._kwargs, 
kwargs) + set_defaults(self._functional_defaults, kwargs) + set_defaults(self._defaults, kwargs) + pattern = self.rebulk.build_string(*pattern, **kwargs) + part = ChainPart(self, pattern) + self.parts.append(part) + return part + + def close(self): + """ + Close chain builder to continue registering other pattern + + :return: + :rtype: + """ + return self.rebulk + + def _match(self, pattern, input_string, context=None): + chain_matches = [] + chain_input_string = input_string + offset = 0 + while offset < len(input_string): + current_chain_matches = [] + valid_chain = True + is_chain_start = True + for chain_part in self.parts: + try: + chain_part_matches, raw_chain_part_matches = Chain._match_chain_part(is_chain_start, chain_part, + chain_input_string, + context) + if raw_chain_part_matches: + Chain._fix_matches_offset(raw_chain_part_matches, input_string, offset) + offset = raw_chain_part_matches[-1].raw_end + chain_input_string = input_string[offset:] + if not chain_part.is_hidden: + current_chain_matches.extend(chain_part_matches) + except _InvalidChainException: + valid_chain = False + if current_chain_matches: + offset = current_chain_matches[0].raw_end + break + is_chain_start = False + if not current_chain_matches: + break + if valid_chain: + match = self._build_chain_match(current_chain_matches, input_string) + chain_matches.append(match) + + return chain_matches + + def _match_parent(self, match, yield_parent): + """ + Handle a parent match + :param match: + :type match: + :param yield_parent: + :type yield_parent: + :return: + :rtype: + """ + ret = super(Chain, self)._match_parent(match, yield_parent) + original_children = Matches(match.children) + original_end = match.end + while not ret and match.children: + last_pattern = match.children[-1].pattern + last_pattern_children = [child for child in match.children if child.pattern == last_pattern] + last_pattern_groups_iter = itertools.groupby(last_pattern_children, lambda child: child.match_index) + 
last_pattern_groups = {} + for index, matches in last_pattern_groups_iter: + last_pattern_groups[index] = list(matches) + + for index in reversed(list(last_pattern_groups)): + last_matches = list(last_pattern_groups[index]) + for last_match in last_matches: + match.children.remove(last_match) + match.end = match.children[-1].end if match.children else match.start + ret = super(Chain, self)._match_parent(match, yield_parent) + if ret: + return True + match.children = original_children + match.end = original_end + return ret + + def _build_chain_match(self, current_chain_matches, input_string): + start = None + end = None + for match in current_chain_matches: + if start is None or start > match.start: + start = match.start + if end is None or end < match.end: + end = match.end + match = call(Match, start, end, pattern=self, input_string=input_string, **self._match_kwargs) + for chain_match in current_chain_matches: + if chain_match.children: + for child in chain_match.children: + match.children.append(child) + if chain_match not in match.children: + match.children.append(chain_match) + chain_match.parent = match + return match + + @staticmethod + def _fix_matches_offset(chain_part_matches, input_string, offset): + for chain_part_match in chain_part_matches: + if chain_part_match.input_string != input_string: + chain_part_match.input_string = input_string + chain_part_match.end += offset + chain_part_match.start += offset + if chain_part_match.children: + Chain._fix_matches_offset(chain_part_match.children, input_string, offset) + + @staticmethod + def _match_chain_part(is_chain_start, chain_part, chain_input_string, context): + chain_part_matches, raw_chain_part_matches = chain_part.pattern.matches(chain_input_string, context, + with_raw_matches=True) + chain_part_matches = Chain._truncate_chain_part_matches(is_chain_start, chain_part_matches, chain_part, + chain_input_string) + raw_chain_part_matches = Chain._truncate_chain_part_matches(is_chain_start, 
raw_chain_part_matches, chain_part, + chain_input_string) + + Chain._validate_chain_part_matches(raw_chain_part_matches, chain_part) + return chain_part_matches, raw_chain_part_matches + + @staticmethod + def _truncate_chain_part_matches(is_chain_start, chain_part_matches, chain_part, chain_input_string): + if not chain_part_matches: + return chain_part_matches + + if not is_chain_start: + separator = chain_input_string[0:chain_part_matches[0].initiator.raw_start] + if len(separator) > 0: + return [] + + j = 1 + for i in range(0, len(chain_part_matches) - 1): + separator = chain_input_string[chain_part_matches[i].initiator.raw_end: + chain_part_matches[i + 1].initiator.raw_start] + if len(separator) > 0: + break + j += 1 + truncated = chain_part_matches[:j] + if chain_part.repeater_end is not None: + truncated = [m for m in truncated if m.match_index < chain_part.repeater_end] + return truncated + + @staticmethod + def _validate_chain_part_matches(chain_part_matches, chain_part): + max_match_index = -1 + if chain_part_matches: + max_match_index = max([m.match_index for m in chain_part_matches]) + if max_match_index + 1 < chain_part.repeater_start: + raise _InvalidChainException + + @property + def match_options(self): + return {} + + @property + def patterns(self): + return [self] + + def __repr__(self): + defined = "" + if self.defined_at: + defined = "@%s" % (self.defined_at,) + return "<%s%s:%s>" % (self.__class__.__name__, defined, self.parts) + + +class ChainPart(object): + """ + Part of a pattern chain. 
+ """ + + def __init__(self, chain, pattern): + self._chain = chain + self.pattern = pattern + self.repeater_start = 1 + self.repeater_end = 1 + self._hidden = False + + def chain(self): + """ + Add patterns chain, using configuration from this chain + + :return: + :rtype: + """ + return self._chain.chain() + + def hidden(self, hidden=True): + """ + Hide chain part results from global chain result + + :param hidden: + :type hidden: + :return: + :rtype: + """ + self._hidden = hidden + return self + + @property + def is_hidden(self): + """ + Check if the chain part is hidden + :return: + :rtype: + """ + return self._hidden + + def regex(self, *pattern, **kwargs): + """ + Add re pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + return self._chain.regex(*pattern, **kwargs) + + def functional(self, *pattern, **kwargs): + """ + Add functional pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + return self._chain.functional(*pattern, **kwargs) + + def string(self, *pattern, **kwargs): + """ + Add string pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + return self._chain.string(*pattern, **kwargs) + + def close(self): + """ + Close the chain builder to continue registering other patterns + + :return: + :rtype: + """ + return self._chain.close() + + def repeater(self, value): + """ + Define the repeater of the current chain part. 
+ + :param value: + :type value: + :return: + :rtype: + """ + try: + value = int(value) + self.repeater_start = value + self.repeater_end = value + return self + except ValueError: + pass + if value == '+': + self.repeater_start = 1 + self.repeater_end = None + if value == '*': + self.repeater_start = 0 + self.repeater_end = None + elif value == '?': + self.repeater_start = 0 + self.repeater_end = 1 + else: + match = re.match(r'\{\s*(\d*)\s*,?\s*(\d*)\s*\}', value) + if match: + start = match.group(1) + end = match.group(2) + if start or end: + self.repeater_start = int(start) if start else 0 + self.repeater_end = int(end) if end else None + return self + + def __repr__(self): + return "%s({%s,%s})" % (self.pattern, self.repeater_start, self.repeater_end) diff --git a/libs/rebulk/debug.py b/libs/rebulk/debug.py new file mode 100644 index 00000000..2384b26e --- /dev/null +++ b/libs/rebulk/debug.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Debug tools. + +Can be configured by changing values of those variable. + +DEBUG = False +Enable this variable to activate debug features (like defined_at parameters). It can slow down Rebulk + +LOG_LEVEL = 0 +Default log level of generated rebulk logs. +""" + +import inspect +import logging +import os +from collections import namedtuple + + +DEBUG = False +LOG_LEVEL = logging.DEBUG + + +class Frame(namedtuple('Frame', ['lineno', 'package', 'name', 'filename'])): + """ + Stack frame representation. + """ + __slots__ = () + + def __repr__(self): + return "%s#L%s" % (os.path.basename(self.filename), self.lineno) + + +def defined_at(): + """ + Get definition location of a pattern or a match (outside of rebulk package). + :return: + :rtype: + """ + if DEBUG: + frame = inspect.currentframe() + while frame: + try: + if frame.f_globals['__package__'] != __package__: + break + except KeyError: # pragma:no cover + # If package is missing, consider we are in. Workaround for python 3.3. 
+ break + frame = frame.f_back + ret = Frame(frame.f_lineno, + frame.f_globals.get('__package__'), + frame.f_globals.get('__name__'), + frame.f_code.co_filename) + del frame + return ret diff --git a/libs/rebulk/formatters.py b/libs/rebulk/formatters.py new file mode 100644 index 00000000..47046942 --- /dev/null +++ b/libs/rebulk/formatters.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Formatter functions to use in patterns. + +All those function have last argument as match.value (str). +""" + + +def formatters(*chained_formatters): + """ + Chain formatter functions. + :param chained_formatters: + :type chained_formatters: + :return: + :rtype: + """ + def formatters_chain(input_string): # pylint:disable=missing-docstring + for chained_formatter in chained_formatters: + input_string = chained_formatter(input_string) + return input_string + + return formatters_chain diff --git a/libs/rebulk/introspector.py b/libs/rebulk/introspector.py new file mode 100644 index 00000000..64b9836f --- /dev/null +++ b/libs/rebulk/introspector.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Introspect rebulk object to retrieve capabilities. +""" +from abc import ABCMeta, abstractproperty +from collections import defaultdict + +import six +from .pattern import StringPattern, RePattern, FunctionalPattern +from .utils import extend_safe + + +@six.add_metaclass(ABCMeta) +class Description(object): + """ + Abstract class for a description. + """ + @abstractproperty + def properties(self): # pragma: no cover + """ + Properties of described object. + :return: all properties that described object can generate grouped by name. + :rtype: dict + """ + pass + + +class PatternDescription(Description): + """ + Description of a pattern. 
+ """ + def __init__(self, pattern): # pylint:disable=too-many-branches + self.pattern = pattern + self._properties = defaultdict(list) + + if pattern.properties: + for key, values in pattern.properties.items(): + extend_safe(self._properties[key], values) + elif 'value' in pattern.match_options: + self._properties[pattern.name].append(pattern.match_options['value']) + elif isinstance(pattern, StringPattern): + extend_safe(self._properties[pattern.name], pattern.patterns) + elif isinstance(pattern, RePattern): + if pattern.name and pattern.name not in pattern.private_names: + extend_safe(self._properties[pattern.name], [None]) + if not pattern.private_children: + for regex_pattern in pattern.patterns: + for group_name, values in regex_pattern.groupindex.items(): + if group_name not in pattern.private_names: + extend_safe(self._properties[group_name], [None]) + elif isinstance(pattern, FunctionalPattern): + if pattern.name and pattern.name not in pattern.private_names: + extend_safe(self._properties[pattern.name], [None]) + + + @property + def properties(self): + """ + Properties for this rule. + :return: + :rtype: dict + """ + return self._properties + + +class RuleDescription(Description): + """ + Description of a rule. + """ + def __init__(self, rule): + self.rule = rule + + self._properties = defaultdict(list) + + if rule.properties: + for key, values in rule.properties.items(): + extend_safe(self._properties[key], values) + + @property + def properties(self): + """ + Properties for this rule. + :return: + :rtype: dict + """ + return self._properties + + +class Introspection(Description): + """ + Introspection results. 
+ """ + def __init__(self, rebulk, context=None): + self.patterns = [PatternDescription(pattern) for pattern in rebulk.effective_patterns(context) + if not pattern.private and not pattern.marker] + self.rules = [RuleDescription(rule) for rule in rebulk.effective_rules(context)] + + @property + def properties(self): + """ + Properties for Introspection results. + :return: + :rtype: + """ + properties = defaultdict(list) + for pattern in self.patterns: + for key, values in pattern.properties.items(): + extend_safe(properties[key], values) + for rule in self.rules: + for key, values in rule.properties.items(): + extend_safe(properties[key], values) + return properties + + +def introspect(rebulk, context=None): + """ + Introspect a Rebulk instance to grab defined objects and properties that can be generated. + :param rebulk: + :type rebulk: Rebulk + :param context: + :type context: + :return: Introspection instance + :rtype: Introspection + """ + return Introspection(rebulk, context) diff --git a/libs/rebulk/loose.py b/libs/rebulk/loose.py new file mode 100644 index 00000000..72543b1e --- /dev/null +++ b/libs/rebulk/loose.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Various utilities functions +""" +import inspect +import sys +from .utils import is_iterable + +if sys.version_info < (3, 4, 0): # pragma: no cover + def _constructor(class_): + """ + Retrieves constructor from given class + + :param class_: + :type class_: class + :return: constructor from given class + :rtype: callable + """ + return class_.__init__ +else: # pragma: no cover + def _constructor(class_): + """ + Retrieves constructor from given class + + :param class_: + :type class_: class + :return: constructor from given class + :rtype: callable + """ + return class_ + + +def call(function, *args, **kwargs): + """ + Call a function or constructor with given args and kwargs after removing args and kwargs that doesn't match + function or constructor signature + + :param 
function: Function or constructor to call + :type function: callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: sale vakye as default function call + :rtype: object + """ + func = constructor_args if inspect.isclass(function) else function_args + call_args, call_kwargs = func(function, *args, **kwargs) + return function(*call_args, **call_kwargs) + + +def function_args(callable_, *args, **kwargs): + """ + Return (args, kwargs) matching the function signature + + :param callable: callable to inspect + :type callable: callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + argspec = inspect.getargspec(callable_) # pylint:disable=deprecated-method + return argspec_args(argspec, False, *args, **kwargs) + + +def constructor_args(class_, *args, **kwargs): + """ + Return (args, kwargs) matching the function signature + + :param callable: callable to inspect + :type callable: Callable + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + argspec = inspect.getargspec(_constructor(class_)) # pylint:disable=deprecated-method + return argspec_args(argspec, True, *args, **kwargs) + + +def argspec_args(argspec, constructor, *args, **kwargs): + """ + Return (args, kwargs) matching the argspec object + + :param argspec: argspec to use + :type argspec: argspec + :param constructor: is it a constructor ? 
+ :type constructor: bool + :param args: + :type args: + :param kwargs: + :type kwargs: + :return: (args, kwargs) matching the function signature + :rtype: tuple + """ + if argspec.keywords: + call_kwarg = kwargs + else: + call_kwarg = dict((k, kwargs[k]) for k in kwargs if k in argspec.args) # Python 2.6 dict comprehension + if argspec.varargs: + call_args = args + else: + call_args = args[:len(argspec.args) - (1 if constructor else 0)] + return call_args, call_kwarg + + +def ensure_list(param): + """ + Retrieves a list from given parameter. + + :param param: + :type param: + :return: + :rtype: + """ + if not param: + param = [] + elif not is_iterable(param): + param = [param] + return param + + +def ensure_dict(param, default_value, default_key=None): + """ + Retrieves a dict and a default value from given parameter. + + if parameter is not a dict, it will be promoted as the default value. + + :param param: + :type param: + :param default_value: + :type default_value: + :param default_key: + :type default_key: + :return: + :rtype: + """ + if not param: + param = default_value + if not isinstance(param, dict): + if param: + default_value = param + return {default_key: param}, default_value + return param, default_value + + +def filter_index(collection, predicate=None, index=None): + """ + Filter collection with predicate function and index. + + If index is not found, returns None. 
+ :param collection: + :type collection: collection supporting iteration and slicing + :param predicate: function to filter the collection with + :type predicate: function + :param index: position of a single element to retrieve + :type index: int + :return: filtered list, or single element of filtered list if index is defined + :rtype: list or object + """ + if index is None and isinstance(predicate, int): + index = predicate + predicate = None + if predicate: + collection = collection.__class__(filter(predicate, collection)) + if index is not None: + try: + collection = collection[index] + except IndexError: + collection = None + return collection + + +def set_defaults(defaults, kwargs): + """ + Set defaults from defaults dict to kwargs dict + :param defaults: + :type defaults: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + for key, value in defaults.items(): + if key not in kwargs and value is not None: + kwargs[key] = value + elif isinstance(value, list) and isinstance(kwargs[key], list): + kwargs[key] = list(value) + kwargs[key] + elif isinstance(value, dict) and isinstance(kwargs[key], dict): + set_defaults(value, kwargs[key]) + elif key in kwargs and value is None: + kwargs[key] = None diff --git a/libs/rebulk/match.py b/libs/rebulk/match.py new file mode 100644 index 00000000..909c9fd6 --- /dev/null +++ b/libs/rebulk/match.py @@ -0,0 +1,784 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Classes and functions related to matches +""" +from collections import defaultdict, MutableSequence +import copy +try: + from collections import OrderedDict # pylint:disable=ungrouped-imports +except ImportError: # pragma: no cover + from ordereddict import OrderedDict # pylint:disable=import-error +import six + +from .loose import ensure_list, filter_index +from .utils import is_iterable +from .debug import defined_at + + +class MatchesDict(OrderedDict): + """ + A custom dict with matches property. 
+ """ + def __init__(self): + super(MatchesDict, self).__init__() + self.matches = defaultdict(list) + self.values_list = defaultdict(list) + + +class _BaseMatches(MutableSequence): + """ + A custom list[Match] that automatically maintains name, tag, start and end lookup structures. + """ + _base = list + _base_add = _base.append + _base_remove = _base.remove + + def __init__(self, matches=None, input_string=None): + self.input_string = input_string + self._max_end = 0 + self._delegate = [] + self._name_dict = defaultdict(_BaseMatches._base) + self._tag_dict = defaultdict(_BaseMatches._base) + self._start_dict = defaultdict(_BaseMatches._base) + self._end_dict = defaultdict(_BaseMatches._base) + self._index_dict = defaultdict(_BaseMatches._base) + if matches: + self.extend(matches) + + def _add_match(self, match): + """ + Add a match + :param match: + :type match: Match + """ + if match.name: + _BaseMatches._base_add(self._name_dict[match.name], (match)) + for tag in match.tags: + _BaseMatches._base_add(self._tag_dict[tag], match) + _BaseMatches._base_add(self._start_dict[match.start], match) + _BaseMatches._base_add(self._end_dict[match.end], match) + for index in range(*match.span): + _BaseMatches._base_add(self._index_dict[index], match) + if match.end > self._max_end: + self._max_end = match.end + + def _remove_match(self, match): + """ + Remove a match + :param match: + :type match: Match + """ + if match.name: + _BaseMatches._base_remove(self._name_dict[match.name], match) + for tag in match.tags: + _BaseMatches._base_remove(self._tag_dict[tag], match) + _BaseMatches._base_remove(self._start_dict[match.start], match) + _BaseMatches._base_remove(self._end_dict[match.end], match) + for index in range(*match.span): + _BaseMatches._base_remove(self._index_dict[index], match) + if match.end >= self._max_end and not self._end_dict[match.end]: + self._max_end = max(self._end_dict.keys()) + + def previous(self, match, predicate=None, index=None): + """ + Retrieves 
the nearest previous matches. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: int + :return: + :rtype: + """ + current = match.start + while current > -1: + previous_matches = self.ending(current) + if previous_matches: + return filter_index(previous_matches, predicate, index) + current -= 1 + return filter_index(_BaseMatches._base(), predicate, index) + + def next(self, match, predicate=None, index=None): + """ + Retrieves the nearest next matches. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: int + :return: + :rtype: + """ + current = match.start + 1 + while current <= self._max_end: + next_matches = self.starting(current) + if next_matches: + return filter_index(next_matches, predicate, index) + current += 1 + return filter_index(_BaseMatches._base(), predicate, index) + + def named(self, name, predicate=None, index=None): + """ + Retrieves a set of Match objects that have the given name. + :param name: + :type name: str + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._name_dict[name]), predicate, index) + + def tagged(self, tag, predicate=None, index=None): + """ + Retrieves a set of Match objects that have the given tag defined. + :param tag: + :type tag: str + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._tag_dict[tag]), predicate, index) + + def starting(self, start, predicate=None, index=None): + """ + Retrieves a set of Match objects that starts at given index. 
+ :param start: the starting index + :type start: int + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._start_dict[start]), predicate, index) + + def ending(self, end, predicate=None, index=None): + """ + Retrieves a set of Match objects that ends at given index. + :param end: the ending index + :type end: int + :param predicate: + :type predicate: + :return: set of matches + :rtype: set[Match] + """ + return filter_index(_BaseMatches._base(self._end_dict[end]), predicate, index) + + def range(self, start=0, end=None, predicate=None, index=None): + """ + Retrieves a set of Match objects that are available in given range, sorted from start to end. + :param start: the starting index + :type start: int + :param end: the ending index + :type end: int + :param predicate: + :type predicate: + :param index: + :type index: int + :return: set of matches + :rtype: set[Match] + """ + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + ret = _BaseMatches._base() + for match in sorted(self): + if match.start < end and match.end > start: + ret.append(match) + return filter_index(ret, predicate, index) + + def chain_before(self, position, seps, start=0, predicate=None, index=None): + """ + Retrieves a list of chained matches, before position, matching predicate and separated by characters from seps + only. 
+ :param position: + :type position: + :param seps: + :type seps: + :param start: + :type start: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + if hasattr(position, 'start'): + position = position.start + + chain = _BaseMatches._base() + position = min(self.max_end, position) + + for i in reversed(range(start, position)): + index_matches = self.at_index(i) + filtered_matches = [index_match for index_match in index_matches if not predicate or predicate(index_match)] + if filtered_matches: + for chain_match in filtered_matches: + if chain_match not in chain: + chain.append(chain_match) + elif self.input_string[i] not in seps: + break + + return filter_index(chain, predicate, index) + + def chain_after(self, position, seps, end=None, predicate=None, index=None): + """ + Retrieves a list of chained matches, after position, matching predicate and separated by characters from seps + only. + :param position: + :type position: + :param seps: + :type seps: + :param end: + :type end: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + if hasattr(position, 'end'): + position = position.end + chain = _BaseMatches._base() + + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + + for i in range(position, end): + index_matches = self.at_index(i) + filtered_matches = [index_match for index_match in index_matches if not predicate or predicate(index_match)] + if filtered_matches: + for chain_match in filtered_matches: + if chain_match not in chain: + chain.append(chain_match) + elif self.input_string[i] not in seps: + break + + return filter_index(chain, predicate, index) + + @property + def max_end(self): + """ + Retrieves the maximum index. + :return: + """ + return max(len(self.input_string), self._max_end) if self.input_string else self._max_end + + def _hole_start(self, position, ignore=None): + """ + Retrieves the start of hole index from position. 
+ :param position: + :type position: + :param ignore: + :type ignore: + :return: + :rtype: + """ + for lindex in reversed(range(0, position)): + for starting in self.starting(lindex): + if not ignore or not ignore(starting): + return lindex + return 0 + + def _hole_end(self, position, ignore=None): + """ + Retrieves the end of hole index from position. + :param position: + :type position: + :param ignore: + :type ignore: + :return: + :rtype: + """ + for rindex in range(position, self.max_end): + for starting in self.starting(rindex): + if not ignore or not ignore(starting): + return rindex + return self.max_end + + def holes(self, start=0, end=None, formatter=None, ignore=None, seps=None, predicate=None, index=None): # pylint: disable=too-many-branches,too-many-locals + """ + Retrieves a set of Match objects that are not defined in given range. + :param start: + :type start: + :param end: + :type end: + :param formatter: + :type formatter: + :param ignore: + :type ignore: + :param seps: + :type seps: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + assert self.input_string if seps else True, "input_string must be defined when using seps parameter" + if end is None: + end = self.max_end + else: + end = min(self.max_end, end) + ret = _BaseMatches._base() + hole = False + rindex = start + + loop_start = self._hole_start(start, ignore) + + for rindex in range(loop_start, end): + current = [] + for at_index in self.at_index(rindex): + if not ignore or not ignore(at_index): + current.append(at_index) + + if seps and hole and self.input_string and self.input_string[rindex] in seps: + hole = False + ret[-1].end = rindex + else: + if not current and not hole: + # Open a new hole match + hole = True + ret.append(Match(max(rindex, start), None, input_string=self.input_string, formatter=formatter)) + elif current and hole: + # Close current hole match + hole = False + ret[-1].end = rindex + + if ret and hole: + # go the the 
next starting element ... + ret[-1].end = min(self._hole_end(rindex, ignore), end) + return filter_index(ret, predicate, index) + + def conflicting(self, match, predicate=None, index=None): + """ + Retrieves a list of ``Match`` objects that conflicts with given match. + :param match: + :type match: + :param predicate: + :type predicate: + :param index: + :type index: + :return: + :rtype: + """ + ret = _BaseMatches._base() + + for i in range(*match.span): + for at_match in self.at_index(i): + if at_match not in ret: + ret.append(at_match) + + ret.remove(match) + + return filter_index(ret, predicate, index) + + def at_match(self, match, predicate=None, index=None): + """ + Retrieves a list of matches from given match. + """ + return self.at_span(match.span, predicate, index) + + def at_span(self, span, predicate=None, index=None): + """ + Retrieves a list of matches from given (start, end) tuple. + """ + starting = self._index_dict[span[0]] + ending = self._index_dict[span[1] - 1] + + merged = list(starting) + for marker in ending: + if marker not in merged: + merged.append(marker) + + return filter_index(merged, predicate, index) + + def at_index(self, pos, predicate=None, index=None): + """ + Retrieves a list of matches from given position + """ + return filter_index(self._index_dict[pos], predicate, index) + + @property + def names(self): + """ + Retrieve all names. + :return: + """ + return self._name_dict.keys() + + @property + def tags(self): + """ + Retrieve all tags. + :return: + """ + return self._tag_dict.keys() + + def to_dict(self, details=False, implicit=False): + """ + Converts matches to a dict object. + :param details if True, values will be complete Match object, else it will be only string Match.value property + :type details: bool + :param implicit if True, multiple values will be set as a list in the dict. Else, only the first value + will be kept. 
+ :type implicit: bool + :return: + :rtype: dict + """ + ret = MatchesDict() + for match in sorted(self): + value = match if details else match.value + ret.matches[match.name].append(match) + if value not in ret.values_list[match.name]: + ret.values_list[match.name].append(value) + if match.name in ret.keys(): + if implicit: + if not isinstance(ret[match.name], list): + if ret[match.name] == value: + continue + ret[match.name] = [ret[match.name]] + else: + if value in ret[match.name]: + continue + ret[match.name].append(value) + else: + ret[match.name] = value + return ret + + if six.PY2: # pragma: no cover + def clear(self): + """ + Python 3 backport + """ + del self[:] + + def __len__(self): + return len(self._delegate) + + def __getitem__(self, index): + ret = self._delegate[index] + if isinstance(ret, list): + return Matches(ret) + return ret + + def __setitem__(self, index, match): + self._delegate[index] = match + if isinstance(index, slice): + for match_item in match: + self._add_match(match_item) + return + self._add_match(match) + + def __delitem__(self, index): + match = self._delegate[index] + del self._delegate[index] + if isinstance(match, list): + # if index is a slice, we has a match list + for match_item in match: + self._remove_match(match_item) + else: + self._remove_match(match) + + def __repr__(self): + return self._delegate.__repr__() + + def insert(self, index, match): + self._delegate.insert(index, match) + self._add_match(match) + + +class Matches(_BaseMatches): + """ + A custom list[Match] contains matches list. + """ + def __init__(self, matches=None, input_string=None): + self.markers = Markers(input_string=input_string) + super(Matches, self).__init__(matches=matches, input_string=input_string) + + def _add_match(self, match): + assert not match.marker, "A marker match should not be added to object" + super(Matches, self)._add_match(match) + + +class Markers(_BaseMatches): + """ + A custom list[Match] containing markers list. 
+ """ + def __init__(self, matches=None, input_string=None): + super(Markers, self).__init__(matches=None, input_string=input_string) + + def _add_match(self, match): + assert match.marker, "A non-marker match should not be added to object" + super(Markers, self)._add_match(match) + + +class Match(object): + """ + Object storing values related to a single match + """ + def __init__(self, start, end, value=None, name=None, tags=None, marker=None, parent=None, private=None, + pattern=None, input_string=None, formatter=None, conflict_solver=None): + self.start = start + self.end = end + self.name = name + self._value = value + self.tags = ensure_list(tags) + self.marker = marker + self.parent = parent + self.input_string = input_string + self.formatter = formatter + self.pattern = pattern + self.private = private + self.conflict_solver = conflict_solver + self.children = Matches([], input_string) + self._raw_start = None + self._raw_end = None + self.defined_at = pattern.defined_at if pattern else defined_at() + + @property + def span(self): + """ + 2-tuple with start and end indices of the match + """ + return self.start, self.end + + @property + def value(self): + """ + Get the value of the match, using formatter if defined. 
+ :return: + :rtype: + """ + if self._value: + return self._value + if self.formatter: + return self.formatter(self.raw) + return self.raw + + @value.setter + def value(self, value): + """ + Set the value (hardcode) + :param value: + :type value: + :return: + :rtype: + """ + self._value = value # pylint: disable=attribute-defined-outside-init + + @property + def names(self): + """ + Get all names of children + :return: + :rtype: + """ + if not self.children: + return set([self.name]) + else: + ret = set() + for child in self.children: + for name in child.names: + ret.add(name) + return ret + + @property + def raw_start(self): + """ + start index of raw value + :return: + :rtype: + """ + if self._raw_start is None: + return self.start + return self._raw_start + + @raw_start.setter + def raw_start(self, value): + """ + Set start index of raw value + :return: + :rtype: + """ + self._raw_start = value + + @property + def raw_end(self): + """ + end index of raw value + :return: + :rtype: + """ + if self._raw_end is None: + return self.end + return self._raw_end + + @raw_end.setter + def raw_end(self, value): + """ + Set end index of raw value + :return: + :rtype: + """ + self._raw_end = value + + @property + def raw(self): + """ + Get the raw value of the match, without using hardcoded value nor formatter. 
+ :return: + :rtype: + """ + if self.input_string: + return self.input_string[self.raw_start:self.raw_end] + return None + + @property + def initiator(self): + """ + Retrieve the initiator parent of a match + :param match: + :type match: + :return: + :rtype: + """ + match = self + while match.parent: + match = match.parent + return match + + def crop(self, crops, predicate=None, index=None): + """ + crop the match with given Match objects or spans tuples + :param crops: + :type crops: list or object + :return: a list of Match objects + :rtype: list[Match] + """ + if not is_iterable(crops) or len(crops) == 2 and isinstance(crops[0], int): + crops = [crops] + initial = copy.deepcopy(self) + ret = [initial] + for crop in crops: + if hasattr(crop, 'span'): + start, end = crop.span + else: + start, end = crop + for current in list(ret): + if start <= current.start and end >= current.end: + # self is included in crop, remove current ... + ret.remove(current) + elif start >= current.start and end <= current.end: + # crop is included in self, split current ... + right = copy.deepcopy(current) + current.end = start + if len(current) <= 0: + ret.remove(current) + right.start = end + if len(right) > 0: + ret.append(right) + elif end <= current.end and end > current.start: + current.start = end + elif start >= current.start and start < current.end: + current.end = start + return filter_index(ret, predicate, index) + + def split(self, seps, predicate=None, index=None): + """ + Split this match in multiple matches using given separators. 
+ :param seps: + :type seps: string containing separator characters + :return: list of new Match objects + :rtype: list + """ + split_match = copy.deepcopy(self) + current_match = split_match + ret = [] + + for i in range(0, len(self.raw)): + if self.raw[i] in seps: + if not split_match: + split_match = copy.deepcopy(current_match) + current_match.end = self.start + i + + else: + if split_match: + split_match.start = self.start + i + current_match = split_match + ret.append(split_match) + split_match = None + + return filter_index(ret, predicate, index) + + def __len__(self): + return self.end - self.start + + def __hash__(self): + return hash(Match) + hash(self.start) + hash(self.end) + hash(self.value) + + def __eq__(self, other): + if isinstance(other, Match): + return self.span == other.span and self.value == other.value and self.name == other.name and \ + self.parent == other.parent + return NotImplemented + + def __ne__(self, other): + if isinstance(other, Match): + return self.span != other.span or self.value != other.value or self.name != other.name or \ + self.parent != other.parent + return NotImplemented + + def __lt__(self, other): + if isinstance(other, Match): + return self.span < other.span + return NotImplemented + + def __gt__(self, other): + if isinstance(other, Match): + return self.span > other.span + return NotImplemented + + def __le__(self, other): + if isinstance(other, Match): + return self.span <= other.span + return NotImplemented + + def __ge__(self, other): + if isinstance(other, Match): + return self.span >= other.span + return NotImplemented + + def __repr__(self): + flags = "" + name = "" + tags = "" + defined = "" + initiator = "" + if self.initiator.value != self.value: + initiator = "+initiator=" + self.initiator.value + if self.private: + flags += '+private' + if self.name: + name = "+name=%s" % (self.name,) + if self.tags: + tags = "+tags=%s" % (self.tags,) + if self.defined_at: + defined += "@%s" % (self.defined_at,) + return 
"<%s:%s%s%s%s%s%s>" % (self.value, self.span, flags, name, tags, initiator, defined) diff --git a/libs/rebulk/pattern.py b/libs/rebulk/pattern.py new file mode 100644 index 00000000..767767b4 --- /dev/null +++ b/libs/rebulk/pattern.py @@ -0,0 +1,471 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Abstract pattern class definition along with various implementations (regexp, string, functional) +""" +# pylint: disable=super-init-not-called,wrong-import-position + +from abc import ABCMeta, abstractmethod, abstractproperty + +import six + +from . import debug +from .loose import call, ensure_list, ensure_dict +from .match import Match +from .remodule import re, REGEX_AVAILABLE +from .utils import find_all, is_iterable, get_first_defined + + +@six.add_metaclass(ABCMeta) +class Pattern(object): + """ + Definition of a particular pattern to search for. + """ + + def __init__(self, name=None, tags=None, formatter=None, value=None, validator=None, children=False, every=False, + private_parent=False, private_children=False, private=False, private_names=None, ignore_names=None, + marker=False, format_all=False, validate_all=False, disabled=lambda context: False, log_level=None, + properties=None): + """ + :param name: Name of this pattern + :type name: str + :param tags: List of tags related to this pattern + :type tags: list[str] + :param formatter: dict (name, func) of formatter to use with this pattern. name is the match name to support, + and func a function(input_string) that returns the formatted string. A single formatter function can also be + passed as a shortcut for {None: formatter}. The returned formatted string with be set in Match.value property. + :type formatter: dict[str, func] || func + :param value: dict (name, value) of value to use with this pattern. name is the match name to support, + and value an object for the match value. A single object value can also be + passed as a shortcut for {None: value}. 
The value with be set in Match.value property. + :type value: dict[str, object] || object + :param validator: dict (name, func) of validator to use with this pattern. name is the match name to support, + and func a function(match) that returns the a boolean. A single validator function can also be + passed as a shortcut for {None: validator}. If return value is False, match will be ignored. + :param children: generates children instead of parent + :type children: bool + :param every: generates both parent and children. + :type every: bool + :param private: flag this pattern as beeing private. + :type private: bool + :param private_parent: force return of parent and flag parent matches as private. + :type private_parent: bool + :param private_children: force return of children and flag children matches as private. + :type private_children: bool + :param private_names: force return of named matches as private. + :type private_names: bool + :param ignore_names: drop some named matches after validation. + :type ignore_names: bool + :param marker: flag this pattern as beeing a marker. + :type private: bool + :param format_all if True, pattern will format every match in the hierarchy (even match not yield). + :type format_all: bool + :param validate_all if True, pattern will validate every match in the hierarchy (even match not yield). + :type validate_all: bool + :param disabled: if True, this pattern is disabled. Can also be a function(context). 
+ :type disabled: bool|function + :param log_level: Log level associated to this pattern + :type log_level: int + """ + # pylint:disable=too-many-locals + self.name = name + self.tags = ensure_list(tags) + self.formatters, self._default_formatter = ensure_dict(formatter, lambda x: x) + self.values, self._default_value = ensure_dict(value, None) + self.validators, self._default_validator = ensure_dict(validator, lambda match: True) + self.every = every + self.children = children + self.private = private + self.private_names = private_names if private_names else [] + self.ignore_names = ignore_names if ignore_names else [] + self.private_parent = private_parent + self.private_children = private_children + self.marker = marker + self.format_all = format_all + self.validate_all = validate_all + if not callable(disabled): + self.disabled = lambda context: disabled + else: + self.disabled = disabled + self._log_level = log_level + self._properties = properties + self.defined_at = debug.defined_at() + + @property + def log_level(self): + """ + Log level for this pattern.
+ :return: + :rtype: + """ + return self._log_level if self._log_level is not None else debug.LOG_LEVEL + + def _yield_children(self, match): + """ + Does this match has children + :param match: + :type match: + :return: + :rtype: + """ + return match.children and (self.children or self.every) + + def _yield_parent(self): + """ + Does this mat + :param match: + :type match: + :return: + :rtype: + """ + return not self.children or self.every + + def _match_parent(self, match, yield_parent): + """ + Handle a parent match + :param match: + :type match: + :param yield_parent: + :type yield_parent: + :return: + :rtype: + """ + if len(match) < 0 or match.value == "": + return False + + pattern_value = get_first_defined(self.values, [match.name, '__parent__', None], + self._default_value) + if pattern_value: + match.value = pattern_value + + if yield_parent or self.format_all: + match.formatter = get_first_defined(self.formatters, [match.name, '__parent__', None], + self._default_formatter) + if yield_parent or self.validate_all: + validator = get_first_defined(self.validators, [match.name, '__parent__', None], + self._default_validator) + if validator and not validator(match): + return False + return True + + def _match_child(self, child, yield_children): + """ + Handle a children match + :param child: + :type child: + :param yield_children: + :type yield_children: + :return: + :rtype: + """ + if len(child) < 0 or child.value == "": + return False + + pattern_value = get_first_defined(self.values, [child.name, '__children__', None], + self._default_value) + if pattern_value: + child.value = pattern_value + + if yield_children or self.format_all: + child.formatter = get_first_defined(self.formatters, [child.name, '__children__', None], + self._default_formatter) + + if yield_children or self.validate_all: + validator = get_first_defined(self.validators, [child.name, '__children__', None], + self._default_validator) + if validator and not validator(child): + return False + 
return True + + def matches(self, input_string, context=None, with_raw_matches=False): + """ + Computes all matches for a given input + + :param input_string: the string to parse + :type input_string: str + :param context: the context + :type context: dict + :param with_raw_matches: should return details + :type with_raw_matches: dict + :return: matches based on input_string for this pattern + :rtype: iterator[Match] + """ + # pylint: disable=too-many-branches + + matches = [] + raw_matches = [] + for pattern in self.patterns: + yield_parent = self._yield_parent() + match_index = -1 + for match in self._match(pattern, input_string, context): + match_index += 1 + match.match_index = match_index + raw_matches.append(match) + yield_children = self._yield_children(match) + if not self._match_parent(match, yield_parent): + continue + validated = True + for child in match.children: + if not self._match_child(child, yield_children): + validated = False + break + if validated: + if self.private_parent: + match.private = True + if self.private_children: + for child in match.children: + child.private = True + if yield_parent or self.private_parent: + matches.append(match) + if yield_children or self.private_children: + for child in match.children: + child.match_index = match_index + matches.append(child) + self._matches_privatize(matches) + self._matches_ignore(matches) + if with_raw_matches: + return matches, raw_matches + return matches + + def _matches_privatize(self, matches): + """ + Mark matches included in private_names with private flag. + :param matches: + :type matches: + :return: + :rtype: + """ + if self.private_names: + for match in matches: + if match.name in self.private_names: + match.private = True + + def _matches_ignore(self, matches): + """ + Ignore matches included in ignore_names. 
+ :param matches: + :type matches: + :return: + :rtype: + """ + if self.ignore_names: + for match in list(matches): + if match.name in self.ignore_names: + matches.remove(match) + + @abstractproperty + def patterns(self): # pragma: no cover + """ + List of base patterns defined + + :return: A list of base patterns + :rtype: list + """ + pass + + @property + def properties(self): + """ + Properties names and values that can be retrieved by this pattern. + :return: + :rtype: + """ + if self._properties: + return self._properties + return {} + + @abstractproperty + def match_options(self): # pragma: no cover + """ + dict of default options for generated Match objects + + :return: **options to pass to Match constructor + :rtype: dict + """ + pass + + @abstractmethod + def _match(self, pattern, input_string, context=None): # pragma: no cover + """ + Computes all matches for a given pattern and input + + :param pattern: the pattern to use + :param input_string: the string to parse + :type input_string: str + :param context: the context + :type context: dict + :return: matches based on input_string for this pattern + :rtype: iterator[Match] + """ + pass + + def __repr__(self): + defined = "" + if self.defined_at: + defined = "@%s" % (self.defined_at,) + return "<%s%s:%s>" % (self.__class__.__name__, defined, self.__repr__patterns__) + + @property + def __repr__patterns__(self): + return self.patterns + + +class StringPattern(Pattern): + """ + Definition of one or many strings to search for.
+ """ + + def __init__(self, *patterns, **kwargs): + call(super(StringPattern, self).__init__, **kwargs) + self._patterns = patterns + self._kwargs = kwargs + self._match_kwargs = filter_match_kwargs(kwargs) + + @property + def patterns(self): + return self._patterns + + @property + def match_options(self): + return self._match_kwargs + + def _match(self, pattern, input_string, context=None): + for index in call(find_all, input_string, pattern, **self._kwargs): + yield call(Match, index, index + len(pattern), pattern=self, input_string=input_string, + **self._match_kwargs) + + +class RePattern(Pattern): + """ + Definition of one or many regular expression pattern to search for. + """ + + def __init__(self, *patterns, **kwargs): + call(super(RePattern, self).__init__, **kwargs) + self.repeated_captures = REGEX_AVAILABLE + if 'repeated_captures' in kwargs: + self.repeated_captures = kwargs.get('repeated_captures') + if self.repeated_captures and not REGEX_AVAILABLE: # pragma: no cover + raise NotImplementedError("repeated_capture is available only with regex module.") + self.abbreviations = kwargs.get('abbreviations', []) + self._kwargs = kwargs + self._match_kwargs = filter_match_kwargs(kwargs) + self._children_match_kwargs = filter_match_kwargs(kwargs, children=True) + self._patterns = [] + for pattern in patterns: + if isinstance(pattern, six.string_types): + if self.abbreviations and pattern: + for key, replacement in self.abbreviations: + pattern = pattern.replace(key, replacement) + pattern = call(re.compile, pattern, **self._kwargs) + elif isinstance(pattern, dict): + if self.abbreviations and 'pattern' in pattern: + for key, replacement in self.abbreviations: + pattern['pattern'] = pattern['pattern'].replace(key, replacement) + pattern = re.compile(**pattern) + elif hasattr(pattern, '__iter__'): + pattern = re.compile(*pattern) + self._patterns.append(pattern) + + @property + def patterns(self): + return self._patterns + + @property + def 
__repr__patterns__(self): + return [pattern.pattern for pattern in self.patterns] + + @property + def match_options(self): + return self._match_kwargs + + def _match(self, pattern, input_string, context=None): + names = dict((v, k) for k, v in pattern.groupindex.items()) + for match_object in pattern.finditer(input_string): + start = match_object.start() + end = match_object.end() + main_match = call(Match, start, end, pattern=self, input_string=input_string, **self._match_kwargs) + + if pattern.groups: + for i in range(1, pattern.groups + 1): + name = names.get(i, main_match.name) + if self.repeated_captures: + for start, end in match_object.spans(i): + child_match = call(Match, start, end, name=name, parent=main_match, pattern=self, + input_string=input_string, **self._children_match_kwargs) + main_match.children.append(child_match) + else: + start, end = match_object.span(i) + if start > -1 and end > -1: + child_match = call(Match, start, end, name=name, parent=main_match, pattern=self, + input_string=input_string, **self._children_match_kwargs) + main_match.children.append(child_match) + + yield main_match + + +class FunctionalPattern(Pattern): + """ + Definition of one or many functional pattern to search for. 
+ """ + + def __init__(self, *patterns, **kwargs): + call(super(FunctionalPattern, self).__init__, **kwargs) + self._patterns = patterns + self._kwargs = kwargs + self._match_kwargs = filter_match_kwargs(kwargs) + + @property + def patterns(self): + return self._patterns + + @property + def match_options(self): + return self._match_kwargs + + def _match(self, pattern, input_string, context=None): + ret = call(pattern, input_string, context, **self._kwargs) + if ret: + if not is_iterable(ret) or isinstance(ret, dict) \ + or (is_iterable(ret) and hasattr(ret, '__getitem__') and isinstance(ret[0], int)): + args_iterable = [ret] + else: + args_iterable = ret + for args in args_iterable: + if isinstance(args, dict): + options = args + options.pop('input_string', None) + options.pop('pattern', None) + if self._match_kwargs: + options = self._match_kwargs.copy() + options.update(args) + yield call(Match, pattern=self, input_string=input_string, **options) + else: + kwargs = self._match_kwargs + if isinstance(args[-1], dict): + kwargs = dict(kwargs) + kwargs.update(args[-1]) + args = args[:-1] + yield call(Match, *args, pattern=self, input_string=input_string, **kwargs) + + +def filter_match_kwargs(kwargs, children=False): + """ + Filters out kwargs for Match construction + + :param kwargs: + :type kwargs: dict + :param children: + :type children: Flag to filter children matches + :return: A filtered dict + :rtype: dict + """ + kwargs = kwargs.copy() + for key in ('pattern', 'start', 'end', 'parent', 'formatter', 'value'): + if key in kwargs: + del kwargs[key] + if children: + for key in ('name',): + if key in kwargs: + del kwargs[key] + return kwargs diff --git a/libs/rebulk/processors.py b/libs/rebulk/processors.py new file mode 100644 index 00000000..0121c658 --- /dev/null +++ b/libs/rebulk/processors.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Processor functions +""" +from logging import getLogger + +from .utils import IdentitySet + 
+from .rules import Rule, RemoveMatch + +log = getLogger(__name__).log + +DEFAULT = '__default__' + +POST_PROCESS = -2048 +PRE_PROCESS = 2048 + + +def _default_conflict_solver(match, conflicting_match): + """ + Default conflict solver for matches, shorter matches if they conflicts with longer ones + + :param conflicting_match: + :type conflicting_match: + :param match: + :type match: + :return: + :rtype: + """ + if len(conflicting_match.initiator) < len(match.initiator): + return conflicting_match + elif len(match.initiator) < len(conflicting_match.initiator): + return match + return None + + +class ConflictSolver(Rule): + """ + Remove conflicting matches. + """ + priority = PRE_PROCESS + + consequence = RemoveMatch + + @property + def default_conflict_solver(self): # pylint:disable=no-self-use + """ + Default conflict solver to use. + """ + return _default_conflict_solver + + def when(self, matches, context): + to_remove_matches = IdentitySet() + + public_matches = [match for match in matches if not match.private] + public_matches.sort(key=len) + + for match in public_matches: + conflicting_matches = matches.conflicting(match) + + if conflicting_matches: + # keep the match only if it's the longest + conflicting_matches = [conflicting_match for conflicting_match in conflicting_matches if + not conflicting_match.private] + conflicting_matches.sort(key=len) + + for conflicting_match in conflicting_matches: + conflict_solvers = [(self.default_conflict_solver, False)] + + if match.conflict_solver: + conflict_solvers.append((match.conflict_solver, False)) + if conflicting_match.conflict_solver: + conflict_solvers.append((conflicting_match.conflict_solver, True)) + + for conflict_solver, reverse in reversed(conflict_solvers): + if reverse: + to_remove = conflict_solver(conflicting_match, match) + else: + to_remove = conflict_solver(match, conflicting_match) + if to_remove == DEFAULT: + continue + if to_remove and to_remove not in to_remove_matches: + both_matches = 
[match, conflicting_match] + both_matches.remove(to_remove) + to_keep = both_matches[0] + + if to_keep not in to_remove_matches: + log(self.log_level, "Conflicting match %s will be removed in favor of match %s", + to_remove, to_keep) + + to_remove_matches.add(to_remove) + break + return to_remove_matches + + +class PrivateRemover(Rule): + """ + Removes private matches rule. + """ + priority = POST_PROCESS + + consequence = RemoveMatch + + def when(self, matches, context): + return [match for match in matches if match.private] diff --git a/libs/rebulk/rebulk.py b/libs/rebulk/rebulk.py new file mode 100644 index 00000000..9326482b --- /dev/null +++ b/libs/rebulk/rebulk.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Entry point functions and classes for Rebulk +""" +from logging import getLogger + +from .match import Matches + +from .pattern import RePattern, StringPattern, FunctionalPattern +from .chain import Chain + +from .processors import ConflictSolver, PrivateRemover +from .loose import set_defaults +from .utils import extend_safe +from .rules import Rules + +log = getLogger(__name__).log + + +class Rebulk(object): + r""" + Regular expression, string and function based patterns are declared in a ``Rebulk`` object. It use a fluent API to + chain ``string``, ``regex``, and ``functional`` methods to define various patterns types. + + .. code-block:: python + + >>> from rebulk import Rebulk + >>> bulk = Rebulk().string('brown').regex(r'qu\w+').functional(lambda s: (20, 25)) + + When ``Rebulk`` object is fully configured, you can call ``matches`` method with an input string to retrieve all + ``Match`` objects found by registered pattern. + + .. code-block:: python + + >>> bulk.matches("The quick brown fox jumps over the lazy dog") + [, , ] + + If multiple ``Match`` objects are found at the same position, only the longer one is kept. + + .. 
code-block:: python + + >>> bulk = Rebulk().string('lakers').string('la') + >>> bulk.matches("the lakers are from la") + [, ] + """ + # pylint:disable=protected-access + + def __init__(self, disabled=lambda context: False, default_rules=True): + """ + Creates a new Rebulk object. + :param disabled: if True, this pattern is disabled. Can also be a function(context). + :type disabled: bool|function + :param default_rules: use default rules + :type default_rules: + :return: + :rtype: + """ + if not callable(disabled): + self.disabled = lambda context: disabled + else: + self.disabled = disabled + self._patterns = [] + self._rules = Rules() + if default_rules: + self.rules(ConflictSolver, PrivateRemover) + self._defaults = {} + self._regex_defaults = {} + self._string_defaults = {} + self._functional_defaults = {} + self._rebulks = [] + + def pattern(self, *pattern): + """ + Add pattern objects + + :param pattern: + :type pattern: rebulk.pattern.Pattern + :return: self + :rtype: Rebulk + """ + self._patterns.extend(pattern) + return self + + def defaults(self, **kwargs): + """ + Define default keyword arguments for all patterns + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._defaults = kwargs + return self + + def regex_defaults(self, **kwargs): + """ + Define default keyword arguments for regex patterns.
+ :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._regex_defaults = kwargs + return self + + def regex(self, *pattern, **kwargs): + """ + Add re pattern + + :param pattern: + :type pattern: + :return: self + :rtype: Rebulk + """ + self.pattern(self.build_re(*pattern, **kwargs)) + return self + + def build_re(self, *pattern, **kwargs): + """ + Builds a new regular expression pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._regex_defaults, kwargs) + set_defaults(self._defaults, kwargs) + return RePattern(*pattern, **kwargs) + + def string_defaults(self, **kwargs): + """ + Define default keyword arguments for string patterns. + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._string_defaults = kwargs + return self + + def string(self, *pattern, **kwargs): + """ + Add string pattern + + :param pattern: + :type pattern: + :return: self + :rtype: Rebulk + """ + self.pattern(self.build_string(*pattern, **kwargs)) + return self + + def build_string(self, *pattern, **kwargs): + """ + Builds a new string pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._string_defaults, kwargs) + set_defaults(self._defaults, kwargs) + return StringPattern(*pattern, **kwargs) + + def functional_defaults(self, **kwargs): + """ + Define default keyword arguments for functional patterns. 
+ :param kwargs: + :type kwargs: + :return: + :rtype: + """ + self._functional_defaults = kwargs + return self + + def functional(self, *pattern, **kwargs): + """ + Add functional pattern + + :param pattern: + :type pattern: + :return: self + :rtype: Rebulk + """ + self.pattern(self.build_functional(*pattern, **kwargs)) + return self + + def build_functional(self, *pattern, **kwargs): + """ + Builds a new functional pattern + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._functional_defaults, kwargs) + set_defaults(self._defaults, kwargs) + return FunctionalPattern(*pattern, **kwargs) + + def chain(self, **kwargs): + """ + Add patterns chain, using configuration of this rebulk + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + chain = self.build_chain(**kwargs) + self._patterns.append(chain) + return chain + + def build_chain(self, **kwargs): + """ + Builds a new patterns chain + + :param pattern: + :type pattern: + :param kwargs: + :type kwargs: + :return: + :rtype: + """ + set_defaults(self._defaults, kwargs) + return Chain(self, **kwargs) + + def rules(self, *rules): + """ + Add rules as a module, class or instance. 
+ :param rules: + :type rules: list[Rule] + :return: + """ + self._rules.load(*rules) + return self + + def rebulk(self, *rebulks): + """ + Add a children rebulk object + :param rebulks: + :type rebulks: Rebulk + :return: + """ + self._rebulks.extend(rebulks) + return self + + def matches(self, string, context=None): + """ + Search for all matches with current configuration against input_string + :param string: string to search into + :type string: str + :param context: context to use + :type context: dict + :return: A custom list of matches + :rtype: Matches + """ + matches = Matches(input_string=string) + if context is None: + context = {} + + self._matches_patterns(matches, context) + + self._execute_rules(matches, context) + + return matches + + def effective_rules(self, context=None): + """ + Get effective rules for this rebulk object and its children. + :param context: + :type context: + :return: + :rtype: + """ + rules = Rules() + rules.extend(self._rules) + for rebulk in self._rebulks: + if not rebulk.disabled(context): + extend_safe(rules, rebulk._rules) + return rules + + def _execute_rules(self, matches, context): + """ + Execute rules for this rebulk and children. + :param matches: + :type matches: + :param context: + :type context: + :return: + :rtype: + """ + if not self.disabled(context): + rules = self.effective_rules(context) + rules.execute_all_rules(matches, context) + + def effective_patterns(self, context=None): + """ + Get effective patterns for this rebulk object and its children. 
+ :param context: + :type context: + :return: + :rtype: + """ + patterns = list(self._patterns) + for rebulk in self._rebulks: + if not rebulk.disabled(context): + extend_safe(patterns, rebulk._patterns) + return patterns + + def _matches_patterns(self, matches, context): + """ + Search for all matches with current patterns against input_string + :param matches: matches list + :type matches: Matches + :param context: context to use + :type context: dict + :return: + :rtype: + """ + if not self.disabled(context): + patterns = self.effective_patterns(context) + for pattern in patterns: + if not pattern.disabled(context): + pattern_matches = pattern.matches(matches.input_string, context) + if pattern_matches: + log(pattern.log_level, "Pattern has %s match(es). (%s)", len(pattern_matches), pattern) + else: + pass + # log(pattern.log_level, "Pattern doesn't match. (%s)" % (pattern,)) + for match in pattern_matches: + if match.marker: + log(pattern.log_level, "Marker found. (%s)", match) + matches.markers.append(match) + else: + log(pattern.log_level, "Match found. (%s)", match) + matches.append(match) + else: + log(pattern.log_level, "Pattern is disabled.
(%s)", pattern) diff --git a/libs/rebulk/remodule.py b/libs/rebulk/remodule.py new file mode 100644 index 00000000..d1d68d19 --- /dev/null +++ b/libs/rebulk/remodule.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Uniform re module +""" +# pylint: disable-all +import os + +REGEX_AVAILABLE = False +if os.environ.get('REGEX_DISABLED') in ["1", "true", "True", "Y"]: + import re +else: + try: + import regex as re + REGEX_AVAILABLE = True + except ImportError: + import re diff --git a/libs/rebulk/rules.py b/libs/rebulk/rules.py new file mode 100644 index 00000000..19b563ab --- /dev/null +++ b/libs/rebulk/rules.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Abstract rule class definition and rule engine implementation +""" +from abc import ABCMeta, abstractmethod +import inspect +from itertools import groupby +from logging import getLogger + +import six +from .utils import is_iterable + +from .toposort import toposort + +from . import debug + +log = getLogger(__name__).log + + +@six.add_metaclass(ABCMeta) +class Consequence(object): + """ + Definition of a consequence to apply. + """ + @abstractmethod + def then(self, matches, when_response, context): # pragma: no cover + """ + Action implementation. + + :param matches: + :type matches: rebulk.match.Matches + :param context: + :type context: + :param when_response: return object from when call. + :type when_response: object + :return: True if the action was runned, False if it wasn't. + :rtype: bool + """ + pass + + +@six.add_metaclass(ABCMeta) +class Condition(object): + """ + Definition of a condition to check. + """ + @abstractmethod + def when(self, matches, context): # pragma: no cover + """ + Condition implementation. + + :param matches: + :type matches: rebulk.match.Matches + :param context: + :type context: + :return: truthy if rule should be triggered and execute then action, falsy if it should not. 
+ :rtype: object + """ + pass + + +@six.add_metaclass(ABCMeta) +class CustomRule(Condition, Consequence): + """ + Definition of a rule to apply + """ + # pylint: disable=no-self-use, unused-argument, abstract-method + priority = 0 + name = None + dependency = None + properties = {} + + def __init__(self, log_level=None): + self.defined_at = debug.defined_at() + if log_level is None and not hasattr(self, 'log_level'): + self.log_level = debug.LOG_LEVEL + + def enabled(self, context): + """ + Disable rule. + + :param context: + :type context: + :return: True if rule is enabled, False if disabled + :rtype: bool + """ + return True + + def __lt__(self, other): + return self.priority > other.priority + + def __repr__(self): + defined = "" + if self.defined_at: + defined = "@%s" % (self.defined_at,) + return "<%s%s>" % (self.name if self.name else self.__class__.__name__, defined) + + def __eq__(self, other): + return self.__class__ == other.__class__ + + def __hash__(self): + return hash(self.__class__) + + +class Rule(CustomRule): + """ + Definition of a rule to apply + """ + # pylint:disable=abstract-method + consequence = None + + def then(self, matches, when_response, context): + assert self.consequence + if is_iterable(self.consequence): + if not is_iterable(when_response): + when_response = [when_response] + iterator = iter(when_response) + for cons in self.consequence: #pylint: disable=not-an-iterable + if inspect.isclass(cons): + cons = cons() + cons.then(matches, next(iterator), context) + else: + cons = self.consequence + if inspect.isclass(cons): + cons = cons() # pylint:disable=not-callable + cons.then(matches, when_response, context) + + +class RemoveMatch(Consequence): # pylint: disable=abstract-method + """ + Remove matches returned by then + """ + def then(self, matches, when_response, context): + if is_iterable(when_response): + ret = [] + when_response = list(when_response) + for match in when_response: + if match in matches: + matches.remove(match) + 
ret.append(match) + return ret + else: + if when_response in matches: + matches.remove(when_response) + return when_response + + +class AppendMatch(Consequence): # pylint: disable=abstract-method + """ + Append matches returned by then + """ + def __init__(self, match_name=None): + self.match_name = match_name + + def then(self, matches, when_response, context): + if is_iterable(when_response): + ret = [] + when_response = list(when_response) + for match in when_response: + if match not in matches: + if self.match_name: + match.name = self.match_name + matches.append(match) + ret.append(match) + return ret + else: + if self.match_name: + when_response.name = self.match_name + if when_response not in matches: + matches.append(when_response) + return when_response + + +class RenameMatch(Consequence): # pylint: disable=abstract-method + """ + Rename matches returned by then + """ + def __init__(self, match_name): + self.match_name = match_name + self.remove = RemoveMatch() + self.append = AppendMatch() + + def then(self, matches, when_response, context): + removed = self.remove.then(matches, when_response, context) + if is_iterable(removed): + removed = list(removed) + for match in removed: + match.name = self.match_name + elif removed: + removed.name = self.match_name + if removed: + self.append.then(matches, removed, context) + + +class AppendTags(Consequence): # pylint: disable=abstract-method + """ + Add tags to returned matches + """ + def __init__(self, tags): + self.tags = tags + self.remove = RemoveMatch() + self.append = AppendMatch() + + def then(self, matches, when_response, context): + removed = self.remove.then(matches, when_response, context) + if is_iterable(removed): + removed = list(removed) + for match in removed: + match.tags.extend(self.tags) + elif removed: + removed.tags.extend(self.tags) # pylint: disable=no-member + if removed: + self.append.then(matches, removed, context) + + +class RemoveTags(Consequence): # pylint: disable=abstract-method + 
""" + Remove tags from returned matches + """ + def __init__(self, tags): + self.tags = tags + self.remove = RemoveMatch() + self.append = AppendMatch() + + def then(self, matches, when_response, context): + removed = self.remove.then(matches, when_response, context) + if is_iterable(removed): + removed = list(removed) + for match in removed: + for tag in self.tags: + if tag in match.tags: + match.tags.remove(tag) + elif removed: + for tag in self.tags: + if tag in removed.tags: # pylint: disable=no-member + removed.tags.remove(tag) # pylint: disable=no-member + if removed: + self.append.then(matches, removed, context) + + +class Rules(list): + """ + list of rules ready to execute. + """ + + def __init__(self, *rules): + super(Rules, self).__init__() + self.load(*rules) + + def load(self, *rules): + """ + Load rules from a Rule module, class or instance + + :param rules: + :type rules: + :return: + :rtype: + """ + for rule in rules: + if inspect.ismodule(rule): + self.load_module(rule) + elif inspect.isclass(rule): + self.load_class(rule) + else: + self.append(rule) + + def load_module(self, module): + """ + Load a rules module + + :param module: + :type module: + :return: + :rtype: + """ + # pylint: disable=unused-variable + for name, obj in inspect.getmembers(module, + lambda member: hasattr(member, '__module__') + and member.__module__ == module.__name__ + and inspect.isclass): + self.load_class(obj) + + def load_class(self, class_): + """ + Load a Rule class. + + :param class_: + :type class_: + :return: + :rtype: + """ + self.append(class_()) + + def execute_all_rules(self, matches, context): + """ + Execute all rules from this rules list. All when condition with same priority will be performed before + calling then actions. 
+ + :param matches: + :type matches: + :param context: + :type context: + :return: + :rtype: + """ + ret = [] + for priority, priority_rules in groupby(sorted(self), lambda rule: rule.priority): + sorted_rules = toposort_rules(list(priority_rules)) # Group by dependency graph toposort + for rules_group in sorted_rules: + rules_group = list(sorted(rules_group, key=self.index)) # Sort rules group based on initial ordering. + group_log_level = None + for rule in rules_group: + if group_log_level is None or group_log_level < rule.log_level: + group_log_level = rule.log_level + log(group_log_level, "%s independent rule(s) at priority %s.", len(rules_group), priority) + for rule in rules_group: + when_response = execute_rule(rule, matches, context) + if when_response is not None: + ret.append((rule, when_response)) + + return ret + + +def execute_rule(rule, matches, context): + """ + Execute the given rule. + :param rule: + :type rule: + :param matches: + :type matches: + :param context: + :type context: + :return: + :rtype: + """ + if rule.enabled(context): + log(rule.log_level, "Checking rule condition: %s", rule) + when_response = rule.when(matches, context) + if when_response: + log(rule.log_level, "Rule was triggered: %s", when_response) + log(rule.log_level, "Running rule consequence: %s %s", rule, when_response) + rule.then(matches, when_response, context) + return when_response + else: + log(rule.log_level, "Rule is disabled: %s", rule) + +def toposort_rules(rules): + """ + Sort given rules using toposort with dependency parameter. 
+ :param rules: + :type rules: + :return: + :rtype: + """ + graph = {} + class_dict = {} + for rule in rules: + if rule.__class__ in class_dict: + raise ValueError("Duplicate class rules are not allowed: %s" % rule.__class__) + class_dict[rule.__class__] = rule + for rule in rules: + if not is_iterable(rule.dependency) and rule.dependency: + rule_dependencies = [rule.dependency] + else: + rule_dependencies = rule.dependency + dependencies = set() + if rule_dependencies: + for dependency in rule_dependencies: + if inspect.isclass(dependency): + dependency = class_dict.get(dependency) + if dependency: + dependencies.add(dependency) + graph[rule] = dependencies + return toposort(graph) diff --git a/libs/rebulk/test/__init__.py b/libs/rebulk/test/__init__.py new file mode 100644 index 00000000..0ab48c94 --- /dev/null +++ b/libs/rebulk/test/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring diff --git a/libs/rebulk/test/default_rules_module.py b/libs/rebulk/test/default_rules_module.py new file mode 100644 index 00000000..5eed8e0d --- /dev/null +++ b/libs/rebulk/test/default_rules_module.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from ..match import Match +from ..rules import Rule, RemoveMatch, AppendMatch, RenameMatch, AppendTags, RemoveTags + + +class RuleRemove0(Rule): + consequence = RemoveMatch + def when(self, matches, context): + return matches[0] + + +class RuleAppend0(Rule): + consequence = AppendMatch() + def when(self, matches, context): + return Match(5, 10) + +class RuleRename0(Rule): + consequence = [RenameMatch('renamed')] + def when(self, matches, context): + return [Match(5, 10, name="original")] + +class RuleRemove1(Rule): + consequence = [RemoveMatch()] + def when(self, matches, context): + return [matches[0]] + +class RuleAppend1(Rule): + 
consequence = [AppendMatch] + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename1(Rule): + consequence = RenameMatch('renamed') + def when(self, matches, context): + return [Match(5, 10, name="original")] + +class RuleAppend2(Rule): + consequence = [AppendMatch('renamed')] + properties = {'renamed': [None]} + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename2(Rule): + consequence = RenameMatch('renamed') + def when(self, matches, context): + return Match(5, 10, name="original") + +class RuleAppend3(Rule): + consequence = AppendMatch('renamed') + properties = {'renamed': [None]} + def when(self, matches, context): + return [Match(5, 10)] + +class RuleRename3(Rule): + consequence = [RenameMatch('renamed')] + def when(self, matches, context): + return Match(5, 10, name="original") + +class RuleAppendTags0(Rule): + consequence = AppendTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags', 0) + +class RuleRemoveTags0(Rule): + consequence = RemoveTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags', 0) + +class RuleAppendTags1(Rule): + consequence = AppendTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags') + +class RuleRemoveTags1(Rule): + consequence = RemoveTags(['new-tag']) + def when(self, matches, context): + return matches.named('tags') diff --git a/libs/rebulk/test/rebulk_rules_module.py b/libs/rebulk/test/rebulk_rules_module.py new file mode 100644 index 00000000..0bd5ef33 --- /dev/null +++ b/libs/rebulk/test/rebulk_rules_module.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from rebulk.rules import Rule, RemoveMatch, CustomRule + + +class RemoveAllButLastYear(Rule): + consequence = RemoveMatch + def when(self, matches, context): + entries = matches.named('year') + return entries[:-1] + + +class 
PrefixedSuffixedYear(CustomRule): + def when(self, matches, context): + toRemove = [] + years = matches.named('year') + for year in years: + if not matches.previous(year, lambda p: p.name == 'yearPrefix') and \ + not matches.next(year, lambda n: n.name == 'yearSuffix'): + toRemove.append(year) + return toRemove + + def then(self, matches, when_response, context): + for to_remove in when_response: + matches.remove(to_remove) + + +class PrefixedSuffixedYearNoLambda(Rule): + consequence = RemoveMatch + def when(self, matches, context): + toRemove = [] + years = matches.named('year') + for year in years: + if not [m for m in matches.previous(year) if m.name == 'yearPrefix'] and \ + not [m for m in matches.next(year) if m.name == 'yearSuffix']: + toRemove.append(year) + return toRemove diff --git a/libs/rebulk/test/rules_module.py b/libs/rebulk/test/rules_module.py new file mode 100644 index 00000000..887b81da --- /dev/null +++ b/libs/rebulk/test/rules_module.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name +from ..match import Match +from ..rules import Rule + + +class Rule3(Rule): + def when(self, matches, context): + return context.get('when') + + def then(self, matches, when_response, context): + assert when_response in [True, False] + matches.append(Match(3, 4)) + + +class Rule2(Rule): + dependency = Rule3 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + matches.append(Match(3, 4)) + + +class Rule1(Rule): + dependency = Rule2 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + matches.clear() + + +class Rule0(Rule): + dependency = Rule1 + + def when(self, matches, context): + return True + + def then(self, matches, when_response, context): + assert when_response + matches.append(Match(3, 4)) + + +class 
Rule1Disabled(Rule1): + name = "Disabled Rule1" + + def enabled(self, context): + return False diff --git a/libs/rebulk/test/test_chain.py b/libs/rebulk/test/test_chain.py new file mode 100644 index 00000000..8238ad63 --- /dev/null +++ b/libs/rebulk/test/test_chain.py @@ -0,0 +1,303 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member +import re + +from functools import partial + +from ..validators import chars_surround +from ..rebulk import Rebulk, FunctionalPattern, RePattern, StringPattern + + +def test_chain_close(): + rebulk = Rebulk() + ret = rebulk.chain().close() + + assert ret == rebulk + assert len(rebulk.effective_patterns()) == 1 + + +def test_build_chain(): + rebulk = Rebulk() + + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + ret = rebulk.chain() \ + .functional(digit) \ + .string("test").repeater(2) \ + .string("x").repeater('{1,3}') \ + .string("optional").repeater('?') \ + .regex("f?x").repeater('+') \ + .close() + + assert ret == rebulk + assert len(rebulk.effective_patterns()) == 1 + + chain = rebulk.effective_patterns()[0] + + assert len(chain.parts) == 5 + + assert isinstance(chain.parts[0].pattern, FunctionalPattern) + assert chain.parts[0].repeater_start == 1 + assert chain.parts[0].repeater_end == 1 + + assert isinstance(chain.parts[1].pattern, StringPattern) + assert chain.parts[1].repeater_start == 2 + assert chain.parts[1].repeater_end == 2 + + assert isinstance(chain.parts[2].pattern, StringPattern) + assert chain.parts[2].repeater_start == 1 + assert chain.parts[2].repeater_end == 3 + + assert isinstance(chain.parts[3].pattern, StringPattern) + assert chain.parts[3].repeater_start == 0 + assert chain.parts[3].repeater_end == 1 + + assert isinstance(chain.parts[4].pattern, RePattern) + assert chain.parts[4].repeater_start == 1 + assert chain.parts[4].repeater_end is None + + +def test_chain_defaults(): + 
rebulk = Rebulk() + rebulk.defaults(validator=lambda x: True, ignore_names=['testIgnore'], children=True) + + rebulk.chain()\ + .regex("(?Ptest)") \ + .regex(" ").repeater("*") \ + .regex("(?PtestIgnore)") + matches = rebulk.matches("test testIgnore") + + assert len(matches) == 1 + assert matches[0].name == "test" + + +def test_matches(): + rebulk = Rebulk() + + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + input_string = "1849testtestxxfixfux_foxabc1849testtestxoptionalfoxabc" + + chain = rebulk.chain() \ + .functional(digit) \ + .string("test").hidden().repeater(2) \ + .string("x").hidden().repeater('{1,3}') \ + .string("optional").hidden().repeater('?') \ + .regex("f.?x", name='result').repeater('+') \ + .close() + + matches = chain.matches(input_string) + + assert len(matches) == 2 + children = matches[0].children + + assert children[0].value == '1849' + assert children[1].value == 'fix' + assert children[2].value == 'fux' + + children = matches[1].children + assert children[0].value == '1849' + assert children[1].value == 'fox' + + input_string = "_1850testtestxoptionalfoxabc" + matches = chain.matches(input_string) + + assert len(matches) == 0 + + input_string = "_1849testtesttesttestxoptionalfoxabc" + matches = chain.matches(input_string) + + assert len(matches) == 0 + + input_string = "_1849testtestxxxxoptionalfoxabc" + matches = chain.matches(input_string) + + assert len(matches) == 0 + + input_string = "_1849testtestoptionalfoxabc" + matches = chain.matches(input_string) + + assert len(matches) == 0 + + input_string = "_1849testtestxoptionalabc" + matches = chain.matches(input_string) + + assert len(matches) == 0 + + input_string = "_1849testtestxoptionalfaxabc" + matches = chain.matches(input_string) + + assert len(matches) == 1 + children = matches[0].children + + assert children[0].value == '1849' + assert children[1].value == 'fax' + + +def test_matches_2(): + rebulk = Rebulk() \ + 
.regex_defaults(flags=re.IGNORECASE) \ + .chain(children=True, formatter={'episode': int}) \ + .defaults(formatter={'version': int}) \ + .regex(r'e(?P\d{1,4})') \ + .regex(r'v(?P\d+)').repeater('?') \ + .regex(r'[ex-](?P\d{1,4})').repeater('*') \ + .close() + + matches = rebulk.matches("This is E14v2-15E16x17") + assert len(matches) == 5 + + assert matches[0].name == 'episode' + assert matches[0].value == 14 + + assert matches[1].name == 'version' + assert matches[1].value == 2 + + assert matches[2].name == 'episode' + assert matches[2].value == 15 + + assert matches[3].name == 'episode' + assert matches[3].value == 16 + + assert matches[4].name == 'episode' + assert matches[4].value == 17 + + +def test_matches_3(): + alt_dash = (r'@', r'[\W_]') # abbreviation + + rebulk = Rebulk() + + rebulk.chain(formatter={'season': int, 'episode': int}, + tags=['SxxExx'], + abbreviations=[alt_dash], + private_names=['episodeSeparator', 'seasonSeparator'], + children=True, + private_parent=True, + conflict_solver=lambda match, other: match + if match.name in ['season', 'episode'] and other.name in + ['screen_size', 'video_codec', 'audio_codec', + 'audio_channels', 'container', 'date'] + else '__default__') \ + .regex(r'(?P\d+)@?x@?(?P\d+)') \ + .regex(r'(?Px|-|\+|&)(?P\d+)').repeater('*') \ + .chain() \ + .regex(r'S(?P\d+)@?(?:xE|Ex|E|x)@?(?P\d+)') \ + .regex(r'(?:(?PxE|Ex|E|x|-|\+|&)(?P\d+))').repeater('*') \ + .chain() \ + .regex(r'S(?P\d+)') \ + .regex(r'(?PS|-|\+|&)(?P\d+)').repeater('*') + + matches = rebulk.matches("test-01x02-03") + assert len(matches) == 3 + + assert matches[0].name == 'season' + assert matches[0].value == 1 + + assert matches[1].name == 'episode' + assert matches[1].value == 2 + + assert matches[2].name == 'episode' + assert matches[2].value == 3 + + matches = rebulk.matches("test-S01E02-03") + + assert len(matches) == 3 + assert matches[0].name == 'season' + assert matches[0].value == 1 + + assert matches[1].name == 'episode' + assert matches[1].value 
== 2 + + assert matches[2].name == 'episode' + assert matches[2].value == 3 + + matches = rebulk.matches("test-S01-02-03-04") + + assert len(matches) == 4 + assert matches[0].name == 'season' + assert matches[0].value == 1 + + assert matches[1].name == 'season' + assert matches[1].value == 2 + + assert matches[2].name == 'season' + assert matches[2].value == 3 + + assert matches[3].name == 'season' + assert matches[3].value == 4 + + +def test_matches_4(): + seps_surround = partial(chars_surround, " ") + + rebulk = Rebulk() + rebulk.regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, + validator={'__parent__': seps_surround}, children=True, private_parent=True) + + rebulk.chain(formatter={'episode': int, 'version': int}) \ + .defaults(validator=None) \ + .regex(r'e(?P\d{1,4})') \ + .regex(r'v(?P\d+)').repeater('?') \ + .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('*') + + matches = rebulk.matches("Some Series E01E02E03") + assert len(matches) == 3 + + assert matches[0].value == 1 + assert matches[1].value == 2 + assert matches[2].value == 3 + + +def test_matches_5(): + seps_surround = partial(chars_surround, " ") + + rebulk = Rebulk() + rebulk.regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, + validator={'__parent__': seps_surround}, children=True, private_parent=True) + + rebulk.chain(formatter={'episode': int, 'version': int}) \ + .defaults(validator=None) \ + .regex(r'e(?P\d{1,4})') \ + .regex(r'v(?P\d+)').repeater('?') \ + .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('{2,3}') + + matches = rebulk.matches("Some Series E01E02E03") + assert len(matches) == 3 + + matches = rebulk.matches("Some Series E01E02") + assert len(matches) == 0 + + matches = rebulk.matches("Some Series E01E02E03E04E05E06") # Parent can't be validated, so no results at all + assert len(matches) == 0 + + +def test_matches_6(): + rebulk = Rebulk() + 
rebulk.regex_defaults(flags=re.IGNORECASE) + rebulk.defaults(private_names=['episodeSeparator', 'seasonSeparator'], validate_all=True, + validator=None, children=True, private_parent=True) + + rebulk.chain(formatter={'episode': int, 'version': int}) \ + .defaults(validator=None) \ + .regex(r'e(?P\d{1,4})') \ + .regex(r'v(?P\d+)').repeater('?') \ + .regex(r'(?Pe|x|-)(?P\d{1,4})').repeater('{2,3}') + + matches = rebulk.matches("Some Series E01E02E03") + assert len(matches) == 3 + + matches = rebulk.matches("Some Series E01E02") + assert len(matches) == 0 + + matches = rebulk.matches("Some Series E01E02E03E04E05E06") # No validator on parent, so it should give 4 episodes. + assert len(matches) == 4 diff --git a/libs/rebulk/test/test_debug.py b/libs/rebulk/test/test_debug.py new file mode 100644 index 00000000..a35f95fd --- /dev/null +++ b/libs/rebulk/test/test_debug.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, protected-access, invalid-name + +from ..pattern import StringPattern +from ..rebulk import Rebulk +from ..match import Match +from .. 
import debug +from .default_rules_module import RuleRemove0 + + +class TestDebug(object): + + + #request.addfinalizer(disable_debug) + + + + debug.DEBUG = True + pattern = StringPattern(1, 3, value="es") + + match = Match(1, 3, value="es") + rule = RuleRemove0() + + input_string = "This is a debug test" + rebulk = Rebulk().string("debug") \ + .string("is") + + matches = rebulk.matches(input_string) + debug.DEBUG = False + + @classmethod + def setup_class(cls): + debug.DEBUG = True + + @classmethod + def teardown_class(cls): + debug.DEBUG = False + + def test_pattern(self): + assert self.pattern.defined_at.lineno == 20 + assert self.pattern.defined_at.name == 'rebulk.test.test_debug' + assert self.pattern.defined_at.filename.endswith('test_debug.py') + + assert str(self.pattern.defined_at) == 'test_debug.py#L20' + assert repr(self.pattern) == '' + + def test_match(self): + assert self.match.defined_at.lineno == 22 + assert self.match.defined_at.name == 'rebulk.test.test_debug' + assert self.match.defined_at.filename.endswith('test_debug.py') + + assert str(self.match.defined_at) == 'test_debug.py#L22' + + def test_rule(self): + assert self.rule.defined_at.lineno == 23 + assert self.rule.defined_at.name == 'rebulk.test.test_debug' + assert self.rule.defined_at.filename.endswith('test_debug.py') + + assert str(self.rule.defined_at) == 'test_debug.py#L23' + assert repr(self.rule) == '' + + def test_rebulk(self): + """ + This test fails on travis CI, can't find out why there's 1 line offset ... 
+ """ + assert self.rebulk._patterns[0].defined_at.lineno in [26, 27] + assert self.rebulk._patterns[0].defined_at.name == 'rebulk.test.test_debug' + assert self.rebulk._patterns[0].defined_at.filename.endswith('test_debug.py') + + assert str(self.rebulk._patterns[0].defined_at) in ['test_debug.py#L26', 'test_debug.py#L27'] + + assert self.rebulk._patterns[1].defined_at.lineno in [27, 28] + assert self.rebulk._patterns[1].defined_at.name == 'rebulk.test.test_debug' + assert self.rebulk._patterns[1].defined_at.filename.endswith('test_debug.py') + + assert str(self.rebulk._patterns[1].defined_at) in ['test_debug.py#L27', 'test_debug.py#L28'] + + assert self.matches[0].defined_at == self.rebulk._patterns[0].defined_at + assert self.matches[1].defined_at == self.rebulk._patterns[1].defined_at + + def test_repr(self): + str(self.matches) diff --git a/libs/rebulk/test/test_introspector.py b/libs/rebulk/test/test_introspector.py new file mode 100644 index 00000000..24c0c500 --- /dev/null +++ b/libs/rebulk/test/test_introspector.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Introspector tests +""" +# pylint: disable=no-self-use,pointless-statement,missing-docstring,protected-access,invalid-name +from ..rebulk import Rebulk +from .. 
import introspector +from .default_rules_module import RuleAppend2, RuleAppend3 + + +def test_string_introspector(): + rebulk = Rebulk().string('One', 'Two', 'Three', name='first').string('1', '2', '3', name='second') + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['first'] == ['One', 'Two', 'Three'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['second'] == ['1', '2', '3'] + + properties = introspected.properties + assert len(properties) == 2 + assert properties['first'] == first_properties['first'] + assert properties['second'] == second_properties['second'] + + +def test_string_properties(): + rebulk = Rebulk()\ + .string('One', 'Two', 'Three', name='first', properties={'custom': ['One']})\ + .string('1', '2', '3', name='second', properties={'custom': [1]}) + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 2 + assert len(introspected.rules) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['custom'] == ['One'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['custom'] == [1] + + properties = introspected.properties + assert len(properties) == 1 + assert properties['custom'] == ['One', 1] + + +def test_various_pattern(): + rebulk = Rebulk()\ + .regex('One', 'Two', 'Three', name='first', value="string") \ + .string('1', '2', '3', name='second', value="digit") \ + .string('4', '5', '6', name='third') \ + .string('private', private=True) \ + .functional(lambda string: (0, 5), name='func', value='test') \ + .regex('One', 'Two', 'Three', name='regex_name') \ + .regex('(?POne)(?PTwo)(?PThree)') \ + .functional(lambda string: (6, 10), name='func2') \ + 
.string('7', name='third') + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.patterns) == 8 + assert len(introspected.rules) == 2 + + first_properties = introspected.patterns[0].properties + assert len(first_properties) == 1 + first_properties['first'] == ['string'] + + second_properties = introspected.patterns[1].properties + assert len(second_properties) == 1 + second_properties['second'] == ['digit'] + + third_properties = introspected.patterns[2].properties + assert len(third_properties) == 1 + third_properties['third'] == ['4', '5', '6'] + + func_properties = introspected.patterns[3].properties + assert len(func_properties) == 1 + func_properties['func'] == ['test'] + + regex_name_properties = introspected.patterns[4].properties + assert len(regex_name_properties) == 1 + regex_name_properties['regex_name'] == [None] + + regex_groups_properties = introspected.patterns[5].properties + assert len(regex_groups_properties) == 3 + regex_groups_properties['one'] == [None] + regex_groups_properties['two'] == [None] + regex_groups_properties['three'] == [None] + + func2_properties = introspected.patterns[6].properties + assert len(func2_properties) == 1 + func2_properties['func2'] == [None] + + append_third_properties = introspected.patterns[7].properties + assert len(append_third_properties) == 1 + append_third_properties['third'] == [None] + + properties = introspected.properties + assert len(properties) == 9 + assert properties['first'] == first_properties['first'] + assert properties['second'] == second_properties['second'] + assert properties['third'] == third_properties['third'] + append_third_properties['third'] + assert properties['func'] == func_properties['func'] + assert properties['regex_name'] == regex_name_properties['regex_name'] + assert properties['one'] == regex_groups_properties['one'] + assert properties['two'] == regex_groups_properties['two'] + assert properties['three'] == regex_groups_properties['three'] + 
assert properties['func2'] == func2_properties['func2'] + + +def test_rule_properties(): + rebulk = Rebulk(default_rules=False).rules(RuleAppend2, RuleAppend3) + + introspected = introspector.introspect(rebulk, None) + + assert len(introspected.rules) == 2 + assert len(introspected.patterns) == 0 + + rule_properties = introspected.rules[0].properties + assert len(rule_properties) == 1 + assert rule_properties['renamed'] == [None] + + rule_properties = introspected.rules[1].properties + assert len(rule_properties) == 1 + assert rule_properties['renamed'] == [None] + + properties = introspected.properties + assert len(properties) == 1 + assert properties['renamed'] == [None] diff --git a/libs/rebulk/test/test_loose.py b/libs/rebulk/test/test_loose.py new file mode 100644 index 00000000..bc0c6bca --- /dev/null +++ b/libs/rebulk/test/test_loose.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +from ..loose import call + + +def test_loose_function(): + + def func(v1, v2, v3=3, v4=4): + return v1 + v2 + v3 + v4 + + assert call(func, 1, 2) == func(1, 2) + assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) + assert call(func, 1, 2, v3=4, v4=5) == func(1, 2, v3=4, v4=5) + assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) + assert call(func, 1, 2, 3, 4, more=5) == func(1, 2, 3, 4) + + +def test_loose_varargs_function(): + def func(v1, v2, *args): + return v1 + v2 + args[0] if len(args) > 0 else 3 + args[1] if len(args) > 1 else 4 + + assert call(func, 1, 2) == func(1, 2) + assert call(func, 1, 2, 3, 5) == func(1, 2, 3, 5) + assert call(func, 1, 2, 3, 4, 5) == func(1, 2, 3, 4) + + +def test_loose_kwargs_function(): + def func(v1, v2, **kwargs): + return v1 + v2 + kwargs.get('v3', 3) + kwargs.get('v4', 4) + + assert call(func, v1=1, v2=2) == func(v1=1, v2=2) + assert call(func, v1=1, v2=2, v3=3, v4=5) == func(v1=1, v2=2, v3=3, v4=5) + + +def test_loose_class(): + class 
Dummy(object): + def __init__(self, v1, v2, v3=3, v4=4): + self.v1 = v1 + self.v2 = v2 + self.v3 = v3 + self.v4 = v4 + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() + assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() + assert call(Dummy, 1, 2, v3=4, v4=5).call() == Dummy(1, 2, v3=4, v4=5).call() + assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() + assert call(Dummy, 1, 2, 3, 4, more=5).call() == Dummy(1, 2, 3, 4).call() + + +def test_loose_varargs_class(): + class Dummy(object): + def __init__(self, v1, v2, *args): + self.v1 = v1 + self.v2 = v2 + self.v3 = args[0] if len(args) > 0 else 3 + self.v4 = args[1] if len(args) > 1 else 4 + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, 1, 2).call() == Dummy(1, 2).call() + assert call(Dummy, 1, 2, 3, 5).call() == Dummy(1, 2, 3, 5).call() + assert call(Dummy, 1, 2, 3, 4, 5).call() == Dummy(1, 2, 3, 4).call() + + +def test_loose_kwargs_class(): + class Dummy(object): + def __init__(self, v1, v2, **kwargs): + self.v1 = v1 + self.v2 = v2 + self.v3 = kwargs.get('v3', 3) + self.v4 = kwargs.get('v4', 4) + + def call(self): + return self.v1 + self.v2 + self.v3 + self.v4 + + assert call(Dummy, v1=1, v2=2).call() == Dummy(v1=1, v2=2).call() + assert call(Dummy, v1=1, v2=2, v3=3, v4=5).call() == Dummy(v1=1, v2=2, v3=3, v4=5).call() diff --git a/libs/rebulk/test/test_match.py b/libs/rebulk/test/test_match.py new file mode 100644 index 00000000..efbc63d0 --- /dev/null +++ b/libs/rebulk/test/test_match.py @@ -0,0 +1,565 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, unneeded-not + +import pytest +import six + +from ..match import Match, Matches +from ..pattern import StringPattern, RePattern +from ..formatters import formatters + + +class TestMatchClass(object): + def test_repr(self): + match1 = Match(1, 3, 
value="es") + + assert repr(match1) == '' + + match2 = Match(0, 4, value="test", private=True, name="abc", tags=['one', 'two']) + + assert repr(match2) == '' + + def test_names(self): + parent = Match(0, 10, name="test") + parent.children.append(Match(0, 10, name="child1", parent=parent)) + parent.children.append(Match(0, 10, name="child2", parent=parent)) + + assert set(parent.names) == set(["child1", "child2"]) + + def test_equality(self): + match1 = Match(1, 3, value="es") + match2 = Match(1, 3, value="es") + + other = object() + + assert hash(match1) == hash(match2) + assert hash(match1) != hash(other) + + assert match1 == match2 + assert not match1 == other + + def test_inequality(self): + match1 = Match(0, 2, value="te") + match2 = Match(2, 4, value="st") + match3 = Match(0, 2, value="other") + + other = object() + + assert hash(match1) != hash(match2) + assert hash(match1) != hash(match3) + + assert match1 != other + assert match1 != match2 + assert match1 != match3 + + def test_length(self): + match1 = Match(0, 4, value="test") + match2 = Match(0, 2, value="spanIsUsed") + + assert len(match1) == 4 + assert len(match2) == 2 + + def test_compare(self): + match1 = Match(0, 2, value="te") + match2 = Match(2, 4, value="st") + + other = object() + + assert match1 < match2 + assert match1 <= match2 + + assert match2 > match1 + assert match2 >= match1 + + if six.PY3: + with pytest.raises(TypeError): + match1 < other + + with pytest.raises(TypeError): + match1 <= other + + with pytest.raises(TypeError): + match1 > other + + with pytest.raises(TypeError): + match1 >= other + else: + assert match1 < other + assert match1 <= other + assert not match1 > other + assert not match1 >= other + + def test_value(self): + match1 = Match(1, 3) + match1.value = "test" + + assert match1.value == "test" + + +class TestMatchesClass(object): + match1 = Match(0, 2, value="te", name="start") + match2 = Match(2, 3, value="s", tags="tag1") + match3 = Match(3, 4, value="t", tags=["tag1", 
"tag2"]) + match4 = Match(2, 4, value="st", name="end") + + def test_tag(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + assert "start" in matches.names + assert "end" in matches.names + + assert "tag1" in matches.tags + assert "tag2" in matches.tags + + tag1 = matches.tagged("tag1") + assert len(tag1) == 2 + assert tag1[0] == self.match2 + assert tag1[1] == self.match3 + + tag2 = matches.tagged("tag2") + assert len(tag2) == 1 + assert tag2[0] == self.match3 + + start = matches.named("start") + assert len(start) == 1 + assert start[0] == self.match1 + + end = matches.named("end") + assert len(end) == 1 + assert end[0] == self.match4 + + def test_base(self): + matches = Matches() + matches.append(self.match1) + + assert len(matches) == 1 + assert repr(matches) == repr([self.match1]) + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == [self.match1] + + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + assert len(matches) == 4 + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + assert list(matches.range()) == [self.match1, self.match2, self.match4, self.match3] + assert list(matches.range(0)) == [self.match1, self.match2, self.match4, self.match3] + assert list(matches.range(0, 3)) == [self.match1, self.match2, self.match4] + assert list(matches.range(2, 3)) == [self.match2, self.match4] + assert list(matches.range(3, 4)) == [self.match4, self.match3] + + matches.remove(self.match1) + assert len(matches) == 3 + assert len(matches.starting(0)) == 0 + assert len(matches.ending(2)) == 0 + + matches.clear() + + assert len(matches) == 0 + assert len(matches.starting(0)) == 0 + assert len(matches.starting(2)) == 0 + 
assert len(matches.starting(3)) == 0 + assert len(matches.ending(2)) == 0 + assert len(matches.ending(3)) == 0 + assert len(matches.ending(4)) == 0 + + def test_get_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + slice_matches = matches[1:3] + + assert isinstance(slice_matches, Matches) + + assert len(slice_matches) == 2 + assert slice_matches[0] == self.match2 + assert slice_matches[1] == self.match3 + + def test_remove_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + del matches[1:3] + + assert len(matches) == 2 + assert matches[0] == self.match1 + assert matches[1] == self.match4 + + def test_set_slices(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + matches.append(self.match4) + + matches[1:3] = self.match1, self.match4 + + assert len(matches) == 4 + assert matches[0] == self.match1 + assert matches[1] == self.match1 + assert matches[2] == self.match4 + assert matches[3] == self.match4 + + def test_set_index(self): + matches = Matches() + matches.append(self.match1) + matches.append(self.match2) + matches.append(self.match3) + + matches[1] = self.match4 + + assert len(matches) == 3 + assert matches[0] == self.match1 + assert matches[1] == self.match4 + assert matches[2] == self.match3 + + def test_constructor(self): + matches = Matches([self.match1, self.match2, self.match3, self.match4]) + + assert len(matches) == 4 + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == [self.match1] + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + + def 
test_constructor_kwargs(self): + matches = Matches([self.match1, self.match2, self.match3, self.match4], input_string="test") + + assert len(matches) == 4 + assert matches.input_string == "test" + assert list(matches.starting(0)) == [self.match1] + assert list(matches.ending(2)) == [self.match1] + assert list(matches.starting(2)) == [self.match2, self.match4] + assert list(matches.starting(3)) == [self.match3] + assert list(matches.ending(3)) == [self.match2] + assert list(matches.ending(4)) == [self.match3, self.match4] + + def test_crop(self): + input_string = "abcdefghijklmnopqrstuvwxyz" + + match1 = Match(1, 10, input_string=input_string) + match2 = Match(0, 2, input_string=input_string) + match3 = Match(8, 15, input_string=input_string) + + ret = match1.crop([match2, match3.span]) + + assert len(ret) == 1 + + assert ret[0].span == (2, 8) + assert ret[0].value == "cdefgh" + + ret = match1.crop((1, 10)) + assert len(ret) == 0 + + ret = match1.crop((1, 3)) + assert len(ret) == 1 + assert ret[0].span == (3, 10) + + ret = match1.crop((7, 10)) + assert len(ret) == 1 + assert ret[0].span == (1, 7) + + ret = match1.crop((0, 12)) + assert len(ret) == 0 + + ret = match1.crop((4, 6)) + assert len(ret) == 2 + + assert ret[0].span == (1, 4) + assert ret[1].span == (6, 10) + + ret = match1.crop([(3, 5), (7, 9)]) + assert len(ret) == 3 + + assert ret[0].span == (1, 3) + assert ret[1].span == (5, 7) + assert ret[2].span == (9, 10) + + def test_split(self): + input_string = "123 +word1 - word2 + word3 456" + match = Match(3, len(input_string) - 3, input_string=input_string) + splitted = match.split(" -+") + + assert len(splitted) == 3 + assert [split.value for split in splitted] == ["word1", "word2", "word3"] + + +class TestMaches(object): + def test_names(self): + input_string = "One Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) + matches.extend(RePattern("One", name="1-re", tags=["One", 
"re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3-str", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) + + assert set(matches.names) == set(["1-str", "1-re", "2-str", "2-re", "3-str", "3-re"]) + + def test_filters(self): + input_string = "One Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1-str", tags=["One", "str"]).matches(input_string)) + matches.extend(RePattern("One", name="1-re", tags=["One", "re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2-str", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2-re", tags=["Two", "re"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3-str", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3-re", tags=["Three", "re"]).matches(input_string)) + + selection = matches.starting(0) + assert len(selection) == 2 + + selection = matches.starting(0, lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "1-str" + + selection = matches.ending(7, predicate=lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "2-str" + + selection = matches.previous(matches.named("2-str")[0]) + assert len(selection) == 2 + assert selection[0].pattern.name == "1-str" + assert selection[1].pattern.name == "1-re" + + selection = matches.previous(matches.named("2-str", 0), lambda m: "str" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "1-str" + + selection = matches.next(matches.named("2-str", 0)) + assert len(selection) == 2 + assert selection[0].pattern.name == "3-str" + assert 
selection[1].pattern.name == "3-re" + + selection = matches.next(matches.named("2-str", 0), index=0, predicate=lambda m: "re" in m.tags) + assert selection is not None + assert selection.pattern.name == "3-re" + + selection = matches.next(matches.named("2-str", index=0), lambda m: "re" in m.tags) + assert len(selection) == 1 + assert selection[0].pattern.name == "3-re" + + selection = matches.named("2-str", lambda m: "re" in m.tags) + assert len(selection) == 0 + + selection = matches.named("2-re", lambda m: "re" in m.tags, 0) + assert selection is not None + assert selection.name == "2-re" # pylint:disable=no-member + + selection = matches.named("2-re", lambda m: "re" in m.tags) + assert len(selection) == 1 + assert selection[0].name == "2-re" + + selection = matches.named("2-re", lambda m: "re" in m.tags, index=1000) + assert selection is None + + def test_raw(self): + input_string = "0123456789" + + match = Match(0, 10, input_string=input_string, formatter=lambda s: s*2) + + assert match.value == match.raw * 2 + assert match.raw == input_string + + match.raw_end = 9 + match.raw_start = 1 + + assert match.value == match.raw * 2 + assert match.raw == input_string[1:9] + + match.raw_end = None + match.raw_start = None + + assert match.value == match.raw * 2 + assert match.raw == input_string + + + def test_formatter_chain(self): + input_string = "100" + + match = Match(0, 3, input_string=input_string, formatter=formatters(int, lambda s: s*2, lambda s: s+10)) + + assert match.raw == input_string + assert match.value == 100 * 2 + 10 + + + def test_to_dict(self): + input_string = "One Two Two Three" + + matches = Matches() + + matches.extend(StringPattern("One", name="1", tags=["One", "str"]).matches(input_string)) + matches.extend(RePattern("One", name="1", tags=["One", "re"]).matches(input_string)) + matches.extend(StringPattern("Two", name="2", tags=["Two", "str"]).matches(input_string)) + matches.extend(RePattern("Two", name="2", tags=["Two", 
"re"]).matches(input_string)) + matches.extend(RePattern("Two", name="2", tags=["Two", "reBis"]).matches(input_string)) + matches.extend(StringPattern("Three", name="3", tags=["Three", "str"]).matches(input_string)) + matches.extend(RePattern("Three", name="3bis", tags=["Three", "re"]).matches(input_string)) + matches.extend(RePattern(r"(\w+)", name="words").matches(input_string)) + + kvalues = matches.to_dict() + assert kvalues == {"1": "One", + "2": "Two", + "3": "Three", + "3bis": "Three", + "words": "One"} + assert kvalues.values_list["words"] == ["One", "Two", "Three"] + + kvalues = matches.to_dict(details=True, implicit=True) + assert kvalues["1"].value == "One" + + assert len(kvalues["2"]) == 2 + assert kvalues["2"][0].value == "Two" + assert kvalues["2"][1].value == "Two" + + assert kvalues["3"].value == "Three" + assert kvalues["3bis"].value == "Three" + + assert len(kvalues["words"]) == 4 + assert kvalues["words"][0].value == "One" + assert kvalues["words"][1].value == "Two" + assert kvalues["words"][2].value == "Two" + assert kvalues["words"][3].value == "Three" + + kvalues = matches.to_dict(details=True) + assert kvalues["1"].value == "One" + + assert len(kvalues.values_list["2"]) == 2 + assert kvalues.values_list["2"][0].value == "Two" + assert kvalues.values_list["2"][1].value == "Two" + + assert kvalues["3"].value == "Three" + assert kvalues["3bis"].value == "Three" + + assert len(kvalues.values_list["words"]) == 4 + assert kvalues.values_list["words"][0].value == "One" + assert kvalues.values_list["words"][1].value == "Two" + assert kvalues.values_list["words"][2].value == "Two" + assert kvalues.values_list["words"][3].value == "Three" + + def test_chains(self): + input_string = "wordX 10 20 30 40 wordA, wordB, wordC 70 80 wordX" + + matches = Matches(input_string=input_string) + + matches.extend(RePattern(r"\d+", name="digit").matches(input_string)) + matches.extend(RePattern("[a-zA-Z]+", name="word").matches(input_string)) + + assert len(matches) 
== 11 + + a_start = input_string.find('wordA') + + b_start = input_string.find('wordB') + b_end = b_start + len('wordB') + + c_start = input_string.find('wordC') + c_end = c_start + len('wordC') + + chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "word") + assert len(chain_before) == 1 + assert chain_before[0].value == 'wordA' + + chain_before = matches.chain_before(Match(b_start, b_start), " ,", predicate=lambda match: match.name == "word") + assert len(chain_before) == 1 + assert chain_before[0].value == 'wordA' + + chain_before = matches.chain_before(b_start, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_before) == 0 + + chain_before = matches.chain_before(a_start, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_before) == 4 + assert [match.value for match in chain_before] == ["40", "30", "20", "10"] + + chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "word") + assert len(chain_after) == 1 + assert chain_after[0].value == 'wordC' + + chain_after = matches.chain_after(Match(b_end, b_end), " ,", predicate=lambda match: match.name == "word") + assert len(chain_after) == 1 + assert chain_after[0].value == 'wordC' + + chain_after = matches.chain_after(b_end, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_after) == 0 + + chain_after = matches.chain_after(c_end, " ,", predicate=lambda match: match.name == "digit") + assert len(chain_after) == 2 + assert [match.value for match in chain_after] == ["70", "80"] + + chain_after = matches.chain_after(c_end, " ,", end=10000, predicate=lambda match: match.name == "digit") + assert len(chain_after) == 2 + assert [match.value for match in chain_after] == ["70", "80"] + + def test_holes(self): + input_string = '1'*10+'2'*10+'3'*10+'4'*10+'5'*10+'6'*10+'7'*10 + + hole1 = Match(0, 10, input_string=input_string) + hole2 = Match(20, 30, input_string=input_string) + hole3 = Match(30, 
40, input_string=input_string) + hole4 = Match(60, 70, input_string=input_string) + + matches = Matches([hole1, hole2], input_string=input_string) + matches.append(hole3) + matches.append(hole4) + + holes = list(matches.holes()) + assert len(holes) == 2 + assert holes[0].span == (10, 20) + assert holes[0].value == '2'*10 + assert holes[1].span == (40, 60) + assert holes[1].value == '5' * 10 + '6' * 10 + + holes = list(matches.holes(5, 15)) + assert len(holes) == 1 + assert holes[0].span == (10, 15) + assert holes[0].value == '2'*5 + + holes = list(matches.holes(5, 15, formatter=lambda value: "formatted")) + assert len(holes) == 1 + assert holes[0].span == (10, 15) + assert holes[0].value == "formatted" + + holes = list(matches.holes(5, 15, predicate=lambda hole: False)) + assert len(holes) == 0 + + def test_holes_empty(self): + input_string = "Test hole on empty matches" + matches = Matches(input_string=input_string) + holes = matches.holes() + assert len(holes) == 1 + assert holes[0].value == input_string + + def test_holes_seps(self): + input_string = "Test hole - with many separators + included" + match = StringPattern("many").matches(input_string) + + matches = Matches(match, input_string) + holes = matches.holes() + + assert len(holes) == 2 + + holes = matches.holes(seps="-+") + + assert len(holes) == 4 + assert [hole.value for hole in holes] == ["Test hole ", " with ", " separators ", " included"] diff --git a/libs/rebulk/test/test_pattern.py b/libs/rebulk/test/test_pattern.py new file mode 100644 index 00000000..fadca5f2 --- /dev/null +++ b/libs/rebulk/test/test_pattern.py @@ -0,0 +1,848 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, unbalanced-tuple-unpacking + +import re +import pytest + +from ..pattern import StringPattern, RePattern, FunctionalPattern, REGEX_AVAILABLE +from ..match import Match + +class TestStringPattern(object): + """ + Tests for StringPattern matching + """ + + 
input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph." + + def test_single(self): + pattern = StringPattern("Celtic") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_repr(self): + pattern = StringPattern("Celtic") + + assert repr(pattern) == '' + + def test_ignore_case(self): + pattern = StringPattern("celtic", ignore_case=False) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = StringPattern("celtic", ignore_case=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert matches[0].value == "Celtic" + + def test_private_names(self): + pattern = StringPattern("celtic", name="test", private_names=["test"], ignore_case=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert matches[0].private + + def test_ignore_names(self): + pattern = StringPattern("celtic", name="test", ignore_names=["test"], ignore_case=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def test_no_match(self): + pattern = StringPattern("Python") + + matches = list(pattern.matches(self.input_string)) + assert not matches + + def test_multiple_patterns(self): + pattern = StringPattern("playing", "annoyed", "Hebrew") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert 
matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_start_end_kwargs(self): + pattern = StringPattern("Abyssinian", start=20, end=40) + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 0 + + def test_matches_kwargs(self): + pattern = StringPattern("Abyssinian", name="test", value="AB") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "AB" + + +class TestRePattern(object): + """ + Tests for RePattern matching + """ + + input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph." + + def test_single_compiled(self): + pattern = RePattern(re.compile("Celt.?c")) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_string(self): + pattern = RePattern("Celt.?c") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_kwargs(self): + pattern = RePattern({"pattern": "celt.?c", "flags": re.IGNORECASE}) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert matches[0].value == "Celtic" + + def test_single_vargs(self): + pattern = RePattern(("celt.?c", re.IGNORECASE)) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (28, 34) + assert 
matches[0].value == "Celtic" + + def test_no_match(self): + pattern = RePattern("abc.?def") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def test_shortcuts(self): + pattern = RePattern("Celtic-violin", abbreviations=[("-", r"[\W_]+")]) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + pattern = RePattern({"pattern": "celtic-violin", "flags": re.IGNORECASE}, abbreviations=[("-", r"[\W_]+")]) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_multiple_patterns(self): + pattern = RePattern("pla.?ing", "ann.?yed", "Heb.?ew") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_unnamed_groups(self): + pattern = RePattern(r"(Celt.?c)\s+(\w+)") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + + assert isinstance(parent, Match) + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name is None + assert group1.value == "Celtic" + assert group1.parent == parent + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name is None + assert group2.value == "violin" 
+ assert group2.parent == parent + + def test_named_groups(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + + assert isinstance(parent, Match) + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert len(parent.children) == 2 + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + assert group1.parent == parent + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + assert group2.parent == parent + + def test_children(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 2 + group1, group2 = matches + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_children_parent_private(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", children=True, private_parent=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(parent, Match) + assert parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert not group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34)
+ assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert not group2.private + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_parent_children_private(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", private_children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(parent, Match) + assert not parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert group2.private + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_every(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", every=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + parent, group1, group2 = matches + + assert isinstance(parent, Match) + assert not parent.private + assert parent.pattern == pattern + assert parent.span == (28, 41) + assert parent.name is None + assert parent.value == "Celtic violin" + + assert isinstance(group1, Match) + assert not group1.private + assert group1.pattern == pattern + assert group1.span == (28, 34) + assert group1.name == "param1" + assert group1.value == "Celtic" + + assert isinstance(group2, Match) + assert not group2.private + assert group2.pattern == pattern + assert group2.span == (35, 41) + assert group2.name == "param2" + assert group2.value == "violin" + + def test_private_names(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)",
private_names=["param2"], children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 2 + assert matches[0].name == "param1" + assert not matches[0].private + assert matches[1].name == "param2" + assert matches[1].private + + def test_ignore_names(self): + pattern = RePattern(r"(?P<param1>Celt.?c)\s+(?P<param2>\w+)", ignore_names=["param2"], children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert matches[0].name == "param1" + + def test_matches_kwargs(self): + pattern = RePattern("He.rew", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + pattern = RePattern("H(e.)(rew)", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + children = matches[0].children + assert len(children) == 2 + assert children[0].name == "test" + assert children[0].value == "HE" + + assert children[1].name == "test" + assert children[1].value == "HE" + + pattern = RePattern("H(?P<first>e.)(?P<second>rew)", name="test", value="HE") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "HE" + + children = matches[0].children + assert len(children) == 2 + assert children[0].name == "first" + assert children[0].value == "HE" + + assert children[1].name == "second" + assert children[1].value == "HE" + + +class TestFunctionalPattern(object): + """ + Tests for FunctionalPattern matching + """ + + input_string = "An Abyssinian fly playing a Celtic violin was annoyed by trashy flags on " \ + "which were the Hebrew letter qoph."
+ + def test_single_vargs(self): + def func(input_string): + i = input_string.find("fly") + if i > -1: + return i, i + len("fly"), "fly", "functional" + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + def test_single_kwargs(self): + def func(input_string): + i = input_string.find("fly") + if i > -1: + return {"start": i, "end": i + len("fly"), "name": "functional"} + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + def test_multiple_objects(self): + def func(input_string): + i = input_string.find("fly") + matches = [] + if i > -1: + matches.append((i, i + len("fly"), {'name': "functional"})) + i = input_string.find("annoyed") + if i > -1: + matches.append((i, i + len("annoyed"))) + i = input_string.find("Hebrew") + if i > -1: + matches.append({"start": i, "end": i + len("Hebrew")}) + return matches + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_multiple_generator(self): 
+ def func(input_string): + i = input_string.find("fly") + if i > -1: + yield (i, i + len("fly"), {'name': "functional"}) + i = input_string.find("annoyed") + if i > -1: + yield (i, i + len("annoyed")) + i = input_string.find("Hebrew") + if i > -1: + yield (i, {"end": i + len("Hebrew")}) + + pattern = FunctionalPattern(func) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (14, 17) + assert matches[0].name == "functional" + assert matches[0].value == "fly" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) + assert matches[2].value == "Hebrew" + + def test_no_match(self): + pattern = FunctionalPattern(lambda x: None) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def test_multiple_patterns(self): + def playing(input_string): + i = input_string.find("playing") + if i > -1: + return i, i + len("playing") + + def annoyed(input_string): + i = input_string.find("annoyed") + if i > -1: + return i, i + len("annoyed") + + def hebrew(input_string): + i = input_string.find("Hebrew") + if i > -1: + return i, i + len("Hebrew") + + pattern = FunctionalPattern(playing, annoyed, hebrew) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 3 + + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (18, 25) + assert matches[0].value == "playing" + + assert isinstance(matches[1], Match) + assert matches[1].pattern == pattern + assert matches[1].span == (46, 53) + assert matches[1].value == "annoyed" + + assert isinstance(matches[2], Match) + assert matches[2].pattern == pattern + assert matches[2].span == (88, 94) 
+ assert matches[2].value == "Hebrew" + + def test_matches_kwargs(self): + def playing(input_string): + i = input_string.find("playing") + if i > -1: + return i, i + len("playing") + + pattern = FunctionalPattern(playing, name="test", value="PLAY") + matches = list(pattern.matches(self.input_string)) + + assert len(matches) == 1 + assert matches[0].name == "test" + assert matches[0].value == "PLAY" + + +class TestValue(object): + """ + Tests for value option + """ + + input_string = "This string contains 1849 a number" + + def test_str_value(self): + pattern = StringPattern("1849", name="dummy", value="test") + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == "test" + + def test_dict_child_value(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + formatter={'intParam': lambda x: int(x) * 2, + 'strParam': lambda x: "really " + x}, + format_all=True, + value={'intParam': 'INT_PARAM_VALUE'}) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + assert len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (12, 20) + assert group1.value == "really contains" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (21, 25) + assert group2.value == 'INT_PARAM_VALUE' + + def test_dict_default_value(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + formatter={'intParam': lambda x: int(x) * 2, + 'strParam': lambda x: "really " + x}, + format_all=True, + value={'__children__': 'CHILD', 'strParam': 'STR_VALUE', '__parent__': 'PARENT'}) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + assert parent.value == "PARENT" + assert
len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (12, 20) + assert group1.value == "STR_VALUE" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (21, 25) + assert group2.value == "CHILD" + + +class TestFormatter(object): + """ + Tests for formatter option + """ + + input_string = "This string contains 1849 a number" + + def test_single_string(self): + pattern = StringPattern("1849", name="dummy", formatter=lambda x: int(x) / 2) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 / 2 + + def test_single_re_no_group(self): + pattern = RePattern(r"\d+", formatter=lambda x: int(x) * 2) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 * 2 + + def test_single_re_named_groups(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + formatter={'intParam': lambda x: int(x) * 2, + 'strParam': lambda x: "really " + x}, format_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + parent = matches[0] + assert len(parent.children) == 2 + + group1, group2 = parent.children + + assert isinstance(group1, Match) + assert group1.pattern == pattern + assert group1.span == (12, 20) + assert group1.value == "really contains" + + assert isinstance(group2, Match) + assert group2.pattern == pattern + assert group2.span == (21, 25) + assert group2.value == 1849 * 2 + + def test_repeated_captures_option(self): + pattern = RePattern(r"\[(\d+)\](?:-(\d+))*") + + matches = list(pattern.matches("[02]-03-04-05-06")) + assert len(matches) == 1 + +
match = matches[0] + if REGEX_AVAILABLE: + assert len(match.children) == 5 + assert [child.value for child in match.children] == ["02", "03", "04", "05", "06"] + else: + assert len(match.children) == 2 + assert [child.value for child in match.children] == ["02", "06"] + + with pytest.raises(NotImplementedError): + RePattern(r"\[(\d+)\](?:-(\d+))*", repeated_captures=True) + + pattern = RePattern(r"\[(\d+)\](?:-(\d+))*", repeated_captures=False) + + matches = list(pattern.matches("[02]-03-04-05-06")) + assert len(matches) == 1 + + match = matches[0] + assert len(match.children) == 2 + assert [child.value for child in match.children] == ["02", "06"] + + def test_single_functional(self): + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + pattern = FunctionalPattern(digit, formatter=lambda x: int(x) * 3) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + assert isinstance(matches[0], Match) + assert matches[0].pattern == pattern + assert matches[0].span == (21, 25) + assert matches[0].value == 1849 * 3 + + +class TestValidator(object): + """ + Tests for validator option + """ + + input_string = "This string contains 1849 a number" + + @staticmethod + def true_validator(match): + return int(match.value) < 1850 + + @staticmethod + def false_validator(match): + return int(match.value) >= 1850 + + def test_single_string(self): + pattern = StringPattern("1849", name="dummy", validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = StringPattern("1849", validator=self.true_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_single_re_no_group(self): + pattern = RePattern(r"\d+", validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = RePattern(r"\d+", validator=self.true_validator) + + matches = 
list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_single_re_named_groups(self): + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + validator={'intParam': self.false_validator}, validate_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = RePattern(r"(?P<strParam>cont.?ins)\s+(?P<intParam>\d+)", + validator={'intParam': self.true_validator}, validate_all=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_validate_all(self): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=lambda match: match.value < 100, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=lambda match: match.value > 100, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def invalid_func(match): + if match.name == 'intParam': + return True + else: + return match.value.startswith('abc') + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=invalid_func, validate_all=True, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + def func(match): + if match.name == 'intParam': + return True + else: + return match.value.startswith('contains') + + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, validator=func, validate_all=True, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + + def test_format_all(self): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, + children=True) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 + for match in matches: + assert match.value is not None + + with pytest.raises(ValueError): + pattern = RePattern(r"contains (?P<intParam>\d+)", formatter=int, format_all=True) + matches = list(pattern.matches(self.input_string)) + for
match in matches: + assert match.value is not None + + def test_single_functional(self): + def digit(input_string): + i = input_string.find("1849") + if i > -1: + return i, i + len("1849") + + pattern = FunctionalPattern(digit, validator=self.false_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 0 + + pattern = FunctionalPattern(digit, validator=self.true_validator) + + matches = list(pattern.matches(self.input_string)) + assert len(matches) == 1 diff --git a/libs/rebulk/test/test_processors.py b/libs/rebulk/test/test_processors.py new file mode 100644 index 00000000..7afd4535 --- /dev/null +++ b/libs/rebulk/test/test_processors.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member + +from ..pattern import StringPattern, RePattern +from ..processors import ConflictSolver +from ..rules import execute_rule +from ..match import Matches + + +def test_conflict_1(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmn", "kl", "abcdef", "ab", "ef", "yz") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["ijklmn", "abcdef", "yz"] + + +def test_conflict_2(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmn", "jklmnopqrst") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["jklmnopqrst"] + + +def test_conflict_3(): + input_string = "abcdefghijklmnopqrstuvwxyz" + + pattern = StringPattern("ijklmnopqrst", "jklmnopqrst") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + + assert values == ["ijklmnopqrst"] + + +def test_conflict_4(): + input_string = "123456789" + + 
pattern = StringPattern("123", "456789") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + assert values == ["123", "456789"] + + +def test_conflict_5(): + input_string = "123456789" + + pattern = StringPattern("123456", "789") + matches = Matches(pattern.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + + values = [x.value for x in matches] + assert values == ["123456", "789"] + + +def test_prefer_longer_parent(): + input_string = "xxx.1x02.xxx" + + re1 = RePattern("([0-9]+)x([0-9]+)", name='prefer', children=True, formatter=int) + re2 = RePattern("x([0-9]+)", name='skip', children=True) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + assert matches[0].value == 1 + assert matches[1].value == 2 + + +def test_conflict_solver_1(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "2345678" + + +def test_conflict_solver_2(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: '__default__') + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_3(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: match) + re2 = StringPattern("34567") + + matches = 
Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_4(): + input_string = "123456789" + + re1 = StringPattern("2345678") + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_5(): + input_string = "123456789" + + re1 = StringPattern("2345678", conflict_solver=lambda match, conflicting: conflicting) + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "2345678" + + +def test_conflict_solver_6(): + input_string = "123456789" + + re1 = StringPattern("2345678") + re2 = StringPattern("34567", conflict_solver=lambda match, conflicting: conflicting) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "34567" + + +def test_conflict_solver_7(): + input_string = "102" + + re1 = StringPattern("102") + re2 = StringPattern("02") + + matches = Matches(re2.matches(input_string)) + matches.extend(re1.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 1 + assert matches[0].value == "102" + + +def test_unresolved(): + input_string = "123456789" + + re1 = StringPattern("23456") + re2 = StringPattern("34567") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + + re1 = 
StringPattern("34567") + re2 = StringPattern("2345678", conflict_solver=lambda match, conflicting: None) + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 + + re1 = StringPattern("34567", conflict_solver=lambda match, conflicting: None) + re2 = StringPattern("2345678") + + matches = Matches(re1.matches(input_string)) + matches.extend(re2.matches(input_string)) + + execute_rule(ConflictSolver(), matches, None) + assert len(matches) == 2 diff --git a/libs/rebulk/test/test_rebulk.py b/libs/rebulk/test/test_rebulk.py new file mode 100644 index 00000000..bf0bc966 --- /dev/null +++ b/libs/rebulk/test/test_rebulk.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, no-member + +from ..rebulk import Rebulk +from ..rules import Rule +from . import rebulk_rules_module as rm + + +def test_rebulk_simple(): + rebulk = Rebulk() + + rebulk.string("quick") + rebulk.regex("f.x") + + def func(input_string): + i = input_string.find("over") + if i > -1: + return i, i + len("over") + + rebulk.functional(func) + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 3 + + assert matches[0].value == "quick" + assert matches[1].value == "fox" + assert matches[2].value == "over" + + +def test_rebulk_composition(): + rebulk = Rebulk() + + rebulk.string("quick") + rebulk.rebulk(Rebulk().regex("f.x")) + + rebulk.rebulk(Rebulk(disabled=lambda context: True).functional(lambda string: None)) + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "fox" + + +def test_rebulk_context(): + rebulk = Rebulk() + + context = {'nostring': True, 'word': 'lazy'} + + 
rebulk.string("quick", disabled=lambda context: context.get('nostring', False)) + rebulk.regex("f.x", disabled=lambda context: context.get('noregex', False)) + + def func(input_string, context): + word = context.get('word', 'over') + i = input_string.find(word) + if i > -1: + return i, i + len(word) + + rebulk.functional(func) + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string, context) + assert len(matches) == 2 + + assert matches[0].value == "fox" + assert matches[1].value == "lazy" + + +def test_rebulk_prefer_longer(): + input_string = "The quick brown fox jumps over the lazy dog" + + matches = Rebulk().string("quick").string("own").regex("br.{2}n").matches(input_string) + + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "brown" + + +def test_rebulk_defaults(): + input_string = "The quick brown fox jumps over the lazy dog" + + def func(input_string): + i = input_string.find("fox") + if i > -1: + return i, i + len("fox") + + matches = Rebulk()\ + .string_defaults(name="string", tags=["a", "b"])\ + .regex_defaults(name="regex") \ + .functional_defaults(name="functional") \ + .string("quick", tags=["c"])\ + .functional(func)\ + .regex("br.{2}n") \ + .matches(input_string) + assert matches[0].name == "string" + assert matches[0].tags == ["a", "b", "c"] + assert matches[1].name == "functional" + assert matches[2].name == "regex" + + matches = Rebulk() \ + .defaults(name="default", tags=["0"])\ + .string_defaults(name="string", tags=["a", "b"]) \ + .functional_defaults(name="functional", tags=["1"]) \ + .string("quick", tags=["c"]) \ + .functional(func) \ + .regex("br.{2}n") \ + .matches(input_string) + assert matches[0].name == "string" + assert matches[0].tags == ["0", "a", "b", "c"] + assert matches[1].name == "functional" + assert matches[1].tags == ["0", "1"] + assert matches[2].name == "default" + assert matches[2].tags == ["0"] + + +def test_rebulk_rebulk(): + 
input_string = "The quick brown fox jumps over the lazy dog" + + base = Rebulk().string("quick") + child = Rebulk().string("own").regex("br.{2}n") + + matches = base.rebulk(child).matches(input_string) + + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "brown" + + +def test_rebulk_no_default(): + input_string = "The quick brown fox jumps over the lazy dog" + + matches = Rebulk(default_rules=False).string("quick").string("own").regex("br.{2}n").matches(input_string) + + assert len(matches) == 3 + + assert matches[0].value == "quick" + assert matches[1].value == "own" + assert matches[2].value == "brown" + + +def test_rebulk_empty_match(): + input_string = "The quick brown fox jumps over the lazy dog" + + matches = Rebulk(default_rules=False).string("quick").string("own").regex("br(.*?)own", children=True)\ + .matches(input_string) + + assert len(matches) == 2 + + assert matches[0].value == "quick" + assert matches[1].value == "own" + + +def test_rebulk_tags_names(): + rebulk = Rebulk() + + rebulk.string("quick", name="str", tags=["first", "other"]) + rebulk.regex("f.x", tags="other") + + def func(input_string): + i = input_string.find("over") + if i > -1: + return i, i + len("over"), {'tags': ['custom']} + + rebulk.functional(func, name="fn") + + def func2(input_string): + i = input_string.find("lazy") + if i > -1: + return {'start': i, 'end': i + len("lazy"), 'tags': ['custom']} + + rebulk.functional(func2, name="fn") + + input_string = "The quick brown fox jumps over the lazy dog" + + matches = rebulk.matches(input_string) + assert len(matches) == 4 + + assert len(matches.named("str")) == 1 + assert len(matches.named("fn")) == 2 + assert len(matches.named("false")) == 0 + assert len(matches.tagged("false")) == 0 + assert len(matches.tagged("first")) == 1 + assert len(matches.tagged("other")) == 2 + assert len(matches.tagged("custom")) == 2 + + +def test_rebulk_rules_1(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', 
name="year") + rebulk.rules(rm.RemoveAllButLastYear) + + matches = rebulk.matches("1984 keep only last 1968 entry 1982 case") + assert len(matches) == 1 + assert matches[0].value == "1982" + + +def test_rebulk_rules_2(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', name="year") + rebulk.string(r'year', name="yearPrefix", private=True) + rebulk.string(r'keep', name="yearSuffix", private=True) + rebulk.rules(rm.PrefixedSuffixedYear) + + matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") + assert len(matches) == 2 + assert matches[0].value == "1984" + assert matches[1].value == "1968" + + +def test_rebulk_rules_3(): + rebulk = Rebulk() + + rebulk.regex(r'\d{4}', name="year") + rebulk.string(r'year', name="yearPrefix", private=True) + rebulk.string(r'keep', name="yearSuffix", private=True) + rebulk.rules(rm.PrefixedSuffixedYearNoLambda) + + matches = rebulk.matches("Keep suffix 1984 keep prefixed year 1968 and remove the rest 1982") + assert len(matches) == 2 + assert matches[0].value == "1984" + assert matches[1].value == "1968" + + +def test_rebulk_rules_4(): + class FirstOnlyRule(Rule): + def when(self, matches, context): + grabbed = matches.named("grabbed", 0) + if grabbed and matches.previous(grabbed): + return grabbed + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk() + + rebulk.regex("This match (.*?)grabbed", name="grabbed") + rebulk.regex("if it's (.*?)first match", private=True) + + rebulk.rules(FirstOnlyRule) + + matches = rebulk.matches("This match is grabbed only if it's the first match") + assert len(matches) == 1 + assert matches[0].value == "This match is grabbed" + + matches = rebulk.matches("if it's NOT the first match, This match is NOT grabbed") + assert len(matches) == 0 + + +class TestMarkers(object): + def test_one_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = 
matches.markers.at_match(word_match, lambda marker: marker.name == "mark1", 0) + if not marker: + return word_match + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex(r'\[.*?\]', marker=True, name="mark2") \ + .string("word", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("grab (word) only if it's in parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "word" + + matches = rebulk.matches("don't grab [word] if it's in braket") + assert len(matches) == 0 + + matches = rebulk.matches("don't grab word at all") + assert len(matches) == 0 + + def test_multiple_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = matches.markers.at_match(word_match, + lambda marker: marker.name == "mark1" or marker.name == "mark2") + if len(marker) < 2: + return word_match + + def then(self, matches, when_response, context): + matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex(r'\[.*?\]', marker=True, name="mark2") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("[grab (word) only] if it's in parenthesis and brakets") + + assert len(matches) == 1 + assert matches[0].value == "word" + + matches = rebulk.matches("[don't grab](word)[if brakets are outside]") + assert len(matches) == 0 + + matches = rebulk.matches("(grab w[or)d even] if it's partially in parenthesis and brakets") + assert len(matches) == 1 + assert matches[0].value == "w[or)d" + + def test_at_index_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + word_match = matches.named("word", 0) + marker = matches.markers.at_index(word_match.start, + lambda marker: marker.name == "mark1", 0) + if not marker: + return word_match + + def then(self, matches, when_response, context): + 
matches.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("gr(ab wo)rd only if starting of match is inside parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "wo)rd" + + matches = rebulk.matches("don't grab wo(rd if starting of match is not inside parenthesis") + + assert len(matches) == 0 + + def test_remove_marker(self): + class MarkerRule(Rule): + def when(self, matches, context): + marker = matches.markers.named("mark1", 0) + if marker: + return marker + + def then(self, matches, when_response, context): + matches.markers.remove(when_response) + + rebulk = Rebulk().regex(r'\(.*?\)', marker=True, name="mark1") \ + .regex("w.*?d", name="word") \ + .rules(MarkerRule) + + matches = rebulk.matches("grab word event (if it's not) inside parenthesis") + + assert len(matches) == 1 + assert matches[0].value == "word" + + assert not matches.markers + + +class TestUnicode(object): + def test_rebulk_simple(self): + input_string = u"æ•æ·çš„æ£•色ç‹ç‹¸è·³éŽæ‡¶ç‹—" + + rebulk = Rebulk() + + rebulk.string(u"æ•") + rebulk.regex(u"æ·") + + def func(input_string): + i = input_string.find(u"çš„") + if i > -1: + return i, i + len(u"çš„") + + rebulk.functional(func) + + matches = rebulk.matches(input_string) + assert len(matches) == 3 + + assert matches[0].value == u"æ•" + assert matches[1].value == u"æ·" + assert matches[2].value == u"çš„" + + +class TestImmutable(object): + def test_starting(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = Rebulk().string("quick").string("over").string("fox").matches(input_string) + + for i in range(0, len(input_string)): + starting = matches.starting(i) + for match in list(starting): + starting.remove(match) + + assert len(matches) == 3 + + def test_ending(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = 
Rebulk().string("quick").string("over").string("fox").matches(input_string) + + for i in range(0, len(input_string)): + starting = matches.ending(i) + for match in list(starting): + starting.remove(match) + + assert len(matches) == 3 + + def test_named(self): + input_string = "The quick brown fox jumps over the lazy dog" + matches = Rebulk().defaults(name='test').string("quick").string("over").string("fox").matches(input_string) + + named = matches.named('test') + for match in list(named): + named.remove(match) + + assert len(named) == 0 + assert len(matches) == 3 diff --git a/libs/rebulk/test/test_rules.py b/libs/rebulk/test/test_rules.py new file mode 100644 index 00000000..47b6f5fc --- /dev/null +++ b/libs/rebulk/test/test_rules.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name, no-member +import pytest +from rebulk.test.default_rules_module import RuleRemove0, RuleAppend0, RuleRename0, RuleAppend1, RuleRemove1, \ + RuleRename1, RuleAppend2, RuleRename2, RuleAppend3, RuleRename3, RuleAppendTags0, RuleRemoveTags0, \ + RuleAppendTags1, RuleRemoveTags1 + +from ..rules import Rules +from ..match import Matches, Match + +from .rules_module import Rule1, Rule2, Rule3, Rule0, Rule1Disabled +from . 
import rules_module as rm + + +def test_rule_priority(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1, Rule2()) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 0 + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1(), Rule0) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 1 + assert matches[0] == Match(3, 4) + + +def test_rules_duplicates(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1, Rule1) + + with pytest.raises(ValueError): + rules.execute_all_rules(matches, {}) + + +def test_rule_disabled(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule1Disabled(), Rule2()) + + rules.execute_all_rules(matches, {}) + assert len(matches) == 2 + assert matches[0] == Match(1, 2) + assert matches[1] == Match(3, 4) + + +def test_rule_when(): + matches = Matches([Match(1, 2)]) + + rules = Rules(Rule3()) + + rules.execute_all_rules(matches, {'when': False}) + assert len(matches) == 1 + assert matches[0] == Match(1, 2) + + matches = Matches([Match(1, 2)]) + + rules.execute_all_rules(matches, {'when': True}) + assert len(matches) == 2 + assert matches[0] == Match(1, 2) + assert matches[1] == Match(3, 4) + + +class TestDefaultRules(object): + def test_remove(self): + rules = Rules(RuleRemove0) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 0 + + rules = Rules(RuleRemove1) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 0 + + def test_append(self): + rules = Rules(RuleAppend0) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + + rules = Rules(RuleAppend1) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + + rules = Rules(RuleAppend2) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + assert len(matches.named('renamed')) == 1 + + 
rules = Rules(RuleAppend3) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 2 + assert len(matches.named('renamed')) == 1 + + def test_rename(self): + rules = Rules(RuleRename0) + + matches = Matches([Match(1, 2, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 1 + assert len(matches.named('renamed')) == 0 + + rules = Rules(RuleRename1) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert len(matches.named('renamed')) == 1 + + rules = Rules(RuleRename2) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert len(matches.named('renamed')) == 1 + + rules = Rules(RuleRename3) + + matches = Matches([Match(5, 10, name='original')]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('original')) == 0 + assert len(matches.named('renamed')) == 1 + + def test_append_tags(self): + rules = Rules(RuleAppendTags0) + + matches = Matches([Match(1, 2, name='tags', tags=['other'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other', 'new-tag'] + + rules = Rules(RuleAppendTags1) + + matches = Matches([Match(1, 2, name='tags', tags=['other'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other', 'new-tag'] + + def test_remove_tags(self): + rules = Rules(RuleRemoveTags0) + + matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) + rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other'] + + rules = Rules(RuleRemoveTags1) + + matches = Matches([Match(1, 2, name='tags', tags=['other', 'new-tag'])]) + 
rules.execute_all_rules(matches, {}) + + assert len(matches.named('tags')) == 1 + assert matches.named('tags', index=0).tags == ['other'] + + +def test_rule_module(): + rules = Rules(rm) + + matches = Matches([Match(1, 2)]) + rules.execute_all_rules(matches, {}) + + assert len(matches) == 1 + + +def test_rule_repr(): + assert str(Rule0()) == "" + assert str(Rule1()) == "" + assert str(Rule2()) == "" + assert str(Rule1Disabled()) == "" diff --git a/libs/rebulk/test/test_toposort.py b/libs/rebulk/test/test_toposort.py new file mode 100644 index 00000000..76ea6031 --- /dev/null +++ b/libs/rebulk/test/test_toposort.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2014 True Blade Systems, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Original: +# - https://bitbucket.org/ericvsmith/toposort (1.4) +# Modifications: +# - port to pytest +# pylint: skip-file + +import pytest +from ..toposort import toposort, toposort_flatten, CyclicDependency + + +class TestCase(object): + def test_simple(self): + results = list(toposort({2: set([11]), 9: set([11, 8]), 10: set([11, 3]), 11: set([7, 5]), 8: set([7, 3])})) + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert results == expected + + # make sure self dependencies are ignored + results = list(toposort({2: set([2, 11]), 9: set([11, 8]), 10: set([10, 11, 3]), 11: set([7, 5]), 8: set([7, 3])})) + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert results == expected + + assert list(toposort({1: set()})) == [set([1])] + assert list(toposort({1: set([1])})) == [set([1])] + + def test_no_dependencies(self): + assert list(toposort({1: set([2]), 3: set([4]), 5: set([6])})) == [set([2, 4, 6]), set([1, 3, 5])] + assert list(toposort({1: set(), 3: set(), 5: set()})) == [set([1, 3, 
5])] + + def test_empty(self): + assert list(toposort({})) == [] + + def test_strings(self): + results = list(toposort({'2': set(['11']), '9': set(['11', '8']), '10': set(['11', '3']), '11': set(['7', '5']), '8': set(['7', '3'])})) + expected = [set(['3', '5', '7']), set(['8', '11']), set(['2', '9', '10'])] + assert results == expected + + def test_objects(self): + o2 = object() + o3 = object() + o5 = object() + o7 = object() + o8 = object() + o9 = object() + o10 = object() + o11 = object() + results = list(toposort({o2: set([o11]), o9: set([o11, o8]), o10: set([o11, o3]), o11: set([o7, o5]), o8: set([o7, o3, o8])})) + expected = [set([o3, o5, o7]), set([o8, o11]), set([o2, o9, o10])] + assert results == expected + + def test_cycle(self): + # a simple, 2 element cycle + with pytest.raises(CyclicDependency): + list(toposort({1: set([2]), 2: set([1])})) + + # an indirect cycle + with pytest.raises(CyclicDependency): + list(toposort({1: set([2]), 2: set([3]), 3: set([1])})) + + def test_input_not_modified(self): + data = {2: set([11]), + 9: set([11, 8]), + 10: set([11, 3]), + 11: set([7, 5]), + 8: set([7, 3, 8]), # includes something self-referential + } + orig = data.copy() + results = list(toposort(data)) + assert data == orig + + def test_input_not_modified_when_cycle_error(self): + data = {1: set([2]), + 2: set([1]), + 3: set([4]), + } + orig = data.copy() + with pytest.raises(CyclicDependency): + list(toposort(data)) + assert data == orig + + +class TestCaseAll(object): + def test_sort_flatten(self): + data = {2: set([11]), + 9: set([11, 8]), + 10: set([11, 3]), + 11: set([7, 5]), + 8: set([7, 3, 8]), # includes something self-referential + } + expected = [set([3, 5, 7]), set([8, 11]), set([2, 9, 10])] + assert list(toposort(data)) == expected + + # now check the sorted results + results = [] + for item in expected: + results.extend(sorted(item)) + assert toposort_flatten(data) == results + + # and the unsorted results. 
break the results up into groups to compare them + actual = toposort_flatten(data, False) + results = [set([i for i in actual[0:3]]), set([i for i in actual[3:5]]), set([i for i in actual[5:8]])] + assert results == expected diff --git a/libs/rebulk/test/test_validators.py b/libs/rebulk/test/test_validators.py new file mode 100644 index 00000000..38511cbf --- /dev/null +++ b/libs/rebulk/test/test_validators.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name + +from functools import partial + +from rebulk.pattern import StringPattern + +from ..validators import chars_before, chars_after, chars_surround, validators + +chars = ' _.' +left = partial(chars_before, chars) +right = partial(chars_after, chars) +surrounding = partial(chars_surround, chars) + + +def test_left_chars(): + matches = list(StringPattern("word", validator=left).matches("xxxwordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=left).matches("xxx_wordxxx")) + assert len(matches) == 1 + + matches = list(StringPattern("word", validator=left).matches("wordxxx")) + assert len(matches) == 1 + + +def test_right_chars(): + matches = list(StringPattern("word", validator=right).matches("xxxwordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=right).matches("xxxword.xxx")) + assert len(matches) == 1 + + matches = list(StringPattern("word", validator=right).matches("xxxword")) + assert len(matches) == 1 + + +def test_surrounding_chars(): + matches = list(StringPattern("word", validator=surrounding).matches("xxxword xxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=surrounding).matches("xxx.wordxxx")) + assert len(matches) == 0 + + matches = list(StringPattern("word", validator=surrounding).matches("xxx word_xxx")) + assert len(matches) == 1 + + matches = list(StringPattern("word", 
class CyclicDependency(ValueError):
    """
    Raised by toposort() when the items that remain all depend on each other.

    The mapping of unresolved items to their outstanding dependencies is kept
    on the ``cyclic`` attribute.
    """
    def __init__(self, cyclic):
        s = 'Cyclic dependencies exist among these items: {0}'.format(', '.join(repr(x) for x in cyclic.items()))
        super(CyclicDependency, self).__init__(s)
        self.cyclic = cyclic


def toposort(data):
    """
    Dependencies are expressed as a dictionary whose keys are items
    and whose values are a set of dependent items. Output is a generator of
    sets in topological order. The first set consists of items with no
    dependences, each subsequent set consists of items that depend upon
    items in the preceding sets.

    The input mapping and the sets it holds are never modified.

    :param data: mapping of item -> iterable of items it depends on
    :type data: dict
    :return: generator yielding one set of items per dependency level
    :rtype: generator
    :raises CyclicDependency: once every resolvable level has been yielded,
        if the remaining items depend on each other
    """

    # Special case empty input.
    if len(data) == 0:
        return

    # Work on private sets. dict.copy() would be shallow, so dropping self
    # dependencies in place would silently edit the caller's own sets.
    data = dict((item, set(dep) - set([item])) for item, dep in data.items())

    # Find all items that are only ever mentioned as a dependency.
    extra_items_in_deps = set().union(*data.values()) - set(data.keys())
    # Add empty dependences where needed.
    data.update(dict((item, set()) for item in extra_items_in_deps))
    while True:
        ordered = set(item for item, dep in data.items() if len(dep) == 0)
        if not ordered:
            break
        yield ordered
        data = dict((item, (dep - ordered))
                    for item, dep in data.items()
                    if item not in ordered)
    # Anything left never reached an empty dependency set: it is part of a cycle.
    if len(data) != 0:
        raise CyclicDependency(data)
def is_iterable(obj):
    """
    Are we being asked to look up a list of things, instead of a single thing?
    We check for the `__iter__` attribute so that this can cover types that
    don't have to be known by this module, such as NumPy arrays.

    Strings, however, should be considered as atomic values to look up, not
    iterables. That covers both text and byte strings: on Python 3 `bytes`
    exposes `__iter__`, so it has to be excluded explicitly to get the same
    answer as on Python 2 (where `bytes` is `str`).

    We don't need to check for the Python 2 `unicode` type, because it doesn't
    have an `__iter__` attribute anyway.

    :param obj: value to inspect
    :return: True if obj should be treated as a collection of values
    :rtype: bool
    """
    if isinstance(obj, (str, bytes)):
        return False
    return hasattr(obj, '__iter__') or isinstance(obj, GeneratorType)
class _Ref(object):
    """
    Reference for IdentitySet

    Wraps a value so that equality and hashing are based on the identity of
    the wrapped object (``is`` / ``id()``) instead of its own ``__eq__`` and
    ``__hash__``.
    """
    def __init__(self, value):
        self.value = value

    def __eq__(self, other):
        # Only another _Ref can be equal; let Python handle foreign types
        # instead of failing on a missing ``value`` attribute.
        if not isinstance(other, _Ref):
            return NotImplemented
        return self.value is other.value

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; keep them consistent.
        equal = self.__eq__(other)
        return equal if equal is NotImplemented else not equal

    def __hash__(self):
        return id(self.value)


class IdentitySet(MutableSet):  # pragma: no cover
    """
    Set based on identity

    Two elements are the same member only when they are the very same object;
    equal-but-distinct objects are stored separately.
    """
    def __init__(self, items=None):
        if items is None:
            items = []
        self.refs = set(map(_Ref, items))

    def __contains__(self, elem):
        return _Ref(elem) in self.refs

    def __iter__(self):
        return (ref.value for ref in self.refs)

    def __len__(self):
        return len(self.refs)

    def add(self, elem):
        self.refs.add(_Ref(elem))

    def discard(self, elem):
        self.refs.discard(_Ref(elem))

    def update(self, iterable):
        """
        Update set with iterable

        :param iterable: elements to add, one by one
        :type iterable: iterable
        :return: nothing
        :rtype: None
        """
        for elem in iterable:
            self.add(elem)

    def __repr__(self):  # pragma: no cover
        return "%s(%s)" % (type(self).__name__, list(self))
def validators(*chained_validators):
    """
    Build a single validator out of several validator functions.

    The returned callable feeds the match to each validator in the order
    given and stops at the first one that rejects it.

    :param chained_validators: validator callables, each taking the match
    :type chained_validators: callable
    :return: a validator returning True only when every validator accepts
    :rtype: callable
    """
    def validator_chain(match):  # pylint:disable=missing-docstring
        return all(check(match) for check in chained_validators)
    return validator_chain
import Error, ProviderError +from .extensions import provider_manager, refiner_manager +from .providers import Provider +from .score import compute_score, get_scores +from .subtitle import SUBTITLE_EXTENSIONS, Subtitle +from .video import VIDEO_EXTENSIONS, Episode, Movie, Video logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/libs/subliminal/api.py b/libs/subliminal/api.py deleted file mode 100644 index 47d6a2cb..00000000 --- a/libs/subliminal/api.py +++ /dev/null @@ -1,140 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -import collections -import io -import logging -import operator -import os.path -import babelfish -from .providers import ProviderPool -from .subtitle import get_subtitle_path - - -logger = logging.getLogger(__name__) - - -def list_subtitles(videos, languages, providers=None, provider_configs=None): - """List subtitles for `videos` with the given `languages` using the specified `providers` - - :param videos: videos to list subtitles for - :type videos: set of :class:`~subliminal.video.Video` - :param languages: languages of subtitles to search for - :type languages: set of :class:`babelfish.Language` - :param providers: providers to use, if not all - :type providers: list of string or None - :param provider_configs: configuration for providers - :type provider_configs: dict of provider name => provider constructor kwargs or None - :return: found subtitles - :rtype: dict of :class:`~subliminal.video.Video` => [:class:`~subliminal.subtitle.Subtitle`] - - """ - subtitles = collections.defaultdict(list) - with ProviderPool(providers, provider_configs) as pp: - for video in videos: - logger.info('Listing subtitles for %r', video) - video_subtitles = pp.list_subtitles(video, languages) - logger.info('Found %d subtitles total', len(video_subtitles)) - subtitles[video].extend(video_subtitles) - return subtitles - - -def download_subtitles(subtitles, provider_configs=None): - """Download subtitles - - 
:param subtitles: subtitles to download - :type subtitles: list of :class:`~subliminal.subtitle.Subtitle` - :param provider_configs: configuration for providers - :type provider_configs: dict of provider name => provider constructor kwargs or None - - """ - with ProviderPool(provider_configs=provider_configs) as pp: - for subtitle in subtitles: - logger.info('Downloading subtitle %r', subtitle) - pp.download_subtitle(subtitle) - - -def download_best_subtitles(videos, languages, providers=None, provider_configs=None, min_score=0, - hearing_impaired=False, single=False): - """Download the best subtitles for `videos` with the given `languages` using the specified `providers` - - :param videos: videos to download subtitles for - :type videos: set of :class:`~subliminal.video.Video` - :param languages: languages of subtitles to download - :type languages: set of :class:`babelfish.Language` - :param providers: providers to use for the search, if not all - :type providers: list of string or None - :param provider_configs: configuration for providers - :type provider_configs: dict of provider name => provider constructor kwargs or None - :param int min_score: minimum score for subtitles to download - :param bool hearing_impaired: download hearing impaired subtitles - :param bool single: do not download for videos with an undetermined subtitle language detected - - """ - downloaded_subtitles = collections.defaultdict(list) - with ProviderPool(providers, provider_configs) as pp: - for video in videos: - # filter - if single and babelfish.Language('und') in video.subtitle_languages: - logger.debug('Skipping video %r: undetermined language found') - continue - - # list - logger.info('Listing subtitles for %r', video) - video_subtitles = pp.list_subtitles(video, languages) - logger.info('Found %d subtitles total', len(video_subtitles)) - - # download - downloaded_languages = set() - for subtitle, score in sorted([(s, s.compute_score(video)) for s in video_subtitles], - 
key=operator.itemgetter(1), reverse=True): - if score < min_score: - logger.info('No subtitle with score >= %d', min_score) - break - if subtitle.hearing_impaired != hearing_impaired: - logger.debug('Skipping subtitle: hearing impaired != %r', hearing_impaired) - continue - if subtitle.language in downloaded_languages: - logger.debug('Skipping subtitle: %r already downloaded', subtitle.language) - continue - logger.info('Downloading subtitle %r with score %d', subtitle, score) - if pp.download_subtitle(subtitle): - downloaded_languages.add(subtitle.language) - downloaded_subtitles[video].append(subtitle) - if single or downloaded_languages == languages: - logger.debug('All languages downloaded') - break - return downloaded_subtitles - - -def save_subtitles(subtitles, single=False, directory=None, encoding=None): - """Save subtitles on disk next to the video or in a specific folder if `folder_path` is specified - - :param bool single: download with .srt extension if ``True``, add language identifier otherwise - :param directory: path to directory where to save the subtitles, if any - :type directory: string or None - :param encoding: encoding for the subtitles or ``None`` to use the original encoding - :type encoding: string or None - - """ - for video, video_subtitles in subtitles.items(): - saved_languages = set() - for video_subtitle in video_subtitles: - if video_subtitle.content is None: - logger.debug('Skipping subtitle %r: no content', video_subtitle) - continue - if video_subtitle.language in saved_languages: - logger.debug('Skipping subtitle %r: language already saved', video_subtitle) - continue - subtitle_path = get_subtitle_path(video.name, None if single else video_subtitle.language) - if directory is not None: - subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1]) - logger.info('Saving %r to %r', video_subtitle, subtitle_path) - if encoding is None: - with io.open(subtitle_path, 'wb') as f: - f.write(video_subtitle.content) - else: 
- with io.open(subtitle_path, 'w', encoding=encoding) as f: - f.write(video_subtitle.text) - saved_languages.add(video_subtitle.language) - if single: - break diff --git a/libs/subliminal/cache.py b/libs/subliminal/cache.py index 72fbe01b..244ba953 100644 --- a/libs/subliminal/cache.py +++ b/libs/subliminal/cache.py @@ -1,14 +1,7 @@ # -*- coding: utf-8 -*- import datetime -import inspect -from dogpile.cache import make_region # @UnresolvedImport -from dogpile.cache.backends.file import AbstractFileLock # @UnresolvedImport -from dogpile.cache.compat import string_type # @UnresolvedImport -from dogpile.core.readwrite_lock import ReadWriteMutex # @UnresolvedImport - -#: Subliminal's cache version -CACHE_VERSION = 1 +from dogpile.cache import make_region #: Expiration time for show caching SHOW_EXPIRATION_TIME = datetime.timedelta(weeks=3).total_seconds() @@ -16,45 +9,8 @@ SHOW_EXPIRATION_TIME = datetime.timedelta(weeks=3).total_seconds() #: Expiration time for episode caching EPISODE_EXPIRATION_TIME = datetime.timedelta(days=3).total_seconds() - -def subliminal_key_generator(namespace, fn, to_str=string_type): - """Add a :data:`CACHE_VERSION` to dogpile.cache's default function_key_generator""" - if namespace is None: - namespace = '%d:%s:%s' % (CACHE_VERSION, fn.__module__, fn.__name__) - else: - namespace = '%d:%s:%s|%s' % (CACHE_VERSION, fn.__module__, fn.__name__, namespace) - - args = inspect.getargspec(fn) - has_self = args[0] and args[0][0] in ('self', 'cls') - - def generate_key(*args, **kw): - if kw: - raise ValueError('Keyword arguments not supported') - if has_self: - args = args[1:] - return namespace + '|' + ' '.join(map(to_str, args)) - return generate_key +#: Expiration time for scraper searches +REFINER_EXPIRATION_TIME = datetime.timedelta(weeks=1).total_seconds() -class MutexLock(AbstractFileLock): - """:class:`MutexLock` is a thread-based rw lock based on :class:`dogpile.core.ReadWriteMutex`""" - def __init__(self, filename): - self.mutex = 
ReadWriteMutex() - - def acquire_read_lock(self, wait): - ret = self.mutex.acquire_read_lock(wait) - return wait or ret - - def acquire_write_lock(self, wait): - ret = self.mutex.acquire_write_lock(wait) - return wait or ret - - def release_read_lock(self): - return self.mutex.release_read_lock() - - def release_write_lock(self): - return self.mutex.release_write_lock() - - -#: The dogpile.cache region -region = make_region(function_key_generator=subliminal_key_generator) +region = make_region() diff --git a/libs/subliminal/cli.py b/libs/subliminal/cli.py index cabcdfc8..cc24853c 100644 --- a/libs/subliminal/cli.py +++ b/libs/subliminal/cli.py @@ -1,197 +1,461 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals, print_function -import argparse -import datetime +""" +Subliminal uses `click `_ to provide a powerful :abbr:`CLI (command-line interface)`. + +""" +from __future__ import division +from collections import defaultdict +from datetime import timedelta +import glob +import json import logging import os import re -import sys -import babelfish -import xdg.BaseDirectory -from subliminal import (__version__, cache_region, MutexLock, provider_manager, Video, Episode, Movie, scan_videos, - download_best_subtitles, save_subtitles) -try: - import colorlog -except ImportError: - colorlog = None + +from appdirs import AppDirs +from babelfish import Error as BabelfishError, Language +import click +from dogpile.cache.backends.file import AbstractFileLock +from dogpile.util.readwrite_lock import ReadWriteMutex +from six.moves import configparser + +from subliminal import (AsyncProviderPool, Episode, Movie, Video, __version__, check_video, compute_score, get_scores, + provider_manager, refine, refiner_manager, region, save_subtitles, scan_video, scan_videos) +from subliminal.core import ARCHIVE_EXTENSIONS, search_external_subtitles + +logger = logging.getLogger(__name__) -DEFAULT_CACHE_FILE = os.path.join(xdg.BaseDirectory.save_cache_path('subliminal'), 
class MutexLock(AbstractFileLock):
    """Thread-based read/write lock backed by a :class:`ReadWriteMutex`.

    Every acquire/release call is forwarded to the wrapped mutex.

    :param str filename: accepted to match the lock interface; not used here.

    """
    def __init__(self, filename):
        self.mutex = ReadWriteMutex()

    def acquire_read_lock(self, wait):
        # The mutex is always asked first; the reply only matters when not waiting.
        acquired = self.mutex.acquire_read_lock(wait)
        return wait if wait else acquired

    def acquire_write_lock(self, wait):
        # Same contract as acquire_read_lock, for the write side.
        acquired = self.mutex.acquire_write_lock(wait)
        return wait if wait else acquired

    def release_read_lock(self):
        return self.mutex.release_read_lock()

    def release_write_lock(self):
        return self.mutex.release_write_lock()
- # filtering - filtering_group = parser.add_argument_group('filtering') - filtering_group.add_argument('-p', '--providers', nargs='+', metavar='PROVIDER', - help='providers to use (%s)' % ', '.join(provider_manager.available_providers)) - filtering_group.add_argument('-m', '--min-score', type=int, default=0, - help='minimum score for subtitles (0-%d for episodes, 0-%d for movies)' - % (Episode.scores['hash'], Movie.scores['hash'])) - filtering_group.add_argument('-a', '--age', help='download subtitles for videos newer than AGE e.g. 12h, 1w2d') - filtering_group.add_argument('-h', '--hearing-impaired', action='store_true', - help='download hearing impaired subtitles') - filtering_group.add_argument('-f', '--force', action='store_true', - help='force subtitle download for videos with existing subtitles') + """ + def __init__(self, path): + #: Path to the configuration file + self.path = path - # addic7ed - addic7ed_group = parser.add_argument_group('addic7ed') - addic7ed_group.add_argument('--addic7ed-username', metavar='USERNAME', help='username for addic7ed provider') - addic7ed_group.add_argument('--addic7ed-password', metavar='PASSWORD', help='password for addic7ed provider') + #: The underlying configuration object + self.config = configparser.SafeConfigParser() + self.config.add_section('general') + self.config.set('general', 'languages', json.dumps(['en'])) + self.config.set('general', 'providers', json.dumps(sorted([p.name for p in provider_manager]))) + self.config.set('general', 'refiners', json.dumps(sorted([r.name for r in refiner_manager]))) + self.config.set('general', 'single', str(0)) + self.config.set('general', 'embedded_subtitles', str(1)) + self.config.set('general', 'age', str(int(timedelta(weeks=2).total_seconds()))) + self.config.set('general', 'hearing_impaired', str(1)) + self.config.set('general', 'min_score', str(0)) - # output - output_group = parser.add_argument_group('output') - output_group.add_argument('-d', '--directory', - 
help='save subtitles in the given directory rather than next to the video') - output_group.add_argument('-e', '--encoding', default=None, - help='encoding to convert the subtitle to (default: no conversion)') - output_exclusive_group = output_group.add_mutually_exclusive_group() - output_exclusive_group.add_argument('-q', '--quiet', action='store_true', help='disable output') - output_exclusive_group.add_argument('-v', '--verbose', action='store_true', help='verbose output') - output_group.add_argument('--log-file', help='log into a file instead of stdout') - output_group.add_argument('--color', action='store_true', help='add color to console output (requires colorlog)') + def read(self): + """Read the configuration from :attr:`path`""" + self.config.read(self.path) - # troubleshooting - troubleshooting_group = parser.add_argument_group('troubleshooting') - troubleshooting_group.add_argument('--debug', action='store_true', help='debug output') - troubleshooting_group.add_argument('--version', action='version', version=__version__) - troubleshooting_group.add_argument('--help', action='help', help='show this help message and exit') + def write(self): + """Write the configuration to :attr:`path`""" + with open(self.path, 'w') as f: + self.config.write(f) - # parse args - args = parser.parse_args() + @property + def languages(self): + return {Language.fromietf(l) for l in json.loads(self.config.get('general', 'languages'))} - # parse paths - try: - args.paths = [os.path.abspath(os.path.expanduser(p.decode('utf-8') if isinstance(p, bytes) else p)) - for p in args.paths] - except UnicodeDecodeError: - parser.error('argument paths: encodings is not utf-8: %r' % args.paths) + @languages.setter + def languages(self, value): + self.config.set('general', 'languages', json.dumps(sorted([str(l) for l in value]))) - # parse languages - try: - args.languages = {babelfish.Language.fromietf(l) for l in args.languages} - except babelfish.Error: - parser.error('argument 
-l/--languages: codes are not IETF: %r' % args.languages) + @property + def providers(self): + return json.loads(self.config.get('general', 'providers')) - # parse age - if args.age is not None: - match = re.match(r'^(?:(?P\d+?)w)?(?:(?P\d+?)d)?(?:(?P\d+?)h)?$', args.age) + @providers.setter + def providers(self, value): + self.config.set('general', 'providers', json.dumps(sorted([p.lower() for p in value]))) + + @property + def refiners(self): + return json.loads(self.config.get('general', 'refiners')) + + @refiners.setter + def refiners(self, value): + self.config.set('general', 'refiners', json.dumps([r.lower() for r in value])) + + @property + def single(self): + return self.config.getboolean('general', 'single') + + @single.setter + def single(self, value): + self.config.set('general', 'single', str(int(value))) + + @property + def embedded_subtitles(self): + return self.config.getboolean('general', 'embedded_subtitles') + + @embedded_subtitles.setter + def embedded_subtitles(self, value): + self.config.set('general', 'embedded_subtitles', str(int(value))) + + @property + def age(self): + return timedelta(seconds=self.config.getint('general', 'age')) + + @age.setter + def age(self, value): + self.config.set('general', 'age', str(int(value.total_seconds()))) + + @property + def hearing_impaired(self): + return self.config.getboolean('general', 'hearing_impaired') + + @hearing_impaired.setter + def hearing_impaired(self, value): + self.config.set('general', 'hearing_impaired', str(int(value))) + + @property + def min_score(self): + return self.config.getfloat('general', 'min_score') + + @min_score.setter + def min_score(self, value): + self.config.set('general', 'min_score', str(value)) + + @property + def provider_configs(self): + rv = {} + for provider in provider_manager: + if self.config.has_section(provider.name): + rv[provider.name] = {k: v for k, v in self.config.items(provider.name)} + return rv + + @provider_configs.setter + def provider_configs(self, 
class AgeParamType(click.ParamType):
    """:class:`~click.ParamType` for age strings that returns a :class:`~datetime.timedelta`

    An age string is in the form `number + identifier` with possible identifiers:

        * ``w`` for weeks
        * ``d`` for days
        * ``h`` for hours

    The form can be specified multiple times but only with that identifier ordering. For example:

        * ``1w2d4h`` for 1 week, 2 days and 4 hours
        * ``2w`` for 2 weeks
        * ``3w6h`` for 3 weeks and 6 hours

    """
    name = 'age'

    def convert(self, value, param, ctx):
        # The group names double as timedelta keyword arguments, so they must
        # be exactly ``weeks``, ``days`` and ``hours``.
        match = re.match(r'^(?:(?P<weeks>\d+?)w)?(?:(?P<days>\d+?)d)?(?:(?P<hours>\d+?)h)?$', value)
        if not match:
            self.fail('%s is not a valid age' % value)

        # groupdict(0) substitutes 0 for every unit missing from the string.
        return timedelta(**{k: int(v) for k, v in match.groupdict(0).items()})
@click.group(context_settings={'max_content_width': 100}, epilog='Suggestions and bug reports are greatly appreciated: '
             'https://github.com/Diaoul/subliminal/')
@click.option('--addic7ed', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='Addic7ed configuration.')
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
              help='OpenSubtitles configuration.')
@click.option('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.')
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
              show_default=True, expose_value=True, help='Path to the cache directory.')
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
@click.version_option(__version__)
@click.pass_context
def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug):
    """Subtitles, faster than your thoughts."""
    # Ensure the cache directory exists; an OSError is re-raised only when the
    # path is still not a directory afterwards.
    try:
        os.makedirs(cache_dir)
    except OSError:
        if not os.path.isdir(cache_dir):
            raise

    # Back the shared cache region with a dbm file inside the cache directory.
    dbm_path = os.path.join(cache_dir, cache_file)
    region.configure('dogpile.cache.dbm', expiration_time=timedelta(days=30),
                     arguments={'filename': dbm_path, 'lock_factory': MutexLock})

    # With --debug, attach a stream handler to the 'subliminal' logger at DEBUG level.
    if debug:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
        package_logger = logging.getLogger('subliminal')
        package_logger.addHandler(stream_handler)
        package_logger.setLevel(logging.DEBUG)

    # Store the credentials that were actually supplied, keyed by provider name,
    # on the context object for the sub-commands.
    credentials = (
        ('addic7ed', addic7ed),
        ('legendastv', legendastv),
        ('opensubtitles', opensubtitles),
        ('subscenter', subscenter),
    )
    ctx.obj = {'provider_configs': {}}
    for provider_name, pair in credentials:
        if pair:
            ctx.obj['provider_configs'][provider_name] = {'username': pair[0], 'password': pair[1]}
Do not use this unless your media player requires it.') +@click.option('-f', '--force', is_flag=True, default=False, help='Force download even if a subtitle already exist.') +@click.option('-hi', '--hearing-impaired', is_flag=True, default=False, help='Prefer hearing impaired subtitles.') +@click.option('-m', '--min-score', type=click.IntRange(0, 100), default=0, help='Minimum score for a subtitle ' + 'to be downloaded (0 to 100).') +@click.option('-w', '--max-workers', type=click.IntRange(1, 50), default=None, help='Maximum number of threads to use.') +@click.option('-z/-Z', '--archives/--no-archives', default=True, show_default=True, help='Scan archives for videos ' + '(supported extensions: %s).' % ', '.join(ARCHIVE_EXTENSIONS)) +@click.option('-v', '--verbose', count=True, help='Increase verbosity.') +@click.argument('path', type=click.Path(), required=True, nargs=-1) +@click.pass_obj +def download(obj, provider, refiner, language, age, directory, encoding, single, force, hearing_impaired, min_score, + max_workers, archives, verbose, path): + """Download best subtitles. + + PATH can be an directory containing videos, a video file path or a video file name. It can be used multiple times. + + If an existing subtitle is detected (external or embedded) in the correct language, the download is skipped for + the associated video. 
+ + """ + # process parameters + language = set(language) # scan videos - videos = scan_videos([p for p in args.paths if os.path.exists(p)], subtitles=not args.force, - embedded_subtitles=not args.force, age=args.age) + videos = [] + ignored_videos = [] + errored_paths = [] + with click.progressbar(path, label='Collecting videos', item_show_func=lambda p: p or '') as bar: + for p in bar: + logger.debug('Collecting path %s', p) - # guess videos - videos.extend([Video.fromname(p) for p in args.paths if not os.path.exists(p)]) + # non-existing + if not os.path.exists(p): + try: + video = Video.fromname(p) + except: + logger.exception('Unexpected error while collecting non-existing path %s', p) + errored_paths.append(p) + continue + if not force: + video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values()) + refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force) + videos.append(video) + continue + + # directories + if os.path.isdir(p): + try: + scanned_videos = scan_videos(p, age=age, archives=archives) + except: + logger.exception('Unexpected error while collecting directory path %s', p) + errored_paths.append(p) + continue + for video in scanned_videos: + if not force: + video.subtitle_languages |= set(search_external_subtitles(video.name, + directory=directory).values()) + if check_video(video, languages=language, age=age, undefined=single): + refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force) + videos.append(video) + else: + ignored_videos.append(video) + continue + + # other inputs + try: + video = scan_video(p) + except: + logger.exception('Unexpected error while collecting path %s', p) + errored_paths.append(p) + continue + if not force: + video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values()) + if check_video(video, languages=language, age=age, undefined=single): + refine(video, 
episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force) + videos.append(video) + else: + ignored_videos.append(video) + + # output errored paths + if verbose > 0: + for p in errored_paths: + click.secho('%s errored' % p, fg='red') + + # output ignored videos + if verbose > 1: + for video in ignored_videos: + click.secho('%s ignored - subtitles: %s / age: %d day%s' % ( + os.path.split(video.name)[1], + ', '.join(str(s) for s in video.subtitle_languages) or 'none', + video.age.days, + 's' if video.age.days > 1 else '' + ), fg='yellow') + + # report collected videos + click.echo('%s video%s collected / %s video%s ignored / %s error%s' % ( + click.style(str(len(videos)), bold=True, fg='green' if videos else None), + 's' if len(videos) > 1 else '', + click.style(str(len(ignored_videos)), bold=True, fg='yellow' if ignored_videos else None), + 's' if len(ignored_videos) > 1 else '', + click.style(str(len(errored_paths)), bold=True, fg='red' if errored_paths else None), + 's' if len(errored_paths) > 1 else '', + )) + + # exit if no video collected + if not videos: + return # download best subtitles - subtitles = download_best_subtitles(videos, args.languages, providers=args.providers, - provider_configs=provider_configs, min_score=args.min_score, - hearing_impaired=args.hearing_impaired, single=args.single) + downloaded_subtitles = defaultdict(list) + with AsyncProviderPool(max_workers=max_workers, providers=provider, provider_configs=obj['provider_configs']) as p: + with click.progressbar(videos, label='Downloading subtitles', + item_show_func=lambda v: os.path.split(v.name)[1] if v is not None else '') as bar: + for v in bar: + scores = get_scores(v) + subtitles = p.download_best_subtitles(p.list_subtitles(v, language - v.subtitle_languages), + v, language, min_score=scores['hash'] * min_score / 100, + hearing_impaired=hearing_impaired, only_one=single) + downloaded_subtitles[v] = subtitles + + if p.discarded_providers: + click.secho('Some 
providers have been discarded due to unexpected errors: %s' % + ', '.join(p.discarded_providers), fg='yellow') # save subtitles - save_subtitles(subtitles, single=args.single, directory=args.directory, encoding=args.encoding) + total_subtitles = 0 + for v, subtitles in downloaded_subtitles.items(): + saved_subtitles = save_subtitles(v, subtitles, single=single, directory=directory, encoding=encoding) + total_subtitles += len(saved_subtitles) - # result output - if not subtitles: - if not args.quiet: - print('No subtitles downloaded', file=sys.stderr) - exit(1) - if not args.quiet: - subtitles_count = sum([len(s) for s in subtitles.values()]) - if subtitles_count == 1: - print('%d subtitle downloaded' % subtitles_count) - else: - print('%d subtitles downloaded' % subtitles_count) + if verbose > 0: + click.echo('%s subtitle%s downloaded for %s' % (click.style(str(len(saved_subtitles)), bold=True), + 's' if len(saved_subtitles) > 1 else '', + os.path.split(v.name)[1])) + + if verbose > 1: + for s in saved_subtitles: + matches = s.get_matches(v) + score = compute_score(s, v) + + # score color + score_color = None + scores = get_scores(v) + if isinstance(v, Movie): + if score < scores['title']: + score_color = 'red' + elif score < scores['title'] + scores['year'] + scores['release_group']: + score_color = 'yellow' + else: + score_color = 'green' + elif isinstance(v, Episode): + if score < scores['series'] + scores['season'] + scores['episode']: + score_color = 'red' + elif score < scores['series'] + scores['season'] + scores['episode'] + scores['release_group']: + score_color = 'yellow' + else: + score_color = 'green' + + # scale score from 0 to 100 taking out preferences + scaled_score = score + if s.hearing_impaired == hearing_impaired: + scaled_score -= scores['hearing_impaired'] + scaled_score *= 100 / scores['hash'] + + # echo some nice colored output + click.echo(' - [{score}] {language} subtitle from {provider_name} (match on {matches})'.format( + 
score=click.style('{:5.1f}'.format(scaled_score), fg=score_color, bold=score >= scores['hash']), + language=s.language.name if s.language.country is None else '%s (%s)' % (s.language.name, + s.language.country.name), + provider_name=s.provider_name, + matches=', '.join(sorted(matches, key=scores.get, reverse=True)) + )) + + if verbose == 0: + click.echo('Downloaded %s subtitle%s' % (click.style(str(total_subtitles), bold=True), + 's' if total_subtitles > 1 else '')) diff --git a/libs/subliminal/compat.py b/libs/subliminal/compat.py deleted file mode 100644 index 28bd3e84..00000000 --- a/libs/subliminal/compat.py +++ /dev/null @@ -1,21 +0,0 @@ -# -*- coding: utf-8 -*- -import sys -import socket - - -if sys.version_info[0] == 2: - from xmlrpclib import ServerProxy, Transport - from httplib import HTTPConnection -elif sys.version_info[0] == 3: - from xmlrpc.client import ServerProxy, Transport - from http.client import HTTPConnection - - -class TimeoutTransport(Transport, object): - def __init__(self, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *args, **kwargs): - super(TimeoutTransport, self).__init__(*args, **kwargs) - self.timeout = timeout - - def make_connection(self, host): - h = HTTPConnection(host, timeout=self.timeout) - return h diff --git a/libs/subliminal/converters/addic7ed.py b/libs/subliminal/converters/addic7ed.py index 0e862931..f9cb8316 100644 --- a/libs/subliminal/converters/addic7ed.py +++ b/libs/subliminal/converters/addic7ed.py @@ -1,12 +1,11 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals from babelfish import LanguageReverseConverter, language_converters class Addic7edConverter(LanguageReverseConverter): def __init__(self): self.name_converter = language_converters['name'] - self.from_addic7ed = {'Català': ('cat',), 'Chinese (Simplified)': ('zho',), 'Chinese (Traditional)': ('zho',), + self.from_addic7ed = {u'Català': ('cat',), 'Chinese (Simplified)': ('zho',), 'Chinese (Traditional)': ('zho',), 'Euskera': ('eus',), 'Galego': 
('glg',), 'Greek': ('ell',), 'Malay': ('msa',), 'Portuguese (Brazilian)': ('por', 'BR'), 'Serbian (Cyrillic)': ('srp', None, 'Cyrl'), 'Serbian (Latin)': ('srp',), 'Spanish (Latin America)': ('spa',), @@ -23,9 +22,11 @@ class Addic7edConverter(LanguageReverseConverter): return self.to_addic7ed[(alpha3, country)] if (alpha3,) in self.to_addic7ed: return self.to_addic7ed[(alpha3,)] + return self.name_converter.convert(alpha3, country, script) def reverse(self, addic7ed): if addic7ed in self.from_addic7ed: return self.from_addic7ed[addic7ed] + return self.name_converter.reverse(addic7ed) diff --git a/libs/subliminal/converters/legendastv.py b/libs/subliminal/converters/legendastv.py new file mode 100644 index 00000000..c2e13bd3 --- /dev/null +++ b/libs/subliminal/converters/legendastv.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +from babelfish import LanguageReverseConverter + +from ..exceptions import ConfigurationError + + +class LegendasTVConverter(LanguageReverseConverter): + def __init__(self): + self.from_legendastv = {1: ('por', 'BR'), 2: ('eng',), 3: ('spa',), 4: ('fra',), 5: ('deu',), 6: ('jpn',), + 7: ('dan',), 8: ('nor',), 9: ('swe',), 10: ('por',), 11: ('ara',), 12: ('ces',), + 13: ('zho',), 14: ('kor',), 15: ('bul',), 16: ('ita',), 17: ('pol',)} + self.to_legendastv = {v: k for k, v in self.from_legendastv.items()} + self.codes = set(self.from_legendastv.keys()) + + def convert(self, alpha3, country=None, script=None): + if (alpha3, country) in self.to_legendastv: + return self.to_legendastv[(alpha3, country)] + if (alpha3,) in self.to_legendastv: + return self.to_legendastv[(alpha3,)] + + raise ConfigurationError('Unsupported language code for legendastv: %s, %s, %s' % (alpha3, country, script)) + + def reverse(self, legendastv): + if legendastv in self.from_legendastv: + return self.from_legendastv[legendastv] + + raise ConfigurationError('Unsupported language number for legendastv: %s' % legendastv) diff --git a/libs/subliminal/converters/podnapisi.py 
b/libs/subliminal/converters/podnapisi.py deleted file mode 100644 index d73cb1c1..00000000 --- a/libs/subliminal/converters/podnapisi.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -from babelfish import LanguageReverseConverter, LanguageConvertError, LanguageReverseError - - -class PodnapisiConverter(LanguageReverseConverter): - def __init__(self): - self.from_podnapisi = {2: ('eng',), 28: ('spa',), 26: ('pol',), 36: ('srp',), 1: ('slv',), 38: ('hrv',), - 9: ('ita',), 8: ('fra',), 48: ('por', 'BR'), 23: ('nld',), 12: ('ara',), 13: ('ron',), - 33: ('bul',), 32: ('por',), 16: ('ell',), 15: ('hun',), 31: ('fin',), 30: ('tur',), - 7: ('ces',), 25: ('swe',), 27: ('rus',), 24: ('dan',), 22: ('heb',), 51: ('vie',), - 52: ('fas',), 5: ('deu',), 14: ('spa', 'AR'), 54: ('ind',), 47: ('srp', None, 'Cyrl'), - 3: ('nor',), 20: ('est',), 10: ('bos',), 17: ('zho',), 37: ('slk',), 35: ('mkd',), - 11: ('jpn',), 4: ('kor',), 29: ('sqi',), 6: ('isl',), 19: ('lit',), 46: ('ukr',), - 44: ('tha',), 53: ('cat',), 56: ('sin',), 21: ('lav',), 40: ('cmn',), 55: ('msa',), - 42: ('hin',), 50: ('bel',)} - self.to_podnapisi = {v: k for k, v in self.from_podnapisi.items()} - self.codes = set(self.from_podnapisi.keys()) - - def convert(self, alpha3, country=None, script=None): - if (alpha3,) in self.to_podnapisi: - return self.to_podnapisi[(alpha3,)] - if (alpha3, country) in self.to_podnapisi: - return self.to_podnapisi[(alpha3, country)] - if (alpha3, country, script) in self.to_podnapisi: - return self.to_podnapisi[(alpha3, country, script)] - raise LanguageConvertError(alpha3, country, script) - - def reverse(self, podnapisi): - if podnapisi not in self.from_podnapisi: - raise LanguageReverseError(podnapisi) - return self.from_podnapisi[podnapisi] diff --git a/libs/subliminal/converters/shooter.py b/libs/subliminal/converters/shooter.py new file mode 100644 index 00000000..ac6431a6 --- /dev/null +++ b/libs/subliminal/converters/shooter.py 
@@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +from babelfish import LanguageReverseConverter + +from ..exceptions import ConfigurationError + + +class ShooterConverter(LanguageReverseConverter): + def __init__(self): + self.from_shooter = {'chn': ('zho',), 'eng': ('eng',)} + self.to_shooter = {v: k for k, v in self.from_shooter.items()} + self.codes = set(self.from_shooter.keys()) + + def convert(self, alpha3, country=None, script=None): + if (alpha3,) in self.to_shooter: + return self.to_shooter[(alpha3,)] + + raise ConfigurationError('Unsupported language for shooter: %s, %s, %s' % (alpha3, country, script)) + + def reverse(self, shooter): + if shooter in self.from_shooter: + return self.from_shooter[shooter] + + raise ConfigurationError('Unsupported language code for shooter: %s' % shooter) diff --git a/libs/subliminal/converters/thesubdb.py b/libs/subliminal/converters/thesubdb.py new file mode 100644 index 00000000..58051afb --- /dev/null +++ b/libs/subliminal/converters/thesubdb.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +from babelfish import LanguageReverseConverter + +from ..exceptions import ConfigurationError + + +class TheSubDBConverter(LanguageReverseConverter): + def __init__(self): + self.from_thesubdb = {'en': ('eng',), 'es': ('spa',), 'fr': ('fra',), 'it': ('ita',), 'nl': ('nld',), + 'pl': ('pol',), 'pt': ('por', 'BR'), 'ro': ('ron',), 'sv': ('swe',), 'tr': ('tur',)} + self.to_thesubdb = {v: k for k, v in self.from_thesubdb.items()} + self.codes = set(self.from_thesubdb.keys()) + + def convert(self, alpha3, country=None, script=None): + if (alpha3, country) in self.to_thesubdb: + return self.to_thesubdb[(alpha3, country)] + if (alpha3,) in self.to_thesubdb: + return self.to_thesubdb[(alpha3,)] + + raise ConfigurationError('Unsupported language for thesubdb: %s, %s, %s' % (alpha3, country, script)) + + def reverse(self, thesubdb): + if thesubdb in self.from_thesubdb: + return self.from_thesubdb[thesubdb] + + raise ConfigurationError('Unsupported 
language code for thesubdb: %s' % thesubdb) diff --git a/libs/subliminal/converters/tvsubtitles.py b/libs/subliminal/converters/tvsubtitles.py index e9b7e74f..45b9fed1 100644 --- a/libs/subliminal/converters/tvsubtitles.py +++ b/libs/subliminal/converters/tvsubtitles.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals from babelfish import LanguageReverseConverter, language_converters @@ -8,7 +7,7 @@ class TVsubtitlesConverter(LanguageReverseConverter): self.alpha2_converter = language_converters['alpha2'] self.from_tvsubtitles = {'br': ('por', 'BR'), 'ua': ('ukr',), 'gr': ('ell',), 'cn': ('zho',), 'jp': ('jpn',), 'cz': ('ces',)} - self.to_tvsubtitles = {v: k for k, v in self.from_tvsubtitles} + self.to_tvsubtitles = {v: k for k, v in self.from_tvsubtitles.items()} self.codes = self.alpha2_converter.codes | set(self.from_tvsubtitles.keys()) def convert(self, alpha3, country=None, script=None): @@ -16,9 +15,11 @@ class TVsubtitlesConverter(LanguageReverseConverter): return self.to_tvsubtitles[(alpha3, country)] if (alpha3,) in self.to_tvsubtitles: return self.to_tvsubtitles[(alpha3,)] + return self.alpha2_converter.convert(alpha3, country, script) def reverse(self, tvsubtitles): if tvsubtitles in self.from_tvsubtitles: return self.from_tvsubtitles[tvsubtitles] + return self.alpha2_converter.reverse(tvsubtitles) diff --git a/libs/subliminal/core.py b/libs/subliminal/core.py new file mode 100644 index 00000000..c516c49d --- /dev/null +++ b/libs/subliminal/core.py @@ -0,0 +1,705 @@ +# -*- coding: utf-8 -*- +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +import io +import itertools +import logging +import operator +import os.path +import socket + +from babelfish import Language, LanguageReverseError +from guessit import guessit +from rarfile import NotRarFile, RarCannotExec, RarFile +import requests + +from .extensions import provider_manager, refiner_manager +from 
.score import compute_score as default_compute_score +from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path +from .utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb +from .video import VIDEO_EXTENSIONS, Episode, Movie, Video + +#: Supported archive extensions +ARCHIVE_EXTENSIONS = ('.rar',) + +logger = logging.getLogger(__name__) + + +class ProviderPool(object): + """A pool of providers with the same API as a single :class:`~subliminal.providers.Provider`. + + It has a few extra features: + + * Lazy loads providers when needed and supports the `with` statement to :meth:`terminate` + the providers on exit. + * Automatically discard providers on failure. + + :param list providers: name of providers to use, if not all. + :param dict provider_configs: provider configuration as keyword arguments per provider name to pass when + instanciating the :class:`~subliminal.providers.Provider`. + + """ + def __init__(self, providers=None, provider_configs=None): + #: Name of providers to use + self.providers = providers or provider_manager.names() + + #: Provider configuration + self.provider_configs = provider_configs or {} + + #: Initialized providers + self.initialized_providers = {} + + #: Discarded providers + self.discarded_providers = set() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.terminate() + + def __getitem__(self, name): + if name not in self.providers: + raise KeyError + if name not in self.initialized_providers: + logger.info('Initializing provider %s', name) + provider = provider_manager[name].plugin(**self.provider_configs.get(name, {})) + provider.initialize() + self.initialized_providers[name] = provider + + return self.initialized_providers[name] + + def __delitem__(self, name): + if name not in self.initialized_providers: + raise KeyError(name) + + try: + logger.info('Terminating provider %s', name) + self.initialized_providers[name].terminate() + except 
(requests.Timeout, socket.timeout): + logger.error('Provider %r timed out, improperly terminated', name) + except: + logger.exception('Provider %r terminated unexpectedly', name) + + del self.initialized_providers[name] + + def __iter__(self): + return iter(self.initialized_providers) + + def list_subtitles_provider(self, provider, video, languages): + """List subtitles with a single provider. + + The video and languages are checked against the provider. + + :param str provider: name of the provider. + :param video: video to list subtitles for. + :type video: :class:`~subliminal.video.Video` + :param languages: languages to search for. + :type languages: set of :class:`~babelfish.language.Language` + :return: found subtitles. + :rtype: list of :class:`~subliminal.subtitle.Subtitle` or None + + """ + # check video validity + if not provider_manager[provider].plugin.check(video): + logger.info('Skipping provider %r: not a valid video', provider) + return [] + + # check supported languages + provider_languages = provider_manager[provider].plugin.languages & languages + if not provider_languages: + logger.info('Skipping provider %r: no language to search for', provider) + return [] + + # list subtitles + logger.info('Listing subtitles with provider %r and languages %r', provider, provider_languages) + try: + return self[provider].list_subtitles(video, provider_languages) + except (requests.Timeout, socket.timeout): + logger.error('Provider %r timed out', provider) + except: + logger.exception('Unexpected error in provider %r', provider) + + def list_subtitles(self, video, languages): + """List subtitles. + + :param video: video to list subtitles for. + :type video: :class:`~subliminal.video.Video` + :param languages: languages to search for. + :type languages: set of :class:`~babelfish.language.Language` + :return: found subtitles. 
+ :rtype: list of :class:`~subliminal.subtitle.Subtitle` + + """ + subtitles = [] + + for name in self.providers: + # check discarded providers + if name in self.discarded_providers: + logger.debug('Skipping discarded provider %r', name) + continue + + # list subtitles + provider_subtitles = self.list_subtitles_provider(name, video, languages) + if provider_subtitles is None: + logger.info('Discarding provider %s', name) + self.discarded_providers.add(name) + continue + + # add the subtitles + subtitles.extend(provider_subtitles) + + return subtitles + + def download_subtitle(self, subtitle): + """Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`. + + :param subtitle: subtitle to download. + :type subtitle: :class:`~subliminal.subtitle.Subtitle` + :return: `True` if the subtitle has been successfully downloaded, `False` otherwise. + :rtype: bool + + """ + # check discarded providers + if subtitle.provider_name in self.discarded_providers: + logger.warning('Provider %r is discarded', subtitle.provider_name) + return False + + logger.info('Downloading subtitle %r', subtitle) + try: + self[subtitle.provider_name].download_subtitle(subtitle) + except (requests.Timeout, socket.timeout): + logger.error('Provider %r timed out, discarding it', subtitle.provider_name) + self.discarded_providers.add(subtitle.provider_name) + return False + except: + logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name) + self.discarded_providers.add(subtitle.provider_name) + return False + + # check subtitle validity + if not subtitle.is_valid(): + logger.error('Invalid subtitle') + return False + + return True + + def download_best_subtitles(self, subtitles, video, languages, min_score=0, hearing_impaired=False, only_one=False, + compute_score=None): + """Download the best matching subtitles. + + :param subtitles: the subtitles to use. 
+ :type subtitles: list of :class:`~subliminal.subtitle.Subtitle` + :param video: video to download subtitles for. + :type video: :class:`~subliminal.video.Video` + :param languages: languages to download. + :type languages: set of :class:`~babelfish.language.Language` + :param int min_score: minimum score for a subtitle to be downloaded. + :param bool hearing_impaired: hearing impaired preference. + :param bool only_one: download only one subtitle, not one per language. + :param compute_score: function that takes `subtitle` and `video` as positional arguments, + `hearing_impaired` as keyword argument and returns the score. + :return: downloaded subtitles. + :rtype: list of :class:`~subliminal.subtitle.Subtitle` + + """ + compute_score = compute_score or default_compute_score + + # sort subtitles by score + scored_subtitles = sorted([(s, compute_score(s, video, hearing_impaired=hearing_impaired)) + for s in subtitles], key=operator.itemgetter(1), reverse=True) + + # download best subtitles, falling back on the next on error + downloaded_subtitles = [] + for subtitle, score in scored_subtitles: + # check score + if score < min_score: + logger.info('Score %d is below min_score (%d)', score, min_score) + break + + # check downloaded languages + if subtitle.language in set(s.language for s in downloaded_subtitles): + logger.debug('Skipping subtitle: %r already downloaded', subtitle.language) + continue + + # download + if self.download_subtitle(subtitle): + downloaded_subtitles.append(subtitle) + + # stop when all languages are downloaded + if set(s.language for s in downloaded_subtitles) == languages: + logger.debug('All languages downloaded') + break + + # stop if only one subtitle is requested + if only_one: + logger.debug('Only one subtitle downloaded') + break + + return downloaded_subtitles + + def terminate(self): + """Terminate all the :attr:`initialized_providers`.""" + logger.debug('Terminating initialized providers') + for name in 
list(self.initialized_providers): + del self[name] + + +class AsyncProviderPool(ProviderPool): + """Subclass of :class:`ProviderPool` with asynchronous support for :meth:`~ProviderPool.list_subtitles`. + + :param int max_workers: maximum number of threads to use. If `None`, :attr:`max_workers` will be set + to the number of :attr:`~ProviderPool.providers`. + + """ + def __init__(self, max_workers=None, *args, **kwargs): + super(AsyncProviderPool, self).__init__(*args, **kwargs) + + #: Maximum number of threads to use + self.max_workers = max_workers or len(self.providers) + + def list_subtitles_provider(self, provider, video, languages): + return provider, super(AsyncProviderPool, self).list_subtitles_provider(provider, video, languages) + + def list_subtitles(self, video, languages): + subtitles = [] + + with ThreadPoolExecutor(self.max_workers) as executor: + for provider, provider_subtitles in executor.map(self.list_subtitles_provider, self.providers, + itertools.repeat(video, len(self.providers)), + itertools.repeat(languages, len(self.providers))): + # discard provider that failed + if provider_subtitles is None: + logger.info('Discarding provider %s', provider) + self.discarded_providers.add(provider) + continue + + # add subtitles + subtitles.extend(provider_subtitles) + + return subtitles + + +def check_video(video, languages=None, age=None, undefined=False): + """Perform some checks on the `video`. + + All the checks are optional. Return `False` if any of this check fails: + + * `languages` already exist in `video`'s :attr:`~subliminal.video.Video.subtitle_languages`. + * `video` is older than `age`. + * `video` has an `undefined` language in :attr:`~subliminal.video.Video.subtitle_languages`. + + :param video: video to check. + :type video: :class:`~subliminal.video.Video` + :param languages: desired languages. + :type languages: set of :class:`~babelfish.language.Language` + :param datetime.timedelta age: maximum age of the video. 
+ :param bool undefined: fail on existing undefined language. + :return: `True` if the video passes the checks, `False` otherwise. + :rtype: bool + + """ + # language test + if languages and not (languages - video.subtitle_languages): + logger.debug('All languages %r exist', languages) + return False + + # age test + if age and video.age > age: + logger.debug('Video is older than %r', age) + return False + + # undefined test + if undefined and Language('und') in video.subtitle_languages: + logger.debug('Undefined language found') + return False + + return True + + +def search_external_subtitles(path, directory=None): + """Search for external subtitles from a video `path` and their associated language. + + Unless `directory` is provided, search will be made in the same directory as the video file. + + :param str path: path to the video. + :param str directory: directory to search for subtitles. + :return: found subtitles with their languages. + :rtype: dict + + """ + # split path + dirpath, filename = os.path.split(path) + dirpath = dirpath or '.' + fileroot, fileext = os.path.splitext(filename) + + # search for subtitles + subtitles = {} + for p in os.listdir(directory or dirpath): + # keep only valid subtitle filenames + if not p.startswith(fileroot) or not p.endswith(SUBTITLE_EXTENSIONS): + continue + + # extract the potential language code + language = Language('und') + language_code = p[len(fileroot):-len(os.path.splitext(p)[1])].replace(fileext, '').replace('_', '-')[1:] + if language_code: + try: + language = Language.fromietf(language_code) + except (ValueError, LanguageReverseError): + logger.error('Cannot parse language code %r', language_code) + + subtitles[p] = language + + logger.debug('Found subtitles %r', subtitles) + + return subtitles + + +def scan_video(path): + """Scan a video from a `path`. + + :param str path: existing path to the video. + :return: the scanned video. 
+ :rtype: :class:`~subliminal.video.Video` + + """ + # check for non-existing path + if not os.path.exists(path): + raise ValueError('Path does not exist') + + # check video extension + if not path.endswith(VIDEO_EXTENSIONS): + raise ValueError('%r is not a valid video extension' % os.path.splitext(path)[1]) + + dirpath, filename = os.path.split(path) + logger.info('Scanning video %r in %r', filename, dirpath) + + # guess + video = Video.fromguess(path, guessit(path)) + + # size and hashes + video.size = os.path.getsize(path) + if video.size > 10485760: + logger.debug('Size is %d', video.size) + video.hashes['opensubtitles'] = hash_opensubtitles(path) + video.hashes['shooter'] = hash_shooter(path) + video.hashes['thesubdb'] = hash_thesubdb(path) + video.hashes['napiprojekt'] = hash_napiprojekt(path) + logger.debug('Computed hashes %r', video.hashes) + else: + logger.warning('Size is lower than 10MB: hashes not computed') + + return video + + +def scan_archive(path): + """Scan an archive from a `path`. + + :param str path: existing path to the archive. + :return: the scanned video. 
+ :rtype: :class:`~subliminal.video.Video` + + """ + # check for non-existing path + if not os.path.exists(path): + raise ValueError('Path does not exist') + + # check video extension + if not path.endswith(ARCHIVE_EXTENSIONS): + raise ValueError('%r is not a valid archive extension' % os.path.splitext(path)[1]) + + dirpath, filename = os.path.split(path) + logger.info('Scanning archive %r in %r', filename, dirpath) + + # rar extension + if filename.endswith('.rar'): + rar = RarFile(path) + + # filter on video extensions + rar_filenames = [f for f in rar.namelist() if f.endswith(VIDEO_EXTENSIONS)] + + # no video found + if not rar_filenames: + raise ValueError('No video in archive') + + # more than one video found + if len(rar_filenames) > 1: + raise ValueError('More than one video in archive') + + # guess + rar_filename = rar_filenames[0] + rar_filepath = os.path.join(dirpath, rar_filename) + video = Video.fromguess(rar_filepath, guessit(rar_filepath)) + + # size + video.size = rar.getinfo(rar_filename).file_size + else: + raise ValueError('Unsupported extension %r' % os.path.splitext(path)[1]) + + return video + + +def scan_videos(path, age=None, archives=True): + """Scan `path` for videos and their subtitles. + + See :func:`refine` to find additional information for the video. + + :param str path: existing directory path to scan. + :param datetime.timedelta age: maximum age of the video or archive. + :param bool archives: scan videos in archives. + :return: the scanned videos. 
+ :rtype: list of :class:`~subliminal.video.Video` + + """ + # check for non-existing path + if not os.path.exists(path): + raise ValueError('Path does not exist') + + # check for non-directory path + if not os.path.isdir(path): + raise ValueError('Path is not a directory') + + # walk the path + videos = [] + for dirpath, dirnames, filenames in os.walk(path): + logger.debug('Walking directory %r', dirpath) + + # remove badly encoded and hidden dirnames + for dirname in list(dirnames): + if dirname.startswith('.'): + logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath) + dirnames.remove(dirname) + + # scan for videos + for filename in filenames: + # filter on videos and archives + if not (filename.endswith(VIDEO_EXTENSIONS) or archives and filename.endswith(ARCHIVE_EXTENSIONS)): + continue + + # skip hidden files + if filename.startswith('.'): + logger.debug('Skipping hidden filename %r in %r', filename, dirpath) + continue + + # reconstruct the file path + filepath = os.path.join(dirpath, filename) + + # skip links + if os.path.islink(filepath): + logger.debug('Skipping link %r in %r', filename, dirpath) + continue + + # skip old files + if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age: + logger.debug('Skipping old file %r in %r', filename, dirpath) + continue + + # scan + if filename.endswith(VIDEO_EXTENSIONS): # video + try: + video = scan_video(filepath) + except ValueError: # pragma: no cover + logger.exception('Error scanning video') + continue + elif archives and filename.endswith(ARCHIVE_EXTENSIONS): # archive + try: + video = scan_archive(filepath) + except (NotRarFile, RarCannotExec, ValueError): # pragma: no cover + logger.exception('Error scanning archive') + continue + else: # pragma: no cover + raise ValueError('Unsupported file %r' % filename) + + videos.append(video) + + return videos + + +def refine(video, episode_refiners=None, movie_refiners=None, **kwargs): + """Refine a video using 
:ref:`refiners`. + + .. note:: + + Exceptions raised in refiners are silently passed and logged. + + :param video: the video to refine. + :type video: :class:`~subliminal.video.Video` + :param tuple episode_refiners: refiners to use for episodes. + :param tuple movie_refiners: refiners to use for movies. + :param \*\*kwargs: additional parameters for the :func:`~subliminal.refiners.refine` functions. + + """ + refiners = () + if isinstance(video, Episode): + refiners = episode_refiners or ('metadata', 'tvdb', 'omdb') + elif isinstance(video, Movie): + refiners = movie_refiners or ('metadata', 'omdb') + for refiner in refiners: + logger.info('Refining video with %s', refiner) + try: + refiner_manager[refiner].plugin(video, **kwargs) + except: + logger.exception('Failed to refine video') + + +def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs): + """List subtitles. + + The `videos` must pass the `languages` check of :func:`check_video`. + + :param videos: videos to list subtitles for. + :type videos: set of :class:`~subliminal.video.Video` + :param languages: languages to search for. + :type languages: set of :class:`~babelfish.language.Language` + :param pool_class: class to use as provider pool. + :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar + :param \*\*kwargs: additional parameters for the provided `pool_class` constructor. + :return: found subtitles per video. 
+ :rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle` + + """ + listed_subtitles = defaultdict(list) + + # check videos + checked_videos = [] + for video in videos: + if not check_video(video, languages=languages): + logger.info('Skipping video %r', video) + continue + checked_videos.append(video) + + # return immediately if no video passed the checks + if not checked_videos: + return listed_subtitles + + # list subtitles + with pool_class(**kwargs) as pool: + for video in checked_videos: + logger.info('Listing subtitles for %r', video) + subtitles = pool.list_subtitles(video, languages - video.subtitle_languages) + listed_subtitles[video].extend(subtitles) + logger.info('Found %d subtitle(s)', len(subtitles)) + + return listed_subtitles + + +def download_subtitles(subtitles, pool_class=ProviderPool, **kwargs): + """Download :attr:`~subliminal.subtitle.Subtitle.content` of `subtitles`. + + :param subtitles: subtitles to download. + :type subtitles: list of :class:`~subliminal.subtitle.Subtitle` + :param pool_class: class to use as provider pool. + :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar + :param \*\*kwargs: additional parameters for the provided `pool_class` constructor. + + """ + with pool_class(**kwargs) as pool: + for subtitle in subtitles: + logger.info('Downloading subtitle %r', subtitle) + pool.download_subtitle(subtitle) + + +def download_best_subtitles(videos, languages, min_score=0, hearing_impaired=False, only_one=False, compute_score=None, + pool_class=ProviderPool, **kwargs): + """List and download the best matching subtitles. + + The `videos` must pass the `languages` and `undefined` (`only_one`) checks of :func:`check_video`. + + :param videos: videos to download subtitles for. + :type videos: set of :class:`~subliminal.video.Video` + :param languages: languages to download. 
+ :type languages: set of :class:`~babelfish.language.Language` + :param int min_score: minimum score for a subtitle to be downloaded. + :param bool hearing_impaired: hearing impaired preference. + :param bool only_one: download only one subtitle, not one per language. + :param compute_score: function that takes `subtitle` and `video` as positional arguments, + `hearing_impaired` as keyword argument and returns the score. + :param pool_class: class to use as provider pool. + :type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar + :param \*\*kwargs: additional parameters for the provided `pool_class` constructor. + :return: downloaded subtitles per video. + :rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle` + + """ + downloaded_subtitles = defaultdict(list) + + # check videos + checked_videos = [] + for video in videos: + if not check_video(video, languages=languages, undefined=only_one): + logger.info('Skipping video %r', video) + continue + checked_videos.append(video) + + # return immediately if no video passed the checks + if not checked_videos: + return downloaded_subtitles + + # download best subtitles + with pool_class(**kwargs) as pool: + for video in checked_videos: + logger.info('Downloading best subtitles for %r', video) + subtitles = pool.download_best_subtitles(pool.list_subtitles(video, languages - video.subtitle_languages), + video, languages, min_score=min_score, + hearing_impaired=hearing_impaired, only_one=only_one, + compute_score=compute_score) + logger.info('Downloaded %d subtitle(s)', len(subtitles)) + downloaded_subtitles[video].extend(subtitles) + + return downloaded_subtitles + + +def save_subtitles(video, subtitles, single=False, directory=None, encoding=None): + """Save subtitles on filesystem. + + Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles + with the same language are silently ignored. 
+ + The extension used is `.lang.srt` by default or `.srt` is `single` is `True`, with `lang` being the IETF code for + the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle. + + :param video: video of the subtitles. + :type video: :class:`~subliminal.video.Video` + :param subtitles: subtitles to save. + :type subtitles: list of :class:`~subliminal.subtitle.Subtitle` + :param bool single: save a single subtitle, default is to save one subtitle per language. + :param str directory: path to directory where to save the subtitles, default is next to the video. + :param str encoding: encoding in which to save the subtitles, default is to keep original encoding. + :return: the saved subtitles + :rtype: list of :class:`~subliminal.subtitle.Subtitle` + + """ + saved_subtitles = [] + for subtitle in subtitles: + # check content + if subtitle.content is None: + logger.error('Skipping subtitle %r: no content', subtitle) + continue + + # check language + if subtitle.language in set(s.language for s in saved_subtitles): + logger.debug('Skipping subtitle %r: language already saved', subtitle) + continue + + # create subtitle path + subtitle_path = get_subtitle_path(video.name, None if single else subtitle.language) + if directory is not None: + subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1]) + + # save content as is or in the specified encoding + logger.info('Saving %r to %r', subtitle, subtitle_path) + if encoding is None: + with io.open(subtitle_path, 'wb') as f: + f.write(subtitle.content) + else: + with io.open(subtitle_path, 'w', encoding=encoding) as f: + f.write(subtitle.text) + saved_subtitles.append(subtitle) + + # check single + if single: + break + + return saved_subtitles diff --git a/libs/subliminal/exceptions.py b/libs/subliminal/exceptions.py index be954800..5f5c7a77 100644 --- a/libs/subliminal/exceptions.py +++ b/libs/subliminal/exceptions.py @@ -1,22 +1,29 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals 
- - class Error(Exception): - """Base class for exceptions in subliminal""" + """Base class for exceptions in subliminal.""" + pass class ProviderError(Error): - """Exception raised by providers""" + """Exception raised by providers.""" + pass class ConfigurationError(ProviderError): - """Exception raised by providers when badly configured""" + """Exception raised by providers when badly configured.""" + pass class AuthenticationError(ProviderError): - """Exception raised by providers when authentication failed""" + """Exception raised by providers when authentication failed.""" + pass + + +class TooManyRequests(ProviderError): + """Exception raised by providers when too many requests are made.""" + pass class DownloadLimitExceeded(ProviderError): - """Exception raised by providers when download limit is exceeded""" + """Exception raised by providers when download limit is exceeded.""" + pass diff --git a/libs/subliminal/extensions.py b/libs/subliminal/extensions.py new file mode 100644 index 00000000..1f378b7f --- /dev/null +++ b/libs/subliminal/extensions.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +from pkg_resources import EntryPoint + +from stevedore import ExtensionManager + + +class RegistrableExtensionManager(ExtensionManager): + """:class:~stevedore.extensions.ExtensionManager` with support for registration. + + It allows loading of internal extensions without setup and registering/unregistering additional extensions. + + Loading is done in this order: + + * Entry point extensions + * Internal extensions + * Registered extensions + + :param str namespace: namespace argument for :class:~stevedore.extensions.ExtensionManager`. + :param list internal_extensions: internal extensions to use with entry point syntax. + :param \*\*kwargs: additional parameters for the :class:~stevedore.extensions.ExtensionManager` constructor. 
+ + """ + def __init__(self, namespace, internal_extensions, **kwargs): + #: Registered extensions with entry point syntax + self.registered_extensions = [] + + #: Internal extensions with entry point syntax + self.internal_extensions = internal_extensions + + super(RegistrableExtensionManager, self).__init__(namespace, **kwargs) + + def _find_entry_points(self, namespace): + # copy of default extensions + eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace)) + + # internal extensions + for iep in self.internal_extensions: + ep = EntryPoint.parse(iep) + if ep.name not in [e.name for e in eps]: + eps.append(ep) + + # registered extensions + for rep in self.registered_extensions: + ep = EntryPoint.parse(rep) + if ep.name not in [e.name for e in eps]: + eps.append(ep) + + return eps + + def register(self, entry_point): + """Register an extension + + :param str entry_point: extension to register (entry point syntax). + :raise: ValueError if already registered. + + """ + if entry_point in self.registered_extensions: + raise ValueError('Extension already registered') + + ep = EntryPoint.parse(entry_point) + if ep.name in self.names(): + raise ValueError('An extension with the same name already exist') + + ext = self._load_one_plugin(ep, False, (), {}, False) + self.extensions.append(ext) + if self._extensions_by_name is not None: + self._extensions_by_name[ext.name] = ext + self.registered_extensions.insert(0, entry_point) + + def unregister(self, entry_point): + """Unregister a provider + + :param str entry_point: provider to unregister (entry point syntax). 
+ + """ + if entry_point not in self.registered_extensions: + raise ValueError('Extension not registered') + + ep = EntryPoint.parse(entry_point) + self.registered_extensions.remove(entry_point) + if self._extensions_by_name is not None: + del self._extensions_by_name[ep.name] + for i, ext in enumerate(self.extensions): + if ext.name == ep.name: + del self.extensions[i] + break + + +#: Provider manager +provider_manager = RegistrableExtensionManager('subliminal.providers', [ + 'addic7ed = subliminal.providers.addic7ed:Addic7edProvider', + 'legendastv = subliminal.providers.legendastv:LegendasTVProvider', + 'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider', + 'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider', + 'shooter = subliminal.providers.shooter:ShooterProvider', + 'subscenter = subliminal.providers.subscenter:SubsCenterProvider', + 'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider', + 'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider' +]) + +#: Refiner manager +refiner_manager = RegistrableExtensionManager('subliminal.refiners', [ + 'metadata = subliminal.refiners.metadata:refine', + 'omdb = subliminal.refiners.omdb:refine', + 'tvdb = subliminal.refiners.tvdb:refine' +]) diff --git a/libs/subliminal/providers/__init__.py b/libs/subliminal/providers/__init__.py index 70daa12d..9d2fd6d2 100644 --- a/libs/subliminal/providers/__init__.py +++ b/libs/subliminal/providers/__init__.py @@ -1,27 +1,65 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals -import contextlib import logging -import socket -import babelfish -from pkg_resources import iter_entry_points, EntryPoint -import requests -from ..video import Episode, Movie +from bs4 import BeautifulSoup, FeatureNotFound +from six.moves.xmlrpc_client import SafeTransport + +from ..video import Episode, Movie logger = logging.getLogger(__name__) -class Provider(object): - """Base class for providers +class 
TimeoutSafeTransport(SafeTransport): + """Timeout support for ``xmlrpc.client.SafeTransport``.""" + def __init__(self, timeout, *args, **kwargs): + SafeTransport.__init__(self, *args, **kwargs) + self.timeout = timeout - If any configuration is possible for the provider, like credentials, it must take place during instantiation + def make_connection(self, host): + c = SafeTransport.make_connection(self, host) + c.timeout = self.timeout - :param \*\*kwargs: configuration - :raise: :class:`~subliminal.exceptions.ProviderConfigurationError` if there is a configuration error + return c + + +class ParserBeautifulSoup(BeautifulSoup): + """A ``bs4.BeautifulSoup`` that picks the first parser available in `parsers`. + + :param markup: markup for the ``bs4.BeautifulSoup``. + :param list parsers: parser names, in order of preference. """ - #: Supported BabelFish languages + def __init__(self, markup, parsers, **kwargs): + # reject features + if set(parsers).intersection({'fast', 'permissive', 'strict', 'xml', 'html', 'html5'}): + raise ValueError('Features not allowed, only parser names') + + # reject some kwargs + if 'features' in kwargs: + raise ValueError('Cannot use features kwarg') + if 'builder' in kwargs: + raise ValueError('Cannot use builder kwarg') + + # pick the first parser available + for parser in parsers: + try: + super(ParserBeautifulSoup, self).__init__(markup, parser, **kwargs) + return + except FeatureNotFound: + pass + + raise FeatureNotFound + + +class Provider(object): + """Base class for providers. + + If any configuration is possible for the provider, like credentials, it must take place during instantiation. 
+ + :raise: :class:`~subliminal.exceptions.ConfigurationError` if there is a configuration error + + """ + #: Supported set of :class:`~babelfish.language.Language` languages = set() #: Supported video types @@ -30,53 +68,46 @@ class Provider(object): #: Required hash, if any required_hash = None - def __init__(self, **kwargs): - pass - def __enter__(self): self.initialize() return self - def __exit__(self, type, value, traceback): # @ReservedAssignment + def __exit__(self, exc_type, exc_value, traceback): self.terminate() def initialize(self): - """Initialize the provider + """Initialize the provider. Must be called when starting to work with the provider. This is the place for network initialization or login operations. - .. note: - This is called automatically if you use the :keyword:`with` statement - - - :raise: :class:`~subliminal.exceptions.ProviderNotAvailable` if the provider is unavailable + .. note:: + This is called automatically when entering the `with` statement """ - pass + raise NotImplementedError def terminate(self): - """Terminate the provider + """Terminate the provider. Must be called when done with the provider. This is the place for network shutdown or logout operations. - .. note: - This is called automatically if you use the :keyword:`with` statement + .. note:: + This is called automatically when exiting the `with` statement - :raise: :class:`~subliminal.exceptions.ProviderNotAvailable` if the provider is unavailable """ - pass + raise NotImplementedError @classmethod def check(cls, video): - """Check if the `video` can be processed + """Check if the `video` can be processed. - The video is considered invalid if not an instance of :attr:`video_types` or if the :attr:`required_hash` is - not present in :attr:`~subliminal.video.Video`'s `hashes` attribute. 
+ The `video` is considered invalid if not an instance of :attr:`video_types` or if the :attr:`required_hash` is + not present in :attr:`~subliminal.video.Video.hashes` attribute of the `video`. - :param video: the video to check + :param video: the video to check. :type video: :class:`~subliminal.video.Video` - :return: `True` if the `video` and `languages` are valid, `False` otherwise + :return: `True` if the `video` is valid, `False` otherwise. :rtype: bool """ @@ -84,255 +115,47 @@ class Provider(object): return False if cls.required_hash is not None and cls.required_hash not in video.hashes: return False + return True - def query(self, languages, *args, **kwargs): - """Query the provider for subtitles + def query(self, *args, **kwargs): + """Query the provider for subtitles. - This method arguments match as much as possible the actual parameters for querying the provider + Arguments should match as much as possible the actual parameters for querying the provider - :param languages: languages to search for - :type languages: set of :class:`babelfish.Language` - :param \*args: other required arguments - :param \*\*kwargs: other optional arguments - :return: the subtitles + :return: found subtitles. :rtype: list of :class:`~subliminal.subtitle.Subtitle` - :raise: :class:`~subliminal.exceptions.ProviderNotAvailable` if the provider is unavailable - :raise: :class:`~subliminal.exceptions.ProviderError` if something unexpected occured + :raise: :class:`~subliminal.exceptions.ProviderError` """ raise NotImplementedError def list_subtitles(self, video, languages): - """List subtitles for the `video` with the given `languages` + """List subtitles for the `video` with the given `languages`. - This is a proxy for the :meth:`query` method. The parameters passed to the :meth:`query` method may - vary depending on the amount of information available in the `video` + This will call the :meth:`query` method internally. 
The parameters passed to the :meth:`query` method may + vary depending on the amount of information available in the `video`. - :param video: video to list subtitles for + :param video: video to list subtitles for. :type video: :class:`~subliminal.video.Video` - :param languages: languages to search for - :type languages: set of :class:`babelfish.Language` - :return: the subtitles + :param languages: languages to search for. + :type languages: set of :class:`~babelfish.language.Language` + :return: found subtitles. :rtype: list of :class:`~subliminal.subtitle.Subtitle` - :raise: :class:`~subliminal.exceptions.ProviderNotAvailable` if the provider is unavailable - :raise: :class:`~subliminal.exceptions.ProviderError` if something unexpected occured + :raise: :class:`~subliminal.exceptions.ProviderError` """ raise NotImplementedError def download_subtitle(self, subtitle): - """Download the `subtitle` an fill its :attr:`~subliminal.subtitle.Subtitle.content` attribute with - subtitle's text + """Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`. - :param subtitle: subtitle to download + :param subtitle: subtitle to download. :type subtitle: :class:`~subliminal.subtitle.Subtitle` - :raise: :class:`~subliminal.exceptions.ProviderNotAvailable` if the provider is unavailable - :raise: :class:`~subliminal.exceptions.ProviderError` if something unexpected occured + :raise: :class:`~subliminal.exceptions.ProviderError` """ raise NotImplementedError def __repr__(self): return '<%s [%r]>' % (self.__class__.__name__, self.video_types) - - -class ProviderManager(object): - """Manager for providers behaving like a dict with lazy loading - - Loading is done in this order: - - * Entry point providers - * Registered providers - - .. 
attribute:: entry_point - - The entry point where to look for providers - - """ - entry_point = 'subliminal.providers' - - def __init__(self): - #: Registered providers with entry point syntax - self.registered_providers = ['addic7ed = subliminal.providers.addic7ed:Addic7edProvider', - 'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider', - 'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider', - 'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider', - 'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'] - - #: Loaded providers - self.providers = {} - - @property - def available_providers(self): - """Available providers""" - available_providers = set(self.providers.keys()) - available_providers.update([ep.name for ep in iter_entry_points(self.entry_point)]) - available_providers.update([EntryPoint.parse(c).name for c in self.registered_providers]) - return available_providers - - def __getitem__(self, name): - """Get a provider, lazy loading it if necessary""" - if name in self.providers: - return self.providers[name] - for ep in iter_entry_points(self.entry_point): - if ep.name == name: - self.providers[ep.name] = ep.load() - return self.providers[ep.name] - for ep in (EntryPoint.parse(c) for c in self.registered_providers): - if ep.name == name: - self.providers[ep.name] = ep.load(require=False) - return self.providers[ep.name] - raise KeyError(name) - - def __setitem__(self, name, provider): - """Load a provider""" - self.providers[name] = provider - - def __delitem__(self, name): - """Unload a provider""" - del self.providers[name] - - def __iter__(self): - """Iterator over loaded providers""" - return iter(self.providers) - - def register(self, entry_point): - """Register a provider - - :param string entry_point: provider to register (entry point syntax) - :raise: ValueError if already registered - - """ - if entry_point in self.registered_providers: - raise ValueError('Entry point \'%s\' already 
registered' % entry_point) - entry_point_name = EntryPoint.parse(entry_point).name - if entry_point_name in self.available_providers: - raise ValueError('An entry point with name \'%s\' already registered' % entry_point_name) - self.registered_providers.insert(0, entry_point) - - def unregister(self, entry_point): - """Unregister a provider - - :param string entry_point: provider to unregister (entry point syntax) - - """ - self.registered_providers.remove(entry_point) - - def __contains__(self, name): - return name in self.providers - -provider_manager = ProviderManager() - - -class ProviderPool(object): - """A pool of providers with the same API as a single :class:`Provider` - - The :class:`ProviderPool` supports the ``with`` statement to :meth:`terminate` the providers - - :param providers: providers to use, if not all - :type providers: list of string or None - :param provider_configs: configuration for providers - :type provider_configs: dict of provider name => provider constructor kwargs or None - - """ - def __init__(self, providers=None, provider_configs=None): - self.provider_configs = provider_configs or {} - self.providers = {p: provider_manager[p] for p in (providers or provider_manager.available_providers)} - self.initialized_providers = {} - self.discarded_providers = set() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): # @ReservedAssignment - self.terminate() - - def get_initialized_provider(self, name): - """Get a :class:`Provider` by name, initializing it if necessary - - :param string name: name of the provider - :return: the initialized provider - :rtype: :class:`Provider` - - """ - if name in self.initialized_providers: - return self.initialized_providers[name] - provider = self.providers[name](**self.provider_configs.get(name, {})) - provider.initialize() - self.initialized_providers[name] = provider - return provider - - def list_subtitles(self, video, languages): - """List subtitles for `video` with 
the given `languages` - - :param video: video to list subtitles for - :type video: :class:`~subliminal.video.Video` - :param languages: languages of subtitles to search for - :type languages: set of :class:`babelfish.Language` - :return: found subtitles - :rtype: list of :class:`~subliminal.subtitle.Subtitle` - - """ - subtitles = [] - for provider_name, provider_class in self.providers.items(): - if not provider_class.check(video): - logger.info('Skipping provider %r: not a valid video', provider_name) - continue - provider_languages = provider_class.languages & languages - video.subtitle_languages - if not provider_languages: - logger.info('Skipping provider %r: no language to search for', provider_name) - continue - if provider_name in self.discarded_providers: - logger.debug('Skipping discarded provider %r', provider_name) - continue - try: - provider = self.get_initialized_provider(provider_name) - logger.info('Listing subtitles with provider %r and languages %r', provider_name, provider_languages) - provider_subtitles = provider.list_subtitles(video, provider_languages) - logger.info('Found %d subtitles', len(provider_subtitles)) - subtitles.extend(provider_subtitles) - except (requests.exceptions.Timeout, socket.timeout): - logger.warning('Provider %r timed out, discarding it', provider_name) - self.discarded_providers.add(provider_name) - except: - logger.exception('Unexpected error in provider %r, discarding it', provider_name) - self.discarded_providers.add(provider_name) - return subtitles - - def download_subtitle(self, subtitle): - """Download a subtitle - - :param subtitle: subtitle to download - :type subtitle: :class:`~subliminal.subtitle.Subtitle` - :return: ``True`` if the subtitle has been successfully downloaded, ``False`` otherwise - :rtype: bool - - """ - if subtitle.provider_name in self.discarded_providers: - logger.debug('Discarded provider %r', subtitle.provider_name) - return False - try: - provider = 
self.get_initialized_provider(subtitle.provider_name) - provider.download_subtitle(subtitle) - if not subtitle.is_valid: - logger.warning('Invalid subtitle') - return False - return True - except (requests.exceptions.Timeout, socket.timeout): - logger.warning('Provider %r timed out, discarding it', subtitle.provider_name) - self.discarded_providers.add(subtitle.provider_name) - except: - logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name) - self.discarded_providers.add(subtitle.provider_name) - return False - - def terminate(self): - """Terminate all the initialized providers""" - for (provider_name, provider) in self.initialized_providers.items(): - try: - provider.terminate() - except (requests.exceptions.Timeout, socket.timeout): - logger.warning('Provider %r timed out, unable to terminate', provider_name) - except: - logger.exception('Unexpected error in provider %r', provider_name) diff --git a/libs/subliminal/providers/addic7ed.py b/libs/subliminal/providers/addic7ed.py index 93ea0884..0d4a58fd 100644 --- a/libs/subliminal/providers/addic7ed.py +++ b/libs/subliminal/providers/addic7ed.py @@ -1,26 +1,34 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import logging -import babelfish -import bs4 -import requests -from . import Provider -from .. import __version__ -from ..cache import region, SHOW_EXPIRATION_TIME -from ..exceptions import ConfigurationError, AuthenticationError, DownloadLimitExceeded, ProviderError -from ..subtitle import Subtitle, fix_line_endings, compute_guess_properties_matches +import re + +from babelfish import Language, language_converters +from guessit import guessit +from requests import Session + +from . import ParserBeautifulSoup, Provider +from .. 
import __short_version__ +from ..cache import SHOW_EXPIRATION_TIME, region +from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests +from ..score import get_equivalent_release_groups +from ..subtitle import Subtitle, fix_line_ending, guess_matches +from ..utils import sanitize, sanitize_release_group from ..video import Episode - logger = logging.getLogger(__name__) -babelfish.language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter') + +language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter') + +#: Series header parsing regex +series_year_re = re.compile(r'^(?P[ \w\'.:(),&!?-]+?)(?: \((?P\d{4})\))?$') class Addic7edSubtitle(Subtitle): + """Addic7ed Subtitle.""" provider_name = 'addic7ed' - def __init__(self, language, series, season, episode, title, year, version, hearing_impaired, download_link, - page_link): + def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version, + download_link): super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link) self.series = series self.season = season @@ -30,10 +38,15 @@ class Addic7edSubtitle(Subtitle): self.version = version self.download_link = download_link - def compute_matches(self, video): + @property + def id(self): + return self.download_link + + def get_matches(self, video): matches = set() + # series - if video.series and self.series == video.series: + if video.series and sanitize(self.series) == sanitize(video.series): matches.add('series') # season if video.season and self.season == video.season: @@ -42,153 +55,218 @@ class Addic7edSubtitle(Subtitle): if video.episode and self.episode == video.episode: matches.add('episode') # title - if video.title and self.title.lower() == video.title.lower(): + if video.title and sanitize(self.title) == sanitize(video.title): matches.add('title') # year - if self.year == video.year: + if 
video.original_series and self.year is None or video.year and video.year == self.year: matches.add('year') # release_group - if video.release_group and self.version and video.release_group.lower() in self.version.lower(): + if (video.release_group and self.version and + any(r in sanitize_release_group(self.version) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): matches.add('release_group') - """ # resolution if video.resolution and self.version and video.resolution in self.version.lower(): matches.add('resolution') # format - if video.format and self.version and video.format in self.version.lower: + if video.format and self.version and video.format.lower() in self.version.lower(): matches.add('format') - """ - # we don't have the complete filename, so we need to guess the matches separately - # guess resolution (screenSize in guessit) - matches |= compute_guess_properties_matches(video, self.version, 'screenSize') - # guess format - matches |= compute_guess_properties_matches(video, self.version, 'format') + # other properties + matches |= guess_matches(video, guessit(self.version), partial=True) + return matches class Addic7edProvider(Provider): - languages = {babelfish.Language('por', 'BR')} | {babelfish.Language(l) - for l in ['ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', - 'fin', 'fra', 'glg', 'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', - 'nld', 'nor', 'pol', 'por', 'ron', 'rus', 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', - 'tur', 'ukr', 'vie', 'zho']} + """Addic7ed Provider.""" + languages = {Language('por', 'BR')} | {Language(l) for l in [ + 'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg', + 'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus', + 'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho' + 
]} video_types = (Episode,) - server = 'http://www.addic7ed.com' + server_url = 'http://www.addic7ed.com/' def __init__(self, username=None, password=None): if username is not None and password is None or username is None and password is not None: raise ConfigurationError('Username and password must be specified') + self.username = username self.password = password self.logged_in = False def initialize(self): - self.session = requests.Session() - self.session.headers = {'User-Agent': 'Subliminal/%s' % __version__.split('-')[0]} + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ + # login if self.username is not None and self.password is not None: - logger.debug('Logging in') + logger.info('Logging in') data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'} - r = self.session.post(self.server + '/dologin.php', data, timeout=10, allow_redirects=False) - if r.status_code == 302: - logger.info('Logged in') - self.logged_in = True - else: + r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10) + + if r.status_code != 302: raise AuthenticationError(self.username) + logger.debug('Logged in') + self.logged_in = True + def terminate(self): # logout if self.logged_in: - r = self.session.get(self.server + '/logout.php', timeout=10) - logger.info('Logged out') - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) + logger.info('Logging out') + r = self.session.get(self.server_url + 'logout.php', timeout=10) + r.raise_for_status() + logger.debug('Logged out') + self.logged_in = False + self.session.close() - def get(self, url, params=None): - """Make a GET request on `url` with the given parameters - - :param string url: part of the URL to reach with the leading slash - :param params: params of the request - :return: the response - :rtype: :class:`bs4.BeautifulSoup` - - """ - r = self.session.get(self.server + 
url, params=params, timeout=10) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - return bs4.BeautifulSoup(r.content, ['permissive']) - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) - def get_show_ids(self): - """Load the shows page with default series to show ids mapping + def _get_show_ids(self): + """Get the ``dict`` of show ids per series by querying the `shows.php` page. - :return: series to show ids + :return: show id per series, lower case and without quotes. :rtype: dict """ - soup = self.get('/shows.php') + # get the show page + logger.info('Getting show ids') + r = self.session.get(self.server_url + 'shows.php', timeout=10) + r.raise_for_status() + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # populate the show ids show_ids = {} - for html_show in soup.select('td.version > h3 > a[href^="/show/"]'): - show_ids[html_show.string.lower()] = int(html_show['href'][6:]) + for show in soup.select('td.version > h3 > a[href^="/show/"]'): + show_ids[sanitize(show.text)] = int(show['href'][6:]) + logger.debug('Found %d show ids', len(show_ids)) + return show_ids @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) - def find_show_id(self, series, year=None): - """Find the show id from the `series` with optional `year` + def _search_show_id(self, series, year=None): + """Search the show id from the `series` and `year`. - Use this only if the show id cannot be found with :meth:`get_show_ids` - - :param string series: series of the episode in lowercase - :param year: year of the series, if any - :type year: int or None - :return: the show id, if any - :rtype: int or None + :param str series: series of the episode. + :param year: year of the series, if any. + :type year: int + :return: the show id, if found. 
+ :rtype: int """ - series_year = series - if year is not None: - series_year += ' (%d)' % year - params = {'search': series_year, 'Submit': 'Search'} - logger.debug('Searching series %r', params) - suggested_shows = self.get('/search.php', params).select('span.titulo > a[href^="/show/"]') - if not suggested_shows: - logger.info('Series %r not found', series_year) - return None - return int(suggested_shows[0]['href'][6:]) + # addic7ed doesn't support search with quotes + series = series.replace('\'', ' ') - def query(self, series, season, year=None): - show_ids = self.get_show_ids() + # build the params + series_year = '%s %d' % (series, year) if year is not None else series + params = {'search': series_year, 'Submit': 'Search'} + + # make the search + logger.info('Searching show ids with %r', params) + r = self.session.get(self.server_url + 'search.php', params=params, timeout=10) + r.raise_for_status() + if r.status_code == 304: + raise TooManyRequests() + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # get the suggestion + suggestion = soup.select('span.titulo > a[href^="/show/"]') + if not suggestion: + logger.warning('Show id not found: no suggestion') + return None + if not sanitize(suggestion[0].i.text.replace('\'', ' ')) == sanitize(series_year): + logger.warning('Show id not found: suggestion does not match') + return None + show_id = int(suggestion[0]['href'][6:]) + logger.debug('Found show id %d', show_id) + + return show_id + + def get_show_id(self, series, year=None, country_code=None): + """Get the best matching show id for `series`, `year` and `country_code`. + + First search in the result of :meth:`_get_show_ids` and fallback on a search with :meth:`_search_show_id`. + + :param str series: series of the episode. + :param year: year of the series, if any. + :type year: int + :param country_code: country code of the series, if any. + :type country_code: str + :return: the show id, if found. 
+ :rtype: int + + """ + series_sanitized = sanitize(series).lower() + show_ids = self._get_show_ids() show_id = None - if year is not None: # search with the year - series_year = '%s (%d)' % (series.lower(), year) - if series_year in show_ids: - show_id = show_ids[series_year] - else: - show_id = self.find_show_id(series.lower(), year) - if show_id is None: # search without the year - year = None - if series.lower() in show_ids: - show_id = show_ids[series.lower()] - else: - show_id = self.find_show_id(series.lower()) + + # attempt with country + if not show_id and country_code: + logger.debug('Getting show id with country') + show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower())) + + # attempt with year + if not show_id and year: + logger.debug('Getting show id with year') + show_id = show_ids.get('%s %d' % (series_sanitized, year)) + + # attempt clean + if not show_id: + logger.debug('Getting show id') + show_id = show_ids.get(series_sanitized) + + # search as last resort + if not show_id: + logger.warning('Series not found in show ids') + show_id = self._search_show_id(series) + + return show_id + + def query(self, series, season, year=None, country=None): + # get the show id + show_id = self.get_show_id(series, year, country) if show_id is None: + logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country}) return [] - params = {'show_id': show_id, 'season': season} - logger.debug('Searching subtitles %r', params) - link = '/show/{show_id}&season={season}'.format(**params) - soup = self.get(link) + + # get the page of the season of the show + logger.info('Getting the page of show id %d, season %d', show_id, season) + r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10) + r.raise_for_status() + if r.status_code == 304: + raise TooManyRequests() + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # loop over subtitle rows + match = 
series_year_re.match(soup.select('#header font')[0].text.strip()[:-10]) + series = match.group('series') + year = int(match.group('year')) if match.group('year') else None subtitles = [] - for row in soup('tr', class_='epeven completed'): + for row in soup.select('tr.epeven'): cells = row('td') - if cells[5].string != 'Completed': + + # ignore incomplete subtitles + status = cells[5].text + if status != 'Completed': + logger.debug('Ignoring subtitle with status %s', status) continue - if not cells[3].string: - continue - subtitles.append(Addic7edSubtitle(babelfish.Language.fromaddic7ed(cells[3].string), series, season, - int(cells[1].string), cells[2].string, year, cells[4].string, - bool(cells[6].string), cells[9].a['href'], - self.server + cells[2].a['href'])) + + # read the item + language = Language.fromaddic7ed(cells[3].text) + hearing_impaired = bool(cells[6].text) + page_link = self.server_url + cells[2].a['href'][1:] + season = int(cells[0].text) + episode = int(cells[1].text) + title = cells[2].text + version = cells[4].text + download_link = cells[9].a['href'][1:] + + subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year, + version, download_link) + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + return subtitles def list_subtitles(self, video, languages): @@ -196,9 +274,14 @@ class Addic7edProvider(Provider): if s.language in languages and s.episode == video.episode] def download_subtitle(self, subtitle): - r = self.session.get(self.server + subtitle.download_link, timeout=10, headers={'Referer': subtitle.page_link}) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) + # download the subtitle + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link}, + timeout=10) + r.raise_for_status() + + # detect download limit exceeded if 
r.headers['Content-Type'] == 'text/html': raise DownloadLimitExceeded - subtitle.content = fix_line_endings(r.content) + + subtitle.content = fix_line_ending(r.content) diff --git a/libs/subliminal/providers/legendastv.py b/libs/subliminal/providers/legendastv.py new file mode 100644 index 00000000..cdd16aca --- /dev/null +++ b/libs/subliminal/providers/legendastv.py @@ -0,0 +1,448 @@ +# -*- coding: utf-8 -*- +import io +import json +import logging +import os +import re + +from babelfish import Language, language_converters +from datetime import datetime, timedelta +from dogpile.cache.api import NO_VALUE +from guessit import guessit +import pytz +import rarfile +from rarfile import RarFile, is_rarfile +from requests import Session +from zipfile import ZipFile, is_zipfile + +from . import ParserBeautifulSoup, Provider +from .. import __short_version__ +from ..cache import SHOW_EXPIRATION_TIME, region +from ..exceptions import AuthenticationError, ConfigurationError, ProviderError +from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize +from ..video import Episode, Movie + +logger = logging.getLogger(__name__) + +language_converters.register('legendastv = subliminal.converters.legendastv:LegendasTVConverter') + +# Configure :mod:`rarfile` to use the same path separator as :mod:`zipfile` +rarfile.PATH_SEP = '/' + +#: Conversion map for types +type_map = {'M': 'movie', 'S': 'episode', 'C': 'episode'} + +#: BR title season parsing regex +season_re = re.compile(r' - (?P\d+)(\xaa|a|st|nd|rd|th) (temporada|season)', re.IGNORECASE) + +#: Downloads parsing regex +downloads_re = re.compile(r'(?P\d+) downloads') + +#: Rating parsing regex +rating_re = re.compile(r'nota (?P\d+)') + +#: Timestamp parsing regex +timestamp_re = re.compile(r'(?P\d+)/(?P\d+)/(?P\d+) - (?P\d+):(?P\d+)') + +#: Cache key for releases +releases_key = __name__ + ':releases|{archive_id}' + + +class LegendasTVArchive(object): + """LegendasTV Archive. 
+ + :param str id: identifier. + :param str name: name. + :param bool pack: contains subtitles for multiple episodes. + :param bool pack: featured. + :param str link: link. + :param int downloads: download count. + :param int rating: rating (0-10). + :param timestamp: timestamp. + :type timestamp: datetime.datetime + + """ + def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None): + #: Identifier + self.id = id + + #: Name + self.name = name + + #: Pack + self.pack = pack + + #: Featured + self.featured = featured + + #: Link + self.link = link + + #: Download count + self.downloads = downloads + + #: Rating (0-10) + self.rating = rating + + #: Timestamp + self.timestamp = timestamp + + #: Compressed content as :class:`rarfile.RarFile` or :class:`zipfile.ZipFile` + self.content = None + + def __repr__(self): + return '<%s [%s] %r>' % (self.__class__.__name__, self.id, self.name) + + +class LegendasTVSubtitle(Subtitle): + """LegendasTV Subtitle.""" + provider_name = 'legendastv' + + def __init__(self, language, type, title, year, imdb_id, season, archive, name): + super(LegendasTVSubtitle, self).__init__(language, archive.link) + self.type = type + self.title = title + self.year = year + self.imdb_id = imdb_id + self.season = season + self.archive = archive + self.name = name + + @property + def id(self): + return '%s-%s' % (self.archive.id, self.name.lower()) + + def get_matches(self, video, hearing_impaired=False): + matches = set() + + # episode + if isinstance(video, Episode) and self.type == 'episode': + # series + if video.series and sanitize(self.title) == sanitize(video.series): + matches.add('series') + + # year (year is based on season air date hence the adjustment) + if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1: + matches.add('year') + + # imdb_id + if video.series_imdb_id and self.imdb_id == video.series_imdb_id: + matches.add('series_imdb_id') + + # movie + 
elif isinstance(video, Movie) and self.type == 'movie': + # title + if video.title and sanitize(self.title) == sanitize(video.title): + matches.add('title') + + # year + if video.year and self.year == video.year: + matches.add('year') + + # imdb_id + if video.imdb_id and self.imdb_id == video.imdb_id: + matches.add('imdb_id') + + # archive name + matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type})) + + # name + matches |= guess_matches(video, guessit(self.name, {'type': self.type})) + + return matches + + +class LegendasTVProvider(Provider): + """LegendasTV Provider. + + :param str username: username. + :param str password: password. + + """ + languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes} + server_url = 'http://legendas.tv/' + + def __init__(self, username=None, password=None): + if username and not password or not username and password: + raise ConfigurationError('Username and password must be specified') + + self.username = username + self.password = password + self.logged_in = False + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ + + # login + if self.username is not None and self.password is not None: + logger.info('Logging in') + data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password} + r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10) + r.raise_for_status() + + soup = ParserBeautifulSoup(r.content, ['html.parser']) + if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')): + raise AuthenticationError(self.username) + + logger.debug('Logged in') + self.logged_in = True + + def terminate(self): + # logout + if self.logged_in: + logger.info('Logging out') + r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10) + r.raise_for_status() + 
logger.debug('Logged out') + self.logged_in = False + + self.session.close() + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) + def search_titles(self, title): + """Search for titles matching the `title`. + + :param str title: the title to search for. + :return: found titles. + :rtype: dict + + """ + # make the query + logger.info('Searching title %r', title) + r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10) + r.raise_for_status() + results = json.loads(r.text) + + # loop over results + titles = {} + for result in results: + source = result['_source'] + + # extract id + title_id = int(source['id_filme']) + + # extract type and title + title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']} + + # extract year + if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit(): + title['year'] = int(source['dsc_data_lancamento']) + + # extract imdb_id + if source['id_imdb'] != '0': + if not source['id_imdb'].startswith('tt'): + title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7) + else: + title['imdb_id'] = source['id_imdb'] + + # extract season + if title['type'] == 'episode': + if source['temporada'] and source['temporada'].isdigit(): + title['season'] = int(source['temporada']) + else: + match = season_re.search(source['dsc_nome_br']) + if match: + title['season'] = int(match.group('season')) + else: + logger.warning('No season detected for title %d', title_id) + + # add title + titles[title_id] = title + + logger.debug('Found %d titles', len(titles)) + + return titles + + @region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds()) + def get_archives(self, title_id, language_code): + """Get the archive list from a given `title_id` and `language_code`. + + :param int title_id: title id. + :param int language_code: language code. + :return: the archives. 
+ :rtype: list of :class:`LegendasTVArchive` + + """ + logger.info('Getting archives for title %d and language %d', title_id, language_code) + archives = [] + page = 1 + while True: + # get the archive page + url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format( + title=title_id, language=language_code, page=page) + r = self.session.get(url) + r.raise_for_status() + + # parse the results + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + for archive_soup in soup.select('div.list_element > article > div'): + # create archive + archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text, + 'pack' in archive_soup['class'], 'destaque' in archive_soup['class'], + self.server_url + archive_soup.a['href'][1:]) + + # extract text containing downloads, rating and timestamp + data_text = archive_soup.find('p', class_='data').text + + # match downloads + archive.downloads = int(downloads_re.search(data_text).group('downloads')) + + # match rating + match = rating_re.search(data_text) + if match: + archive.rating = int(match.group('rating')) + + # match timestamp and validate it + time_data = {k: int(v) for k, v in timestamp_re.search(data_text).groupdict().items()} + archive.timestamp = pytz.timezone('America/Sao_Paulo').localize(datetime(**time_data)) + if archive.timestamp > datetime.utcnow().replace(tzinfo=pytz.utc): + raise ProviderError('Archive timestamp is in the future') + + # add archive + archives.append(archive) + + # stop on last page + if soup.find('a', attrs={'class': 'load_more'}, string='carregar mais') is None: + break + + # increment page count + page += 1 + + logger.debug('Found %d archives', len(archives)) + + return archives + + def download_archive(self, archive): + """Download an archive's :attr:`~LegendasTVArchive.content`. + + :param archive: the archive to download :attr:`~LegendasTVArchive.content` of. 
+ :type archive: :class:`LegendasTVArchive` + + """ + logger.info('Downloading archive %s', archive.id) + r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id)) + r.raise_for_status() + + # open the archive + archive_stream = io.BytesIO(r.content) + if is_rarfile(archive_stream): + logger.debug('Identified rar archive') + archive.content = RarFile(archive_stream) + elif is_zipfile(archive_stream): + logger.debug('Identified zip archive') + archive.content = ZipFile(archive_stream) + else: + raise ValueError('Not a valid archive') + + def query(self, language, title, season=None, episode=None, year=None): + # search for titles + titles = self.search_titles(sanitize(title)) + + # search for titles with the quote or dot character + ignore_characters = {'\'', '.'} + if any(c in title for c in ignore_characters): + titles.update(self.search_titles(sanitize(title, ignore_characters=ignore_characters))) + + subtitles = [] + # iterate over titles + for title_id, t in titles.items(): + # discard mismatches on title + if sanitize(t['title']) != sanitize(title): + continue + + # episode + if season and episode: + # discard mismatches on type + if t['type'] != 'episode': + continue + + # discard mismatches on season + if 'season' not in t or t['season'] != season: + continue + # movie + else: + # discard mismatches on type + if t['type'] != 'movie': + continue + + # discard mismatches on year + if year is not None and 'year' in t and t['year'] != year: + continue + + # iterate over title's archives + for a in self.get_archives(title_id, language.legendastv): + # clean name of path separators and pack flags + clean_name = a.name.replace('/', '-') + if a.pack and clean_name.startswith('(p)'): + clean_name = clean_name[3:] + + # guess from name + guess = guessit(clean_name, {'type': t['type']}) + + # episode + if season and episode: + # discard mismatches on episode in non-pack archives + if not a.pack and 'episode' in guess and guess['episode'] != 
episode: + continue + + # compute an expiration time based on the archive timestamp + expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds() + + # attempt to get the releases from the cache + releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time) + + # the releases are not in cache or cache is expired + if releases == NO_VALUE: + logger.info('Releases not found in cache') + + # download archive + self.download_archive(a) + + # extract the releases + releases = [] + for name in a.content.namelist(): + # discard the legendastv file + if name.startswith('Legendas.tv'): + continue + + # discard hidden files + if os.path.split(name)[-1].startswith('.'): + continue + + # discard non-subtitle files + if not name.lower().endswith(SUBTITLE_EXTENSIONS): + continue + + releases.append(name) + + # cache the releases + region.set(releases_key.format(archive_id=a.id), releases) + + # iterate over releases + for r in releases: + subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'), + t.get('season'), a, r) + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return subtitles + + def list_subtitles(self, video, languages): + season = episode = None + if isinstance(video, Episode): + title = video.series + season = video.season + episode = video.episode + else: + title = video.title + + return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)] + + def download_subtitle(self, subtitle): + # download archive in case we previously hit the releases cache and didn't download it + if subtitle.archive.content is None: + self.download_archive(subtitle.archive) + + # extract subtitle's content + subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name)) diff --git a/libs/subliminal/providers/napiprojekt.py b/libs/subliminal/providers/napiprojekt.py new file mode 100644 index 
00000000..f44f85d9 --- /dev/null +++ b/libs/subliminal/providers/napiprojekt.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +import logging + +from babelfish import Language +from requests import Session + +from . import Provider +from .. import __short_version__ +from ..subtitle import Subtitle + +logger = logging.getLogger(__name__) + + +def get_subhash(hash): + """Get a second hash based on napiprojekt's hash. + + :param str hash: napiprojekt's hash. + :return: the subhash. + :rtype: str + + """ + idx = [0xe, 0x3, 0x6, 0x8, 0x2] + mul = [2, 2, 5, 4, 3] + add = [0, 0xd, 0x10, 0xb, 0x5] + + b = [] + for i in range(len(idx)): + a = add[i] + m = mul[i] + i = idx[i] + t = a + int(hash[i], 16) + v = int(hash[t:t + 2], 16) + b.append(('%x' % (v * m))[-1]) + + return ''.join(b) + + +class NapiProjektSubtitle(Subtitle): + """NapiProjekt Subtitle.""" + provider_name = 'napiprojekt' + + def __init__(self, language, hash): + super(NapiProjektSubtitle, self).__init__(language) + self.hash = hash + + @property + def id(self): + return self.hash + + def get_matches(self, video): + matches = set() + + # hash + if 'napiprojekt' in video.hashes and video.hashes['napiprojekt'] == self.hash: + matches.add('hash') + + return matches + + +class NapiProjektProvider(Provider): + """NapiProjekt Provider.""" + languages = {Language.fromalpha2(l) for l in ['pl']} + required_hash = 'napiprojekt' + server_url = 'http://napiprojekt.pl/unit_napisy/dl.php' + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ + + def terminate(self): + self.session.close() + + def query(self, language, hash): + params = { + 'v': 'dreambox', + 'kolejka': 'false', + 'nick': '', + 'pass': '', + 'napios': 'Linux', + 'l': language.alpha2.upper(), + 'f': hash, + 't': get_subhash(hash)} + logger.info('Searching subtitle %r', params) + response = self.session.get(self.server_url, params=params, timeout=10) + response.raise_for_status() + + # 
handle subtitles not found and errors + if response.content[:4] == b'NPc0': + logger.debug('No subtitles found') + return None + + subtitle = NapiProjektSubtitle(language, hash) + subtitle.content = response.content + logger.debug('Found subtitle %r', subtitle) + + return subtitle + + def list_subtitles(self, video, languages): + return [s for s in [self.query(l, video.hashes['napiprojekt']) for l in languages] if s is not None] + + def download_subtitle(self, subtitle): + # there is no download step, content is already filled from listing subtitles + pass diff --git a/libs/subliminal/providers/opensubtitles.py b/libs/subliminal/providers/opensubtitles.py index 795799d2..5ab09da4 100644 --- a/libs/subliminal/providers/opensubtitles.py +++ b/libs/subliminal/providers/opensubtitles.py @@ -1,31 +1,33 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import base64 import logging import os import re import zlib -import babelfish -import guessit -from . import Provider -from .. import __version__ -from ..compat import ServerProxy, TimeoutTransport -from ..exceptions import ProviderError, AuthenticationError, DownloadLimitExceeded -from ..subtitle import Subtitle, fix_line_endings, compute_guess_matches -from ..video import Episode, Movie +from babelfish import Language, language_converters +from guessit import guessit +from six.moves.xmlrpc_client import ServerProxy + +from . import Provider, TimeoutSafeTransport +from .. 
import __short_version__ +from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError +from ..subtitle import Subtitle, fix_line_ending, guess_matches +from ..utils import sanitize +from ..video import Episode, Movie logger = logging.getLogger(__name__) class OpenSubtitlesSubtitle(Subtitle): + """OpenSubtitles Subtitle.""" provider_name = 'opensubtitles' - series_re = re.compile('^"(?P.*)" (?P.*)$') + series_re = re.compile(r'^"(?P.*)" (?P.*)$') - def __init__(self, language, hearing_impaired, id, matched_by, movie_kind, hash, movie_name, movie_release_name, # @ReservedAssignment - movie_year, movie_imdb_id, series_season, series_episode, page_link): - super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link) - self.id = id + def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name, + movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding): + super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding) + self.subtitle_id = subtitle_id self.matched_by = matched_by self.movie_kind = movie_kind self.hash = hash @@ -35,6 +37,11 @@ class OpenSubtitlesSubtitle(Subtitle): self.movie_imdb_id = movie_imdb_id self.series_season = series_season self.series_episode = series_episode + self.filename = filename + + @property + def id(self): + return str(self.subtitle_id) @property def series_name(self): @@ -44,145 +51,225 @@ class OpenSubtitlesSubtitle(Subtitle): def series_title(self): return self.series_re.match(self.movie_name).group('series_title') - def compute_matches(self, video): + def get_matches(self, video): matches = set() + # episode if isinstance(video, Episode) and self.movie_kind == 'episode': + # tag match, assume series, year, season and episode matches + if self.matched_by == 'tag': + matches |= {'series', 'year', 'season', 'episode'} # series - if video.series and 
self.series_name.lower() == video.series.lower(): + if video.series and sanitize(self.series_name) == sanitize(video.series): matches.add('series') + # year + if video.original_series and self.movie_year is None or video.year and video.year == self.movie_year: + matches.add('year') # season if video.season and self.series_season == video.season: matches.add('season') # episode if video.episode and self.series_episode == video.episode: matches.add('episode') + # title + if video.title and sanitize(self.series_title) == sanitize(video.title): + matches.add('title') # guess - matches |= compute_guess_matches(video, guessit.guess_episode_info(self.movie_release_name + '.mkv')) + matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'episode'})) + matches |= guess_matches(video, guessit(self.filename, {'type': 'episode'})) + # hash + if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']: + if 'series' in matches and 'season' in matches and 'episode' in matches: + matches.add('hash') + else: + logger.debug('Match on hash discarded') # movie elif isinstance(video, Movie) and self.movie_kind == 'movie': + # tag match, assume title and year matches + if self.matched_by == 'tag': + matches |= {'title', 'year'} + # title + if video.title and sanitize(self.movie_name) == sanitize(video.title): + matches.add('title') # year if video.year and self.movie_year == video.year: matches.add('year') # guess - matches |= compute_guess_matches(video, guessit.guess_movie_info(self.movie_release_name + '.mkv')) + matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'movie'})) + matches |= guess_matches(video, guessit(self.filename, {'type': 'movie'})) + # hash + if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']: + if 'title' in matches: + matches.add('hash') + else: + logger.debug('Match on hash discarded') else: - logger.info('%r is not a valid movie_kind for %r', self.movie_kind, video) + 
logger.info('%r is not a valid movie_kind', self.movie_kind) return matches - # hash - if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']: - matches.add('hash') + # imdb_id if video.imdb_id and self.movie_imdb_id == video.imdb_id: matches.add('imdb_id') - # title - if video.title and self.movie_name.lower() == video.title.lower(): - matches.add('title') + return matches class OpenSubtitlesProvider(Provider): - languages = {babelfish.Language.fromopensubtitles(l) for l in babelfish.language_converters['opensubtitles'].codes} + """OpenSubtitles Provider. - def __init__(self): - self.server = ServerProxy('http://api.opensubtitles.org/xml-rpc', transport=TimeoutTransport(10)) + :param str username: username. + :param str password: password. + + """ + languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes} + + def __init__(self, username=None, password=None): + self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10)) + if username and not password or not username and password: + raise ConfigurationError('Username and password must be specified') + # None values not allowed for logging in, so replace it by '' + self.username = username or '' + self.password = password or '' self.token = None def initialize(self): - response = checked(self.server.LogIn('', '', 'eng', 'subliminal v%s' % __version__.split('-')[0])) + logger.info('Logging in') + response = checked(self.server.LogIn(self.username, self.password, 'eng', + 'subliminal v%s' % __short_version__)) self.token = response['token'] + logger.debug('Logged in with token %r', self.token) def terminate(self): + logger.info('Logging out') checked(self.server.LogOut(self.token)) self.server.close() + self.token = None + logger.debug('Logged out') def no_operation(self): + logger.debug('No operation') checked(self.server.NoOperation(self.token)) - def query(self, languages, hash=None, size=None, imdb_id=None, 
query=None, season=None, episode=None): # @ReservedAssignment - searches = [] + def query(self, languages, hash=None, size=None, imdb_id=None, query=None, season=None, episode=None, tag=None): + # fill the search criteria + criteria = [] if hash and size: - searches.append({'moviehash': hash, 'moviebytesize': str(size)}) + criteria.append({'moviehash': hash, 'moviebytesize': str(size)}) if imdb_id: - searches.append({'imdbid': imdb_id}) + criteria.append({'imdbid': imdb_id[2:]}) + if tag: + criteria.append({'tag': tag}) if query and season and episode: - searches.append({'query': query, 'season': season, 'episode': episode}) + criteria.append({'query': query.replace('\'', ''), 'season': season, 'episode': episode}) elif query: - searches.append({'query': query}) - if not searches: - raise ValueError('One or more parameter missing') - for search in searches: - search['sublanguageid'] = ','.join(l.opensubtitles for l in languages) - logger.debug('Searching subtitles %r', searches) - response = checked(self.server.SearchSubtitles(self.token, searches)) + criteria.append({'query': query.replace('\'', '')}) + if not criteria: + raise ValueError('Not enough information') + + # add the language + for criterion in criteria: + criterion['sublanguageid'] = ','.join(sorted(l.opensubtitles for l in languages)) + + # query the server + logger.info('Searching subtitles %r', criteria) + response = checked(self.server.SearchSubtitles(self.token, criteria)) + subtitles = [] + + # exit if no data if not response['data']: - logger.debug('No subtitle found') - return [] - return [OpenSubtitlesSubtitle(babelfish.Language.fromopensubtitles(r['SubLanguageID']), - bool(int(r['SubHearingImpaired'])), r['IDSubtitleFile'], r['MatchedBy'], - r['MovieKind'], r['MovieHash'], r['MovieName'], r['MovieReleaseName'], - int(r['MovieYear']) if r['MovieYear'] else None, int(r['IDMovieImdb']), - int(r['SeriesSeason']) if r['SeriesSeason'] else None, - int(r['SeriesEpisode']) if r['SeriesEpisode'] else 
None, r['SubtitlesLink']) - for r in response['data']] + logger.debug('No subtitles found') + return subtitles + + # loop over subtitle items + for subtitle_item in response['data']: + # read the item + language = Language.fromopensubtitles(subtitle_item['SubLanguageID']) + hearing_impaired = bool(int(subtitle_item['SubHearingImpaired'])) + page_link = subtitle_item['SubtitlesLink'] + subtitle_id = int(subtitle_item['IDSubtitleFile']) + matched_by = subtitle_item['MatchedBy'] + movie_kind = subtitle_item['MovieKind'] + hash = subtitle_item['MovieHash'] + movie_name = subtitle_item['MovieName'] + movie_release_name = subtitle_item['MovieReleaseName'] + movie_year = int(subtitle_item['MovieYear']) if subtitle_item['MovieYear'] else None + movie_imdb_id = 'tt' + subtitle_item['IDMovieImdb'] + series_season = int(subtitle_item['SeriesSeason']) if subtitle_item['SeriesSeason'] else None + series_episode = int(subtitle_item['SeriesEpisode']) if subtitle_item['SeriesEpisode'] else None + filename = subtitle_item['SubFileName'] + encoding = subtitle_item.get('SubEncoding') or None + + subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, + hash, movie_name, movie_release_name, movie_year, movie_imdb_id, + series_season, series_episode, filename, encoding) + logger.debug('Found subtitle %r by %s', subtitle, matched_by) + subtitles.append(subtitle) + + return subtitles def list_subtitles(self, video, languages): - query = None - season = None - episode = None - if ('opensubtitles' not in video.hashes or not video.size) and not video.imdb_id: - query = video.name.split(os.sep)[-1] + season = episode = None if isinstance(video, Episode): query = video.series season = video.season episode = video.episode + else: + query = video.title + return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id, - query=query, season=season, episode=episode) + query=query, season=season, 
episode=episode, tag=os.path.basename(video.name)) def download_subtitle(self, subtitle): - response = checked(self.server.DownloadSubtitles(self.token, [subtitle.id])) - if not response['data']: - raise ProviderError('Nothing to download') - subtitle.content = fix_line_endings(zlib.decompress(base64.b64decode(response['data'][0]['data']), 47)) + logger.info('Downloading subtitle %r', subtitle) + response = checked(self.server.DownloadSubtitles(self.token, [str(subtitle.subtitle_id)])) + subtitle.content = fix_line_ending(zlib.decompress(base64.b64decode(response['data'][0]['data']), 47)) class OpenSubtitlesError(ProviderError): - """Base class for non-generic :class:`OpenSubtitlesProvider` exceptions""" + """Base class for non-generic :class:`OpenSubtitlesProvider` exceptions.""" + pass class Unauthorized(OpenSubtitlesError, AuthenticationError): - """Exception raised when status is '401 Unauthorized'""" + """Exception raised when status is '401 Unauthorized'.""" + pass class NoSession(OpenSubtitlesError, AuthenticationError): - """Exception raised when status is '406 No session'""" + """Exception raised when status is '406 No session'.""" + pass class DownloadLimitReached(OpenSubtitlesError, DownloadLimitExceeded): - """Exception raised when status is '407 Download limit reached'""" + """Exception raised when status is '407 Download limit reached'.""" + pass class InvalidImdbid(OpenSubtitlesError): - """Exception raised when status is '413 Invalid ImdbID'""" + """Exception raised when status is '413 Invalid ImdbID'.""" + pass class UnknownUserAgent(OpenSubtitlesError, AuthenticationError): - """Exception raised when status is '414 Unknown User Agent'""" + """Exception raised when status is '414 Unknown User Agent'.""" + pass class DisabledUserAgent(OpenSubtitlesError, AuthenticationError): - """Exception raised when status is '415 Disabled user agent'""" + """Exception raised when status is '415 Disabled user agent'.""" + pass class 
ServiceUnavailable(OpenSubtitlesError): - """Exception raised when status is '503 Service Unavailable'""" + """Exception raised when status is '503 Service Unavailable'.""" + pass def checked(response): - """Check a response status before returning it + """Check a response status before returning it. - :param response: a response from a XMLRPC call to OpenSubtitles - :return: the response + :param response: a response from a XMLRPC call to OpenSubtitles. + :return: the response. :raise: :class:`OpenSubtitlesError` """ @@ -203,4 +290,5 @@ def checked(response): raise ServiceUnavailable if status_code != 200: raise OpenSubtitlesError(response['status']) + return response diff --git a/libs/subliminal/providers/podnapisi.py b/libs/subliminal/providers/podnapisi.py index 2aa1e7dc..f643682b 100644 --- a/libs/subliminal/providers/podnapisi.py +++ b/libs/subliminal/providers/podnapisi.py @@ -1,47 +1,59 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import io import logging import re -import xml.etree.ElementTree -import zipfile -import babelfish -import bs4 -import guessit -import requests + +from babelfish import Language, language_converters +from guessit import guessit +try: + from lxml import etree +except ImportError: + try: + import xml.etree.cElementTree as etree + except ImportError: + import xml.etree.ElementTree as etree +from requests import Session +from zipfile import ZipFile + from . import Provider -from .. import __version__ +from .. 
import __short_version__ from ..exceptions import ProviderError -from ..subtitle import Subtitle, fix_line_endings, compute_guess_matches +from ..subtitle import Subtitle, fix_line_ending, guess_matches +from ..utils import sanitize from ..video import Episode, Movie - logger = logging.getLogger(__name__) -babelfish.language_converters.register('podnapisi = subliminal.converters.podnapisi:PodnapisiConverter') class PodnapisiSubtitle(Subtitle): + """Podnapisi Subtitle.""" provider_name = 'podnapisi' - def __init__(self, language, id, releases, hearing_impaired, page_link, series=None, season=None, episode=None, # @ReservedAssignment - title=None, year=None): + def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None, + year=None): super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link) - self.id = id + self.pid = pid self.releases = releases - self.hearing_impaired = hearing_impaired - self.series = series + self.title = title self.season = season self.episode = episode - self.title = title self.year = year - def compute_matches(self, video): + @property + def id(self): + return self.pid + + def get_matches(self, video): matches = set() + # episode if isinstance(video, Episode): # series - if video.series and self.series.lower() == video.series.lower(): + if video.series and sanitize(self.title) == sanitize(video.series): matches.add('series') + # year + if video.original_series and self.year is None or video.year and video.year == self.year: + matches.add('year') # season if video.season and self.season == video.season: matches.add('season') @@ -50,105 +62,118 @@ class PodnapisiSubtitle(Subtitle): matches.add('episode') # guess for release in self.releases: - matches |= compute_guess_matches(video, guessit.guess_episode_info(release + '.mkv')) + matches |= guess_matches(video, guessit(release, {'type': 'episode'})) # movie elif isinstance(video, Movie): # title - if video.title and 
self.title.lower() == video.title.lower(): + if video.title and sanitize(self.title) == sanitize(video.title): matches.add('title') + # year + if video.year and self.year == video.year: + matches.add('year') # guess for release in self.releases: - matches |= compute_guess_matches(video, guessit.guess_movie_info(release + '.mkv')) - # year - if self.year == video.year: - matches.add('year') + matches |= guess_matches(video, guessit(release, {'type': 'movie'})) + return matches class PodnapisiProvider(Provider): - languages = {babelfish.Language.frompodnapisi(l) for l in babelfish.language_converters['podnapisi'].codes} - video_types = (Episode, Movie) - server = 'http://simple.podnapisi.net' - link_re = re.compile('^.*(?P/ppodnapisi/download/i/\d+/k/.*$)') + """Podnapisi Provider.""" + languages = ({Language('por', 'BR'), Language('srp', script='Latn')} | + {Language.fromalpha2(l) for l in language_converters['alpha2'].codes}) + server_url = 'http://podnapisi.net/subtitles/' def initialize(self): - self.session = requests.Session() - self.session.headers = {'User-Agent': 'Subliminal/%s' % __version__.split('-')[0]} + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ def terminate(self): self.session.close() - def get(self, url, params=None, is_xml=True): - """Make a GET request on `url` with the given parameters - - :param string url: part of the URL to reach with the leading slash - :param dict params: params of the request - :param bool xml: whether the response content is XML or not - :return: the response - :rtype: :class:`xml.etree.ElementTree.Element` or :class:`bs4.BeautifulSoup` - - """ - r = self.session.get(self.server + '/ppodnapisi' + url, params=params, timeout=10) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - if is_xml: - return xml.etree.ElementTree.fromstring(r.content) - else: - return bs4.BeautifulSoup(r.content, ['permissive']) - - def 
query(self, language, series=None, season=None, episode=None, title=None, year=None): - params = {'sXML': 1, 'sJ': language.podnapisi} - if series and season and episode: - params['sK'] = series + def query(self, language, keyword, season=None, episode=None, year=None): + # set parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652 + params = {'sXML': 1, 'sL': str(language), 'sK': keyword} + is_episode = False + if season and episode: + is_episode = True params['sTS'] = season params['sTE'] = episode - elif title: - params['sK'] = title - else: - raise ValueError('Missing parameters series and season and episode or title') if year: params['sY'] = year - logger.debug('Searching episode %r', params) + + # loop over paginated results + logger.info('Searching subtitles %r', params) subtitles = [] + pids = set() while True: - root = self.get('/search', params) - if not int(root.find('pagination/results').text): - logger.debug('No subtitle found') + # query the server + xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content) + + # exit if no results + if not int(xml.find('pagination/results').text): + logger.debug('No subtitles found') break - if series and season and episode: - subtitles.extend([PodnapisiSubtitle(language, int(s.find('id').text), - s.find('release').text.split() if s.find('release').text else [], - 'n' in (s.find('flags').text or ''), s.find('url').text, - series=series, season=season, episode=episode, - year=s.find('year').text) - for s in root.findall('subtitle')]) - elif title: - subtitles.extend([PodnapisiSubtitle(language, int(s.find('id').text), - s.find('release').text.split() if s.find('release').text else [], - 'n' in (s.find('flags').text or ''), s.find('url').text, - title=title, year=s.find('year').text) - for s in root.findall('subtitle')]) - if int(root.find('pagination/current').text) >= int(root.find('pagination/count').text): + + # loop over subtitles + for 
subtitle_xml in xml.findall('subtitle'): + # read xml elements + language = Language.fromietf(subtitle_xml.find('language').text) + hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '') + page_link = subtitle_xml.find('url').text + pid = subtitle_xml.find('pid').text + releases = [] + if subtitle_xml.find('release').text: + for release in subtitle_xml.find('release').text.split(): + release = re.sub(r'\.+$', '', release) # remove trailing dots + release = ''.join(filter(lambda x: ord(x) < 128, release)) # remove non-ascii characters + releases.append(release) + title = subtitle_xml.find('title').text + season = int(subtitle_xml.find('tvSeason').text) + episode = int(subtitle_xml.find('tvEpisode').text) + year = int(subtitle_xml.find('year').text) + + if is_episode: + subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title, + season=season, episode=episode, year=year) + else: + subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title, + year=year) + + # ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321 + if pid in pids: + continue + + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + pids.add(pid) + + # stop on last page + if int(xml.find('pagination/current').text) >= int(xml.find('pagination/count').text): break - params['page'] = int(root.find('pagination/current').text) + 1 + + # increment current page + params['page'] = int(xml.find('pagination/current').text) + 1 + logger.debug('Getting page %d', params['page']) + return subtitles def list_subtitles(self, video, languages): if isinstance(video, Episode): - return [s for l in languages for s in self.query(l, series=video.series, season=video.season, + return [s for l in languages for s in self.query(l, video.series, season=video.season, episode=video.episode, year=video.year)] elif isinstance(video, Movie): - return [s for l in languages for s in self.query(l, 
title=video.title, year=video.year)] + return [s for l in languages for s in self.query(l, video.title, year=video.year)] def download_subtitle(self, subtitle): - soup = self.get(subtitle.page_link[38:], is_xml=False) - link = soup.find('a', href=self.link_re) - if not link: - raise ProviderError('Cannot find the download link') - r = self.session.get(self.server + self.link_re.match(link['href']).group('link'), timeout=10) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - with zipfile.ZipFile(io.BytesIO(r.content)) as zf: + # download as a zip + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(self.server_url + subtitle.pid + '/download', params={'container': 'zip'}, timeout=10) + r.raise_for_status() + + # open the zip + with ZipFile(io.BytesIO(r.content)) as zf: if len(zf.namelist()) > 1: raise ProviderError('More than one file to unzip') - subtitle.content = fix_line_endings(zf.read(zf.namelist()[0])) + + subtitle.content = fix_line_ending(zf.read(zf.namelist()[0])) diff --git a/libs/subliminal/providers/shooter.py b/libs/subliminal/providers/shooter.py new file mode 100644 index 00000000..fc79faf7 --- /dev/null +++ b/libs/subliminal/providers/shooter.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +import json +import logging +import os + +from babelfish import Language, language_converters +from requests import Session + +from . import Provider +from .. 
import __short_version__ +from ..subtitle import Subtitle, fix_line_ending + +logger = logging.getLogger(__name__) + +language_converters.register('shooter = subliminal.converters.shooter:ShooterConverter') + + +class ShooterSubtitle(Subtitle): + """Shooter Subtitle.""" + provider_name = 'shooter' + + def __init__(self, language, hash, download_link): + super(ShooterSubtitle, self).__init__(language) + self.hash = hash + self.download_link = download_link + + @property + def id(self): + return self.download_link + + def get_matches(self, video): + matches = set() + + # hash + if 'shooter' in video.hashes and video.hashes['shooter'] == self.hash: + matches.add('hash') + + return matches + + +class ShooterProvider(Provider): + """Shooter Provider.""" + languages = {Language(l) for l in ['eng', 'zho']} + server_url = 'https://www.shooter.cn/api/subapi.php' + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ + + def terminate(self): + self.session.close() + + def query(self, language, filename, hash=None): + # query the server + params = {'filehash': hash, 'pathinfo': os.path.realpath(filename), 'format': 'json', 'lang': language.shooter} + logger.debug('Searching subtitles %r', params) + r = self.session.post(self.server_url, params=params, timeout=10) + r.raise_for_status() + + # handle subtitles not found + if r.content == b'\xff': + logger.debug('No subtitles found') + return [] + + # parse the subtitles + results = json.loads(r.text) + subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']] + + return subtitles + + def list_subtitles(self, video, languages): + return [s for l in languages for s in self.query(l, video.name, video.hashes.get('shooter'))] + + def download_subtitle(self, subtitle): + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(subtitle.download_link, timeout=10) + r.raise_for_status() + + subtitle.content = 
fix_line_ending(r.content) diff --git a/libs/subliminal/providers/subscenter.py b/libs/subliminal/providers/subscenter.py new file mode 100644 index 00000000..1e25e5e1 --- /dev/null +++ b/libs/subliminal/providers/subscenter.py @@ -0,0 +1,235 @@ +# -*- coding: utf-8 -*- +import bisect +from collections import defaultdict +import io +import json +import logging +import zipfile + +from babelfish import Language +from guessit import guessit +from requests import Session + +from . import ParserBeautifulSoup, Provider +from .. import __short_version__ +from ..cache import SHOW_EXPIRATION_TIME, region +from ..exceptions import AuthenticationError, ConfigurationError, ProviderError +from ..subtitle import Subtitle, fix_line_ending, guess_matches +from ..utils import sanitize +from ..video import Episode, Movie + +logger = logging.getLogger(__name__) + + +class SubsCenterSubtitle(Subtitle): + """SubsCenter Subtitle.""" + provider_name = 'subscenter' + + def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key, + downloaded, releases): + super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link) + self.series = series + self.season = season + self.episode = episode + self.title = title + self.subtitle_id = subtitle_id + self.subtitle_key = subtitle_key + self.downloaded = downloaded + self.releases = releases + + @property + def id(self): + return str(self.subtitle_id) + + def get_matches(self, video): + matches = set() + + # episode + if isinstance(video, Episode): + # series + if video.series and sanitize(self.series) == sanitize(video.series): + matches.add('series') + # season + if video.season and self.season == video.season: + matches.add('season') + # episode + if video.episode and self.episode == video.episode: + matches.add('episode') + # guess + for release in self.releases: + matches |= guess_matches(video, guessit(release, {'type': 'episode'})) + # movie + elif isinstance(video, 
Movie): + # guess + for release in self.releases: + matches |= guess_matches(video, guessit(release, {'type': 'movie'})) + + # title + if video.title and sanitize(self.title) == sanitize(video.title): + matches.add('title') + + return matches + + +class SubsCenterProvider(Provider): + """SubsCenter Provider.""" + languages = {Language.fromalpha2(l) for l in ['he']} + server_url = 'http://www.subscenter.co/he/' + + def __init__(self, username=None, password=None): + if username is not None and password is None or username is None and password is not None: + raise ConfigurationError('Username and password must be specified') + + self.session = None + self.username = username + self.password = password + self.logged_in = False + + def initialize(self): + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__) + + # login + if self.username is not None and self.password is not None: + logger.debug('Logging in') + url = self.server_url + 'subscenter/accounts/login/' + + # retrieve CSRF token + self.session.get(url) + csrf_token = self.session.cookies['csrftoken'] + + # actual login + data = {'username': self.username, 'password': self.password, 'csrfmiddlewaretoken': csrf_token} + r = self.session.post(url, data, allow_redirects=False, timeout=10) + + if r.status_code != 302: + raise AuthenticationError(self.username) + + logger.info('Logged in') + self.logged_in = True + + def terminate(self): + # logout + if self.logged_in: + logger.info('Logging out') + r = self.session.get(self.server_url + 'subscenter/accounts/logout/', timeout=10) + r.raise_for_status() + logger.info('Logged out') + self.logged_in = False + + self.session.close() + + @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) + def _search_url_titles(self, title): + """Search the URL titles by kind for the given `title`. + + :param str title: title to search for. + :return: the URL titles by kind. 
+ :rtype: collections.defaultdict + + """ + # make the search + logger.info('Searching title name for %r', title) + r = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10) + r.raise_for_status() + + # check for redirections + if r.history and all([h.status_code == 302 for h in r.history]): + logger.debug('Redirected to the subtitles page') + links = [r.url] + else: + # get the suggestions (if needed) + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + links = [link.attrs['href'] for link in soup.select('#processes div.generalWindowTop a')] + logger.debug('Found %d suggestions', len(links)) + + url_titles = defaultdict(list) + for link in links: + parts = link.split('/') + url_titles[parts[-3]].append(parts[-2]) + + return url_titles + + def query(self, title, season=None, episode=None): + # search for the url title + url_titles = self._search_url_titles(title) + + # episode + if season and episode: + if 'series' not in url_titles: + logger.error('No URL title found for series %r', title) + return [] + url_title = url_titles['series'][0] + logger.debug('Using series title %r', url_title) + url = self.server_url + 'cst/data/series/sb/{}/{}/{}/'.format(url_title, season, episode) + page_link = self.server_url + 'subtitle/series/{}/{}/{}/'.format(url_title, season, episode) + else: + if 'movie' not in url_titles: + logger.error('No URL title found for movie %r', title) + return [] + url_title = url_titles['movie'][0] + logger.debug('Using movie title %r', url_title) + url = self.server_url + 'cst/data/movie/sb/{}/'.format(url_title) + page_link = self.server_url + 'subtitle/movie/{}/'.format(url_title) + + # get the list of subtitles + logger.debug('Getting the list of subtitles') + r = self.session.get(url) + r.raise_for_status() + results = json.loads(r.text) + + # loop over results + subtitles = {} + for language_code, language_data in results.items(): + for quality_data in language_data.values(): + for quality, 
subtitles_data in quality_data.items(): + for subtitle_item in subtitles_data.values(): + # read the item + language = Language.fromalpha2(language_code) + hearing_impaired = bool(subtitle_item['hearing_impaired']) + subtitle_id = subtitle_item['id'] + subtitle_key = subtitle_item['key'] + downloaded = subtitle_item['downloaded'] + release = subtitle_item['subtitle_version'] + + # add the release and increment downloaded count if we already have the subtitle + if subtitle_id in subtitles: + logger.debug('Found additional release %r for subtitle %d', release, subtitle_id) + bisect.insort_left(subtitles[subtitle_id].releases, release) # deterministic order + subtitles[subtitle_id].downloaded += downloaded + continue + + # otherwise create it + subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode, + title, subtitle_id, subtitle_key, downloaded, [release]) + logger.debug('Found subtitle %r', subtitle) + subtitles[subtitle_id] = subtitle + + return subtitles.values() + + def list_subtitles(self, video, languages): + season = episode = None + title = video.title + + if isinstance(video, Episode): + title = video.series + season = video.season + episode = video.episode + + return [s for s in self.query(title, season, episode) if s.language in languages] + + def download_subtitle(self, subtitle): + # download + url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id) + params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key} + r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10) + r.raise_for_status() + + # open the zip + with zipfile.ZipFile(io.BytesIO(r.content)) as zf: + # remove some filenames from the namelist + namelist = [n for n in zf.namelist() if not n.endswith('.txt')] + if len(namelist) > 1: + raise ProviderError('More than one file to unzip') + + subtitle.content = fix_line_ending(zf.read(namelist[0])) diff --git 
a/libs/subliminal/providers/thesubdb.py b/libs/subliminal/providers/thesubdb.py index 44623173..6bf4a0eb 100644 --- a/libs/subliminal/providers/thesubdb.py +++ b/libs/subliminal/providers/thesubdb.py @@ -1,72 +1,84 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import logging -import babelfish -import requests -from . import Provider -from .. import __version__ -from ..exceptions import ProviderError -from ..subtitle import Subtitle, fix_line_endings +from babelfish import Language, language_converters +from requests import Session + +from . import Provider +from .. import __short_version__ +from ..subtitle import Subtitle, fix_line_ending logger = logging.getLogger(__name__) +language_converters.register('thesubdb = subliminal.converters.thesubdb:TheSubDBConverter') + class TheSubDBSubtitle(Subtitle): + """TheSubDB Subtitle.""" provider_name = 'thesubdb' - def __init__(self, language, hash): # @ReservedAssignment + def __init__(self, language, hash): super(TheSubDBSubtitle, self).__init__(language) self.hash = hash - def compute_matches(self, video): + @property + def id(self): + return self.hash + '-' + str(self.language) + + def get_matches(self, video): matches = set() + # hash if 'thesubdb' in video.hashes and video.hashes['thesubdb'] == self.hash: matches.add('hash') + return matches class TheSubDBProvider(Provider): - languages = {babelfish.Language.fromalpha2(l) for l in ['en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ro', 'sv', 'tr']} + """TheSubDB Provider.""" + languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes} required_hash = 'thesubdb' + server_url = 'http://api.thesubdb.com/' def initialize(self): - self.session = requests.Session() - self.session.headers = {'User-Agent': 'SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)' % - __version__.split('-')[0]} + self.session = Session() + self.session.headers['User-Agent'] = ('SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)' % 
+ __short_version__) def terminate(self): self.session.close() - def get(self, params): - """Make a GET request on the server with the given parameters - - :param params: params of the request - :return: the response - :rtype: :class:`requests.Response` - - """ - return self.session.get('http://api.thesubdb.com', params=params, timeout=10) - - def query(self, hash): # @ReservedAssignment + def query(self, hash): + # make the query params = {'action': 'search', 'hash': hash} - logger.debug('Searching subtitles %r', params) - r = self.get(params) + logger.info('Searching subtitles %r', params) + r = self.session.get(self.server_url, params=params, timeout=10) + + # handle subtitles not found and errors if r.status_code == 404: - logger.debug('No subtitle found') + logger.debug('No subtitles found') return [] - elif r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - return [TheSubDBSubtitle(language, hash) for language in - {babelfish.Language.fromalpha2(l) for l in r.content.decode('utf-8').split(',')}] + r.raise_for_status() + + # loop over languages + subtitles = [] + for language_code in r.text.split(','): + language = Language.fromthesubdb(language_code) + + subtitle = TheSubDBSubtitle(language, hash) + logger.debug('Found subtitle %r', subtitle) + subtitles.append(subtitle) + + return subtitles def list_subtitles(self, video, languages): return [s for s in self.query(video.hashes['thesubdb']) if s.language in languages] def download_subtitle(self, subtitle): + logger.info('Downloading subtitle %r', subtitle) params = {'action': 'download', 'hash': subtitle.hash, 'language': subtitle.language.alpha2} - r = self.get(params) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - subtitle.content = fix_line_endings(r.content) + r = self.session.get(self.server_url, params=params, timeout=10) + r.raise_for_status() + + subtitle.content = fix_line_ending(r.content) diff 
--git a/libs/subliminal/providers/tvsubtitles.py b/libs/subliminal/providers/tvsubtitles.py index 3f21928b..ec033ee7 100644 --- a/libs/subliminal/providers/tvsubtitles.py +++ b/libs/subliminal/providers/tvsubtitles.py @@ -1,41 +1,53 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals import io import logging import re -import zipfile -import babelfish -import bs4 -import requests -from . import Provider -from .. import __version__ -from ..cache import region, SHOW_EXPIRATION_TIME, EPISODE_EXPIRATION_TIME +from zipfile import ZipFile + +from babelfish import Language, language_converters +from guessit import guessit +from requests import Session + +from . import ParserBeautifulSoup, Provider +from .. import __short_version__ +from ..cache import EPISODE_EXPIRATION_TIME, SHOW_EXPIRATION_TIME, region from ..exceptions import ProviderError -from ..subtitle import Subtitle, fix_line_endings, compute_guess_properties_matches +from ..score import get_equivalent_release_groups +from ..subtitle import Subtitle, fix_line_ending, guess_matches +from ..utils import sanitize, sanitize_release_group from ..video import Episode - logger = logging.getLogger(__name__) -babelfish.language_converters.register('tvsubtitles = subliminal.converters.tvsubtitles:TVsubtitlesConverter') + +language_converters.register('tvsubtitles = subliminal.converters.tvsubtitles:TVsubtitlesConverter') + +link_re = re.compile(r'^(?P.+?)(?: \(?\d{4}\)?| \((?:US|UK)\))? 
\((?P\d{4})-\d{4}\)$') +episode_id_re = re.compile(r'^episode-\d+\.html$') class TVsubtitlesSubtitle(Subtitle): + """TVsubtitles Subtitle.""" provider_name = 'tvsubtitles' - def __init__(self, language, series, season, episode, year, id, rip, release, page_link): # @ReservedAssignment + def __init__(self, language, page_link, subtitle_id, series, season, episode, year, rip, release): super(TVsubtitlesSubtitle, self).__init__(language, page_link=page_link) + self.subtitle_id = subtitle_id self.series = series self.season = season self.episode = episode self.year = year - self.id = id self.rip = rip self.release = release - def compute_matches(self, video): + @property + def id(self): + return str(self.subtitle_id) + + def get_matches(self, video): matches = set() + # series - if video.series and self.series == video.series: + if video.series and sanitize(self.series) == sanitize(video.series): matches.add('series') # season if video.season and self.season == video.season: @@ -44,148 +56,155 @@ class TVsubtitlesSubtitle(Subtitle): if video.episode and self.episode == video.episode: matches.add('episode') # year - if self.year == video.year: + if video.original_series and self.year is None or video.year and video.year == self.year: matches.add('year') # release_group - if video.release_group and self.release and video.release_group.lower() in self.release.lower(): + if (video.release_group and self.release and + any(r in sanitize_release_group(self.release) + for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))): matches.add('release_group') - """ - # video_codec - if video.video_codec and self.release and (video.video_codec in self.release.lower() - or video.video_codec == 'h264' and 'x264' in self.release.lower()): - matches.add('video_codec') - # resolution - if video.resolution and self.rip and video.resolution in self.rip.lower(): - matches.add('resolution') - # format - if video.format and self.rip and video.format in 
self.rip.lower(): - matches.add('format') - """ - # we don't have the complete filename, so we need to guess the matches separately - # guess video_codec (videoCodec in guessit) - matches |= compute_guess_properties_matches(video, self.release, 'videoCodec') - # guess resolution (screenSize in guessit) - matches |= compute_guess_properties_matches(video, self.rip, 'screenSize') - # guess format - matches |= compute_guess_properties_matches(video, self.rip, 'format') + # other properties + if self.release: + matches |= guess_matches(video, guessit(self.release, {'type': 'episode'}), partial=True) + if self.rip: + matches |= guess_matches(video, guessit(self.rip), partial=True) + return matches class TVsubtitlesProvider(Provider): - languages = {babelfish.Language('por', 'BR')} | {babelfish.Language(l) - for l in ['ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun', 'ita', 'jpn', 'kor', - 'nld', 'pol', 'por', 'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho']} + """TVsubtitles Provider.""" + languages = {Language('por', 'BR')} | {Language(l) for l in [ + 'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por', + 'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho' + ]} video_types = (Episode,) - server = 'http://www.tvsubtitles.net' - episode_id_re = re.compile('^episode-\d+\.html$') - subtitle_re = re.compile('^\/subtitle-\d+\.html$') - link_re = re.compile('^(?P[A-Za-z0-9 \'.]+).*\((?P\d{4})-\d{4}\)$') + server_url = 'http://www.tvsubtitles.net/' def initialize(self): - self.session = requests.Session() - self.session.headers = {'User-Agent': 'Subliminal/%s' % __version__.split('-')[0]} + self.session = Session() + self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__ def terminate(self): self.session.close() - def request(self, url, params=None, data=None, method='GET'): - """Make a `method` request on `url` with the given parameters - - :param string url: part of the URL to reach 
with the leading slash - :param dict params: params of the request - :param dict data: data of the request - :param string method: method of the request - :return: the response - :rtype: :class:`bs4.BeautifulSoup` - - """ - r = self.session.request(method, self.server + url, params=params, data=data, timeout=10) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - return bs4.BeautifulSoup(r.content, ['permissive']) - @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME) - def find_show_id(self, series, year=None): - """Find the show id from the `series` with optional `year` + def search_show_id(self, series, year=None): + """Search the show id from the `series` and `year`. - :param string series: series of the episode in lowercase - :param year: year of the series, if any - :type year: int or None - :return: the show id, if any - :rtype: int or None + :param str series: series of the episode. + :param year: year of the series, if any. + :type year: int + :return: the show id, if any. 
+ :rtype: int """ - data = {'q': series} - logger.debug('Searching series %r', data) - soup = self.request('/search.php', data=data, method='POST') - links = soup.select('div.left li div a[href^="/tvshow-"]') - if not links: - logger.info('Series %r not found', series) - return None - matched_links = [link for link in links if self.link_re.match(link.string)] - for link in matched_links: # first pass with exact match on series - match = self.link_re.match(link.string) - if match.group('series').lower().replace('.', ' ').strip() == series: + # make the search + logger.info('Searching show id for %r', series) + r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10) + r.raise_for_status() + + # get the series out of the suggestions + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + show_id = None + for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'): + match = link_re.match(suggestion.text) + if not match: + logger.error('Failed to match %s', suggestion.text) + continue + + if match.group('series').lower() == series.lower(): if year is not None and int(match.group('first_year')) != year: + logger.debug('Year does not match') continue - return int(link['href'][8:-5]) - for link in matched_links: # less selective second pass - match = self.link_re.match(link.string) - if match.group('series').lower().replace('.', ' ').strip().startswith(series): - if year is not None and int(match.group('first_year')) != year: - continue - return int(link['href'][8:-5]) - return None + show_id = int(suggestion['href'][8:-5]) + logger.debug('Found show id %d', show_id) + break + + return show_id @region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME) - def find_episode_ids(self, show_id, season): - """Find episode ids from the show id and the season + def get_episode_ids(self, show_id, season): + """Get episode ids from the show id and the season. 
- :param int show_id: show id - :param int season: season of the episode - :return: episode ids per episode number + :param int show_id: show id. + :param int season: season of the episode. + :return: episode ids per episode number. :rtype: dict """ - params = {'show_id': show_id, 'season': season} - logger.debug('Searching episodes %r', params) - soup = self.request('/tvshow-{show_id}-{season}.html'.format(**params)) + # get the page of the season of the show + logger.info('Getting the page of show id %d, season %d', show_id, season) + r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season), timeout=10) + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # loop over episode rows episode_ids = {} for row in soup.select('table#table5 tr'): - if not row('a', href=self.episode_id_re): + # skip rows that do not have a link to the episode page + if not row('a', href=episode_id_re): continue + + # extract data from the cells cells = row('td') - episode_ids[int(cells[0].string.split('x')[1])] = int(cells[1].a['href'][8:-5]) + episode = int(cells[0].text.split('x')[1]) + episode_id = int(cells[1].a['href'][8:-5]) + episode_ids[episode] = episode_id + + if episode_ids: + logger.debug('Found episode ids %r', episode_ids) + else: + logger.warning('No episode ids found') + return episode_ids def query(self, series, season, episode, year=None): - show_id = self.find_show_id(series.lower(), year) + # search the show id + show_id = self.search_show_id(series, year) if show_id is None: + logger.error('No show id found for %r (%r)', series, {'year': year}) return [] - episode_ids = self.find_episode_ids(show_id, season) + + # get the episode ids + episode_ids = self.get_episode_ids(show_id, season) if episode not in episode_ids: - logger.info('Episode %d not found', episode) + logger.error('Episode %d not found', episode) return [] - params = {'episode_id': episode_ids[episode]} - logger.debug('Searching episode %r', params) - link = 
'/episode-{episode_id}.html'.format(**params) - soup = self.request(link) - return [TVsubtitlesSubtitle(babelfish.Language.fromtvsubtitles(row.h5.img['src'][13:-4]), series, season, - episode, year if year and show_id != self.find_show_id(series.lower()) else None, - int(row['href'][10:-5]), row.find('p', title='rip').text.strip() or None, - row.find('p', title='release').text.strip() or None, - self.server + '/subtitle-%d.html' % int(row['href'][10:-5])) - for row in soup('a', href=self.subtitle_re)] + + # get the episode page + logger.info('Getting the page for episode %d', episode_ids[episode]) + r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10) + soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) + + # loop over subtitles rows + subtitles = [] + for row in soup.select('.subtitlen'): + # read the item + language = Language.fromtvsubtitles(row.h5.img['src'][13:-4]) + subtitle_id = int(row.parent['href'][10:-5]) + page_link = self.server_url + 'subtitle-%d.html' % subtitle_id + rip = row.find('p', title='rip').text.strip() or None + release = row.find('p', title='release').text.strip() or None + + subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip, + release) + logger.debug('Found subtitle %s', subtitle) + subtitles.append(subtitle) + + return subtitles def list_subtitles(self, video, languages): return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages] def download_subtitle(self, subtitle): - r = self.session.get(self.server + '/download-{subtitle_id}.html'.format(subtitle_id=subtitle.id), - timeout=10) - if r.status_code != 200: - raise ProviderError('Request failed with status code %d' % r.status_code) - with zipfile.ZipFile(io.BytesIO(r.content)) as zf: + # download as a zip + logger.info('Downloading subtitle %r', subtitle) + r = self.session.get(self.server_url + 'download-%d.html' % 
subtitle.subtitle_id, timeout=10) + r.raise_for_status() + + # open the zip + with ZipFile(io.BytesIO(r.content)) as zf: if len(zf.namelist()) > 1: raise ProviderError('More than one file to unzip') - subtitle.content = fix_line_endings(zf.read(zf.namelist()[0])) + + subtitle.content = fix_line_ending(zf.read(zf.namelist()[0])) diff --git a/libs/subliminal/refiners/__init__.py b/libs/subliminal/refiners/__init__.py new file mode 100644 index 00000000..bbb8d3ef --- /dev/null +++ b/libs/subliminal/refiners/__init__.py @@ -0,0 +1,12 @@ +""" +Refiners enrich a :class:`~subliminal.video.Video` object by adding information to it. + +A refiner is a simple function: + +.. py:function:: refine(video, **kwargs) + + :param video: the video to refine. + :type video: :class:`~subliminal.video.Video` + :param \*\*kwargs: additional parameters for refiners. + +""" diff --git a/libs/subliminal/refiners/metadata.py b/libs/subliminal/refiners/metadata.py new file mode 100644 index 00000000..a8408742 --- /dev/null +++ b/libs/subliminal/refiners/metadata.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +import logging +import os + +from babelfish import Error as BabelfishError, Language +from enzyme import MKV + +logger = logging.getLogger(__name__) + + +def refine(video, embedded_subtitles=True, **kwargs): + """Refine a video by searching its metadata. + + Several :class:`~subliminal.video.Video` attributes can be found: + + * :attr:`~subliminal.video.Video.resolution` + * :attr:`~subliminal.video.Video.video_codec` + * :attr:`~subliminal.video.Video.audio_codec` + * :attr:`~subliminal.video.Video.subtitle_languages` + + :param bool embedded_subtitles: search for embedded subtitles. 
+ + """ + # skip non existing videos + if not video.exists: + return + + # check extensions + extension = os.path.splitext(video.name)[1] + if extension == '.mkv': + with open(video.name, 'rb') as f: + mkv = MKV(f) + + # main video track + if mkv.video_tracks: + video_track = mkv.video_tracks[0] + + # resolution + if video_track.height in (480, 720, 1080): + if video_track.interlaced: + video.resolution = '%di' % video_track.height + else: + video.resolution = '%dp' % video_track.height + logger.debug('Found resolution %s', video.resolution) + + # video codec + if video_track.codec_id == 'V_MPEG4/ISO/AVC': + video.video_codec = 'h264' + logger.debug('Found video_codec %s', video.video_codec) + elif video_track.codec_id == 'V_MPEG4/ISO/SP': + video.video_codec = 'DivX' + logger.debug('Found video_codec %s', video.video_codec) + elif video_track.codec_id == 'V_MPEG4/ISO/ASP': + video.video_codec = 'XviD' + logger.debug('Found video_codec %s', video.video_codec) + else: + logger.warning('MKV has no video track') + + # main audio track + if mkv.audio_tracks: + audio_track = mkv.audio_tracks[0] + # audio codec + if audio_track.codec_id == 'A_AC3': + video.audio_codec = 'AC3' + logger.debug('Found audio_codec %s', video.audio_codec) + elif audio_track.codec_id == 'A_DTS': + video.audio_codec = 'DTS' + logger.debug('Found audio_codec %s', video.audio_codec) + elif audio_track.codec_id == 'A_AAC': + video.audio_codec = 'AAC' + logger.debug('Found audio_codec %s', video.audio_codec) + else: + logger.warning('MKV has no audio track') + + # subtitle tracks + if mkv.subtitle_tracks: + if embedded_subtitles: + embedded_subtitle_languages = set() + for st in mkv.subtitle_tracks: + if st.language: + try: + embedded_subtitle_languages.add(Language.fromalpha3b(st.language)) + except BabelfishError: + logger.error('Embedded subtitle track language %r is not a valid language', st.language) + embedded_subtitle_languages.add(Language('und')) + elif st.name: + try: + 
embedded_subtitle_languages.add(Language.fromname(st.name)) + except BabelfishError: + logger.debug('Embedded subtitle track name %r is not a valid language', st.name) + embedded_subtitle_languages.add(Language('und')) + else: + embedded_subtitle_languages.add(Language('und')) + logger.debug('Found embedded subtitle %r', embedded_subtitle_languages) + video.subtitle_languages |= embedded_subtitle_languages + else: + logger.debug('MKV has no subtitle track') + else: + logger.debug('Unsupported video extension %s', extension) diff --git a/libs/subliminal/refiners/omdb.py b/libs/subliminal/refiners/omdb.py new file mode 100644 index 00000000..e2514ae9 --- /dev/null +++ b/libs/subliminal/refiners/omdb.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +import logging +import operator + +import requests + +from .. import __short_version__ +from ..cache import REFINER_EXPIRATION_TIME, region +from ..video import Episode, Movie +from ..utils import sanitize + +logger = logging.getLogger(__name__) + + +class OMDBClient(object): + base_url = 'http://www.omdbapi.com' + + def __init__(self, version=1, session=None, headers=None, timeout=10): + #: Session for the requests + self.session = session or requests.Session() + self.session.timeout = timeout + self.session.headers.update(headers or {}) + self.session.params['r'] = 'json' + self.session.params['v'] = version + + def get(self, id=None, title=None, type=None, year=None, plot='short', tomatoes=False): + # build the params + params = {} + if id: + params['i'] = id + if title: + params['t'] = title + if not params: + raise ValueError('At least id or title is required') + params['type'] = type + params['y'] = year + params['plot'] = plot + params['tomatoes'] = tomatoes + + # perform the request + r = self.session.get(self.base_url, params=params) + r.raise_for_status() + + # get the response as json + j = r.json() + + # check response status + if j['Response'] == 'False': + return None + + return j + + def search(self, title, 
type=None, year=None, page=1): + # build the params + params = {'s': title, 'type': type, 'y': year, 'page': page} + + # perform the request + r = self.session.get(self.base_url, params=params) + r.raise_for_status() + + # get the response as json + j = r.json() + + # check response status + if j['Response'] == 'False': + return None + + return j + + +omdb_client = OMDBClient(headers={'User-Agent': 'Subliminal/%s' % __short_version__}) + + +@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME) +def search(title, type, year): + results = omdb_client.search(title, type, year) + if not results: + return None + + # fetch all paginated results + all_results = results['Search'] + total_results = int(results['totalResults']) + page = 1 + while total_results > page * 10: + page += 1 + results = omdb_client.search(title, type, year, page=page) + all_results.extend(results['Search']) + + return all_results + + +def refine(video, **kwargs): + """Refine a video by searching `OMDb API `_. 
+ + Several :class:`~subliminal.video.Episode` attributes can be found: + + * :attr:`~subliminal.video.Episode.series` + * :attr:`~subliminal.video.Episode.year` + * :attr:`~subliminal.video.Episode.series_imdb_id` + + Similarly, for a :class:`~subliminal.video.Movie`: + + * :attr:`~subliminal.video.Movie.title` + * :attr:`~subliminal.video.Movie.year` + * :attr:`~subliminal.video.Video.imdb_id` + + """ + if isinstance(video, Episode): + # exit if the information is complete + if video.series_imdb_id: + logger.debug('No need to search') + return + + # search the series + results = search(video.series, 'series', video.year) + if not results: + logger.warning('No results for series') + return + logger.debug('Found %d results', len(results)) + + # filter the results + results = [r for r in results if sanitize(r['Title']) == sanitize(video.series)] + if not results: + logger.warning('No matching series found') + return + + # process the results + found = False + for result in sorted(results, key=operator.itemgetter('Year')): + if video.original_series and video.year is None: + logger.debug('Found result for original series without year') + found = True + break + if video.year == int(result['Year'].split(u'\u2013')[0]): + logger.debug('Found result with matching year') + found = True + break + + if not found: + logger.warning('No matching series found') + return + + # add series information + logger.debug('Found series %r', result) + video.series = result['Title'] + video.year = int(result['Year'].split(u'\u2013')[0]) + video.series_imdb_id = result['imdbID'] + + elif isinstance(video, Movie): + # exit if the information is complete + if video.imdb_id: + return + + # search the movie + results = search(video.title, 'movie', video.year) + if not results: + logger.warning('No results') + return + logger.debug('Found %d results', len(results)) + + # filter the results + results = [r for r in results if sanitize(r['Title']) == sanitize(video.title)] + if not results: + 
logger.warning('No matching movie found') + return + + # process the results + found = False + for result in results: + if video.year is None: + logger.debug('Found result for movie without year') + found = True + break + if video.year == int(result['Year']): + logger.debug('Found result with matching year') + found = True + break + + if not found: + logger.warning('No matching movie found') + return + + # add movie information + logger.debug('Found movie %r', result) + video.title = result['Title'] + video.year = int(result['Year'].split(u'\u2013')[0]) + video.imdb_id = result['imdbID'] diff --git a/libs/subliminal/refiners/tvdb.py b/libs/subliminal/refiners/tvdb.py new file mode 100644 index 00000000..1828e5cf --- /dev/null +++ b/libs/subliminal/refiners/tvdb.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +from datetime import datetime, timedelta +from functools import wraps +import logging +import re + +import requests + +from .. import __short_version__ +from ..cache import REFINER_EXPIRATION_TIME, region +from ..utils import sanitize +from ..video import Episode + +logger = logging.getLogger(__name__) + +series_re = re.compile(r'^(?P.*?)(?: \((?:(?P\d{4})|(?P[A-Z]{2}))\))?$') + + +def requires_auth(func): + """Decorator for :class:`TVDBClient` methods that require authentication""" + @wraps(func) + def wrapper(self, *args, **kwargs): + if self.token is None or self.token_expired: + self.login() + elif self.token_needs_refresh: + self.refresh_token() + return func(self, *args, **kwargs) + return wrapper + + +class TVDBClient(object): + """TVDB REST API Client + + :param str apikey: API key to use. + :param str username: username to use. + :param str password: password to use. + :param str language: language of the responses. + :param session: session object to use. + :type session: :class:`requests.sessions.Session` or compatible. + :param dict headers: additional headers. + :param int timeout: timeout for the requests. 
+ + """ + #: Base URL of the API + base_url = 'https://api.thetvdb.com' + + #: Token lifespan + token_lifespan = timedelta(hours=1) + + #: Minimum token age before a :meth:`refresh_token` is triggered + refresh_token_every = timedelta(minutes=30) + + def __init__(self, apikey=None, username=None, password=None, language='en', session=None, headers=None, + timeout=10): + #: API key + self.apikey = apikey + + #: Username + self.username = username + + #: Password + self.password = password + + #: Last token acquisition date + self.token_date = datetime.utcnow() - self.token_lifespan + + #: Session for the requests + self.session = session or requests.Session() + self.session.timeout = timeout + self.session.headers.update(headers or {}) + self.session.headers['Content-Type'] = 'application/json' + self.session.headers['Accept-Language'] = language + + @property + def language(self): + return self.session.headers['Accept-Language'] + + @language.setter + def language(self, value): + self.session.headers['Accept-Language'] = value + + @property + def token(self): + if 'Authorization' not in self.session.headers: + return None + return self.session.headers['Authorization'][7:] + + @property + def token_expired(self): + return datetime.utcnow() - self.token_date > self.token_lifespan + + @property + def token_needs_refresh(self): + return datetime.utcnow() - self.token_date > self.refresh_token_every + + def login(self): + """Login""" + # perform the request + data = {'apikey': self.apikey, 'username': self.username, 'password': self.password} + r = self.session.post(self.base_url + '/login', json=data) + r.raise_for_status() + + # set the Authorization header + self.session.headers['Authorization'] = 'Bearer ' + r.json()['token'] + + # update token_date + self.token_date = datetime.utcnow() + + def refresh_token(self): + """Refresh token""" + # perform the request + r = self.session.get(self.base_url + '/refresh_token') + r.raise_for_status() + + # set the Authorization 
header + self.session.headers['Authorization'] = 'Bearer ' + r.json()['token'] + + # update token_date + self.token_date = datetime.utcnow() + + @requires_auth + def search_series(self, name=None, imdb_id=None, zap2it_id=None): + """Search series""" + # perform the request + params = {'name': name, 'imdbId': imdb_id, 'zap2itId': zap2it_id} + r = self.session.get(self.base_url + '/search/series', params=params) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json()['data'] + + @requires_auth + def get_series(self, id): + """Get series""" + # perform the request + r = self.session.get(self.base_url + '/series/{}'.format(id)) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json()['data'] + + @requires_auth + def get_series_actors(self, id): + """Get series actors""" + # perform the request + r = self.session.get(self.base_url + '/series/{}/actors'.format(id)) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json()['data'] + + @requires_auth + def get_series_episodes(self, id, page=1): + """Get series episodes""" + # perform the request + params = {'page': page} + r = self.session.get(self.base_url + '/series/{}/episodes'.format(id), params=params) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json() + + @requires_auth + def query_series_episodes(self, id, absolute_number=None, aired_season=None, aired_episode=None, dvd_season=None, + dvd_episode=None, imdb_id=None, page=1): + """Query series episodes""" + # perform the request + params = {'absoluteNumber': absolute_number, 'airedSeason': aired_season, 'airedEpisode': aired_episode, + 'dvdSeason': dvd_season, 'dvdEpisode': dvd_episode, 'imdbId': imdb_id, 'page': page} + r = self.session.get(self.base_url + '/series/{}/episodes/query'.format(id), params=params) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json() + + @requires_auth + def get_episode(self, id): + """Get 
episode""" + # perform the request + r = self.session.get(self.base_url + '/episodes/{}'.format(id)) + if r.status_code == 404: + return None + r.raise_for_status() + + return r.json()['data'] + + +#: Configured instance of :class:`TVDBClient` +tvdb_client = TVDBClient('5EC930FB90DA1ADA', headers={'User-Agent': 'Subliminal/%s' % __short_version__}) + + +@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME) +def search_series(name): + """Search series. + + :param str name: name of the series. + :return: the search results. + :rtype: list + + """ + return tvdb_client.search_series(name) + + +@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME) +def get_series(id): + """Get series. + + :param int id: id of the series. + :return: the series data. + :rtype: dict + + """ + return tvdb_client.get_series(id) + + +@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME) +def get_series_episode(series_id, season, episode): + """Get an episode of a series. + + :param int series_id: id of the series. + :param int season: season number of the episode. + :param int episode: episode number of the episode. + :return: the episode data. + :rtype: dict + + """ + result = tvdb_client.query_series_episodes(series_id, aired_season=season, aired_episode=episode) + if result: + return tvdb_client.get_episode(result['data'][0]['id']) + + +def refine(video, **kwargs): + """Refine a video by searching `TheTVDB `_. + + .. note:: + + This refiner only work for instances of :class:`~subliminal.video.Episode`. 
+ + Several attributes can be found: + + * :attr:`~subliminal.video.Episode.series` + * :attr:`~subliminal.video.Episode.year` + * :attr:`~subliminal.video.Episode.series_imdb_id` + * :attr:`~subliminal.video.Episode.series_tvdb_id` + * :attr:`~subliminal.video.Episode.title` + * :attr:`~subliminal.video.Video.imdb_id` + * :attr:`~subliminal.video.Episode.tvdb_id` + + """ + # only deal with Episode videos + if not isinstance(video, Episode): + logger.error('Cannot refine episodes') + return + + # exit if the information is complete + if video.series_tvdb_id and video.tvdb_id: + logger.debug('No need to search') + return + + # search the series + logger.info('Searching series %r', video.series) + results = search_series(video.series.lower()) + if not results: + logger.warning('No results for series') + return + logger.debug('Found %d results', len(results)) + + # search for exact matches + matching_results = [] + for result in results: + matching_result = {} + + # use seriesName and aliases + series_names = [result['seriesName']] + series_names.extend(result['aliases']) + + # parse the original series as series + year or country + original_match = series_re.match(result['seriesName']).groupdict() + + # parse series year + series_year = None + if result['firstAired']: + series_year = datetime.strptime(result['firstAired'], '%Y-%m-%d').year + + # discard mismatches on year + if video.year and series_year and video.year != series_year: + logger.debug('Discarding series %r mismatch on year %d', result['seriesName'], series_year) + continue + + # iterate over series names + for series_name in series_names: + # parse as series and year + series, year, country = series_re.match(series_name).groups() + if year: + year = int(year) + + # discard mismatches on year + if year and (video.original_series or video.year != year): + logger.debug('Discarding series name %r mismatch on year %d', series, year) + continue + + # match on sanitized series name + if sanitize(series) == 
sanitize(video.series): + logger.debug('Found exact match on series %r', series_name) + matching_result['match'] = {'series': original_match['series'], 'year': series_year, + 'original_series': original_match['year'] is None} + break + + # add the result on match + if matching_result: + matching_result['data'] = result + matching_results.append(matching_result) + + # exit if we don't have exactly 1 matching result + if not matching_results: + logger.error('No matching series found') + return + if len(matching_results) > 1: + logger.error('Multiple matches found') + return + + # get the series + matching_result = matching_results[0] + series = get_series(matching_result['data']['id']) + + # add series information + logger.debug('Found series %r', series) + video.series = matching_result['match']['series'] + video.year = matching_result['match']['year'] + video.original_series = matching_result['match']['original_series'] + video.series_tvdb_id = series['id'] + video.series_imdb_id = series['imdbId'] or None + + # get the episode + logger.info('Getting series episode %dx%d', video.season, video.episode) + episode = get_series_episode(video.series_tvdb_id, video.season, video.episode) + if not episode: + logger.warning('No results for episode') + return + + # add episode information + logger.debug('Found episode %r', episode) + video.tvdb_id = episode['id'] + video.title = episode['episodeName'] or None + video.imdb_id = episode['imdbId'] or None diff --git a/libs/subliminal/score.py b/libs/subliminal/score.py index f9dcaede..31ccb343 100755 --- a/libs/subliminal/score.py +++ b/libs/subliminal/score.py @@ -1,90 +1,234 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- -from __future__ import print_function, unicode_literals -from sympy import Eq, symbols, solve +""" +This module provides the default implementation of the `compute_score` parameter in +:meth:`~subliminal.core.ProviderPool.download_best_subtitles` and :func:`~subliminal.core.download_best_subtitles`. 
+ +.. note:: + + To avoid unnecessary dependency on `sympy `_ and boost subliminal's import time, the + resulting scores are hardcoded here and manually updated when the set of equations change. + +Available matches: + + * hash + * title + * year + * series + * season + * episode + * release_group + * format + * audio_codec + * resolution + * hearing_impaired + * video_codec + * series_imdb_id + * imdb_id + * tvdb_id + +""" +from __future__ import division, print_function +import logging + +from .video import Episode, Movie + +logger = logging.getLogger(__name__) -# Symbols -release_group, resolution, format, video_codec, audio_codec = symbols('release_group resolution format video_codec audio_codec') -imdb_id, hash, title, series, tvdb_id, season, episode = symbols('imdb_id hash title series tvdb_id season episode') # @ReservedAssignment -year = symbols('year') +#: Scores for episodes +episode_scores = {'hash': 359, 'series': 180, 'year': 90, 'season': 30, 'episode': 30, 'release_group': 15, + 'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1} + +#: Scores for movies +movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15, + 'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1} + +#: Equivalent release groups +equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}) -def get_episode_equations(): - """Get the score equations for a :class:`~subliminal.video.Episode` +def get_equivalent_release_groups(release_group): + """Get all the equivalents of the given release group. - The equations are the following: - - 1. hash = resolution + format + video_codec + audio_codec + series + season + episode + year + release_group - 2. series = resolution + video_codec + audio_codec + season + episode + release_group + 1 - 3. year = series - 4. tvdb_id = series + year - 5. season = resolution + video_codec + audio_codec + 1 - 6. 
imdb_id = series + season + episode + year - 7. format = video_codec + audio_codec - 8. resolution = video_codec - 9. video_codec = 2 * audio_codec - 10. title = season + episode - 11. season = episode - 12. release_group = season - 13. audio_codec = 1 - - :return: the score equations for an episode - :rtype: list of :class:`sympy.Eq` + :param str release_group: the release group to get the equivalents of. + :return: the equivalent release groups. + :rtype: set """ - equations = [] - equations.append(Eq(hash, resolution + format + video_codec + audio_codec + series + season + episode + year + release_group)) - equations.append(Eq(series, resolution + video_codec + audio_codec + season + episode + release_group + 1)) - equations.append(Eq(series, year)) - equations.append(Eq(tvdb_id, series + year)) - equations.append(Eq(season, resolution + video_codec + audio_codec + 1)) - equations.append(Eq(imdb_id, series + season + episode + year)) - equations.append(Eq(format, video_codec + audio_codec)) - equations.append(Eq(resolution, video_codec)) - equations.append(Eq(video_codec, 2 * audio_codec)) - equations.append(Eq(title, season + episode)) - equations.append(Eq(season, episode)) - equations.append(Eq(release_group, season)) - equations.append(Eq(audio_codec, 1)) - return equations + for equivalent_release_group in equivalent_release_groups: + if release_group in equivalent_release_group: + return equivalent_release_group + + return {release_group} -def get_movie_equations(): - """Get the score equations for a :class:`~subliminal.video.Movie` +def get_scores(video): + """Get the scores dict for the given `video`. - The equations are the following: + This will return either :data:`episode_scores` or :data:`movie_scores` based on the type of the `video`. - 1. hash = resolution + format + video_codec + audio_codec + title + year + release_group - 2. imdb_id = hash - 3. resolution = video_codec - 4. video_codec = 2 * audio_codec - 5. 
format = video_codec + audio_codec - 6. title = resolution + video_codec + audio_codec + year + 1 - 7. release_group = resolution + video_codec + audio_codec + 1 - 8. year = release_group + 1 - 9. audio_codec = 1 - - :return: the score equations for a movie - :rtype: list of :class:`sympy.Eq` + :param video: the video to compute the score against. + :type video: :class:`~subliminal.video.Video` + :return: the scores dict. + :rtype: dict """ - equations = [] - equations.append(Eq(hash, resolution + format + video_codec + audio_codec + title + year + release_group)) - equations.append(Eq(imdb_id, hash)) - equations.append(Eq(resolution, video_codec)) - equations.append(Eq(video_codec, 2 * audio_codec)) - equations.append(Eq(format, video_codec + audio_codec)) - equations.append(Eq(title, resolution + video_codec + audio_codec + year + 1)) - equations.append(Eq(video_codec, 2 * audio_codec)) - equations.append(Eq(release_group, resolution + video_codec + audio_codec + 1)) - equations.append(Eq(year, release_group + 1)) - equations.append(Eq(audio_codec, 1)) - return equations + if isinstance(video, Episode): + return episode_scores + elif isinstance(video, Movie): + return movie_scores + + raise ValueError('video must be an instance of Episode or Movie') -if __name__ == '__main__': - print(solve(get_episode_equations(), [release_group, resolution, format, video_codec, audio_codec, imdb_id, - hash, series, tvdb_id, season, episode, title, year])) - print(solve(get_movie_equations(), [release_group, resolution, format, video_codec, audio_codec, imdb_id, - hash, title, year])) +def compute_score(subtitle, video, hearing_impaired=None): + """Compute the score of the `subtitle` against the `video` with `hearing_impaired` preference. + + :func:`compute_score` uses the :meth:`Subtitle.get_matches ` method and + applies the scores (either from :data:`episode_scores` or :data:`movie_scores`) after some processing. + + :param subtitle: the subtitle to compute the score of. 
+ :type subtitle: :class:`~subliminal.subtitle.Subtitle` + :param video: the video to compute the score against. + :type video: :class:`~subliminal.video.Video` + :param bool hearing_impaired: hearing impaired preference. + :return: score of the subtitle. + :rtype: int + + """ + logger.info('Computing score of %r for video %r with %r', subtitle, video, dict(hearing_impaired=hearing_impaired)) + + # get the scores dict + scores = get_scores(video) + logger.debug('Using scores %r', scores) + + # get the matches + matches = subtitle.get_matches(video) + logger.debug('Found matches %r', matches) + + # on hash match, discard everything else + if 'hash' in matches: + logger.debug('Keeping only hash match') + matches &= {'hash'} + + # handle equivalent matches + if isinstance(video, Episode): + if 'title' in matches: + logger.debug('Adding title match equivalent') + matches.add('episode') + if 'series_imdb_id' in matches: + logger.debug('Adding series_imdb_id match equivalent') + matches |= {'series', 'year'} + if 'imdb_id' in matches: + logger.debug('Adding imdb_id match equivalents') + matches |= {'series', 'year', 'season', 'episode'} + if 'tvdb_id' in matches: + logger.debug('Adding tvdb_id match equivalents') + matches |= {'series', 'year', 'season', 'episode'} + if 'series_tvdb_id' in matches: + logger.debug('Adding series_tvdb_id match equivalents') + matches |= {'series', 'year'} + elif isinstance(video, Movie): + if 'imdb_id' in matches: + logger.debug('Adding imdb_id match equivalents') + matches |= {'title', 'year'} + + # handle hearing impaired + if hearing_impaired is not None and subtitle.hearing_impaired == hearing_impaired: + logger.debug('Matched hearing_impaired') + matches.add('hearing_impaired') + + # compute the score + score = sum((scores.get(match, 0) for match in matches)) + logger.info('Computed score %r with final matches %r', score, matches) + + # ensure score is within valid bounds + assert 0 <= score <= scores['hash'] + 
scores['hearing_impaired'] + + return score + + +def solve_episode_equations(): + from sympy import Eq, solve, symbols + + hash, series, year, season, episode, release_group = symbols('hash series year season episode release_group') + format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec') + hearing_impaired = symbols('hearing_impaired') + + equations = [ + # hash is best + Eq(hash, series + year + season + episode + release_group + format + audio_codec + resolution + video_codec), + + # series counts for the most part in the total score + Eq(series, year + season + episode + release_group + format + audio_codec + resolution + video_codec + 1), + + # year is the second most important part + Eq(year, season + episode + release_group + format + audio_codec + resolution + video_codec + 1), + + # season is important too + Eq(season, release_group + format + audio_codec + resolution + video_codec + 1), + + # episode is equally important to season + Eq(episode, season), + + # release group is the next most wanted match + Eq(release_group, format + audio_codec + resolution + video_codec + 1), + + # format counts as much as audio_codec, resolution and video_codec + Eq(format, audio_codec + resolution + video_codec), + + # audio_codec is more valuable than video_codec + Eq(audio_codec, video_codec + 1), + + # resolution counts as much as video_codec + Eq(resolution, video_codec), + + # video_codec is the least valuable match but counts more than the sum of all scoring increasing matches + Eq(video_codec, hearing_impaired + 1), + + # hearing impaired is only used for score increasing, so put it to 1 + Eq(hearing_impaired, 1), + ] + + return solve(equations, [hash, series, year, season, episode, release_group, format, audio_codec, resolution, + hearing_impaired, video_codec]) + + +def solve_movie_equations(): + from sympy import Eq, solve, symbols + + hash, title, year, release_group = symbols('hash title year release_group') + 
format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec') + hearing_impaired = symbols('hearing_impaired') + + equations = [ + # hash is best + Eq(hash, title + year + release_group + format + audio_codec + resolution + video_codec), + + # title counts for the most part in the total score + Eq(title, year + release_group + format + audio_codec + resolution + video_codec + 1), + + # year is the second most important part + Eq(year, release_group + format + audio_codec + resolution + video_codec + 1), + + # release group is the next most wanted match + Eq(release_group, format + audio_codec + resolution + video_codec + 1), + + # format counts as much as audio_codec, resolution and video_codec + Eq(format, audio_codec + resolution + video_codec), + + # audio_codec is more valuable than video_codec + Eq(audio_codec, video_codec + 1), + + # resolution counts as much as video_codec + Eq(resolution, video_codec), + + # video_codec is the least valuable match but counts more than the sum of all scoring increasing matches + Eq(video_codec, hearing_impaired + 1), + + # hearing impaired is only used for score increasing, so put it to 1 + Eq(hearing_impaired, 1), + ] + + return solve(equations, [hash, title, year, release_group, format, audio_codec, resolution, hearing_impaired, + video_codec]) diff --git a/libs/subliminal/subtitle.py b/libs/subliminal/subtitle.py index 1ff7945d..60cdf3d6 100644 --- a/libs/subliminal/subtitle.py +++ b/libs/subliminal/subtitle.py @@ -1,31 +1,45 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals +import codecs import logging -import os.path -import babelfish +import os + import chardet -import guessit.matchtree -import guessit.transfo import pysrt + +from .score import get_equivalent_release_groups from .video import Episode, Movie +from .utils import sanitize, sanitize_release_group logger = logging.getLogger(__name__) +#: Subtitle extensions +SUBTITLE_EXTENSIONS = ('.srt', '.sub', 
'.smi', '.txt', '.ssa', '.ass', '.mpl') + class Subtitle(object): - """Base class for subtitle + """Base class for subtitle. - :param language: language of the subtitle - :type language: :class:`babelfish.Language` - :param bool hearing_impaired: `True` if the subtitle is hearing impaired, `False` otherwise - :param page_link: link to the web page from which the subtitle can be downloaded, if any - :type page_link: string or None + :param language: language of the subtitle. + :type language: :class:`~babelfish.language.Language` + :param bool hearing_impaired: whether or not the subtitle is hearing impaired. + :param page_link: URL of the web page from which the subtitle can be downloaded. + :type page_link: str + :param encoding: Text encoding of the subtitle. + :type encoding: str """ - def __init__(self, language, hearing_impaired=False, page_link=None): + #: Name of the provider that returns that class of subtitle + provider_name = '' + + def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None): + #: Language of the subtitle self.language = language + + #: Whether or not the subtitle is hearing impaired self.hearing_impaired = hearing_impaired + + #: URL of the web page from which the subtitle can be downloaded self.page_link = page_link #: Content as bytes @@ -34,9 +48,60 @@ class Subtitle(object): #: Encoding to decode with when accessing :attr:`text` self.encoding = None + # validate the encoding + if encoding: + try: + self.encoding = codecs.lookup(encoding).name + except (TypeError, LookupError): + logger.debug('Unsupported encoding %s', encoding) + @property - def guessed_encoding(self): - """Guessed encoding using the language, falling back on chardet""" + def id(self): + """Unique identifier of the subtitle""" + raise NotImplementedError + + @property + def text(self): + """Content as string + + If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding` + + """ + if not self.content: + return + + if 
self.encoding: + return self.content.decode(self.encoding, errors='replace') + + return self.content.decode(self.guess_encoding(), errors='replace') + + def is_valid(self): + """Check if a :attr:`text` is a valid SubRip format. + + :return: whether or not the subtitle is valid. + :rtype: bool + + """ + if not self.text: + return False + + try: + pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE) + except pysrt.Error as e: + if e.args[0] < 80: + return False + + return True + + def guess_encoding(self): + """Guess encoding using the language, falling back on chardet. + + :return: the guessed encoding. + :rtype: str + + """ + logger.info('Guessing encoding for language %s', self.language) + # always try utf-8 first encodings = ['utf-8'] @@ -62,223 +127,128 @@ class Subtitle(object): encodings.append('latin-1') # try to decode + logger.debug('Trying encodings %r', encodings) for encoding in encodings: try: self.content.decode(encoding) - return encoding except UnicodeDecodeError: pass + else: + logger.info('Guessed encoding %s', encoding) + return encoding + + logger.warning('Could not guess encoding from language') # fallback on chardet - logger.warning('Could not decode content with encodings %r', encodings) - return chardet.detect(self.content)['encoding'] + encoding = chardet.detect(self.content)['encoding'] + logger.info('Chardet found encoding %s', encoding) - @property - def text(self): - """Content as string + return encoding - If :attr:`encoding` is None, the encoding is guessed with :attr:`guessed_encoding` + def get_matches(self, video): + """Get the matches against the `video`. 
- """ - if not self.content: - return '' - return self.content.decode(self.encoding or self.guessed_encoding, errors='replace') - - @property - def is_valid(self): - """Check if a subtitle text is a valid SubRip format""" - try: - pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE) - return True - except pysrt.Error as e: - if e.args[0] > 80: - return True - except: - logger.exception('Unexpected error when validating subtitle') - return False - - def compute_matches(self, video): - """Compute the matches of the subtitle against the `video` - - :param video: the video to compute the matches against + :param video: the video to get the matches with. :type video: :class:`~subliminal.video.Video` - :return: matches of the subtitle + :return: matches of the subtitle. :rtype: set """ raise NotImplementedError - def compute_score(self, video): - """Compute the score of the subtitle against the `video` - - There are equivalent matches so that a provider can match one element or its equivalent. This is - to give all provider a chance to have a score in the same range without hurting quality. 
- - * Matching :class:`~subliminal.video.Video`'s `hashes` is equivalent to matching everything else - * Matching :class:`~subliminal.video.Episode`'s `season` and `episode` - is equivalent to matching :class:`~subliminal.video.Episode`'s `title` - * Matching :class:`~subliminal.video.Episode`'s `tvdb_id` is equivalent to matching - :class:`~subliminal.video.Episode`'s `series` - - :param video: the video to compute the score against - :type video: :class:`~subliminal.video.Video` - :return: score of the subtitle - :rtype: int - - """ - score = 0 - # compute matches - initial_matches = self.compute_matches(video) - matches = initial_matches.copy() - # hash is the perfect match - if 'hash' in matches: - score = video.scores['hash'] - else: - # remove equivalences - if isinstance(video, Episode): - if 'imdb_id' in matches: - matches -= {'series', 'tvdb_id', 'season', 'episode', 'title', 'year'} - if 'tvdb_id' in matches: - matches -= {'series', 'year'} - if 'title' in matches: - matches -= {'season', 'episode'} - # add other scores - score += sum((video.scores[match] for match in matches)) - logger.info('Computed score %d with matches %r', score, initial_matches) - return score + def __hash__(self): + return hash(self.provider_name + '-' + self.id) def __repr__(self): - return '<%s [%s]>' % (self.__class__.__name__, self.language) + return '<%s %r [%s]>' % (self.__class__.__name__, self.id, self.language) -def get_subtitle_path(video_path, language=None): - """Create the subtitle path from the given `video_path` and `language` +def get_subtitle_path(video_path, language=None, extension='.srt'): + """Get the subtitle path using the `video_path` and `language`. - :param string video_path: path to the video - :param language: language of the subtitle to put in the path - :type language: :class:`babelfish.Language` or None - :return: path of the subtitle - :rtype: string + :param str video_path: path to the video. 
+ :param language: language of the subtitle to put in the path. + :type language: :class:`~babelfish.language.Language` + :param str extension: extension of the subtitle. + :return: path of the subtitle. + :rtype: str """ - subtitle_path = os.path.splitext(video_path)[0] - if language is not None: - try: - return subtitle_path + '.%s.%s' % (language.alpha2, 'srt') - except babelfish.LanguageConvertError: - return subtitle_path + '.%s.%s' % (language.alpha3, 'srt') - return subtitle_path + '.srt' + subtitle_root = os.path.splitext(video_path)[0] + + if language: + subtitle_root += '.' + str(language) + + return subtitle_root + extension -def compute_guess_matches(video, guess): - """Compute matches between a `video` and a `guess` +def guess_matches(video, guess, partial=False): + """Get matches between a `video` and a `guess`. - :param video: the video to compute the matches on + If a guess is `partial`, the absence information won't be counted as a match. + + :param video: the video. :type video: :class:`~subliminal.video.Video` - :param guess: the guess to compute the matches on - :type guess: :class:`guessit.Guess` - :return: matches of the `guess` + :param guess: the guess. + :type guess: dict + :param bool partial: whether or not the guess is partial. + :return: matches between the `video` and the `guess`. 
:rtype: set """ matches = set() if isinstance(video, Episode): # series - if video.series and 'series' in guess and guess['series'].lower() == video.series.lower(): + if video.series and 'title' in guess and sanitize(guess['title']) == sanitize(video.series): matches.add('series') + # title + if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title): + matches.add('title') # season - if video.season and 'seasonNumber' in guess and guess['seasonNumber'] == video.season: + if video.season and 'season' in guess and guess['season'] == video.season: matches.add('season') # episode - if video.episode and 'episodeNumber' in guess and guess['episodeNumber'] == video.episode: + if video.episode and 'episode' in guess and guess['episode'] == video.episode: matches.add('episode') # year - if video.year == guess.get('year'): # count "no year" as an information + if video.year and 'year' in guess and guess['year'] == video.year: + matches.add('year') + # count "no year" as an information + if not partial and video.original_series and 'year' not in guess: matches.add('year') elif isinstance(video, Movie): # year if video.year and 'year' in guess and guess['year'] == video.year: matches.add('year') - # title - if video.title and 'title' in guess and guess['title'].lower() == video.title.lower(): - matches.add('title') - # release group - if video.release_group and 'releaseGroup' in guess and guess['releaseGroup'].lower() == video.release_group.lower(): + # title + if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title): + matches.add('title') + # release_group + if (video.release_group and 'release_group' in guess and + sanitize_release_group(guess['release_group']) in + get_equivalent_release_groups(sanitize_release_group(video.release_group))): matches.add('release_group') - # screen size - if video.resolution and 'screenSize' in guess and guess['screenSize'] == video.resolution: + # resolution + 
if video.resolution and 'screen_size' in guess and guess['screen_size'] == video.resolution: matches.add('resolution') # format if video.format and 'format' in guess and guess['format'].lower() == video.format.lower(): matches.add('format') - # video codec - if video.video_codec and 'videoCodec' in guess and guess['videoCodec'] == video.video_codec: + # video_codec + if video.video_codec and 'video_codec' in guess and guess['video_codec'] == video.video_codec: matches.add('video_codec') - # audio codec - if video.audio_codec and 'audioCodec' in guess and guess['audioCodec'] == video.audio_codec: + # audio_codec + if video.audio_codec and 'audio_codec' in guess and guess['audio_codec'] == video.audio_codec: matches.add('audio_codec') + return matches -def compute_guess_properties_matches(video, string, propertytype): - """Compute matches between a `video` and properties of a certain property type +def fix_line_ending(content): + """Fix line ending of `content` by changing it to \n. - :param video: the video to compute the matches on - :type video: :class:`~subliminal.video.Video` - :param string string: the string to check for a certain property type - :param string propertytype: the type of property to check (as defined in guessit) - :return: matches of a certain property type (but will only be 1 match because we are checking for 1 property type) - :rtype: set - - Supported property types: result of guessit.transfo.guess_properties.GuessProperties().supported_properties() - [u'audioProfile', - u'videoCodec', - u'container', - u'format', - u'episodeFormat', - u'videoApi', - u'screenSize', - u'videoProfile', - u'audioChannels', - u'other', - u'audioCodec'] - - """ - matches = set() - # We only check for the property types relevant for us - if propertytype == 'screenSize' and video.resolution: - for prop in guess_properties(string, propertytype): - if prop.lower() == video.resolution.lower(): - matches.add('resolution') - elif propertytype == 'format' and 
video.format: - for prop in guess_properties(string, propertytype): - if prop.lower() == video.format.lower(): - matches.add('format') - elif propertytype == 'videoCodec' and video.video_codec: - for prop in guess_properties(string, propertytype): - if prop.lower() == video.video_codec.lower(): - matches.add('video_codec') - elif propertytype == 'audioCodec' and video.audio_codec: - for prop in guess_properties(string, propertytype): - if prop.lower() == video.audio_codec.lower(): - matches.add('audio_codec') - return matches - - -def guess_properties(string, propertytype): - properties = set() - if string: - tree = guessit.matchtree.MatchTree(string) - guessit.transfo.guess_properties.GuessProperties().process(tree) - properties = set(n.guess[propertytype] for n in tree.nodes() if propertytype in n.guess) - return properties - - -def fix_line_endings(content): - """Fix line ending of `content` by changing it to \n - - :param bytes content: content of the subtitle - :return: the content with fixed line endings + :param bytes content: content of the subtitle. + :return: the content with fixed line endings. :rtype: bytes """ diff --git a/libs/subliminal/tests/__init__.py b/libs/subliminal/tests/__init__.py deleted file mode 100644 index 6cef7800..00000000 --- a/libs/subliminal/tests/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -from unittest import TextTestRunner, TestSuite -from subliminal import cache_region -from . 
import test_providers, test_subliminal - - -cache_region.configure('dogpile.cache.memory', expiration_time=60 * 30) # @UndefinedVariable -suite = TestSuite([test_providers.suite(), test_subliminal.suite()]) - - -if __name__ == '__main__': - TextTestRunner().run(suite) diff --git a/libs/subliminal/tests/common.py b/libs/subliminal/tests/common.py deleted file mode 100644 index bd1608d4..00000000 --- a/libs/subliminal/tests/common.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -from subliminal import Movie, Episode - - -MOVIES = [Movie('Man of Steel (2013)/man.of.steel.2013.720p.bluray.x264-felony.mkv', 'Man of Steel', - format='BluRay', release_group='felony', resolution='720p', video_codec='h264', audio_codec='DTS', - imdb_id=770828, size=7033732714, year=2013, - hashes={'opensubtitles': '5b8f8f4e41ccb21e', 'thesubdb': 'ad32876133355929d814457537e12dc2'})] - -EPISODES = [Episode('The Big Bang Theory/Season 07/The.Big.Bang.Theory.S07E05.720p.HDTV.X264-DIMENSION.mkv', - 'The Big Bang Theory', 7, 5, format='HDTV', release_group='DIMENSION', resolution='720p', - video_codec='h264', audio_codec='AC3', imdb_id=3229392, size=501910737, - title='The Workplace Proximity', year=2007, tvdb_id=80379, - hashes={'opensubtitles': '6878b3ef7c1bd19e', 'thesubdb': '9dbbfb7ba81c9a6237237dae8589fccc'}), - Episode('Game of Thrones/Season 03/Game.of.Thrones.S03E10.Mhysa.720p.WEB-DL.DD5.1.H.264-NTb.mkv', - 'Game of Thrones', 3, 10, format='WEB-DL', release_group='NTb', resolution='720p', - video_codec='h264', audio_codec='AC3', imdb_id=2178796, size=2142810931, title='Mhysa', - tvdb_id=121361, - hashes={'opensubtitles': 'b850baa096976c22', 'thesubdb': 'b1f899c77f4c960b84b8dbf840d4e42d'}), - Episode('Dallas.S01E03.mkv', 'Dallas', 1, 3), - Episode('Dallas.2012.S01E03.mkv', 'Dallas', 1, 3, year=2012)] diff --git a/libs/subliminal/tests/test_providers.py b/libs/subliminal/tests/test_providers.py deleted file mode 100644 index 
e98d9ad3..00000000 --- a/libs/subliminal/tests/test_providers.py +++ /dev/null @@ -1,475 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -import os -from unittest import TestCase, TestSuite, TestLoader, TextTestRunner -from babelfish import Language -from subliminal import provider_manager -from subliminal.tests.common import MOVIES, EPISODES - - -class ProviderTestCase(TestCase): - provider_name = '' - - def setUp(self): - self.Provider = provider_manager[self.provider_name] - - -class Addic7edProviderTestCase(ProviderTestCase): - provider_name = 'addic7ed' - - def test_find_show_id(self): - with self.Provider() as provider: - show_id = provider.find_show_id('the big bang') - self.assertEqual(show_id, 126) - - def test_find_show_id_no_year(self): - with self.Provider() as provider: - show_id = provider.find_show_id('dallas') - self.assertEqual(show_id, 802) - - def test_find_show_id_year(self): - with self.Provider() as provider: - show_id = provider.find_show_id('dallas', 2012) - self.assertEqual(show_id, 2559) - - def test_find_show_id_error(self): - with self.Provider() as provider: - show_id = provider.find_show_id('the big how i met your mother') - self.assertIsNone(show_id) - - def test_get_show_ids(self): - with self.Provider() as provider: - show_ids = provider.get_show_ids() - self.assertIn('the big bang theory', show_ids) - self.assertEqual(show_ids['the big bang theory'], 126) - - def test_get_show_ids_no_year(self): - with self.Provider() as provider: - show_ids = provider.get_show_ids() - self.assertIn('dallas', show_ids) - self.assertEqual(show_ids['dallas'], 802) - - def test_get_show_ids_year(self): - with self.Provider() as provider: - show_ids = provider.get_show_ids() - self.assertIn('dallas (2012)', show_ids) - self.assertEqual(show_ids['dallas (2012)'], 2559) - - def test_query_episode_0(self): - video = EPISODES[0] - languages = {Language('tur'), Language('rus'), Language('heb'), 
Language('ita'), Language('fra'), - Language('ron'), Language('nld'), Language('eng'), Language('deu'), Language('ell'), - Language('por', 'BR'), Language('bul'), Language('por'), Language('msa')} - matches = {frozenset(['series', 'resolution', 'season']), - frozenset(['series', 'episode', 'season', 'title']), - frozenset(['series', 'release_group', 'season']), - frozenset(['series', 'episode', 'season', 'release_group', 'title']), - frozenset(['series', 'season']), - frozenset(['series', 'season', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(video.series, video.season, video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_1(self): - video = EPISODES[1] - languages = {Language('ind'), Language('spa'), Language('hrv'), Language('ita'), Language('fra'), - Language('cat'), Language('ell'), Language('nld'), Language('eng'), Language('fas'), - Language('por'), Language('nor'), Language('deu'), Language('ron'), Language('por', 'BR'), - Language('bul')} - matches = {frozenset(['series', 'episode', 'resolution', 'season', 'title', 'year']), - frozenset(['series', 'resolution', 'season', 'year']), - frozenset(['series', 'resolution', 'season', 'year', 'format']), - frozenset(['series', 'episode', 'season', 'title', 'year']), - frozenset(['series', 'episode', 'season', 'title', 'year', 'format']), - frozenset(['series', 'release_group', 'season', 'year']), - frozenset(['series', 'release_group', 'season', 'year', 'format']), - frozenset(['series', 'resolution', 'release_group', 'season', 'year']), - frozenset(['series', 'resolution', 'release_group', 'season', 'year', 'format']), - frozenset(['series', 'episode', 'season', 'release_group', 'title', 'year', 'format']), - frozenset(['series', 'season', 'year']), - frozenset(['series', 'season', 'year', 'format'])} - with self.Provider() 
as provider: - subtitles = provider.query(video.series, video.season, video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_year(self): - video_no_year = EPISODES[2] - video_year = EPISODES[3] - with self.Provider() as provider: - subtitles_no_year = provider.query(video_no_year.series, video_no_year.season, video_no_year.year) - subtitles_year = provider.query(video_year.series, video_year.season, video_year.year) - self.assertNotEqual(subtitles_no_year, subtitles_year) - - def test_list_subtitles(self): - video = EPISODES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['series', 'episode', 'season', 'release_group', 'title']), - frozenset(['series', 'episode', 'season', 'title'])} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_download_subtitle(self): - video = EPISODES[0] - languages = {Language('eng'), Language('fra')} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - provider.download_subtitle(subtitles[0]) - self.assertIsNotNone(subtitles[0].content) - self.assertTrue(subtitles[0].is_valid) - - -class OpenSubtitlesProviderTestCase(ProviderTestCase): - provider_name = 'opensubtitles' - - def test_query_movie_0_query(self): - video = MOVIES[0] - languages = {Language('eng')} - matches = {frozenset([]), - frozenset(['imdb_id', 'resolution', 'title', 'year']), - frozenset(['imdb_id', 'resolution', 'title', 'year', 'format']), - frozenset(['imdb_id', 'title', 'year']), - frozenset(['imdb_id', 'title', 'year', 'format']), - frozenset(['imdb_id', 'video_codec', 'title', 'year', 'format']), - 
frozenset(['imdb_id', 'resolution', 'title', 'video_codec', 'year', 'format']), - frozenset(['imdb_id', 'title', 'year', 'video_codec', 'resolution', 'release_group', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(languages, query=video.title) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_0_query(self): - video = EPISODES[0] - languages = {Language('eng')} - matches = {frozenset(['series', 'episode', 'season', 'imdb_id', 'format']), - frozenset(['series', 'imdb_id', 'video_codec', 'episode', 'season', 'format']), - frozenset(['episode', 'title', 'series', 'imdb_id', 'video_codec', 'season'])} - with self.Provider() as provider: - subtitles = provider.query(languages, query=os.path.split(video.name)[1]) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_year(self): - video_no_year = EPISODES[2] - video_year = EPISODES[3] - languages = {Language('eng')} - with self.Provider() as provider: - subtitles_no_year = provider.query(languages, query=os.path.split(video_no_year.name)[1]) - subtitles_year = provider.query(languages, query=os.path.split(video_year.name)[1]) - self.assertNotEqual(subtitles_no_year, subtitles_year) - - def test_query_episode_1_query(self): - video = EPISODES[1] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['episode', 'title', 'series', 'imdb_id', 'video_codec', 'season', 'year', 'format']), - frozenset(['series', 'imdb_id', 'video_codec', 'episode', 'season', 'year']), - frozenset(['episode', 'video_codec', 'series', 'imdb_id', 'resolution', 'season', 'year']), - frozenset(['series', 'imdb_id', 'resolution', 'episode', 'season', 'year']), - frozenset(['series', 'episode', 'season', 
'imdb_id', 'year']), - frozenset(['series', 'episode', 'season', 'imdb_id', 'year', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(languages, query=os.path.split(video.name)[1]) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_movie_0_imdb_id(self): - video = MOVIES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['imdb_id', 'video_codec', 'title', 'year', 'format']), - frozenset(['imdb_id', 'resolution', 'title', 'video_codec', 'year']), - frozenset(['imdb_id', 'resolution', 'title', 'video_codec', 'year', 'format']), - frozenset(['imdb_id', 'title', 'year', 'video_codec', 'resolution', 'release_group', 'format']), - frozenset(['imdb_id', 'title', 'year']), - frozenset(['imdb_id', 'title', 'year', 'format']), - frozenset(['imdb_id', 'resolution', 'title', 'year']), - frozenset(['imdb_id', 'resolution', 'title', 'year', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(languages, imdb_id=video.imdb_id) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_0_imdb_id(self): - video = EPISODES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['series', 'episode', 'season', 'imdb_id', 'format']), - frozenset(['episode', 'release_group', 'video_codec', 'series', 'imdb_id', 'resolution', 'season', 'format']), - frozenset(['series', 'imdb_id', 'video_codec', 'episode', 'season', 'format']), - frozenset(['episode', 'title', 'series', 'imdb_id', 'video_codec', 'season'])} - with self.Provider() as provider: - subtitles = provider.query(languages, imdb_id=video.imdb_id) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - 
self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_movie_0_hash(self): - video = MOVIES[0] - languages = {Language('eng')} - matches = {frozenset(['hash', 'title', 'video_codec', 'year', 'resolution', 'imdb_id', 'format']), - frozenset(['hash', 'title', 'video_codec', 'year', 'resolution', 'release_group', 'imdb_id', 'format']), - frozenset(['year', 'video_codec', 'imdb_id', 'hash', 'title', 'format']), - frozenset([]), - frozenset(['year', 'resolution', 'imdb_id', 'hash', 'title', 'format']), - frozenset(['year', 'imdb_id', 'hash', 'title']), - frozenset(['year', 'imdb_id', 'hash', 'title', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(languages, hash=video.hashes['opensubtitles'], size=video.size) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_0_hash(self): - video = EPISODES[0] - languages = {Language('eng')} - matches = {frozenset(['series', 'hash', 'format']), - frozenset(['episode', 'season', 'series', 'imdb_id', 'video_codec', 'hash', 'format']), - frozenset(['series', 'episode', 'season', 'hash', 'imdb_id', 'format']), - frozenset(['series', 'resolution', 'hash', 'video_codec', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(languages, hash=video.hashes['opensubtitles'], size=video.size) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_list_subtitles(self): - video = MOVIES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['title', 'video_codec', 'year', 'resolution', 'release_group', 'imdb_id', 'format']), - frozenset(['imdb_id', 'year', 'title']), - frozenset(['imdb_id', 'year', 'title', 'format']), - frozenset(['year', 'video_codec', 
'imdb_id', 'resolution', 'title']), - frozenset(['year', 'video_codec', 'imdb_id', 'resolution', 'title', 'format']), - frozenset(['hash', 'title', 'video_codec', 'year', 'resolution', 'release_group', 'imdb_id', 'format']), - frozenset(['year', 'video_codec', 'imdb_id', 'hash', 'title', 'format']), - frozenset([]), - frozenset(['year', 'resolution', 'imdb_id', 'hash', 'title', 'format']), - frozenset(['hash', 'title', 'video_codec', 'year', 'resolution', 'imdb_id', 'format']), - frozenset(['year', 'imdb_id', 'hash', 'title']), - frozenset(['year', 'imdb_id', 'hash', 'title', 'format']), - frozenset(['video_codec', 'imdb_id', 'year', 'title', 'format']), - frozenset(['year', 'imdb_id', 'resolution', 'title']), - frozenset(['year', 'imdb_id', 'resolution', 'title', 'format'])} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_download_subtitle(self): - video = MOVIES[0] - languages = {Language('eng'), Language('fra')} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - provider.download_subtitle(subtitles[0]) - self.assertIsNotNone(subtitles[0].content) - self.assertTrue(subtitles[0].is_valid) - - -class PodnapisiProviderTestCase(ProviderTestCase): - provider_name = 'podnapisi' - - def test_query_movie_0(self): - video = MOVIES[0] - language = Language('eng') - matches = {frozenset(['video_codec', 'title', 'resolution', 'year']), - frozenset(['title', 'resolution', 'year']), - frozenset(['video_codec', 'title', 'year']), - frozenset(['title', 'year']), - frozenset(['title']), - frozenset(['video_codec', 'title', 'resolution', 'release_group', 'year', 'format']), - frozenset(['video_codec', 'title', 'resolution', 'audio_codec', 'year', 'format'])} - with self.Provider() as provider: - subtitles = 
provider.query(language, title=video.title, year=video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, {language}) - - def test_query_episode_0(self): - video = EPISODES[0] - language = Language('eng') - matches = {frozenset(['episode', 'series', 'season', 'video_codec', 'resolution', 'release_group', 'format']), - frozenset(['season', 'video_codec', 'episode', 'resolution', 'series'])} - with self.Provider() as provider: - subtitles = provider.query(language, series=video.series, season=video.season, episode=video.episode, - year=video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, {language}) - - def test_query_episode_1(self): - video = EPISODES[1] - language = Language('eng') - matches = {frozenset(['episode', 'release_group', 'series', 'video_codec', 'resolution', 'season', 'year', 'format']), - frozenset(['episode', 'series', 'video_codec', 'resolution', 'season', 'year']), - frozenset(['season', 'video_codec', 'episode', 'series', 'year'])} - with self.Provider() as provider: - subtitles = provider.query(language, series=video.series, season=video.season, episode=video.episode, - year=video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, {language}) - - def test_list_subtitles(self): - video = MOVIES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['video_codec', 'title', 'resolution', 'year']), - frozenset(['title', 'resolution', 'year']), - frozenset(['video_codec', 'title', 'year']), - frozenset(['video_codec', 'title', 'year', 'format']), - frozenset(['title', 'year']), - frozenset(['title']), - frozenset(['video_codec', 'title', 'resolution', 'release_group', 
'year', 'format']), - frozenset(['video_codec', 'title', 'resolution', 'audio_codec', 'year', 'format'])} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_download_subtitle(self): - video = MOVIES[0] - languages = {Language('eng'), Language('fra')} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - provider.download_subtitle(subtitles[0]) - self.assertIsNotNone(subtitles[0].content) - self.assertTrue(subtitles[0].is_valid) - - -class TheSubDBProviderTestCase(ProviderTestCase): - provider_name = 'thesubdb' - - def test_query_episode_0(self): - video = EPISODES[0] - languages = {Language('eng'), Language('spa'), Language('por')} - matches = {frozenset(['hash'])} - with self.Provider() as provider: - subtitles = provider.query(video.hashes['thesubdb']) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_1(self): - video = EPISODES[1] - languages = {Language('eng'), Language('por')} - matches = {frozenset(['hash'])} - with self.Provider() as provider: - subtitles = provider.query(video.hashes['thesubdb']) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_list_subtitles(self): - video = MOVIES[0] - languages = {Language('eng'), Language('por')} - matches = {frozenset(['hash'])} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for 
subtitle in subtitles}, languages) - - def test_download_subtitle(self): - video = MOVIES[0] - languages = {Language('eng'), Language('por')} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - provider.download_subtitle(subtitles[0]) - provider.download_subtitle(subtitles[1]) - self.assertIsNotNone(subtitles[0].content) - self.assertTrue(subtitles[0].is_valid) - - -class TVsubtitlesProviderTestCase(ProviderTestCase): - provider_name = 'tvsubtitles' - - def test_find_show_id(self): - with self.Provider() as provider: - show_id = provider.find_show_id('the big bang') - self.assertEqual(show_id, 154) - - def test_find_show_id_ambiguous(self): - with self.Provider() as provider: - show_id = provider.find_show_id('new girl') - self.assertEqual(show_id, 977) - - def test_find_show_id_no_dots(self): - with self.Provider() as provider: - show_id = provider.find_show_id('marvel\'s agents of s h i e l d') - self.assertEqual(show_id, 1340) - - def test_find_show_id_no_year_dallas(self): - with self.Provider() as provider: - show_id = provider.find_show_id('dallas') - self.assertEqual(show_id, 646) - - def test_find_show_id_no_year_house_of_cards(self): - with self.Provider() as provider: - show_id = provider.find_show_id('house of cards') - self.assertEqual(show_id, 352) - - def test_find_show_id_year_dallas(self): - with self.Provider() as provider: - show_id = provider.find_show_id('dallas', 2012) - self.assertEqual(show_id, 1127) - - def test_find_show_id_year_house_of_cards(self): - with self.Provider() as provider: - show_id = provider.find_show_id('house of cards', 2013) - self.assertEqual(show_id, 1246) - - def test_find_show_id_error(self): - with self.Provider() as provider: - show_id = provider.find_show_id('the big gaming') - self.assertIsNone(show_id) - - def test_find_episode_ids(self): - with self.Provider() as provider: - episode_ids = provider.find_episode_ids(154, 5) - self.assertEqual(set(episode_ids.keys()), 
set(range(1, 25))) - - def test_query_episode_0(self): - video = EPISODES[0] - languages = {Language('fra'), Language('por'), Language('hun'), Language('ron'), Language('eng')} - matches = {frozenset(['series', 'episode', 'season', 'video_codec', 'format']), - frozenset(['series', 'episode', 'season', 'format'])} - with self.Provider() as provider: - subtitles = provider.query(video.series, video.season, video.episode, video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_query_episode_1(self): - video = EPISODES[1] - languages = {Language('fra'), Language('ell'), Language('ron'), Language('eng'), Language('hun'), - Language('por'), Language('por', 'BR'), Language('jpn')} - matches = {frozenset(['series', 'episode', 'resolution', 'season', 'year']), - frozenset(['series', 'episode', 'season', 'video_codec', 'year']), - frozenset(['series', 'episode', 'season', 'year'])} - with self.Provider() as provider: - subtitles = provider.query(video.series, video.season, video.episode, video.year) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_list_subtitles(self): - video = EPISODES[0] - languages = {Language('eng'), Language('fra')} - matches = {frozenset(['series', 'episode', 'season', 'format'])} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - self.assertEqual({frozenset(subtitle.compute_matches(video)) for subtitle in subtitles}, matches) - self.assertEqual({subtitle.language for subtitle in subtitles}, languages) - - def test_download_subtitle(self): - video = EPISODES[0] - languages = {Language('hun')} - with self.Provider() as provider: - subtitles = provider.list_subtitles(video, languages) - provider.download_subtitle(subtitles[0]) - 
self.assertIsNotNone(subtitles[0].content) - self.assertTrue(subtitles[0].is_valid) - - -def suite(): - suite = TestSuite() - suite.addTest(TestLoader().loadTestsFromTestCase(Addic7edProviderTestCase)) - suite.addTest(TestLoader().loadTestsFromTestCase(OpenSubtitlesProviderTestCase)) - suite.addTest(TestLoader().loadTestsFromTestCase(PodnapisiProviderTestCase)) - suite.addTest(TestLoader().loadTestsFromTestCase(TheSubDBProviderTestCase)) - suite.addTest(TestLoader().loadTestsFromTestCase(TVsubtitlesProviderTestCase)) - return suite - - -if __name__ == '__main__': - TextTestRunner().run(suite()) diff --git a/libs/subliminal/tests/test_subliminal.py b/libs/subliminal/tests/test_subliminal.py deleted file mode 100644 index a991d81f..00000000 --- a/libs/subliminal/tests/test_subliminal.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from __future__ import unicode_literals -import os -import shutil -from unittest import TestCase, TestSuite, TestLoader, TextTestRunner -from babelfish import Language -from subliminal import list_subtitles, download_subtitles, save_subtitles, download_best_subtitles, scan_video -from subliminal.tests.common import MOVIES, EPISODES - - -TEST_DIR = 'test_data' - - -class ApiTestCase(TestCase): - def setUp(self): - os.mkdir(TEST_DIR) - - def tearDown(self): - shutil.rmtree(TEST_DIR) - - def test_list_subtitles_movie_0(self): - videos = [MOVIES[0]] - languages = {Language('eng')} - subtitles = list_subtitles(videos, languages) - self.assertEqual(len(subtitles), len(videos)) - self.assertGreater(len(subtitles[videos[0]]), 0) - - def test_list_subtitles_movie_0_por_br(self): - videos = [MOVIES[0]] - languages = {Language('por', 'BR')} - subtitles = list_subtitles(videos, languages) - self.assertEqual(len(subtitles), len(videos)) - self.assertGreater(len(subtitles[videos[0]]), 0) - - def test_list_subtitles_episodes(self): - videos = [EPISODES[0], EPISODES[1]] - languages = {Language('eng'), Language('fra')} - 
subtitles = list_subtitles(videos, languages) - self.assertEqual(len(subtitles), len(videos)) - self.assertGreater(len(subtitles[videos[0]]), 0) - - def test_download_subtitles(self): - videos = [EPISODES[0]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng')} - subtitles = list_subtitles(videos, languages) - download_subtitles(subtitles[videos[0]][:5]) - self.assertGreaterEqual(len([s for s in subtitles[videos[0]] if s.content is not None]), 4) - - def test_download_best_subtitles(self): - videos = [EPISODES[0], EPISODES[1]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng'), Language('fra')} - subtitles = download_best_subtitles(videos, languages) - for video in videos: - self.assertIn(video, subtitles) - self.assertEqual(len(subtitles[video]), 2) - - def test_save_subtitles(self): - videos = [EPISODES[0], EPISODES[1]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng'), Language('fra')} - subtitles = list_subtitles(videos, languages) - - # make a list of subtitles to download (one per language per video) - subtitles_to_download = [] - for video, video_subtitles in subtitles.items(): - video_subtitle_languages = set() - for video_subtitle in video_subtitles: - if video_subtitle.language in video_subtitle_languages: - continue - subtitles_to_download.append(video_subtitle) - video_subtitle_languages.add(video_subtitle.language) - if video_subtitle_languages == languages: - break - self.assertEqual(len(subtitles_to_download), 4) - - # download - download_subtitles(subtitles_to_download) - save_subtitles(subtitles) - for video in videos: - self.assertTrue(os.path.exists(os.path.splitext(video.name)[0] + '.en.srt')) - self.assertTrue(os.path.exists(os.path.splitext(video.name)[0] + '.fr.srt')) - - def test_save_subtitles_single(self): - videos = [EPISODES[0], 
EPISODES[1]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng'), Language('fra')} - subtitles = download_best_subtitles(videos, languages) - save_subtitles(subtitles, single=True) - for video in videos: - self.assertIn(video, subtitles) - self.assertEqual(len(subtitles[video]), 2) - self.assertTrue(os.path.exists(os.path.splitext(video.name)[0] + '.srt')) - - def test_download_best_subtitles_min_score(self): - videos = [MOVIES[0]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng'), Language('fra')} - subtitles = download_best_subtitles(videos, languages, min_score=1000) - self.assertEqual(len(subtitles), 0) - - def test_download_best_subtitles_hearing_impaired(self): - videos = [MOVIES[0]] - for video in videos: - video.name = os.path.join(TEST_DIR, os.path.split(video.name)[1]) - languages = {Language('eng')} - subtitles = download_best_subtitles(videos, languages, hearing_impaired=True) - self.assertTrue(subtitles[videos[0]][0].hearing_impaired) - - -class VideoTestCase(TestCase): - def setUp(self): - os.mkdir(TEST_DIR) - for video in MOVIES + EPISODES: - open(os.path.join(TEST_DIR, os.path.split(video.name)[1]), 'w').close() - - def tearDown(self): - shutil.rmtree(TEST_DIR) - - def test_scan_video_movie(self): - video = MOVIES[0] - scanned_video = scan_video(os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.name, os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.title.lower(), video.title.lower()) - self.assertEqual(scanned_video.year, video.year) - self.assertEqual(scanned_video.video_codec, video.video_codec) - self.assertEqual(scanned_video.format, video.format) - self.assertEqual(scanned_video.resolution, video.resolution) - self.assertEqual(scanned_video.release_group, video.release_group) - self.assertEqual(scanned_video.subtitle_languages, 
set()) - self.assertEqual(scanned_video.hashes, {}) - self.assertIsNone(scanned_video.audio_codec) - self.assertIsNone(scanned_video.imdb_id) - self.assertEqual(scanned_video.size, 0) - - def test_scan_video_episode(self): - video = EPISODES[0] - scanned_video = scan_video(os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.name, os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.series, video.series) - self.assertEqual(scanned_video.season, video.season) - self.assertEqual(scanned_video.episode, video.episode) - self.assertEqual(scanned_video.video_codec, video.video_codec) - self.assertEqual(scanned_video.format, video.format) - self.assertEqual(scanned_video.resolution, video.resolution) - self.assertEqual(scanned_video.release_group, video.release_group) - self.assertEqual(scanned_video.subtitle_languages, set()) - self.assertEqual(scanned_video.hashes, {}) - self.assertIsNone(scanned_video.title) - self.assertIsNone(scanned_video.tvdb_id) - self.assertIsNone(scanned_video.imdb_id) - self.assertIsNone(scanned_video.audio_codec) - self.assertEqual(scanned_video.size, 0) - - def test_scan_video_subtitle_language_und(self): - video = EPISODES[0] - open(os.path.join(TEST_DIR, os.path.splitext(os.path.split(video.name)[1])[0]) + '.srt', 'w').close() - scanned_video = scan_video(os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.subtitle_languages, {Language('und')}) - - def test_scan_video_subtitles_language_eng(self): - video = EPISODES[0] - open(os.path.join(TEST_DIR, os.path.splitext(os.path.split(video.name)[1])[0]) + '.en.srt', 'w').close() - scanned_video = scan_video(os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.subtitle_languages, {Language('eng')}) - - def test_scan_video_subtitles_languages(self): - video = EPISODES[0] - open(os.path.join(TEST_DIR, os.path.splitext(os.path.split(video.name)[1])[0]) + '.en.srt', 
'w').close() - open(os.path.join(TEST_DIR, os.path.splitext(os.path.split(video.name)[1])[0]) + '.fr.srt', 'w').close() - open(os.path.join(TEST_DIR, os.path.splitext(os.path.split(video.name)[1])[0]) + '.srt', 'w').close() - scanned_video = scan_video(os.path.join(TEST_DIR, os.path.split(video.name)[1])) - self.assertEqual(scanned_video.subtitle_languages, {Language('eng'), Language('fra'), Language('und')}) - - -def suite(): - suite = TestSuite() - suite.addTest(TestLoader().loadTestsFromTestCase(ApiTestCase)) - suite.addTest(TestLoader().loadTestsFromTestCase(VideoTestCase)) - return suite - - -if __name__ == '__main__': - TextTestRunner().run(suite()) diff --git a/libs/subliminal/utils.py b/libs/subliminal/utils.py new file mode 100644 index 00000000..ac426d45 --- /dev/null +++ b/libs/subliminal/utils.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +from datetime import datetime +import hashlib +import os +import re +import struct + + +def hash_opensubtitles(video_path): + """Compute a hash using OpenSubtitles' algorithm. + + :param str video_path: path of the video. + :return: the hash. + :rtype: str + + """ + bytesize = struct.calcsize(b'' % (self.__class__.__name__, self.name) @@ -79,333 +114,108 @@ class Video(object): class Episode(Video): - """Episode :class:`Video` + """Episode :class:`Video`. - Scores are defined by a set of equations, see :func:`~subliminal.score.get_episode_equations` - - :param string series: series of the episode - :param int season: season number of the episode - :param int episode: episode number of the episode - :param string title: title of the episode - :param int year: year of series - :param int tvdb_id: TheTVDB id of the episode + :param str series: series of the episode. + :param int season: season number of the episode. + :param int episode: episode number of the episode. + :param str title: title of the episode. + :param int year: year of the series. 
+ :param bool original_series: whether the series is the first with this name. + :param int tvdb_id: TVDB id of the episode. + :param \*\*kwargs: additional parameters for the :class:`Video` constructor. """ - scores = {'format': 3, 'video_codec': 2, 'tvdb_id': 48, 'title': 12, 'imdb_id': 60, 'audio_codec': 1, 'year': 24, - 'resolution': 2, 'season': 6, 'release_group': 6, 'series': 24, 'episode': 6, 'hash': 74} + def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None, + series_tvdb_id=None, series_imdb_id=None, **kwargs): + super(Episode, self).__init__(name, **kwargs) - def __init__(self, name, series, season, episode, format=None, release_group=None, resolution=None, video_codec=None, - audio_codec=None, imdb_id=None, hashes=None, size=None, subtitle_languages=None, title=None, - year=None, tvdb_id=None): - super(Episode, self).__init__(name, format, release_group, resolution, video_codec, audio_codec, imdb_id, hashes, - size, subtitle_languages) + #: Series of the episode self.series = series + + #: Season number of the episode self.season = season + + #: Episode number of the episode self.episode = episode + + #: Title of the episode self.title = title + + #: Year of series self.year = year + + #: The series is the first with this name + self.original_series = original_series + + #: TVDB id of the episode self.tvdb_id = tvdb_id + #: TVDB id of the series + self.series_tvdb_id = series_tvdb_id + + #: IMDb id of the series + self.series_imdb_id = series_imdb_id + @classmethod def fromguess(cls, name, guess): if guess['type'] != 'episode': raise ValueError('The guess must be an episode guess') - if 'series' not in guess or 'season' not in guess or 'episodeNumber' not in guess: + + if 'title' not in guess or 'episode' not in guess: raise ValueError('Insufficient data to process the guess') - return cls(name, guess['series'], guess['season'], guess['episodeNumber'], format=guess.get('format'), - 
release_group=guess.get('releaseGroup'), resolution=guess.get('screenSize'), - video_codec=guess.get('videoCodec'), audio_codec=guess.get('audioCodec'), - title=guess.get('title'), year=guess.get('year')) + + return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'), + year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess, + release_group=guess.get('release_group'), resolution=guess.get('screen_size'), + video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec')) @classmethod def fromname(cls, name): - return cls.fromguess(os.path.split(name)[1], guessit.guess_episode_info(name)) + return cls.fromguess(name, guessit(name, {'type': 'episode'})) def __repr__(self): if self.year is None: return '<%s [%r, %dx%d]>' % (self.__class__.__name__, self.series, self.season, self.episode) + return '<%s [%r, %d, %dx%d]>' % (self.__class__.__name__, self.series, self.year, self.season, self.episode) class Movie(Video): - """Movie :class:`Video` + """Movie :class:`Video`. - Scores are defined by a set of equations, see :func:`~subliminal.score.get_movie_equations` - - :param string title: title of the movie - :param int year: year of the movie + :param str title: title of the movie. + :param int year: year of the movie. + :param \*\*kwargs: additional parameters for the :class:`Video` constructor. 
""" - scores = {'format': 3, 'video_codec': 2, 'title': 13, 'imdb_id': 34, 'audio_codec': 1, 'year': 7, 'resolution': 2, - 'release_group': 6, 'hash': 34} + def __init__(self, name, title, year=None, **kwargs): + super(Movie, self).__init__(name, **kwargs) - def __init__(self, name, title, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None, - imdb_id=None, hashes=None, size=None, subtitle_languages=None, year=None): - super(Movie, self).__init__(name, format, release_group, resolution, video_codec, audio_codec, imdb_id, hashes, - size, subtitle_languages) + #: Title of the movie self.title = title + + #: Year of the movie self.year = year @classmethod def fromguess(cls, name, guess): if guess['type'] != 'movie': raise ValueError('The guess must be a movie guess') + if 'title' not in guess: raise ValueError('Insufficient data to process the guess') - return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('releaseGroup'), - resolution=guess.get('screenSize'), video_codec=guess.get('videoCodec'), - audio_codec=guess.get('audioCodec'),year=guess.get('year')) + + return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'), + resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'), + audio_codec=guess.get('audio_codec'), year=guess.get('year')) @classmethod def fromname(cls, name): - return cls.fromguess(os.path.split(name)[1], guessit.guess_movie_info(name)) + return cls.fromguess(name, guessit(name, {'type': 'movie'})) def __repr__(self): if self.year is None: return '<%s [%r]>' % (self.__class__.__name__, self.title) + return '<%s [%r, %d]>' % (self.__class__.__name__, self.title, self.year) - - -def scan_subtitle_languages(path): - """Search for subtitles with alpha2 extension from a video `path` and return their language - - :param string path: path to the video - :return: found subtitle languages - :rtype: set - - """ - language_extensions = 
tuple('.' + c for c in babelfish.language_converters['alpha2'].codes) - dirpath, filename = os.path.split(path) - subtitles = set() - for p in os.listdir(dirpath): - if not isinstance(p, bytes) and p.startswith(os.path.splitext(filename)[0]) and p.endswith(SUBTITLE_EXTENSIONS): - if os.path.splitext(p)[0].endswith(language_extensions): - subtitles.add(babelfish.Language.fromalpha2(os.path.splitext(p)[0][-2:])) - else: - subtitles.add(babelfish.Language('und')) - logger.debug('Found subtitles %r', subtitles) - return subtitles - - -def scan_video(path, subtitles=True, embedded_subtitles=True): - """Scan a video and its subtitle languages from a video `path` - - :param string path: absolute path to the video - :param bool subtitles: scan for subtitles with the same name - :param bool embedded_subtitles: scan for embedded subtitles - :return: the scanned video - :rtype: :class:`Video` - :raise: ValueError if cannot guess enough information from the path - - """ - dirpath, filename = os.path.split(path) - logger.info('Scanning video %r in %r', filename, dirpath) - video = Video.fromguess(path, guessit.guess_file_info(path)) - video.size = os.path.getsize(path) - if video.size > 10485760: - logger.debug('Size is %d', video.size) - video.hashes['opensubtitles'] = hash_opensubtitles(path) - video.hashes['thesubdb'] = hash_thesubdb(path) - logger.debug('Computed hashes %r', video.hashes) - else: - logger.warning('Size is lower than 10MB: hashes not computed') - if subtitles: - video.subtitle_languages |= scan_subtitle_languages(path) - # enzyme - try: - if filename.endswith('.mkv'): - with open(path, 'rb') as f: - mkv = enzyme.MKV(f) - if mkv.video_tracks: - video_track = mkv.video_tracks[0] - # resolution - if video_track.height in (480, 720, 1080): - if video_track.interlaced: - video.resolution = '%di' % video_track.height - logger.debug('Found resolution %s with enzyme', video.resolution) - else: - video.resolution = '%dp' % video_track.height - logger.debug('Found 
resolution %s with enzyme', video.resolution) - # video codec - if video_track.codec_id == 'V_MPEG4/ISO/AVC': - video.video_codec = 'h264' - logger.debug('Found video_codec %s with enzyme', video.video_codec) - elif video_track.codec_id == 'V_MPEG4/ISO/SP': - video.video_codec = 'DivX' - logger.debug('Found video_codec %s with enzyme', video.video_codec) - elif video_track.codec_id == 'V_MPEG4/ISO/ASP': - video.video_codec = 'XviD' - logger.debug('Found video_codec %s with enzyme', video.video_codec) - else: - logger.warning('MKV has no video track') - if mkv.audio_tracks: - audio_track = mkv.audio_tracks[0] - # audio codec - if audio_track.codec_id == 'A_AC3': - video.audio_codec = 'AC3' - logger.debug('Found audio_codec %s with enzyme', video.audio_codec) - elif audio_track.codec_id == 'A_DTS': - video.audio_codec = 'DTS' - logger.debug('Found audio_codec %s with enzyme', video.audio_codec) - elif audio_track.codec_id == 'A_AAC': - video.audio_codec = 'AAC' - logger.debug('Found audio_codec %s with enzyme', video.audio_codec) - else: - logger.warning('MKV has no audio track') - if mkv.subtitle_tracks: - # embedded subtitles - if embedded_subtitles: - embedded_subtitle_languages = set() - for st in mkv.subtitle_tracks: - if st.language: - try: - embedded_subtitle_languages.add(babelfish.Language.fromalpha3b(st.language)) - except babelfish.Error: - logger.error('Embedded subtitle track language %r is not a valid language', st.language) - embedded_subtitle_languages.add(babelfish.Language('und')) - elif st.name: - try: - embedded_subtitle_languages.add(babelfish.Language.fromname(st.name)) - except babelfish.Error: - logger.debug('Embedded subtitle track name %r is not a valid language', st.name) - embedded_subtitle_languages.add(babelfish.Language('und')) - else: - embedded_subtitle_languages.add(babelfish.Language('und')) - logger.debug('Found embedded subtitle %r with enzyme', embedded_subtitle_languages) - video.subtitle_languages |= embedded_subtitle_languages 
- else: - logger.debug('MKV has no subtitle track') - except enzyme.Error: - logger.exception('Parsing video metadata with enzyme failed') - return video - - -def scan_videos(paths, subtitles=True, embedded_subtitles=True, age=None): - """Scan `paths` for videos and their subtitle languages - - :params paths: absolute paths to scan for videos - :type paths: list of string - :param bool subtitles: scan for subtitles with the same name - :param bool embedded_subtitles: scan for embedded subtitles - :param age: age of the video, if any - :type age: datetime.timedelta or None - :return: the scanned videos - :rtype: list of :class:`Video` - - """ - videos = [] - # scan files - for filepath in [p for p in paths if os.path.isfile(p)]: - if age is not None: - try: - video_age = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) - except ValueError: - logger.exception('Error while getting video age, skipping it') - continue - if video_age > age: - logger.info('Skipping video %r: older than %r', filepath, age) - continue - try: - videos.append(scan_video(filepath, subtitles, embedded_subtitles)) - except ValueError as e: - logger.error('Skipping video: %s', e) - continue - # scan directories - for path in [p for p in paths if os.path.isdir(p)]: - logger.info('Scanning directory %r', path) - for dirpath, dirnames, filenames in os.walk(path): - # skip badly encoded directories - if isinstance(dirpath, bytes): - logger.error('Skipping badly encoded directory %r', dirpath.decode('utf-8', errors='replace')) - continue - # skip badly encoded and hidden sub directories - for dirname in list(dirnames): - if isinstance(dirname, bytes): - logger.error('Skipping badly encoded dirname %r in %r', dirname.decode('utf-8', errors='replace'), - dirpath) - dirnames.remove(dirname) - elif dirname.startswith('.'): - logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath) - dirnames.remove(dirname) - # scan for videos - for filename in filenames: - # 
skip badly encoded files - if isinstance(filename, bytes): - logger.error('Skipping badly encoded filename %r in %r', filename.decode('utf-8', errors='replace'), - dirpath) - continue - # filter videos - if not filename.endswith(VIDEO_EXTENSIONS): - continue - # skip hidden files - if filename.startswith('.'): - logger.debug('Skipping hidden filename %r in %r', filename, dirpath) - continue - filepath = os.path.join(dirpath, filename) - # skip links - if os.path.islink(filepath): - logger.debug('Skipping link %r in %r', filename, dirpath) - continue - if age is not None: - try: - video_age = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) - except ValueError: - logger.exception('Error while getting video age, skipping it') - continue - if video_age > age: - logger.info('Skipping video %r: older than %r', filepath, age) - continue - try: - video = scan_video(filepath, subtitles, embedded_subtitles) - except ValueError as e: - logger.error('Skipping video: %s', e) - continue - videos.append(video) - return videos - - -def hash_opensubtitles(video_path): - """Compute a hash using OpenSubtitles' algorithm - - :param string video_path: path of the video - :return: the hash - :rtype: string - - """ - bytesize = struct.calcsize(b'