mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-08-20 13:23:18 -07:00
Move common libs to libs/common
This commit is contained in:
parent
8dbb1a2451
commit
1f4bd41bcc
1612 changed files with 962 additions and 10 deletions
106
libs/common/guessit/rules/properties/website.py
Normal file
106
libs/common/guessit/rules/properties/website.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Website property.
|
||||
"""
|
||||
from pkg_resources import resource_stream # @UnresolvedImport
|
||||
from rebulk.remodule import re
|
||||
|
||||
from rebulk import Rebulk, Rule, RemoveMatch
|
||||
from ..common import seps
|
||||
from ..common.formatters import cleanup
|
||||
from ..common.pattern import is_disabled
|
||||
from ..common.validators import seps_surround
|
||||
from ...reutils import build_or_pattern
|
||||
|
||||
|
||||
def website(config):
    """
    Create the Rebulk object holding the ``website`` patterns and rules.

    The following keys are read from ``config``: ``safe_tlds``,
    ``safe_subdomains``, ``safe_prefixes`` and ``prefixes``.

    :param config: rule configuration
    :type config: dict
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk(disabled=lambda context: is_disabled(context, 'website'))
    rebulk = rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
    rebulk.defaults(name="website")

    # Read the bundled TLD list as raw bytes; decoding happens once the
    # resource has been closed.
    with resource_stream('guessit', 'tlds-alpha-by-domain.txt') as tld_file:
        raw_lines = tld_file.readlines()

    # Lines containing "--" are skipped, then the first remaining entry is
    # dropped (presumably the header line of the list -- TODO confirm).
    decoded = [line.strip().decode('utf-8') for line in raw_lines if b'--' not in line]
    tlds = decoded[1:]  # All registered domain extension

    safe_tlds = config['safe_tlds']  # For sure a website extension
    safe_subdomains = config['safe_subdomains']  # For sure a website subdomain
    safe_prefix = config['safe_prefixes']  # Those words before a tlds are sure
    website_prefixes = config['prefixes']

    def bounded(core):
        """
        Wrap ``core`` in a capturing group that must be preceded and followed
        by a character outside ``[a-z0-9]`` or by the start/end of the input.
        """
        return r'(?:[^a-z0-9]|^)(' + core + r')(?:[^a-z0-9]|$)'

    # <safe subdomain>. (one or more), <label>. (one or more), <any known tld>
    rebulk.regex(
        bounded(''.join((
            r'(?:', build_or_pattern(safe_subdomains), r'\.)+',
            r'(?:[a-z-]+\.)+',
            r'(?:', build_or_pattern(tlds), r')',
        ))),
        children=True)

    # <safe subdomain>. (zero or more), <label>., <safe tld>
    rebulk.regex(
        bounded(''.join((
            r'(?:', build_or_pattern(safe_subdomains), r'\.)*',
            r'[a-z-]+\.',
            r'(?:', build_or_pattern(safe_tlds), r')',
        ))),
        safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)

    # <safe subdomain>. (zero or more), <label>., <safe prefix>. (one or more), <any known tld>
    rebulk.regex(
        bounded(''.join((
            r'(?:', build_or_pattern(safe_subdomains), r'\.)*',
            r'[a-z-]+\.',
            r'(?:', build_or_pattern(safe_prefix), r'\.)+',
            r'(?:', build_or_pattern(tlds), r')',
        ))),
        safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)

    # Private matches tagged 'website.prefix'; ValidateWebsitePrefix reads that tag.
    rebulk.string(*website_prefixes,
                  validator=seps_surround, private=True, tags=['website.prefix'])

    class PreferTitleOverWebsite(Rule):
        """
        If found match is more likely a title, remove website.
        """
        consequence = RemoveMatch

        @staticmethod
        def valid_followers(match):
            """
            Tell whether ``match`` carries a season, episode or year name.
            """
            for name in match.names:
                if name in ['season', 'episode', 'year']:
                    return True
            return False

        def when(self, matches, context):
            """
            Collect website matches that do not start with a safe subdomain or
            safe prefix and for which ``matches.next`` finds a season/episode/year
            follower.
            """
            removable = []
            for candidate in matches.named('website'):
                starts_safe = any(
                    candidate.value.lower().startswith(safe_start)
                    for safe_start in safe_subdomains + safe_prefix
                )
                if starts_safe:
                    continue
                follower = matches.next(candidate, PreferTitleOverWebsite.valid_followers, 0)
                if follower:
                    removable.append(candidate)
            return removable

    rebulk.rules(PreferTitleOverWebsite, ValidateWebsitePrefix)

    return rebulk
|
||||
|
||||
|
||||
class ValidateWebsitePrefix(Rule):
    """
    Validate website prefixes.

    A match tagged ``website.prefix`` is removed when no ``website`` match
    follows it, or when ``matches.holes`` reports something between the end
    of the prefix and the start of that website match.
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        Return the list of prefix matches to remove.
        """
        def is_website(match):
            return match.name == 'website'

        def has_value(match):
            return match.value

        rejected = []
        for prefix in matches.tagged('website.prefix'):
            following = matches.next(prefix, predicate=is_website, index=0)
            if following:
                gap = matches.holes(prefix.end, following.start,
                                    formatter=cleanup, seps=seps, predicate=has_value)
                if not gap:
                    # A website follows with nothing reported in between: keep the prefix.
                    continue
            rejected.append(prefix)
        return rejected
|
Loading…
Add table
Add a link
Reference in a new issue