Move common libs to libs/common

2025-08-20 13:23:18 -07:00 · 2018-12-16 13:30:24 -05:00 · 2018-12-16 13:30:24 -05:00 · 1f4bd41bcc
commit 1f4bd41bcc
parent 8dbb1a2451
1612 changed files with 962 additions and 10 deletions
--- a/libs/common/guessit/rules/properties/website.py
+++ b/libs/common/guessit/rules/properties/website.py
@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Website property.
+"""
+from pkg_resources import resource_stream  # @UnresolvedImport
+from rebulk.remodule import re
+
+from rebulk import Rebulk, Rule, RemoveMatch
+from ..common import seps
+from ..common.formatters import cleanup
+from ..common.pattern import is_disabled
+from ..common.validators import seps_surround
+from ...reutils import build_or_pattern
+
+
+def website(config):
+    """
+    Builder for rebulk object.
+
+    :param config: rule configuration
+    :type config: dict
+    :return: Created Rebulk object
+    :rtype: Rebulk
+    """
+    rebulk = Rebulk(disabled=lambda context: is_disabled(context, 'website'))
+    rebulk = rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
+    rebulk.defaults(name="website")
+
+    with resource_stream('guessit', 'tlds-alpha-by-domain.txt') as tld_file:
+        tlds = [
+            tld.strip().decode('utf-8')
+            for tld in tld_file.readlines()
+            if b'--' not in tld
+        ][1:]  # All registered domain extension
+
+    safe_tlds = config['safe_tlds']  # For sure a website extension
+    safe_subdomains = config['safe_subdomains']  # For sure a website subdomain
+    safe_prefix = config['safe_prefixes']  # Those words before a tlds are sure
+    website_prefixes = config['prefixes']
+
+    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
+                 r'\.)+(?:[a-z-]+\.)+(?:'+build_or_pattern(tlds) +
+                 r'))(?:[^a-z0-9]|$)',
+                 children=True)
+    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
+                 r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_tlds) +
+                 r'))(?:[^a-z0-9]|$)',
+                 safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)
+    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
+                 r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_prefix) +
+                 r'\.)+(?:'+build_or_pattern(tlds) +
+                 r'))(?:[^a-z0-9]|$)',
+                 safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)
+
+    rebulk.string(*website_prefixes,
+                  validator=seps_surround, private=True, tags=['website.prefix'])
+
+    class PreferTitleOverWebsite(Rule):
+        """
+        If found match is more likely a title, remove website.
+        """
+        consequence = RemoveMatch
+
+        @staticmethod
+        def valid_followers(match):
+            """
+            Validator for next website matches
+            """
+            return any(name in ['season', 'episode', 'year'] for name in match.names)
+
+        def when(self, matches, context):
+            to_remove = []
+            for website_match in matches.named('website'):
+                safe = False
+                for safe_start in safe_subdomains + safe_prefix:
+                    if website_match.value.lower().startswith(safe_start):
+                        safe = True
+                        break
+                if not safe:
+                    suffix = matches.next(website_match, PreferTitleOverWebsite.valid_followers, 0)
+                    if suffix:
+                        to_remove.append(website_match)
+            return to_remove
+
+    rebulk.rules(PreferTitleOverWebsite, ValidateWebsitePrefix)
+
+    return rebulk
+
+
+class ValidateWebsitePrefix(Rule):
+    """
+    Validate website prefixes
+    """
+    priority = 64
+    consequence = RemoveMatch
+
+    def when(self, matches, context):
+        to_remove = []
+        for prefix in matches.tagged('website.prefix'):
+            website_match = matches.next(prefix, predicate=lambda match: match.name == 'website', index=0)
+            if (not website_match or
+                    matches.holes(prefix.end, website_match.start,
+                                  formatter=cleanup, seps=seps, predicate=lambda match: match.value)):
+                to_remove.append(prefix)
+        return to_remove