Bump requests-oauthlib from 1.3.1 to 2.0.0 (#2293)

* Bump requests-oauthlib from 1.3.1 to 2.0.0

Bumps [requests-oauthlib](https://github.com/requests/requests-oauthlib) from 1.3.1 to 2.0.0.
- [Release notes](https://github.com/requests/requests-oauthlib/releases)
- [Changelog](https://github.com/requests/requests-oauthlib/blob/master/HISTORY.rst)
- [Commits](https://github.com/requests/requests-oauthlib/compare/v1.3.1...v2.0.0)

---
updated-dependencies:
- dependency-name: requests-oauthlib
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests-oauthlib==2.0.0
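
For reference, a minimal sketch (not part of this commit) to confirm the bundled version after the bump; requests_oauthlib exposes __version__ at the package top level:

    import requests_oauthlib

    # Expected to print "2.0.0" once this update is applied.
    print(requests_oauthlib.__version__)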

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
commit 0d1d2a3e6b
dependabot[bot], 2024-03-30 15:28:02 -07:00, committed by GitHub
60 changed files with 2414 additions and 2291 deletions

File: charset_normalizer/__main__.py (new file)

@@ -0,0 +1,4 @@
+from .cli import cli_detect
+
+if __name__ == "__main__":
+    cli_detect()
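
Since the package now ships a __main__.py, the detector can be run as a module: `python -m charset_normalizer` dispatches straight to cli_detect().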

[File diff suppressed because it is too large]

File: charset_normalizer/cd.py

@@ -4,8 +4,13 @@ from collections import Counter
 from functools import lru_cache
 from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
 
-from .assets import FREQUENCIES
-from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
+from .constant import (
+    FREQUENCIES,
+    KO_NAMES,
+    LANGUAGE_SUPPORTED_COUNT,
+    TOO_SMALL_SEQUENCE,
+    ZH_NAMES,
+)
 from .md import is_suspiciously_successive_range
 from .models import CoherenceMatches
 from .utils import (

File: charset_normalizer/cli/__init__.py (new file)

@@ -0,0 +1,6 @@
+from .__main__ import cli_detect, query_yes_no
+
+__all__ = (
+    "cli_detect",
+    "query_yes_no",
+)

[File diff suppressed because it is too large]

File: charset_normalizer/md.py

@@ -9,7 +9,8 @@ from .constant import (
 )
 from .utils import (
     is_accentuated,
-    is_ascii,
+    is_arabic,
+    is_arabic_isolated_form,
     is_case_variable,
     is_cjk,
     is_emoticon,
@@ -128,8 +129,9 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
     @property
     def ratio(self) -> float:
-        if self._character_count == 0 or self._character_count < 8:
+        if self._character_count < 8:
             return 0.0
 
         ratio_of_accentuation: float = self._accentuated_count / self._character_count
         return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
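
As a worked example with hypothetical counts: 4 accentuated characters in a 10-character stream gives 4/10 = 0.4, which clears the 0.35 threshold and is returned; any stream shorter than 8 characters now short-circuits to 0.0. The old `== 0` test was redundant, since any count below 8 already covers zero.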
@@ -234,16 +236,13 @@ class SuspiciousRange(MessDetectorPlugin):
     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count <= 24:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
         ) / self._character_count
 
-        if ratio_of_suspicious_range_usage < 0.1:
-            return 0.0
-
         return ratio_of_suspicious_range_usage
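
With the new guard, a hypothetical 25-character stream containing 2 suspicious range transitions scores (2 * 2) / 25 = 0.16 and is returned as-is; the old 0.1 floor is gone, and anything of 24 characters or fewer scores 0.0 outright.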
@@ -296,7 +295,11 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
+                if (
+                    is_accentuated(self._buffer[-1])
+                    and self._buffer[-1].isupper()
+                    and all(_.isupper() for _ in self._buffer) is False
+                ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
                 if buffer_length >= 24 and self._foreign_long_watch:
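
A standalone sketch of the new exemption, using the library's own is_accentuated helper (the function name ends_suspiciously is hypothetical, for illustration only):

    from charset_normalizer.utils import is_accentuated

    def ends_suspiciously(buffer: str) -> bool:
        # New rule: a trailing upper-case accentuated letter is suspicious,
        # unless the whole buffer is upper-case (e.g. an all-caps word).
        return (
            is_accentuated(buffer[-1])
            and buffer[-1].isupper()
            and all(_.isupper() for _ in buffer) is False
        )

    print(ends_suspiciously("PÂTÉ"))  # False: all-caps words are now exempt
    print(ends_suspiciously("patÉ"))  # True: still counted as foreign_long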
@@ -419,7 +422,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
             return
 
-        if self._current_ascii_only is True and is_ascii(character) is False:
+        if self._current_ascii_only is True and character.isascii() is False:
             self._current_ascii_only = False
 
         if self._last_alpha_seen is not None:
@@ -455,6 +458,34 @@
         return self._successive_upper_lower_count_final / self._character_count
 
 
+class ArabicIsolatedFormPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._character_count: int = 0
+        self._isolated_form_count: int = 0
+
+    def reset(self) -> None:  # pragma: no cover
+        self._character_count = 0
+        self._isolated_form_count = 0
+
+    def eligible(self, character: str) -> bool:
+        return is_arabic(character)
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if is_arabic_isolated_form(character):
+            self._isolated_form_count += 1
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count < 8:
+            return 0.0
+
+        isolated_form_usage: float = self._isolated_form_count / self._character_count
+
+        return isolated_form_usage
+
+
 @lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
@@ -522,6 +553,8 @@ def is_suspiciously_successive_range(
             return False
         if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
             return False
+        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+            return False
 
     return True
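
A minimal usage sketch of the new plugin (assumes ArabicIsolatedFormPlugin is importable from charset_normalizer.md, where it is defined above):

    from charset_normalizer.md import ArabicIsolatedFormPlugin

    plugin = ArabicIsolatedFormPlugin()
    for character in "\ufe8d" * 8:  # U+FE8D ARABIC LETTER ALEF ISOLATED FORM
        plugin.feed(character)

    print(plugin.ratio)  # 1.0: all 8 characters fed are isolated forms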

File: charset_normalizer/models.py

@@ -54,16 +54,19 @@ class CharsetMatch:
         # Below 1% difference --> Use Coherence
         if chaos_difference < 0.01 and coherence_difference > 0.02:
-            # When having a tough decision, use the result that decoded as many multi-byte as possible.
-            if chaos_difference == 0.0 and self.coherence == other.coherence:
-                return self.multi_byte_usage > other.multi_byte_usage
             return self.coherence > other.coherence
+        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
+            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
+            # preserve RAM usage!
+            if len(self._payload) >= TOO_BIG_SEQUENCE:
+                return self.chaos < other.chaos
+            return self.multi_byte_usage > other.multi_byte_usage
 
         return self.chaos < other.chaos
 
     @property
     def multi_byte_usage(self) -> float:
-        return 1.0 - len(str(self)) / len(self.raw)
+        return 1.0 - (len(str(self)) / len(self.raw))
 
     def __str__(self) -> str:
         # Lazy Str Loading
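
A worked example of multi_byte_usage with a hypothetical payload: a 5-byte UTF-8 sequence that decodes to 3 characters scores 1.0 - (3 / 5) = 0.4.

    payload = "été".encode("utf-8")    # 5 bytes: each "é" takes 2 bytes in UTF-8
    decoded = payload.decode("utf-8")  # 3 characters
    print(1.0 - (len(decoded) / len(payload)))  # 0.4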

File: charset_normalizer/utils.py

@@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool:
         or "WITH DIAERESIS" in description
         or "WITH CIRCUMFLEX" in description
         or "WITH TILDE" in description
+        or "WITH MACRON" in description
+        or "WITH RING ABOVE" in description
     )
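
The two new clauses match characters by their Unicode names, e.g.:

    import unicodedata

    print(unicodedata.name("ā"))  # LATIN SMALL LETTER A WITH MACRON
    print(unicodedata.name("å"))  # LATIN SMALL LETTER A WITH RING ABOVE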
@@ -69,15 +71,6 @@ def is_latin(character: str) -> bool:
     return "LATIN" in description
 
 
-@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def is_ascii(character: str) -> bool:
-    try:
-        character.encode("ascii")
-    except UnicodeEncodeError:
-        return False
-
-    return True
-
-
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_punctuation(character: str) -> bool:
     character_category: str = unicodedata.category(character)
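
The deleted helper is equivalent, for single characters, to the built-in str.isascii() (Python 3.7+), which md.py now calls directly:

    # str.isascii() covers what is_ascii() did, without the encode round trip.
    print("a".isascii())  # True
    print("é".isascii())  # False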
@@ -105,7 +98,7 @@ def is_symbol(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Forms" in character_range
+    return "Forms" in character_range and character_category != "Lo"
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -115,7 +108,7 @@ def is_emoticon(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Emoticons" in character_range
+    return "Emoticons" in character_range or "Pictographs" in character_range
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -133,12 +126,6 @@ def is_case_variable(character: str) -> bool:
     return character.islower() != character.isupper()
 
 
-def is_private_use_only(character: str) -> bool:
-    character_category: str = unicodedata.category(character)
-
-    return character_category == "Co"
-
-
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_cjk(character: str) -> bool:
     try:
@@ -189,6 +176,26 @@ def is_thai(character: str) -> bool:
     return "THAI" in character_name
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "ARABIC" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic_isolated_form(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+
+
 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
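
A quick check of the two new helpers (values verifiable with unicodedata):

    from charset_normalizer.utils import is_arabic, is_arabic_isolated_form

    print(is_arabic("\u0627"))                # True: ARABIC LETTER ALEF
    print(is_arabic_isolated_form("\u0627"))  # False: not an ISOLATED FORM
    print(is_arabic_isolated_form("\ufe8d"))  # True: ARABIC LETTER ALEF ISOLATED FORM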
@@ -205,7 +212,7 @@ def is_unprintable(character: str) -> bool:
     )
 
 
-def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
     """
     Extract using ASCII-only decoder any specified encoding in the first n-bytes.
     """

File: charset_normalizer/version.py

@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.2.0"
+__version__ = "3.3.2"
 VERSION = __version__.split(".")