Bump plexapi from 4.15.16 to 4.16.0 (#2439)

* Bump plexapi from 4.15.16 to 4.16.0

Bumps [plexapi](https://github.com/pkkid/python-plexapi) from 4.15.16 to 4.16.0.
- [Release notes](https://github.com/pkkid/python-plexapi/releases)
- [Commits](https://github.com/pkkid/python-plexapi/compare/4.15.16...4.16.0)

---
updated-dependencies:
- dependency-name: plexapi
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update plexapi==4.16.0
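For anyone verifying the bump locally, a minimal sanity check using only the standard library:

    from importlib.metadata import version

    # Should print 4.16.0 once the updated requirement is installed.
    print(version("plexapi"))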

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
dependabot[bot] authored 2024-11-19 10:00:37 -08:00, committed by GitHub
commit 0836fb902c
20 changed files with 287 additions and 49 deletions

charset_normalizer/api.py

@@ -159,6 +159,8 @@ def from_bytes(
     results: CharsetMatches = CharsetMatches()
+    early_stop_results: CharsetMatches = CharsetMatches()
+
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
 
     if sig_encoding is not None:
@@ -221,16 +223,20 @@ def from_bytes(
         try:
             if is_too_large_sequence and is_multi_byte_decoder is False:
                 str(
-                    sequences[: int(50e4)]
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) : int(50e4)],
+                    (
+                        sequences[: int(50e4)]
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
                     encoding=encoding_iana,
                 )
             else:
                 decoded_payload = str(
-                    sequences
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) :],
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
                     encoding=encoding_iana,
                 )
         except (UnicodeDecodeError, LookupError) as e:
@@ -367,7 +373,13 @@ def from_bytes(
             and not lazy_str_hard_failure
         ):
             fallback_entry = CharsetMatch(
-                sequences, encoding_iana, threshold, False, [], decoded_payload
+                sequences,
+                encoding_iana,
+                threshold,
+                False,
+                [],
+                decoded_payload,
+                preemptive_declaration=specified_encoding,
             )
             if encoding_iana == specified_encoding:
                 fallback_specified = fallback_entry
@@ -421,28 +433,58 @@ def from_bytes(
                 ),
             )
 
-        results.append(
-            CharsetMatch(
-                sequences,
-                encoding_iana,
-                mean_mess_ratio,
-                bom_or_sig_available,
-                cd_ratios_merged,
-                decoded_payload,
-            )
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
         )
 
+        results.append(current_match)
+
         if (
             encoding_iana in [specified_encoding, "ascii", "utf_8"]
             and mean_mess_ratio < 0.1
         ):
-            logger.debug(
-                "Encoding detection: %s is most likely the one.", encoding_iana
-            )
-            if explain:
-                logger.removeHandler(explain_handler)
-                logger.setLevel(previous_logger_level)
-            return CharsetMatches([results[encoding_iana]])
+            # If md says nothing to worry about, then... stop immediately!
+            if mean_mess_ratio == 0.0:
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    current_match.encoding,
+                )
+                if explain:
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+                return CharsetMatches([current_match])
+
+            early_stop_results.append(current_match)
+
+            if (
+                len(early_stop_results)
+                and (specified_encoding is None or specified_encoding in tested)
+                and "ascii" in tested
+                and "utf_8" in tested
+            ):
+                probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    probable_result.encoding,
+                )
+                if explain:
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+
+                return CharsetMatches([probable_result])
 
         if encoding_iana == sig_encoding:
             logger.debug(
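Taken together, from_bytes() now returns as soon as a candidate in [specified_encoding, "ascii", "utf_8"] decodes with a zero mess ratio, and otherwise pools low-mess candidates in early_stop_results. A minimal sketch of the fast path, assuming charset-normalizer 3.4.0 is installed (the sample bytes are illustrative):

    from charset_normalizer import from_bytes

    # Plain ASCII decodes with mean_mess_ratio == 0.0, so the new guard
    # returns a single-candidate CharsetMatches without trying the rest
    # of the prioritized encodings.
    best_guess = from_bytes(b"hello world").best()
    print(best_guess.encoding)  # typically "ascii" for this input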

charset_normalizer/cli/__main__.py

@@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
         dest="force",
         help="Replace file without asking if you are sure, use this flag with caution.",
     )
+    parser.add_argument(
+        "-i",
+        "--no-preemptive",
+        action="store_true",
+        default=False,
+        dest="no_preemptive",
+        help="Disable looking at a charset declaration to hint the detector.",
+    )
     parser.add_argument(
         "-t",
         "--threshold",
@@ -133,21 +141,35 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
     args = parser.parse_args(argv)
 
     if args.replace is True and args.normalize is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --replace in addition of --normalize only.", file=sys.stderr)
         return 1
 
     if args.force is True and args.replace is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --force in addition of --replace only.", file=sys.stderr)
         return 1
 
     if args.threshold < 0.0 or args.threshold > 1.0:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
         return 1
 
     x_ = []
 
     for my_file in args.files:
-        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
+        matches = from_fp(
+            my_file,
+            threshold=args.threshold,
+            explain=args.verbose,
+            preemptive_behaviour=args.no_preemptive is False,
+        )
 
         best_guess = matches.best()
@@ -155,9 +177,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
             print(
                 'Unable to identify originating encoding for "{}". {}'.format(
                     my_file.name,
-                    "Maybe try increasing maximum amount of chaos."
-                    if args.threshold < 1.0
-                    else "",
+                    (
+                        "Maybe try increasing maximum amount of chaos."
+                        if args.threshold < 1.0
+                        else ""
+                    ),
                 ),
                 file=sys.stderr,
             )
@@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                 try:
                     x_[0].unicode_path = join(dir_path, ".".join(o_))
 
-                    with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
-                        fp.write(str(best_guess))
+                    with open(x_[0].unicode_path, "wb") as fp:
+                        fp.write(best_guess.output())
                 except IOError as e:
                     print(str(e), file=sys.stderr)
                     if my_file.closed is False:
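The new flag simply inverts into preemptive_behaviour for from_fp(). A hedged sketch of driving the CLI programmatically, assuming the cli package re-exports cli_detect as in recent releases (the file name is a placeholder):

    from charset_normalizer.cli import cli_detect

    # Equivalent to running `normalizer --no-preemptive sample.html`:
    # any charset declaration inside the file is ignored as a hint.
    exit_code = cli_detect(["--no-preemptive", "sample.html"])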

charset_normalizer/constant.py

@@ -544,6 +544,8 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
     "|",
     '"',
     "-",
+    "(",
+    ")",
 }

charset_normalizer/legacy.py

@@ -1,13 +1,24 @@
-from typing import Any, Dict, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Optional
 from warnings import warn
 
 from .api import from_bytes
 from .constant import CHARDET_CORRESPONDENCE
 
+# TODO: remove this check when dropping Python 3.7 support
+if TYPE_CHECKING:
+    from typing_extensions import TypedDict
+
+    class ResultDict(TypedDict):
+        encoding: Optional[str]
+        language: str
+        confidence: Optional[float]
+
 
 def detect(
     byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
-) -> Dict[str, Optional[Union[str, float]]]:
+) -> ResultDict:
     """
     chardet legacy method
     Detect the encoding of the given byte string. It should be mostly backward-compatible.
charset_normalizer/md.py

@@ -236,7 +236,7 @@ class SuspiciousRange(MessDetectorPlugin):
     @property
     def ratio(self) -> float:
-        if self._character_count <= 24:
+        if self._character_count <= 13:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
@@ -260,6 +260,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
         self._buffer: str = ""
         self._buffer_accent_count: int = 0
+        self._buffer_glyph_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
@@ -279,6 +280,14 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                 and is_thai(character) is False
             ):
                 self._foreign_long_watch = True
+            if (
+                is_cjk(character)
+                or is_hangul(character)
+                or is_katakana(character)
+                or is_hiragana(character)
+                or is_thai(character)
+            ):
+                self._buffer_glyph_count += 1
             return
 
         if not self._buffer:
             return
@@ -291,17 +300,20 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
             self._character_count += buffer_length
 
             if buffer_length >= 4:
-                if self._buffer_accent_count / buffer_length > 0.34:
+                if self._buffer_accent_count / buffer_length >= 0.5:
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if (
+                elif (
                     is_accentuated(self._buffer[-1])
                     and self._buffer[-1].isupper()
                     and all(_.isupper() for _ in self._buffer) is False
                 ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
+                elif self._buffer_glyph_count == 1:
+                    self._is_current_word_bad = True
+                    self._foreign_long_count += 1
+
             if buffer_length >= 24 and self._foreign_long_watch:
                 camel_case_dst = [
                     i
@@ -325,6 +337,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
             self._foreign_long_watch = False
             self._buffer = ""
             self._buffer_accent_count = 0
+            self._buffer_glyph_count = 0
         elif (
             character not in {"<", ">", "-", "=", "~", "|", "_"}
             and character.isdigit() is False
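These plugins are combined by mess_ratio(), the score from_bytes() uses to rank candidate decodings, so the tightened accent threshold (>= 0.5) and the new single-glyph penalty shift those scores. A quick probe, with illustrative sample strings:

    from charset_normalizer.md import mess_ratio

    # Scores run from 0.0 (clean) upward; SuspiciousRange and
    # SuperWeirdWordPlugin both contribute to the total.
    print(mess_ratio("This is a perfectly ordinary sentence."))
    print(mess_ratio("ＨéĺĺÒ wÖｒĹd, thïs lòóks rather suspicious"))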

charset_normalizer/models.py

@@ -1,9 +1,10 @@
 from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
+from re import sub
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
-from .constant import TOO_BIG_SEQUENCE
+from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
@@ -16,6 +17,7 @@ class CharsetMatch:
         has_sig_or_bom: bool,
         languages: "CoherenceMatches",
         decoded_payload: Optional[str] = None,
+        preemptive_declaration: Optional[str] = None,
     ):
         self._payload: bytes = payload
 
@@ -33,13 +35,13 @@ class CharsetMatch:
         self._string: Optional[str] = decoded_payload
 
+        self._preemptive_declaration: Optional[str] = preemptive_declaration
+
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, CharsetMatch):
-            raise TypeError(
-                "__eq__ cannot be invoked on {} and {}.".format(
-                    str(other.__class__), str(self.__class__)
-                )
-            )
+            if isinstance(other, str):
+                return iana_name(other) == self.encoding
+            return False
         return self.encoding == other.encoding and self.fingerprint == other.fingerprint
 
     def __lt__(self, other: object) -> bool:
@@ -210,7 +212,24 @@ class CharsetMatch:
         """
         if self._output_encoding is None or self._output_encoding != encoding:
             self._output_encoding = encoding
-            self._output_payload = str(self).encode(encoding, "replace")
+            decoded_string = str(self)
+            if (
+                self._preemptive_declaration is not None
+                and self._preemptive_declaration.lower()
+                not in ["utf-8", "utf8", "utf_8"]
+            ):
+                patched_header = sub(
+                    RE_POSSIBLE_ENCODING_INDICATION,
+                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
+                        m.groups()[0], iana_name(self._output_encoding)  # type: ignore[arg-type]
+                    ),
+                    decoded_string[:8192],
+                    1,
+                )
+
+                decoded_string = patched_header + decoded_string[8192:]
+            self._output_payload = decoded_string.encode(encoding, "replace")
 
         return self._output_payload  # type: ignore
@@ -266,7 +285,7 @@ class CharsetMatches:
                 )
             )
 
         # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
-        if len(item.raw) <= TOO_BIG_SEQUENCE:
+        if len(item.raw) < TOO_BIG_SEQUENCE:
             for match in self._results:
                 if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                     match.add_submatch(item)
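With output() now patching any non-UTF-8 charset declaration found in the first 8192 decoded characters, a re-encoded document keeps its header consistent with its bytes. A hedged sketch (the XML payload is illustrative; exact detection results can vary):

    from charset_normalizer import from_bytes

    # A Latin-1 document that declares its own encoding in the header.
    payload = '<?xml version="1.0" encoding="ISO-8859-1"?><p>héllo</p>'.encode("latin_1")

    best_guess = from_bytes(payload).best()
    if best_guess is not None:
        # output() re-encodes to UTF-8 by default; the patched logic above
        # also rewrites the declared charset so the header stays truthful.
        print(best_guess.output().decode("utf-8"))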

charset_normalizer/version.py

@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.3.2"
+__version__ = "3.4.0"
 VERSION = __version__.split(".")
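And a one-liner to confirm which charset-normalizer build is active after the bump:

    import charset_normalizer

    # Expect "3.4.0" once this change is deployed.
    print(charset_normalizer.__version__)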