mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-08-20 21:33:18 -07:00)

Update plexapi==4.8.0

This commit is contained in:
parent 36b55398a8
commit 3a50981976

20 changed files with 522 additions and 314 deletions

@@ -19,6 +19,8 @@ at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
    CharsetDetector,

@@ -28,6 +30,7 @@ from .legacy import (
    detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

__all__ = (

@@ -44,4 +47,10 @@ __all__ = (
    "CharsetDoctor",
    "__version__",
    "VERSION",
    "set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
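
For reference, set_logging_handler (newly exported here and defined in the utils hunk further down) is the one-call way to attach a StreamHandler now that the package itself only installs a NullHandler. A minimal sketch using only the defaults visible in this commit; the sample payload and the .best() accessor are illustrative, not part of this diff:

# --- illustrative sketch, not part of the commit ---
import logging

from charset_normalizer import from_bytes, set_logging_handler

# Attach a StreamHandler using the library's own default format string.
set_logging_handler("charset_normalizer", level=logging.DEBUG)

best_guess = from_bytes("Bonjour, le monde!".encode("utf_8")).best()
print(best_guess.encoding if best_guess else None)
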

@@ -1,3 +1,4 @@
import logging
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

@@ -6,8 +7,6 @@ try:
except ImportError:  # pragma: no cover
    PathLike = str  # type: ignore

import logging

from .cd import (
    coherence_ratio,
    encoding_languages,

@@ -27,11 +26,10 @@ from .utils import (
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(

@@ -57,6 +55,9 @@ def from_bytes(
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    """

    if not isinstance(sequences, (bytearray, bytes)):

@@ -66,10 +67,8 @@ def from_bytes(
            )
        )

    if not explain:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)
    if explain:
        logger.addHandler(explain_handler)

    length = len(sequences)  # type: int

@@ -77,6 +76,8 @@ def from_bytes(
        logger.warning(
            "Given content is empty, stopping the process very early, returning empty utf_8 str match"
        )
        if explain:
            logger.removeHandler(explain_handler)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:

@@ -131,7 +132,7 @@ def from_bytes(
    prioritized_encodings = []  # type: List[str]

    specified_encoding = (
        any_specified_encoding(sequences) if preemptive_behaviour is True else None
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )  # type: Optional[str]

    if specified_encoding is not None:

@@ -185,7 +186,7 @@ def from_bytes(
            encoding_iana
        )  # type: bool

        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.info(
                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,

@@ -241,7 +242,7 @@ def from_bytes(
            continue

        r_ = range(
            0 if bom_or_sig_available is False else len(sig_payload),
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

@@ -261,29 +262,40 @@ def from_bytes(
        max_chunk_gave_up = int(len(r_) / 4)  # type: int

        if max_chunk_gave_up < 2:
            max_chunk_gave_up = 2

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count = 0  # type: int

        md_chunks = []  # type: List[str]
        md_ratios = []

        for i in r_:
            if i + chunk_size > length + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
            try:
                chunk = cut_sequence.decode(
                    encoding_iana,
                    errors="ignore" if is_multi_byte_decoder else "strict",
                )  # type: str
            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
                logger.warning(
                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                early_stop_count = max_chunk_gave_up
                break

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

                chunk_partial_size_chk = (
                    16 if chunk_size > 16 else chunk_size
                )  # type: int
                chunk_partial_size_chk = min(chunk_size, 16)  # type: int

                if (
                    decoded_payload

@@ -312,11 +324,9 @@ def from_bytes(
            ):
                break

        if md_ratios:
            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
        else:
            mean_mess_ratio = 0.0

        mean_mess_ratio = (
            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        )  # type: float
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.warning(

@@ -375,6 +385,20 @@ def from_bytes(
                    )
                )

        # We might want to check the sequence again with the whole content
        # Only if initial MD/CD tests passes
        if is_too_large_sequence and not is_multi_byte_decoder:
            try:
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.warning(
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        results.append(
            CharsetMatch(
                sequences,

@@ -393,6 +417,8 @@ def from_bytes(
            logger.info(
                "%s is most likely the one. Stopping the process.", encoding_iana
            )
            if explain:
                logger.removeHandler(explain_handler)
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:

@@ -400,6 +426,8 @@ def from_bytes(
                "%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
                encoding_iana,
            )
            if explain:
                logger.removeHandler(explain_handler)
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:

@@ -428,6 +456,9 @@ def from_bytes(
            logger.warning("ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if explain:
        logger.removeHandler(explain_handler)

    return results
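
The explain toggle in from_bytes now temporarily attaches the dedicated explain_handler and removes it on every return path, instead of permanently flipping the module logger level. A small sketch of the caller-side effect; the payload is illustrative, and iteration plus the .encoding attribute are taken from the models and CLI hunks below:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer import from_bytes

payload = "Comment ça va ?".encode("cp1252")

# With explain=True the library attaches the StreamHandler configured in this
# hunk, logs each tested encoding, then detaches the handler before returning.
for match in from_bytes(payload, explain=True):
    print(match.encoding)
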

@@ -5,7 +5,7 @@ from functools import lru_cache
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (

@@ -110,6 +110,23 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
    return []


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    target_have_accents = False  # type: bool
    target_pure_latin = True  # type: bool

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:

@@ -118,23 +135,11 @@ def alphabet_languages(
    """
    languages = []  # type: List[Tuple[str, float]]

    source_have_accents = False  # type: bool

    for character in characters:
        if is_accentuated(character):
            source_have_accents = True
            break
    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():

        target_have_accents = False  # type: bool
        target_pure_latin = True  # type: bool

        for language_character in language_characters:
            if target_have_accents is False and is_accentuated(language_character):
                target_have_accents = True
            if target_pure_latin is True and is_latin(language_character) is False:
                target_pure_latin = False
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

@@ -263,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    The return type is the same as coherence_ratio.
    """
    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
    merge = []  # type: CoherenceMatches

    for result in results:
        for sub_result in result:
            language, ratio = sub_result

@@ -273,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
                continue
            per_language_ratios[language].append(ratio)

    for language in per_language_ratios:
        merge.append(
            (
                language,
                round(
                    sum(per_language_ratios[language])
                    / len(per_language_ratios[language]),
                    4,
                ),
            )
    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)

@@ -298,14 +300,11 @@ def coherence_ratio(
    """

    results = []  # type: List[Tuple[str, float]]
    lg_inclusion_list = []  # type: List[str]
    ignore_non_latin = False  # type: bool

    sufficient_match_count = 0  # type: int

    if lg_inclusion is not None:
        lg_inclusion_list = lg_inclusion.split(",")

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

@@ -314,7 +313,7 @@ def coherence_ratio(
        sequence_frequencies = Counter(layer)  # type: Counter
        most_common = sequence_frequencies.most_common()

        character_count = sum([o for c, o in most_common])  # type: int
        character_count = sum(o for c, o in most_common)  # type: int

        if character_count <= TOO_SMALL_SEQUENCE:
            continue
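
The accent/pure-Latin scan that alphabet_languages used to repeat inline is now factored into get_target_features and memoized with lru_cache (bounded by the new LANGUAGE_SUPPORTED_COUNT constant), so each language's FREQUENCIES entry is walked at most once per process. A rough sketch of the effect; the "English" key is an assumption about FREQUENCIES:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.cd import get_target_features

# First call walks FREQUENCIES["English"]; repeated calls hit the cache.
have_accents, pure_latin = get_target_features("English")
print(have_accents, pure_latin)
print(get_target_features.cache_info())  # hits grow on repeated calls
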

@@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int:
                o_.insert(-1, best_guess.encoding)
                if my_file.closed is False:
                    my_file.close()
            else:
                if (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue
            elif (
                args.force is False
                and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name
                    ),
                    "no",
                )
                is False
            ):
                if my_file.closed is False:
                    my_file.close()
                continue

            try:
                x_[0].unicode_path = abspath("./{}".format(".".join(o_)))

@@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int:
            print(
                ", ".join(
                    [
                        el.encoding if el.encoding else "undefined"
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]

@@ -4,6 +4,8 @@ from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union

from .assets import FREQUENCIES

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
    [

@@ -30,7 +32,7 @@ TOO_BIG_SEQUENCE = int(10e6) # type: int
UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

UNICODE_RANGES_COMBINED = {
    "Control character": range(0, 31 + 1),
    "Control character": range(31 + 1),
    "Basic Latin": range(32, 127 + 1),
    "Latin-1 Supplement": range(128, 255 + 1),
    "Latin Extended-A": range(256, 383 + 1),

@@ -311,6 +313,7 @@ UNICODE_RANGES_COMBINED = {
    "Variation Selectors Supplement": range(917760, 917999 + 1),
}  # type: Dict[str, range]


UNICODE_SECONDARY_RANGE_KEYWORD = [
    "Supplement",
    "Extended",

@@ -352,11 +355,10 @@ IANA_SUPPORTED_SIMILAR = {
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp1258": ["cp1252", "cp1254", "iso8859_9", "latin_1"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],

@@ -494,3 +496,5 @@ KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES)  # type: int

@@ -40,11 +40,11 @@ class MessDetectorPlugin:
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError  # pragma: nocover
        raise NotImplementedError

    @property
    def ratio(self) -> float:

@@ -85,7 +85,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):

        self._last_printable_char = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

@@ -116,7 +116,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

@@ -147,7 +147,7 @@ class UnprintablePlugin(MessDetectorPlugin):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property

@@ -170,18 +170,19 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):

    def feed(self, character: str) -> None:
        self._character_count += 1
        if self._last_latin_character is not None:
            if is_accentuated(character) and is_accentuated(self._last_latin_character):
                if character.isupper() and self._last_latin_character.isupper():
                    self._successive_count += 1
                # Worse if its the same char duplicated with different accent.
                if remove_accent(character) == remove_accent(
                    self._last_latin_character
                ):
                    self._successive_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

@@ -228,7 +229,7 @@ class SuspiciousRange(MessDetectorPlugin):

        self._last_printable_seen = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None
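
Each of the reset() overrides above (now marked "# pragma: no cover") implements the MessDetectorPlugin contract: eligible()/feed()/reset() plus a ratio property, with mess_ratio discovering plugins through MessDetectorPlugin.__subclasses__() as shown further down. A bare-bones sketch of what a hypothetical extra plugin would look like under that contract; the class and its heuristic are illustrative only:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.md import MessDetectorPlugin


class TooManyQuestionMarksPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count = 0  # type: int
        self._question_mark_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if character == "?":
            self._question_mark_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._question_mark_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return self._question_mark_count / self._character_count

Because discovery is subclass-based, merely importing a module that defines such a class would make mess_ratio instantiate it on the next call.
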

@@ -252,6 +253,8 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count = 0  # type: int
        self._bad_word_count = 0  # type: int
        self._foreign_long_count = 0  # type: int

        self._is_current_word_bad = False  # type: bool
        self._foreign_long_watch = False  # type: bool

@@ -271,7 +274,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and is_latin(character) is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False

@@ -290,9 +293,16 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):

            self._character_count += buffer_length

            if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
                self._is_current_word_bad = True
            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:

@@ -311,7 +321,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
            self._is_current_word_bad = True
        self._buffer += character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False

@@ -319,10 +329,11 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

@@ -342,13 +353,13 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
        return True

    def feed(self, character: str) -> None:
        if character in ["丅", "丄"]:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

@@ -418,7 +429,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0

@@ -453,6 +464,13 @@ def is_suspiciously_successive_range(
    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

@@ -472,11 +490,12 @@ def is_suspiciously_successive_range(
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if range_a_jp_chars or range_b_jp_chars:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if range_a_jp_chars and range_b_jp_chars:
            return False
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:

@@ -509,7 +528,7 @@ def mess_ratio(
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]  # type: List[MessDetectorPlugin]

    length = len(decoded_sequence)  # type: int
    length = len(decoded_sequence) + 1  # type: int

    mean_mess_ratio = 0.0  # type: float

@@ -520,7 +539,7 @@ def mess_ratio(
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence, range(0, length)):
    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

@@ -528,7 +547,7 @@ def mess_ratio(
        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum([dt.ratio for dt in detectors])
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break
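
mess_ratio now iterates over decoded_sequence + "\n" (with length bumped by one), presumably so word-oriented plugins such as SuperWeirdWordPlugin flush their last buffered word even when the text does not end with a separator. A rough comparison sketch, assuming the single-argument call of the released 2.0.x signature; the sample strings are made up:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.md import mess_ratio

clean = "Simple sentence, nothing unusual here."
mojibake = "ÃƒÂ©Ã‚Â garbled Ã‚Â» text"

# Lower is better; text decoded with the wrong code page typically scores higher.
print(round(mess_ratio(clean), 3))
print(round(mess_ratio(mojibake), 3))
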

@@ -284,8 +284,7 @@ class CharsetMatches:
        self._results = sorted(results) if results else []  # type: List[CharsetMatch]

    def __iter__(self) -> Iterator[CharsetMatch]:
        for result in self._results:
            yield result
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """

@@ -4,6 +4,7 @@ except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache

@@ -122,7 +123,7 @@ def is_emoticon(character: str) -> bool:

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category = unicodedata.category(character)  # type: str

@@ -138,7 +139,7 @@ def is_case_variable(character: str) -> bool:
def is_private_use_only(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    return "Co" == character_category
    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

@@ -193,11 +194,7 @@ def is_thai(character: str) -> bool:

@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:

@@ -211,9 +208,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional

    results = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
            "ascii", errors="ignore"
        ),
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )  # type: List[str]

    if len(results) == 0:

@@ -278,7 +273,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name == encoding_alias or cp_name == encoding_iana:
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:

@@ -314,7 +309,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

    character_match_count = 0  # type: int

    for i in range(0, 255):
    for i in range(255):
        to_be_decoded = bytes([i])  # type: bytes
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

@@ -331,3 +326,17 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:

    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
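
any_specified_encoding, whose search-zone slicing is simplified above to min(seq_len, search_zone), is what the preemptive_behaviour path of from_bytes uses to honour an encoding declared inside the payload itself. A small sketch; the XML sample and the exact IANA spelling returned are assumptions:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.utils import any_specified_encoding

payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><root/>'

# Scans at most search_zone bytes for a declared encoding and returns the
# normalized codec name (e.g. "latin_1"), or None when nothing is declared.
print(any_specified_encoding(payload))
print(any_specified_encoding(b"no declaration here"))  # None
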

@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.7"
__version__ = "2.0.8"
VERSION = __version__.split(".")