Mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-08-14 02:26:58 -07:00)
Bump requests from 2.27.1 to 2.28.1 (#1781)
* Bump requests from 2.27.1 to 2.28.1

Bumps [requests](https://github.com/psf/requests) from 2.27.1 to 2.28.1.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.27.1...v2.28.1)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.1
* Update urllib3==1.26.12
* Update certifi==2022.9.24
* Update idna==3.4
* Update charset-normalizer==2.1.1

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
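In short, this commit updates the following pinned dependencies (versions exactly as listed in the message above):

requests==2.28.1
urllib3==1.26.12
certifi==2022.9.24
idna==3.4
charset-normalizer==2.1.1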
This commit is contained in:
parent baa0e08c2a
commit af1aed0b6b
46 changed files with 3295 additions and 2709 deletions
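Note: most of the charset-normalizer 2.0.12 -> 2.1.1 changes in the hunks below follow one mechanical pattern: comment-style type hints (# type: ...) are replaced by PEP 526 inline annotations, and parameters that default to None gain an explicit Optional[...]. A minimal before/after sketch (the function names here are hypothetical; cp_isolation and length are names taken from the diff):

from typing import List, Optional

# Old style (2.0.12): type comments, bare List[str] even though the default is None.
def probe_old(sequences, cp_isolation=None):
    # type: (bytes, List[str]) -> int
    length = len(sequences)  # type: int
    return length

# New style (2.1.1): inline annotations, Optional[...] for None defaults.
def probe_new(sequences: bytes, cp_isolation: Optional[List[str]] = None) -> int:
    length: int = len(sequences)
    return length

assert probe_old(b"abc") == probe_new(b"abc") == 3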
@@ -1,4 +1,4 @@
-# -*- coding: utf_8 -*-
+# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
@ -1,11 +1,8 @@
|
|||
import logging
|
||||
import warnings
|
||||
from os import PathLike
|
||||
from os.path import basename, splitext
|
||||
from typing import BinaryIO, List, Optional, Set
|
||||
|
||||
try:
|
||||
from os import PathLike
|
||||
except ImportError: # pragma: no cover
|
||||
PathLike = str # type: ignore
|
||||
from typing import Any, BinaryIO, List, Optional, Set
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
|
@ -18,6 +15,7 @@ from .md import mess_ratio
|
|||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
|
@ -39,8 +37,8 @@ def from_bytes(
|
|||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -70,11 +68,11 @@ def from_bytes(
|
|||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level = logger.level # type: int
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length = len(sequences) # type: int
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
|
@ -119,8 +117,8 @@ def from_bytes(
|
|||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
|
||||
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
|
@ -137,11 +135,11 @@ def from_bytes(
|
|||
),
|
||||
)
|
||||
|
||||
prioritized_encodings = [] # type: List[str]
|
||||
prioritized_encodings: List[str] = []
|
||||
|
||||
specified_encoding = (
|
||||
specified_encoding: Optional[str] = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
) # type: Optional[str]
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
|
@ -151,15 +149,15 @@ def from_bytes(
|
|||
specified_encoding,
|
||||
)
|
||||
|
||||
tested = set() # type: Set[str]
|
||||
tested_but_hard_failure = [] # type: List[str]
|
||||
tested_but_soft_failure = [] # type: List[str]
|
||||
tested: Set[str] = set()
|
||||
tested_but_hard_failure: List[str] = []
|
||||
tested_but_soft_failure: List[str] = []
|
||||
|
||||
fallback_ascii = None # type: Optional[CharsetMatch]
|
||||
fallback_u8 = None # type: Optional[CharsetMatch]
|
||||
fallback_specified = None # type: Optional[CharsetMatch]
|
||||
fallback_ascii: Optional[CharsetMatch] = None
|
||||
fallback_u8: Optional[CharsetMatch] = None
|
||||
fallback_specified: Optional[CharsetMatch] = None
|
||||
|
||||
results = CharsetMatches() # type: CharsetMatches
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
|
@ -190,11 +188,11 @@ def from_bytes(
|
|||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload = None # type: Optional[str]
|
||||
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
|
||||
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
decoded_payload: Optional[str] = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
) # type: bool
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
|
@ -205,7 +203,7 @@ def from_bytes(
|
|||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
|
@ -240,7 +238,7 @@ def from_bytes(
|
|||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test = False # type: bool
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
|
@ -262,11 +260,11 @@ def from_bytes(
|
|||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus = (
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
) # type: bool
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
|
@ -276,72 +274,47 @@ def from_bytes(
|
|||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up = int(len(r_) / 4) # type: int
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count = 0 # type: int
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks = [] # type: List[str]
|
||||
md_chunks: List[str] = []
|
||||
md_ratios = []
|
||||
|
||||
for i in r_:
|
||||
if i + chunk_size > length + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
try:
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
) # type: str
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
break
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk = min(chunk_size, 16) # type: int
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
break
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
|
@ -362,9 +335,7 @@ def from_bytes(
|
|||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio = (
|
||||
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
) # type: float
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
|
@ -399,7 +370,7 @@ def from_bytes(
|
|||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages = encoding_languages(encoding_iana) # type: List[str]
|
||||
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
|
@ -516,8 +487,8 @@ def from_fp(
|
|||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -538,12 +509,12 @@ def from_fp(
|
|||
|
||||
|
||||
def from_path(
|
||||
path: PathLike,
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -565,17 +536,22 @@ def from_path(
|
|||
|
||||
|
||||
def normalize(
|
||||
path: PathLike,
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
) -> CharsetMatch:
|
||||
"""
|
||||
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
|
||||
"""
|
||||
warnings.warn(
|
||||
"normalize is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
results = from_path(
|
||||
path,
|
||||
steps,
|
||||
|
|
File diff suppressed because it is too large
|
@ -1,8 +1,8 @@
|
|||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter, OrderedDict
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
|
||||
|
@ -24,17 +24,19 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
|
|||
if is_multi_byte_encoding(iana_name):
|
||||
raise IOError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
|
||||
decoder = importlib.import_module(
|
||||
"encodings.{}".format(iana_name)
|
||||
).IncrementalDecoder
|
||||
|
||||
p = decoder(errors="ignore") # type: IncrementalDecoder
|
||||
seen_ranges = {} # type: Dict[str, int]
|
||||
character_count = 0 # type: int
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: Dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk = p.decode(bytes([i])) # type: str
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range = unicode_range(chunk) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
@ -58,7 +60,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
|
|||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages = [] # type: List[str]
|
||||
languages: List[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
|
@ -75,8 +77,8 @@ def encoding_languages(iana_name: str) -> List[str]:
|
|||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
|
||||
primary_range = None # type: Optional[str]
|
||||
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: Optional[str] = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
|
@ -115,8 +117,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
|
|||
"""
|
||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||
"""
|
||||
target_have_accents = False # type: bool
|
||||
target_pure_latin = True # type: bool
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
|
@ -133,7 +135,7 @@ def alphabet_languages(
|
|||
"""
|
||||
Return associated languages associated to given characters.
|
||||
"""
|
||||
languages = [] # type: List[Tuple[str, float]]
|
||||
languages: List[Tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
|
@ -147,13 +149,13 @@ def alphabet_languages(
|
|||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count = len(language_characters) # type: int
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count = len(
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
) # type: int
|
||||
)
|
||||
|
||||
ratio = character_match_count / character_count # type: float
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
@ -174,36 +176,33 @@ def characters_popularity_compare(
|
|||
if language not in FREQUENCIES:
|
||||
raise ValueError("{} not available".format(language))
|
||||
|
||||
character_approved_count = 0 # type: int
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
for character in ordered_characters:
|
||||
if character not in FREQUENCIES[language]:
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
characters_before_source = FREQUENCIES[language][
|
||||
characters_before_source: List[str] = FREQUENCIES[language][
|
||||
0 : FREQUENCIES[language].index(character)
|
||||
] # type: List[str]
|
||||
characters_after_source = FREQUENCIES[language][
|
||||
]
|
||||
characters_after_source: List[str] = FREQUENCIES[language][
|
||||
FREQUENCIES[language].index(character) :
|
||||
] # type: List[str]
|
||||
|
||||
characters_before = ordered_characters[
|
||||
]
|
||||
characters_before: List[str] = ordered_characters[
|
||||
0 : ordered_characters.index(character)
|
||||
] # type: List[str]
|
||||
characters_after = ordered_characters[
|
||||
]
|
||||
characters_after: List[str] = ordered_characters[
|
||||
ordered_characters.index(character) :
|
||||
] # type: List[str]
|
||||
]
|
||||
|
||||
before_match_count = [
|
||||
e in characters_before for e in characters_before_source
|
||||
].count(
|
||||
True
|
||||
) # type: int
|
||||
after_match_count = [
|
||||
e in characters_after for e in characters_after_source
|
||||
].count(
|
||||
True
|
||||
) # type: int
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
|
@ -229,18 +228,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
|||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||
One containing the latin letters and the other hebrew.
|
||||
"""
|
||||
layers = OrderedDict() # type: Dict[str, str]
|
||||
layers: Dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range = None # type: Optional[str]
|
||||
layer_target_range: Optional[str] = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
|
@ -267,7 +266,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
|||
This function merge results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
|
||||
per_language_ratios: Dict[str, List[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
|
@ -299,10 +298,10 @@ def coherence_ratio(
|
|||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results = [] # type: List[Tuple[str, float]]
|
||||
ignore_non_latin = False # type: bool
|
||||
results: List[Tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count = 0 # type: int
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
|
@ -310,22 +309,22 @@ def coherence_ratio(
|
|||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies = Counter(layer) # type: Counter
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count = sum(o for c, o in most_common) # type: int
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered = [c for c, o in most_common] # type: List[str]
|
||||
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio = characters_popularity_compare(
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
) # type: float
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
|
|
@ -3,7 +3,12 @@ import sys
|
|||
from json import dumps
|
||||
from os.path import abspath
|
||||
from platform import python_version
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from unicodedata2 import unidata_version
|
||||
except ImportError:
|
||||
from unicodedata import unidata_version
|
||||
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
|
@ -43,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
|
|||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: List[str] = None) -> int:
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
|
@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.1,
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
|
||||
|
@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {}".format(
|
||||
__version__, python_version()
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {}".format(
|
||||
__version__, python_version(), unidata_version
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
my_file.close()
|
||||
continue
|
||||
|
||||
o_ = my_file.name.split(".") # type: List[str]
|
||||
o_: List[str] = my_file.name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||
from collections import OrderedDict
|
||||
from encodings.aliases import aliases
|
||||
from re import IGNORECASE, compile as re_compile
|
||||
from typing import Dict, List, Set, Union
|
||||
|
@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
|
|||
from .assets import FREQUENCIES
|
||||
|
||||
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
||||
ENCODING_MARKS = OrderedDict(
|
||||
[
|
||||
("utf_8", BOM_UTF8),
|
||||
(
|
||||
"utf_7",
|
||||
[
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
),
|
||||
("gb18030", b"\x84\x31\x95\x33"),
|
||||
("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
|
||||
("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
|
||||
]
|
||||
) # type: Dict[str, Union[bytes, List[bytes]]]
|
||||
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
||||
"utf_8": BOM_UTF8,
|
||||
"utf_7": [
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
"gb18030": b"\x84\x31\x95\x33",
|
||||
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
||||
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
||||
}
|
||||
|
||||
TOO_SMALL_SEQUENCE = 32 # type: int
|
||||
TOO_BIG_SEQUENCE = int(10e6) # type: int
|
||||
TOO_SMALL_SEQUENCE: int = 32
|
||||
TOO_BIG_SEQUENCE: int = int(10e6)
|
||||
|
||||
UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int
|
||||
UTF8_MAXIMAL_ALLOCATION: int = 1112064
|
||||
|
||||
UNICODE_RANGES_COMBINED = {
|
||||
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
||||
"Control character": range(31 + 1),
|
||||
"Basic Latin": range(32, 127 + 1),
|
||||
"Latin-1 Supplement": range(128, 255 + 1),
|
||||
|
@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
|
|||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||
"Tags": range(917504, 917631 + 1),
|
||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||
} # type: Dict[str, range]
|
||||
}
|
||||
|
||||
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD = [
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||
"Supplement",
|
||||
"Extended",
|
||||
"Extensions",
|
||||
|
@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
|
|||
"Shapes",
|
||||
"Supplemental",
|
||||
"Tags",
|
||||
] # type: List[str]
|
||||
]
|
||||
|
||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||
IGNORECASE,
|
||||
)
|
||||
|
||||
IANA_SUPPORTED = sorted(
|
||||
IANA_SUPPORTED: List[str] = sorted(
|
||||
filter(
|
||||
lambda x: x.endswith("_codec") is False
|
||||
and x not in {"rot_13", "tactis", "mbcs"},
|
||||
list(set(aliases.values())),
|
||||
)
|
||||
) # type: List[str]
|
||||
)
|
||||
|
||||
IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
|
||||
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||
|
||||
# pre-computed code page that are similar using the function cp_similarity.
|
||||
IANA_SUPPORTED_SIMILAR = {
|
||||
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||
"cp1125": ["cp866"],
|
||||
|
@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
|
|||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||
"ptcp154": ["cp1251", "kz1048"],
|
||||
"tis_620": ["iso8859_11"],
|
||||
} # type: Dict[str, List[str]]
|
||||
}
|
||||
|
||||
|
||||
CHARDET_CORRESPONDENCE = {
|
||||
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||
"iso2022_kr": "ISO-2022-KR",
|
||||
"iso2022_jp": "ISO-2022-JP",
|
||||
"euc_kr": "EUC-KR",
|
||||
|
@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
|
|||
"cp1256": "windows-1256",
|
||||
"cp1254": "Windows-1254",
|
||||
"cp949": "CP949",
|
||||
} # type: Dict[str, str]
|
||||
}
|
||||
|
||||
|
||||
COMMON_SAFE_ASCII_CHARACTERS = {
|
||||
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||
"<",
|
||||
">",
|
||||
"=",
|
||||
|
@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
|
|||
"|",
|
||||
'"',
|
||||
"-",
|
||||
} # type: Set[str]
|
||||
}
|
||||
|
||||
|
||||
KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
|
||||
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
|
||||
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||
|
||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||
|
||||
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
|
||||
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||
|
||||
# Logging LEVEL bellow DEBUG
|
||||
TRACE = 5 # type: int
|
||||
TRACE: int = 5
|
||||
|
|
|
@ -16,6 +16,7 @@ from .utils import (
|
|||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
@ -57,12 +58,12 @@ class MessDetectorPlugin:
|
|||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count = 0 # type: int
|
||||
self._symbol_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char = None # type: Optional[str]
|
||||
self._frenzy_symbol_in_word = False # type: bool
|
||||
self._last_printable_char: Optional[str] = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
|||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation = (
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count # type: float
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count = 0 # type: int
|
||||
self._accentuated_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
|||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
ratio_of_accentuation = (
|
||||
self._accentuated_count / self._character_count
|
||||
) # type: float
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
):
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
|
@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
|
|||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character = None # type: Optional[str]
|
||||
self._last_latin_character: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
|||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._last_printable_seen = None # type: Optional[str]
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a = unicode_range(
|
||||
self._last_printable_seen
|
||||
) # type: Optional[str]
|
||||
unicode_range_b = unicode_range(character) # type: Optional[str]
|
||||
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: Optional[str] = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage = (
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count # type: float
|
||||
) / self._character_count
|
||||
|
||||
if ratio_of_suspicious_range_usage < 0.1:
|
||||
return 0.0
|
||||
|
@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count = 0 # type: int
|
||||
self._bad_word_count = 0 # type: int
|
||||
self._foreign_long_count = 0 # type: int
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad = False # type: bool
|
||||
self._foreign_long_watch = False # type: bool
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count = 0 # type: int
|
||||
self._bad_character_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer = "" # type: str
|
||||
self._buffer_accent_count = 0 # type: int
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer = "".join([self._buffer, character])
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
|
@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
|
|||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length = len(self._buffer) # type: int
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
|
@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
|
|||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._wrong_stop_count = 0 # type: int
|
||||
self._cjk_character_count = 0 # type: int
|
||||
self._wrong_stop_count: int = 0
|
||||
self._cjk_character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
|
|||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf = False # type: bool
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep = 0 # type: int
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count = 0 # type: int
|
||||
self._successive_upper_lower_count_final = 0 # type: int
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen = None # type: Optional[str]
|
||||
self._current_ascii_only = True # type: bool
|
||||
self._last_alpha_seen: Optional[str] = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
|||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
|
||||
) -> bool:
|
||||
|
@ -524,16 +518,16 @@ def mess_ratio(
|
|||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
||||
"""
|
||||
|
||||
detectors = [
|
||||
detectors: List[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
] # type: List[MessDetectorPlugin]
|
||||
]
|
||||
|
||||
length = len(decoded_sequence) + 1 # type: int
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio = 0.0 # type: float
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc = 32 # type: int
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
|
|
|
@ -4,7 +4,16 @@ from encodings.aliases import aliases
|
|||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
||||
from typing import (
|
||||
Any,
|
||||
Counter as TypeCounter,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
|
||||
from .md import mess_ratio
|
||||
|
@ -21,21 +30,21 @@ class CharsetMatch:
|
|||
languages: "CoherenceMatches",
|
||||
decoded_payload: Optional[str] = None,
|
||||
):
|
||||
self._payload = payload # type: bytes
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding = guessed_encoding # type: str
|
||||
self._mean_mess_ratio = mean_mess_ratio # type: float
|
||||
self._languages = languages # type: CoherenceMatches
|
||||
self._has_sig_or_bom = has_sig_or_bom # type: bool
|
||||
self._unicode_ranges = None # type: Optional[List[str]]
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: Optional[List[str]] = None
|
||||
|
||||
self._leaves = [] # type: List[CharsetMatch]
|
||||
self._mean_coherence_ratio = 0.0 # type: float
|
||||
self._leaves: List[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload = None # type: Optional[bytes]
|
||||
self._output_encoding = None # type: Optional[str]
|
||||
self._output_payload: Optional[bytes] = None
|
||||
self._output_encoding: Optional[str] = None
|
||||
|
||||
self._string = decoded_payload # type: Optional[str]
|
||||
self._string: Optional[str] = decoded_payload
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
|
@ -53,8 +62,8 @@ class CharsetMatch:
|
|||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference = abs(self.chaos - other.chaos) # type: float
|
||||
coherence_difference = abs(self.coherence - other.coherence) # type: float
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Bellow 1% difference --> Use Coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
|
@ -95,7 +104,7 @@ class CharsetMatch:
|
|||
return 0.0
|
||||
|
||||
@property
|
||||
def w_counter(self) -> Counter:
|
||||
def w_counter(self) -> TypeCounter[str]:
|
||||
"""
|
||||
Word counter instance on decoded text.
|
||||
Notice: Will be removed in 3.0
|
||||
|
@ -137,7 +146,7 @@ class CharsetMatch:
|
|||
"""
|
||||
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as = [] # type: List[str]
|
||||
also_known_as: List[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
|
@ -227,9 +236,9 @@ class CharsetMatch:
|
|||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges = [
|
||||
detected_ranges: List[Optional[str]] = [
|
||||
unicode_range(char) for char in str(self)
|
||||
] # type: List[Optional[str]]
|
||||
]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
@ -280,8 +289,8 @@ class CharsetMatches:
|
|||
Act like a list(iterable) but does not implements all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: List[CharsetMatch] = None):
|
||||
self._results = sorted(results) if results else [] # type: List[CharsetMatch]
|
||||
def __init__(self, results: Optional[List[CharsetMatch]] = None):
|
||||
self._results: List[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
@ -360,17 +369,17 @@ class CliDetectionResult:
|
|||
unicode_path: Optional[str],
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path = path # type: str
|
||||
self.unicode_path = unicode_path # type: Optional[str]
|
||||
self.encoding = encoding # type: Optional[str]
|
||||
self.encoding_aliases = encoding_aliases # type: List[str]
|
||||
self.alternative_encodings = alternative_encodings # type: List[str]
|
||||
self.language = language # type: str
|
||||
self.alphabets = alphabets # type: List[str]
|
||||
self.has_sig_or_bom = has_sig_or_bom # type: bool
|
||||
self.chaos = chaos # type: float
|
||||
self.coherence = coherence # type: float
|
||||
self.is_preferred = is_preferred # type: bool
|
||||
self.path: str = path
|
||||
self.unicode_path: Optional[str] = unicode_path
|
||||
self.encoding: Optional[str] = encoding
|
||||
self.encoding_aliases: List[str] = encoding_aliases
|
||||
self.alternative_encodings: List[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: List[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> Dict[str, Any]: # type: ignore
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
try:
|
||||
# WARNING: unicodedata2 support is going to be removed in 3.0
|
||||
# Python is quickly catching up.
|
||||
import unicodedata2 as unicodedata
|
||||
except ImportError:
|
||||
import unicodedata # type: ignore[no-redef]
|
||||
|
@ -9,9 +11,9 @@ from codecs import IncrementalDecoder
|
|||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import List, Optional, Set, Tuple, Union
|
||||
from typing import Generator, List, Optional, Set, Tuple, Union
|
||||
|
||||
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
|
||||
from _multibytecodec import MultibyteIncrementalDecoder
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
|
@ -26,7 +28,7 @@ from .constant import (
|
|||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description = unicodedata.name(character) # type: str
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return (
|
||||
|
@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed = unicodedata.decomposition(character) # type: str
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes = decomposed.split(" ") # type: List[str]
|
||||
codes: List[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
|
|||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord = ord(character) # type: int
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
|
@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
|
|||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description = unicodedata.name(character) # type: str
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_ascii(character: str) -> bool:
|
||||
try:
|
||||
character.encode("ascii")
|
||||
|
@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
|
|||
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category
|
||||
|
||||
|
@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:
|
|||
|
||||
|
||||
def is_private_use_only(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return character_category == "Co"
|
||||
|
||||
|
@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
|
|||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
|
||||
"""
|
||||
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
||||
|
@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
|
|||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len = len(sequence) # type: int
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results = findall(
|
||||
results: List[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
) # type: List[str]
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
@ -217,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
|
|||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
|
@ -242,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
|
|||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
@ -253,7 +270,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
|
|||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
|
||||
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
@ -272,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
|||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
@ -283,10 +303,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
|
|||
|
||||
|
||||
def range_scan(decoded_sequence: str) -> List[str]:
|
||||
ranges = set() # type: Set[str]
|
||||
ranges: Set[str] = set()
|
||||
|
||||
for character in decoded_sequence:
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
@ -301,16 +321,20 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
|||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
|
||||
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
|
||||
decoder_a = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_a)
|
||||
).IncrementalDecoder
|
||||
decoder_b = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_b)
|
||||
).IncrementalDecoder
|
||||
|
||||
id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
|
||||
id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count = 0 # type: int
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded = bytes([i]) # type: bytes
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
|
@ -340,3 +364,61 @@ def set_logging_handler(
|
|||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: Optional[str] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
||||
|
|
|
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "2.0.12"
+__version__ = "2.1.1"
 VERSION = __version__.split(".")