Mirror of https://github.com/Tautulli/Tautulli.git, synced 2025-08-23 06:25:27 -07:00

Update charset-normalizer==2.1.1

parent aaa336de28
commit 637ccee60f

10 changed files with 1493 additions and 1556 deletions
@@ -1,4 +1,4 @@
-# -*- coding: utf_8 -*-
+# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
@@ -1,11 +1,8 @@
 import logging
-from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
-
-try:
-    from os import PathLike
-except ImportError:  # pragma: no cover
-    PathLike = str  # type: ignore
+import warnings
+from os import PathLike
+from os.path import basename, splitext
+from typing import Any, BinaryIO, List, Optional, Set

 from .cd import (
     coherence_ratio,
@@ -18,6 +15,7 @@ from .md import mess_ratio
 from .models import CharsetMatch, CharsetMatches
 from .utils import (
     any_specified_encoding,
+    cut_sequence_chunks,
     iana_name,
     identify_sig_or_bom,
     is_cp_similar,
@@ -39,8 +37,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
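For orientation, a minimal sketch of how these keyword arguments are passed to the public entry point; this assumes charset-normalizer 2.x is installed, and the sample payload is purely illustrative:

from charset_normalizer import from_bytes

# Hypothetical sample: Cyrillic text encoded with a legacy single-byte code page.
payload = "Всеки човек има право на образование.".encode("cp1251")

matches = from_bytes(
    payload,
    steps=5,            # how many slices of the payload are inspected
    chunk_size=512,     # size of each inspected slice, in bytes
    threshold=0.2,      # maximum mess ratio tolerated for a candidate encoding
    cp_isolation=None,  # optionally restrict detection to a list of code pages
    cp_exclusion=None,  # optionally ban specific code pages
    preemptive_behaviour=True,
    explain=False,
)

best_guess = matches.best()
if best_guess is not None:
    print(best_guess.encoding, str(best_guess))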
@@ -70,11 +68,11 @@ def from_bytes(
     )

     if explain:
-        previous_logger_level = logger.level  # type: int
+        previous_logger_level: int = logger.level
         logger.addHandler(explain_handler)
         logger.setLevel(TRACE)

-    length = len(sequences)  # type: int
+    length: int = len(sequences)

     if length == 0:
         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
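Nearly every change in this hunk, and in the rest of the commit, is the same mechanical migration: trailing `# type:` comments are replaced with PEP 526 variable annotations. Both spell the same thing to a type checker; only the annotated form is parsed by the interpreter. A small self-contained illustration (the value assigned here is made up):

import logging

# Before: PEP 484 type comment, ignored at runtime.
previous_logger_level = logging.INFO  # type: int

# After: PEP 526 annotation, checked syntactically and recorded in __annotations__.
previous_logger_level: int = logging.INFO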
@@ -119,8 +117,8 @@ def from_bytes(
     if steps > 1 and length / steps < chunk_size:
         chunk_size = int(length / steps)

-    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
-    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

     if is_too_small_sequence:
         logger.log(
@@ -137,11 +135,11 @@ def from_bytes(
            ),
        )

-    prioritized_encodings = []  # type: List[str]
+    prioritized_encodings: List[str] = []

-    specified_encoding = (
+    specified_encoding: Optional[str] = (
         any_specified_encoding(sequences) if preemptive_behaviour else None
-    )  # type: Optional[str]
+    )

     if specified_encoding is not None:
         prioritized_encodings.append(specified_encoding)
@@ -151,15 +149,15 @@ def from_bytes(
            specified_encoding,
        )

-    tested = set()  # type: Set[str]
-    tested_but_hard_failure = []  # type: List[str]
-    tested_but_soft_failure = []  # type: List[str]
+    tested: Set[str] = set()
+    tested_but_hard_failure: List[str] = []
+    tested_but_soft_failure: List[str] = []

-    fallback_ascii = None  # type: Optional[CharsetMatch]
-    fallback_u8 = None  # type: Optional[CharsetMatch]
-    fallback_specified = None  # type: Optional[CharsetMatch]
+    fallback_ascii: Optional[CharsetMatch] = None
+    fallback_u8: Optional[CharsetMatch] = None
+    fallback_specified: Optional[CharsetMatch] = None

-    results = CharsetMatches()  # type: CharsetMatches
+    results: CharsetMatches = CharsetMatches()

     sig_encoding, sig_payload = identify_sig_or_bom(sequences)

@ -190,11 +188,11 @@ def from_bytes(
|
||||||
|
|
||||||
tested.add(encoding_iana)
|
tested.add(encoding_iana)
|
||||||
|
|
||||||
decoded_payload = None # type: Optional[str]
|
decoded_payload: Optional[str] = None
|
||||||
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
|
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||||
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
|
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||||
encoding_iana
|
encoding_iana
|
||||||
) # type: bool
|
)
|
||||||
|
|
||||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -205,7 +203,7 @@ def from_bytes(
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
|
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||||
except (ModuleNotFoundError, ImportError):
|
except (ModuleNotFoundError, ImportError):
|
||||||
logger.log(
|
logger.log(
|
||||||
TRACE,
|
TRACE,
|
||||||
|
@ -240,7 +238,7 @@ def from_bytes(
|
||||||
tested_but_hard_failure.append(encoding_iana)
|
tested_but_hard_failure.append(encoding_iana)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
similar_soft_failure_test = False # type: bool
|
similar_soft_failure_test: bool = False
|
||||||
|
|
||||||
for encoding_soft_failed in tested_but_soft_failure:
|
for encoding_soft_failed in tested_but_soft_failure:
|
||||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||||
|
@ -262,11 +260,11 @@ def from_bytes(
|
||||||
int(length / steps),
|
int(length / steps),
|
||||||
)
|
)
|
||||||
|
|
||||||
multi_byte_bonus = (
|
multi_byte_bonus: bool = (
|
||||||
is_multi_byte_decoder
|
is_multi_byte_decoder
|
||||||
and decoded_payload is not None
|
and decoded_payload is not None
|
||||||
and len(decoded_payload) < length
|
and len(decoded_payload) < length
|
||||||
) # type: bool
|
)
|
||||||
|
|
||||||
if multi_byte_bonus:
|
if multi_byte_bonus:
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -276,61 +274,27 @@ def from_bytes(
|
||||||
encoding_iana,
|
encoding_iana,
|
||||||
)
|
)
|
||||||
|
|
||||||
max_chunk_gave_up = int(len(r_) / 4) # type: int
|
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||||
|
|
||||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||||
early_stop_count = 0 # type: int
|
early_stop_count: int = 0
|
||||||
lazy_str_hard_failure = False
|
lazy_str_hard_failure = False
|
||||||
|
|
||||||
md_chunks = [] # type: List[str]
|
md_chunks: List[str] = []
|
||||||
md_ratios = []
|
md_ratios = []
|
||||||
|
|
||||||
for i in r_:
|
|
||||||
if i + chunk_size > length + 8:
|
|
||||||
continue
|
|
||||||
|
|
||||||
cut_sequence = sequences[i : i + chunk_size]
|
|
||||||
|
|
||||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
|
||||||
cut_sequence = sig_payload + cut_sequence
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
chunk = cut_sequence.decode(
|
for chunk in cut_sequence_chunks(
|
||||||
|
sequences,
|
||||||
encoding_iana,
|
encoding_iana,
|
||||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
r_,
|
||||||
) # type: str
|
chunk_size,
|
||||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
bom_or_sig_available,
|
||||||
logger.log(
|
strip_sig_or_bom,
|
||||||
TRACE,
|
sig_payload,
|
||||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
is_multi_byte_decoder,
|
||||||
encoding_iana,
|
decoded_payload,
|
||||||
str(e),
|
|
||||||
)
|
|
||||||
early_stop_count = max_chunk_gave_up
|
|
||||||
lazy_str_hard_failure = True
|
|
||||||
break
|
|
||||||
|
|
||||||
# multi-byte bad cutting detector and adjustment
|
|
||||||
# not the cleanest way to perform that fix but clever enough for now.
|
|
||||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
|
||||||
|
|
||||||
chunk_partial_size_chk = min(chunk_size, 16) # type: int
|
|
||||||
|
|
||||||
if (
|
|
||||||
decoded_payload
|
|
||||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
|
||||||
):
|
):
|
||||||
for j in range(i, i - 4, -1):
|
|
||||||
cut_sequence = sequences[j : i + chunk_size]
|
|
||||||
|
|
||||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
|
||||||
cut_sequence = sig_payload + cut_sequence
|
|
||||||
|
|
||||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
|
||||||
|
|
||||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
|
||||||
break
|
|
||||||
|
|
||||||
md_chunks.append(chunk)
|
md_chunks.append(chunk)
|
||||||
|
|
||||||
md_ratios.append(mess_ratio(chunk, threshold))
|
md_ratios.append(mess_ratio(chunk, threshold))
|
||||||
|
@ -342,6 +306,15 @@ def from_bytes(
|
||||||
bom_or_sig_available and strip_sig_or_bom is False
|
bom_or_sig_available and strip_sig_or_bom is False
|
||||||
):
|
):
|
||||||
break
|
break
|
||||||
|
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||||
|
logger.log(
|
||||||
|
TRACE,
|
||||||
|
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||||
|
encoding_iana,
|
||||||
|
str(e),
|
||||||
|
)
|
||||||
|
early_stop_count = max_chunk_gave_up
|
||||||
|
lazy_str_hard_failure = True
|
||||||
|
|
||||||
# We might want to check the sequence again with the whole content
|
# We might want to check the sequence again with the whole content
|
||||||
# Only if initial MD tests passes
|
# Only if initial MD tests passes
|
||||||
|
@ -362,9 +335,7 @@ def from_bytes(
|
||||||
tested_but_hard_failure.append(encoding_iana)
|
tested_but_hard_failure.append(encoding_iana)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
mean_mess_ratio = (
|
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||||
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
|
||||||
) # type: float
|
|
||||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||||
tested_but_soft_failure.append(encoding_iana)
|
tested_but_soft_failure.append(encoding_iana)
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -399,7 +370,7 @@ def from_bytes(
|
||||||
)
|
)
|
||||||
|
|
||||||
if not is_multi_byte_decoder:
|
if not is_multi_byte_decoder:
|
||||||
target_languages = encoding_languages(encoding_iana) # type: List[str]
|
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||||
else:
|
else:
|
||||||
target_languages = mb_encoding_languages(encoding_iana)
|
target_languages = mb_encoding_languages(encoding_iana)
|
||||||
|
|
||||||
|
@ -516,8 +487,8 @@ def from_fp(
|
||||||
steps: int = 5,
|
steps: int = 5,
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
threshold: float = 0.20,
|
threshold: float = 0.20,
|
||||||
cp_isolation: List[str] = None,
|
cp_isolation: Optional[List[str]] = None,
|
||||||
cp_exclusion: List[str] = None,
|
cp_exclusion: Optional[List[str]] = None,
|
||||||
preemptive_behaviour: bool = True,
|
preemptive_behaviour: bool = True,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> CharsetMatches:
|
) -> CharsetMatches:
|
||||||
|
@ -538,12 +509,12 @@ def from_fp(
|
||||||
|
|
||||||
|
|
||||||
def from_path(
|
def from_path(
|
||||||
path: PathLike,
|
path: "PathLike[Any]",
|
||||||
steps: int = 5,
|
steps: int = 5,
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
threshold: float = 0.20,
|
threshold: float = 0.20,
|
||||||
cp_isolation: List[str] = None,
|
cp_isolation: Optional[List[str]] = None,
|
||||||
cp_exclusion: List[str] = None,
|
cp_exclusion: Optional[List[str]] = None,
|
||||||
preemptive_behaviour: bool = True,
|
preemptive_behaviour: bool = True,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> CharsetMatches:
|
) -> CharsetMatches:
|
||||||
|
@@ -565,17 +536,22 @@ def from_path(


 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
     Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
     """
+    warnings.warn(
+        "normalize is deprecated and will be removed in 3.0",
+        DeprecationWarning,
+    )
+
     results = from_path(
         path,
         steps,
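The new `warnings.warn(...)` call above marks `normalize` as deprecated ahead of its removal in 3.0. A generic sketch of that deprecation pattern, using an illustrative function name rather than the library's own:

import warnings


def normalize_file(path: str) -> None:
    """Illustrative stand-in for a helper slated for removal."""
    warnings.warn(
        "normalize_file is deprecated and will be removed in a future release",
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller (not used in the diff above)
    )
    # ...original behaviour would continue here...


# DeprecationWarning is filtered out by default; surface it for this demo.
warnings.simplefilter("default", DeprecationWarning)
normalize_file("example.txt")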
@@ -1,11 +1,8 @@
-# -*- coding: utf_8 -*-
-from collections import OrderedDict
-
-FREQUENCIES = OrderedDict(
-    [
-        (
-            "English",
-            [
-                "e",
-                "a",
-                "t",
+# -*- coding: utf-8 -*-
+from typing import Dict, List
+
+FREQUENCIES: Dict[str, List[str]] = {
+    "English": [
+        "e",
+        "a",
+        "t",
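Here the FREQUENCIES table changes from an `OrderedDict` built out of tuples to a plain annotated dict literal. On Python 3.7+ the built-in dict preserves insertion order, so iteration order is identical; a short sketch with the letter lists truncated to the few entries visible above:

from collections import OrderedDict
from typing import Dict, List

# Old shape: OrderedDict of (language, letters) tuples.
freq_old = OrderedDict(
    [
        ("English", ["e", "a", "t"]),
        ("German", ["e", "n", "i"]),
    ]
)

# New shape: plain dict literal with an explicit annotation.
freq_new: Dict[str, List[str]] = {
    "English": ["e", "a", "t"],
    "German": ["e", "n", "i"],
}

# Same keys, same order, same lookups.
assert list(freq_old) == list(freq_new) == ["English", "German"]
assert freq_old["German"] == freq_new["German"]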
@ -33,10 +30,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"German": [
|
||||||
(
|
|
||||||
"German",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"n",
|
"n",
|
||||||
"i",
|
"i",
|
||||||
|
@ -64,10 +58,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ö",
|
"ö",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"French": [
|
||||||
(
|
|
||||||
"French",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"s",
|
"s",
|
||||||
|
@ -95,10 +86,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"y",
|
"y",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"Dutch": [
|
||||||
(
|
|
||||||
"Dutch",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"n",
|
"n",
|
||||||
"a",
|
"a",
|
||||||
|
@ -126,10 +114,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"x",
|
"x",
|
||||||
"ë",
|
"ë",
|
||||||
],
|
],
|
||||||
),
|
"Italian": [
|
||||||
(
|
|
||||||
"Italian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
|
@ -157,10 +142,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"y",
|
"y",
|
||||||
"ò",
|
"ò",
|
||||||
],
|
],
|
||||||
),
|
"Polish": [
|
||||||
(
|
|
||||||
"Polish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"o",
|
"o",
|
||||||
|
@ -188,10 +170,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ę",
|
"ę",
|
||||||
"ó",
|
"ó",
|
||||||
],
|
],
|
||||||
),
|
"Spanish": [
|
||||||
(
|
|
||||||
"Spanish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"o",
|
"o",
|
||||||
|
@ -219,10 +198,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"á",
|
"á",
|
||||||
],
|
],
|
||||||
),
|
"Russian": [
|
||||||
(
|
|
||||||
"Russian",
|
|
||||||
[
|
|
||||||
"о",
|
"о",
|
||||||
"а",
|
"а",
|
||||||
"е",
|
"е",
|
||||||
|
@ -250,10 +226,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ж",
|
"ж",
|
||||||
"ц",
|
"ц",
|
||||||
],
|
],
|
||||||
),
|
"Japanese": [
|
||||||
(
|
|
||||||
"Japanese",
|
|
||||||
[
|
|
||||||
"の",
|
"の",
|
||||||
"に",
|
"に",
|
||||||
"る",
|
"る",
|
||||||
|
@ -281,10 +254,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"も",
|
"も",
|
||||||
"り",
|
"り",
|
||||||
],
|
],
|
||||||
),
|
"Portuguese": [
|
||||||
(
|
|
||||||
"Portuguese",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
"o",
|
"o",
|
||||||
|
@ -312,10 +282,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"í",
|
"í",
|
||||||
],
|
],
|
||||||
),
|
"Swedish": [
|
||||||
(
|
|
||||||
"Swedish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"n",
|
"n",
|
||||||
|
@ -343,10 +310,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"j",
|
"j",
|
||||||
"x",
|
"x",
|
||||||
],
|
],
|
||||||
),
|
"Chinese": [
|
||||||
(
|
|
||||||
"Chinese",
|
|
||||||
[
|
|
||||||
"的",
|
"的",
|
||||||
"一",
|
"一",
|
||||||
"是",
|
"是",
|
||||||
|
@ -377,10 +341,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"出",
|
"出",
|
||||||
"会",
|
"会",
|
||||||
],
|
],
|
||||||
),
|
"Ukrainian": [
|
||||||
(
|
|
||||||
"Ukrainian",
|
|
||||||
[
|
|
||||||
"о",
|
"о",
|
||||||
"а",
|
"а",
|
||||||
"н",
|
"н",
|
||||||
|
@ -408,10 +369,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ц",
|
"ц",
|
||||||
"ї",
|
"ї",
|
||||||
],
|
],
|
||||||
),
|
"Norwegian": [
|
||||||
(
|
|
||||||
"Norwegian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"r",
|
"r",
|
||||||
"n",
|
"n",
|
||||||
|
@ -439,10 +397,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"æ",
|
"æ",
|
||||||
"w",
|
"w",
|
||||||
],
|
],
|
||||||
),
|
"Finnish": [
|
||||||
(
|
|
||||||
"Finnish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"n",
|
"n",
|
||||||
|
@ -470,10 +425,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"w",
|
"w",
|
||||||
"z",
|
"z",
|
||||||
],
|
],
|
||||||
),
|
"Vietnamese": [
|
||||||
(
|
|
||||||
"Vietnamese",
|
|
||||||
[
|
|
||||||
"n",
|
"n",
|
||||||
"h",
|
"h",
|
||||||
"t",
|
"t",
|
||||||
|
@ -501,10 +453,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ộ",
|
"ộ",
|
||||||
"ế",
|
"ế",
|
||||||
],
|
],
|
||||||
),
|
"Czech": [
|
||||||
(
|
|
||||||
"Czech",
|
|
||||||
[
|
|
||||||
"o",
|
"o",
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
|
@ -532,10 +481,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"é",
|
"é",
|
||||||
"ř",
|
"ř",
|
||||||
],
|
],
|
||||||
),
|
"Hungarian": [
|
||||||
(
|
|
||||||
"Hungarian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"t",
|
"t",
|
||||||
|
@ -563,10 +509,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"f",
|
"f",
|
||||||
"c",
|
"c",
|
||||||
],
|
],
|
||||||
),
|
"Korean": [
|
||||||
(
|
|
||||||
"Korean",
|
|
||||||
[
|
|
||||||
"이",
|
"이",
|
||||||
"다",
|
"다",
|
||||||
"에",
|
"에",
|
||||||
|
@ -594,10 +537,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"스",
|
"스",
|
||||||
"일",
|
"일",
|
||||||
],
|
],
|
||||||
),
|
"Indonesian": [
|
||||||
(
|
|
||||||
"Indonesian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"n",
|
"n",
|
||||||
"e",
|
"e",
|
||||||
|
@ -625,10 +565,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"x",
|
"x",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"Turkish": [
|
||||||
(
|
|
||||||
"Turkish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
|
@ -656,10 +593,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ç",
|
"ç",
|
||||||
"ğ",
|
"ğ",
|
||||||
],
|
],
|
||||||
),
|
"Romanian": [
|
||||||
(
|
|
||||||
"Romanian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
|
@ -687,10 +621,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"â",
|
"â",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"Farsi": [
|
||||||
(
|
|
||||||
"Farsi",
|
|
||||||
[
|
|
||||||
"ا",
|
"ا",
|
||||||
"ی",
|
"ی",
|
||||||
"ر",
|
"ر",
|
||||||
|
@ -718,10 +649,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ط",
|
"ط",
|
||||||
"ص",
|
"ص",
|
||||||
],
|
],
|
||||||
),
|
"Arabic": [
|
||||||
(
|
|
||||||
"Arabic",
|
|
||||||
[
|
|
||||||
"ا",
|
"ا",
|
||||||
"ل",
|
"ل",
|
||||||
"ي",
|
"ي",
|
||||||
|
@ -749,10 +677,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"خ",
|
"خ",
|
||||||
"إ",
|
"إ",
|
||||||
],
|
],
|
||||||
),
|
"Danish": [
|
||||||
(
|
|
||||||
"Danish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"r",
|
"r",
|
||||||
"n",
|
"n",
|
||||||
|
@ -780,10 +705,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"j",
|
"j",
|
||||||
"w",
|
"w",
|
||||||
],
|
],
|
||||||
),
|
"Serbian": [
|
||||||
(
|
|
||||||
"Serbian",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"и",
|
"и",
|
||||||
"о",
|
"о",
|
||||||
|
@ -811,10 +733,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ц",
|
"ц",
|
||||||
"ш",
|
"ш",
|
||||||
],
|
],
|
||||||
),
|
"Lithuanian": [
|
||||||
(
|
|
||||||
"Lithuanian",
|
|
||||||
[
|
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
"s",
|
"s",
|
||||||
|
@ -842,10 +761,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ą",
|
"ą",
|
||||||
"į",
|
"į",
|
||||||
],
|
],
|
||||||
),
|
"Slovene": [
|
||||||
(
|
|
||||||
"Slovene",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
|
@ -873,10 +789,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"f",
|
"f",
|
||||||
"y",
|
"y",
|
||||||
],
|
],
|
||||||
),
|
"Slovak": [
|
||||||
(
|
|
||||||
"Slovak",
|
|
||||||
[
|
|
||||||
"o",
|
"o",
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
|
@ -904,10 +817,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"č",
|
"č",
|
||||||
"é",
|
"é",
|
||||||
],
|
],
|
||||||
),
|
"Hebrew": [
|
||||||
(
|
|
||||||
"Hebrew",
|
|
||||||
[
|
|
||||||
"י",
|
"י",
|
||||||
"ו",
|
"ו",
|
||||||
"ה",
|
"ה",
|
||||||
|
@ -934,10 +844,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ז",
|
"ז",
|
||||||
"ך",
|
"ך",
|
||||||
],
|
],
|
||||||
),
|
"Bulgarian": [
|
||||||
(
|
|
||||||
"Bulgarian",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"и",
|
"и",
|
||||||
"о",
|
"о",
|
||||||
|
@ -965,10 +872,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"щ",
|
"щ",
|
||||||
"х",
|
"х",
|
||||||
],
|
],
|
||||||
),
|
"Croatian": [
|
||||||
(
|
|
||||||
"Croatian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"o",
|
"o",
|
||||||
|
@ -996,10 +900,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ć",
|
"ć",
|
||||||
"f",
|
"f",
|
||||||
],
|
],
|
||||||
),
|
"Hindi": [
|
||||||
(
|
|
||||||
"Hindi",
|
|
||||||
[
|
|
||||||
"क",
|
"क",
|
||||||
"र",
|
"र",
|
||||||
"स",
|
"स",
|
||||||
|
@ -1027,10 +928,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ष",
|
"ष",
|
||||||
"इ",
|
"इ",
|
||||||
],
|
],
|
||||||
),
|
"Estonian": [
|
||||||
(
|
|
||||||
"Estonian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"e",
|
"e",
|
||||||
|
@ -1058,10 +956,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ö",
|
"ö",
|
||||||
"y",
|
"y",
|
||||||
],
|
],
|
||||||
),
|
"Simple English": [
|
||||||
(
|
|
||||||
"Simple English",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"t",
|
"t",
|
||||||
|
@ -1089,10 +984,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"Thai": [
|
||||||
(
|
|
||||||
"Thai",
|
|
||||||
[
|
|
||||||
"า",
|
"า",
|
||||||
"น",
|
"น",
|
||||||
"ร",
|
"ร",
|
||||||
|
@ -1120,10 +1012,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ข",
|
"ข",
|
||||||
"ใ",
|
"ใ",
|
||||||
],
|
],
|
||||||
),
|
"Greek": [
|
||||||
(
|
|
||||||
"Greek",
|
|
||||||
[
|
|
||||||
"α",
|
"α",
|
||||||
"τ",
|
"τ",
|
||||||
"ο",
|
"ο",
|
||||||
|
@ -1151,10 +1040,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"θ",
|
"θ",
|
||||||
"ύ",
|
"ύ",
|
||||||
],
|
],
|
||||||
),
|
"Tamil": [
|
||||||
(
|
|
||||||
"Tamil",
|
|
||||||
[
|
|
||||||
"க",
|
"க",
|
||||||
"த",
|
"த",
|
||||||
"ப",
|
"ப",
|
||||||
|
@ -1180,10 +1066,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ஒ",
|
"ஒ",
|
||||||
"ஸ",
|
"ஸ",
|
||||||
],
|
],
|
||||||
),
|
"Classical Chinese": [
|
||||||
(
|
|
||||||
"Classical Chinese",
|
|
||||||
[
|
|
||||||
"之",
|
"之",
|
||||||
"年",
|
"年",
|
||||||
"為",
|
"為",
|
||||||
|
@ -1208,10 +1091,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"五",
|
"五",
|
||||||
"四",
|
"四",
|
||||||
],
|
],
|
||||||
),
|
"Kazakh": [
|
||||||
(
|
|
||||||
"Kazakh",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"ы",
|
"ы",
|
||||||
"е",
|
"е",
|
||||||
|
@@ -1239,6 +1119,4 @@ FREQUENCIES = OrderedDict(
-                "г",
-                "ө",
-            ],
-        ),
-    ]
-)
+        "г",
+        "ө",
+    ],
+}
@@ -1,8 +1,8 @@
 import importlib
 from codecs import IncrementalDecoder
-from collections import Counter, OrderedDict
+from collections import Counter
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

 from .assets import FREQUENCIES
 from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@ -24,17 +24,19 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
|
||||||
if is_multi_byte_encoding(iana_name):
|
if is_multi_byte_encoding(iana_name):
|
||||||
raise IOError("Function not supported on multi-byte code page")
|
raise IOError("Function not supported on multi-byte code page")
|
||||||
|
|
||||||
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
|
decoder = importlib.import_module(
|
||||||
|
"encodings.{}".format(iana_name)
|
||||||
|
).IncrementalDecoder
|
||||||
|
|
||||||
p = decoder(errors="ignore") # type: IncrementalDecoder
|
p: IncrementalDecoder = decoder(errors="ignore")
|
||||||
seen_ranges = {} # type: Dict[str, int]
|
seen_ranges: Dict[str, int] = {}
|
||||||
character_count = 0 # type: int
|
character_count: int = 0
|
||||||
|
|
||||||
for i in range(0x40, 0xFF):
|
for i in range(0x40, 0xFF):
|
||||||
chunk = p.decode(bytes([i])) # type: str
|
chunk: str = p.decode(bytes([i]))
|
||||||
|
|
||||||
if chunk:
|
if chunk:
|
||||||
character_range = unicode_range(chunk) # type: Optional[str]
|
character_range: Optional[str] = unicode_range(chunk)
|
||||||
|
|
||||||
if character_range is None:
|
if character_range is None:
|
||||||
continue
|
continue
|
||||||
|
@ -58,7 +60,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Return inferred languages used with a unicode range.
|
Return inferred languages used with a unicode range.
|
||||||
"""
|
"""
|
||||||
languages = [] # type: List[str]
|
languages: List[str] = []
|
||||||
|
|
||||||
for language, characters in FREQUENCIES.items():
|
for language, characters in FREQUENCIES.items():
|
||||||
for character in characters:
|
for character in characters:
|
||||||
|
@ -75,8 +77,8 @@ def encoding_languages(iana_name: str) -> List[str]:
|
||||||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||||
This function does the correspondence.
|
This function does the correspondence.
|
||||||
"""
|
"""
|
||||||
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
|
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||||
primary_range = None # type: Optional[str]
|
primary_range: Optional[str] = None
|
||||||
|
|
||||||
for specified_range in unicode_ranges:
|
for specified_range in unicode_ranges:
|
||||||
if "Latin" not in specified_range:
|
if "Latin" not in specified_range:
|
||||||
|
@ -115,8 +117,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
|
||||||
"""
|
"""
|
||||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||||
"""
|
"""
|
||||||
target_have_accents = False # type: bool
|
target_have_accents: bool = False
|
||||||
target_pure_latin = True # type: bool
|
target_pure_latin: bool = True
|
||||||
|
|
||||||
for character in FREQUENCIES[language]:
|
for character in FREQUENCIES[language]:
|
||||||
if not target_have_accents and is_accentuated(character):
|
if not target_have_accents and is_accentuated(character):
|
||||||
|
@ -133,7 +135,7 @@ def alphabet_languages(
|
||||||
"""
|
"""
|
||||||
Return associated languages associated to given characters.
|
Return associated languages associated to given characters.
|
||||||
"""
|
"""
|
||||||
languages = [] # type: List[Tuple[str, float]]
|
languages: List[Tuple[str, float]] = []
|
||||||
|
|
||||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||||
|
|
||||||
|
@ -147,13 +149,13 @@ def alphabet_languages(
|
||||||
if target_have_accents is False and source_have_accents:
|
if target_have_accents is False and source_have_accents:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
character_count = len(language_characters) # type: int
|
character_count: int = len(language_characters)
|
||||||
|
|
||||||
character_match_count = len(
|
character_match_count: int = len(
|
||||||
[c for c in language_characters if c in characters]
|
[c for c in language_characters if c in characters]
|
||||||
) # type: int
|
)
|
||||||
|
|
||||||
ratio = character_match_count / character_count # type: float
|
ratio: float = character_match_count / character_count
|
||||||
|
|
||||||
if ratio >= 0.2:
|
if ratio >= 0.2:
|
||||||
languages.append((language, ratio))
|
languages.append((language, ratio))
|
||||||
|
@@ -174,36 +176,33 @@ def characters_popularity_compare(
     if language not in FREQUENCIES:
         raise ValueError("{} not available".format(language))

-    character_approved_count = 0  # type: int
+    character_approved_count: int = 0
+    FREQUENCIES_language_set = set(FREQUENCIES[language])

     for character in ordered_characters:
-        if character not in FREQUENCIES[language]:
+        if character not in FREQUENCIES_language_set:
             continue

-        characters_before_source = FREQUENCIES[language][
+        characters_before_source: List[str] = FREQUENCIES[language][
             0 : FREQUENCIES[language].index(character)
-        ]  # type: List[str]
-        characters_after_source = FREQUENCIES[language][
+        ]
+        characters_after_source: List[str] = FREQUENCIES[language][
             FREQUENCIES[language].index(character) :
-        ]  # type: List[str]
-        characters_before = ordered_characters[
+        ]
+        characters_before: List[str] = ordered_characters[
             0 : ordered_characters.index(character)
-        ]  # type: List[str]
-        characters_after = ordered_characters[
+        ]
+        characters_after: List[str] = ordered_characters[
             ordered_characters.index(character) :
-        ]  # type: List[str]
+        ]

-        before_match_count = [
-            e in characters_before for e in characters_before_source
-        ].count(
-            True
-        )  # type: int
-        after_match_count = [
-            e in characters_after for e in characters_after_source
-        ].count(
-            True
-        )  # type: int
+        before_match_count: int = len(
+            set(characters_before) & set(characters_before_source)
+        )
+
+        after_match_count: int = len(
+            set(characters_after) & set(characters_after_source)
+        )

         if len(characters_before_source) == 0 and before_match_count <= 4:
             character_approved_count += 1
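Besides the annotation cleanup, this hunk swaps a list-comprehension-plus-`.count(True)` tally for a set intersection. The two are equivalent here because each frequency list contains a letter at most once, and the set form avoids a linear scan per membership test. A toy illustration (data made up):

characters_before = ["e", "a", "t", "o"]         # observed ordering (toy data)
characters_before_source = ["e", "t", "a", "i"]  # reference ordering (toy data)

# Old style: one `in` test per reference character, each scanning the list.
before_match_count_old = [
    e in characters_before for e in characters_before_source
].count(True)

# New style: build sets once, intersect, take the size.
before_match_count_new = len(set(characters_before) & set(characters_before_source))

assert before_match_count_old == before_match_count_new == 3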
@ -229,18 +228,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
||||||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||||
One containing the latin letters and the other hebrew.
|
One containing the latin letters and the other hebrew.
|
||||||
"""
|
"""
|
||||||
layers = OrderedDict() # type: Dict[str, str]
|
layers: Dict[str, str] = {}
|
||||||
|
|
||||||
for character in decoded_sequence:
|
for character in decoded_sequence:
|
||||||
if character.isalpha() is False:
|
if character.isalpha() is False:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
character_range = unicode_range(character) # type: Optional[str]
|
character_range: Optional[str] = unicode_range(character)
|
||||||
|
|
||||||
if character_range is None:
|
if character_range is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
layer_target_range = None # type: Optional[str]
|
layer_target_range: Optional[str] = None
|
||||||
|
|
||||||
for discovered_range in layers:
|
for discovered_range in layers:
|
||||||
if (
|
if (
|
||||||
|
@ -267,7 +266,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
||||||
This function merge results previously given by the function coherence_ratio.
|
This function merge results previously given by the function coherence_ratio.
|
||||||
The return type is the same as coherence_ratio.
|
The return type is the same as coherence_ratio.
|
||||||
"""
|
"""
|
||||||
per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
|
per_language_ratios: Dict[str, List[float]] = {}
|
||||||
for result in results:
|
for result in results:
|
||||||
for sub_result in result:
|
for sub_result in result:
|
||||||
language, ratio = sub_result
|
language, ratio = sub_result
|
||||||
|
@ -299,10 +298,10 @@ def coherence_ratio(
|
||||||
A layer = Character extraction by alphabets/ranges.
|
A layer = Character extraction by alphabets/ranges.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
results = [] # type: List[Tuple[str, float]]
|
results: List[Tuple[str, float]] = []
|
||||||
ignore_non_latin = False # type: bool
|
ignore_non_latin: bool = False
|
||||||
|
|
||||||
sufficient_match_count = 0 # type: int
|
sufficient_match_count: int = 0
|
||||||
|
|
||||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||||
if "Latin Based" in lg_inclusion_list:
|
if "Latin Based" in lg_inclusion_list:
|
||||||
|
@ -310,22 +309,22 @@ def coherence_ratio(
|
||||||
lg_inclusion_list.remove("Latin Based")
|
lg_inclusion_list.remove("Latin Based")
|
||||||
|
|
||||||
for layer in alpha_unicode_split(decoded_sequence):
|
for layer in alpha_unicode_split(decoded_sequence):
|
||||||
sequence_frequencies = Counter(layer) # type: Counter
|
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||||
most_common = sequence_frequencies.most_common()
|
most_common = sequence_frequencies.most_common()
|
||||||
|
|
||||||
character_count = sum(o for c, o in most_common) # type: int
|
character_count: int = sum(o for c, o in most_common)
|
||||||
|
|
||||||
if character_count <= TOO_SMALL_SEQUENCE:
|
if character_count <= TOO_SMALL_SEQUENCE:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
popular_character_ordered = [c for c, o in most_common] # type: List[str]
|
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||||
|
|
||||||
for language in lg_inclusion_list or alphabet_languages(
|
for language in lg_inclusion_list or alphabet_languages(
|
||||||
popular_character_ordered, ignore_non_latin
|
popular_character_ordered, ignore_non_latin
|
||||||
):
|
):
|
||||||
ratio = characters_popularity_compare(
|
ratio: float = characters_popularity_compare(
|
||||||
language, popular_character_ordered
|
language, popular_character_ordered
|
||||||
) # type: float
|
)
|
||||||
|
|
||||||
if ratio < threshold:
|
if ratio < threshold:
|
||||||
continue
|
continue
|
||||||
|
|
|
@@ -3,7 +3,12 @@ import sys
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional

+try:
+    from unicodedata2 import unidata_version
+except ImportError:
+    from unicodedata import unidata_version
+
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
@ -43,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||||
|
|
||||||
|
|
||||||
def cli_detect(argv: List[str] = None) -> int:
|
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||||
"""
|
"""
|
||||||
CLI assistant using ARGV and ArgumentParser
|
CLI assistant using ARGV and ArgumentParser
|
||||||
:param argv:
|
:param argv:
|
||||||
|
@@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
         "-t",
         "--threshold",
         action="store",
-        default=0.1,
+        default=0.2,
         type=float,
         dest="threshold",
         help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
@@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
     parser.add_argument(
         "--version",
         action="version",
-        version="Charset-Normalizer {} - Python {}".format(
-            __version__, python_version()
+        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
+            __version__, python_version(), unidata_version
         ),
         help="Show version information and exit.",
     )
@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
||||||
my_file.close()
|
my_file.close()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
o_ = my_file.name.split(".") # type: List[str]
|
o_: List[str] = my_file.name.split(".")
|
||||||
|
|
||||||
if args.replace is False:
|
if args.replace is False:
|
||||||
o_.insert(-1, best_guess.encoding)
|
o_.insert(-1, best_guess.encoding)
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||||
from collections import OrderedDict
|
|
||||||
from encodings.aliases import aliases
|
from encodings.aliases import aliases
|
||||||
from re import IGNORECASE, compile as re_compile
|
from re import IGNORECASE, compile as re_compile
|
||||||
from typing import Dict, List, Set, Union
|
from typing import Dict, List, Set, Union
|
||||||
|
@@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
 from .assets import FREQUENCIES

 # Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS = OrderedDict(
-    [
-        ("utf_8", BOM_UTF8),
-        (
-            "utf_7",
-            [
-                b"\x2b\x2f\x76\x38",
-                b"\x2b\x2f\x76\x39",
-                b"\x2b\x2f\x76\x2b",
-                b"\x2b\x2f\x76\x2f",
-                b"\x2b\x2f\x76\x38\x2d",
-            ],
-        ),
-        ("gb18030", b"\x84\x31\x95\x33"),
-        ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
-        ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
-    ]
-)  # type: Dict[str, Union[bytes, List[bytes]]]
+ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+    "utf_8": BOM_UTF8,
+    "utf_7": [
+        b"\x2b\x2f\x76\x38",
+        b"\x2b\x2f\x76\x39",
+        b"\x2b\x2f\x76\x2b",
+        b"\x2b\x2f\x76\x2f",
+        b"\x2b\x2f\x76\x38\x2d",
+    ],
+    "gb18030": b"\x84\x31\x95\x33",
+    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}

-TOO_SMALL_SEQUENCE = 32  # type: int
-TOO_BIG_SEQUENCE = int(10e6)  # type: int
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)

-UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int
+UTF8_MAXIMAL_ALLOCATION: int = 1112064

-UNICODE_RANGES_COMBINED = {
+UNICODE_RANGES_COMBINED: Dict[str, range] = {
     "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
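ENCODING_MARKS maps an encoding name to either a single BOM/signature or a list of alternatives. A hedged sketch of how such a table can be used to probe the start of a payload; the helper below is illustrative and not the library's own `identify_sig_or_bom`:

from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from typing import Dict, List, Optional, Tuple, Union

ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
    "utf_8": BOM_UTF8,
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],  # listed before utf_16: its LE BOM starts with the utf_16 one
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}


def sniff_signature(payload: bytes) -> Optional[Tuple[str, bytes]]:
    """Illustrative helper: (encoding, mark) if the payload begins with a known BOM/SIG."""
    for encoding, marks in ENCODING_MARKS.items():
        for mark in marks if isinstance(marks, list) else [marks]:
            if payload.startswith(mark):
                return encoding, mark
    return None


print(sniff_signature(BOM_UTF8 + "hello".encode("utf-8")))  # ('utf_8', b'\xef\xbb\xbf')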
@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
|
||||||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||||
"Tags": range(917504, 917631 + 1),
|
"Tags": range(917504, 917631 + 1),
|
||||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||||
} # type: Dict[str, range]
|
}
|
||||||
|
|
||||||
|
|
||||||
UNICODE_SECONDARY_RANGE_KEYWORD = [
|
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||||
"Supplement",
|
"Supplement",
|
||||||
"Extended",
|
"Extended",
|
||||||
"Extensions",
|
"Extensions",
|
||||||
|
@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
|
||||||
"Shapes",
|
"Shapes",
|
||||||
"Supplemental",
|
"Supplemental",
|
||||||
"Tags",
|
"Tags",
|
||||||
] # type: List[str]
|
]
|
||||||
|
|
||||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||||
IGNORECASE,
|
IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
IANA_SUPPORTED = sorted(
|
IANA_SUPPORTED: List[str] = sorted(
|
||||||
filter(
|
filter(
|
||||||
lambda x: x.endswith("_codec") is False
|
lambda x: x.endswith("_codec") is False
|
||||||
and x not in {"rot_13", "tactis", "mbcs"},
|
and x not in {"rot_13", "tactis", "mbcs"},
|
||||||
list(set(aliases.values())),
|
list(set(aliases.values())),
|
||||||
)
|
)
|
||||||
) # type: List[str]
|
)
|
||||||
|
|
||||||
IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
|
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||||
|
|
||||||
# pre-computed code page that are similar using the function cp_similarity.
|
# pre-computed code page that are similar using the function cp_similarity.
|
||||||
IANA_SUPPORTED_SIMILAR = {
|
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||||
"cp1125": ["cp866"],
|
"cp1125": ["cp866"],
|
||||||
|
@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
|
||||||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||||
"ptcp154": ["cp1251", "kz1048"],
|
"ptcp154": ["cp1251", "kz1048"],
|
||||||
"tis_620": ["iso8859_11"],
|
"tis_620": ["iso8859_11"],
|
||||||
} # type: Dict[str, List[str]]
|
}
|
||||||
|
|
||||||
|
|
||||||
CHARDET_CORRESPONDENCE = {
|
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||||
"iso2022_kr": "ISO-2022-KR",
|
"iso2022_kr": "ISO-2022-KR",
|
||||||
"iso2022_jp": "ISO-2022-JP",
|
"iso2022_jp": "ISO-2022-JP",
|
||||||
"euc_kr": "EUC-KR",
|
"euc_kr": "EUC-KR",
|
||||||
|
@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
|
||||||
"cp1256": "windows-1256",
|
"cp1256": "windows-1256",
|
||||||
"cp1254": "Windows-1254",
|
"cp1254": "Windows-1254",
|
||||||
"cp949": "CP949",
|
"cp949": "CP949",
|
||||||
} # type: Dict[str, str]
|
}
|
||||||
|
|
||||||
|
|
||||||
COMMON_SAFE_ASCII_CHARACTERS = {
|
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||||
"<",
|
"<",
|
||||||
">",
|
">",
|
||||||
"=",
|
"=",
|
||||||
|
@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
|
||||||
"|",
|
"|",
|
||||||
'"',
|
'"',
|
||||||
"-",
|
"-",
|
||||||
} # type: Set[str]
|
}
|
||||||
|
|
||||||
|
|
||||||
KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
|
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||||
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
|
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||||
|
|
||||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||||
|
|
||||||
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
|
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||||
|
|
||||||
# Logging LEVEL bellow DEBUG
|
# Logging LEVEL bellow DEBUG
|
||||||
TRACE = 5 # type: int
|
TRACE: int = 5
|
||||||
|
|
|
@ -16,6 +16,7 @@ from .utils import (
|
||||||
is_separator,
|
is_separator,
|
||||||
is_symbol,
|
is_symbol,
|
||||||
is_thai,
|
is_thai,
|
||||||
|
is_unprintable,
|
||||||
remove_accent,
|
remove_accent,
|
||||||
unicode_range,
|
unicode_range,
|
||||||
)
|
)
|
||||||
|
@ -57,12 +58,12 @@ class MessDetectorPlugin:
|
||||||
|
|
||||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._punctuation_count = 0 # type: int
|
self._punctuation_count: int = 0
|
||||||
self._symbol_count = 0 # type: int
|
self._symbol_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
|
|
||||||
self._last_printable_char = None # type: Optional[str]
|
self._last_printable_char: Optional[str] = None
|
||||||
self._frenzy_symbol_in_word = False # type: bool
|
self._frenzy_symbol_in_word: bool = False
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isprintable()
|
return character.isprintable()
|
||||||
|
@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||||
if self._character_count == 0:
|
if self._character_count == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
ratio_of_punctuation = (
|
ratio_of_punctuation: float = (
|
||||||
self._punctuation_count + self._symbol_count
|
self._punctuation_count + self._symbol_count
|
||||||
) / self._character_count # type: float
|
) / self._character_count
|
||||||
|
|
||||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||||
|
|
||||||
|
|
||||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
self._accentuated_count = 0 # type: int
|
self._accentuated_count: int = 0
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isalpha()
|
return character.isalpha()
|
||||||
|
@@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
     def ratio(self) -> float:
         if self._character_count == 0:
             return 0.0
-        ratio_of_accentuation = (
-            self._accentuated_count / self._character_count
-        )  # type: float
+        ratio_of_accentuation: float = self._accentuated_count / self._character_count
         return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


 class UnprintablePlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._unprintable_count = 0  # type: int
-        self._character_count = 0  # type: int
+        self._unprintable_count: int = 0
+        self._character_count: int = 0

     def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
-        if (
-            character.isspace() is False  # includes \n \t \r \v
-            and character.isprintable() is False
-            and character != "\x1A"  # Why? Its the ASCII substitute character.
-        ):
+        if is_unprintable(character):
             self._unprintable_count += 1
         self._character_count += 1

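The three-clause inline test is replaced by a single `is_unprintable` helper imported from `.utils` (added to the import block earlier in this file's diff). Restated as a standalone function, the old inline predicate reads roughly as follows; this is a sketch of the removed condition, not the actual utils implementation:

def is_unprintable(character: str) -> bool:
    # The condition removed above: not whitespace, not printable,
    # and not "\x1A", the ASCII substitute character that is tolerated on purpose.
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"
    )


assert is_unprintable("\x00") is True
assert is_unprintable("\n") is False  # whitespace does not count
assert is_unprintable("A") is False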
@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._successive_count = 0 # type: int
|
self._successive_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
|
|
||||||
self._last_latin_character = None # type: Optional[str]
|
self._last_latin_character: Optional[str] = None
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isalpha() and is_latin(character)
|
return character.isalpha() and is_latin(character)
|
||||||
|
@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuspiciousRange(MessDetectorPlugin):
|
class SuspiciousRange(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._suspicious_successive_range_count = 0 # type: int
|
self._suspicious_successive_range_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
self._last_printable_seen = None # type: Optional[str]
|
self._last_printable_seen: Optional[str] = None
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isprintable()
|
return character.isprintable()
|
||||||
|
@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
self._last_printable_seen = character
|
self._last_printable_seen = character
|
||||||
return
|
return
|
||||||
|
|
||||||
unicode_range_a = unicode_range(
|
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||||
self._last_printable_seen
|
unicode_range_b: Optional[str] = unicode_range(character)
|
||||||
) # type: Optional[str]
|
|
||||||
unicode_range_b = unicode_range(character) # type: Optional[str]
|
|
||||||
|
|
||||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||||
self._suspicious_successive_range_count += 1
|
self._suspicious_successive_range_count += 1
|
||||||
|
@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
if self._character_count == 0:
|
if self._character_count == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
ratio_of_suspicious_range_usage = (
|
ratio_of_suspicious_range_usage: float = (
|
||||||
self._suspicious_successive_range_count * 2
|
self._suspicious_successive_range_count * 2
|
||||||
) / self._character_count # type: float
|
) / self._character_count
|
||||||
|
|
||||||
if ratio_of_suspicious_range_usage < 0.1:
|
if ratio_of_suspicious_range_usage < 0.1:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._word_count = 0 # type: int
|
self._word_count: int = 0
|
||||||
self._bad_word_count = 0 # type: int
|
self._bad_word_count: int = 0
|
||||||
self._foreign_long_count = 0 # type: int
|
self._foreign_long_count: int = 0
|
||||||
|
|
||||||
-        self._is_current_word_bad = False  # type: bool
+        self._is_current_word_bad: bool = False
-        self._foreign_long_watch = False  # type: bool
+        self._foreign_long_watch: bool = False

-        self._character_count = 0  # type: int
+        self._character_count: int = 0
-        self._bad_character_count = 0  # type: int
+        self._bad_character_count: int = 0

-        self._buffer = ""  # type: str
+        self._buffer: str = ""
-        self._buffer_accent_count = 0  # type: int
+        self._buffer_accent_count: int = 0

     def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
         if character.isalpha():
-            self._buffer = "".join([self._buffer, character])
+            self._buffer += character
             if is_accentuated(character):
                 self._buffer_accent_count += 1
             if (
@@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                 character.isspace() or is_punctuation(character) or is_separator(character)
             ) and self._buffer:
                 self._word_count += 1
-                buffer_length = len(self._buffer)  # type: int
+                buffer_length: int = len(self._buffer)

                 self._character_count += buffer_length

@@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
     """

     def __init__(self) -> None:
-        self._wrong_stop_count = 0  # type: int
+        self._wrong_stop_count: int = 0
-        self._cjk_character_count = 0  # type: int
+        self._cjk_character_count: int = 0

     def eligible(self, character: str) -> bool:
         return True
@@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):

 class ArchaicUpperLowerPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._buf = False  # type: bool
+        self._buf: bool = False

-        self._character_count_since_last_sep = 0  # type: int
+        self._character_count_since_last_sep: int = 0

-        self._successive_upper_lower_count = 0  # type: int
+        self._successive_upper_lower_count: int = 0
-        self._successive_upper_lower_count_final = 0  # type: int
+        self._successive_upper_lower_count_final: int = 0

-        self._character_count = 0  # type: int
+        self._character_count: int = 0

-        self._last_alpha_seen = None  # type: Optional[str]
+        self._last_alpha_seen: Optional[str] = None
-        self._current_ascii_only = True  # type: bool
+        self._current_ascii_only: bool = True

     def eligible(self, character: str) -> bool:
         return True
@@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
         return self._successive_upper_lower_count_final / self._character_count


+@lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
 ) -> bool:
@@ -524,16 +518,16 @@ def mess_ratio(
     Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
     """

-    detectors = [
+    detectors: List[MessDetectorPlugin] = [
         md_class() for md_class in MessDetectorPlugin.__subclasses__()
-    ]  # type: List[MessDetectorPlugin]
+    ]

-    length = len(decoded_sequence) + 1  # type: int
+    length: int = len(decoded_sequence) + 1

-    mean_mess_ratio = 0.0  # type: float
+    mean_mess_ratio: float = 0.0

     if length < 512:
-        intermediary_mean_mess_ratio_calc = 32  # type: int
+        intermediary_mean_mess_ratio_calc: int = 32
     elif length <= 1024:
         intermediary_mean_mess_ratio_calc = 64
     else:
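Most of the hunks above are mechanical: the comment-based type hints that the 2.0.x code carried are rewritten as inline variable annotations. A minimal illustrative sketch of the two spellings (not taken from the diff itself):

    # Before: comment-style hint, the only option prior to PEP 526
    buffer = ""  # type: str

    # After: inline annotation, available on Python 3.6 and newer
    buffer: str = ""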
@@ -4,7 +4,16 @@ from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Counter as TypeCounter,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)

 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .md import mess_ratio
@@ -21,21 +30,21 @@ class CharsetMatch:
         languages: "CoherenceMatches",
         decoded_payload: Optional[str] = None,
     ):
-        self._payload = payload  # type: bytes
+        self._payload: bytes = payload

-        self._encoding = guessed_encoding  # type: str
+        self._encoding: str = guessed_encoding
-        self._mean_mess_ratio = mean_mess_ratio  # type: float
+        self._mean_mess_ratio: float = mean_mess_ratio
-        self._languages = languages  # type: CoherenceMatches
+        self._languages: CoherenceMatches = languages
-        self._has_sig_or_bom = has_sig_or_bom  # type: bool
+        self._has_sig_or_bom: bool = has_sig_or_bom
-        self._unicode_ranges = None  # type: Optional[List[str]]
+        self._unicode_ranges: Optional[List[str]] = None

-        self._leaves = []  # type: List[CharsetMatch]
+        self._leaves: List[CharsetMatch] = []
-        self._mean_coherence_ratio = 0.0  # type: float
+        self._mean_coherence_ratio: float = 0.0

-        self._output_payload = None  # type: Optional[bytes]
+        self._output_payload: Optional[bytes] = None
-        self._output_encoding = None  # type: Optional[str]
+        self._output_encoding: Optional[str] = None

-        self._string = decoded_payload  # type: Optional[str]
+        self._string: Optional[str] = decoded_payload

     def __eq__(self, other: object) -> bool:
         if not isinstance(other, CharsetMatch):
@@ -53,8 +62,8 @@ class CharsetMatch:
         if not isinstance(other, CharsetMatch):
             raise ValueError

-        chaos_difference = abs(self.chaos - other.chaos)  # type: float
+        chaos_difference: float = abs(self.chaos - other.chaos)
-        coherence_difference = abs(self.coherence - other.coherence)  # type: float
+        coherence_difference: float = abs(self.coherence - other.coherence)

         # Bellow 1% difference --> Use Coherence
         if chaos_difference < 0.01 and coherence_difference > 0.02:
@@ -95,7 +104,7 @@ class CharsetMatch:
         return 0.0

     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> TypeCounter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -137,7 +146,7 @@ class CharsetMatch:
         """
         Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
         """
-        also_known_as = []  # type: List[str]
+        also_known_as: List[str] = []
         for u, p in aliases.items():
             if self.encoding == u:
                 also_known_as.append(p)
@@ -227,9 +236,9 @@ class CharsetMatch:
         if self._unicode_ranges is not None:
             return self._unicode_ranges
         # list detected ranges
-        detected_ranges = [
+        detected_ranges: List[Optional[str]] = [
             unicode_range(char) for char in str(self)
-        ]  # type: List[Optional[str]]
+        ]
         # filter and sort
         self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
         return self._unicode_ranges
@@ -280,8 +289,8 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """

-    def __init__(self, results: List[CharsetMatch] = None):
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
-        self._results = sorted(results) if results else []  # type: List[CharsetMatch]
+        self._results: List[CharsetMatch] = sorted(results) if results else []

     def __iter__(self) -> Iterator[CharsetMatch]:
         yield from self._results
@@ -360,17 +369,17 @@ class CliDetectionResult:
         unicode_path: Optional[str],
         is_preferred: bool,
     ):
-        self.path = path  # type: str
+        self.path: str = path
-        self.unicode_path = unicode_path  # type: Optional[str]
+        self.unicode_path: Optional[str] = unicode_path
-        self.encoding = encoding  # type: Optional[str]
+        self.encoding: Optional[str] = encoding
-        self.encoding_aliases = encoding_aliases  # type: List[str]
+        self.encoding_aliases: List[str] = encoding_aliases
-        self.alternative_encodings = alternative_encodings  # type: List[str]
+        self.alternative_encodings: List[str] = alternative_encodings
-        self.language = language  # type: str
+        self.language: str = language
-        self.alphabets = alphabets  # type: List[str]
+        self.alphabets: List[str] = alphabets
-        self.has_sig_or_bom = has_sig_or_bom  # type: bool
+        self.has_sig_or_bom: bool = has_sig_or_bom
-        self.chaos = chaos  # type: float
+        self.chaos: float = chaos
-        self.coherence = coherence  # type: float
+        self.coherence: float = coherence
-        self.is_preferred = is_preferred  # type: bool
+        self.is_preferred: bool = is_preferred

     @property
     def __dict__(self) -> Dict[str, Any]:  # type: ignore
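The expanded typing import above is what lets CharsetMatch.w_counter return a parameterized counter type. A small, self-contained sketch (not part of the diff) of typing.Counter used as a generic annotation:

    from collections import Counter
    from typing import Counter as TypeCounter

    def count_words(text: str) -> TypeCounter[str]:
        # collections.Counter instances satisfy the typing.Counter[str] annotation
        return Counter(text.split())

    print(count_words("a b a"))  # Counter({'a': 2, 'b': 1})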
@@ -1,4 +1,6 @@
 try:
+    # WARNING: unicodedata2 support is going to be removed in 3.0
+    # Python is quickly catching up.
     import unicodedata2 as unicodedata
 except ImportError:
     import unicodedata  # type: ignore[no-redef]
@@ -9,9 +11,9 @@ from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache
 from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union

-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder

 from .constant import (
     ENCODING_MARKS,
@@ -26,7 +28,7 @@ from .constant import (
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_accentuated(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return (
@@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def remove_accent(character: str) -> str:
-    decomposed = unicodedata.decomposition(character)  # type: str
+    decomposed: str = unicodedata.decomposition(character)
     if not decomposed:
         return character

-    codes = decomposed.split(" ")  # type: List[str]
+    codes: List[str] = decomposed.split(" ")

     return chr(int(codes[0], 16))

@@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
     """
     Retrieve the Unicode range official name from a single character.
     """
-    character_ord = ord(character)  # type: int
+    character_ord: int = ord(character)

     for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
         if character_ord in ord_range:
@@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_latin(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return "LATIN" in description


+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_ascii(character: str) -> bool:
     try:
         character.encode("ascii")
@@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_punctuation(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     if "P" in character_category:
         return True

-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_symbol(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     if "S" in character_category or "N" in character_category:
         return True

-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_emoticon(character: str) -> bool:
-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
     if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
         return True

-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     return "Z" in character_category

@@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:


 def is_private_use_only(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     return character_category == "Co"

@@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+    return (
+        character.isspace() is False  # includes \n \t \r \v
+        and character.isprintable() is False
+        and character != "\x1A"  # Why? Its the ASCII substitute character.
+        and character != "\ufeff"  # bug discovered in Python,
+        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+    )
+
+
 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
     """
     Extract using ASCII-only decoder any specified encoding in the first n-bytes.
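The is_unprintable helper added above is a cached per-character predicate. A rough usage sketch, assuming charset-normalizer 2.1.x is installed and the helper is imported from its utils module:

    from charset_normalizer.utils import is_unprintable

    print(is_unprintable("\x05"))   # True: a control character is unprintable
    print(is_unprintable("\n"))     # False: whitespace is tolerated
    print(is_unprintable("\x1a"))   # False: the ASCII substitute character is explicitly allowed
    print(is_unprintable("a"))      # False: ordinary printable text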
@@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     if not isinstance(sequence, bytes):
         raise TypeError

-    seq_len = len(sequence)  # type: int
+    seq_len: int = len(sequence)

-    results = findall(
+    results: List[str] = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
         sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
-    )  # type: List[str]
+    )

     if len(results) == 0:
         return None
@@ -217,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")

+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -242,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )

@@ -253,7 +270,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
     """

     for iana_encoding in ENCODING_MARKS:
-        marks = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]
+        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

         if isinstance(marks, bytes):
             marks = [marks]
@@ -272,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")

+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
@@ -283,10 +303,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:


 def range_scan(decoded_sequence: str) -> List[str]:
-    ranges = set()  # type: Set[str]
+    ranges: Set[str] = set()

     for character in decoded_sequence:
-        character_range = unicode_range(character)  # type: Optional[str]
+        character_range: Optional[str] = unicode_range(character)

         if character_range is None:
             continue
@@ -301,16 +321,20 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0

-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module(
+        "encodings.{}".format(iana_name_a)
+    ).IncrementalDecoder
+    decoder_b = importlib.import_module(
+        "encodings.{}".format(iana_name_b)
+    ).IncrementalDecoder

-    id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
+    id_a: IncrementalDecoder = decoder_a(errors="ignore")
-    id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder
+    id_b: IncrementalDecoder = decoder_b(errors="ignore")

-    character_match_count = 0  # type: int
+    character_match_count: int = 0

     for i in range(255):
-        to_be_decoded = bytes([i])  # type: bytes
+        to_be_decoded: bytes = bytes([i])
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1

@@ -340,3 +364,61 @@ def set_logging_handler(
     handler = logging.StreamHandler()
     handler.setFormatter(logging.Formatter(format_string))
     logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]
+            if not chunk:
+                break
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+                chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break
+
+            yield chunk
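The new cut_sequence_chunks generator factors the chunk-slicing loop into a reusable helper. A hedged sketch of driving it by hand, with illustrative arguments (these are not from the commit; the library normally builds them internally):

    from charset_normalizer.utils import cut_sequence_chunks

    payload = "Bonjour, ceci est un essai accentué.".encode("cp1252")

    chunks = list(
        cut_sequence_chunks(
            sequences=payload,
            encoding_iana="cp1252",
            offsets=range(0, len(payload), 16),  # 16-byte windows over the payload
            chunk_size=16,
            bom_or_sig_available=False,
            strip_sig_or_bom=False,
            sig_payload=b"",
            is_multi_byte_decoder=False,
        )
    )
    print(chunks)  # a few decoded 16-character slices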
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "2.0.12"
+__version__ = "2.1.1"
 VERSION = __version__.split(".")