Update charset-normalizer==2.1.1

JonnyWong16 2022-11-12 17:09:39 -08:00
commit 637ccee60f
10 changed files with 1493 additions and 1556 deletions

charset_normalizer/__init__.py

@@ -1,4 +1,4 @@
-# -*- coding: utf_8 -*-
+# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
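Incidentally, both spellings resolve to the same codec; the commit merely standardizes on the conventional hyphenated form for the PEP 263 coding declaration. A quick check of that claim:

    import codecs

    # "utf_8" and "utf-8" are aliases of one codec; the canonical name is "utf-8"
    assert codecs.lookup("utf_8").name == codecs.lookup("utf-8").name == "utf-8"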

charset_normalizer/api.py

@@ -1,11 +1,8 @@
 import logging
+import warnings
+from os import PathLike
 from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
-
-try:
-    from os import PathLike
-except ImportError:  # pragma: no cover
-    PathLike = str  # type: ignore
+from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
     coherence_ratio,
@@ -18,6 +15,7 @@ from .md import mess_ratio
 from .models import CharsetMatch, CharsetMatches
 from .utils import (
     any_specified_encoding,
+    cut_sequence_chunks,
     iana_name,
     identify_sig_or_bom,
     is_cp_similar,
@@ -39,8 +37,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
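Throughout this commit, implicit-None defaults such as cp_isolation: List[str] = None become Optional[List[str]]. PEP 484 requires the Optional to be spelled out, and strict mypy rejects the old form. A minimal sketch of the pattern outside the library (the function name is illustrative, not from the codebase):

    from typing import List, Optional

    def filter_codepages(cp_isolation: Optional[List[str]] = None) -> List[str]:
        # "cp_isolation: List[str] = None" is invalid under PEP 484:
        # None is not a member of List[str], so Optional must be explicit.
        return cp_isolation if cp_isolation is not None else []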
@@ -70,11 +68,11 @@ def from_bytes(
         )
 
     if explain:
-        previous_logger_level = logger.level  # type: int
+        previous_logger_level: int = logger.level
         logger.addHandler(explain_handler)
         logger.setLevel(TRACE)
 
-    length = len(sequences)  # type: int
+    length: int = len(sequences)
 
     if length == 0:
         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -119,8 +117,8 @@ def from_bytes(
     if steps > 1 and length / steps < chunk_size:
         chunk_size = int(length / steps)
 
-    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
-    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
 
     if is_too_small_sequence:
         logger.log(
@@ -137,11 +135,11 @@ def from_bytes(
             ),
         )
 
-    prioritized_encodings = []  # type: List[str]
+    prioritized_encodings: List[str] = []
 
-    specified_encoding = (
+    specified_encoding: Optional[str] = (
         any_specified_encoding(sequences) if preemptive_behaviour else None
-    )  # type: Optional[str]
+    )
 
     if specified_encoding is not None:
         prioritized_encodings.append(specified_encoding)
@@ -151,15 +149,15 @@ def from_bytes(
             specified_encoding,
         )
 
-    tested = set()  # type: Set[str]
-    tested_but_hard_failure = []  # type: List[str]
-    tested_but_soft_failure = []  # type: List[str]
+    tested: Set[str] = set()
+    tested_but_hard_failure: List[str] = []
+    tested_but_soft_failure: List[str] = []
 
-    fallback_ascii = None  # type: Optional[CharsetMatch]
-    fallback_u8 = None  # type: Optional[CharsetMatch]
-    fallback_specified = None  # type: Optional[CharsetMatch]
+    fallback_ascii: Optional[CharsetMatch] = None
+    fallback_u8: Optional[CharsetMatch] = None
+    fallback_specified: Optional[CharsetMatch] = None
 
-    results = CharsetMatches()  # type: CharsetMatches
+    results: CharsetMatches = CharsetMatches()
 
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
@@ -190,11 +188,11 @@ def from_bytes(
         tested.add(encoding_iana)
 
-        decoded_payload = None  # type: Optional[str]
-        bom_or_sig_available = sig_encoding == encoding_iana  # type: bool
-        strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
+        decoded_payload: Optional[str] = None
+        bom_or_sig_available: bool = sig_encoding == encoding_iana
+        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
             encoding_iana
-        )  # type: bool
+        )
 
         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.log(
@@ -205,7 +203,7 @@ def from_bytes(
             continue
 
         try:
-            is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana)  # type: bool
+            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
         except (ModuleNotFoundError, ImportError):
             logger.log(
                 TRACE,
@@ -240,7 +238,7 @@ def from_bytes(
             tested_but_hard_failure.append(encoding_iana)
             continue
 
-        similar_soft_failure_test = False  # type: bool
+        similar_soft_failure_test: bool = False
 
         for encoding_soft_failed in tested_but_soft_failure:
             if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -262,11 +260,11 @@ def from_bytes(
             int(length / steps),
         )
 
-        multi_byte_bonus = (
+        multi_byte_bonus: bool = (
             is_multi_byte_decoder
             and decoded_payload is not None
             and len(decoded_payload) < length
-        )  # type: bool
+        )
 
         if multi_byte_bonus:
             logger.log(
@@ -276,61 +274,27 @@ def from_bytes(
                 encoding_iana,
             )
 
-        max_chunk_gave_up = int(len(r_) / 4)  # type: int
+        max_chunk_gave_up: int = int(len(r_) / 4)
 
         max_chunk_gave_up = max(max_chunk_gave_up, 2)
-        early_stop_count = 0  # type: int
+        early_stop_count: int = 0
         lazy_str_hard_failure = False
 
-        md_chunks = []  # type: List[str]
+        md_chunks: List[str] = []
         md_ratios = []
 
-        for i in r_:
-            if i + chunk_size > length + 8:
-                continue
-
-            cut_sequence = sequences[i : i + chunk_size]
-
-            if bom_or_sig_available and strip_sig_or_bom is False:
-                cut_sequence = sig_payload + cut_sequence
-
-            try:
-                chunk = cut_sequence.decode(
-                    encoding_iana,
-                    errors="ignore" if is_multi_byte_decoder else "strict",
-                )  # type: str
-            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
-                logger.log(
-                    TRACE,
-                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
-                    encoding_iana,
-                    str(e),
-                )
-                early_stop_count = max_chunk_gave_up
-                lazy_str_hard_failure = True
-                break
-
-            # multi-byte bad cutting detector and adjustment
-            # not the cleanest way to perform that fix but clever enough for now.
-            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
-                chunk_partial_size_chk = min(chunk_size, 16)  # type: int
-
-                if (
-                    decoded_payload
-                    and chunk[:chunk_partial_size_chk] not in decoded_payload
-                ):
-                    for j in range(i, i - 4, -1):
-                        cut_sequence = sequences[j : i + chunk_size]
-
-                        if bom_or_sig_available and strip_sig_or_bom is False:
-                            cut_sequence = sig_payload + cut_sequence
-
-                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
-                        if chunk[:chunk_partial_size_chk] in decoded_payload:
-                            break
-
-            md_chunks.append(chunk)
-            md_ratios.append(mess_ratio(chunk, threshold))
+        try:
+            for chunk in cut_sequence_chunks(
+                sequences,
+                encoding_iana,
+                r_,
+                chunk_size,
+                bom_or_sig_available,
+                strip_sig_or_bom,
+                sig_payload,
+                is_multi_byte_decoder,
+                decoded_payload,
+            ):
+                md_chunks.append(chunk)
+                md_ratios.append(mess_ratio(chunk, threshold))
@@ -342,6 +306,15 @@ def from_bytes(
                     bom_or_sig_available and strip_sig_or_bom is False
                 ):
                     break
+        except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+            logger.log(
+                TRACE,
+                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                encoding_iana,
+                str(e),
+            )
+            early_stop_count = max_chunk_gave_up
+            lazy_str_hard_failure = True
 
         # We might want to check the sequence again with the whole content
         # Only if initial MD tests passes
@@ -362,9 +335,7 @@ def from_bytes(
             tested_but_hard_failure.append(encoding_iana)
             continue
 
-        mean_mess_ratio = (
-            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
-        )  # type: float
+        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
             logger.log(
@@ -399,7 +370,7 @@ def from_bytes(
         )
 
         if not is_multi_byte_decoder:
-            target_languages = encoding_languages(encoding_iana)  # type: List[str]
+            target_languages: List[str] = encoding_languages(encoding_iana)
         else:
             target_languages = mb_encoding_languages(encoding_iana)
@@ -516,8 +487,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -538,12 +509,12 @@ def from_fp(
 def from_path(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -565,17 +536,22 @@ def from_path(
 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
     Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
     """
+    warnings.warn(
+        "normalize is deprecated and will be removed in 3.0",
+        DeprecationWarning,
+    )
+
     results = from_path(
         path,
         steps,
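Since normalize() now emits a DeprecationWarning and is scheduled for removal in 3.0, callers can get the same result with the surviving API. A rough migration sketch, assuming the usual best-match workflow (the file names are illustrative):

    from charset_normalizer import from_path

    best_guess = from_path("./legacy.txt").best()
    if best_guess is not None:
        # str(best_guess) is the payload decoded with the winning encoding
        with open("./legacy-utf8.txt", "wb") as fp:
            fp.write(str(best_guess).encode("utf_8"))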

charset_normalizer/assets/__init__.py

@@ -1,11 +1,8 @@
-# -*- coding: utf_8 -*-
-from collections import OrderedDict
+# -*- coding: utf-8 -*-
+from typing import Dict, List
 
-FREQUENCIES = OrderedDict(
-    [
-        (
-            "English",
-            [
+FREQUENCIES: Dict[str, List[str]] = {
+    "English": [
         "e",
         "a",
         "t",
@@ -33,10 +30,7 @@ FREQUENCIES = OrderedDict(
         "z",
         "q",
     ],
-        ),
-        (
-            "German",
-            [
+    "German": [
         "e",
         "n",
         "i",
@@ -64,10 +58,7 @@ FREQUENCIES = OrderedDict(
         "ö",
         "j",
     ],
-        ),
-        (
-            "French",
-            [
+    "French": [
         "e",
         "a",
         "s",
@@ -95,10 +86,7 @@ FREQUENCIES = OrderedDict(
         "y",
         "j",
     ],
-        ),
-        (
-            "Dutch",
-            [
+    "Dutch": [
         "e",
         "n",
         "a",
@@ -126,10 +114,7 @@ FREQUENCIES = OrderedDict(
         "x",
         "ë",
     ],
-        ),
-        (
-            "Italian",
-            [
+    "Italian": [
         "e",
         "i",
         "a",
@@ -157,10 +142,7 @@ FREQUENCIES = OrderedDict(
         "y",
         "ò",
     ],
-        ),
-        (
-            "Polish",
-            [
+    "Polish": [
         "a",
         "i",
         "o",
@@ -188,10 +170,7 @@ FREQUENCIES = OrderedDict(
         "ę",
         "ó",
     ],
-        ),
-        (
-            "Spanish",
-            [
+    "Spanish": [
         "e",
         "a",
         "o",
@@ -219,10 +198,7 @@ FREQUENCIES = OrderedDict(
         "z",
         "á",
     ],
-        ),
-        (
-            "Russian",
-            [
+    "Russian": [
         "о",
         "а",
         "е",
@@ -250,10 +226,7 @@ FREQUENCIES = OrderedDict(
         "ж",
         "ц",
     ],
-        ),
-        (
-            "Japanese",
-            [
+    "Japanese": [
         "",
         "",
         "",
@@ -281,10 +254,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Portuguese",
-            [
+    "Portuguese": [
         "a",
         "e",
         "o",
@@ -312,10 +282,7 @@ FREQUENCIES = OrderedDict(
         "z",
         "í",
     ],
-        ),
-        (
-            "Swedish",
-            [
+    "Swedish": [
         "e",
         "a",
         "n",
@@ -343,10 +310,7 @@ FREQUENCIES = OrderedDict(
         "j",
         "x",
     ],
-        ),
-        (
-            "Chinese",
-            [
+    "Chinese": [
         "",
         "",
         "",
@@ -377,10 +341,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Ukrainian",
-            [
+    "Ukrainian": [
         "о",
         "а",
         "н",
@@ -408,10 +369,7 @@ FREQUENCIES = OrderedDict(
         "ц",
         "ї",
     ],
-        ),
-        (
-            "Norwegian",
-            [
+    "Norwegian": [
         "e",
         "r",
         "n",
@@ -439,10 +397,7 @@ FREQUENCIES = OrderedDict(
         "æ",
         "w",
     ],
-        ),
-        (
-            "Finnish",
-            [
+    "Finnish": [
         "a",
         "i",
         "n",
@@ -470,10 +425,7 @@ FREQUENCIES = OrderedDict(
         "w",
         "z",
     ],
-        ),
-        (
-            "Vietnamese",
-            [
+    "Vietnamese": [
         "n",
         "h",
         "t",
@@ -501,10 +453,7 @@ FREQUENCIES = OrderedDict(
         "",
         "ế",
     ],
-        ),
-        (
-            "Czech",
-            [
+    "Czech": [
         "o",
         "e",
         "a",
@@ -532,10 +481,7 @@ FREQUENCIES = OrderedDict(
         "é",
         "ř",
     ],
-        ),
-        (
-            "Hungarian",
-            [
+    "Hungarian": [
         "e",
         "a",
         "t",
@@ -563,10 +509,7 @@ FREQUENCIES = OrderedDict(
         "f",
         "c",
     ],
-        ),
-        (
-            "Korean",
-            [
+    "Korean": [
         "",
         "",
         "",
@@ -594,10 +537,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Indonesian",
-            [
+    "Indonesian": [
         "a",
         "n",
         "e",
@@ -625,10 +565,7 @@ FREQUENCIES = OrderedDict(
         "x",
         "q",
     ],
-        ),
-        (
-            "Turkish",
-            [
+    "Turkish": [
         "a",
         "e",
         "i",
@@ -656,10 +593,7 @@ FREQUENCIES = OrderedDict(
         "ç",
         "ğ",
     ],
-        ),
-        (
-            "Romanian",
-            [
+    "Romanian": [
         "e",
         "i",
         "a",
@@ -687,10 +621,7 @@ FREQUENCIES = OrderedDict(
         "â",
         "j",
     ],
-        ),
-        (
-            "Farsi",
-            [
+    "Farsi": [
         "ا",
         "ی",
         "ر",
@@ -718,10 +649,7 @@ FREQUENCIES = OrderedDict(
         "ط",
         "ص",
     ],
-        ),
-        (
-            "Arabic",
-            [
+    "Arabic": [
         "ا",
         "ل",
         "ي",
@@ -749,10 +677,7 @@ FREQUENCIES = OrderedDict(
         "خ",
         "إ",
     ],
-        ),
-        (
-            "Danish",
-            [
+    "Danish": [
         "e",
         "r",
         "n",
@@ -780,10 +705,7 @@ FREQUENCIES = OrderedDict(
         "j",
         "w",
     ],
-        ),
-        (
-            "Serbian",
-            [
+    "Serbian": [
         "а",
         "и",
         "о",
@@ -811,10 +733,7 @@ FREQUENCIES = OrderedDict(
         "ц",
         "ш",
     ],
-        ),
-        (
-            "Lithuanian",
-            [
+    "Lithuanian": [
         "i",
         "a",
         "s",
@@ -842,10 +761,7 @@ FREQUENCIES = OrderedDict(
         "ą",
         "į",
     ],
-        ),
-        (
-            "Slovene",
-            [
+    "Slovene": [
         "e",
         "a",
         "i",
@@ -873,10 +789,7 @@ FREQUENCIES = OrderedDict(
         "f",
         "y",
     ],
-        ),
-        (
-            "Slovak",
-            [
+    "Slovak": [
         "o",
         "a",
         "e",
@@ -904,10 +817,7 @@ FREQUENCIES = OrderedDict(
         "č",
         "é",
     ],
-        ),
-        (
-            "Hebrew",
-            [
+    "Hebrew": [
         "י",
         "ו",
         "ה",
@@ -934,10 +844,7 @@ FREQUENCIES = OrderedDict(
         "ז",
         "ך",
     ],
-        ),
-        (
-            "Bulgarian",
-            [
+    "Bulgarian": [
         "а",
         "и",
         "о",
@@ -965,10 +872,7 @@ FREQUENCIES = OrderedDict(
         "щ",
         "х",
     ],
-        ),
-        (
-            "Croatian",
-            [
+    "Croatian": [
         "a",
         "i",
         "o",
@@ -996,10 +900,7 @@ FREQUENCIES = OrderedDict(
         "ć",
         "f",
     ],
-        ),
-        (
-            "Hindi",
-            [
+    "Hindi": [
         "",
         "",
         "",
@@ -1027,10 +928,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Estonian",
-            [
+    "Estonian": [
         "a",
         "i",
         "e",
@@ -1058,10 +956,7 @@ FREQUENCIES = OrderedDict(
         "ö",
         "y",
     ],
-        ),
-        (
-            "Simple English",
-            [
+    "Simple English": [
         "e",
         "a",
         "t",
@@ -1089,10 +984,7 @@ FREQUENCIES = OrderedDict(
         "z",
         "q",
     ],
-        ),
-        (
-            "Thai",
-            [
+    "Thai": [
         "",
         "",
         "",
@@ -1120,10 +1012,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Greek",
-            [
+    "Greek": [
         "α",
         "τ",
         "ο",
@@ -1151,10 +1040,7 @@ FREQUENCIES = OrderedDict(
         "θ",
         "ύ",
     ],
-        ),
-        (
-            "Tamil",
-            [
+    "Tamil": [
         "",
         "",
         "",
@@ -1180,10 +1066,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Classical Chinese",
-            [
+    "Classical Chinese": [
         "",
         "",
         "",
@@ -1208,10 +1091,7 @@ FREQUENCIES = OrderedDict(
         "",
         "",
     ],
-        ),
-        (
-            "Kazakh",
-            [
+    "Kazakh": [
         "а",
         "ы",
         "е",
@@ -1239,6 +1119,4 @@ FREQUENCIES = OrderedDict(
         "г",
         "ө",
     ],
-        ),
-    ]
-)
+}
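Replacing OrderedDict with a plain dict literal is safe here: insertion order has been preserved by CPython dicts since 3.6 and is a language guarantee since Python 3.7, so the language iteration order that the coherence code depends on is unchanged. A quick check of that assumption:

    frequencies = {"English": ["e", "a"], "German": ["e", "n"]}
    # plain dicts iterate in insertion order on Python 3.7+
    assert list(frequencies) == ["English", "German"]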

charset_normalizer/cd.py

@@ -1,8 +1,8 @@
 import importlib
 from codecs import IncrementalDecoder
-from collections import Counter, OrderedDict
+from collections import Counter
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
 
 from .assets import FREQUENCIES
 from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@@ -24,17 +24,19 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
     if is_multi_byte_encoding(iana_name):
         raise IOError("Function not supported on multi-byte code page")
 
-    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder  # type: ignore
+    decoder = importlib.import_module(
+        "encodings.{}".format(iana_name)
+    ).IncrementalDecoder
 
-    p = decoder(errors="ignore")  # type: IncrementalDecoder
-    seen_ranges = {}  # type: Dict[str, int]
-    character_count = 0  # type: int
+    p: IncrementalDecoder = decoder(errors="ignore")
+    seen_ranges: Dict[str, int] = {}
+    character_count: int = 0
 
     for i in range(0x40, 0xFF):
-        chunk = p.decode(bytes([i]))  # type: str
+        chunk: str = p.decode(bytes([i]))
 
         if chunk:
-            character_range = unicode_range(chunk)  # type: Optional[str]
+            character_range: Optional[str] = unicode_range(chunk)
 
             if character_range is None:
                 continue
@@ -58,7 +60,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
     """
     Return inferred languages used with a unicode range.
     """
-    languages = []  # type: List[str]
+    languages: List[str] = []
 
     for language, characters in FREQUENCIES.items():
         for character in characters:
@@ -75,8 +77,8 @@ def encoding_languages(iana_name: str) -> List[str]:
     Single-byte encoding language association. Some code page are heavily linked to particular language(s).
     This function does the correspondence.
     """
-    unicode_ranges = encoding_unicode_range(iana_name)  # type: List[str]
-    primary_range = None  # type: Optional[str]
+    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
+    primary_range: Optional[str] = None
 
     for specified_range in unicode_ranges:
         if "Latin" not in specified_range:
@@ -115,8 +117,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
     """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
     """
-    target_have_accents = False  # type: bool
-    target_pure_latin = True  # type: bool
+    target_have_accents: bool = False
+    target_pure_latin: bool = True
 
     for character in FREQUENCIES[language]:
         if not target_have_accents and is_accentuated(character):
@@ -133,7 +135,7 @@ def alphabet_languages(
     """
     Return associated languages associated to given characters.
     """
-    languages = []  # type: List[Tuple[str, float]]
+    languages: List[Tuple[str, float]] = []
 
     source_have_accents = any(is_accentuated(character) for character in characters)
@@ -147,13 +149,13 @@ def alphabet_languages(
         if target_have_accents is False and source_have_accents:
             continue
 
-        character_count = len(language_characters)  # type: int
+        character_count: int = len(language_characters)
 
-        character_match_count = len(
+        character_match_count: int = len(
             [c for c in language_characters if c in characters]
-        )  # type: int
+        )
 
-        ratio = character_match_count / character_count  # type: float
+        ratio: float = character_match_count / character_count
 
         if ratio >= 0.2:
             languages.append((language, ratio))
@@ -174,36 +176,33 @@ def characters_popularity_compare(
     if language not in FREQUENCIES:
         raise ValueError("{} not available".format(language))
 
-    character_approved_count = 0  # type: int
+    character_approved_count: int = 0
+    FREQUENCIES_language_set = set(FREQUENCIES[language])
 
     for character in ordered_characters:
-        if character not in FREQUENCIES[language]:
+        if character not in FREQUENCIES_language_set:
             continue
 
-        characters_before_source = FREQUENCIES[language][
+        characters_before_source: List[str] = FREQUENCIES[language][
             0 : FREQUENCIES[language].index(character)
-        ]  # type: List[str]
-        characters_after_source = FREQUENCIES[language][
+        ]
+        characters_after_source: List[str] = FREQUENCIES[language][
             FREQUENCIES[language].index(character) :
-        ]  # type: List[str]
-
-        characters_before = ordered_characters[
+        ]
+        characters_before: List[str] = ordered_characters[
             0 : ordered_characters.index(character)
-        ]  # type: List[str]
-        characters_after = ordered_characters[
+        ]
+        characters_after: List[str] = ordered_characters[
             ordered_characters.index(character) :
-        ]  # type: List[str]
+        ]
 
-        before_match_count = [
-            e in characters_before for e in characters_before_source
-        ].count(
-            True
-        )  # type: int
-        after_match_count = [
-            e in characters_after for e in characters_after_source
-        ].count(
-            True
-        )  # type: int
+        before_match_count: int = len(
+            set(characters_before) & set(characters_before_source)
+        )
+
+        after_match_count: int = len(
+            set(characters_after) & set(characters_after_source)
+        )
 
         if len(characters_before_source) == 0 and before_match_count <= 4:
             character_approved_count += 1
@@ -229,18 +228,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
     Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
     One containing the latin letters and the other hebrew.
     """
-    layers = OrderedDict()  # type: Dict[str, str]
+    layers: Dict[str, str] = {}
 
     for character in decoded_sequence:
         if character.isalpha() is False:
             continue
 
-        character_range = unicode_range(character)  # type: Optional[str]
+        character_range: Optional[str] = unicode_range(character)
 
         if character_range is None:
             continue
 
-        layer_target_range = None  # type: Optional[str]
+        layer_target_range: Optional[str] = None
 
         for discovered_range in layers:
             if (
@@ -267,7 +266,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     This function merge results previously given by the function coherence_ratio.
     The return type is the same as coherence_ratio.
     """
-    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
+    per_language_ratios: Dict[str, List[float]] = {}
     for result in results:
         for sub_result in result:
             language, ratio = sub_result
@@ -299,10 +298,10 @@ def coherence_ratio(
     A layer = Character extraction by alphabets/ranges.
     """
 
-    results = []  # type: List[Tuple[str, float]]
-    ignore_non_latin = False  # type: bool
+    results: List[Tuple[str, float]] = []
+    ignore_non_latin: bool = False
 
-    sufficient_match_count = 0  # type: int
+    sufficient_match_count: int = 0
 
     lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
     if "Latin Based" in lg_inclusion_list:
@@ -310,22 +309,22 @@ def coherence_ratio(
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
-        sequence_frequencies = Counter(layer)  # type: Counter
+        sequence_frequencies: TypeCounter[str] = Counter(layer)
         most_common = sequence_frequencies.most_common()
 
-        character_count = sum(o for c, o in most_common)  # type: int
+        character_count: int = sum(o for c, o in most_common)
 
         if character_count <= TOO_SMALL_SEQUENCE:
            continue
 
-        popular_character_ordered = [c for c, o in most_common]  # type: List[str]
+        popular_character_ordered: List[str] = [c for c, o in most_common]
 
         for language in lg_inclusion_list or alphabet_languages(
             popular_character_ordered, ignore_non_latin
         ):
-            ratio = characters_popularity_compare(
+            ratio: float = characters_popularity_compare(
                 language, popular_character_ordered
-            )  # type: float
+            )
 
             if ratio < threshold:
                 continue
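The rewrite in characters_popularity_compare trades an O(n*m) membership scan for hash-based set intersection, and FREQUENCIES_language_set hoists the repeated "in FREQUENCIES[language]" list lookup out of the hot loop. The two forms count the same matches as long as the inputs hold no duplicates, which holds here since each side is a slice of an ordered alphabet. A small equivalence check:

    characters_before = ["a", "b", "c"]
    characters_before_source = ["b", "c", "d"]

    # old form: linear scan of characters_before for every source element
    old_count = [e in characters_before for e in characters_before_source].count(True)
    # new form: a single hash-based intersection
    new_count = len(set(characters_before) & set(characters_before_source))
    assert old_count == new_count == 2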

charset_normalizer/cli/normalizer.py

@@ -3,7 +3,12 @@ import sys
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional
+
+try:
+    from unicodedata2 import unidata_version
+except ImportError:
+    from unicodedata import unidata_version
 
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
@@ -43,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
         sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
 
 
-def cli_detect(argv: List[str] = None) -> int:
+def cli_detect(argv: Optional[List[str]] = None) -> int:
     """
     CLI assistant using ARGV and ArgumentParser
     :param argv:
@@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
         "-t",
         "--threshold",
         action="store",
-        default=0.1,
+        default=0.2,
         type=float,
         dest="threshold",
         help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
@@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
     parser.add_argument(
         "--version",
         action="version",
-        version="Charset-Normalizer {} - Python {}".format(
-            __version__, python_version()
+        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
+            __version__, python_version(), unidata_version
         ),
         help="Show version information and exit.",
     )
@@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
             my_file.close()
             continue
 
-        o_ = my_file.name.split(".")  # type: List[str]
+        o_: List[str] = my_file.name.split(".")
 
         if args.replace is False:
             o_.insert(-1, best_guess.encoding)
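With the unicodedata2 fallback import above, the CLI's --version output now reports the Unicode database revision in use. The same value is available programmatically; a small illustration of the pattern the diff adopts:

    try:
        from unicodedata2 import unidata_version
    except ImportError:
        from unicodedata import unidata_version

    # e.g. "14.0.0" when a recent unicodedata2 backport is installed
    print("Unicode", unidata_version)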

charset_normalizer/constant.py

@@ -1,5 +1,4 @@
 from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
-from collections import OrderedDict
 from encodings.aliases import aliases
 from re import IGNORECASE, compile as re_compile
 from typing import Dict, List, Set, Union
@@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
 from .assets import FREQUENCIES
 
 # Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS = OrderedDict(
-    [
-        ("utf_8", BOM_UTF8),
-        (
-            "utf_7",
-            [
+ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+    "utf_8": BOM_UTF8,
+    "utf_7": [
         b"\x2b\x2f\x76\x38",
         b"\x2b\x2f\x76\x39",
         b"\x2b\x2f\x76\x2b",
         b"\x2b\x2f\x76\x2f",
         b"\x2b\x2f\x76\x38\x2d",
     ],
-        ),
-        ("gb18030", b"\x84\x31\x95\x33"),
-        ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
-        ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
-    ]
-)  # type: Dict[str, Union[bytes, List[bytes]]]
+    "gb18030": b"\x84\x31\x95\x33",
+    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}
 
-TOO_SMALL_SEQUENCE = 32  # type: int
-TOO_BIG_SEQUENCE = int(10e6)  # type: int
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)
 
-UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int
+UTF8_MAXIMAL_ALLOCATION: int = 1112064
 
-UNICODE_RANGES_COMBINED = {
+UNICODE_RANGES_COMBINED: Dict[str, range] = {
     "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
@@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
     "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
     "Tags": range(917504, 917631 + 1),
     "Variation Selectors Supplement": range(917760, 917999 + 1),
-}  # type: Dict[str, range]
+}
 
-UNICODE_SECONDARY_RANGE_KEYWORD = [
+UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
     "Supplement",
     "Extended",
     "Extensions",
@@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
     "Shapes",
     "Supplemental",
     "Tags",
-]  # type: List[str]
+]
 
 RE_POSSIBLE_ENCODING_INDICATION = re_compile(
     r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
     IGNORECASE,
 )
 
-IANA_SUPPORTED = sorted(
+IANA_SUPPORTED: List[str] = sorted(
     filter(
         lambda x: x.endswith("_codec") is False
         and x not in {"rot_13", "tactis", "mbcs"},
         list(set(aliases.values())),
     )
-)  # type: List[str]
+)
 
-IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int
+IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
 
 # pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR = {
+IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
     "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
     "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
     "cp1125": ["cp866"],
@@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
     "mac_turkish": ["mac_iceland", "mac_roman"],
     "ptcp154": ["cp1251", "kz1048"],
     "tis_620": ["iso8859_11"],
-}  # type: Dict[str, List[str]]
+}
 
-CHARDET_CORRESPONDENCE = {
+CHARDET_CORRESPONDENCE: Dict[str, str] = {
     "iso2022_kr": "ISO-2022-KR",
     "iso2022_jp": "ISO-2022-JP",
     "euc_kr": "EUC-KR",
@@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
     "cp1256": "windows-1256",
     "cp1254": "Windows-1254",
     "cp949": "CP949",
-}  # type: Dict[str, str]
+}
 
-COMMON_SAFE_ASCII_CHARACTERS = {
+COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
     "<",
     ">",
     "=",
@@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
     "|",
     '"',
     "-",
-}  # type: Set[str]
+}
 
-KO_NAMES = {"johab", "cp949", "euc_kr"}  # type: Set[str]
-ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]
+KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
 
 NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
 
-LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES)  # type: int
+LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
 
 # Logging LEVEL bellow DEBUG
-TRACE = 5  # type: int
+TRACE: int = 5
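The pattern running through this file, and through the whole commit, is the move from comment-style type hints to PEP 526 variable annotations, which Python 3.6+ parses natively and which type checkers treat identically:

    # old, comment-based annotation (pre-PEP 526)
    TOO_SMALL_SEQUENCE = 32  # type: int

    # new, inline variable annotation (PEP 526, Python 3.6+)
    TOO_SMALL_SEQUENCE: int = 32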

charset_normalizer/md.py

@@ -16,6 +16,7 @@ from .utils import (
     is_separator,
     is_symbol,
     is_thai,
+    is_unprintable,
     remove_accent,
     unicode_range,
 )
@@ -57,12 +58,12 @@ class MessDetectorPlugin:
 class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._punctuation_count = 0  # type: int
-        self._symbol_count = 0  # type: int
-        self._character_count = 0  # type: int
+        self._punctuation_count: int = 0
+        self._symbol_count: int = 0
+        self._character_count: int = 0
 
-        self._last_printable_char = None  # type: Optional[str]
-        self._frenzy_symbol_in_word = False  # type: bool
+        self._last_printable_char: Optional[str] = None
+        self._frenzy_symbol_in_word: bool = False
 
     def eligible(self, character: str) -> bool:
         return character.isprintable()
@@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
         if self._character_count == 0:
             return 0.0
 
-        ratio_of_punctuation = (
+        ratio_of_punctuation: float = (
             self._punctuation_count + self._symbol_count
-        ) / self._character_count  # type: float
+        ) / self._character_count
 
         return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
 
 
 class TooManyAccentuatedPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._character_count = 0  # type: int
-        self._accentuated_count = 0  # type: int
+        self._character_count: int = 0
+        self._accentuated_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return character.isalpha()
@@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
     def ratio(self) -> float:
         if self._character_count == 0:
             return 0.0
-        ratio_of_accentuation = (
-            self._accentuated_count / self._character_count
-        )  # type: float
+
+        ratio_of_accentuation: float = self._accentuated_count / self._character_count
         return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
 
 
 class UnprintablePlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._unprintable_count = 0  # type: int
-        self._character_count = 0  # type: int
+        self._unprintable_count: int = 0
+        self._character_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
 
     def feed(self, character: str) -> None:
-        if (
-            character.isspace() is False  # includes \n \t \r \v
-            and character.isprintable() is False
-            and character != "\x1A"  # Why? Its the ASCII substitute character.
-        ):
+        if is_unprintable(character):
             self._unprintable_count += 1
         self._character_count += 1
@@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
 class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._successive_count = 0  # type: int
-        self._character_count = 0  # type: int
+        self._successive_count: int = 0
+        self._character_count: int = 0
 
-        self._last_latin_character = None  # type: Optional[str]
+        self._last_latin_character: Optional[str] = None
 
     def eligible(self, character: str) -> bool:
         return character.isalpha() and is_latin(character)
@@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
 class SuspiciousRange(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._suspicious_successive_range_count = 0  # type: int
-        self._character_count = 0  # type: int
-        self._last_printable_seen = None  # type: Optional[str]
+        self._suspicious_successive_range_count: int = 0
+        self._character_count: int = 0
+        self._last_printable_seen: Optional[str] = None
 
     def eligible(self, character: str) -> bool:
         return character.isprintable()
@@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
             self._last_printable_seen = character
             return
 
-        unicode_range_a = unicode_range(
-            self._last_printable_seen
-        )  # type: Optional[str]
-        unicode_range_b = unicode_range(character)  # type: Optional[str]
+        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
+        unicode_range_b: Optional[str] = unicode_range(character)
 
         if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
             self._suspicious_successive_range_count += 1
@@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
         if self._character_count == 0:
             return 0.0
 
-        ratio_of_suspicious_range_usage = (
+        ratio_of_suspicious_range_usage: float = (
             self._suspicious_successive_range_count * 2
-        ) / self._character_count  # type: float
+        ) / self._character_count
 
         if ratio_of_suspicious_range_usage < 0.1:
             return 0.0
@@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
 class SuperWeirdWordPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._word_count = 0  # type: int
-        self._bad_word_count = 0  # type: int
-        self._foreign_long_count = 0  # type: int
+        self._word_count: int = 0
+        self._bad_word_count: int = 0
+        self._foreign_long_count: int = 0
 
-        self._is_current_word_bad = False  # type: bool
-        self._foreign_long_watch = False  # type: bool
+        self._is_current_word_bad: bool = False
+        self._foreign_long_watch: bool = False
 
-        self._character_count = 0  # type: int
-        self._bad_character_count = 0  # type: int
+        self._character_count: int = 0
+        self._bad_character_count: int = 0
 
-        self._buffer = ""  # type: str
-        self._buffer_accent_count = 0  # type: int
+        self._buffer: str = ""
+        self._buffer_accent_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
 
     def feed(self, character: str) -> None:
         if character.isalpha():
-            self._buffer = "".join([self._buffer, character])
+            self._buffer += character
             if is_accentuated(character):
                 self._buffer_accent_count += 1
             if (
@@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
             character.isspace() or is_punctuation(character) or is_separator(character)
         ) and self._buffer:
             self._word_count += 1
-            buffer_length = len(self._buffer)  # type: int
+            buffer_length: int = len(self._buffer)
 
             self._character_count += buffer_length
@@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
     """
 
     def __init__(self) -> None:
-        self._wrong_stop_count = 0  # type: int
-        self._cjk_character_count = 0  # type: int
+        self._wrong_stop_count: int = 0
+        self._cjk_character_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
@@ -372,17 +365,17 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
 class ArchaicUpperLowerPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._buf = False  # type: bool
+        self._buf: bool = False
 
-        self._character_count_since_last_sep = 0  # type: int
+        self._character_count_since_last_sep: int = 0
 
-        self._successive_upper_lower_count = 0  # type: int
-        self._successive_upper_lower_count_final = 0  # type: int
+        self._successive_upper_lower_count: int = 0
+        self._successive_upper_lower_count_final: int = 0
 
-        self._character_count = 0  # type: int
+        self._character_count: int = 0
 
-        self._last_alpha_seen = None  # type: Optional[str]
-        self._current_ascii_only = True  # type: bool
+        self._last_alpha_seen: Optional[str] = None
+        self._current_ascii_only: bool = True
 
     def eligible(self, character: str) -> bool:
         return True
@@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
         return self._successive_upper_lower_count_final / self._character_count
 
 
+@lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
 ) -> bool:
@@ -524,16 +518,16 @@ def mess_ratio(
     Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
     """
 
-    detectors = [
+    detectors: List[MessDetectorPlugin] = [
         md_class() for md_class in MessDetectorPlugin.__subclasses__()
-    ]  # type: List[MessDetectorPlugin]
+    ]
 
-    length = len(decoded_sequence) + 1  # type: int
+    length: int = len(decoded_sequence) + 1
 
-    mean_mess_ratio = 0.0  # type: float
+    mean_mess_ratio: float = 0.0
 
     if length < 512:
-        intermediary_mean_mess_ratio_calc = 32  # type: int
+        intermediary_mean_mess_ratio_calc: int = 32
     elif length <= 1024:
         intermediary_mean_mess_ratio_calc = 64
     else:
charset_normalizer/models.py

@@ -4,7 +4,16 @@ from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Counter as TypeCounter,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .md import mess_ratio
@@ -21,21 +30,21 @@ class CharsetMatch:
         languages: "CoherenceMatches",
         decoded_payload: Optional[str] = None,
     ):
-        self._payload = payload  # type: bytes
+        self._payload: bytes = payload
 
-        self._encoding = guessed_encoding  # type: str
-        self._mean_mess_ratio = mean_mess_ratio  # type: float
-        self._languages = languages  # type: CoherenceMatches
-        self._has_sig_or_bom = has_sig_or_bom  # type: bool
-        self._unicode_ranges = None  # type: Optional[List[str]]
+        self._encoding: str = guessed_encoding
+        self._mean_mess_ratio: float = mean_mess_ratio
+        self._languages: CoherenceMatches = languages
+        self._has_sig_or_bom: bool = has_sig_or_bom
+        self._unicode_ranges: Optional[List[str]] = None
 
-        self._leaves = []  # type: List[CharsetMatch]
-        self._mean_coherence_ratio = 0.0  # type: float
+        self._leaves: List[CharsetMatch] = []
+        self._mean_coherence_ratio: float = 0.0
 
-        self._output_payload = None  # type: Optional[bytes]
-        self._output_encoding = None  # type: Optional[str]
+        self._output_payload: Optional[bytes] = None
+        self._output_encoding: Optional[str] = None
 
-        self._string = decoded_payload  # type: Optional[str]
+        self._string: Optional[str] = decoded_payload
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, CharsetMatch):
@@ -53,8 +62,8 @@ class CharsetMatch:
         if not isinstance(other, CharsetMatch):
             raise ValueError
 
-        chaos_difference = abs(self.chaos - other.chaos)  # type: float
-        coherence_difference = abs(self.coherence - other.coherence)  # type: float
+        chaos_difference: float = abs(self.chaos - other.chaos)
+        coherence_difference: float = abs(self.coherence - other.coherence)
 
         # Bellow 1% difference --> Use Coherence
         if chaos_difference < 0.01 and coherence_difference > 0.02:
@@ -95,7 +104,7 @@ class CharsetMatch:
         return 0.0
 
     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> TypeCounter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -137,7 +146,7 @@ class CharsetMatch:
         """
         Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
         """
-        also_known_as = []  # type: List[str]
+        also_known_as: List[str] = []
         for u, p in aliases.items():
             if self.encoding == u:
                 also_known_as.append(p)
@@ -227,9 +236,9 @@ class CharsetMatch:
         if self._unicode_ranges is not None:
             return self._unicode_ranges
         # list detected ranges
-        detected_ranges = [
+        detected_ranges: List[Optional[str]] = [
             unicode_range(char) for char in str(self)
-        ]  # type: List[Optional[str]]
+        ]
         # filter and sort
         self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
         return self._unicode_ranges
@@ -280,8 +289,8 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """
 
-    def __init__(self, results: List[CharsetMatch] = None):
-        self._results = sorted(results) if results else []  # type: List[CharsetMatch]
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
+        self._results: List[CharsetMatch] = sorted(results) if results else []
 
     def __iter__(self) -> Iterator[CharsetMatch]:
         yield from self._results
@@ -360,17 +369,17 @@ class CliDetectionResult:
         unicode_path: Optional[str],
         is_preferred: bool,
     ):
-        self.path = path  # type: str
-        self.unicode_path = unicode_path  # type: Optional[str]
-        self.encoding = encoding  # type: Optional[str]
-        self.encoding_aliases = encoding_aliases  # type: List[str]
-        self.alternative_encodings = alternative_encodings  # type: List[str]
-        self.language = language  # type: str
-        self.alphabets = alphabets  # type: List[str]
-        self.has_sig_or_bom = has_sig_or_bom  # type: bool
-        self.chaos = chaos  # type: float
-        self.coherence = coherence  # type: float
-        self.is_preferred = is_preferred  # type: bool
+        self.path: str = path
+        self.unicode_path: Optional[str] = unicode_path
+        self.encoding: Optional[str] = encoding
+        self.encoding_aliases: List[str] = encoding_aliases
+        self.alternative_encodings: List[str] = alternative_encodings
+        self.language: str = language
+        self.alphabets: List[str] = alphabets
+        self.has_sig_or_bom: bool = has_sig_or_bom
+        self.chaos: float = chaos
+        self.coherence: float = coherence
+        self.is_preferred: bool = is_preferred
 
     @property
     def __dict__(self) -> Dict[str, Any]:  # type: ignore
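The w_counter signature change is why typing.Counter is imported as TypeCounter: collections.Counter cannot be parameterized at runtime before Python 3.9, while the typing alias provides the generic form on every version this release still supports. A short illustration of the idiom (the function is an example, not library code):

    from collections import Counter
    from typing import Counter as TypeCounter

    def word_counter(text: str) -> TypeCounter[str]:
        # the runtime object is a plain collections.Counter;
        # TypeCounter[str] exists only for the annotation
        return Counter(text.split())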

charset_normalizer/utils.py

@@ -1,4 +1,6 @@
 try:
+    # WARNING: unicodedata2 support is going to be removed in 3.0
+    # Python is quickly catching up.
     import unicodedata2 as unicodedata
 except ImportError:
     import unicodedata  # type: ignore[no-redef]
@@ -9,9 +11,9 @@ from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache
 from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union
 
-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder
 
 from .constant import (
     ENCODING_MARKS,
@@ -26,7 +28,7 @@ from .constant import (
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_accentuated(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return (
@@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def remove_accent(character: str) -> str:
-    decomposed = unicodedata.decomposition(character)  # type: str
+    decomposed: str = unicodedata.decomposition(character)
     if not decomposed:
         return character
 
-    codes = decomposed.split(" ")  # type: List[str]
+    codes: List[str] = decomposed.split(" ")
 
     return chr(int(codes[0], 16))
@@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
     """
     Retrieve the Unicode range official name from a single character.
     """
-    character_ord = ord(character)  # type: int
+    character_ord: int = ord(character)
 
     for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
         if character_ord in ord_range:
@@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_latin(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return "LATIN" in description
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_ascii(character: str) -> bool:
     try:
         character.encode("ascii")
@@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_punctuation(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)
 
     if "P" in character_category:
         return True
 
-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)
 
     if character_range is None:
         return False
@@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_symbol(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)
 
     if "S" in character_category or "N" in character_category:
         return True
 
-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)
 
     if character_range is None:
         return False
@@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_emoticon(character: str) -> bool:
-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)
 
     if character_range is None:
         return False
@@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
     if character.isspace() or character in {"", "+", ",", ";", "<", ">"}:
         return True
 
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)
 
     return "Z" in character_category
@@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:
 def is_private_use_only(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)
 
     return character_category == "Co"
@@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+    return (
+        character.isspace() is False  # includes \n \t \r \v
+        and character.isprintable() is False
+        and character != "\x1A"  # Why? Its the ASCII substitute character.
+        and character != "\ufeff"  # bug discovered in Python,
+        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+    )
+
+
 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
     """
     Extract using ASCII-only decoder any specified encoding in the first n-bytes.
@@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     if not isinstance(sequence, bytes):
         raise TypeError
 
-    seq_len = len(sequence)  # type: int
+    seq_len: int = len(sequence)
 
-    results = findall(
+    results: List[str] = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
         sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
-    )  # type: List[str]
+    )
 
     if len(results) == 0:
         return None
@@ -217,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")
 
+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -242,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )
@@ -253,7 +270,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
     """
 
     for iana_encoding in ENCODING_MARKS:
-        marks = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]
+        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
 
         if isinstance(marks, bytes):
             marks = [marks]
@@ -272,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")
 
+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
@@ -283,10 +303,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
 def range_scan(decoded_sequence: str) -> List[str]:
-    ranges = set()  # type: Set[str]
+    ranges: Set[str] = set()
 
     for character in decoded_sequence:
-        character_range = unicode_range(character)  # type: Optional[str]
+        character_range: Optional[str] = unicode_range(character)
 
         if character_range is None:
             continue
@@ -301,16 +321,20 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0
 
-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module(
+        "encodings.{}".format(iana_name_a)
+    ).IncrementalDecoder
+    decoder_b = importlib.import_module(
+        "encodings.{}".format(iana_name_b)
+    ).IncrementalDecoder
 
-    id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
-    id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder
+    id_a: IncrementalDecoder = decoder_a(errors="ignore")
+    id_b: IncrementalDecoder = decoder_b(errors="ignore")
 
-    character_match_count = 0  # type: int
+    character_match_count: int = 0
 
     for i in range(255):
-        to_be_decoded = bytes([i])  # type: bytes
+        to_be_decoded: bytes = bytes([i])
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1
@@ -340,3 +364,61 @@ def set_logging_handler(
     handler = logging.StreamHandler()
     handler.setFormatter(logging.Formatter(format_string))
     logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]
+            if not chunk:
+                break
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+                chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break
+
+            yield chunk
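cut_sequence_chunks() now hosts the chunking loop that from_bytes() previously inlined, including the multi-byte "bad cutting" resynchronization. A rough standalone exercise under the signature above (the payload and step values are illustrative):

    from charset_normalizer.utils import cut_sequence_chunks

    payload = "Bonjour, où êtes-vous ?".encode("utf_8")
    chunks = list(
        cut_sequence_chunks(
            payload,
            "utf_8",
            range(0, len(payload), 8),  # offsets; from_bytes derives these from steps/chunk_size
            8,      # chunk_size
            False,  # bom_or_sig_available
            False,  # strip_sig_or_bom
            b"",    # sig_payload
            True,   # is_multi_byte_decoder: utf_8 counts as multi-byte here
        )
    )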

charset_normalizer/version.py

@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.0.12"
+__version__ = "2.1.1"
 VERSION = __version__.split(".")