Mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-08-14 02:26:58 -07:00)
Bump requests from 2.27.1 to 2.28.1 (#1781)
* Bump requests from 2.27.1 to 2.28.1

Bumps [requests](https://github.com/psf/requests) from 2.27.1 to 2.28.1.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.27.1...v2.28.1)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.1
* Update urllib3==1.26.12
* Update certifi==2022.9.24
* Update idna==3.4
* Update charset-normalizer==2.1.1

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
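In short, this commit updates the following pinned dependencies (versions exactly as listed in the message above):

requests==2.28.1
urllib3==1.26.12
certifi==2022.9.24
idna==3.4
charset-normalizer==2.1.1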
This commit is contained in:
parent baa0e08c2a
commit af1aed0b6b
46 changed files with 3295 additions and 2709 deletions
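Note: most of the charset-normalizer 2.0.12 -> 2.1.1 changes in the hunks below follow one mechanical pattern: comment-style type hints (# type: ...) are replaced by PEP 526 inline annotations, and parameters that default to None gain an explicit Optional[...]. A minimal before/after sketch (the function names here are hypothetical; cp_isolation and length are names taken from the diff):

from typing import List, Optional

# Old style (2.0.12): type comments, bare List[str] even though the default is None.
def probe_old(sequences, cp_isolation=None):
    # type: (bytes, List[str]) -> int
    length = len(sequences)  # type: int
    return length

# New style (2.1.1): inline annotations, Optional[...] for None defaults.
def probe_new(sequences: bytes, cp_isolation: Optional[List[str]] = None) -> int:
    length: int = len(sequences)
    return length

assert probe_old(b"abc") == probe_new(b"abc") == 3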
@@ -1,4 +1,4 @@
-# -*- coding: utf_8 -*-
+# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
@ -1,11 +1,8 @@
|
|||
import logging
|
||||
import warnings
|
||||
from os import PathLike
|
||||
from os.path import basename, splitext
|
||||
from typing import BinaryIO, List, Optional, Set
|
||||
|
||||
try:
|
||||
from os import PathLike
|
||||
except ImportError: # pragma: no cover
|
||||
PathLike = str # type: ignore
|
||||
from typing import Any, BinaryIO, List, Optional, Set
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
|
@ -18,6 +15,7 @@ from .md import mess_ratio
|
|||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
|
@ -39,8 +37,8 @@ def from_bytes(
|
|||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -70,11 +68,11 @@ def from_bytes(
|
|||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level = logger.level # type: int
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length = len(sequences) # type: int
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
|
@ -119,8 +117,8 @@ def from_bytes(
|
|||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
|
||||
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
|
@ -137,11 +135,11 @@ def from_bytes(
|
|||
),
|
||||
)
|
||||
|
||||
prioritized_encodings = [] # type: List[str]
|
||||
prioritized_encodings: List[str] = []
|
||||
|
||||
specified_encoding = (
|
||||
specified_encoding: Optional[str] = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
) # type: Optional[str]
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
|
@ -151,15 +149,15 @@ def from_bytes(
|
|||
specified_encoding,
|
||||
)
|
||||
|
||||
tested = set() # type: Set[str]
|
||||
tested_but_hard_failure = [] # type: List[str]
|
||||
tested_but_soft_failure = [] # type: List[str]
|
||||
tested: Set[str] = set()
|
||||
tested_but_hard_failure: List[str] = []
|
||||
tested_but_soft_failure: List[str] = []
|
||||
|
||||
fallback_ascii = None # type: Optional[CharsetMatch]
|
||||
fallback_u8 = None # type: Optional[CharsetMatch]
|
||||
fallback_specified = None # type: Optional[CharsetMatch]
|
||||
fallback_ascii: Optional[CharsetMatch] = None
|
||||
fallback_u8: Optional[CharsetMatch] = None
|
||||
fallback_specified: Optional[CharsetMatch] = None
|
||||
|
||||
results = CharsetMatches() # type: CharsetMatches
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
|
@ -190,11 +188,11 @@ def from_bytes(
|
|||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload = None # type: Optional[str]
|
||||
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
|
||||
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
decoded_payload: Optional[str] = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
) # type: bool
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
|
@ -205,7 +203,7 @@ def from_bytes(
|
|||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
|
@ -240,7 +238,7 @@ def from_bytes(
|
|||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test = False # type: bool
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
|
@ -262,11 +260,11 @@ def from_bytes(
|
|||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus = (
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
) # type: bool
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
|
@ -276,72 +274,47 @@ def from_bytes(
|
|||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up = int(len(r_) / 4) # type: int
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count = 0 # type: int
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks = [] # type: List[str]
|
||||
md_chunks: List[str] = []
|
||||
md_ratios = []
|
||||
|
||||
for i in r_:
|
||||
if i + chunk_size > length + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
try:
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
) # type: str
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
break
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk = min(chunk_size, 16) # type: int
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
break
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(mess_ratio(chunk, threshold))
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
|
@ -362,9 +335,7 @@ def from_bytes(
|
|||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio = (
|
||||
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
) # type: float
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
|
@ -399,7 +370,7 @@ def from_bytes(
|
|||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages = encoding_languages(encoding_iana) # type: List[str]
|
||||
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
|
@ -516,8 +487,8 @@ def from_fp(
|
|||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -538,12 +509,12 @@ def from_fp(
|
|||
|
||||
|
||||
def from_path(
|
||||
path: PathLike,
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
) -> CharsetMatches:
|
||||
|
@ -565,17 +536,22 @@ def from_path(
|
|||
|
||||
|
||||
def normalize(
|
||||
path: PathLike,
|
||||
path: "PathLike[Any]",
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: List[str] = None,
|
||||
cp_exclusion: List[str] = None,
|
||||
cp_isolation: Optional[List[str]] = None,
|
||||
cp_exclusion: Optional[List[str]] = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
) -> CharsetMatch:
|
||||
"""
|
||||
Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
|
||||
"""
|
||||
warnings.warn(
|
||||
"normalize is deprecated and will be removed in 3.0",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
results = from_path(
|
||||
path,
|
||||
steps,
|
||||
|
|
File diff suppressed because it is too large
|
@ -1,8 +1,8 @@
|
|||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter, OrderedDict
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
||||
|
||||
from .assets import FREQUENCIES
|
||||
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
|
||||
|
@ -24,17 +24,19 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
|
|||
if is_multi_byte_encoding(iana_name):
|
||||
raise IOError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
|
||||
decoder = importlib.import_module(
|
||||
"encodings.{}".format(iana_name)
|
||||
).IncrementalDecoder
|
||||
|
||||
p = decoder(errors="ignore") # type: IncrementalDecoder
|
||||
seen_ranges = {} # type: Dict[str, int]
|
||||
character_count = 0 # type: int
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: Dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk = p.decode(bytes([i])) # type: str
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range = unicode_range(chunk) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
@ -58,7 +60,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
|
|||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages = [] # type: List[str]
|
||||
languages: List[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
|
@ -75,8 +77,8 @@ def encoding_languages(iana_name: str) -> List[str]:
|
|||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
|
||||
primary_range = None # type: Optional[str]
|
||||
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: Optional[str] = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
|
@ -115,8 +117,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
|
|||
"""
|
||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||
"""
|
||||
target_have_accents = False # type: bool
|
||||
target_pure_latin = True # type: bool
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
|
@ -133,7 +135,7 @@ def alphabet_languages(
|
|||
"""
|
||||
Return associated languages associated to given characters.
|
||||
"""
|
||||
languages = [] # type: List[Tuple[str, float]]
|
||||
languages: List[Tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
|
@ -147,13 +149,13 @@ def alphabet_languages(
|
|||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count = len(language_characters) # type: int
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count = len(
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
) # type: int
|
||||
)
|
||||
|
||||
ratio = character_match_count / character_count # type: float
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
@ -174,36 +176,33 @@ def characters_popularity_compare(
|
|||
if language not in FREQUENCIES:
|
||||
raise ValueError("{} not available".format(language))
|
||||
|
||||
character_approved_count = 0 # type: int
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
for character in ordered_characters:
|
||||
if character not in FREQUENCIES[language]:
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
characters_before_source = FREQUENCIES[language][
|
||||
characters_before_source: List[str] = FREQUENCIES[language][
|
||||
0 : FREQUENCIES[language].index(character)
|
||||
] # type: List[str]
|
||||
characters_after_source = FREQUENCIES[language][
|
||||
]
|
||||
characters_after_source: List[str] = FREQUENCIES[language][
|
||||
FREQUENCIES[language].index(character) :
|
||||
] # type: List[str]
|
||||
|
||||
characters_before = ordered_characters[
|
||||
]
|
||||
characters_before: List[str] = ordered_characters[
|
||||
0 : ordered_characters.index(character)
|
||||
] # type: List[str]
|
||||
characters_after = ordered_characters[
|
||||
]
|
||||
characters_after: List[str] = ordered_characters[
|
||||
ordered_characters.index(character) :
|
||||
] # type: List[str]
|
||||
]
|
||||
|
||||
before_match_count = [
|
||||
e in characters_before for e in characters_before_source
|
||||
].count(
|
||||
True
|
||||
) # type: int
|
||||
after_match_count = [
|
||||
e in characters_after for e in characters_after_source
|
||||
].count(
|
||||
True
|
||||
) # type: int
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
|
@ -229,18 +228,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
|||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||
One containing the latin letters and the other hebrew.
|
||||
"""
|
||||
layers = OrderedDict() # type: Dict[str, str]
|
||||
layers: Dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range = None # type: Optional[str]
|
||||
layer_target_range: Optional[str] = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
|
@ -267,7 +266,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
|||
This function merge results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
|
||||
per_language_ratios: Dict[str, List[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
|
@ -299,10 +298,10 @@ def coherence_ratio(
|
|||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results = [] # type: List[Tuple[str, float]]
|
||||
ignore_non_latin = False # type: bool
|
||||
results: List[Tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count = 0 # type: int
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
|
@ -310,22 +309,22 @@ def coherence_ratio(
|
|||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies = Counter(layer) # type: Counter
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count = sum(o for c, o in most_common) # type: int
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered = [c for c, o in most_common] # type: List[str]
|
||||
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio = characters_popularity_compare(
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
) # type: float
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
|
|
@ -3,7 +3,12 @@ import sys
|
|||
from json import dumps
|
||||
from os.path import abspath
|
||||
from platform import python_version
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from unicodedata2 import unidata_version
|
||||
except ImportError:
|
||||
from unicodedata import unidata_version
|
||||
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
|
@ -43,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
|
|||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||
|
||||
|
||||
def cli_detect(argv: List[str] = None) -> int:
|
||||
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
|
@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.1,
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
|
||||
|
@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {}".format(
|
||||
__version__, python_version()
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {}".format(
|
||||
__version__, python_version(), unidata_version
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
|||
my_file.close()
|
||||
continue
|
||||
|
||||
o_ = my_file.name.split(".") # type: List[str]
|
||||
o_: List[str] = my_file.name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||
from collections import OrderedDict
|
||||
from encodings.aliases import aliases
|
||||
from re import IGNORECASE, compile as re_compile
|
||||
from typing import Dict, List, Set, Union
|
||||
|
@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
|
|||
from .assets import FREQUENCIES
|
||||
|
||||
# Contain for each eligible encoding a list of/item bytes SIG/BOM
|
||||
ENCODING_MARKS = OrderedDict(
|
||||
[
|
||||
("utf_8", BOM_UTF8),
|
||||
(
|
||||
"utf_7",
|
||||
[
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
),
|
||||
("gb18030", b"\x84\x31\x95\x33"),
|
||||
("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
|
||||
("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
|
||||
]
|
||||
) # type: Dict[str, Union[bytes, List[bytes]]]
|
||||
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
|
||||
"utf_8": BOM_UTF8,
|
||||
"utf_7": [
|
||||
b"\x2b\x2f\x76\x38",
|
||||
b"\x2b\x2f\x76\x39",
|
||||
b"\x2b\x2f\x76\x2b",
|
||||
b"\x2b\x2f\x76\x2f",
|
||||
b"\x2b\x2f\x76\x38\x2d",
|
||||
],
|
||||
"gb18030": b"\x84\x31\x95\x33",
|
||||
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
|
||||
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
|
||||
}
|
||||
|
||||
TOO_SMALL_SEQUENCE = 32 # type: int
|
||||
TOO_BIG_SEQUENCE = int(10e6) # type: int
|
||||
TOO_SMALL_SEQUENCE: int = 32
|
||||
TOO_BIG_SEQUENCE: int = int(10e6)
|
||||
|
||||
UTF8_MAXIMAL_ALLOCATION = 1112064 # type: int
|
||||
UTF8_MAXIMAL_ALLOCATION: int = 1112064
|
||||
|
||||
UNICODE_RANGES_COMBINED = {
|
||||
UNICODE_RANGES_COMBINED: Dict[str, range] = {
|
||||
"Control character": range(31 + 1),
|
||||
"Basic Latin": range(32, 127 + 1),
|
||||
"Latin-1 Supplement": range(128, 255 + 1),
|
||||
|
@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
|
|||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||
"Tags": range(917504, 917631 + 1),
|
||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||
} # type: Dict[str, range]
|
||||
}
|
||||
|
||||
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD = [
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||
"Supplement",
|
||||
"Extended",
|
||||
"Extensions",
|
||||
|
@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
|
|||
"Shapes",
|
||||
"Supplemental",
|
||||
"Tags",
|
||||
] # type: List[str]
|
||||
]
|
||||
|
||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||
IGNORECASE,
|
||||
)
|
||||
|
||||
IANA_SUPPORTED = sorted(
|
||||
IANA_SUPPORTED: List[str] = sorted(
|
||||
filter(
|
||||
lambda x: x.endswith("_codec") is False
|
||||
and x not in {"rot_13", "tactis", "mbcs"},
|
||||
list(set(aliases.values())),
|
||||
)
|
||||
) # type: List[str]
|
||||
)
|
||||
|
||||
IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
|
||||
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||
|
||||
# pre-computed code page that are similar using the function cp_similarity.
|
||||
IANA_SUPPORTED_SIMILAR = {
|
||||
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||
"cp1125": ["cp866"],
|
||||
|
@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
|
|||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||
"ptcp154": ["cp1251", "kz1048"],
|
||||
"tis_620": ["iso8859_11"],
|
||||
} # type: Dict[str, List[str]]
|
||||
}
|
||||
|
||||
|
||||
CHARDET_CORRESPONDENCE = {
|
||||
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||
"iso2022_kr": "ISO-2022-KR",
|
||||
"iso2022_jp": "ISO-2022-JP",
|
||||
"euc_kr": "EUC-KR",
|
||||
|
@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
|
|||
"cp1256": "windows-1256",
|
||||
"cp1254": "Windows-1254",
|
||||
"cp949": "CP949",
|
||||
} # type: Dict[str, str]
|
||||
}
|
||||
|
||||
|
||||
COMMON_SAFE_ASCII_CHARACTERS = {
|
||||
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||
"<",
|
||||
">",
|
||||
"=",
|
||||
|
@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
|
|||
"|",
|
||||
'"',
|
||||
"-",
|
||||
} # type: Set[str]
|
||||
}
|
||||
|
||||
|
||||
KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
|
||||
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
|
||||
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||
|
||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||
|
||||
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
|
||||
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||
|
||||
# Logging LEVEL bellow DEBUG
|
||||
TRACE = 5 # type: int
|
||||
TRACE: int = 5
|
||||
|
|
|
@ -16,6 +16,7 @@ from .utils import (
|
|||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
)
|
||||
|
@ -57,12 +58,12 @@ class MessDetectorPlugin:
|
|||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count = 0 # type: int
|
||||
self._symbol_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char = None # type: Optional[str]
|
||||
self._frenzy_symbol_in_word = False # type: bool
|
||||
self._last_printable_char: Optional[str] = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
|||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation = (
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count # type: float
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count = 0 # type: int
|
||||
self._accentuated_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
|||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
ratio_of_accentuation = (
|
||||
self._accentuated_count / self._character_count
|
||||
) # type: float
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
):
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
|
@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
|
|||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character = None # type: Optional[str]
|
||||
self._last_latin_character: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
|||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count = 0 # type: int
|
||||
self._character_count = 0 # type: int
|
||||
self._last_printable_seen = None # type: Optional[str]
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: Optional[str] = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a = unicode_range(
|
||||
self._last_printable_seen
|
||||
) # type: Optional[str]
|
||||
unicode_range_b = unicode_range(character) # type: Optional[str]
|
||||
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: Optional[str] = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage = (
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count # type: float
|
||||
) / self._character_count
|
||||
|
||||
if ratio_of_suspicious_range_usage < 0.1:
|
||||
return 0.0
|
||||
|
@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
|
|||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count = 0 # type: int
|
||||
self._bad_word_count = 0 # type: int
|
||||
self._foreign_long_count = 0 # type: int
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad = False # type: bool
|
||||
self._foreign_long_watch = False # type: bool
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count = 0 # type: int
|
||||
self._bad_character_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer = "" # type: str
|
||||
self._buffer_accent_count = 0 # type: int
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer = "".join([self._buffer, character])
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
|
@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
|
|||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length = len(self._buffer) # type: int
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
|
@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
|
|||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._wrong_stop_count = 0 # type: int
|
||||
self._cjk_character_count = 0 # type: int
|
||||
self._wrong_stop_count: int = 0
|
||||
self._cjk_character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
|
|||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf = False # type: bool
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep = 0 # type: int
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count = 0 # type: int
|
||||
self._successive_upper_lower_count_final = 0 # type: int
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count = 0 # type: int
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen = None # type: Optional[str]
|
||||
self._current_ascii_only = True # type: bool
|
||||
self._last_alpha_seen: Optional[str] = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
|||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
|
||||
) -> bool:
|
||||
|
@ -524,16 +518,16 @@ def mess_ratio(
|
|||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
||||
"""
|
||||
|
||||
detectors = [
|
||||
detectors: List[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
] # type: List[MessDetectorPlugin]
|
||||
]
|
||||
|
||||
length = len(decoded_sequence) + 1 # type: int
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio = 0.0 # type: float
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc = 32 # type: int
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
|
|
|
@ -4,7 +4,16 @@ from encodings.aliases import aliases
|
|||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
||||
from typing import (
|
||||
Any,
|
||||
Counter as TypeCounter,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
|
||||
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
|
||||
from .md import mess_ratio
|
||||
|
@ -21,21 +30,21 @@ class CharsetMatch:
|
|||
languages: "CoherenceMatches",
|
||||
decoded_payload: Optional[str] = None,
|
||||
):
|
||||
self._payload = payload # type: bytes
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding = guessed_encoding # type: str
|
||||
self._mean_mess_ratio = mean_mess_ratio # type: float
|
||||
self._languages = languages # type: CoherenceMatches
|
||||
self._has_sig_or_bom = has_sig_or_bom # type: bool
|
||||
self._unicode_ranges = None # type: Optional[List[str]]
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: Optional[List[str]] = None
|
||||
|
||||
self._leaves = [] # type: List[CharsetMatch]
|
||||
self._mean_coherence_ratio = 0.0 # type: float
|
||||
self._leaves: List[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload = None # type: Optional[bytes]
|
||||
self._output_encoding = None # type: Optional[str]
|
||||
self._output_payload: Optional[bytes] = None
|
||||
self._output_encoding: Optional[str] = None
|
||||
|
||||
self._string = decoded_payload # type: Optional[str]
|
||||
self._string: Optional[str] = decoded_payload
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
|
@ -53,8 +62,8 @@ class CharsetMatch:
|
|||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference = abs(self.chaos - other.chaos) # type: float
|
||||
coherence_difference = abs(self.coherence - other.coherence) # type: float
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Bellow 1% difference --> Use Coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
|
@ -95,7 +104,7 @@ class CharsetMatch:
|
|||
return 0.0
|
||||
|
||||
@property
|
||||
def w_counter(self) -> Counter:
|
||||
def w_counter(self) -> TypeCounter[str]:
|
||||
"""
|
||||
Word counter instance on decoded text.
|
||||
Notice: Will be removed in 3.0
|
||||
|
@ -137,7 +146,7 @@ class CharsetMatch:
|
|||
"""
|
||||
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as = [] # type: List[str]
|
||||
also_known_as: List[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
|
@ -227,9 +236,9 @@ class CharsetMatch:
|
|||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges = [
|
||||
detected_ranges: List[Optional[str]] = [
|
||||
unicode_range(char) for char in str(self)
|
||||
] # type: List[Optional[str]]
|
||||
]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
@ -280,8 +289,8 @@ class CharsetMatches:
|
|||
Act like a list(iterable) but does not implements all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: List[CharsetMatch] = None):
|
||||
self._results = sorted(results) if results else [] # type: List[CharsetMatch]
|
||||
def __init__(self, results: Optional[List[CharsetMatch]] = None):
|
||||
self._results: List[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
@ -360,17 +369,17 @@ class CliDetectionResult:
|
|||
unicode_path: Optional[str],
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path = path # type: str
|
||||
self.unicode_path = unicode_path # type: Optional[str]
|
||||
self.encoding = encoding # type: Optional[str]
|
||||
self.encoding_aliases = encoding_aliases # type: List[str]
|
||||
self.alternative_encodings = alternative_encodings # type: List[str]
|
||||
self.language = language # type: str
|
||||
self.alphabets = alphabets # type: List[str]
|
||||
self.has_sig_or_bom = has_sig_or_bom # type: bool
|
||||
self.chaos = chaos # type: float
|
||||
self.coherence = coherence # type: float
|
||||
self.is_preferred = is_preferred # type: bool
|
||||
self.path: str = path
|
||||
self.unicode_path: Optional[str] = unicode_path
|
||||
self.encoding: Optional[str] = encoding
|
||||
self.encoding_aliases: List[str] = encoding_aliases
|
||||
self.alternative_encodings: List[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: List[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> Dict[str, Any]: # type: ignore
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
try:
|
||||
# WARNING: unicodedata2 support is going to be removed in 3.0
|
||||
# Python is quickly catching up.
|
||||
import unicodedata2 as unicodedata
|
||||
except ImportError:
|
||||
import unicodedata # type: ignore[no-redef]
|
||||
|
@ -9,9 +11,9 @@ from codecs import IncrementalDecoder
|
|||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import List, Optional, Set, Tuple, Union
|
||||
from typing import Generator, List, Optional, Set, Tuple, Union
|
||||
|
||||
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
|
||||
from _multibytecodec import MultibyteIncrementalDecoder
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
|
@ -26,7 +28,7 @@ from .constant import (
|
|||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description = unicodedata.name(character) # type: str
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return (
|
||||
|
@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed = unicodedata.decomposition(character) # type: str
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes = decomposed.split(" ") # type: List[str]
|
||||
codes: List[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
|
|||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord = ord(character) # type: int
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
|
@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
|
|||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description = unicodedata.name(character) # type: str
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError:
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_ascii(character: str) -> bool:
|
||||
try:
|
||||
character.encode("ascii")
|
||||
|
@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:
|
|||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
|
|||
if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category
|
||||
|
||||
|
@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:
|
|||
|
||||
|
||||
def is_private_use_only(character: str) -> bool:
|
||||
character_category = unicodedata.category(character) # type: str
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return character_category == "Co"
|
||||
|
||||
|
@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
|
|||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1A" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
|
||||
"""
|
||||
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
||||
|
@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
|
|||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len = len(sequence) # type: int
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results = findall(
|
||||
results: List[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
) # type: List[str]
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
@ -217,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
|
|||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
|
@ -242,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
|
|||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore
|
||||
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
@ -253,7 +270,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
|
|||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks = ENCODING_MARKS[iana_encoding] # type: Union[bytes, List[bytes]]
|
||||
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
@ -272,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
|||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
@ -283,10 +303,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
|
|||
|
||||
|
||||
def range_scan(decoded_sequence: str) -> List[str]:
|
||||
ranges = set() # type: Set[str]
|
||||
ranges: Set[str] = set()
|
||||
|
||||
for character in decoded_sequence:
|
||||
character_range = unicode_range(character) # type: Optional[str]
|
||||
character_range: Optional[str] = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
@ -301,16 +321,20 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
|||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
|
||||
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
|
||||
decoder_a = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_a)
|
||||
).IncrementalDecoder
|
||||
decoder_b = importlib.import_module(
|
||||
"encodings.{}".format(iana_name_b)
|
||||
).IncrementalDecoder
|
||||
|
||||
id_a = decoder_a(errors="ignore") # type: IncrementalDecoder
|
||||
id_b = decoder_b(errors="ignore") # type: IncrementalDecoder
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count = 0 # type: int
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded = bytes([i]) # type: bytes
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
|
@ -340,3 +364,61 @@ def set_logging_handler(
|
|||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: Optional[str] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
||||
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
||||
|
|
|
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "2.0.12"
+__version__ = "2.1.1"
 VERSION = __version__.split(".")