Mirror of https://github.com/Tautulli/Tautulli.git, synced 2025-08-23 06:25:27 -07:00

Update charset-normalizer==2.1.1

parent aaa336de28
commit 637ccee60f

10 changed files with 1493 additions and 1556 deletions
@@ -1,4 +1,4 @@
-# -*- coding: utf_8 -*-
+# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
@@ -1,11 +1,8 @@
 import logging
-from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
-
-try:
-    from os import PathLike
-except ImportError:  # pragma: no cover
-    PathLike = str  # type: ignore
+import warnings
+from os import PathLike
+from os.path import basename, splitext
+from typing import Any, BinaryIO, List, Optional, Set

 from .cd import (
     coherence_ratio,
@@ -18,6 +15,7 @@ from .md import mess_ratio
 from .models import CharsetMatch, CharsetMatches
 from .utils import (
     any_specified_encoding,
+    cut_sequence_chunks,
     iana_name,
     identify_sig_or_bom,
     is_cp_similar,
@@ -39,8 +37,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
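For orientation, a minimal sketch of how these keyword arguments are passed to the public entry point; this assumes charset-normalizer 2.x is installed, and the sample payload is purely illustrative:

from charset_normalizer import from_bytes

# Hypothetical sample: Cyrillic text encoded with a legacy single-byte code page.
payload = "Всеки човек има право на образование.".encode("cp1251")

matches = from_bytes(
    payload,
    steps=5,            # how many slices of the payload are inspected
    chunk_size=512,     # size of each inspected slice, in bytes
    threshold=0.2,      # maximum mess ratio tolerated for a candidate encoding
    cp_isolation=None,  # optionally restrict detection to a list of code pages
    cp_exclusion=None,  # optionally ban specific code pages
    preemptive_behaviour=True,
    explain=False,
)

best_guess = matches.best()
if best_guess is not None:
    print(best_guess.encoding, str(best_guess))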
@@ -70,11 +68,11 @@ def from_bytes(
     )

     if explain:
-        previous_logger_level = logger.level  # type: int
+        previous_logger_level: int = logger.level
         logger.addHandler(explain_handler)
         logger.setLevel(TRACE)

-    length = len(sequences)  # type: int
+    length: int = len(sequences)

     if length == 0:
         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
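Nearly every change in this hunk, and in the rest of the commit, is the same mechanical migration: trailing `# type:` comments are replaced with PEP 526 variable annotations. Both spell the same thing to a type checker; only the annotated form is parsed by the interpreter. A small self-contained illustration (the value assigned here is made up):

import logging

# Before: PEP 484 type comment, ignored at runtime.
previous_logger_level = logging.INFO  # type: int

# After: PEP 526 annotation, checked syntactically and recorded in __annotations__.
previous_logger_level: int = logging.INFO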
@@ -119,8 +117,8 @@ def from_bytes(
     if steps > 1 and length / steps < chunk_size:
         chunk_size = int(length / steps)

-    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
-    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE

     if is_too_small_sequence:
         logger.log(
@@ -137,11 +135,11 @@ def from_bytes(
            ),
        )

-    prioritized_encodings = []  # type: List[str]
+    prioritized_encodings: List[str] = []

-    specified_encoding = (
+    specified_encoding: Optional[str] = (
         any_specified_encoding(sequences) if preemptive_behaviour else None
-    )  # type: Optional[str]
+    )

     if specified_encoding is not None:
         prioritized_encodings.append(specified_encoding)
@@ -151,15 +149,15 @@ def from_bytes(
            specified_encoding,
        )

-    tested = set()  # type: Set[str]
-    tested_but_hard_failure = []  # type: List[str]
-    tested_but_soft_failure = []  # type: List[str]
+    tested: Set[str] = set()
+    tested_but_hard_failure: List[str] = []
+    tested_but_soft_failure: List[str] = []

-    fallback_ascii = None  # type: Optional[CharsetMatch]
-    fallback_u8 = None  # type: Optional[CharsetMatch]
-    fallback_specified = None  # type: Optional[CharsetMatch]
+    fallback_ascii: Optional[CharsetMatch] = None
+    fallback_u8: Optional[CharsetMatch] = None
+    fallback_specified: Optional[CharsetMatch] = None

-    results = CharsetMatches()  # type: CharsetMatches
+    results: CharsetMatches = CharsetMatches()

     sig_encoding, sig_payload = identify_sig_or_bom(sequences)

@ -190,11 +188,11 @@ def from_bytes(
|
||||||
|
|
||||||
tested.add(encoding_iana)
|
tested.add(encoding_iana)
|
||||||
|
|
||||||
decoded_payload = None # type: Optional[str]
|
decoded_payload: Optional[str] = None
|
||||||
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
|
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||||
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
|
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||||
encoding_iana
|
encoding_iana
|
||||||
) # type: bool
|
)
|
||||||
|
|
||||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -205,7 +203,7 @@ def from_bytes(
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
|
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||||
except (ModuleNotFoundError, ImportError):
|
except (ModuleNotFoundError, ImportError):
|
||||||
logger.log(
|
logger.log(
|
||||||
TRACE,
|
TRACE,
|
||||||
|
@ -240,7 +238,7 @@ def from_bytes(
|
||||||
tested_but_hard_failure.append(encoding_iana)
|
tested_but_hard_failure.append(encoding_iana)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
similar_soft_failure_test = False # type: bool
|
similar_soft_failure_test: bool = False
|
||||||
|
|
||||||
for encoding_soft_failed in tested_but_soft_failure:
|
for encoding_soft_failed in tested_but_soft_failure:
|
||||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||||
|
@ -262,11 +260,11 @@ def from_bytes(
|
||||||
int(length / steps),
|
int(length / steps),
|
||||||
)
|
)
|
||||||
|
|
||||||
multi_byte_bonus = (
|
multi_byte_bonus: bool = (
|
||||||
is_multi_byte_decoder
|
is_multi_byte_decoder
|
||||||
and decoded_payload is not None
|
and decoded_payload is not None
|
||||||
and len(decoded_payload) < length
|
and len(decoded_payload) < length
|
||||||
) # type: bool
|
)
|
||||||
|
|
||||||
if multi_byte_bonus:
|
if multi_byte_bonus:
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -276,61 +274,27 @@ def from_bytes(
|
||||||
encoding_iana,
|
encoding_iana,
|
||||||
)
|
)
|
||||||
|
|
||||||
max_chunk_gave_up = int(len(r_) / 4) # type: int
|
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||||
|
|
||||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||||
early_stop_count = 0 # type: int
|
early_stop_count: int = 0
|
||||||
lazy_str_hard_failure = False
|
lazy_str_hard_failure = False
|
||||||
|
|
||||||
md_chunks = [] # type: List[str]
|
md_chunks: List[str] = []
|
||||||
md_ratios = []
|
md_ratios = []
|
||||||
|
|
||||||
for i in r_:
|
|
||||||
if i + chunk_size > length + 8:
|
|
||||||
continue
|
|
||||||
|
|
||||||
cut_sequence = sequences[i : i + chunk_size]
|
|
||||||
|
|
||||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
|
||||||
cut_sequence = sig_payload + cut_sequence
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
chunk = cut_sequence.decode(
|
for chunk in cut_sequence_chunks(
|
||||||
|
sequences,
|
||||||
encoding_iana,
|
encoding_iana,
|
||||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
r_,
|
||||||
) # type: str
|
chunk_size,
|
||||||
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
bom_or_sig_available,
|
||||||
logger.log(
|
strip_sig_or_bom,
|
||||||
TRACE,
|
sig_payload,
|
||||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
is_multi_byte_decoder,
|
||||||
encoding_iana,
|
decoded_payload,
|
||||||
str(e),
|
|
||||||
)
|
|
||||||
early_stop_count = max_chunk_gave_up
|
|
||||||
lazy_str_hard_failure = True
|
|
||||||
break
|
|
||||||
|
|
||||||
# multi-byte bad cutting detector and adjustment
|
|
||||||
# not the cleanest way to perform that fix but clever enough for now.
|
|
||||||
if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
|
|
||||||
|
|
||||||
chunk_partial_size_chk = min(chunk_size, 16) # type: int
|
|
||||||
|
|
||||||
if (
|
|
||||||
decoded_payload
|
|
||||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
|
||||||
):
|
):
|
||||||
for j in range(i, i - 4, -1):
|
|
||||||
cut_sequence = sequences[j : i + chunk_size]
|
|
||||||
|
|
||||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
|
||||||
cut_sequence = sig_payload + cut_sequence
|
|
||||||
|
|
||||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
|
||||||
|
|
||||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
|
||||||
break
|
|
||||||
|
|
||||||
md_chunks.append(chunk)
|
md_chunks.append(chunk)
|
||||||
|
|
||||||
md_ratios.append(mess_ratio(chunk, threshold))
|
md_ratios.append(mess_ratio(chunk, threshold))
|
||||||
|
@ -342,6 +306,15 @@ def from_bytes(
|
||||||
bom_or_sig_available and strip_sig_or_bom is False
|
bom_or_sig_available and strip_sig_or_bom is False
|
||||||
):
|
):
|
||||||
break
|
break
|
||||||
|
except UnicodeDecodeError as e: # Lazy str loading may have missed something there
|
||||||
|
logger.log(
|
||||||
|
TRACE,
|
||||||
|
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||||
|
encoding_iana,
|
||||||
|
str(e),
|
||||||
|
)
|
||||||
|
early_stop_count = max_chunk_gave_up
|
||||||
|
lazy_str_hard_failure = True
|
||||||
|
|
||||||
# We might want to check the sequence again with the whole content
|
# We might want to check the sequence again with the whole content
|
||||||
# Only if initial MD tests passes
|
# Only if initial MD tests passes
|
||||||
|
@ -362,9 +335,7 @@ def from_bytes(
|
||||||
tested_but_hard_failure.append(encoding_iana)
|
tested_but_hard_failure.append(encoding_iana)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
mean_mess_ratio = (
|
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||||
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
|
||||||
) # type: float
|
|
||||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||||
tested_but_soft_failure.append(encoding_iana)
|
tested_but_soft_failure.append(encoding_iana)
|
||||||
logger.log(
|
logger.log(
|
||||||
|
@ -399,7 +370,7 @@ def from_bytes(
|
||||||
)
|
)
|
||||||
|
|
||||||
if not is_multi_byte_decoder:
|
if not is_multi_byte_decoder:
|
||||||
target_languages = encoding_languages(encoding_iana) # type: List[str]
|
target_languages: List[str] = encoding_languages(encoding_iana)
|
||||||
else:
|
else:
|
||||||
target_languages = mb_encoding_languages(encoding_iana)
|
target_languages = mb_encoding_languages(encoding_iana)
|
||||||
|
|
||||||
|
@ -516,8 +487,8 @@ def from_fp(
|
||||||
steps: int = 5,
|
steps: int = 5,
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
threshold: float = 0.20,
|
threshold: float = 0.20,
|
||||||
cp_isolation: List[str] = None,
|
cp_isolation: Optional[List[str]] = None,
|
||||||
cp_exclusion: List[str] = None,
|
cp_exclusion: Optional[List[str]] = None,
|
||||||
preemptive_behaviour: bool = True,
|
preemptive_behaviour: bool = True,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> CharsetMatches:
|
) -> CharsetMatches:
|
||||||
|
@ -538,12 +509,12 @@ def from_fp(
|
||||||
|
|
||||||
|
|
||||||
def from_path(
|
def from_path(
|
||||||
path: PathLike,
|
path: "PathLike[Any]",
|
||||||
steps: int = 5,
|
steps: int = 5,
|
||||||
chunk_size: int = 512,
|
chunk_size: int = 512,
|
||||||
threshold: float = 0.20,
|
threshold: float = 0.20,
|
||||||
cp_isolation: List[str] = None,
|
cp_isolation: Optional[List[str]] = None,
|
||||||
cp_exclusion: List[str] = None,
|
cp_exclusion: Optional[List[str]] = None,
|
||||||
preemptive_behaviour: bool = True,
|
preemptive_behaviour: bool = True,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
) -> CharsetMatches:
|
) -> CharsetMatches:
|
||||||
|
@@ -565,17 +536,22 @@ def from_path(


 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
     Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
     """
+    warnings.warn(
+        "normalize is deprecated and will be removed in 3.0",
+        DeprecationWarning,
+    )
+
     results = from_path(
         path,
         steps,
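The new `warnings.warn(...)` call above marks `normalize` as deprecated ahead of its removal in 3.0. A generic sketch of that deprecation pattern, using an illustrative function name rather than the library's own:

import warnings


def normalize_file(path: str) -> None:
    """Illustrative stand-in for a helper slated for removal."""
    warnings.warn(
        "normalize_file is deprecated and will be removed in a future release",
        DeprecationWarning,
        stacklevel=2,  # attribute the warning to the caller (not used in the diff above)
    )
    # ...original behaviour would continue here...


# DeprecationWarning is filtered out by default; surface it for this demo.
warnings.simplefilter("default", DeprecationWarning)
normalize_file("example.txt")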
@@ -1,11 +1,8 @@
-# -*- coding: utf_8 -*-
-from collections import OrderedDict
-
-FREQUENCIES = OrderedDict(
-    [
-        (
-            "English",
-            [
-                "e",
-                "a",
-                "t",
+# -*- coding: utf-8 -*-
+from typing import Dict, List
+
+FREQUENCIES: Dict[str, List[str]] = {
+    "English": [
+        "e",
+        "a",
+        "t",
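Here the FREQUENCIES table changes from an `OrderedDict` built out of tuples to a plain annotated dict literal. On Python 3.7+ the built-in dict preserves insertion order, so iteration order is identical; a short sketch with the letter lists truncated to the few entries visible above:

from collections import OrderedDict
from typing import Dict, List

# Old shape: OrderedDict of (language, letters) tuples.
freq_old = OrderedDict(
    [
        ("English", ["e", "a", "t"]),
        ("German", ["e", "n", "i"]),
    ]
)

# New shape: plain dict literal with an explicit annotation.
freq_new: Dict[str, List[str]] = {
    "English": ["e", "a", "t"],
    "German": ["e", "n", "i"],
}

# Same keys, same order, same lookups.
assert list(freq_old) == list(freq_new) == ["English", "German"]
assert freq_old["German"] == freq_new["German"]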
@ -33,10 +30,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"German": [
|
||||||
(
|
|
||||||
"German",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"n",
|
"n",
|
||||||
"i",
|
"i",
|
||||||
|
@ -64,10 +58,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ö",
|
"ö",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"French": [
|
||||||
(
|
|
||||||
"French",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"s",
|
"s",
|
||||||
|
@ -95,10 +86,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"y",
|
"y",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"Dutch": [
|
||||||
(
|
|
||||||
"Dutch",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"n",
|
"n",
|
||||||
"a",
|
"a",
|
||||||
|
@ -126,10 +114,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"x",
|
"x",
|
||||||
"ë",
|
"ë",
|
||||||
],
|
],
|
||||||
),
|
"Italian": [
|
||||||
(
|
|
||||||
"Italian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
|
@ -157,10 +142,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"y",
|
"y",
|
||||||
"ò",
|
"ò",
|
||||||
],
|
],
|
||||||
),
|
"Polish": [
|
||||||
(
|
|
||||||
"Polish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"o",
|
"o",
|
||||||
|
@ -188,10 +170,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ę",
|
"ę",
|
||||||
"ó",
|
"ó",
|
||||||
],
|
],
|
||||||
),
|
"Spanish": [
|
||||||
(
|
|
||||||
"Spanish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"o",
|
"o",
|
||||||
|
@ -219,10 +198,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"á",
|
"á",
|
||||||
],
|
],
|
||||||
),
|
"Russian": [
|
||||||
(
|
|
||||||
"Russian",
|
|
||||||
[
|
|
||||||
"о",
|
"о",
|
||||||
"а",
|
"а",
|
||||||
"е",
|
"е",
|
||||||
|
@ -250,10 +226,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ж",
|
"ж",
|
||||||
"ц",
|
"ц",
|
||||||
],
|
],
|
||||||
),
|
"Japanese": [
|
||||||
(
|
|
||||||
"Japanese",
|
|
||||||
[
|
|
||||||
"の",
|
"の",
|
||||||
"に",
|
"に",
|
||||||
"る",
|
"る",
|
||||||
|
@ -281,10 +254,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"も",
|
"も",
|
||||||
"り",
|
"り",
|
||||||
],
|
],
|
||||||
),
|
"Portuguese": [
|
||||||
(
|
|
||||||
"Portuguese",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
"o",
|
"o",
|
||||||
|
@ -312,10 +282,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"í",
|
"í",
|
||||||
],
|
],
|
||||||
),
|
"Swedish": [
|
||||||
(
|
|
||||||
"Swedish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"n",
|
"n",
|
||||||
|
@ -343,10 +310,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"j",
|
"j",
|
||||||
"x",
|
"x",
|
||||||
],
|
],
|
||||||
),
|
"Chinese": [
|
||||||
(
|
|
||||||
"Chinese",
|
|
||||||
[
|
|
||||||
"的",
|
"的",
|
||||||
"一",
|
"一",
|
||||||
"是",
|
"是",
|
||||||
|
@ -377,10 +341,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"出",
|
"出",
|
||||||
"会",
|
"会",
|
||||||
],
|
],
|
||||||
),
|
"Ukrainian": [
|
||||||
(
|
|
||||||
"Ukrainian",
|
|
||||||
[
|
|
||||||
"о",
|
"о",
|
||||||
"а",
|
"а",
|
||||||
"н",
|
"н",
|
||||||
|
@ -408,10 +369,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ц",
|
"ц",
|
||||||
"ї",
|
"ї",
|
||||||
],
|
],
|
||||||
),
|
"Norwegian": [
|
||||||
(
|
|
||||||
"Norwegian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"r",
|
"r",
|
||||||
"n",
|
"n",
|
||||||
|
@ -439,10 +397,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"æ",
|
"æ",
|
||||||
"w",
|
"w",
|
||||||
],
|
],
|
||||||
),
|
"Finnish": [
|
||||||
(
|
|
||||||
"Finnish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"n",
|
"n",
|
||||||
|
@ -470,10 +425,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"w",
|
"w",
|
||||||
"z",
|
"z",
|
||||||
],
|
],
|
||||||
),
|
"Vietnamese": [
|
||||||
(
|
|
||||||
"Vietnamese",
|
|
||||||
[
|
|
||||||
"n",
|
"n",
|
||||||
"h",
|
"h",
|
||||||
"t",
|
"t",
|
||||||
|
@ -501,10 +453,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ộ",
|
"ộ",
|
||||||
"ế",
|
"ế",
|
||||||
],
|
],
|
||||||
),
|
"Czech": [
|
||||||
(
|
|
||||||
"Czech",
|
|
||||||
[
|
|
||||||
"o",
|
"o",
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
|
@ -532,10 +481,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"é",
|
"é",
|
||||||
"ř",
|
"ř",
|
||||||
],
|
],
|
||||||
),
|
"Hungarian": [
|
||||||
(
|
|
||||||
"Hungarian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"t",
|
"t",
|
||||||
|
@ -563,10 +509,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"f",
|
"f",
|
||||||
"c",
|
"c",
|
||||||
],
|
],
|
||||||
),
|
"Korean": [
|
||||||
(
|
|
||||||
"Korean",
|
|
||||||
[
|
|
||||||
"이",
|
"이",
|
||||||
"다",
|
"다",
|
||||||
"에",
|
"에",
|
||||||
|
@ -594,10 +537,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"스",
|
"스",
|
||||||
"일",
|
"일",
|
||||||
],
|
],
|
||||||
),
|
"Indonesian": [
|
||||||
(
|
|
||||||
"Indonesian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"n",
|
"n",
|
||||||
"e",
|
"e",
|
||||||
|
@ -625,10 +565,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"x",
|
"x",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"Turkish": [
|
||||||
(
|
|
||||||
"Turkish",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
|
@ -656,10 +593,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ç",
|
"ç",
|
||||||
"ğ",
|
"ğ",
|
||||||
],
|
],
|
||||||
),
|
"Romanian": [
|
||||||
(
|
|
||||||
"Romanian",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
|
@ -687,10 +621,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"â",
|
"â",
|
||||||
"j",
|
"j",
|
||||||
],
|
],
|
||||||
),
|
"Farsi": [
|
||||||
(
|
|
||||||
"Farsi",
|
|
||||||
[
|
|
||||||
"ا",
|
"ا",
|
||||||
"ی",
|
"ی",
|
||||||
"ر",
|
"ر",
|
||||||
|
@ -718,10 +649,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ط",
|
"ط",
|
||||||
"ص",
|
"ص",
|
||||||
],
|
],
|
||||||
),
|
"Arabic": [
|
||||||
(
|
|
||||||
"Arabic",
|
|
||||||
[
|
|
||||||
"ا",
|
"ا",
|
||||||
"ل",
|
"ل",
|
||||||
"ي",
|
"ي",
|
||||||
|
@ -749,10 +677,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"خ",
|
"خ",
|
||||||
"إ",
|
"إ",
|
||||||
],
|
],
|
||||||
),
|
"Danish": [
|
||||||
(
|
|
||||||
"Danish",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"r",
|
"r",
|
||||||
"n",
|
"n",
|
||||||
|
@ -780,10 +705,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"j",
|
"j",
|
||||||
"w",
|
"w",
|
||||||
],
|
],
|
||||||
),
|
"Serbian": [
|
||||||
(
|
|
||||||
"Serbian",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"и",
|
"и",
|
||||||
"о",
|
"о",
|
||||||
|
@ -811,10 +733,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ц",
|
"ц",
|
||||||
"ш",
|
"ш",
|
||||||
],
|
],
|
||||||
),
|
"Lithuanian": [
|
||||||
(
|
|
||||||
"Lithuanian",
|
|
||||||
[
|
|
||||||
"i",
|
"i",
|
||||||
"a",
|
"a",
|
||||||
"s",
|
"s",
|
||||||
|
@ -842,10 +761,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ą",
|
"ą",
|
||||||
"į",
|
"į",
|
||||||
],
|
],
|
||||||
),
|
"Slovene": [
|
||||||
(
|
|
||||||
"Slovene",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
|
@ -873,10 +789,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"f",
|
"f",
|
||||||
"y",
|
"y",
|
||||||
],
|
],
|
||||||
),
|
"Slovak": [
|
||||||
(
|
|
||||||
"Slovak",
|
|
||||||
[
|
|
||||||
"o",
|
"o",
|
||||||
"a",
|
"a",
|
||||||
"e",
|
"e",
|
||||||
|
@ -904,10 +817,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"č",
|
"č",
|
||||||
"é",
|
"é",
|
||||||
],
|
],
|
||||||
),
|
"Hebrew": [
|
||||||
(
|
|
||||||
"Hebrew",
|
|
||||||
[
|
|
||||||
"י",
|
"י",
|
||||||
"ו",
|
"ו",
|
||||||
"ה",
|
"ה",
|
||||||
|
@ -934,10 +844,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ז",
|
"ז",
|
||||||
"ך",
|
"ך",
|
||||||
],
|
],
|
||||||
),
|
"Bulgarian": [
|
||||||
(
|
|
||||||
"Bulgarian",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"и",
|
"и",
|
||||||
"о",
|
"о",
|
||||||
|
@ -965,10 +872,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"щ",
|
"щ",
|
||||||
"х",
|
"х",
|
||||||
],
|
],
|
||||||
),
|
"Croatian": [
|
||||||
(
|
|
||||||
"Croatian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"o",
|
"o",
|
||||||
|
@ -996,10 +900,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ć",
|
"ć",
|
||||||
"f",
|
"f",
|
||||||
],
|
],
|
||||||
),
|
"Hindi": [
|
||||||
(
|
|
||||||
"Hindi",
|
|
||||||
[
|
|
||||||
"क",
|
"क",
|
||||||
"र",
|
"र",
|
||||||
"स",
|
"स",
|
||||||
|
@ -1027,10 +928,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ष",
|
"ष",
|
||||||
"इ",
|
"इ",
|
||||||
],
|
],
|
||||||
),
|
"Estonian": [
|
||||||
(
|
|
||||||
"Estonian",
|
|
||||||
[
|
|
||||||
"a",
|
"a",
|
||||||
"i",
|
"i",
|
||||||
"e",
|
"e",
|
||||||
|
@ -1058,10 +956,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ö",
|
"ö",
|
||||||
"y",
|
"y",
|
||||||
],
|
],
|
||||||
),
|
"Simple English": [
|
||||||
(
|
|
||||||
"Simple English",
|
|
||||||
[
|
|
||||||
"e",
|
"e",
|
||||||
"a",
|
"a",
|
||||||
"t",
|
"t",
|
||||||
|
@ -1089,10 +984,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"z",
|
"z",
|
||||||
"q",
|
"q",
|
||||||
],
|
],
|
||||||
),
|
"Thai": [
|
||||||
(
|
|
||||||
"Thai",
|
|
||||||
[
|
|
||||||
"า",
|
"า",
|
||||||
"น",
|
"น",
|
||||||
"ร",
|
"ร",
|
||||||
|
@ -1120,10 +1012,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ข",
|
"ข",
|
||||||
"ใ",
|
"ใ",
|
||||||
],
|
],
|
||||||
),
|
"Greek": [
|
||||||
(
|
|
||||||
"Greek",
|
|
||||||
[
|
|
||||||
"α",
|
"α",
|
||||||
"τ",
|
"τ",
|
||||||
"ο",
|
"ο",
|
||||||
|
@ -1151,10 +1040,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"θ",
|
"θ",
|
||||||
"ύ",
|
"ύ",
|
||||||
],
|
],
|
||||||
),
|
"Tamil": [
|
||||||
(
|
|
||||||
"Tamil",
|
|
||||||
[
|
|
||||||
"க",
|
"க",
|
||||||
"த",
|
"த",
|
||||||
"ப",
|
"ப",
|
||||||
|
@ -1180,10 +1066,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"ஒ",
|
"ஒ",
|
||||||
"ஸ",
|
"ஸ",
|
||||||
],
|
],
|
||||||
),
|
"Classical Chinese": [
|
||||||
(
|
|
||||||
"Classical Chinese",
|
|
||||||
[
|
|
||||||
"之",
|
"之",
|
||||||
"年",
|
"年",
|
||||||
"為",
|
"為",
|
||||||
|
@ -1208,10 +1091,7 @@ FREQUENCIES = OrderedDict(
|
||||||
"五",
|
"五",
|
||||||
"四",
|
"四",
|
||||||
],
|
],
|
||||||
),
|
"Kazakh": [
|
||||||
(
|
|
||||||
"Kazakh",
|
|
||||||
[
|
|
||||||
"а",
|
"а",
|
||||||
"ы",
|
"ы",
|
||||||
"е",
|
"е",
|
||||||
|
@@ -1239,6 +1119,4 @@ FREQUENCIES = OrderedDict(
-                "г",
-                "ө",
-            ],
-        ),
-    ]
-)
+        "г",
+        "ө",
+    ],
+}
@@ -1,8 +1,8 @@
 import importlib
 from codecs import IncrementalDecoder
-from collections import Counter, OrderedDict
+from collections import Counter
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

 from .assets import FREQUENCIES
 from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@ -24,17 +24,19 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
|
||||||
if is_multi_byte_encoding(iana_name):
|
if is_multi_byte_encoding(iana_name):
|
||||||
raise IOError("Function not supported on multi-byte code page")
|
raise IOError("Function not supported on multi-byte code page")
|
||||||
|
|
||||||
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
|
decoder = importlib.import_module(
|
||||||
|
"encodings.{}".format(iana_name)
|
||||||
|
).IncrementalDecoder
|
||||||
|
|
||||||
p = decoder(errors="ignore") # type: IncrementalDecoder
|
p: IncrementalDecoder = decoder(errors="ignore")
|
||||||
seen_ranges = {} # type: Dict[str, int]
|
seen_ranges: Dict[str, int] = {}
|
||||||
character_count = 0 # type: int
|
character_count: int = 0
|
||||||
|
|
||||||
for i in range(0x40, 0xFF):
|
for i in range(0x40, 0xFF):
|
||||||
chunk = p.decode(bytes([i])) # type: str
|
chunk: str = p.decode(bytes([i]))
|
||||||
|
|
||||||
if chunk:
|
if chunk:
|
||||||
character_range = unicode_range(chunk) # type: Optional[str]
|
character_range: Optional[str] = unicode_range(chunk)
|
||||||
|
|
||||||
if character_range is None:
|
if character_range is None:
|
||||||
continue
|
continue
|
||||||
|
@ -58,7 +60,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Return inferred languages used with a unicode range.
|
Return inferred languages used with a unicode range.
|
||||||
"""
|
"""
|
||||||
languages = [] # type: List[str]
|
languages: List[str] = []
|
||||||
|
|
||||||
for language, characters in FREQUENCIES.items():
|
for language, characters in FREQUENCIES.items():
|
||||||
for character in characters:
|
for character in characters:
|
||||||
|
@ -75,8 +77,8 @@ def encoding_languages(iana_name: str) -> List[str]:
|
||||||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||||
This function does the correspondence.
|
This function does the correspondence.
|
||||||
"""
|
"""
|
||||||
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
|
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
|
||||||
primary_range = None # type: Optional[str]
|
primary_range: Optional[str] = None
|
||||||
|
|
||||||
for specified_range in unicode_ranges:
|
for specified_range in unicode_ranges:
|
||||||
if "Latin" not in specified_range:
|
if "Latin" not in specified_range:
|
||||||
|
@ -115,8 +117,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
|
||||||
"""
|
"""
|
||||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||||
"""
|
"""
|
||||||
target_have_accents = False # type: bool
|
target_have_accents: bool = False
|
||||||
target_pure_latin = True # type: bool
|
target_pure_latin: bool = True
|
||||||
|
|
||||||
for character in FREQUENCIES[language]:
|
for character in FREQUENCIES[language]:
|
||||||
if not target_have_accents and is_accentuated(character):
|
if not target_have_accents and is_accentuated(character):
|
||||||
|
@ -133,7 +135,7 @@ def alphabet_languages(
|
||||||
"""
|
"""
|
||||||
Return associated languages associated to given characters.
|
Return associated languages associated to given characters.
|
||||||
"""
|
"""
|
||||||
languages = [] # type: List[Tuple[str, float]]
|
languages: List[Tuple[str, float]] = []
|
||||||
|
|
||||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||||
|
|
||||||
|
@ -147,13 +149,13 @@ def alphabet_languages(
|
||||||
if target_have_accents is False and source_have_accents:
|
if target_have_accents is False and source_have_accents:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
character_count = len(language_characters) # type: int
|
character_count: int = len(language_characters)
|
||||||
|
|
||||||
character_match_count = len(
|
character_match_count: int = len(
|
||||||
[c for c in language_characters if c in characters]
|
[c for c in language_characters if c in characters]
|
||||||
) # type: int
|
)
|
||||||
|
|
||||||
ratio = character_match_count / character_count # type: float
|
ratio: float = character_match_count / character_count
|
||||||
|
|
||||||
if ratio >= 0.2:
|
if ratio >= 0.2:
|
||||||
languages.append((language, ratio))
|
languages.append((language, ratio))
|
||||||
|
@@ -174,36 +176,33 @@ def characters_popularity_compare(
     if language not in FREQUENCIES:
         raise ValueError("{} not available".format(language))

-    character_approved_count = 0  # type: int
+    character_approved_count: int = 0
+    FREQUENCIES_language_set = set(FREQUENCIES[language])

     for character in ordered_characters:
-        if character not in FREQUENCIES[language]:
+        if character not in FREQUENCIES_language_set:
             continue

-        characters_before_source = FREQUENCIES[language][
+        characters_before_source: List[str] = FREQUENCIES[language][
             0 : FREQUENCIES[language].index(character)
-        ]  # type: List[str]
-        characters_after_source = FREQUENCIES[language][
+        ]
+        characters_after_source: List[str] = FREQUENCIES[language][
             FREQUENCIES[language].index(character) :
-        ]  # type: List[str]
-        characters_before = ordered_characters[
+        ]
+        characters_before: List[str] = ordered_characters[
             0 : ordered_characters.index(character)
-        ]  # type: List[str]
-        characters_after = ordered_characters[
+        ]
+        characters_after: List[str] = ordered_characters[
             ordered_characters.index(character) :
-        ]  # type: List[str]
+        ]

-        before_match_count = [
-            e in characters_before for e in characters_before_source
-        ].count(
-            True
-        )  # type: int
-        after_match_count = [
-            e in characters_after for e in characters_after_source
-        ].count(
-            True
-        )  # type: int
+        before_match_count: int = len(
+            set(characters_before) & set(characters_before_source)
+        )
+
+        after_match_count: int = len(
+            set(characters_after) & set(characters_after_source)
+        )

         if len(characters_before_source) == 0 and before_match_count <= 4:
             character_approved_count += 1
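Besides the annotation cleanup, this hunk swaps a list-comprehension-plus-`.count(True)` tally for a set intersection. The two are equivalent here because each frequency list contains a letter at most once, and the set form avoids a linear scan per membership test. A toy illustration (data made up):

characters_before = ["e", "a", "t", "o"]         # observed ordering (toy data)
characters_before_source = ["e", "t", "a", "i"]  # reference ordering (toy data)

# Old style: one `in` test per reference character, each scanning the list.
before_match_count_old = [
    e in characters_before for e in characters_before_source
].count(True)

# New style: build sets once, intersect, take the size.
before_match_count_new = len(set(characters_before) & set(characters_before_source))

assert before_match_count_old == before_match_count_new == 3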
@ -229,18 +228,18 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
|
||||||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||||
One containing the latin letters and the other hebrew.
|
One containing the latin letters and the other hebrew.
|
||||||
"""
|
"""
|
||||||
layers = OrderedDict() # type: Dict[str, str]
|
layers: Dict[str, str] = {}
|
||||||
|
|
||||||
for character in decoded_sequence:
|
for character in decoded_sequence:
|
||||||
if character.isalpha() is False:
|
if character.isalpha() is False:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
character_range = unicode_range(character) # type: Optional[str]
|
character_range: Optional[str] = unicode_range(character)
|
||||||
|
|
||||||
if character_range is None:
|
if character_range is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
layer_target_range = None # type: Optional[str]
|
layer_target_range: Optional[str] = None
|
||||||
|
|
||||||
for discovered_range in layers:
|
for discovered_range in layers:
|
||||||
if (
|
if (
|
||||||
|
@ -267,7 +266,7 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
|
||||||
This function merge results previously given by the function coherence_ratio.
|
This function merge results previously given by the function coherence_ratio.
|
||||||
The return type is the same as coherence_ratio.
|
The return type is the same as coherence_ratio.
|
||||||
"""
|
"""
|
||||||
per_language_ratios = OrderedDict() # type: Dict[str, List[float]]
|
per_language_ratios: Dict[str, List[float]] = {}
|
||||||
for result in results:
|
for result in results:
|
||||||
for sub_result in result:
|
for sub_result in result:
|
||||||
language, ratio = sub_result
|
language, ratio = sub_result
|
||||||
|
@ -299,10 +298,10 @@ def coherence_ratio(
|
||||||
A layer = Character extraction by alphabets/ranges.
|
A layer = Character extraction by alphabets/ranges.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
results = [] # type: List[Tuple[str, float]]
|
results: List[Tuple[str, float]] = []
|
||||||
ignore_non_latin = False # type: bool
|
ignore_non_latin: bool = False
|
||||||
|
|
||||||
sufficient_match_count = 0 # type: int
|
sufficient_match_count: int = 0
|
||||||
|
|
||||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||||
if "Latin Based" in lg_inclusion_list:
|
if "Latin Based" in lg_inclusion_list:
|
||||||
|
@ -310,22 +309,22 @@ def coherence_ratio(
|
||||||
lg_inclusion_list.remove("Latin Based")
|
lg_inclusion_list.remove("Latin Based")
|
||||||
|
|
||||||
for layer in alpha_unicode_split(decoded_sequence):
|
for layer in alpha_unicode_split(decoded_sequence):
|
||||||
sequence_frequencies = Counter(layer) # type: Counter
|
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||||
most_common = sequence_frequencies.most_common()
|
most_common = sequence_frequencies.most_common()
|
||||||
|
|
||||||
character_count = sum(o for c, o in most_common) # type: int
|
character_count: int = sum(o for c, o in most_common)
|
||||||
|
|
||||||
if character_count <= TOO_SMALL_SEQUENCE:
|
if character_count <= TOO_SMALL_SEQUENCE:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
popular_character_ordered = [c for c, o in most_common] # type: List[str]
|
popular_character_ordered: List[str] = [c for c, o in most_common]
|
||||||
|
|
||||||
for language in lg_inclusion_list or alphabet_languages(
|
for language in lg_inclusion_list or alphabet_languages(
|
||||||
popular_character_ordered, ignore_non_latin
|
popular_character_ordered, ignore_non_latin
|
||||||
):
|
):
|
||||||
ratio = characters_popularity_compare(
|
ratio: float = characters_popularity_compare(
|
||||||
language, popular_character_ordered
|
language, popular_character_ordered
|
||||||
) # type: float
|
)
|
||||||
|
|
||||||
if ratio < threshold:
|
if ratio < threshold:
|
||||||
continue
|
continue
|
||||||
|
|
|
@@ -3,7 +3,12 @@ import sys
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional

+try:
+    from unicodedata2 import unidata_version
+except ImportError:
+    from unicodedata import unidata_version
+
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
@ -43,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||||
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
|
||||||
|
|
||||||
|
|
||||||
def cli_detect(argv: List[str] = None) -> int:
|
def cli_detect(argv: Optional[List[str]] = None) -> int:
|
||||||
"""
|
"""
|
||||||
CLI assistant using ARGV and ArgumentParser
|
CLI assistant using ARGV and ArgumentParser
|
||||||
:param argv:
|
:param argv:
|
||||||
|
@@ -111,7 +116,7 @@ def cli_detect(argv: List[str] = None) -> int:
         "-t",
         "--threshold",
         action="store",
-        default=0.1,
+        default=0.2,
         type=float,
         dest="threshold",
         help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
@@ -119,8 +124,8 @@ def cli_detect(argv: List[str] = None) -> int:
     parser.add_argument(
         "--version",
         action="version",
-        version="Charset-Normalizer {} - Python {}".format(
-            __version__, python_version()
+        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
+            __version__, python_version(), unidata_version
         ),
         help="Show version information and exit.",
     )
@ -229,7 +234,7 @@ def cli_detect(argv: List[str] = None) -> int:
|
||||||
my_file.close()
|
my_file.close()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
o_ = my_file.name.split(".") # type: List[str]
|
o_: List[str] = my_file.name.split(".")
|
||||||
|
|
||||||
if args.replace is False:
|
if args.replace is False:
|
||||||
o_.insert(-1, best_guess.encoding)
|
o_.insert(-1, best_guess.encoding)
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
|
||||||
from collections import OrderedDict
|
|
||||||
from encodings.aliases import aliases
|
from encodings.aliases import aliases
|
||||||
from re import IGNORECASE, compile as re_compile
|
from re import IGNORECASE, compile as re_compile
|
||||||
from typing import Dict, List, Set, Union
|
from typing import Dict, List, Set, Union
|
||||||
|
@@ -7,31 +6,26 @@ from typing import Dict, List, Set, Union
 from .assets import FREQUENCIES

 # Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS = OrderedDict(
-    [
-        ("utf_8", BOM_UTF8),
-        (
-            "utf_7",
-            [
-                b"\x2b\x2f\x76\x38",
-                b"\x2b\x2f\x76\x39",
-                b"\x2b\x2f\x76\x2b",
-                b"\x2b\x2f\x76\x2f",
-                b"\x2b\x2f\x76\x38\x2d",
-            ],
-        ),
-        ("gb18030", b"\x84\x31\x95\x33"),
-        ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
-        ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
-    ]
-)  # type: Dict[str, Union[bytes, List[bytes]]]
+ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+    "utf_8": BOM_UTF8,
+    "utf_7": [
+        b"\x2b\x2f\x76\x38",
+        b"\x2b\x2f\x76\x39",
+        b"\x2b\x2f\x76\x2b",
+        b"\x2b\x2f\x76\x2f",
+        b"\x2b\x2f\x76\x38\x2d",
+    ],
+    "gb18030": b"\x84\x31\x95\x33",
+    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}

-TOO_SMALL_SEQUENCE = 32  # type: int
-TOO_BIG_SEQUENCE = int(10e6)  # type: int
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)

-UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int
+UTF8_MAXIMAL_ALLOCATION: int = 1112064

-UNICODE_RANGES_COMBINED = {
+UNICODE_RANGES_COMBINED: Dict[str, range] = {
     "Control character": range(31 + 1),
     "Basic Latin": range(32, 127 + 1),
     "Latin-1 Supplement": range(128, 255 + 1),
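ENCODING_MARKS maps an encoding name to either a single BOM/signature or a list of alternatives. A hedged sketch of how such a table can be used to probe the start of a payload; the helper below is illustrative and not the library's own `identify_sig_or_bom`:

from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from typing import Dict, List, Optional, Tuple, Union

ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
    "utf_8": BOM_UTF8,
    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],  # listed before utf_16: its LE BOM starts with the utf_16 one
    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}


def sniff_signature(payload: bytes) -> Optional[Tuple[str, bytes]]:
    """Illustrative helper: (encoding, mark) if the payload begins with a known BOM/SIG."""
    for encoding, marks in ENCODING_MARKS.items():
        for mark in marks if isinstance(marks, list) else [marks]:
            if payload.startswith(mark):
                return encoding, mark
    return None


print(sniff_signature(BOM_UTF8 + "hello".encode("utf-8")))  # ('utf_8', b'\xef\xbb\xbf')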
@ -311,10 +305,10 @@ UNICODE_RANGES_COMBINED = {
|
||||||
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
"CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
|
||||||
"Tags": range(917504, 917631 + 1),
|
"Tags": range(917504, 917631 + 1),
|
||||||
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
"Variation Selectors Supplement": range(917760, 917999 + 1),
|
||||||
} # type: Dict[str, range]
|
}
|
||||||
|
|
||||||
|
|
||||||
UNICODE_SECONDARY_RANGE_KEYWORD = [
|
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
|
||||||
"Supplement",
|
"Supplement",
|
||||||
"Extended",
|
"Extended",
|
||||||
"Extensions",
|
"Extensions",
|
||||||
|
@ -330,25 +324,25 @@ UNICODE_SECONDARY_RANGE_KEYWORD = [
|
||||||
"Shapes",
|
"Shapes",
|
||||||
"Supplemental",
|
"Supplemental",
|
||||||
"Tags",
|
"Tags",
|
||||||
] # type: List[str]
|
]
|
||||||
|
|
||||||
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
|
||||||
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
|
||||||
IGNORECASE,
|
IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
IANA_SUPPORTED = sorted(
|
IANA_SUPPORTED: List[str] = sorted(
|
||||||
filter(
|
filter(
|
||||||
lambda x: x.endswith("_codec") is False
|
lambda x: x.endswith("_codec") is False
|
||||||
and x not in {"rot_13", "tactis", "mbcs"},
|
and x not in {"rot_13", "tactis", "mbcs"},
|
||||||
list(set(aliases.values())),
|
list(set(aliases.values())),
|
||||||
)
|
)
|
||||||
) # type: List[str]
|
)
|
||||||
|
|
||||||
IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED) # type: int
|
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
|
||||||
|
|
||||||
# pre-computed code page that are similar using the function cp_similarity.
|
# pre-computed code page that are similar using the function cp_similarity.
|
||||||
IANA_SUPPORTED_SIMILAR = {
|
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
|
||||||
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
|
||||||
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
|
||||||
"cp1125": ["cp866"],
|
"cp1125": ["cp866"],
|
||||||
|
@ -434,10 +428,10 @@ IANA_SUPPORTED_SIMILAR = {
|
||||||
"mac_turkish": ["mac_iceland", "mac_roman"],
|
"mac_turkish": ["mac_iceland", "mac_roman"],
|
||||||
"ptcp154": ["cp1251", "kz1048"],
|
"ptcp154": ["cp1251", "kz1048"],
|
||||||
"tis_620": ["iso8859_11"],
|
"tis_620": ["iso8859_11"],
|
||||||
} # type: Dict[str, List[str]]
|
}
|
||||||
|
|
||||||
|
|
||||||
CHARDET_CORRESPONDENCE = {
|
CHARDET_CORRESPONDENCE: Dict[str, str] = {
|
||||||
"iso2022_kr": "ISO-2022-KR",
|
"iso2022_kr": "ISO-2022-KR",
|
||||||
"iso2022_jp": "ISO-2022-JP",
|
"iso2022_jp": "ISO-2022-JP",
|
||||||
"euc_kr": "EUC-KR",
|
"euc_kr": "EUC-KR",
|
||||||
|
@ -470,10 +464,10 @@ CHARDET_CORRESPONDENCE = {
|
||||||
"cp1256": "windows-1256",
|
"cp1256": "windows-1256",
|
||||||
"cp1254": "Windows-1254",
|
"cp1254": "Windows-1254",
|
||||||
"cp949": "CP949",
|
"cp949": "CP949",
|
||||||
} # type: Dict[str, str]
|
}
|
||||||
|
|
||||||
|
|
||||||
COMMON_SAFE_ASCII_CHARACTERS = {
|
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
|
||||||
"<",
|
"<",
|
||||||
">",
|
">",
|
||||||
"=",
|
"=",
|
||||||
|
@ -489,15 +483,15 @@ COMMON_SAFE_ASCII_CHARACTERS = {
|
||||||
"|",
|
"|",
|
||||||
'"',
|
'"',
|
||||||
"-",
|
"-",
|
||||||
} # type: Set[str]
|
}
|
||||||
|
|
||||||
|
|
||||||
KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
|
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
|
||||||
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"} # type: Set[str]
|
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
|
||||||
|
|
||||||
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
|
||||||
|
|
||||||
LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES) # type: int
|
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
|
||||||
|
|
||||||
# Logging LEVEL bellow DEBUG
|
# Logging LEVEL bellow DEBUG
|
||||||
TRACE = 5 # type: int
|
TRACE: int = 5
|
||||||
|
|
|
@ -16,6 +16,7 @@ from .utils import (
|
||||||
is_separator,
|
is_separator,
|
||||||
is_symbol,
|
is_symbol,
|
||||||
is_thai,
|
is_thai,
|
||||||
|
is_unprintable,
|
||||||
remove_accent,
|
remove_accent,
|
||||||
unicode_range,
|
unicode_range,
|
||||||
)
|
)
|
||||||
|
@ -57,12 +58,12 @@ class MessDetectorPlugin:
|
||||||
|
|
||||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._punctuation_count = 0 # type: int
|
self._punctuation_count: int = 0
|
||||||
self._symbol_count = 0 # type: int
|
self._symbol_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
|
|
||||||
self._last_printable_char = None # type: Optional[str]
|
self._last_printable_char: Optional[str] = None
|
||||||
self._frenzy_symbol_in_word = False # type: bool
|
self._frenzy_symbol_in_word: bool = False
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isprintable()
|
return character.isprintable()
|
||||||
|
@ -95,17 +96,17 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||||
if self._character_count == 0:
|
if self._character_count == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
ratio_of_punctuation = (
|
ratio_of_punctuation: float = (
|
||||||
self._punctuation_count + self._symbol_count
|
self._punctuation_count + self._symbol_count
|
||||||
) / self._character_count # type: float
|
) / self._character_count
|
||||||
|
|
||||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||||
|
|
||||||
|
|
||||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
self._accentuated_count = 0 # type: int
|
self._accentuated_count: int = 0
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isalpha()
|
return character.isalpha()
|
||||||
|
@@ -124,26 +125,20 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
     def ratio(self) -> float:
         if self._character_count == 0:
             return 0.0
-        ratio_of_accentuation = (
-            self._accentuated_count / self._character_count
-        )  # type: float
+        ratio_of_accentuation: float = self._accentuated_count / self._character_count
         return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


 class UnprintablePlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._unprintable_count = 0  # type: int
-        self._character_count = 0  # type: int
+        self._unprintable_count: int = 0
+        self._character_count: int = 0

     def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
-        if (
-            character.isspace() is False  # includes \n \t \r \v
-            and character.isprintable() is False
-            and character != "\x1A"  # Why? Its the ASCII substitute character.
-        ):
+        if is_unprintable(character):
             self._unprintable_count += 1
         self._character_count += 1

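The three-clause inline test is replaced by a single `is_unprintable` helper imported from `.utils` (added to the import block earlier in this file's diff). Restated as a standalone function, the old inline predicate reads roughly as follows; this is a sketch of the removed condition, not the actual utils implementation:

def is_unprintable(character: str) -> bool:
    # The condition removed above: not whitespace, not printable,
    # and not "\x1A", the ASCII substitute character that is tolerated on purpose.
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"
    )


assert is_unprintable("\x00") is True
assert is_unprintable("\n") is False  # whitespace does not count
assert is_unprintable("A") is False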
@ -160,10 +155,10 @@ class UnprintablePlugin(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._successive_count = 0 # type: int
|
self._successive_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
|
|
||||||
self._last_latin_character = None # type: Optional[str]
|
self._last_latin_character: Optional[str] = None
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isalpha() and is_latin(character)
|
return character.isalpha() and is_latin(character)
|
||||||
|
@ -197,9 +192,9 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuspiciousRange(MessDetectorPlugin):
|
class SuspiciousRange(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._suspicious_successive_range_count = 0 # type: int
|
self._suspicious_successive_range_count: int = 0
|
||||||
self._character_count = 0 # type: int
|
self._character_count: int = 0
|
||||||
self._last_printable_seen = None # type: Optional[str]
|
self._last_printable_seen: Optional[str] = None
|
||||||
|
|
||||||
def eligible(self, character: str) -> bool:
|
def eligible(self, character: str) -> bool:
|
||||||
return character.isprintable()
|
return character.isprintable()
|
||||||
|
@ -219,10 +214,8 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
self._last_printable_seen = character
|
self._last_printable_seen = character
|
||||||
return
|
return
|
||||||
|
|
||||||
unicode_range_a = unicode_range(
|
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
|
||||||
self._last_printable_seen
|
unicode_range_b: Optional[str] = unicode_range(character)
|
||||||
) # type: Optional[str]
|
|
||||||
unicode_range_b = unicode_range(character) # type: Optional[str]
|
|
||||||
|
|
||||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||||
self._suspicious_successive_range_count += 1
|
self._suspicious_successive_range_count += 1
|
||||||
|
@ -239,9 +232,9 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
if self._character_count == 0:
|
if self._character_count == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
ratio_of_suspicious_range_usage = (
|
ratio_of_suspicious_range_usage: float = (
|
||||||
self._suspicious_successive_range_count * 2
|
self._suspicious_successive_range_count * 2
|
||||||
) / self._character_count # type: float
|
) / self._character_count
|
||||||
|
|
||||||
if ratio_of_suspicious_range_usage < 0.1:
|
if ratio_of_suspicious_range_usage < 0.1:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
@ -251,25 +244,25 @@ class SuspiciousRange(MessDetectorPlugin):
|
||||||
|
|
||||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._word_count = 0 # type: int
|
self._word_count: int = 0
|
||||||
self._bad_word_count = 0 # type: int
|
self._bad_word_count: int = 0
|
||||||
self._foreign_long_count = 0 # type: int
|
self._foreign_long_count: int = 0
|
||||||
|
|
||||||
-        self._is_current_word_bad = False  # type: bool
+        self._is_current_word_bad: bool = False
-        self._foreign_long_watch = False  # type: bool
+        self._foreign_long_watch: bool = False

-        self._character_count = 0  # type: int
+        self._character_count: int = 0
-        self._bad_character_count = 0  # type: int
+        self._bad_character_count: int = 0

-        self._buffer = ""  # type: str
+        self._buffer: str = ""
-        self._buffer_accent_count = 0  # type: int
+        self._buffer_accent_count: int = 0

     def eligible(self, character: str) -> bool:
         return True

     def feed(self, character: str) -> None:
         if character.isalpha():
-            self._buffer = "".join([self._buffer, character])
+            self._buffer += character
             if is_accentuated(character):
                 self._buffer_accent_count += 1
             if (
@@ -289,7 +282,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                 character.isspace() or is_punctuation(character) or is_separator(character)
             ) and self._buffer:
                 self._word_count += 1
-                buffer_length = len(self._buffer)  # type: int
+                buffer_length: int = len(self._buffer)

                 self._character_count += buffer_length

@@ -346,8 +339,8 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
     """

     def __init__(self) -> None:
-        self._wrong_stop_count = 0  # type: int
+        self._wrong_stop_count: int = 0
-        self._cjk_character_count = 0  # type: int
+        self._cjk_character_count: int = 0

     def eligible(self, character: str) -> bool:
         return True
@@ -372,17 +365,17 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):

 class ArchaicUpperLowerPlugin(MessDetectorPlugin):
     def __init__(self) -> None:
-        self._buf = False  # type: bool
+        self._buf: bool = False

-        self._character_count_since_last_sep = 0  # type: int
+        self._character_count_since_last_sep: int = 0

-        self._successive_upper_lower_count = 0  # type: int
+        self._successive_upper_lower_count: int = 0
-        self._successive_upper_lower_count_final = 0  # type: int
+        self._successive_upper_lower_count_final: int = 0

-        self._character_count = 0  # type: int
+        self._character_count: int = 0

-        self._last_alpha_seen = None  # type: Optional[str]
+        self._last_alpha_seen: Optional[str] = None
-        self._current_ascii_only = True  # type: bool
+        self._current_ascii_only: bool = True

     def eligible(self, character: str) -> bool:
         return True
@@ -446,6 +439,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
         return self._successive_upper_lower_count_final / self._character_count


+@lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
 ) -> bool:
@@ -524,16 +518,16 @@ def mess_ratio(
     Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
     """

-    detectors = [
+    detectors: List[MessDetectorPlugin] = [
         md_class() for md_class in MessDetectorPlugin.__subclasses__()
-    ]  # type: List[MessDetectorPlugin]
+    ]

-    length = len(decoded_sequence) + 1  # type: int
+    length: int = len(decoded_sequence) + 1

-    mean_mess_ratio = 0.0  # type: float
+    mean_mess_ratio: float = 0.0

     if length < 512:
-        intermediary_mean_mess_ratio_calc = 32  # type: int
+        intermediary_mean_mess_ratio_calc: int = 32
     elif length <= 1024:
         intermediary_mean_mess_ratio_calc = 64
     else:
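Most of the hunks above are mechanical: the comment-based type hints that the 2.0.x code carried are rewritten as inline variable annotations. A minimal illustrative sketch of the two spellings (not taken from the diff itself):

    # Before: comment-style hint, the only option prior to PEP 526
    buffer = ""  # type: str

    # After: inline annotation, available on Python 3.6 and newer
    buffer: str = ""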
@@ -4,7 +4,16 @@ from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Counter as TypeCounter,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)

 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .md import mess_ratio
@@ -21,21 +30,21 @@ class CharsetMatch:
         languages: "CoherenceMatches",
         decoded_payload: Optional[str] = None,
     ):
-        self._payload = payload  # type: bytes
+        self._payload: bytes = payload

-        self._encoding = guessed_encoding  # type: str
+        self._encoding: str = guessed_encoding
-        self._mean_mess_ratio = mean_mess_ratio  # type: float
+        self._mean_mess_ratio: float = mean_mess_ratio
-        self._languages = languages  # type: CoherenceMatches
+        self._languages: CoherenceMatches = languages
-        self._has_sig_or_bom = has_sig_or_bom  # type: bool
+        self._has_sig_or_bom: bool = has_sig_or_bom
-        self._unicode_ranges = None  # type: Optional[List[str]]
+        self._unicode_ranges: Optional[List[str]] = None

-        self._leaves = []  # type: List[CharsetMatch]
+        self._leaves: List[CharsetMatch] = []
-        self._mean_coherence_ratio = 0.0  # type: float
+        self._mean_coherence_ratio: float = 0.0

-        self._output_payload = None  # type: Optional[bytes]
+        self._output_payload: Optional[bytes] = None
-        self._output_encoding = None  # type: Optional[str]
+        self._output_encoding: Optional[str] = None

-        self._string = decoded_payload  # type: Optional[str]
+        self._string: Optional[str] = decoded_payload

     def __eq__(self, other: object) -> bool:
         if not isinstance(other, CharsetMatch):
@@ -53,8 +62,8 @@ class CharsetMatch:
         if not isinstance(other, CharsetMatch):
             raise ValueError

-        chaos_difference = abs(self.chaos - other.chaos)  # type: float
+        chaos_difference: float = abs(self.chaos - other.chaos)
-        coherence_difference = abs(self.coherence - other.coherence)  # type: float
+        coherence_difference: float = abs(self.coherence - other.coherence)

         # Bellow 1% difference --> Use Coherence
         if chaos_difference < 0.01 and coherence_difference > 0.02:
@@ -95,7 +104,7 @@ class CharsetMatch:
         return 0.0

     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> TypeCounter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -137,7 +146,7 @@ class CharsetMatch:
         """
         Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
         """
-        also_known_as = []  # type: List[str]
+        also_known_as: List[str] = []
         for u, p in aliases.items():
             if self.encoding == u:
                 also_known_as.append(p)
@@ -227,9 +236,9 @@ class CharsetMatch:
         if self._unicode_ranges is not None:
             return self._unicode_ranges
         # list detected ranges
-        detected_ranges = [
+        detected_ranges: List[Optional[str]] = [
             unicode_range(char) for char in str(self)
-        ]  # type: List[Optional[str]]
+        ]
         # filter and sort
         self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
         return self._unicode_ranges
@@ -280,8 +289,8 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """

-    def __init__(self, results: List[CharsetMatch] = None):
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
-        self._results = sorted(results) if results else []  # type: List[CharsetMatch]
+        self._results: List[CharsetMatch] = sorted(results) if results else []

     def __iter__(self) -> Iterator[CharsetMatch]:
         yield from self._results
@@ -360,17 +369,17 @@ class CliDetectionResult:
         unicode_path: Optional[str],
         is_preferred: bool,
     ):
-        self.path = path  # type: str
+        self.path: str = path
-        self.unicode_path = unicode_path  # type: Optional[str]
+        self.unicode_path: Optional[str] = unicode_path
-        self.encoding = encoding  # type: Optional[str]
+        self.encoding: Optional[str] = encoding
-        self.encoding_aliases = encoding_aliases  # type: List[str]
+        self.encoding_aliases: List[str] = encoding_aliases
-        self.alternative_encodings = alternative_encodings  # type: List[str]
+        self.alternative_encodings: List[str] = alternative_encodings
-        self.language = language  # type: str
+        self.language: str = language
-        self.alphabets = alphabets  # type: List[str]
+        self.alphabets: List[str] = alphabets
-        self.has_sig_or_bom = has_sig_or_bom  # type: bool
+        self.has_sig_or_bom: bool = has_sig_or_bom
-        self.chaos = chaos  # type: float
+        self.chaos: float = chaos
-        self.coherence = coherence  # type: float
+        self.coherence: float = coherence
-        self.is_preferred = is_preferred  # type: bool
+        self.is_preferred: bool = is_preferred

     @property
     def __dict__(self) -> Dict[str, Any]:  # type: ignore
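The expanded typing import above is what lets CharsetMatch.w_counter return a parameterized counter type. A small, self-contained sketch (not part of the diff) of typing.Counter used as a generic annotation:

    from collections import Counter
    from typing import Counter as TypeCounter

    def count_words(text: str) -> TypeCounter[str]:
        # collections.Counter instances satisfy the typing.Counter[str] annotation
        return Counter(text.split())

    print(count_words("a b a"))  # Counter({'a': 2, 'b': 1})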
@@ -1,4 +1,6 @@
 try:
+    # WARNING: unicodedata2 support is going to be removed in 3.0
+    # Python is quickly catching up.
     import unicodedata2 as unicodedata
 except ImportError:
     import unicodedata  # type: ignore[no-redef]
@@ -9,9 +11,9 @@ from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache
 from re import findall
-from typing import List, Optional, Set, Tuple, Union
+from typing import Generator, List, Optional, Set, Tuple, Union

-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder

 from .constant import (
     ENCODING_MARKS,
@@ -26,7 +28,7 @@ from .constant import (
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_accentuated(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return (
@@ -41,11 +43,11 @@ def is_accentuated(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def remove_accent(character: str) -> str:
-    decomposed = unicodedata.decomposition(character)  # type: str
+    decomposed: str = unicodedata.decomposition(character)
     if not decomposed:
         return character

-    codes = decomposed.split(" ")  # type: List[str]
+    codes: List[str] = decomposed.split(" ")

     return chr(int(codes[0], 16))

@@ -55,7 +57,7 @@ def unicode_range(character: str) -> Optional[str]:
     """
     Retrieve the Unicode range official name from a single character.
     """
-    character_ord = ord(character)  # type: int
+    character_ord: int = ord(character)

     for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
         if character_ord in ord_range:
@@ -67,12 +69,13 @@ def unicode_range(character: str) -> Optional[str]:
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_latin(character: str) -> bool:
     try:
-        description = unicodedata.name(character)  # type: str
+        description: str = unicodedata.name(character)
     except ValueError:
         return False
     return "LATIN" in description


+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_ascii(character: str) -> bool:
     try:
         character.encode("ascii")
@@ -83,12 +86,12 @@ def is_ascii(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_punctuation(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     if "P" in character_category:
         return True

-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -98,12 +101,12 @@ def is_punctuation(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_symbol(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     if "S" in character_category or "N" in character_category:
         return True

-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -113,7 +116,7 @@ def is_symbol(character: str) -> bool:

 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_emoticon(character: str) -> bool:
-    character_range = unicode_range(character)  # type: Optional[str]
+    character_range: Optional[str] = unicode_range(character)

     if character_range is None:
         return False
@@ -126,7 +129,7 @@ def is_separator(character: str) -> bool:
     if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
         return True

-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     return "Z" in character_category

@@ -137,7 +140,7 @@ def is_case_variable(character: str) -> bool:


 def is_private_use_only(character: str) -> bool:
-    character_category = unicodedata.category(character)  # type: str
+    character_category: str = unicodedata.category(character)

     return character_category == "Co"

@@ -197,6 +200,17 @@ def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+    return (
+        character.isspace() is False  # includes \n \t \r \v
+        and character.isprintable() is False
+        and character != "\x1A"  # Why? Its the ASCII substitute character.
+        and character != "\ufeff"  # bug discovered in Python,
+        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+    )
+
+
 def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
     """
     Extract using ASCII-only decoder any specified encoding in the first n-bytes.
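The is_unprintable helper added above is a cached per-character predicate. A rough usage sketch, assuming charset-normalizer 2.1.x is installed and the helper is imported from its utils module:

    from charset_normalizer.utils import is_unprintable

    print(is_unprintable("\x05"))   # True: a control character is unprintable
    print(is_unprintable("\n"))     # False: whitespace is tolerated
    print(is_unprintable("\x1a"))   # False: the ASCII substitute character is explicitly allowed
    print(is_unprintable("a"))      # False: ordinary printable text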
@@ -204,12 +218,12 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     if not isinstance(sequence, bytes):
         raise TypeError

-    seq_len = len(sequence)  # type: int
+    seq_len: int = len(sequence)

-    results = findall(
+    results: List[str] = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
         sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
-    )  # type: List[str]
+    )

     if len(results) == 0:
         return None
@@ -217,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")

+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -242,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )

@@ -253,7 +270,7 @@ def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
     """

     for iana_encoding in ENCODING_MARKS:
-        marks = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]
+        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

         if isinstance(marks, bytes):
             marks = [marks]
@@ -272,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")

+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
@@ -283,10 +303,10 @@ def iana_name(cp_name: str, strict: bool = True) -> str:


 def range_scan(decoded_sequence: str) -> List[str]:
-    ranges = set()  # type: Set[str]
+    ranges: Set[str] = set()

     for character in decoded_sequence:
-        character_range = unicode_range(character)  # type: Optional[str]
+        character_range: Optional[str] = unicode_range(character)

         if character_range is None:
             continue
@@ -301,16 +321,20 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0

-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module(
+        "encodings.{}".format(iana_name_a)
+    ).IncrementalDecoder
+    decoder_b = importlib.import_module(
+        "encodings.{}".format(iana_name_b)
+    ).IncrementalDecoder

-    id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
+    id_a: IncrementalDecoder = decoder_a(errors="ignore")
-    id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder
+    id_b: IncrementalDecoder = decoder_b(errors="ignore")

-    character_match_count = 0  # type: int
+    character_match_count: int = 0

     for i in range(255):
-        to_be_decoded = bytes([i])  # type: bytes
+        to_be_decoded: bytes = bytes([i])
         if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
             character_match_count += 1

@@ -340,3 +364,61 @@ def set_logging_handler(
     handler = logging.StreamHandler()
     handler.setFormatter(logging.Formatter(format_string))
     logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: Optional[str] = None,
+) -> Generator[str, None, None]:
+
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]
+            if not chunk:
+                break
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+
+                chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break
+
+            yield chunk
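The new cut_sequence_chunks generator factors the chunk-slicing loop into a reusable helper. A hedged sketch of driving it by hand, with illustrative arguments (these are not from the commit; the library normally builds them internally):

    from charset_normalizer.utils import cut_sequence_chunks

    payload = "Bonjour, ceci est un essai accentué.".encode("cp1252")

    chunks = list(
        cut_sequence_chunks(
            sequences=payload,
            encoding_iana="cp1252",
            offsets=range(0, len(payload), 16),  # 16-byte windows over the payload
            chunk_size=16,
            bom_or_sig_available=False,
            strip_sig_or_bom=False,
            sig_payload=b"",
            is_multi_byte_decoder=False,
        )
    )
    print(chunks)  # a few decoded 16-character slices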
@@ -2,5 +2,5 @@
 Expose version
 """

-__version__ = "2.0.12"
+__version__ = "2.1.1"
 VERSION = __version__.split(".")