Mirror of https://github.com/Tautulli/Tautulli.git, synced 2025-07-15 01:32:57 -07:00
Add charset_normalizer-2.0.7

commit 2f1a08009f (parent 4c25cc3cc2)

13 changed files with 4314 additions and 0 deletions
47  lib/charset_normalizer/__init__.py  Normal file

@@ -0,0 +1,47 @@
# -*- coding: utf_8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, this package tries to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.

Basic usage:
   >>> from charset_normalizer import from_bytes
   >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
   >>> best_guess = results.best()
   >>> str(best_guess)
   'Bсеки човек има право на образование. Oбразованието!'

Other methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
    CharsetDetector,
    CharsetDoctor,
    CharsetNormalizerMatch,
    CharsetNormalizerMatches,
    detect,
)
from .models import CharsetMatch, CharsetMatches
from .version import VERSION, __version__

__all__ = (
    "from_fp",
    "from_path",
    "from_bytes",
    "normalize",
    "detect",
    "CharsetMatch",
    "CharsetMatches",
    "CharsetNormalizerMatch",
    "CharsetNormalizerMatches",
    "CharsetDetector",
    "CharsetDoctor",
    "__version__",
    "VERSION",
)
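Beyond the documented from_bytes flow, the .legacy re-exports above let the package stand in for chardet. A minimal sketch of that drop-in usage, assuming the vendored package is importable (Tautulli puts lib/ on sys.path) and that detect() returns a chardet-style result dict:

# Hedged sketch, not part of the diff: the chardet-compatible entry point.
from charset_normalizer import detect

payload = "Bсеки човек има право на образование.".encode("utf_8")
result = detect(payload)  # assumed chardet-style dict with 'encoding', 'language', 'confidence'
print(result["encoding"])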
528  lib/charset_normalizer/api.py  Normal file

@@ -0,0 +1,528 @@
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

try:
    from os import PathLike
except ImportError:  # pragma: no cover
    PathLike = str  # type: ignore

import logging

from .cd import (
    coherence_ratio,
    encoding_languages,
    mb_encoding_languages,
    merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
    any_specified_encoding,
    iana_name,
    identify_sig_or_bom,
    is_cp_similar,
    is_multi_byte_encoding,
    should_strip_sig_or_bom,
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)


def from_bytes(
    sequences: bytes,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: List[str] = None,
    cp_exclusion: List[str] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
    and will give up on a particular code page after 20% measured mess. These criteria are customizable at will.

    The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
    but never takes it for granted. It can improve performance.

    You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG from the payload/sequence every time, except for UTF-16 and UTF-32.
    """

    if not isinstance(sequences, (bytearray, bytes)):
        raise TypeError(
            "Expected object of type bytes or bytearray, got: {0}".format(
                type(sequences)
            )
        )

    if not explain:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)

    length = len(sequences)  # type: int

    if length == 0:
        logger.warning(
            "Given content is empty, stopping the process very early, returning empty utf_8 str match"
        )
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:
        logger.warning(
            "cp_isolation is set. use this flag for debugging purpose. "
            "limited list of encoding allowed : %s.",
            ", ".join(cp_isolation),
        )
        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
    else:
        cp_isolation = []

    if cp_exclusion is not None:
        logger.warning(
            "cp_exclusion is set. use this flag for debugging purpose. "
            "limited list of encoding excluded : %s.",
            ", ".join(cp_exclusion),
        )
        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
    else:
        cp_exclusion = []

    if length <= (chunk_size * steps):
        logger.warning(
            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
            steps,
            chunk_size,
            length,
        )
        steps = 1
        chunk_size = length

    if steps > 1 and length / steps < chunk_size:
        chunk_size = int(length / steps)

    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool

    if is_too_small_sequence:
        logger.warning(
            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
                length
            )
        )
    elif is_too_large_sequence:
        logger.info(
            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
                length
            )
        )

    prioritized_encodings = []  # type: List[str]

    specified_encoding = (
        any_specified_encoding(sequences) if preemptive_behaviour is True else None
    )  # type: Optional[str]

    if specified_encoding is not None:
        prioritized_encodings.append(specified_encoding)
        logger.info(
            "Detected declarative mark in sequence. Priority +1 given for %s.",
            specified_encoding,
        )

    tested = set()  # type: Set[str]
    tested_but_hard_failure = []  # type: List[str]
    tested_but_soft_failure = []  # type: List[str]

    fallback_ascii = None  # type: Optional[CharsetMatch]
    fallback_u8 = None  # type: Optional[CharsetMatch]
    fallback_specified = None  # type: Optional[CharsetMatch]

    results = CharsetMatches()  # type: CharsetMatches

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.info(
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:

        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload = None  # type: Optional[str]
        bom_or_sig_available = sig_encoding == encoding_iana  # type: bool
        strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )  # type: bool

        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
            logger.info(
                "Encoding %s won't be tested as-is because it requires a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana)  # type: bool
        except (ModuleNotFoundError, ImportError):
            logger.debug(
                "Encoding %s does not provide an IncrementalDecoder", encoding_iana
            )
            continue

        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.warning(
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue

        similar_soft_failure_test = False  # type: bool

        for encoding_soft_failed in tested_but_soft_failure:
            if is_cp_similar(encoding_iana, encoding_soft_failed):
                similar_soft_failure_test = True
                break

        if similar_soft_failure_test:
            logger.warning(
                "%s is deemed too similar to code page %s and was considered unsuited already. Continuing!",
                encoding_iana,
                encoding_soft_failed,
            )
            continue

        r_ = range(
            0 if bom_or_sig_available is False else len(sig_payload),
            length,
            int(length / steps),
        )

        multi_byte_bonus = (
            is_multi_byte_decoder
            and decoded_payload is not None
            and len(decoded_payload) < length
        )  # type: bool

        if multi_byte_bonus:
            logger.info(
                "Code page %s is a multi byte encoding table and it appears that at least one character "
                "was encoded using n-bytes.",
                encoding_iana,
            )

        max_chunk_gave_up = int(len(r_) / 4)  # type: int

        if max_chunk_gave_up < 2:
            max_chunk_gave_up = 2

        early_stop_count = 0  # type: int

        md_chunks = []  # type: List[str]
        md_ratios = []

        for i in r_:
            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

                chunk_partial_size_chk = (
                    16 if chunk_size > 16 else chunk_size
                )  # type: int

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j : i + chunk_size]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            md_chunks.append(chunk)

            md_ratios.append(mess_ratio(chunk, threshold))

            if md_ratios[-1] >= threshold:
                early_stop_count += 1

            if (early_stop_count >= max_chunk_gave_up) or (
                bom_or_sig_available and strip_sig_or_bom is False
            ):
                break

        if md_ratios:
            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
        else:
            mean_mess_ratio = 0.0

        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.warning(
                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                "Computed mean chaos is %f %%.",
                encoding_iana,
                early_stop_count,
                round(mean_mess_ratio * 100, ndigits=3),
            )
            # Preparing those fallbacks in case we got nothing.
            if encoding_iana in ["ascii", "utf_8", specified_encoding]:
                fallback_entry = CharsetMatch(
                    sequences, encoding_iana, threshold, False, [], decoded_payload
                )
                if encoding_iana == specified_encoding:
                    fallback_specified = fallback_entry
                elif encoding_iana == "ascii":
                    fallback_ascii = fallback_entry
                else:
                    fallback_u8 = fallback_entry
            continue

        logger.info(
            "%s passed initial chaos probing. Mean measured chaos is %f %%",
            encoding_iana,
            round(mean_mess_ratio * 100, ndigits=3),
        )

        if not is_multi_byte_decoder:
            target_languages = encoding_languages(encoding_iana)  # type: List[str]
        else:
            target_languages = mb_encoding_languages(encoding_iana)

        if target_languages:
            logger.info(
                "{} should target any language(s) of {}".format(
                    encoding_iana, str(target_languages)
                )
            )

        cd_ratios = []

        for chunk in md_chunks:
            chunk_languages = coherence_ratio(
                chunk, 0.1, ",".join(target_languages) if target_languages else None
            )

            cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.info(
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                )
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.info(
                "%s is most likely the one. Stopping the process.", encoding_iana
            )
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:
            logger.info(
                "%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
                encoding_iana,
            )
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.warning(
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback."
            )

        if fallback_specified:
            logger.warning(
                "%s will be used as a fallback match", fallback_specified.encoding
            )
            results.append(fallback_specified)
        elif (
            (fallback_u8 and fallback_ascii is None)
            or (
                fallback_u8
                and fallback_ascii
                and fallback_u8.fingerprint != fallback_ascii.fingerprint
            )
            or (fallback_u8 is not None)
        ):
            logger.warning("utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.warning("ascii will be used as a fallback match")
            results.append(fallback_ascii)

    return results


def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: List[str] = None,
    cp_exclusion: List[str] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Same thing as the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    """
    return from_bytes(
        fp.read(),
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
        explain,
    )


def from_path(
    path: PathLike,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: List[str] = None,
    cp_exclusion: List[str] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
) -> CharsetMatches:
    """
    Same thing as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
    Can raise IOError.
    """
    with open(path, "rb") as fp:
        return from_fp(
            fp,
            steps,
            chunk_size,
            threshold,
            cp_isolation,
            cp_exclusion,
            preemptive_behaviour,
            explain,
        )


def normalize(
    path: PathLike,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: List[str] = None,
    cp_exclusion: List[str] = None,
    preemptive_behaviour: bool = True,
) -> CharsetMatch:
    """
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    """
    results = from_path(
        path,
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
    )

    filename = basename(path)
    target_extensions = list(splitext(filename))

    if len(results) == 0:
        raise IOError(
            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
                filename
            )
        )

    result = results.best()

    target_extensions[0] += "-" + result.encoding  # type: ignore

    with open(
        "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
    ) as fp:
        fp.write(result.output())  # type: ignore

    return result  # type: ignore
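The three entry points above simply chain into one another (from_path opens the file, from_fp reads it, from_bytes does the work). A short sketch of the typical call, using a hypothetical ./legacy.txt path:

# Hedged sketch, not part of the diff.
from charset_normalizer import from_path

matches = from_path("./legacy.txt")  # hypothetical path
best = matches.best()                # CharsetMatch, or None when nothing fits
if best is not None:
    print(best.encoding)             # IANA-style codec name, e.g. "cp1252"
    print(str(best))                 # the payload decoded to a str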
1244  lib/charset_normalizer/assets/__init__.py  Normal file

File diff suppressed because it is too large
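The suppressed assets module defines FREQUENCIES, which cd.py below iterates via FREQUENCIES.items(). Its assumed shape, inferred from that usage, is a mapping from language name to that language's characters ordered from most to least frequent:

# Assumed shape of FREQUENCIES (illustrative values only; the real table is the 1244-line file above).
FREQUENCIES = {
    "English": ["e", "a", "t", "i", "o", "n", "s", "r"],
    "Russian": ["о", "а", "е", "и", "н", "т", "с", "р"],
}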
341  lib/charset_normalizer/cd.py  Normal file

@@ -0,0 +1,341 @@
import importlib
from codecs import IncrementalDecoder
from collections import Counter, OrderedDict
from functools import lru_cache
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return associated unicode ranges in a single byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder  # type: ignore

    p = decoder(errors="ignore")  # type: IncrementalDecoder
    seen_ranges = {}  # type: Dict[str, int]
    character_count = 0  # type: int

    for i in range(0x40, 0xFF):
        chunk = p.decode(bytes([i]))  # type: str

        if chunk:
            character_range = unicode_range(chunk)  # type: Optional[str]

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return inferred languages used with a unicode range.
    """
    languages = []  # type: List[str]

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    unicode_ranges = encoding_unicode_range(iana_name)  # type: List[str]
    primary_range = None  # type: Optional[str]

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)


@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
    This function does the correspondence.
    """
    if (
        iana_name.startswith("shift_")
        or iana_name.startswith("iso2022_jp")
        or iana_name.startswith("euc_j")
        or iana_name == "cp932"
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages associated with the given characters.
    """
    languages = []  # type: List[Tuple[str, float]]

    source_have_accents = False  # type: bool

    for character in characters:
        if is_accentuated(character):
            source_have_accents = True
            break

    for language, language_characters in FREQUENCIES.items():

        target_have_accents = False  # type: bool
        target_pure_latin = True  # type: bool

        for language_character in language_characters:
            if target_have_accents is False and is_accentuated(language_character):
                target_have_accents = True
            if target_pure_latin is True and is_latin(language_character) is False:
                target_pure_latin = False

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count = len(language_characters)  # type: int

        character_match_count = len(
            [c for c in language_characters if c in characters]
        )  # type: int

        ratio = character_match_count / character_count  # type: float

        if ratio >= 0.2:
            languages.append((language, ratio))

    languages = sorted(languages, key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]


def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine whether an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0.0 (absolutely no correspondence) and 1.0 (near perfect fit).
    Beware that this function is not strict on the match in order to ease detection. (Meaning a close match counts as 1.0.)
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    character_approved_count = 0  # type: int

    for character in ordered_characters:
        if character not in FREQUENCIES[language]:
            continue

        characters_before_source = FREQUENCIES[language][
            0 : FREQUENCIES[language].index(character)
        ]  # type: List[str]
        characters_after_source = FREQUENCIES[language][
            FREQUENCIES[language].index(character) :
        ]  # type: List[str]

        characters_before = ordered_characters[
            0 : ordered_characters.index(character)
        ]  # type: List[str]
        characters_after = ordered_characters[
            ordered_characters.index(character) :
        ]  # type: List[str]

        before_match_count = [
            e in characters_before for e in characters_before_source
        ].count(
            True
        )  # type: int
        after_match_count = [
            e in characters_after for e in characters_after_source
        ].count(
            True
        )  # type: int

        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)


def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
    one containing the Latin letters and the other the Hebrew ones.
    """
    layers = OrderedDict()  # type: Dict[str, str]

    for character in decoded_sequence:
        if character.isalpha() is False:
            continue

        character_range = unicode_range(character)  # type: Optional[str]

        if character_range is None:
            continue

        layer_target_range = None  # type: Optional[str]

        for discovered_range in layers:
            if (
                is_suspiciously_successive_range(discovered_range, character_range)
                is False
            ):
                layer_target_range = discovered_range
                break

        if layer_target_range is None:
            layer_target_range = character_range

        if layer_target_range not in layers:
            layers[layer_target_range] = character.lower()
            continue

        layers[layer_target_range] += character.lower()

    return list(layers.values())


def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merges results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
    merge = []  # type: CoherenceMatches

    for result in results:
        for sub_result in result:
            language, ratio = sub_result
            if language not in per_language_ratios:
                per_language_ratios[language] = [ratio]
                continue
            per_language_ratios[language].append(ratio)

    for language in per_language_ratios:
        merge.append(
            (
                language,
                round(
                    sum(per_language_ratios[language])
                    / len(per_language_ratios[language]),
                    4,
                ),
            )
        )

    return sorted(merge, key=lambda x: x[1], reverse=True)


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
    A layer = character extraction by alphabets/ranges.
    """

    results = []  # type: List[Tuple[str, float]]
    lg_inclusion_list = []  # type: List[str]
    ignore_non_latin = False  # type: bool

    sufficient_match_count = 0  # type: int

    if lg_inclusion is not None:
        lg_inclusion_list = lg_inclusion.split(",")

    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies = Counter(layer)  # type: Counter
        most_common = sequence_frequencies.most_common()

        character_count = sum([o for c, o in most_common])  # type: int

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered = [c for c, o in most_common]  # type: List[str]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio = characters_popularity_compare(
                language, popular_character_ordered
            )  # type: float

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(results, key=lambda x: x[1], reverse=True)
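coherence_ratio above is the piece api.py consumes: it scores candidate languages for each alphabet layer of a decoded chunk. A minimal sketch, assuming the vendored package is importable:

# Hedged sketch, not part of the diff.
from charset_normalizer.cd import coherence_ratio

# CoherenceMatches: a list of (language, ratio) tuples, best match first.
scores = coherence_ratio("Bсеки човек има право на образование")
print(scores)  # e.g. [("Bulgarian", 0.9), ("Russian", 0.8)] -- illustrative values only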
0  lib/charset_normalizer/cli/__init__.py  Normal file
291  lib/charset_normalizer/cli/normalizer.py  Normal file

@@ -0,0 +1,291 @@
import argparse
import sys
from json import dumps
from os.path import abspath
from platform import python_version
from typing import List

from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__


def query_yes_no(question: str, default: str = "yes") -> bool:
    """Ask a yes/no question via input() and return the answer.

    "question" is a string that is presented to the user.
    "default" is the presumed answer if the user just hits <Enter>.
    It must be "yes" (the default), "no" or None (meaning
    an answer is required of the user).

    The "answer" return value is True for "yes" or False for "no".

    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
    """
    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
    if default is None:
        prompt = " [y/n] "
    elif default == "yes":
        prompt = " [Y/n] "
    elif default == "no":
        prompt = " [y/N] "
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(question + prompt)
        choice = input().lower()
        if default is not None and choice == "":
            return valid[default]
        elif choice in valid:
            return valid[choice]
        else:
            sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")


def cli_detect(argv: List[str] = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser
    :param argv:
    :return: 0 if everything is fine, anything else equals trouble
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.1,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {}".format(
            __version__, python_version()
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    if args.replace is True and args.normalize is False:
        print("Use --replace in addition to --normalize only.", file=sys.stderr)
        return 1

    if args.force is True and args.replace is False:
        print("Use --force in addition to --replace only.", file=sys.stderr)
        return 1

    if args.threshold < 0.0 or args.threshold > 1.0:
        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
        return 1

    x_ = []

    for my_file in args.files:

        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)

        best_guess = matches.best()

        if best_guess is None:
            print(
                'Unable to identify originating encoding for "{}". {}'.format(
                    my_file.name,
                    "Maybe try increasing maximum amount of chaos."
                    if args.threshold < 1.0
                    else "",
                ),
                file=sys.stderr,
            )
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    None,
                    [],
                    [],
                    "Unknown",
                    [],
                    False,
                    1.0,
                    0.0,
                    None,
                    True,
                )
            )
        else:
            x_.append(
                CliDetectionResult(
                    abspath(my_file.name),
                    best_guess.encoding,
                    best_guess.encoding_aliases,
                    [
                        cp
                        for cp in best_guess.could_be_from_charset
                        if cp != best_guess.encoding
                    ],
                    best_guess.language,
                    best_guess.alphabets,
                    best_guess.bom,
                    best_guess.percent_chaos,
                    best_guess.percent_coherence,
                    None,
                    True,
                )
            )

            if len(matches) > 1 and args.alternatives:
                for el in matches:
                    if el != best_guess:
                        x_.append(
                            CliDetectionResult(
                                abspath(my_file.name),
                                el.encoding,
                                el.encoding_aliases,
                                [
                                    cp
                                    for cp in el.could_be_from_charset
                                    if cp != el.encoding
                                ],
                                el.language,
                                el.alphabets,
                                el.bom,
                                el.percent_chaos,
                                el.percent_coherence,
                                None,
                                False,
                            )
                        )

            if args.normalize is True:

                if best_guess.encoding.startswith("utf") is True:
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    if my_file.closed is False:
                        my_file.close()
                    continue

                o_ = my_file.name.split(".")  # type: List[str]

                if args.replace is False:
                    o_.insert(-1, best_guess.encoding)
                    if my_file.closed is False:
                        my_file.close()
                else:
                    if (
                        args.force is False
                        and query_yes_no(
                            'Are you sure to normalize "{}" by replacing it ?'.format(
                                my_file.name
                            ),
                            "no",
                        )
                        is False
                    ):
                        if my_file.closed is False:
                            my_file.close()
                        continue

                try:
                    x_[0].unicode_path = abspath("./{}".format(".".join(o_)))

                    with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
                        fp.write(str(best_guess))
                except IOError as e:
                    print(str(e), file=sys.stderr)
                    if my_file.closed is False:
                        my_file.close()
                    return 2

        if my_file.closed is False:
            my_file.close()

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding if el.encoding else "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0


if __name__ == "__main__":
    cli_detect()
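cli_detect takes an optional argv list, so the tool above can be exercised programmatically as well as through the __main__ guard. A sketch, with a hypothetical file path:

# Hedged sketch, not part of the diff.
from charset_normalizer.cli.normalizer import cli_detect

# Equivalent to: python -m charset_normalizer.cli.normalizer -m ./my_subtitle.srt
exit_code = cli_detect(["-m", "./my_subtitle.srt"])  # hypothetical path; prints only the detected charset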
496  lib/charset_normalizer/constant.py  Normal file

@@ -0,0 +1,496 @@
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from collections import OrderedDict
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union

# Contains, for each eligible encoding, its SIG/BOM as a bytes item or a list of bytes items.
ENCODING_MARKS = OrderedDict(
    [
        ("utf_8", BOM_UTF8),
        (
            "utf_7",
            [
                b"\x2b\x2f\x76\x38",
                b"\x2b\x2f\x76\x39",
                b"\x2b\x2f\x76\x2b",
                b"\x2b\x2f\x76\x2f",
                b"\x2b\x2f\x76\x38\x2d",
            ],
        ),
        ("gb18030", b"\x84\x31\x95\x33"),
        ("utf_32", [BOM_UTF32_BE, BOM_UTF32_LE]),
        ("utf_16", [BOM_UTF16_BE, BOM_UTF16_LE]),
    ]
)  # type: Dict[str, Union[bytes, List[bytes]]]

TOO_SMALL_SEQUENCE = 32  # type: int
TOO_BIG_SEQUENCE = int(10e6)  # type: int

UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

UNICODE_RANGES_COMBINED = {
    "Control character": range(0, 31 + 1),
    "Basic Latin": range(32, 127 + 1),
    "Latin-1 Supplement": range(128, 255 + 1),
    "Latin Extended-A": range(256, 383 + 1),
    "Latin Extended-B": range(384, 591 + 1),
    "IPA Extensions": range(592, 687 + 1),
    "Spacing Modifier Letters": range(688, 767 + 1),
    "Combining Diacritical Marks": range(768, 879 + 1),
    "Greek and Coptic": range(880, 1023 + 1),
    "Cyrillic": range(1024, 1279 + 1),
    "Cyrillic Supplement": range(1280, 1327 + 1),
    "Armenian": range(1328, 1423 + 1),
    "Hebrew": range(1424, 1535 + 1),
    "Arabic": range(1536, 1791 + 1),
    "Syriac": range(1792, 1871 + 1),
    "Arabic Supplement": range(1872, 1919 + 1),
    "Thaana": range(1920, 1983 + 1),
    "NKo": range(1984, 2047 + 1),
    "Samaritan": range(2048, 2111 + 1),
    "Mandaic": range(2112, 2143 + 1),
    "Syriac Supplement": range(2144, 2159 + 1),
    "Arabic Extended-A": range(2208, 2303 + 1),
    "Devanagari": range(2304, 2431 + 1),
    "Bengali": range(2432, 2559 + 1),
    "Gurmukhi": range(2560, 2687 + 1),
    "Gujarati": range(2688, 2815 + 1),
    "Oriya": range(2816, 2943 + 1),
    "Tamil": range(2944, 3071 + 1),
    "Telugu": range(3072, 3199 + 1),
    "Kannada": range(3200, 3327 + 1),
    "Malayalam": range(3328, 3455 + 1),
    "Sinhala": range(3456, 3583 + 1),
    "Thai": range(3584, 3711 + 1),
    "Lao": range(3712, 3839 + 1),
    "Tibetan": range(3840, 4095 + 1),
    "Myanmar": range(4096, 4255 + 1),
    "Georgian": range(4256, 4351 + 1),
    "Hangul Jamo": range(4352, 4607 + 1),
    "Ethiopic": range(4608, 4991 + 1),
    "Ethiopic Supplement": range(4992, 5023 + 1),
    "Cherokee": range(5024, 5119 + 1),
    "Unified Canadian Aboriginal Syllabics": range(5120, 5759 + 1),
    "Ogham": range(5760, 5791 + 1),
    "Runic": range(5792, 5887 + 1),
    "Tagalog": range(5888, 5919 + 1),
    "Hanunoo": range(5920, 5951 + 1),
    "Buhid": range(5952, 5983 + 1),
    "Tagbanwa": range(5984, 6015 + 1),
    "Khmer": range(6016, 6143 + 1),
    "Mongolian": range(6144, 6319 + 1),
    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6399 + 1),
    "Limbu": range(6400, 6479 + 1),
    "Tai Le": range(6480, 6527 + 1),
    "New Tai Lue": range(6528, 6623 + 1),
    "Khmer Symbols": range(6624, 6655 + 1),
    "Buginese": range(6656, 6687 + 1),
    "Tai Tham": range(6688, 6831 + 1),
    "Combining Diacritical Marks Extended": range(6832, 6911 + 1),
    "Balinese": range(6912, 7039 + 1),
    "Sundanese": range(7040, 7103 + 1),
    "Batak": range(7104, 7167 + 1),
    "Lepcha": range(7168, 7247 + 1),
    "Ol Chiki": range(7248, 7295 + 1),
    "Cyrillic Extended C": range(7296, 7311 + 1),
    "Sundanese Supplement": range(7360, 7375 + 1),
    "Vedic Extensions": range(7376, 7423 + 1),
    "Phonetic Extensions": range(7424, 7551 + 1),
    "Phonetic Extensions Supplement": range(7552, 7615 + 1),
    "Combining Diacritical Marks Supplement": range(7616, 7679 + 1),
    "Latin Extended Additional": range(7680, 7935 + 1),
    "Greek Extended": range(7936, 8191 + 1),
    "General Punctuation": range(8192, 8303 + 1),
    "Superscripts and Subscripts": range(8304, 8351 + 1),
    "Currency Symbols": range(8352, 8399 + 1),
    "Combining Diacritical Marks for Symbols": range(8400, 8447 + 1),
    "Letterlike Symbols": range(8448, 8527 + 1),
    "Number Forms": range(8528, 8591 + 1),
    "Arrows": range(8592, 8703 + 1),
    "Mathematical Operators": range(8704, 8959 + 1),
    "Miscellaneous Technical": range(8960, 9215 + 1),
    "Control Pictures": range(9216, 9279 + 1),
    "Optical Character Recognition": range(9280, 9311 + 1),
    "Enclosed Alphanumerics": range(9312, 9471 + 1),
    "Box Drawing": range(9472, 9599 + 1),
    "Block Elements": range(9600, 9631 + 1),
    "Geometric Shapes": range(9632, 9727 + 1),
    "Miscellaneous Symbols": range(9728, 9983 + 1),
    "Dingbats": range(9984, 10175 + 1),
    "Miscellaneous Mathematical Symbols-A": range(10176, 10223 + 1),
    "Supplemental Arrows-A": range(10224, 10239 + 1),
    "Braille Patterns": range(10240, 10495 + 1),
    "Supplemental Arrows-B": range(10496, 10623 + 1),
    "Miscellaneous Mathematical Symbols-B": range(10624, 10751 + 1),
    "Supplemental Mathematical Operators": range(10752, 11007 + 1),
    "Miscellaneous Symbols and Arrows": range(11008, 11263 + 1),
    "Glagolitic": range(11264, 11359 + 1),
    "Latin Extended-C": range(11360, 11391 + 1),
    "Coptic": range(11392, 11519 + 1),
    "Georgian Supplement": range(11520, 11567 + 1),
    "Tifinagh": range(11568, 11647 + 1),
    "Ethiopic Extended": range(11648, 11743 + 1),
    "Cyrillic Extended-A": range(11744, 11775 + 1),
    "Supplemental Punctuation": range(11776, 11903 + 1),
    "CJK Radicals Supplement": range(11904, 12031 + 1),
    "Kangxi Radicals": range(12032, 12255 + 1),
    "Ideographic Description Characters": range(12272, 12287 + 1),
    "CJK Symbols and Punctuation": range(12288, 12351 + 1),
    "Hiragana": range(12352, 12447 + 1),
    "Katakana": range(12448, 12543 + 1),
    "Bopomofo": range(12544, 12591 + 1),
    "Hangul Compatibility Jamo": range(12592, 12687 + 1),
    "Kanbun": range(12688, 12703 + 1),
    "Bopomofo Extended": range(12704, 12735 + 1),
    "CJK Strokes": range(12736, 12783 + 1),
    "Katakana Phonetic Extensions": range(12784, 12799 + 1),
    "Enclosed CJK Letters and Months": range(12800, 13055 + 1),
    "CJK Compatibility": range(13056, 13311 + 1),
    "CJK Unified Ideographs Extension A": range(13312, 19903 + 1),
    "Yijing Hexagram Symbols": range(19904, 19967 + 1),
    "CJK Unified Ideographs": range(19968, 40959 + 1),
    "Yi Syllables": range(40960, 42127 + 1),
    "Yi Radicals": range(42128, 42191 + 1),
    "Lisu": range(42192, 42239 + 1),
    "Vai": range(42240, 42559 + 1),
    "Cyrillic Extended-B": range(42560, 42655 + 1),
    "Bamum": range(42656, 42751 + 1),
    "Modifier Tone Letters": range(42752, 42783 + 1),
    "Latin Extended-D": range(42784, 43007 + 1),
    "Syloti Nagri": range(43008, 43055 + 1),
    "Common Indic Number Forms": range(43056, 43071 + 1),
    "Phags-pa": range(43072, 43135 + 1),
    "Saurashtra": range(43136, 43231 + 1),
    "Devanagari Extended": range(43232, 43263 + 1),
    "Kayah Li": range(43264, 43311 + 1),
    "Rejang": range(43312, 43359 + 1),
    "Hangul Jamo Extended-A": range(43360, 43391 + 1),
    "Javanese": range(43392, 43487 + 1),
    "Myanmar Extended-B": range(43488, 43519 + 1),
    "Cham": range(43520, 43615 + 1),
    "Myanmar Extended-A": range(43616, 43647 + 1),
    "Tai Viet": range(43648, 43743 + 1),
    "Meetei Mayek Extensions": range(43744, 43775 + 1),
    "Ethiopic Extended-A": range(43776, 43823 + 1),
    "Latin Extended-E": range(43824, 43887 + 1),
    "Cherokee Supplement": range(43888, 43967 + 1),
    "Meetei Mayek": range(43968, 44031 + 1),
    "Hangul Syllables": range(44032, 55215 + 1),
    "Hangul Jamo Extended-B": range(55216, 55295 + 1),
    "High Surrogates": range(55296, 56191 + 1),
    "High Private Use Surrogates": range(56192, 56319 + 1),
    "Low Surrogates": range(56320, 57343 + 1),
    "Private Use Area": range(57344, 63743 + 1),
    "CJK Compatibility Ideographs": range(63744, 64255 + 1),
    "Alphabetic Presentation Forms": range(64256, 64335 + 1),
    "Arabic Presentation Forms-A": range(64336, 65023 + 1),
    "Variation Selectors": range(65024, 65039 + 1),
    "Vertical Forms": range(65040, 65055 + 1),
    "Combining Half Marks": range(65056, 65071 + 1),
    "CJK Compatibility Forms": range(65072, 65103 + 1),
    "Small Form Variants": range(65104, 65135 + 1),
    "Arabic Presentation Forms-B": range(65136, 65279 + 1),
    "Halfwidth and Fullwidth Forms": range(65280, 65519 + 1),
    "Specials": range(65520, 65535 + 1),
    "Linear B Syllabary": range(65536, 65663 + 1),
    "Linear B Ideograms": range(65664, 65791 + 1),
    "Aegean Numbers": range(65792, 65855 + 1),
    "Ancient Greek Numbers": range(65856, 65935 + 1),
    "Ancient Symbols": range(65936, 65999 + 1),
    "Phaistos Disc": range(66000, 66047 + 1),
    "Lycian": range(66176, 66207 + 1),
    "Carian": range(66208, 66271 + 1),
    "Coptic Epact Numbers": range(66272, 66303 + 1),
    "Old Italic": range(66304, 66351 + 1),
    "Gothic": range(66352, 66383 + 1),
    "Old Permic": range(66384, 66431 + 1),
    "Ugaritic": range(66432, 66463 + 1),
    "Old Persian": range(66464, 66527 + 1),
    "Deseret": range(66560, 66639 + 1),
    "Shavian": range(66640, 66687 + 1),
    "Osmanya": range(66688, 66735 + 1),
    "Osage": range(66736, 66815 + 1),
    "Elbasan": range(66816, 66863 + 1),
    "Caucasian Albanian": range(66864, 66927 + 1),
    "Linear A": range(67072, 67455 + 1),
    "Cypriot Syllabary": range(67584, 67647 + 1),
    "Imperial Aramaic": range(67648, 67679 + 1),
    "Palmyrene": range(67680, 67711 + 1),
    "Nabataean": range(67712, 67759 + 1),
    "Hatran": range(67808, 67839 + 1),
    "Phoenician": range(67840, 67871 + 1),
    "Lydian": range(67872, 67903 + 1),
    "Meroitic Hieroglyphs": range(67968, 67999 + 1),
    "Meroitic Cursive": range(68000, 68095 + 1),
    "Kharoshthi": range(68096, 68191 + 1),
    "Old South Arabian": range(68192, 68223 + 1),
    "Old North Arabian": range(68224, 68255 + 1),
    "Manichaean": range(68288, 68351 + 1),
    "Avestan": range(68352, 68415 + 1),
    "Inscriptional Parthian": range(68416, 68447 + 1),
    "Inscriptional Pahlavi": range(68448, 68479 + 1),
    "Psalter Pahlavi": range(68480, 68527 + 1),
    "Old Turkic": range(68608, 68687 + 1),
    "Old Hungarian": range(68736, 68863 + 1),
    "Rumi Numeral Symbols": range(69216, 69247 + 1),
    "Brahmi": range(69632, 69759 + 1),
    "Kaithi": range(69760, 69839 + 1),
    "Sora Sompeng": range(69840, 69887 + 1),
    "Chakma": range(69888, 69967 + 1),
    "Mahajani": range(69968, 70015 + 1),
    "Sharada": range(70016, 70111 + 1),
    "Sinhala Archaic Numbers": range(70112, 70143 + 1),
    "Khojki": range(70144, 70223 + 1),
    "Multani": range(70272, 70319 + 1),
    "Khudawadi": range(70320, 70399 + 1),
    "Grantha": range(70400, 70527 + 1),
    "Newa": range(70656, 70783 + 1),
    "Tirhuta": range(70784, 70879 + 1),
    "Siddham": range(71040, 71167 + 1),
    "Modi": range(71168, 71263 + 1),
    "Mongolian Supplement": range(71264, 71295 + 1),
    "Takri": range(71296, 71375 + 1),
    "Ahom": range(71424, 71487 + 1),
    "Warang Citi": range(71840, 71935 + 1),
    "Zanabazar Square": range(72192, 72271 + 1),
    "Soyombo": range(72272, 72367 + 1),
    "Pau Cin Hau": range(72384, 72447 + 1),
    "Bhaiksuki": range(72704, 72815 + 1),
    "Marchen": range(72816, 72895 + 1),
    "Masaram Gondi": range(72960, 73055 + 1),
    "Cuneiform": range(73728, 74751 + 1),
    "Cuneiform Numbers and Punctuation": range(74752, 74879 + 1),
    "Early Dynastic Cuneiform": range(74880, 75087 + 1),
    "Egyptian Hieroglyphs": range(77824, 78895 + 1),
    "Anatolian Hieroglyphs": range(82944, 83583 + 1),
    "Bamum Supplement": range(92160, 92735 + 1),
    "Mro": range(92736, 92783 + 1),
    "Bassa Vah": range(92880, 92927 + 1),
    "Pahawh Hmong": range(92928, 93071 + 1),
    "Miao": range(93952, 94111 + 1),
    "Ideographic Symbols and Punctuation": range(94176, 94207 + 1),
    "Tangut": range(94208, 100351 + 1),
    "Tangut Components": range(100352, 101119 + 1),
    "Kana Supplement": range(110592, 110847 + 1),
    "Kana Extended-A": range(110848, 110895 + 1),
    "Nushu": range(110960, 111359 + 1),
    "Duployan": range(113664, 113823 + 1),
    "Shorthand Format Controls": range(113824, 113839 + 1),
    "Byzantine Musical Symbols": range(118784, 119039 + 1),
    "Musical Symbols": range(119040, 119295 + 1),
    "Ancient Greek Musical Notation": range(119296, 119375 + 1),
    "Tai Xuan Jing Symbols": range(119552, 119647 + 1),
    "Counting Rod Numerals": range(119648, 119679 + 1),
    "Mathematical Alphanumeric Symbols": range(119808, 120831 + 1),
    "Sutton SignWriting": range(120832, 121519 + 1),
    "Glagolitic Supplement": range(122880, 122927 + 1),
    "Mende Kikakui": range(124928, 125151 + 1),
    "Adlam": range(125184, 125279 + 1),
    "Arabic Mathematical Alphabetic Symbols": range(126464, 126719 + 1),
    "Mahjong Tiles": range(126976, 127023 + 1),
    "Domino Tiles": range(127024, 127135 + 1),
    "Playing Cards": range(127136, 127231 + 1),
    "Enclosed Alphanumeric Supplement": range(127232, 127487 + 1),
    "Enclosed Ideographic Supplement": range(127488, 127743 + 1),
    "Miscellaneous Symbols and Pictographs": range(127744, 128511 + 1),
    "Emoticons range(Emoji)": range(128512, 128591 + 1),
    "Ornamental Dingbats": range(128592, 128639 + 1),
    "Transport and Map Symbols": range(128640, 128767 + 1),
    "Alchemical Symbols": range(128768, 128895 + 1),
    "Geometric Shapes Extended": range(128896, 129023 + 1),
    "Supplemental Arrows-C": range(129024, 129279 + 1),
    "Supplemental Symbols and Pictographs": range(129280, 129535 + 1),
    "CJK Unified Ideographs Extension B": range(131072, 173791 + 1),
    "CJK Unified Ideographs Extension C": range(173824, 177983 + 1),
    "CJK Unified Ideographs Extension D": range(177984, 178207 + 1),
    "CJK Unified Ideographs Extension E": range(178208, 183983 + 1),
    "CJK Unified Ideographs Extension F": range(183984, 191471 + 1),
    "CJK Compatibility Ideographs Supplement": range(194560, 195103 + 1),
    "Tags": range(917504, 917631 + 1),
    "Variation Selectors Supplement": range(917760, 917999 + 1),
}  # type: Dict[str, range]

UNICODE_SECONDARY_RANGE_KEYWORD = [
    "Supplement",
    "Extended",
    "Extensions",
    "Modifier",
    "Marks",
    "Punctuation",
    "Symbols",
    "Forms",
    "Operators",
    "Miscellaneous",
    "Drawing",
    "Block",
    "Shapes",
    "Supplemental",
||||||
|
"Tags",
|
||||||
|
] # type: List[str]
|
||||||
|
|
||||||
|
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
    IGNORECASE,
)
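A quick aside, not part of the vendored file: this pattern is what lets the library honor an encoding declared inside the payload itself (HTML meta tags, XML prologs, Python coding lines). An illustrative match:

>>> RE_POSSIBLE_ENCODING_INDICATION.findall('<meta charset="utf-8">')
['utf-8']
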
IANA_SUPPORTED = sorted(
    filter(
        lambda x: x.endswith("_codec") is False
        and x not in {"rot_13", "tactis", "mbcs"},
        list(set(aliases.values())),
    )
)  # type: List[str]

IANA_SUPPORTED_COUNT = len(IANA_SUPPORTED)  # type: int

# pre-computed code pages that are similar, as measured by the function cp_similarity.
IANA_SUPPORTED_SIMILAR = {
    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
    "cp1125": ["cp866"],
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp1258": ["cp1252", "cp1254", "iso8859_9", "latin_1"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
    "cp850": ["cp437", "cp857", "cp858", "cp865"],
    "cp857": ["cp850", "cp858", "cp865"],
    "cp858": ["cp437", "cp850", "cp857", "cp865"],
    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
    "cp866": ["cp1125"],
    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
    "iso8859_11": ["tis_620"],
    "iso8859_13": ["cp1257"],
    "iso8859_14": [
        "iso8859_10",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_15": [
        "cp1252",
        "cp1254",
        "iso8859_10",
        "iso8859_14",
        "iso8859_16",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_16": [
        "iso8859_14",
        "iso8859_15",
        "iso8859_2",
        "iso8859_3",
        "iso8859_9",
        "latin_1",
    ],
    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
    "iso8859_7": ["cp1253"],
    "iso8859_9": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "latin_1",
    ],
    "kz1048": ["cp1251", "ptcp154"],
    "latin_1": [
        "cp1252",
        "cp1254",
        "cp1258",
        "iso8859_10",
        "iso8859_14",
        "iso8859_15",
        "iso8859_16",
        "iso8859_3",
        "iso8859_4",
        "iso8859_9",
    ],
    "mac_iceland": ["mac_roman", "mac_turkish"],
    "mac_roman": ["mac_iceland", "mac_turkish"],
    "mac_turkish": ["mac_iceland", "mac_roman"],
    "ptcp154": ["cp1251", "kz1048"],
    "tis_620": ["iso8859_11"],
}  # type: Dict[str, List[str]]

CHARDET_CORRESPONDENCE = {
    "iso2022_kr": "ISO-2022-KR",
    "iso2022_jp": "ISO-2022-JP",
    "euc_kr": "EUC-KR",
    "tis_620": "TIS-620",
    "utf_32": "UTF-32",
    "euc_jp": "EUC-JP",
    "koi8_r": "KOI8-R",
    "iso8859_1": "ISO-8859-1",
    "iso8859_2": "ISO-8859-2",
    "iso8859_5": "ISO-8859-5",
    "iso8859_6": "ISO-8859-6",
    "iso8859_7": "ISO-8859-7",
    "iso8859_8": "ISO-8859-8",
    "utf_16": "UTF-16",
    "cp855": "IBM855",
    "mac_cyrillic": "MacCyrillic",
    "gb2312": "GB2312",
    "gb18030": "GB18030",
    "cp932": "CP932",
    "cp866": "IBM866",
    "utf_8": "utf-8",
    "utf_8_sig": "UTF-8-SIG",
    "shift_jis": "SHIFT_JIS",
    "big5": "Big5",
    "cp1250": "windows-1250",
    "cp1251": "windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "windows-1253",
    "cp1255": "windows-1255",
    "cp1256": "windows-1256",
    "cp1254": "Windows-1254",
    "cp949": "CP949",
}  # type: Dict[str, str]

COMMON_SAFE_ASCII_CHARACTERS = {
    "<",
    ">",
    "=",
    ":",
    "/",
    "&",
    ";",
    "{",
    "}",
    "[",
    "]",
    ",",
    "|",
    '"',
    "-",
}  # type: Set[str]


KO_NAMES = {"johab", "cp949", "euc_kr"}  # type: Set[str]
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
95
lib/charset_normalizer/legacy.py
Normal file

@@ -0,0 +1,95 @@
import warnings
from typing import Dict, Optional, Union

from .api import from_bytes, from_fp, from_path, normalize
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches


def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names will match chardet's own spelling whenever possible (except for encodings chardet does not support).
    This function is deprecated and should only be used to migrate your project easily; consult the documentation for
    further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    """
    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' since the sig gets stripped in the detection/normalization
    # process, but chardet does return 'utf-8-sig' and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    return {
        "encoding": encoding
        if encoding not in CHARDET_CORRESPONDENCE
        else CHARDET_CORRESPONDENCE[encoding],
        "language": language,
        "confidence": confidence,
    }
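An orientation aside, not part of the vendored file: detect() keeps chardet's dict-shaped result, so existing call sites keep working. Confidence and language vary with the input; the encoding comes back in chardet's spelling via CHARDET_CORRESPONDENCE:

>>> from charset_normalizer import detect
>>> guess = detect("Съешь же ещё этих мягких французских булок.".encode("utf_8"))
>>> sorted(guess.keys())
['confidence', 'encoding', 'language']
>>> guess["encoding"]
'utf-8'
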

class CharsetNormalizerMatch(CharsetMatch):
    pass


class CharsetNormalizerMatches(CharsetMatches):
    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)  # pragma: nocover


class CharsetDetector(CharsetNormalizerMatches):
    pass


class CharsetDoctor(CharsetNormalizerMatches):
    pass
540
lib/charset_normalizer/md.py
Normal file

@@ -0,0 +1,540 @@
from functools import lru_cache
from typing import List, Optional

from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
    is_accentuated,
    is_ascii,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    remove_accent,
    unicode_range,
)


class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement the given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if the given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:
        """
        Permit to reset the plugin to its initial state.
        """
        raise NotImplementedError  # pragma: nocover

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.0; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
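A hypothetical illustration, not part of the vendored file: any subclass honoring this contract is picked up automatically, because mess_ratio() further down instantiates every MessDetectorPlugin.__subclasses__(). A minimal sketch that counts U+FFFD replacement characters:

class ReplacementCharacterPlugin(MessDetectorPlugin):
    # Hypothetical example plugin, for illustration only.
    def __init__(self) -> None:
        self._character_count = 0  # type: int
        self._replacement_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        # Inspect every character.
        return True

    def feed(self, character: str) -> None:
        self._character_count += 1
        if character == "\ufffd":
            self._replacement_count += 1

    def reset(self) -> None:
        self._character_count = 0
        self._replacement_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return self._replacement_count / self._character_count
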
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count = 0  # type: int
        self._symbol_count = 0  # type: int
        self._character_count = 0  # type: int

        self._last_printable_char = None  # type: Optional[str]
        self._frenzy_symbol_in_word = False  # type: bool

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count  # type: float

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count = 0  # type: int
        self._accentuated_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        ratio_of_accentuation = (
            self._accentuated_count / self._character_count
        )  # type: float
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count = 0  # type: int
        self._character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if (
            character.isspace() is False  # includes \n \t \r \v
            and character.isprintable() is False
            and character != "\x1A"  # Why? It's the ASCII substitute character.
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:
        self._unprintable_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count = 0  # type: int
        self._character_count = 0  # type: int

        self._last_latin_character = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if self._last_latin_character is not None:
            if is_accentuated(character) and is_accentuated(self._last_latin_character):
                if character.isupper() and self._last_latin_character.isupper():
                    self._successive_count += 1
                # Worse if it's the same char duplicated with a different accent.
                if remove_accent(character) == remove_accent(
                    self._last_latin_character
                ):
                    self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count = 0  # type: int
        self._character_count = 0  # type: int
        self._last_printable_seen = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a = unicode_range(
            self._last_printable_seen
        )  # type: Optional[str]
        unicode_range_b = unicode_range(character)  # type: Optional[str]

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count  # type: float

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count = 0  # type: int
        self._bad_word_count = 0  # type: int
        self._is_current_word_bad = False  # type: bool
        self._foreign_long_watch = False  # type: bool

        self._character_count = 0  # type: int
        self._bad_character_count = 0  # type: int

        self._buffer = ""  # type: str
        self._buffer_accent_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer = "".join([self._buffer, character])
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and is_latin(character) is False
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length = len(self._buffer)  # type: int

            self._character_count += buffer_length

            if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
                self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "="}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop sign incorrectly when the content does not fit,
    which can be easily detected. Searching for an overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count = 0  # type: int
        self._cjk_character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in ["丅", "丄"]:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf = False  # type: bool

        self._character_count_since_last_sep = 0  # type: int

        self._successive_upper_lower_count = 0  # type: int
        self._successive_upper_lower_count_final = 0  # type: int

        self._character_count = 0  # type: int

        self._last_alpha_seen = None  # type: Optional[str]
        self._current_ascii_only = True  # type: bool

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count


def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

    for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

    # Japanese Exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if range_a_jp_chars or range_b_jp_chars:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if range_a_jp_chars and range_b_jp_chars:
            return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True


@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
    """

    detectors = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]  # type: List[MessDetectorPlugin]

    length = len(decoded_sequence)  # type: int

    mean_mess_ratio = 0.0  # type: float

    if length < 512:
        intermediary_mean_mess_ratio_calc = 32  # type: int
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence, range(0, length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum([dt.ratio for dt in detectors])

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)
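For reference, and not part of the vendored file: plain English text scores 0.0, while visibly mojibake'd text scores above it. Exact values depend on the plugin set:

>>> from charset_normalizer.md import mess_ratio
>>> mess_ratio("This is a clean, ordinary sentence.")
0.0
>>> mess_ratio("ÃÃÃ¢â€Ã©ÃÃ") > 0.0
True
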
393
lib/charset_normalizer/models.py
Normal file

@@ -0,0 +1,393 @@
import warnings
from collections import Counter
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        self._payload = payload  # type: bytes

        self._encoding = guessed_encoding  # type: str
        self._mean_mess_ratio = mean_mess_ratio  # type: float
        self._languages = languages  # type: CoherenceMatches
        self._has_sig_or_bom = has_sig_or_bom  # type: bool
        self._unicode_ranges = None  # type: Optional[List[str]]

        self._leaves = []  # type: List[CharsetMatch]
        self._mean_coherence_ratio = 0.0  # type: float

        self._output_payload = None  # type: Optional[bytes]
        self._output_encoding = None  # type: Optional[str]

        self._string = decoded_payload  # type: Optional[str]

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference = abs(self.chaos - other.chaos)  # type: float
        coherence_difference = abs(self.coherence - other.coherence)  # type: float

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            # When having a tough decision, use the result that decoded as many multi-byte characters as possible.
            if chaos_difference == 0.0 and self.coherence == other.coherence:
                return self.multi_byte_usage > other.multi_byte_usage
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        return 1.0 - len(str(self)) / len(self.raw)

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-Latin language detected, if ANY.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return 0.0

    @property
    def w_counter(self) -> Counter:
        """
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
        )

        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encodings are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as = []  # type: List[str]
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in the decoded sequence.
        Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in the decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # It's either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges = [
            unicode_range(char) for char in str(self)
        ]  # type: List[Optional[str]]
        # filter and sort
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encodings that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in the property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
        Unsupported characters are simply replaced by the encoder, NOT raised as errors.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
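An orientation aside, not part of the vendored file. For single-byte look-alike codecs the winning 'encoding' may be any member of could_be_from_charset, but the decoded text and the re-encoded output stay stable:

>>> from charset_normalizer import from_bytes
>>> match = from_bytes("Comment ça va ?".encode("cp1252")).best()
>>> str(match)
'Comment ça va ?'
>>> match.output()  # re-encoded payload, UTF-8 by default
b'Comment \xc3\xa7a va ?'
>>> match.encoding in match.could_be_from_charset
True
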
class CharsetMatches:
    """
    Container with every CharsetMatch item, ordered by default from the most probable to the least.
    Acts like a list(iterable) but does not implement all related methods.
    """

    def __init__(self, results: List[CharsetMatch] = None):
        self._results = sorted(results) if results else []  # type: List[CharsetMatch]

    def __iter__(self) -> Iterator[CharsetMatch]:
        for result in self._results:
            yield result

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (an alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (to conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match. Strictly equivalent to matches[0].
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, calls the method best(). Kept for BC reasons.
        """
        return self.best()


CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        self.path = path  # type: str
        self.unicode_path = unicode_path  # type: Optional[str]
        self.encoding = encoding  # type: Optional[str]
        self.encoding_aliases = encoding_aliases  # type: List[str]
        self.alternative_encodings = alternative_encodings  # type: List[str]
        self.language = language  # type: str
        self.alphabets = alphabets  # type: List[str]
        self.has_sig_or_bom = has_sig_or_bom  # type: bool
        self.chaos = chaos  # type: float
        self.coherence = coherence  # type: float
        self.is_preferred = is_preferred  # type: bool

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        return dumps(self.__dict__, ensure_ascii=True, indent=4)
0
lib/charset_normalizer/py.typed
Normal file

333
lib/charset_normalizer/utils.py
Normal file

@@ -0,0 +1,333 @@
try:
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description = unicodedata.name(character)  # type: str
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed = unicodedata.decomposition(character)  # type: str
    if not decomposed:
        return character

    codes = decomposed.split(" ")  # type: List[str]

    return chr(int(codes[0], 16))


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord = ord(character)  # type: int

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
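Two sample lookups for reference, not part of the vendored file (behavior follows from UNICODE_RANGES_COMBINED in constant.py):

>>> from charset_normalizer.utils import unicode_range
>>> unicode_range("A")
'Basic Latin'
>>> unicode_range("あ")
'Hiragana'
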

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description = unicodedata.name(character)  # type: str
    except ValueError:
        return False
    return "LATIN" in description


def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    if "P" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    if "S" in character_category or "N" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
        return True

    character_category = unicodedata.category(character)  # type: str

    return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    return "Co" == character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract, using an ASCII-only decoder, any encoding specified in the first n bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len = len(sequence)  # type: int

    results = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
            "ascii", errors="ignore"
        ),
    )  # type: List[str]

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
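For illustration, not part of the vendored file: a declared XML encoding resolves through Python's encodings.aliases table to its canonical codec name:

>>> from charset_normalizer.utils import any_specified_encoding
>>> any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
'latin_1'
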

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract a SIG/BOM from the given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
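For reference, not part of the vendored file, and assuming the UTF-8 entry in ENCODING_MARKS defined earlier in constant.py:

>>> from charset_normalizer.utils import identify_sig_or_bom
>>> identify_sig_or_bom("hello".encode("utf_8_sig"))
('utf_8', b'\xef\xbb\xbf')
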

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name == encoding_alias or cp_name == encoding_iana:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name


def range_scan(decoded_sequence: str) -> List[str]:
    ranges = set()  # type: Set[str]

    for character in decoded_sequence:
        character_range = unicode_range(character)  # type: Optional[str]

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore

    id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
    id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder

    character_match_count = 0  # type: int

    for i in range(0, 255):
        to_be_decoded = bytes([i])  # type: bytes
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )
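For reference, not part of the vendored file: the pre-computed table mirrors cp_similarity's 80% cut-off, so lookups are O(1):

>>> from charset_normalizer.utils import is_cp_similar
>>> is_cp_similar("cp1252", "latin_1")
True
>>> is_cp_similar("cp866", "latin_1")
False
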
6
lib/charset_normalizer/version.py
Normal file

@@ -0,0 +1,6 @@
"""
Expose version
"""

__version__ = "2.0.7"
VERSION = __version__.split(".")