Mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-08-14 02:26:58 -07:00)

Update plexapi==4.17.0

commit f6bffe1850 (parent 3cb71f94a3)
32 changed files with 1224 additions and 966 deletions

The hunks shown below cover the vendored charset_normalizer upgrade (3.4.0 to 3.4.2) that ships with this dependency bump. Most of the changes are mechanical: annotations move to PEP 604/585 syntax (str | None, list[str]) guarded by "from __future__ import annotations", old-style .format() calls become f-strings, and IOError becomes OSError.
charset_normalizer/__init__.py

@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 Charset-Normalizer
 ~~~~~~~~~~~~~~
@@ -19,6 +18,9 @@ at <https://github.com/Ousret/charset_normalizer>.
 :copyright: (c) 2021 by Ahmed TAHRI
 :license: MIT, see LICENSE for more details.
 """
+
+from __future__ import annotations
+
 import logging
 
 from .api import from_bytes, from_fp, from_path, is_binary
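Note: nearly every hunk in this diff follows one pattern: Optional[X], Union[...], List, Dict, Set and Tuple from typing are replaced by PEP 604/585 syntax (X | None, list[str], dict[str, int]), kept compatible with older interpreters by the new "from __future__ import annotations" line. A minimal standalone sketch of why that import makes the new syntax safe (the function and its names are made up):

    # With this __future__ import, annotations are stored as strings and never
    # evaluated at runtime, so "list[str] | None" parses fine on Python 3.7+.
    from __future__ import annotations


    def first_or_none(items: list[str] | None) -> str | None:
        # Runtime logic never touches the annotation objects above.
        return items[0] if items else None


    print(first_or_none(["utf_8", "cp1252"]))  # utf_8
    print(first_or_none(None))                 # None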
charset_normalizer/__main__.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from .cli import cli_detect
 
 if __name__ == "__main__":
charset_normalizer/api.py

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 from os import PathLike
-from typing import BinaryIO, List, Optional, Set, Union
+from typing import BinaryIO
 
 from .cd import (
     coherence_ratio,
@@ -21,8 +23,6 @@ from .utils import (
     should_strip_sig_or_bom,
 )
 
-# Will most likely be controversial
-# logging.addLevelName(TRACE, "TRACE")
 logger = logging.getLogger("charset_normalizer")
 explain_handler = logging.StreamHandler()
 explain_handler.setFormatter(
@@ -31,12 +31,12 @@ explain_handler.setFormatter(
 
 
 def from_bytes(
-    sequences: Union[bytes, bytearray],
+    sequences: bytes | bytearray,
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
@@ -62,7 +62,7 @@ def from_bytes(
 
     if not isinstance(sequences, (bytearray, bytes)):
         raise TypeError(
-            "Expected object of type bytes or bytearray, got: {0}".format(
+            "Expected object of type bytes or bytearray, got: {}".format(
                 type(sequences)
             )
         )
@@ -76,7 +76,7 @@ def from_bytes(
 
     if length == 0:
         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
-        if explain:
+        if explain:  # Defensive: ensure exit path clean handler
            logger.removeHandler(explain_handler)
            logger.setLevel(previous_logger_level or logging.WARNING)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
@@ -135,9 +135,9 @@ def from_bytes(
         ),
     )
 
-    prioritized_encodings: List[str] = []
+    prioritized_encodings: list[str] = []
 
-    specified_encoding: Optional[str] = (
+    specified_encoding: str | None = (
         any_specified_encoding(sequences) if preemptive_behaviour else None
     )
 
@@ -149,13 +149,13 @@ def from_bytes(
             specified_encoding,
         )
 
-    tested: Set[str] = set()
-    tested_but_hard_failure: List[str] = []
-    tested_but_soft_failure: List[str] = []
+    tested: set[str] = set()
+    tested_but_hard_failure: list[str] = []
+    tested_but_soft_failure: list[str] = []
 
-    fallback_ascii: Optional[CharsetMatch] = None
-    fallback_u8: Optional[CharsetMatch] = None
-    fallback_specified: Optional[CharsetMatch] = None
+    fallback_ascii: CharsetMatch | None = None
+    fallback_u8: CharsetMatch | None = None
+    fallback_specified: CharsetMatch | None = None
 
     results: CharsetMatches = CharsetMatches()
 
@@ -189,7 +189,7 @@ def from_bytes(
 
         tested.add(encoding_iana)
 
-        decoded_payload: Optional[str] = None
+        decoded_payload: str | None = None
         bom_or_sig_available: bool = sig_encoding == encoding_iana
         strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
             encoding_iana
@@ -292,7 +292,7 @@ def from_bytes(
         early_stop_count: int = 0
         lazy_str_hard_failure = False
 
-        md_chunks: List[str] = []
+        md_chunks: list[str] = []
         md_ratios = []
 
         try:
@@ -397,7 +397,7 @@ def from_bytes(
             )
 
             if not is_multi_byte_decoder:
-                target_languages: List[str] = encoding_languages(encoding_iana)
+                target_languages: list[str] = encoding_languages(encoding_iana)
             else:
                 target_languages = mb_encoding_languages(encoding_iana)
 
@@ -462,7 +462,7 @@ def from_bytes(
                     "Encoding detection: %s is most likely the one.",
                     current_match.encoding,
                 )
-                if explain:
+                if explain:  # Defensive: ensure exit path clean handler
                     logger.removeHandler(explain_handler)
                     logger.setLevel(previous_logger_level)
                 return CharsetMatches([current_match])
@@ -480,7 +480,7 @@ def from_bytes(
                 "Encoding detection: %s is most likely the one.",
                 probable_result.encoding,
             )
-            if explain:
+            if explain:  # Defensive: ensure exit path clean handler
                 logger.removeHandler(explain_handler)
                 logger.setLevel(previous_logger_level)
 
@@ -492,7 +492,7 @@ def from_bytes(
                 "the beginning of the sequence.",
                 encoding_iana,
             )
-            if explain:
+            if explain:  # Defensive: ensure exit path clean handler
                 logger.removeHandler(explain_handler)
                 logger.setLevel(previous_logger_level)
             return CharsetMatches([results[encoding_iana]])
@@ -546,8 +546,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
@@ -572,12 +572,12 @@ def from_fp(
 
 
 def from_path(
-    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
+    path: str | bytes | PathLike,  # type: ignore[type-arg]
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
@@ -603,12 +603,12 @@ def from_path(
 
 
 def is_binary(
-    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
+    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes,  # type: ignore[type-arg]
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
     language_threshold: float = 0.1,
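Note: the api.py changes are annotation and style only; the detection logic and the public defaults are untouched, so existing callers need no changes. A hedged sketch of the public entry point these signatures belong to (the sample payload is made up):

    from charset_normalizer import from_bytes

    # A byte payload in a legacy code page (sample text is made up).
    payload = "Bonjour, où est la gare ?".encode("cp1252")

    matches = from_bytes(payload)  # CharsetMatches container
    best = matches.best()          # best() returns None when nothing fits

    if best is not None:
        print(best.encoding)  # cp1252 or an equivalent code page
        print(str(best))      # the decoded text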
charset_normalizer/cd.py

@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 import importlib
 from codecs import IncrementalDecoder
 from collections import Counter
 from functools import lru_cache
-from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter
 
 from .constant import (
     FREQUENCIES,
@@ -22,26 +24,24 @@ from .utils import (
 )
 
 
-def encoding_unicode_range(iana_name: str) -> List[str]:
+def encoding_unicode_range(iana_name: str) -> list[str]:
     """
     Return associated unicode ranges in a single byte code page.
     """
     if is_multi_byte_encoding(iana_name):
-        raise IOError("Function not supported on multi-byte code page")
+        raise OSError("Function not supported on multi-byte code page")
 
-    decoder = importlib.import_module(
-        "encodings.{}".format(iana_name)
-    ).IncrementalDecoder
+    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
 
     p: IncrementalDecoder = decoder(errors="ignore")
-    seen_ranges: Dict[str, int] = {}
+    seen_ranges: dict[str, int] = {}
     character_count: int = 0
 
     for i in range(0x40, 0xFF):
         chunk: str = p.decode(bytes([i]))
 
         if chunk:
-            character_range: Optional[str] = unicode_range(chunk)
+            character_range: str | None = unicode_range(chunk)
 
             if character_range is None:
                 continue
@@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
     )
 
 
-def unicode_range_languages(primary_range: str) -> List[str]:
+def unicode_range_languages(primary_range: str) -> list[str]:
     """
     Return inferred languages used with a unicode range.
     """
-    languages: List[str] = []
+    languages: list[str] = []
 
     for language, characters in FREQUENCIES.items():
         for character in characters:
@@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]:
 
 
 @lru_cache()
-def encoding_languages(iana_name: str) -> List[str]:
+def encoding_languages(iana_name: str) -> list[str]:
     """
     Single-byte encoding language association. Some code page are heavily linked to particular language(s).
     This function does the correspondence.
     """
-    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
-    primary_range: Optional[str] = None
+    unicode_ranges: list[str] = encoding_unicode_range(iana_name)
+    primary_range: str | None = None
 
     for specified_range in unicode_ranges:
         if "Latin" not in specified_range:
@@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]:
 
 
 @lru_cache()
-def mb_encoding_languages(iana_name: str) -> List[str]:
+def mb_encoding_languages(iana_name: str) -> list[str]:
     """
     Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
     This function does the correspondence.
@@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
 
 
 @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
-def get_target_features(language: str) -> Tuple[bool, bool]:
+def get_target_features(language: str) -> tuple[bool, bool]:
     """
     Determine main aspects from a supported language if it contains accents and if is pure Latin.
     """
@@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
 
 
 def alphabet_languages(
-    characters: List[str], ignore_non_latin: bool = False
-) -> List[str]:
+    characters: list[str], ignore_non_latin: bool = False
+) -> list[str]:
     """
     Return associated languages associated to given characters.
     """
-    languages: List[Tuple[str, float]] = []
+    languages: list[tuple[str, float]] = []
 
     source_have_accents = any(is_accentuated(character) for character in characters)
 
@@ -170,7 +170,7 @@ def alphabet_languages(
 
 
 def characters_popularity_compare(
-    language: str, ordered_characters: List[str]
+    language: str, ordered_characters: list[str]
 ) -> float:
     """
     Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
@@ -178,7 +178,7 @@ def characters_popularity_compare(
     Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
     """
     if language not in FREQUENCIES:
-        raise ValueError("{} not available".format(language))
+        raise ValueError(f"{language} not available")
 
     character_approved_count: int = 0
     FREQUENCIES_language_set = set(FREQUENCIES[language])
@@ -214,14 +214,14 @@ def characters_popularity_compare(
             character_approved_count += 1
             continue
 
-        characters_before_source: List[str] = FREQUENCIES[language][
+        characters_before_source: list[str] = FREQUENCIES[language][
             0:character_rank_in_language
         ]
-        characters_after_source: List[str] = FREQUENCIES[language][
+        characters_after_source: list[str] = FREQUENCIES[language][
             character_rank_in_language:
         ]
-        characters_before: List[str] = ordered_characters[0:character_rank]
-        characters_after: List[str] = ordered_characters[character_rank:]
+        characters_before: list[str] = ordered_characters[0:character_rank]
+        characters_after: list[str] = ordered_characters[character_rank:]
 
         before_match_count: int = len(
             set(characters_before) & set(characters_before_source)
@@ -249,24 +249,24 @@ def characters_popularity_compare(
     return character_approved_count / len(ordered_characters)
 
 
-def alpha_unicode_split(decoded_sequence: str) -> List[str]:
+def alpha_unicode_split(decoded_sequence: str) -> list[str]:
     """
     Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
     Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
     One containing the latin letters and the other hebrew.
     """
-    layers: Dict[str, str] = {}
+    layers: dict[str, str] = {}
 
     for character in decoded_sequence:
         if character.isalpha() is False:
             continue
 
-        character_range: Optional[str] = unicode_range(character)
+        character_range: str | None = unicode_range(character)
 
         if character_range is None:
             continue
 
-        layer_target_range: Optional[str] = None
+        layer_target_range: str | None = None
 
         for discovered_range in layers:
             if (
@@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
     return list(layers.values())
 
 
-def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
+def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
     """
     This function merge results previously given by the function coherence_ratio.
     The return type is the same as coherence_ratio.
     """
-    per_language_ratios: Dict[str, List[float]] = {}
+    per_language_ratios: dict[str, list[float]] = {}
     for result in results:
         for sub_result in result:
             language, ratio = sub_result
@@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
     We shall NOT return "English—" in CoherenceMatches because it is an alternative
     of "English". This function only keeps the best match and remove the em-dash in it.
     """
-    index_results: Dict[str, List[float]] = dict()
+    index_results: dict[str, list[float]] = dict()
 
     for result in results:
         language, ratio = result
@@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
 
 @lru_cache(maxsize=2048)
 def coherence_ratio(
-    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
+    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
 ) -> CoherenceMatches:
     """
     Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
     A layer = Character extraction by alphabets/ranges.
     """
 
-    results: List[Tuple[str, float]] = []
+    results: list[tuple[str, float]] = []
     ignore_non_latin: bool = False
 
     sufficient_match_count: int = 0
@@ -371,7 +371,7 @@ def coherence_ratio(
         if character_count <= TOO_SMALL_SEQUENCE:
             continue
 
-        popular_character_ordered: List[str] = [c for c, o in most_common]
+        popular_character_ordered: list[str] = [c for c, o in most_common]
 
         for language in lg_inclusion_list or alphabet_languages(
             popular_character_ordered, ignore_non_latin
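Note: the raise IOError to raise OSError change in encoding_unicode_range() is purely cosmetic: IOError has been an alias of OSError since Python 3.3, so existing "except IOError" handlers keep working. A quick standalone demonstration:

    # IOError and OSError are the same class object on Python 3.3+.
    assert IOError is OSError

    try:
        raise OSError("Function not supported on multi-byte code page")
    except IOError as exc:  # still catches, because it is the same class
        print(type(exc).__name__)  # OSError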
charset_normalizer/cli/__init__.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from .__main__ import cli_detect, query_yes_no
 
 __all__ = (
charset_normalizer/cli/__main__.py

@@ -1,9 +1,11 @@
+from __future__ import annotations
+
 import argparse
 import sys
+import typing
 from json import dumps
 from os.path import abspath, basename, dirname, join, realpath
 from platform import python_version
-from typing import List, Optional
 from unicodedata import unidata_version
 
 import charset_normalizer.md as md_module
@@ -42,10 +44,69 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
     elif choice in valid:
         return valid[choice]
     else:
-        sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
+        sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
 
 
-def cli_detect(argv: Optional[List[str]] = None) -> int:
+class FileType:
+    """Factory for creating file object types
+
+    Instances of FileType are typically passed as type= arguments to the
+    ArgumentParser add_argument() method.
+
+    Keyword Arguments:
+        - mode -- A string indicating how the file is to be opened. Accepts the
+            same values as the builtin open() function.
+        - bufsize -- The file's desired buffer size. Accepts the same values as
+            the builtin open() function.
+        - encoding -- The file's encoding. Accepts the same values as the
+            builtin open() function.
+        - errors -- A string indicating how encoding and decoding errors are to
+            be handled. Accepts the same value as the builtin open() function.
+
+    Backported from CPython 3.12
+    """
+
+    def __init__(
+        self,
+        mode: str = "r",
+        bufsize: int = -1,
+        encoding: str | None = None,
+        errors: str | None = None,
+    ):
+        self._mode = mode
+        self._bufsize = bufsize
+        self._encoding = encoding
+        self._errors = errors
+
+    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
+        # the special argument "-" means sys.std{in,out}
+        if string == "-":
+            if "r" in self._mode:
+                return sys.stdin.buffer if "b" in self._mode else sys.stdin
+            elif any(c in self._mode for c in "wax"):
+                return sys.stdout.buffer if "b" in self._mode else sys.stdout
+            else:
+                msg = f'argument "-" with mode {self._mode}'
+                raise ValueError(msg)
+
+        # all other arguments are used as file names
+        try:
+            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
+        except OSError as e:
+            message = f"can't open '{string}': {e}"
+            raise argparse.ArgumentTypeError(message)
+
+    def __repr__(self) -> str:
+        args = self._mode, self._bufsize
+        kwargs = [("encoding", self._encoding), ("errors", self._errors)]
+        args_str = ", ".join(
+            [repr(arg) for arg in args if arg != -1]
+            + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
+        )
+        return f"{type(self).__name__}({args_str})"
+
+
+def cli_detect(argv: list[str] | None = None) -> int:
     """
     CLI assistant using ARGV and ArgumentParser
     :param argv:
@@ -58,7 +119,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
     )
 
     parser.add_argument(
-        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
+        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
     )
     parser.add_argument(
         "-v",
@@ -124,7 +185,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
         default=0.2,
         type=float,
         dest="threshold",
-        help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
+        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
     )
     parser.add_argument(
         "--version",
@@ -259,7 +320,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                 dir_path = dirname(realpath(my_file.name))
                 file_name = basename(realpath(my_file.name))
 
-                o_: List[str] = file_name.split(".")
+                o_: list[str] = file_name.split(".")
 
                 if args.replace is False:
                     o_.insert(-1, best_guess.encoding)
@@ -284,7 +345,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
 
                     with open(x_[0].unicode_path, "wb") as fp:
                         fp.write(best_guess.output())
-                except IOError as e:
+                except OSError as e:
                     print(str(e), file=sys.stderr)
                     if my_file.closed is False:
                         my_file.close()
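Note: the FileType class is copied from CPython's argparse (the docstring says 3.12), presumably because argparse.FileType is deprecated in newer CPython releases. It behaves like the original, including "-" for stdin/stdout. A usage sketch; the import path below is an assumption based on where the hunk places the class:

    import argparse

    # Assumed import path: the class is defined in charset_normalizer/cli/__main__.py.
    from charset_normalizer.cli.__main__ import FileType

    parser = argparse.ArgumentParser()
    parser.add_argument("files", type=FileType("rb"), nargs="+")

    # "-" maps to sys.stdin (its binary buffer here, because of mode "rb").
    args = parser.parse_args(["-"])
    print(args.files[0])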
charset_normalizer/constant.py

@@ -1,11 +1,12 @@
-# -*- coding: utf-8 -*-
+from __future__ import annotations
+
 from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
 from encodings.aliases import aliases
-from re import IGNORECASE, compile as re_compile
-from typing import Dict, List, Set, Union
+from re import IGNORECASE
+from re import compile as re_compile
 
 # Contain for each eligible encoding a list of/item bytes SIG/BOM
-ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
+ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
     "utf_8": BOM_UTF8,
     "utf_7": [
         b"\x2b\x2f\x76\x38",
@@ -25,7 +26,7 @@ TOO_BIG_SEQUENCE: int = int(10e6)
 UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
 
 # Up-to-date Unicode ucd/15.0.0
-UNICODE_RANGES_COMBINED: Dict[str, range] = {
+UNICODE_RANGES_COMBINED: dict[str, range] = {
     "Control character": range(32),
     "Basic Latin": range(32, 128),
     "Latin-1 Supplement": range(128, 256),
@@ -357,7 +358,7 @@ UNICODE_RANGES_COMBINED: Dict[str, range] = {
 }
 
 
-UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
+UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
     "Supplement",
     "Extended",
     "Extensions",
@@ -392,7 +393,7 @@ IANA_NO_ALIASES = [
     "koi8_u",
 ]
 
-IANA_SUPPORTED: List[str] = sorted(
+IANA_SUPPORTED: list[str] = sorted(
     filter(
         lambda x: x.endswith("_codec") is False
         and x not in {"rot_13", "tactis", "mbcs"},
@@ -403,7 +404,7 @@ IANA_SUPPORTED: List[str] = sorted(
 IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
 
 # pre-computed code page that are similar using the function cp_similarity.
-IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
+IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
     "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
     "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
     "cp1125": ["cp866"],
@@ -492,7 +493,7 @@ IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
 }
 
 
-CHARDET_CORRESPONDENCE: Dict[str, str] = {
+CHARDET_CORRESPONDENCE: dict[str, str] = {
     "iso2022_kr": "ISO-2022-KR",
     "iso2022_jp": "ISO-2022-JP",
     "euc_kr": "EUC-KR",
@@ -528,7 +529,7 @@ CHARDET_CORRESPONDENCE: Dict[str, str] = {
 }
 
 
-COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
+COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
     "<",
     ">",
     "=",
@@ -548,9 +549,26 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
     ")",
 }
 
+# Sample character sets — replace with full lists if needed
+COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
 
-KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
-ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
+COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
+
+COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
+
+# Combine all into a set
+COMMON_CJK_CHARACTERS = set(
+    "".join(
+        [
+            COMMON_CHINESE_CHARACTERS,
+            COMMON_JAPANESE_CHARACTERS,
+            COMMON_KOREAN_CHARACTERS,
+        ]
+    )
+)
+
+KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
 
 # Logging LEVEL below DEBUG
 TRACE: int = 5
@@ -558,7 +576,7 @@ TRACE: int = 5
 
 # Language label that contain the em dash "—"
 # character are to be considered alternative seq to origin
-FREQUENCIES: Dict[str, List[str]] = {
+FREQUENCIES: dict[str, list[str]] = {
     "English": [
         "e",
         "a",
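Note: the three new character strings are frequency-ordered lists of common hanzi, kanji and hanja; folding them into a single set makes the per-character lookup used by the new mess-detector plugin an O(1) membership test. A trimmed-down sketch of the construction in this hunk (the real constants hold the full strings):

    # Truncated samples; the vendored constants carry the complete lists.
    COMMON_CHINESE_CHARACTERS = "的一是在不了"
    COMMON_JAPANESE_CHARACTERS = "日一国年大十"
    COMMON_KOREAN_CHARACTERS = "一二三四五六"

    COMMON_CJK_CHARACTERS = set(
        "".join(
            [
                COMMON_CHINESE_CHARACTERS,
                COMMON_JAPANESE_CHARACTERS,
                COMMON_KOREAN_CHARACTERS,
            ]
        )
    )

    print("的" in COMMON_CJK_CHARACTERS)  # True: common character
    print("丄" in COMMON_CJK_CHARACTERS)  # False: uncommon form, absent here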
charset_normalizer/legacy.py

@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from warnings import warn
 
 from .api import from_bytes
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
     from typing_extensions import TypedDict
 
     class ResultDict(TypedDict):
-        encoding: Optional[str]
+        encoding: str | None
         language: str
-        confidence: Optional[float]
+        confidence: float | None
 
 
 def detect(
@@ -37,8 +37,7 @@ def detect(
 
     if not isinstance(byte_str, (bytearray, bytes)):
         raise TypeError(  # pragma: nocover
-            "Expected object of type bytes or bytearray, got: "
-            "{0}".format(type(byte_str))
+            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
         )
 
     if isinstance(byte_str, bytearray):
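Note: legacy.py still provides the chardet-compatible detect() shim; only the typing and the error message formatting changed, so drop-in usage is unaffected. Exact output values depend on the input:

    from charset_normalizer import detect

    # chardet-style result dict with encoding / language / confidence keys.
    result = detect("こんにちは".encode("utf_8"))
    print(result["encoding"], result["confidence"], result["language"])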
charset_normalizer/md.py

@@ -1,6 +1,7 @@
+from __future__ import annotations
+
 from functools import lru_cache
 from logging import getLogger
-from typing import List, Optional
 
 from .constant import (
     COMMON_SAFE_ASCII_CHARACTERS,
@@ -25,6 +26,7 @@ from .utils import (
     is_unprintable,
     remove_accent,
     unicode_range,
+    is_cjk_uncommon,
 )
 
 
@@ -68,7 +70,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
         self._symbol_count: int = 0
         self._character_count: int = 0
 
-        self._last_printable_char: Optional[str] = None
+        self._last_printable_char: str | None = None
         self._frenzy_symbol_in_word: bool = False
 
     def eligible(self, character: str) -> bool:
@@ -92,7 +94,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
 
         self._last_printable_char = character
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._punctuation_count = 0
         self._character_count = 0
         self._symbol_count = 0
@@ -123,7 +125,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
         if is_accentuated(character):
             self._accentuated_count += 1
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._character_count = 0
         self._accentuated_count = 0
 
@@ -149,7 +151,7 @@ class UnprintablePlugin(MessDetectorPlugin):
             self._unprintable_count += 1
         self._character_count += 1
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._unprintable_count = 0
 
     @property
@@ -165,7 +167,7 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
         self._successive_count: int = 0
         self._character_count: int = 0
 
-        self._last_latin_character: Optional[str] = None
+        self._last_latin_character: str | None = None
 
     def eligible(self, character: str) -> bool:
         return character.isalpha() and is_latin(character)
@@ -184,7 +186,7 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
             self._successive_count += 1
         self._last_latin_character = character
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._successive_count = 0
         self._character_count = 0
         self._last_latin_character = None
@@ -201,7 +203,7 @@ class SuspiciousRange(MessDetectorPlugin):
     def __init__(self) -> None:
         self._suspicious_successive_range_count: int = 0
         self._character_count: int = 0
-        self._last_printable_seen: Optional[str] = None
+        self._last_printable_seen: str | None = None
 
     def eligible(self, character: str) -> bool:
         return character.isprintable()
@@ -221,15 +223,15 @@
             self._last_printable_seen = character
             return
 
-        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
-        unicode_range_b: Optional[str] = unicode_range(character)
+        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
+        unicode_range_b: str | None = unicode_range(character)
 
         if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
             self._suspicious_successive_range_count += 1
 
         self._last_printable_seen = character
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._character_count = 0
         self._suspicious_successive_range_count = 0
         self._last_printable_seen = None
@@ -346,7 +348,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
             self._is_current_word_bad = True
         self._buffer += character
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._buffer = ""
         self._is_current_word_bad = False
         self._foreign_long_watch = False
@@ -364,35 +366,39 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
         return self._bad_character_count / self._character_count
 
 
-class CjkInvalidStopPlugin(MessDetectorPlugin):
+class CjkUncommonPlugin(MessDetectorPlugin):
     """
-    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
-    can be easily detected. Searching for the overuse of '丅' and '丄'.
+    Detect messy CJK text that probably means nothing.
     """
 
     def __init__(self) -> None:
-        self._wrong_stop_count: int = 0
-        self._cjk_character_count: int = 0
+        self._character_count: int = 0
+        self._uncommon_count: int = 0
 
     def eligible(self, character: str) -> bool:
-        return True
+        return is_cjk(character)
 
     def feed(self, character: str) -> None:
-        if character in {"丅", "丄"}:
-            self._wrong_stop_count += 1
-            return
-        if is_cjk(character):
-            self._cjk_character_count += 1
+        self._character_count += 1
 
-    def reset(self) -> None:  # pragma: no cover
-        self._wrong_stop_count = 0
-        self._cjk_character_count = 0
+        if is_cjk_uncommon(character):
+            self._uncommon_count += 1
+            return
+
+    def reset(self) -> None:  # Abstract
+        self._character_count = 0
+        self._uncommon_count = 0
 
     @property
     def ratio(self) -> float:
-        if self._cjk_character_count < 16:
+        if self._character_count < 8:
             return 0.0
-        return self._wrong_stop_count / self._cjk_character_count
+
+        uncommon_form_usage: float = self._uncommon_count / self._character_count
+
+        # we can be pretty sure it's garbage when uncommon characters are widely
+        # used. otherwise it could just be traditional chinese for example.
+        return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
 
 
 class ArchaicUpperLowerPlugin(MessDetectorPlugin):
@@ -406,7 +412,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
 
         self._character_count: int = 0
 
-        self._last_alpha_seen: Optional[str] = None
+        self._last_alpha_seen: str | None = None
         self._current_ascii_only: bool = True
 
     def eligible(self, character: str) -> bool:
@@ -454,7 +460,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
         self._character_count_since_last_sep += 1
         self._last_alpha_seen = character
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._character_count = 0
         self._character_count_since_last_sep = 0
         self._successive_upper_lower_count = 0
@@ -476,7 +482,7 @@ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
         self._character_count: int = 0
         self._isolated_form_count: int = 0
 
-    def reset(self) -> None:  # pragma: no cover
+    def reset(self) -> None:  # Abstract
         self._character_count = 0
         self._isolated_form_count = 0
 
@@ -501,7 +507,7 @@ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
 
 @lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
-    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
+    unicode_range_a: str | None, unicode_range_b: str | None
 ) -> bool:
     """
     Determine if two Unicode range seen next to each other can be considered as suspicious.
@@ -525,9 +531,10 @@ def is_suspiciously_successive_range(
     ):
         return False
 
-    keywords_range_a, keywords_range_b = unicode_range_a.split(
-        " "
-    ), unicode_range_b.split(" ")
+    keywords_range_a, keywords_range_b = (
+        unicode_range_a.split(" "),
+        unicode_range_b.split(" "),
+    )
 
     for el in keywords_range_a:
         if el in UNICODE_SECONDARY_RANGE_KEYWORD:
@@ -580,7 +587,7 @@ def mess_ratio(
     Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
     """
 
-    detectors: List[MessDetectorPlugin] = [
+    detectors: list[MessDetectorPlugin] = [
         md_class() for md_class in MessDetectorPlugin.__subclasses__()
     ]
 
@@ -622,7 +629,7 @@ def mess_ratio(
         logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
         logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
 
-        for dt in detectors:  # pragma: nocover
+        for dt in detectors:
             logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
 
     return round(mean_mess_ratio, 3)
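Note: CjkUncommonPlugin replaces the old wrong-stop heuristic ('丅'/'丄' counting) with a broader one: of the CJK characters fed to it, how many are absent from COMMON_CJK_CHARACTERS. Working the new ratio through with made-up counts:

    # Made-up counts to trace CjkUncommonPlugin.ratio.
    character_count = 10  # CJK characters seen (>= 8, so the plugin engages)
    uncommon_count = 6    # of those, not in COMMON_CJK_CHARACTERS

    uncommon_form_usage = uncommon_count / character_count  # 0.6
    # Only a majority of uncommon forms counts as garbage; the result is
    # divided by 10, presumably to keep this plugin's contribution small
    # relative to the other detectors.
    ratio = uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
    print(ratio)  # 0.06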
charset_normalizer/models.py

@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import Any, Iterator, List, Tuple
 
 from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
@@ -15,9 +17,9 @@ class CharsetMatch:
         guessed_encoding: str,
         mean_mess_ratio: float,
         has_sig_or_bom: bool,
-        languages: "CoherenceMatches",
-        decoded_payload: Optional[str] = None,
-        preemptive_declaration: Optional[str] = None,
+        languages: CoherenceMatches,
+        decoded_payload: str | None = None,
+        preemptive_declaration: str | None = None,
     ):
         self._payload: bytes = payload
 
@@ -25,17 +27,17 @@ class CharsetMatch:
         self._mean_mess_ratio: float = mean_mess_ratio
         self._languages: CoherenceMatches = languages
         self._has_sig_or_bom: bool = has_sig_or_bom
-        self._unicode_ranges: Optional[List[str]] = None
+        self._unicode_ranges: list[str] | None = None
 
-        self._leaves: List[CharsetMatch] = []
+        self._leaves: list[CharsetMatch] = []
         self._mean_coherence_ratio: float = 0.0
 
-        self._output_payload: Optional[bytes] = None
-        self._output_encoding: Optional[str] = None
+        self._output_payload: bytes | None = None
+        self._output_encoding: str | None = None
 
-        self._string: Optional[str] = decoded_payload
+        self._string: str | None = decoded_payload
 
-        self._preemptive_declaration: Optional[str] = preemptive_declaration
+        self._preemptive_declaration: str | None = preemptive_declaration
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, CharsetMatch):
@@ -77,9 +79,9 @@ class CharsetMatch:
         return self._string
 
     def __repr__(self) -> str:
-        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
+        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
 
-    def add_submatch(self, other: "CharsetMatch") -> None:
+    def add_submatch(self, other: CharsetMatch) -> None:
         if not isinstance(other, CharsetMatch) or other == self:
             raise ValueError(
                 "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
@@ -95,11 +97,11 @@ class CharsetMatch:
         return self._encoding
 
     @property
-    def encoding_aliases(self) -> List[str]:
+    def encoding_aliases(self) -> list[str]:
         """
         Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
         """
-        also_known_as: List[str] = []
+        also_known_as: list[str] = []
         for u, p in aliases.items():
             if self.encoding == u:
                 also_known_as.append(p)
@@ -116,7 +118,7 @@ class CharsetMatch:
         return self._has_sig_or_bom
 
     @property
-    def languages(self) -> List[str]:
+    def languages(self) -> list[str]:
         """
         Return the complete list of possible languages found in decoded sequence.
         Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
@@ -177,7 +179,7 @@ class CharsetMatch:
         return self._payload
 
     @property
-    def submatch(self) -> List["CharsetMatch"]:
+    def submatch(self) -> list[CharsetMatch]:
         return self._leaves
 
     @property
@@ -185,19 +187,17 @@ class CharsetMatch:
         return len(self._leaves) > 0
 
     @property
-    def alphabets(self) -> List[str]:
+    def alphabets(self) -> list[str]:
         if self._unicode_ranges is not None:
             return self._unicode_ranges
         # list detected ranges
-        detected_ranges: List[Optional[str]] = [
-            unicode_range(char) for char in str(self)
-        ]
+        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
         # filter and sort
         self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
         return self._unicode_ranges
 
     @property
-    def could_be_from_charset(self) -> List[str]:
+    def could_be_from_charset(self) -> list[str]:
         """
         The complete list of encoding that output the exact SAME str result and therefore could be the originating
         encoding.
@@ -221,10 +221,11 @@ class CharsetMatch:
             patched_header = sub(
                 RE_POSSIBLE_ENCODING_INDICATION,
                 lambda m: m.string[m.span()[0] : m.span()[1]].replace(
-                    m.groups()[0], iana_name(self._output_encoding)  # type: ignore[arg-type]
+                    m.groups()[0],
+                    iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                 ),
                 decoded_string[:8192],
-                1,
+                count=1,
             )
 
             decoded_string = patched_header + decoded_string[8192:]
@@ -247,13 +248,13 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """
 
-    def __init__(self, results: Optional[List[CharsetMatch]] = None):
-        self._results: List[CharsetMatch] = sorted(results) if results else []
+    def __init__(self, results: list[CharsetMatch] | None = None):
+        self._results: list[CharsetMatch] = sorted(results) if results else []
 
     def __iter__(self) -> Iterator[CharsetMatch]:
         yield from self._results
 
-    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
+    def __getitem__(self, item: int | str) -> CharsetMatch:
         """
         Retrieve a single item either by its position or encoding name (alias may be used here).
         Raise KeyError upon invalid index or encoding not present in results.
@@ -293,7 +294,7 @@ class CharsetMatches:
             self._results.append(item)
             self._results = sorted(self._results)
 
-    def best(self) -> Optional["CharsetMatch"]:
+    def best(self) -> CharsetMatch | None:
         """
         Simply return the first match. Strict equivalent to matches[0].
         """
@@ -301,7 +302,7 @@ class CharsetMatches:
             return None
         return self._results[0]
 
-    def first(self) -> Optional["CharsetMatch"]:
+    def first(self) -> CharsetMatch | None:
         """
         Redundant method, call the method best(). Kept for BC reasons.
         """
@@ -316,31 +317,31 @@ class CliDetectionResult:
     def __init__(
         self,
         path: str,
-        encoding: Optional[str],
-        encoding_aliases: List[str],
-        alternative_encodings: List[str],
+        encoding: str | None,
+        encoding_aliases: list[str],
+        alternative_encodings: list[str],
         language: str,
-        alphabets: List[str],
+        alphabets: list[str],
         has_sig_or_bom: bool,
         chaos: float,
         coherence: float,
-        unicode_path: Optional[str],
+        unicode_path: str | None,
         is_preferred: bool,
     ):
         self.path: str = path
-        self.unicode_path: Optional[str] = unicode_path
-        self.encoding: Optional[str] = encoding
-        self.encoding_aliases: List[str] = encoding_aliases
-        self.alternative_encodings: List[str] = alternative_encodings
+        self.unicode_path: str | None = unicode_path
+        self.encoding: str | None = encoding
+        self.encoding_aliases: list[str] = encoding_aliases
+        self.alternative_encodings: list[str] = alternative_encodings
         self.language: str = language
-        self.alphabets: List[str] = alphabets
+        self.alphabets: list[str] = alphabets
         self.has_sig_or_bom: bool = has_sig_or_bom
         self.chaos: float = chaos
         self.coherence: float = coherence
         self.is_preferred: bool = is_preferred
 
     @property
-    def __dict__(self) -> Dict[str, Any]:  # type: ignore
+    def __dict__(self) -> dict[str, Any]:  # type: ignore
         return {
             "path": self.path,
             "encoding": self.encoding,
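Note: in CharsetMatch.output(), the re.sub call now passes count=1 by keyword and normalizes the declared charset name with replace("_", "-"). The keyword form matters because passing count positionally to re.sub is deprecated in recent CPython releases. A minimal check of the behaviour (pattern and strings are made up):

    import re

    header = 'charset="latin_1" charset="latin_1"'

    # count=1 by keyword: only the first declaration is patched.
    patched = re.sub(r'charset="([\w_-]+)"', 'charset="iso8859-1"', header, count=1)
    print(patched)  # charset="iso8859-1" charset="latin_1"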
charset_normalizer/utils.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import importlib
 import logging
 import unicodedata
@@ -5,9 +7,11 @@ from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache
 from re import findall
-from typing import Generator, List, Optional, Set, Tuple, Union
+from typing import Generator
 
-from _multibytecodec import MultibyteIncrementalDecoder
+from _multibytecodec import (  # type: ignore[import-not-found,import]
+    MultibyteIncrementalDecoder,
+)
 
 from .constant import (
     ENCODING_MARKS,
@@ -16,6 +20,7 @@ from .constant import (
     UNICODE_RANGES_COMBINED,
     UNICODE_SECONDARY_RANGE_KEYWORD,
     UTF8_MAXIMAL_ALLOCATION,
+    COMMON_CJK_CHARACTERS,
 )
 
 
@@ -23,7 +28,7 @@ from .constant import (
 def is_accentuated(character: str) -> bool:
     try:
         description: str = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
     return (
         "WITH GRAVE" in description
@@ -43,13 +48,13 @@ def remove_accent(character: str) -> str:
     if not decomposed:
         return character
 
-    codes: List[str] = decomposed.split(" ")
+    codes: list[str] = decomposed.split(" ")
 
     return chr(int(codes[0], 16))
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
-def unicode_range(character: str) -> Optional[str]:
+def unicode_range(character: str) -> str | None:
     """
     Retrieve the Unicode range official name from a single character.
     """
@@ -66,7 +71,7 @@ def unicode_range(character: str) -> Optional[str]:
 def is_latin(character: str) -> bool:
     try:
         description: str = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
     return "LATIN" in description
 
@@ -78,7 +83,7 @@ def is_punctuation(character: str) -> bool:
     if "P" in character_category:
         return True
 
-    character_range: Optional[str] = unicode_range(character)
+    character_range: str | None = unicode_range(character)
 
     if character_range is None:
         return False
@@ -93,7 +98,7 @@ def is_symbol(character: str) -> bool:
     if "S" in character_category or "N" in character_category:
         return True
 
-    character_range: Optional[str] = unicode_range(character)
+    character_range: str | None = unicode_range(character)
 
     if character_range is None:
         return False
@@ -103,7 +108,7 @@ def is_symbol(character: str) -> bool:
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
 def is_emoticon(character: str) -> bool:
-    character_range: Optional[str] = unicode_range(character)
+    character_range: str | None = unicode_range(character)
 
     if character_range is None:
         return False
@@ -130,7 +135,7 @@ def is_case_variable(character: str) -> bool:
 def is_cjk(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "CJK" in character_name
@@ -140,7 +145,7 @@ def is_cjk(character: str) -> bool:
 def is_hiragana(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "HIRAGANA" in character_name
@@ -150,7 +155,7 @@ def is_hiragana(character: str) -> bool:
 def is_katakana(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "KATAKANA" in character_name
@@ -160,7 +165,7 @@ def is_katakana(character: str) -> bool:
 def is_hangul(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "HANGUL" in character_name
@@ -170,7 +175,7 @@ def is_hangul(character: str) -> bool:
 def is_thai(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "THAI" in character_name
@@ -180,7 +185,7 @@ def is_thai(character: str) -> bool:
 def is_arabic(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "ARABIC" in character_name
@@ -190,12 +195,17 @@ def is_arabic(character: str) -> bool:
 def is_arabic_isolated_form(character: str) -> bool:
     try:
         character_name = unicodedata.name(character)
-    except ValueError:
+    except ValueError:  # Defensive: unicode database outdated?
         return False
 
     return "ARABIC" in character_name and "ISOLATED FORM" in character_name
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_cjk_uncommon(character: str) -> bool:
+    return character not in COMMON_CJK_CHARACTERS
+
+
 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@@ -206,13 +216,13 @@ def is_unprintable(character: str) -> bool:
     return (
         character.isspace() is False  # includes \n \t \r \v
         and character.isprintable() is False
-        and character != "\x1A"  # Why? Its the ASCII substitute character.
+        and character != "\x1a"  # Why? Its the ASCII substitute character.
         and character != "\ufeff"  # bug discovered in Python,
         # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
     )
 
 
-def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
     """
     Extract using ASCII-only decoder any specified encoding in the first n-bytes.
     """
@@ -221,7 +231,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional
 
     seq_len: int = len(sequence)
 
-    results: List[str] = findall(
+    results: list[str] = findall(
         RE_POSSIBLE_ENCODING_INDICATION,
         sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
     )
@@ -260,18 +270,18 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
+        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )
 
 
-def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
+def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
     """
     Identify and extract SIG/BOM in given sequence.
     """
 
     for iana_encoding in ENCODING_MARKS:
-        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
+        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
 
         if isinstance(marks, bytes):
             marks = [marks]
@@ -288,6 +298,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 
 
 def iana_name(cp_name: str, strict: bool = True) -> str:
+    """Returns the Python normalized encoding name (Not the IANA official name)."""
     cp_name = cp_name.lower().replace("-", "_")
 
     encoding_alias: str
@@ -298,35 +309,17 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
         return encoding_iana
 
     if strict:
-        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
+        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
 
     return cp_name
 
 
-def range_scan(decoded_sequence: str) -> List[str]:
-    ranges: Set[str] = set()
-
-    for character in decoded_sequence:
-        character_range: Optional[str] = unicode_range(character)
-
-        if character_range is None:
-            continue
-
-        ranges.add(character_range)
-
-    return list(ranges)
-
-
 def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0
 
-    decoder_a = importlib.import_module(
-        "encodings.{}".format(iana_name_a)
-    ).IncrementalDecoder
-    decoder_b = importlib.import_module(
-        "encodings.{}".format(iana_name_b)
-    ).IncrementalDecoder
+    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
 
     id_a: IncrementalDecoder = decoder_a(errors="ignore")
     id_b: IncrementalDecoder = decoder_b(errors="ignore")
@@ -374,7 +367,7 @@ def cut_sequence_chunks(
     strip_sig_or_bom: bool,
     sig_payload: bytes,
    is_multi_byte_decoder: bool,
-    decoded_payload: Optional[str] = None,
+    decoded_payload: str | None = None,
 ) -> Generator[str, None, None]:
     if decoded_payload and is_multi_byte_decoder is False:
         for i in offsets:
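Note: utils.py gains the cached is_cjk_uncommon() helper backing the new plugin and drops range_scan(), which no longer had callers. The unicode_range() helper that most predicates here lean on is unchanged; assuming the vendored package is importable, it behaves like this:

    from charset_normalizer.utils import is_cjk, unicode_range

    # unicode_range maps a character to its Unicode block name.
    print(unicode_range("A"))  # Basic Latin
    print(unicode_range("é"))  # Latin-1 Supplement
    print(is_cjk("的"))        # True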
charset_normalizer/version.py

@@ -2,5 +2,7 @@
 Expose version
 """
 
-__version__ = "3.4.0"
+from __future__ import annotations
+
+__version__ = "3.4.2"
 VERSION = __version__.split(".")
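Note: a quick way to confirm the vendored bump took effect at runtime:

    import charset_normalizer

    # Expected to print 3.4.2 after this commit.
    print(charset_normalizer.__version__)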