Update plexapi==4.17.0

JonnyWong16 2025-05-10 16:13:23 -07:00
commit f6bffe1850
32 changed files with 1224 additions and 966 deletions


@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
@ -19,6 +18,9 @@ at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from __future__ import annotations
import logging
from .api import from_bytes, from_fp, from_path, is_binary


@ -1,3 +1,5 @@
from __future__ import annotations
from .cli import cli_detect
if __name__ == "__main__":


@ -1,6 +1,8 @@
from __future__ import annotations
import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union
from typing import BinaryIO
from .cd import (
coherence_ratio,
@ -21,8 +23,6 @@ from .utils import (
should_strip_sig_or_bom,
)
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
@ -31,12 +31,12 @@ explain_handler.setFormatter(
def from_bytes(
sequences: Union[bytes, bytearray],
sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@ -62,7 +62,7 @@ def from_bytes(
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
"Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
@ -76,7 +76,7 @@ def from_bytes(
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
@ -135,9 +135,9 @@ def from_bytes(
),
)
prioritized_encodings: List[str] = []
prioritized_encodings: list[str] = []
specified_encoding: Optional[str] = (
specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
@ -149,13 +149,13 @@ def from_bytes(
specified_encoding,
)
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []
tested: set[str] = set()
tested_but_hard_failure: list[str] = []
tested_but_soft_failure: list[str] = []
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None
fallback_ascii: CharsetMatch | None = None
fallback_u8: CharsetMatch | None = None
fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
@ -189,7 +189,7 @@ def from_bytes(
tested.add(encoding_iana)
decoded_payload: Optional[str] = None
decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
@ -292,7 +292,7 @@ def from_bytes(
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: List[str] = []
md_chunks: list[str] = []
md_ratios = []
try:
@ -397,7 +397,7 @@ def from_bytes(
)
if not is_multi_byte_decoder:
target_languages: List[str] = encoding_languages(encoding_iana)
target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
@ -462,7 +462,7 @@ def from_bytes(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain:
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
@ -480,7 +480,7 @@ def from_bytes(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain:
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
@ -492,7 +492,7 @@ def from_bytes(
"the beginning of the sequence.",
encoding_iana,
)
if explain:
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
@ -546,8 +546,8 @@ def from_fp(
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@ -572,12 +572,12 @@ def from_fp(
def from_path(
path: Union[str, bytes, PathLike], # type: ignore[type-arg]
path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
@ -603,12 +603,12 @@ def from_path(
def is_binary(
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
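
The signature changes in this file are cosmetic: the new "from __future__ import annotations" import allows the PEP 604 "X | None" and builtin-generic spellings without changing behaviour. A minimal usage sketch of the public entry point, assuming charset-normalizer 3.4.2 is installed:

    from charset_normalizer import from_bytes

    payload = "Comment ça va ? Très bien, merci.".encode("cp1252")
    matches = from_bytes(payload, cp_isolation=["cp1252", "utf_8"])  # restrict the search space
    best = matches.best()
    if best is not None:
        print(best.encoding, str(best))  # expected: cp1252 plus the decoded text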


@ -1,8 +1,10 @@
from __future__ import annotations
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
@ -22,26 +24,24 @@ from .utils import (
)
def encoding_unicode_range(iana_name: str) -> List[str]:
def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
raise OSError("Function not supported on multi-byte code page")
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: Optional[str] = unicode_range(chunk)
character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
)
def unicode_range_languages(primary_range: str) -> List[str]:
def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: List[str] = []
languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]:
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]:
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
"""
Return associated languages associated to given characters.
"""
languages: List[Tuple[str, float]] = []
languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
@ -170,7 +170,7 @@ def alphabet_languages(
def characters_popularity_compare(
language: str, ordered_characters: List[str]
language: str, ordered_characters: list[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
@ -178,7 +178,7 @@ def characters_popularity_compare(
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
@ -214,14 +214,14 @@ def characters_popularity_compare(
character_approved_count += 1
continue
characters_before_source: List[str] = FREQUENCIES[language][
characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: List[str] = FREQUENCIES[language][
characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: List[str] = ordered_characters[0:character_rank]
characters_after: List[str] = ordered_characters[character_rank:]
characters_before: list[str] = ordered_characters[0:character_rank]
characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
@ -249,24 +249,24 @@ def characters_popularity_compare(
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
layers: Dict[str, str] = {}
layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: Optional[str] = unicode_range(character)
character_range: str | None = unicode_range(character)
if character_range is None:
continue
layer_target_range: Optional[str] = None
layer_target_range: str | None = None
for discovered_range in layers:
if (
@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
return list(layers.values())
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: Dict[str, List[float]] = {}
per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
index_results: Dict[str, List[float]] = dict()
index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: List[Tuple[str, float]] = []
results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
@ -371,7 +371,7 @@ def coherence_ratio(
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: List[str] = [c for c, o in most_common]
popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
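
For orientation, coherence_ratio (reworked above only in its annotations) splits the decoded text into alphabet layers and scores each against the FREQUENCIES tables, returning (language, ratio) pairs. A hedged sketch calling the internal helper directly; the exact scores and ordering are illustrative:

    from charset_normalizer.cd import coherence_ratio

    sample = "Ceci est un petit texte en français, écrit uniquement pour illustrer la mesure."
    print(coherence_ratio(sample))  # CoherenceMatches, e.g. [('French', 0.4...), ...]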


@ -1,3 +1,5 @@
from __future__ import annotations
from .__main__ import cli_detect, query_yes_no
__all__ = (


@ -1,9 +1,11 @@
from __future__ import annotations
import argparse
import sys
import typing
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
@ -42,10 +44,69 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
class FileType:
"""Factory for creating file object types
Instances of FileType are typically passed as type= arguments to the
ArgumentParser add_argument() method.
Keyword Arguments:
- mode -- A string indicating how the file is to be opened. Accepts the
same values as the builtin open() function.
- bufsize -- The file's desired buffer size. Accepts the same values as
the builtin open() function.
- encoding -- The file's encoding. Accepts the same values as the
builtin open() function.
- errors -- A string indicating how encoding and decoding errors are to
be handled. Accepts the same value as the builtin open() function.
Backported from CPython 3.12
"""
def __init__(
self,
mode: str = "r",
bufsize: int = -1,
encoding: str | None = None,
errors: str | None = None,
):
self._mode = mode
self._bufsize = bufsize
self._encoding = encoding
self._errors = errors
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
# the special argument "-" means sys.std{in,out}
if string == "-":
if "r" in self._mode:
return sys.stdin.buffer if "b" in self._mode else sys.stdin
elif any(c in self._mode for c in "wax"):
return sys.stdout.buffer if "b" in self._mode else sys.stdout
else:
msg = f'argument "-" with mode {self._mode}'
raise ValueError(msg)
# all other arguments are used as file names
try:
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
except OSError as e:
message = f"can't open '{string}': {e}"
raise argparse.ArgumentTypeError(message)
def __repr__(self) -> str:
args = self._mode, self._bufsize
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
args_str = ", ".join(
[repr(arg) for arg in args if arg != -1]
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
)
return f"{type(self).__name__}({args_str})"
def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
@ -58,7 +119,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
@ -124,7 +185,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
@ -259,7 +320,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: List[str] = file_name.split(".")
o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
@ -284,7 +345,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except IOError as e:
except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
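
The FileType class added above is a local stand-in for argparse.FileType (its docstring says it is backported from CPython 3.12). A small sketch of its behaviour in isolation; the import path is an assumption inferred from the relative imports shown in this diff:

    import sys
    from charset_normalizer.cli.__main__ import FileType  # assumed module path

    ft = FileType("rb")
    print(repr(ft))      # FileType('rb')
    handle = ft("-")     # "-" maps to the std stream; binary mode gives sys.stdin.buffer
    assert handle is sys.stdin.buffer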


@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union
from re import IGNORECASE
from re import compile as re_compile
# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = {
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
"utf_8": BOM_UTF8,
"utf_7": [
b"\x2b\x2f\x76\x38",
@ -25,7 +26,7 @@ TOO_BIG_SEQUENCE: int = int(10e6)
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
# Up-to-date Unicode ucd/15.0.0
UNICODE_RANGES_COMBINED: Dict[str, range] = {
UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
"Latin-1 Supplement": range(128, 256),
@ -357,7 +358,7 @@ UNICODE_RANGES_COMBINED: Dict[str, range] = {
}
UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
"Supplement",
"Extended",
"Extensions",
@ -392,7 +393,7 @@ IANA_NO_ALIASES = [
"koi8_u",
]
IANA_SUPPORTED: List[str] = sorted(
IANA_SUPPORTED: list[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
@ -403,7 +404,7 @@ IANA_SUPPORTED: List[str] = sorted(
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# pre-computed code page that are similar using the function cp_similarity.
IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
@ -492,7 +493,7 @@ IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = {
}
CHARDET_CORRESPONDENCE: Dict[str, str] = {
CHARDET_CORRESPONDENCE: dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
@ -528,7 +529,7 @@ CHARDET_CORRESPONDENCE: Dict[str, str] = {
}
COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
"<",
">",
"=",
@ -548,9 +549,26 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
")",
}
# Sample character sets — replace with full lists if needed
COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
# Combine all into a set
COMMON_CJK_CHARACTERS = set(
"".join(
[
COMMON_CHINESE_CHARACTERS,
COMMON_JAPANESE_CHARACTERS,
COMMON_KOREAN_CHARACTERS,
]
)
)
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
# Logging LEVEL below DEBUG
TRACE: int = 5
@ -558,7 +576,7 @@ TRACE: int = 5
# Language label that contain the em dash "—"
# character are to be considered alternative seq to origin
FREQUENCIES: Dict[str, List[str]] = {
FREQUENCIES: dict[str, list[str]] = {
"English": [
"e",
"a",


@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Optional
from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
@ -11,9 +11,9 @@ if TYPE_CHECKING:
from typing_extensions import TypedDict
class ResultDict(TypedDict):
encoding: Optional[str]
encoding: str | None
language: str
confidence: Optional[float]
confidence: float | None
def detect(
@ -37,8 +37,7 @@ def detect(
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
if isinstance(byte_str, bytearray):
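
This is the chardet-compatible layer; detect() wraps from_bytes and returns the ResultDict typed above. A short hedged example, assuming the top-level package re-exports detect as in released versions:

    from charset_normalizer import detect  # assumed top-level re-export

    print(detect("Déjà vu, très vite".encode("cp1252")))
    # e.g. {'encoding': 'cp1252', 'language': 'French', 'confidence': 1.0}  (illustrative output)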


@ -1,6 +1,7 @@
from __future__ import annotations
from functools import lru_cache
from logging import getLogger
from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
@ -25,6 +26,7 @@ from .utils import (
is_unprintable,
remove_accent,
unicode_range,
is_cjk_uncommon,
)
@ -68,7 +70,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: Optional[str] = None
self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
@ -92,7 +94,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
self._last_printable_char = character
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@ -123,7 +125,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@ -149,7 +151,7 @@ class UnprintablePlugin(MessDetectorPlugin):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
@ -165,7 +167,7 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: Optional[str] = None
self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
@ -184,7 +186,7 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@ -201,7 +203,7 @@ class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: Optional[str] = None
self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
@ -221,15 +223,15 @@ class SuspiciousRange(MessDetectorPlugin):
self._last_printable_seen = character
return
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
unicode_range_b: Optional[str] = unicode_range(character)
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@ -346,7 +348,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
@ -364,35 +366,39 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
class CjkUncommonPlugin(MessDetectorPlugin):
"""
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '丅' and '丄'.
Detect messy CJK text that probably means nothing.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
self._character_count: int = 0
self._uncommon_count: int = 0
def eligible(self, character: str) -> bool:
return True
return is_cjk(character)
def feed(self, character: str) -> None:
if character in {"丅", "丄"}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
self._wrong_stop_count = 0
self._cjk_character_count = 0
if is_cjk_uncommon(character):
self._uncommon_count += 1
return
def reset(self) -> None: # Abstract
self._character_count = 0
self._uncommon_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
if self._character_count < 8:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
uncommon_form_usage: float = self._uncommon_count / self._character_count
# we can be pretty sure it's garbage when uncommon characters are widely
# used. otherwise it could just be traditional chinese for example.
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
@ -406,7 +412,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
self._character_count: int = 0
self._last_alpha_seen: Optional[str] = None
self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
@ -454,7 +460,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
@ -476,7 +482,7 @@ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # pragma: no cover
def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
@ -501,7 +507,7 @@ class ArabicIsolatedFormPlugin(MessDetectorPlugin):
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
@ -525,9 +531,10 @@ def is_suspiciously_successive_range(
):
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
keywords_range_a, keywords_range_b = (
unicode_range_a.split(" "),
unicode_range_b.split(" "),
)
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
@ -580,7 +587,7 @@ def mess_ratio(
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors: List[MessDetectorPlugin] = [
detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
@ -622,7 +629,7 @@ def mess_ratio(
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors: # pragma: nocover
for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
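
The replacement CjkUncommonPlugin scores garbled CJK by the share of characters missing from COMMON_CJK_CHARACTERS: fewer than 8 eligible characters yields 0.0, and otherwise the uncommon share counts (scaled down by 10) only once it exceeds 0.5. A worked sketch with an illustrative pick of common and rare ideographs (the rare ones are assumed absent from the common set):

    from charset_normalizer.md import CjkUncommonPlugin

    plugin = CjkUncommonPlugin()
    for ch in "的一是在不了" + "龘齉爨麤龘齉爨麤":  # 6 common + 8 rare ideographs
        if plugin.eligible(ch):
            plugin.feed(ch)

    # uncommon/total = 8/14 ≈ 0.571 > 0.5, so ratio ≈ 0.571 / 10 ≈ 0.057
    print(plugin.ratio)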


@ -1,8 +1,10 @@
from __future__ import annotations
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from typing import Any, Iterator, List, Tuple
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
@ -15,9 +17,9 @@ class CharsetMatch:
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
preemptive_declaration: Optional[str] = None,
languages: CoherenceMatches,
decoded_payload: str | None = None,
preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
@ -25,17 +27,17 @@ class CharsetMatch:
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: Optional[List[str]] = None
self._unicode_ranges: list[str] | None = None
self._leaves: List[CharsetMatch] = []
self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: Optional[bytes] = None
self._output_encoding: Optional[str] = None
self._output_payload: bytes | None = None
self._output_encoding: str | None = None
self._string: Optional[str] = decoded_payload
self._string: str | None = decoded_payload
self._preemptive_declaration: Optional[str] = preemptive_declaration
self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
@ -77,9 +79,9 @@ class CharsetMatch:
return self._string
def __repr__(self) -> str:
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
def add_submatch(self, other: "CharsetMatch") -> None:
def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
@ -95,11 +97,11 @@ class CharsetMatch:
return self._encoding
@property
def encoding_aliases(self) -> List[str]:
def encoding_aliases(self) -> list[str]:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
also_known_as: List[str] = []
also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
@ -116,7 +118,7 @@ class CharsetMatch:
return self._has_sig_or_bom
@property
def languages(self) -> List[str]:
def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
@ -177,7 +179,7 @@ class CharsetMatch:
return self._payload
@property
def submatch(self) -> List["CharsetMatch"]:
def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
@ -185,19 +187,17 @@ class CharsetMatch:
return len(self._leaves) > 0
@property
def alphabets(self) -> List[str]:
def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
]
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> List[str]:
def could_be_from_charset(self) -> list[str]:
"""
The complete list of encoding that output the exact SAME str result and therefore could be the originating
encoding.
@ -221,10 +221,11 @@ class CharsetMatch:
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type]
m.groups()[0],
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
),
decoded_string[:8192],
1,
count=1,
)
decoded_string = patched_header + decoded_string[8192:]
@ -247,13 +248,13 @@ class CharsetMatches:
Act like a list(iterable) but does not implements all related methods.
"""
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []
def __init__(self, results: list[CharsetMatch] | None = None):
self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
@ -293,7 +294,7 @@ class CharsetMatches:
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> Optional["CharsetMatch"]:
def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
@ -301,7 +302,7 @@ class CharsetMatches:
return None
return self._results[0]
def first(self) -> Optional["CharsetMatch"]:
def first(self) -> CharsetMatch | None:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
@ -316,31 +317,31 @@ class CliDetectionResult:
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
encoding: str | None,
encoding_aliases: list[str],
alternative_encodings: list[str],
language: str,
alphabets: List[str],
alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
self.unicode_path: Optional[str] = unicode_path
self.encoding: Optional[str] = encoding
self.encoding_aliases: List[str] = encoding_aliases
self.alternative_encodings: List[str] = alternative_encodings
self.unicode_path: str | None = unicode_path
self.encoding: str | None = encoding
self.encoding_aliases: list[str] = encoding_aliases
self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
self.alphabets: List[str] = alphabets
self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,


@ -1,3 +1,5 @@
from __future__ import annotations
import importlib
import logging
import unicodedata
@ -5,9 +7,11 @@ from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union
from typing import Generator
from _multibytecodec import MultibyteIncrementalDecoder
from _multibytecodec import ( # type: ignore[import-not-found,import]
MultibyteIncrementalDecoder,
)
from .constant import (
ENCODING_MARKS,
@ -16,6 +20,7 @@ from .constant import (
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
COMMON_CJK_CHARACTERS,
)
@ -23,7 +28,7 @@ from .constant import (
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
@ -43,13 +48,13 @@ def remove_accent(character: str) -> str:
if not decomposed:
return character
codes: List[str] = decomposed.split(" ")
codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
@ -66,7 +71,7 @@ def unicode_range(character: str) -> Optional[str]:
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@ -78,7 +83,7 @@ def is_punctuation(character: str) -> bool:
if "P" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
character_range: str | None = unicode_range(character)
if character_range is None:
return False
@ -93,7 +98,7 @@ def is_symbol(character: str) -> bool:
if "S" in character_category or "N" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
character_range: str | None = unicode_range(character)
if character_range is None:
return False
@ -103,7 +108,7 @@ def is_symbol(character: str) -> bool:
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: Optional[str] = unicode_range(character)
character_range: str | None = unicode_range(character)
if character_range is None:
return False
@ -130,7 +135,7 @@ def is_case_variable(character: str) -> bool:
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@ -140,7 +145,7 @@ def is_cjk(character: str) -> bool:
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@ -150,7 +155,7 @@ def is_hiragana(character: str) -> bool:
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@ -160,7 +165,7 @@ def is_katakana(character: str) -> bool:
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@ -170,7 +175,7 @@ def is_hangul(character: str) -> bool:
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@ -180,7 +185,7 @@ def is_thai(character: str) -> bool:
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@ -190,12 +195,17 @@ def is_arabic(character: str) -> bool:
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
return character not in COMMON_CJK_CHARACTERS
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@ -206,13 +216,13 @@ def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? Its the ASCII substitute character.
and character != "\x1a" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
"""
@ -221,7 +231,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional
seq_len: int = len(sequence)
results: List[str] = findall(
results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
@ -260,18 +270,18 @@ def is_multi_byte_encoding(name: str) -> bool:
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
@ -288,6 +298,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
def iana_name(cp_name: str, strict: bool = True) -> str:
"""Returns the Python normalized encoding name (Not the IANA official name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
@ -298,35 +309,17 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
return encoding_iana
if strict:
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
def range_scan(decoded_sequence: str) -> List[str]:
ranges: Set[str] = set()
for character in decoded_sequence:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
ranges.add(character_range)
return list(ranges)
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
@ -374,7 +367,7 @@ def cut_sequence_chunks(
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: Optional[str] = None,
decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
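
Two of the helpers touched above are easy to sanity-check by hand: is_cjk_uncommon is a plain membership test against COMMON_CJK_CHARACTERS, and iana_name normalizes alternative spellings to Python codec names. A quick hedged check:

    from charset_normalizer.utils import iana_name, is_cjk_uncommon

    print(iana_name("UTF-8"))     # 'utf_8'
    print(is_cjk_uncommon("的"))   # False: listed among the common ideographs above
    print(is_cjk_uncommon("龘"))   # True expected: assumed absent from the combined common set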


@ -2,5 +2,7 @@
Expose version
"""
__version__ = "3.4.0"
from __future__ import annotations
__version__ = "3.4.2"
VERSION = __version__.split(".")