mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-08-20 21:33:18 -07:00)

Update plexapi==4.8.0

This commit is contained in:
parent 36b55398a8
commit 3a50981976

20 changed files with 522 additions and 314 deletions

@@ -19,6 +19,8 @@ at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
    CharsetDetector,

@@ -28,6 +30,7 @@ from .legacy import (
    detect,
)
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

__all__ = (

@@ -44,4 +47,10 @@ __all__ = (
    "CharsetDoctor",
    "__version__",
    "VERSION",
    "set_logging_handler",
)

# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library

logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
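
For reference, set_logging_handler (newly exported here and defined in the utils hunk further down) is the one-call way to attach a StreamHandler now that the package itself only installs a NullHandler. A minimal sketch using only the defaults visible in this commit; the sample payload and the .best() accessor are illustrative, not part of this diff:

# --- illustrative sketch, not part of the commit ---
import logging

from charset_normalizer import from_bytes, set_logging_handler

# Attach a StreamHandler using the library's own default format string.
set_logging_handler("charset_normalizer", level=logging.DEBUG)

best_guess = from_bytes("Bonjour, le monde!".encode("utf_8")).best()
print(best_guess.encoding if best_guess else None)
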

@@ -1,3 +1,4 @@
import logging
from os.path import basename, splitext
from typing import BinaryIO, List, Optional, Set

@@ -6,8 +7,6 @@ try:
except ImportError:  # pragma: no cover
    PathLike = str  # type: ignore

import logging

from .cd import (
    coherence_ratio,
    encoding_languages,

@@ -27,11 +26,10 @@ from .utils import (
)

logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.DEBUG)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
logger.addHandler(handler)
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(

@@ -57,6 +55,9 @@ def from_bytes(
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    """

    if not isinstance(sequences, (bytearray, bytes)):

@@ -66,10 +67,8 @@ def from_bytes(
            )
        )

    if not explain:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(logging.INFO)
    if explain:
        logger.addHandler(explain_handler)

    length = len(sequences)  # type: int

@@ -77,6 +76,8 @@ def from_bytes(
        logger.warning(
            "Given content is empty, stopping the process very early, returning empty utf_8 str match"
        )
        if explain:
            logger.removeHandler(explain_handler)
        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])

    if cp_isolation is not None:

@@ -131,7 +132,7 @@ def from_bytes(
    prioritized_encodings = []  # type: List[str]

    specified_encoding = (
        any_specified_encoding(sequences) if preemptive_behaviour is True else None
        any_specified_encoding(sequences) if preemptive_behaviour else None
    )  # type: Optional[str]

    if specified_encoding is not None:

@@ -185,7 +186,7 @@ def from_bytes(
            encoding_iana
        )  # type: bool

        if encoding_iana in {"utf_16", "utf_32"} and bom_or_sig_available is False:
        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.info(
                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,

@@ -241,7 +242,7 @@ def from_bytes(
            continue

        r_ = range(
            0 if bom_or_sig_available is False else len(sig_payload),
            0 if not bom_or_sig_available else len(sig_payload),
            length,
            int(length / steps),
        )

@@ -261,29 +262,40 @@ def from_bytes(
        max_chunk_gave_up = int(len(r_) / 4)  # type: int

        if max_chunk_gave_up < 2:
            max_chunk_gave_up = 2

        max_chunk_gave_up = max(max_chunk_gave_up, 2)
        early_stop_count = 0  # type: int

        md_chunks = []  # type: List[str]
        md_ratios = []

        for i in r_:
            if i + chunk_size > length + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(encoding_iana, errors="ignore")  # type: str
            try:
                chunk = cut_sequence.decode(
                    encoding_iana,
                    errors="ignore" if is_multi_byte_decoder else "strict",
                )  # type: str
            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
                logger.warning(
                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                early_stop_count = max_chunk_gave_up
                break

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:

                chunk_partial_size_chk = (
                    16 if chunk_size > 16 else chunk_size
                )  # type: int
                chunk_partial_size_chk = min(chunk_size, 16)  # type: int

                if (
                    decoded_payload

@@ -312,11 +324,9 @@ def from_bytes(
            ):
                break

        if md_ratios:
            mean_mess_ratio = sum(md_ratios) / len(md_ratios)  # type: float
        else:
            mean_mess_ratio = 0.0

        mean_mess_ratio = (
            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
        )  # type: float
        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
            tested_but_soft_failure.append(encoding_iana)
            logger.warning(

@@ -375,6 +385,20 @@ def from_bytes(
                    )
                )

        # We might want to check the sequence again with the whole content
        # Only if initial MD/CD tests passes
        if is_too_large_sequence and not is_multi_byte_decoder:
            try:
                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
            except UnicodeDecodeError as e:
                logger.warning(
                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
                tested_but_hard_failure.append(encoding_iana)
                continue

        results.append(
            CharsetMatch(
                sequences,

@@ -393,6 +417,8 @@ def from_bytes(
            logger.info(
                "%s is most likely the one. Stopping the process.", encoding_iana
            )
            if explain:
                logger.removeHandler(explain_handler)
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:

@@ -400,6 +426,8 @@ def from_bytes(
                "%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.",
                encoding_iana,
            )
            if explain:
                logger.removeHandler(explain_handler)
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:

@@ -428,6 +456,9 @@ def from_bytes(
            logger.warning("ascii will be used as a fallback match")
            results.append(fallback_ascii)

    if explain:
        logger.removeHandler(explain_handler)

    return results
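
The explain toggle in from_bytes now temporarily attaches the dedicated explain_handler and removes it on every return path, instead of permanently flipping the module logger level. A small sketch of the caller-side effect; the payload is illustrative, and iteration plus the .encoding attribute are taken from the models and CLI hunks below:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer import from_bytes

payload = "Comment ça va ?".encode("cp1252")

# With explain=True the library attaches the StreamHandler configured in this
# hunk, logs each tested encoding, then detaches the handler before returning.
for match in from_bytes(payload, explain=True):
    print(match.encoding)
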

@@ -5,7 +5,7 @@ from functools import lru_cache
from typing import Dict, List, Optional, Tuple

from .assets import FREQUENCIES
from .constant import KO_NAMES, TOO_SMALL_SEQUENCE, ZH_NAMES
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (

@@ -110,6 +110,23 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
    return []


@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    target_have_accents = False  # type: bool
    target_pure_latin = True  # type: bool

    for character in FREQUENCIES[language]:
        if not target_have_accents and is_accentuated(character):
            target_have_accents = True
        if target_pure_latin and is_latin(character) is False:
            target_pure_latin = False

    return target_have_accents, target_pure_latin


def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:

@@ -118,23 +135,11 @@ def alphabet_languages(
    """
    languages = []  # type: List[Tuple[str, float]]

    source_have_accents = False  # type: bool

    for character in characters:
        if is_accentuated(character):
            source_have_accents = True
            break
    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():

        target_have_accents = False  # type: bool
        target_pure_latin = True  # type: bool

        for language_character in language_characters:
            if target_have_accents is False and is_accentuated(language_character):
                target_have_accents = True
            if target_pure_latin is True and is_latin(language_character) is False:
                target_pure_latin = False
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

@@ -263,8 +268,6 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    The return type is the same as coherence_ratio.
    """
    per_language_ratios = OrderedDict()  # type: Dict[str, List[float]]
    merge = []  # type: CoherenceMatches

    for result in results:
        for sub_result in result:
            language, ratio = sub_result

@@ -273,17 +276,16 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
                continue
            per_language_ratios[language].append(ratio)

    for language in per_language_ratios:
        merge.append(
            (
                language,
                round(
                    sum(per_language_ratios[language])
                    / len(per_language_ratios[language]),
                    4,
                ),
            )
    merge = [
        (
            language,
            round(
                sum(per_language_ratios[language]) / len(per_language_ratios[language]),
                4,
            ),
        )
        for language in per_language_ratios
    ]

    return sorted(merge, key=lambda x: x[1], reverse=True)

@@ -298,14 +300,11 @@ def coherence_ratio(
    """

    results = []  # type: List[Tuple[str, float]]
    lg_inclusion_list = []  # type: List[str]
    ignore_non_latin = False  # type: bool

    sufficient_match_count = 0  # type: int

    if lg_inclusion is not None:
        lg_inclusion_list = lg_inclusion.split(",")

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

@@ -314,7 +313,7 @@ def coherence_ratio(
        sequence_frequencies = Counter(layer)  # type: Counter
        most_common = sequence_frequencies.most_common()

        character_count = sum([o for c, o in most_common])  # type: int
        character_count = sum(o for c, o in most_common)  # type: int

        if character_count <= TOO_SMALL_SEQUENCE:
            continue
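
The accent/pure-Latin scan that alphabet_languages used to repeat inline is now factored into get_target_features and memoized with lru_cache (bounded by the new LANGUAGE_SUPPORTED_COUNT constant), so each language's FREQUENCIES entry is walked at most once per process. A rough sketch of the effect; the "English" key is an assumption about FREQUENCIES:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.cd import get_target_features

# First call walks FREQUENCIES["English"]; repeated calls hit the cache.
have_accents, pure_latin = get_target_features("English")
print(have_accents, pure_latin)
print(get_target_features.cache_info())  # hits grow on repeated calls
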

@@ -235,20 +235,19 @@ def cli_detect(argv: List[str] = None) -> int:
                o_.insert(-1, best_guess.encoding)
                if my_file.closed is False:
                    my_file.close()
            else:
                if (
                    args.force is False
                    and query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                    is False
                ):
                    if my_file.closed is False:
                        my_file.close()
                    continue
            elif (
                args.force is False
                and query_yes_no(
                    'Are you sure to normalize "{}" by replacing it ?'.format(
                        my_file.name
                    ),
                    "no",
                )
                is False
            ):
                if my_file.closed is False:
                    my_file.close()
                continue

            try:
                x_[0].unicode_path = abspath("./{}".format(".".join(o_)))

@@ -277,7 +276,7 @@ def cli_detect(argv: List[str] = None) -> int:
            print(
                ", ".join(
                    [
                        el.encoding if el.encoding else "undefined"
                        el.encoding or "undefined"
                        for el in x_
                        if el.path == abspath(my_file.name)
                    ]

@@ -4,6 +4,8 @@ from encodings.aliases import aliases
from re import IGNORECASE, compile as re_compile
from typing import Dict, List, Set, Union

from .assets import FREQUENCIES

# Contain for each eligible encoding a list of/item bytes SIG/BOM
ENCODING_MARKS = OrderedDict(
    [

@@ -30,7 +32,7 @@ TOO_BIG_SEQUENCE = int(10e6) # type: int
UTF8_MAXIMAL_ALLOCATION = 1112064  # type: int

UNICODE_RANGES_COMBINED = {
    "Control character": range(0, 31 + 1),
    "Control character": range(31 + 1),
    "Basic Latin": range(32, 127 + 1),
    "Latin-1 Supplement": range(128, 255 + 1),
    "Latin Extended-A": range(256, 383 + 1),

@@ -311,6 +313,7 @@ UNICODE_RANGES_COMBINED = {
    "Variation Selectors Supplement": range(917760, 917999 + 1),
}  # type: Dict[str, range]


UNICODE_SECONDARY_RANGE_KEYWORD = [
    "Supplement",
    "Extended",

@@ -352,11 +355,10 @@ IANA_SUPPORTED_SIMILAR = {
    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
    "cp1250": ["iso8859_2"],
    "cp1251": ["kz1048", "ptcp154"],
    "cp1252": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1253": ["iso8859_7"],
    "cp1254": ["cp1258", "iso8859_15", "iso8859_9", "latin_1"],
    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
    "cp1257": ["iso8859_13"],
    "cp1258": ["cp1252", "cp1254", "iso8859_9", "latin_1"],
    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],

@@ -494,3 +496,5 @@ KO_NAMES = {"johab", "cp949", "euc_kr"} # type: Set[str]
ZH_NAMES = {"big5", "cp950", "big5hkscs", "hz"}  # type: Set[str]

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

LANGUAGE_SUPPORTED_COUNT = len(FREQUENCIES)  # type: int

@@ -40,11 +40,11 @@ class MessDetectorPlugin:
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError  # pragma: nocover
        raise NotImplementedError

    @property
    def ratio(self) -> float:

@@ -85,7 +85,7 @@ class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):

        self._last_printable_char = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

@@ -116,7 +116,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

@@ -147,7 +147,7 @@ class UnprintablePlugin(MessDetectorPlugin):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

    @property

@@ -170,18 +170,19 @@ class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):

    def feed(self, character: str) -> None:
        self._character_count += 1
        if self._last_latin_character is not None:
            if is_accentuated(character) and is_accentuated(self._last_latin_character):
                if character.isupper() and self._last_latin_character.isupper():
                    self._successive_count += 1
                # Worse if its the same char duplicated with different accent.
                if remove_accent(character) == remove_accent(
                    self._last_latin_character
                ):
                    self._successive_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

@@ -228,7 +229,7 @@ class SuspiciousRange(MessDetectorPlugin):

        self._last_printable_seen = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None
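
Each of the reset() overrides above (now marked "# pragma: no cover") implements the MessDetectorPlugin contract: eligible()/feed()/reset() plus a ratio property, with mess_ratio discovering plugins through MessDetectorPlugin.__subclasses__() as shown further down. A bare-bones sketch of what a hypothetical extra plugin would look like under that contract; the class and its heuristic are illustrative only:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.md import MessDetectorPlugin


class TooManyQuestionMarksPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count = 0  # type: int
        self._question_mark_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if character == "?":
            self._question_mark_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._question_mark_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return self._question_mark_count / self._character_count

Because discovery is subclass-based, merely importing a module that defines such a class would make mess_ratio instantiate it on the next call.
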

@@ -252,6 +253,8 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count = 0  # type: int
        self._bad_word_count = 0  # type: int
        self._foreign_long_count = 0  # type: int

        self._is_current_word_bad = False  # type: bool
        self._foreign_long_watch = False  # type: bool

@@ -271,7 +274,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and is_latin(character) is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False

@@ -290,9 +293,16 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):

            self._character_count += buffer_length

            if buffer_length >= 4 and self._buffer_accent_count / buffer_length > 0.34:
                self._is_current_word_bad = True
            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:

@@ -311,7 +321,7 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
            self._is_current_word_bad = True
        self._buffer += character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False

@@ -319,10 +329,11 @@ class SuperWeirdWordPlugin(MessDetectorPlugin):
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        if self._word_count <= 10:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count

@@ -342,13 +353,13 @@ class CjkInvalidStopPlugin(MessDetectorPlugin):
        return True

    def feed(self, character: str) -> None:
        if character in ["丅", "丄"]:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

@@ -418,7 +429,7 @@ class ArchaicUpperLowerPlugin(MessDetectorPlugin):
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:
    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0

@@ -453,6 +464,13 @@ def is_suspiciously_successive_range(
    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

@@ -472,11 +490,12 @@ def is_suspiciously_successive_range(
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if range_a_jp_chars or range_b_jp_chars:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if range_a_jp_chars and range_b_jp_chars:
            return False
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:

@@ -509,7 +528,7 @@ def mess_ratio(
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]  # type: List[MessDetectorPlugin]

    length = len(decoded_sequence)  # type: int
    length = len(decoded_sequence) + 1  # type: int

    mean_mess_ratio = 0.0  # type: float

@@ -520,7 +539,7 @@ def mess_ratio(
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence, range(0, length)):
    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

@@ -528,7 +547,7 @@ def mess_ratio(
        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum([dt.ratio for dt in detectors])
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break
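
mess_ratio now iterates over decoded_sequence + "\n" (with length bumped by one), presumably so word-oriented plugins such as SuperWeirdWordPlugin flush their last buffered word even when the text does not end with a separator. A rough comparison sketch, assuming the single-argument call of the released 2.0.x signature; the sample strings are made up:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.md import mess_ratio

clean = "Simple sentence, nothing unusual here."
mojibake = "ÃƒÂ©Ã‚Â garbled Ã‚Â» text"

# Lower is better; text decoded with the wrong code page typically scores higher.
print(round(mess_ratio(clean), 3))
print(round(mess_ratio(mojibake), 3))
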

@@ -284,8 +284,7 @@ class CharsetMatches:
        self._results = sorted(results) if results else []  # type: List[CharsetMatch]

    def __iter__(self) -> Iterator[CharsetMatch]:
        for result in self._results:
            yield result
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """

@@ -4,6 +4,7 @@ except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache

@@ -122,7 +123,7 @@ def is_emoticon(character: str) -> bool:

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category = unicodedata.category(character)  # type: str

@@ -138,7 +139,7 @@ def is_case_variable(character: str) -> bool:
def is_private_use_only(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    return "Co" == character_category
    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

@@ -193,11 +194,7 @@ def is_thai(character: str) -> bool:

@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:

@@ -211,9 +208,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional

    results = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: seq_len if seq_len <= search_zone else search_zone].decode(
            "ascii", errors="ignore"
        ),
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )  # type: List[str]

    if len(results) == 0:

@@ -278,7 +273,7 @@ def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name == encoding_alias or cp_name == encoding_iana:
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:

@@ -314,7 +309,7 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:

    character_match_count = 0  # type: int

    for i in range(0, 255):
    for i in range(255):
        to_be_decoded = bytes([i])  # type: bytes
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

@@ -331,3 +326,17 @@ def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:

    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
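
any_specified_encoding, whose search-zone slicing is simplified above to min(seq_len, search_zone), is what the preemptive_behaviour path of from_bytes uses to honour an encoding declared inside the payload itself. A small sketch; the XML sample and the exact IANA spelling returned are assumptions:

# --- illustrative sketch, not part of the commit ---
from charset_normalizer.utils import any_specified_encoding

payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><root/>'

# Scans at most search_zone bytes for a declared encoding and returns the
# normalized codec name (e.g. "latin_1"), or None when nothing is declared.
print(any_specified_encoding(payload))
print(any_specified_encoding(b"no declaration here"))  # None
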

@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.0.7"
__version__ = "2.0.8"
VERSION = __version__.split(".")