Mirror of https://github.com/Tautulli/Tautulli.git (synced 2025-07-07 05:31:15 -07:00)
Bump requests from 2.28.1 to 2.28.2 (#1968)
* Bump requests from 2.28.1 to 2.28.2

Bumps [requests](https://github.com/psf/requests) from 2.28.1 to 2.28.2.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.1...v2.28.2)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
parent 70e09582da
commit cc78f17be5
20 changed files with 527 additions and 302 deletions
@@ -21,14 +21,8 @@ at <https://github.com/Ousret/charset_normalizer>.
"""
import logging

from .api import from_bytes, from_fp, from_path, normalize
from .legacy import (
    CharsetDetector,
    CharsetDoctor,
    CharsetNormalizerMatch,
    CharsetNormalizerMatches,
    detect,
)
from .api import from_bytes, from_fp, from_path
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__

@@ -37,14 +31,9 @@ __all__ = (
    "from_fp",
    "from_path",
    "from_bytes",
    "normalize",
    "detect",
    "CharsetMatch",
    "CharsetMatches",
    "CharsetNormalizerMatch",
    "CharsetNormalizerMatches",
    "CharsetDetector",
    "CharsetDoctor",
    "__version__",
    "VERSION",
    "set_logging_handler",
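The hunks above show charset_normalizer 3.x trimming the package's top-level surface to from_bytes/from_fp/from_path plus the chardet-style detect. A minimal sketch against that reduced API (the sample payload is arbitrary):

    from charset_normalizer import from_bytes

    # Arbitrary Cyrillic sample, encoded as cp1251 purely for illustration.
    payload = "Всеки човек има право на образование.".encode("cp1251")

    best_guess = from_bytes(payload).best()  # CharsetMatch or None
    if best_guess is not None:
        print(best_guess.encoding)  # e.g. "cp1251" or a compatible code page
        print(str(best_guess))      # the payload decoded with the guessed encoding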
@@ -1,7 +1,5 @@
import logging
import warnings
from os import PathLike
from os.path import basename, splitext
from typing import Any, BinaryIO, List, Optional, Set

from .cd import (

@@ -41,11 +39,12 @@ def from_bytes(
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page

@@ -197,7 +196,14 @@ def from_bytes(
        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

@@ -297,7 +303,13 @@ def from_bytes(
            ):
                md_chunks.append(chunk)

                md_ratios.append(mess_ratio(chunk, threshold))
                md_ratios.append(
                    mess_ratio(
                        chunk,
                        threshold,
                        explain is True and 1 <= len(cp_isolation) <= 2,
                    )
                )

                if md_ratios[-1] >= threshold:
                    early_stop_count += 1

@@ -389,7 +401,9 @@ def from_bytes(
            if encoding_iana != "ascii":
                for chunk in md_chunks:
                    chunk_languages = coherence_ratio(
                        chunk, 0.1, ",".join(target_languages) if target_languages else None
                        chunk,
                        language_threshold,
                        ",".join(target_languages) if target_languages else None,
                    )

                    cd_ratios.append(chunk_languages)

@@ -491,6 +505,7 @@ def from_fp(
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
) -> CharsetMatches:
    """
    Same thing than the function from_bytes but using a file pointer that is already ready.

@@ -505,6 +520,7 @@ def from_fp(
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
    )


@@ -517,6 +533,7 @@ def from_path(
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
) -> CharsetMatches:
    """
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.

@@ -532,53 +549,5 @@ def from_path(
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
    )


def normalize(
    path: "PathLike[Any]",
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
) -> CharsetMatch:
    """
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    """
    warnings.warn(
        "normalize is deprecated and will be removed in 3.0",
        DeprecationWarning,
    )

    results = from_path(
        path,
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
    )

    filename = basename(path)
    target_extensions = list(splitext(filename))

    if len(results) == 0:
        raise IOError(
            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
                filename
            )
        )

    result = results.best()

    target_extensions[0] += "-" + result.encoding  # type: ignore

    with open(
        "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
    ) as fp:
        fp.write(result.output())  # type: ignore

    return result  # type: ignore
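The api.py hunks add a language_threshold parameter to from_bytes/from_fp/from_path (it replaces the hard-coded 0.1 previously passed to coherence_ratio) and drop the deprecated normalize() helper. A hedged sketch of the new signature in use; the file name is hypothetical:

    from charset_normalizer import from_path

    results = from_path(
        "./subtitles.srt",        # hypothetical input file
        steps=5,                  # number of chunks inspected
        chunk_size=512,           # bytes per chunk
        threshold=0.2,            # maximum tolerated mess ratio
        language_threshold=0.1,   # new: minimum coherence ratio kept per language
    )

    best = results.best()
    print(best.encoding if best else "undetected")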
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
from typing import Dict, List

# Language label that contain the em dash "—"
# character are to be considered alternative seq to origin
FREQUENCIES: Dict[str, List[str]] = {
    "English": [
        "e",

@@ -30,6 +32,34 @@ FREQUENCIES: Dict[str, List[str]] = {
        "z",
        "q",
    ],
    "English—": [
        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "m",
        "u", "f", "p", "g", "w", "b", "y", "v", "k", "j", "x", "z", "q",
    ],
    "German": [
        "e",
        "n",

@@ -226,33 +256,303 @@ FREQUENCIES: Dict[str, List[str]] = {
        "ж",
        "ц",
    ],
    # Jap-Kanji
    "Japanese": [
        "人", "一", "大", "亅", "丁", "丨", "竹", "笑", "口", "日", "今", "二", "彳", "行", "十",
        "土", "丶", "寸", "寺", "時", "乙", "丿", "乂", "气", "気", "冂", "巾", "亠", "市", "目",
        "儿", "見", "八", "小", "凵", "県", "月", "彐", "門", "間", "木", "東", "山", "出", "本",
        "中", "刀", "分", "耳", "又", "取", "最", "言", "田", "心", "思", "刂", "前", "京", "尹",
        "事", "生", "厶", "云", "会", "未", "来", "白", "冫", "楽", "灬", "馬", "尸", "尺", "駅",
        "明", "耂", "者", "了", "阝", "都", "高", "卜", "占", "厂", "广", "店", "子", "申", "奄",
        "亻", "俺", "上", "方", "冖", "学", "衣", "艮", "食", "自",
    ],
    # Jap-Katakana
    "Japanese—": [
        "ー", "ン", "ス", "・", "ル", "ト", "リ", "イ", "ア", "ラ", "ッ", "ク", "ド", "シ", "レ",
        "ジ", "タ", "フ", "ロ", "カ", "テ", "マ", "ィ", "グ", "バ", "ム", "プ", "オ", "コ", "デ",
        "ニ", "ウ", "メ", "サ", "ビ", "ナ", "ブ", "ャ", "エ", "ュ", "チ", "キ", "ズ", "ダ", "パ",
        "ミ", "ェ", "ョ", "ハ", "セ", "ベ", "ガ", "モ", "ツ", "ネ", "ボ", "ソ", "ノ", "ァ", "ヴ",
        "ワ", "ポ", "ペ", "ピ", "ケ", "ゴ", "ギ", "ザ", "ホ", "ゲ", "ォ", "ヤ", "ヒ", "ユ", "ヨ",
        "ヘ", "ゼ", "ヌ", "ゥ", "ゾ", "ヶ", "ヂ", "ヲ", "ヅ", "ヵ", "ヱ", "ヰ", "ヮ", "ヽ", "゠",
        "ヾ", "ヷ", "ヿ", "ヸ", "ヹ", "ヺ",
    ],
    # Jap-Hiragana
    "Japanese——": [
        "の", "に", "る", "た", "は", "ー", "と", "は", "し", "い", "を", "で", "て", "が", "い",
        "ン", "れ", "な", "年", "ス", "っ", "ル", "れ", "か", "ら", "あ", "さ", "も", "っ", "り",
        "す", "あ", "も", "こ", "ま", "う", "く", "よ", "き", "ん", "め", "お", "け", "そ", "つ",
        "だ", "や", "え", "ど", "わ", "ち", "み", "せ", "じ", "ば", "へ", "び", "ず", "ろ", "ほ",
        "げ", "む", "べ", "ひ", "ょ", "ゆ", "ぶ", "ご", "ゃ", "ね", "ふ", "ぐ", "ぎ", "ぼ", "ゅ",
        "づ", "ざ", "ぞ", "ぬ", "ぜ", "ぱ", "ぽ", "ぷ", "ぴ", "ぃ", "ぁ", "ぇ", "ぺ", "ゞ", "ぢ",
        "ぉ", "ぅ", "ゐ", "ゝ", "ゑ", "゛", "゜", "ゎ", "ゔ", "゚", "ゟ", "゙", "ゕ", "ゖ",
    ],
    "Portuguese": [
        "a",

@@ -340,6 +640,77 @@ FREQUENCIES: Dict[str, List[str]] = {
        "就", "出", "会", "可", "也", "你", "对", "生", "能", "而", "子", "那", "得", "于", "着",
        "下", "自", "之", "年", "过", "发", "后", "作", "里", "用", "道", "行", "所", "然", "家",
        "种", "事", "成", "方", "多", "经", "么", "去", "法", "学", "如", "都", "同", "现", "当",
        "没", "动", "面", "起", "看", "定", "天", "分", "还", "进", "好", "小", "部", "其", "些",
        "主", "样", "理", "心", "她", "本", "前", "开", "但", "因", "只", "从", "想", "实",
    ],
    "Ukrainian": [
        "о",

@@ -956,34 +1327,6 @@ FREQUENCIES: Dict[str, List[str]] = {
        "ö",
        "y",
    ],
    "Simple English": [
        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "m",
        "u", "f", "p", "g", "w", "b", "y", "v", "k", "j", "x", "z", "q",
    ],
    "Thai": [
        "า",
        "น",

@@ -1066,31 +1409,6 @@ FREQUENCIES: Dict[str, List[str]] = {
        "ஒ",
        "ஸ",
    ],
    "Classical Chinese": [
        "之", "年", "為", "也", "以", "一", "人", "其", "者", "國", "有", "二",
        "十", "於", "曰", "三", "不", "大", "而", "子", "中", "五", "四",
    ],
    "Kazakh": [
        "а",
        "ы",
@@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
    ):
        return ["Japanese"]
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese", "Classical Chinese"]
        return ["Chinese"]
    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

@@ -179,22 +179,45 @@ def characters_popularity_compare(
    character_approved_count: int = 0
    FREQUENCIES_language_set = set(FREQUENCIES[language])

    for character in ordered_characters:
    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    large_alphabet: bool = target_language_characters_count > 26

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in FREQUENCIES_language_set:
            continue

        character_rank_in_language: int = FREQUENCIES[language].index(character)
        expected_projection_ratio: float = (
            target_language_characters_count / ordered_characters_count
        )
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = FREQUENCIES[language][
            0 : FREQUENCIES[language].index(character)
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = FREQUENCIES[language][
            FREQUENCIES[language].index(character) :
        ]
        characters_before: List[str] = ordered_characters[
            0 : ordered_characters.index(character)
        ]
        characters_after: List[str] = ordered_characters[
            ordered_characters.index(character) :
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)

@@ -289,6 +312,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    return sorted(merge, key=lambda x: x[1], reverse=True)


def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    index_results: Dict[str, List[float]] = dict()

    for result in results:
        language, ratio = result
        no_em_name: str = language.replace("—", "")

        if no_em_name not in index_results:
            index_results[no_em_name] = []

        index_results[no_em_name].append(ratio)

    if any(len(index_results[e]) > 1 for e in index_results):
        filtered_results: CoherenceMatches = []

        for language in index_results:
            filtered_results.append((language, max(index_results[language])))

        return filtered_results

    return results


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None

@@ -336,4 +386,6 @@ def coherence_ratio(
        if sufficient_match_count >= 3:
            break

    return sorted(results, key=lambda x: x[1], reverse=True)
    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
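The new filter_alt_coherence_matches() above folds em-dash variants such as "English—" back into their base language, keeping only the best ratio. A small standalone illustration with made-up ratios (not the library code itself):

    # ("language", ratio) pairs as coherence_ratio() would produce them.
    results = [("English", 0.42), ("English—", 0.57), ("German", 0.31)]

    merged = {}
    for language, ratio in results:
        base = language.replace("—", "")  # strip the alternative-frequency marker
        merged[base] = max(merged.get(base, 0.0), ratio)

    print(sorted(merged.items(), key=lambda x: x[1], reverse=True))
    # [('English', 0.57), ('German', 0.31)], the same outcome the helper returns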
@@ -1,15 +1,12 @@
import argparse
import sys
from json import dumps
from os.path import abspath
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version

try:
    from unicodedata2 import unidata_version
except ImportError:
    from unicodedata import unidata_version

import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__

@@ -124,8 +121,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
            __version__, python_version(), unidata_version
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

@@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                my_file.close()
            continue

        o_: List[str] = my_file.name.split(".")
        dir_path = dirname(realpath(my_file.name))
        file_name = basename(realpath(my_file.name))

        o_: List[str] = file_name.split(".")

        if args.replace is False:
            o_.insert(-1, best_guess.encoding)

@@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                continue

            try:
                x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
                x_[0].unicode_path = join(dir_path, ".".join(o_))

                with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
                    fp.write(str(best_guess))
@@ -489,9 +489,7 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}

NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")

LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)

# Logging LEVEL bellow DEBUG
# Logging LEVEL below DEBUG
TRACE: int = 5
@@ -1,9 +1,7 @@
import warnings
from typing import Dict, Optional, Union

from .api import from_bytes, from_fp, from_path, normalize
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches


def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:

@@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
        "language": language,
        "confidence": confidence,
    }


class CharsetNormalizerMatch(CharsetMatch):
    pass


class CharsetNormalizerMatches(CharsetMatches):
    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)  # pragma: nocover


class CharsetDetector(CharsetNormalizerMatches):
    pass


class CharsetDoctor(CharsetNormalizerMatches):
    pass
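After this change legacy.py keeps only the chardet-compatible detect() wrapper; the CharsetNormalizerMatch/CharsetNormalizerMatches/CharsetDetector/CharsetDoctor aliases are gone. A quick sketch of the surviving entry point (the sample bytes are arbitrary and the exact confidence will vary):

    from charset_normalizer import detect

    result = detect("Bonjour, où êtes-vous ?".encode("utf_8"))
    # Chardet-style mapping, e.g. {'encoding': 'utf_8', 'language': 'French', 'confidence': 1.0}
    print(result["encoding"], result["language"], result["confidence"])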
@@ -1,7 +1,12 @@
from functools import lru_cache
from logging import getLogger
from typing import List, Optional

from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_ascii,

@@ -123,7 +128,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
        if self._character_count == 0 or self._character_count < 8:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

@@ -547,7 +552,20 @@ def mess_ratio(
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
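The mess_ratio() hunk replaces the old print() dump with records logged at the new TRACE level (5), and, per the api.py hunk earlier, that extended dump is only produced when explain=True is combined with a cp_isolation list of one or two code pages. A minimal sketch for surfacing those records with the standard logging module:

    import logging

    from charset_normalizer import from_bytes

    # TRACE (5) sits below DEBUG, so opt in explicitly on the library logger.
    logger = logging.getLogger("charset_normalizer")
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(5)

    payload = "très à propos".encode("cp1252")  # arbitrary sample
    from_bytes(payload, explain=True, cp_isolation=["cp1252"])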
@@ -1,22 +1,9 @@
import warnings
from collections import Counter
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import (
    Any,
    Counter as TypeCounter,
    Dict,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
)
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
from .md import mess_ratio
from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


@@ -65,7 +52,7 @@ class CharsetMatch:
        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Bellow 1% difference --> Use Coherence
        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            # When having a tough decision, use the result that decoded as many multi-byte as possible.
            if chaos_difference == 0.0 and self.coherence == other.coherence:

@@ -78,45 +65,6 @@ class CharsetMatch:
    def multi_byte_usage(self) -> float:
        return 1.0 - len(str(self)) / len(self.raw)

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return 0.0

    @property
    def w_counter(self) -> TypeCounter[str]:
        """
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
        )

        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:

@@ -252,18 +200,6 @@ class CharsetMatch:
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
@@ -1,12 +1,6 @@
try:
    # WARNING: unicodedata2 support is going to be removed in 3.0
    # Python is quickly catching up.
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache

@@ -402,7 +396,7 @@ def cut_sequence_chunks(

        # multi-byte bad cutting detector and adjustment
        # not the cleanest way to perform that fix but clever enough for now.
        if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
        if is_multi_byte_decoder and i > 0:

            chunk_partial_size_chk: int = min(chunk_size, 16)
@@ -2,5 +2,5 @@
Expose version
"""

__version__ = "2.1.1"
__version__ = "3.0.1"
VERSION = __version__.split(".")
@@ -80,8 +80,8 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver
    elif charset_normalizer_version:
        major, minor, patch = charset_normalizer_version.split(".")[:3]
        major, minor, patch = int(major), int(minor), int(patch)
        # charset_normalizer >= 2.0.0 < 3.0.0
        assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0)
        # charset_normalizer >= 2.0.0 < 4.0.0
        assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)
    else:
        raise Exception("You need either charset_normalizer or chardet installed")
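The requests hunk widens the accepted charset_normalizer range from <3.0.0 to <4.0.0; the check itself is a plain tuple comparison. The same test in isolation:

    import charset_normalizer

    major, minor, patch = (int(x) for x in charset_normalizer.__version__.split(".")[:3])

    # Mirrors the widened assertion in requests' check_compatibility().
    assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0), "unsupported charset_normalizer version"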
@@ -5,10 +5,10 @@
__title__ = "requests"
__description__ = "Python HTTP for Humans."
__url__ = "https://requests.readthedocs.io"
__version__ = "2.28.1"
__build__ = 0x022801
__version__ = "2.28.2"
__build__ = 0x022802
__author__ = "Kenneth Reitz"
__author_email__ = "me@kennethreitz.org"
__license__ = "Apache 2.0"
__copyright__ = "Copyright 2022 Kenneth Reitz"
__copyright__ = "Copyright Kenneth Reitz"
__cake__ = "\u2728 \U0001f370 \u2728"
@@ -438,7 +438,7 @@ class PreparedRequest(RequestEncodingMixin, RequestHooksMixin):
        if not scheme:
            raise MissingSchema(
                f"Invalid URL {url!r}: No scheme supplied. "
                f"Perhaps you meant http://{url}?"
                f"Perhaps you meant https://{url}?"
            )

        if not host:
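The requests/models hunk only changes the hint in the MissingSchema message from http:// to https://; the exception is raised exactly as before. For reference (the URL is an arbitrary example):

    import requests
    from requests.exceptions import MissingSchema

    try:
        requests.get("example.com/api")  # no scheme supplied
    except MissingSchema as exc:
        print(exc)  # "...No scheme supplied. Perhaps you meant https://example.com/api?"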
@@ -1,2 +1,2 @@
# This file is protected via CODEOWNERS
__version__ = "1.26.13"
__version__ = "1.26.14"
@@ -224,7 +224,7 @@ class AppEngineManager(RequestMethods):
            )

            # Check if we should retry the HTTP response.
            has_retry_after = bool(http_response.getheader("Retry-After"))
            has_retry_after = bool(http_response.headers.get("Retry-After"))
            if retries.is_retry(method, http_response.status, has_retry_after):
                retries = retries.increment(method, url, response=http_response, _pool=self)
                log.debug("Retry: %s", url)
@@ -69,7 +69,7 @@ class NTLMConnectionPool(HTTPSConnectionPool):
        log.debug("Request headers: %s", headers)
        conn.request("GET", self.authurl, None, headers)
        res = conn.getresponse()
        reshdr = dict(res.getheaders())
        reshdr = dict(res.headers)
        log.debug("Response status: %s %s", res.status, res.reason)
        log.debug("Response headers: %s", reshdr)
        log.debug("Response data: %s [...]", res.read(100))

@@ -101,7 +101,7 @@ class NTLMConnectionPool(HTTPSConnectionPool):
        conn.request("GET", self.authurl, None, headers)
        res = conn.getresponse()
        log.debug("Response status: %s %s", res.status, res.reason)
        log.debug("Response headers: %s", dict(res.getheaders()))
        log.debug("Response headers: %s", dict(res.headers))
        log.debug("Response data: %s [...]", res.read()[:100])
        if res.status != 200:
            if res.status == 401:
@@ -666,7 +666,7 @@ class HTTPResponse(io.IOBase):
    def getheaders(self):
        warnings.warn(
            "HTTPResponse.getheaders() is deprecated and will be removed "
            "in urllib3 v2.1.0. Instead access HTTResponse.headers directly.",
            "in urllib3 v2.1.0. Instead access HTTPResponse.headers directly.",
            category=DeprecationWarning,
            stacklevel=2,
        )

@@ -675,7 +675,7 @@ class HTTPResponse(io.IOBase):
    def getheader(self, name, default=None):
        warnings.warn(
            "HTTPResponse.getheader() is deprecated and will be removed "
            "in urllib3 v2.1.0. Instead use HTTResponse.headers.get(name, default).",
            "in urllib3 v2.1.0. Instead use HTTPResponse.headers.get(name, default).",
            category=DeprecationWarning,
            stacklevel=2,
        )
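Both urllib3 hunks move internal callers off the now-deprecated HTTPResponse.getheader()/getheaders() and onto the headers attribute. Equivalent caller-side usage (the URL is an arbitrary example):

    import urllib3

    http = urllib3.PoolManager()
    resp = http.request("GET", "https://httpbin.org/get")

    # Preferred spelling: read the HTTPHeaderDict instead of calling resp.getheader(...).
    print(resp.headers.get("Content-Type"))
    print(resp.headers.get("Retry-After"))  # None when the header is absent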
@@ -63,7 +63,7 @@ IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")

_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*([0-9]{0,5}))?$") % (
_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
    REG_NAME_PAT,
    IPV4_PAT,
    IPV6_ADDRZ_PAT,
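The _HOST_PORT_PAT change reworks the optional port group: leading zeros are still stripped by the now lazy 0*? prefix, but an explicit port 0 is captured as "0" instead of collapsing into an empty match. A standalone re sketch with a simplified host pattern (the real pattern splices in REG_NAME_PAT, IPV4_PAT and IPV6_ADDRZ_PAT):

    import re

    # Simplified stand-in for _HOST_PORT_PAT, reusing the new port group verbatim.
    HOST_PORT = re.compile(r"^([^:]+)(?::0*?(|0|[1-9][0-9]{0,4}))?$")

    for candidate in ("example.com:8080", "example.com:0080", "example.com:0", "example.com"):
        match = HOST_PORT.match(candidate)
        print(candidate, "->", match.group(2) if match else None)
    # Captured ports: 8080, 80, 0, None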
@@ -36,7 +36,7 @@ pyparsing==3.0.9
python-dateutil==2.8.2
python-twitter==3.5
pytz==2022.7
requests==2.28.1
requests==2.28.2
requests-oauthlib==1.3.1
rumps==0.4.0; platform_system == "Darwin"
simplejson==3.18.0