Mirror of https://github.com/Tautulli/Tautulli.git, synced 2025-07-07 05:31:15 -07:00
Bump requests from 2.28.1 to 2.28.2 (#1968)
* Bump requests from 2.28.1 to 2.28.2

Bumps [requests](https://github.com/psf/requests) from 2.28.1 to 2.28.2.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.1...v2.28.2)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in: parent 70e09582da, commit cc78f17be5.
20 changed files with 527 additions and 302 deletions.
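The diff below covers the vendored dependency bumps: requests 2.28.1 → 2.28.2, the bundled charset_normalizer 2.1.1 → 3.0.1, and urllib3 1.26.13 → 1.26.14. As a quick sanity check after pulling this commit (a minimal sketch, not part of the change; it assumes the bundled packages resolve as ordinary top-level imports):

```python
# Hedged sanity check: confirm the versions this commit ships.
import requests
import urllib3
import charset_normalizer

print(requests.__version__)            # expected: 2.28.2
print(urllib3.__version__)             # expected: 1.26.14
print(charset_normalizer.__version__)  # expected: 3.0.1
```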
@@ -21,14 +21,8 @@ at <https://github.com/Ousret/charset_normalizer>.
 """
 import logging
 
-from .api import from_bytes, from_fp, from_path, normalize
-from .legacy import (
-    CharsetDetector,
-    CharsetDoctor,
-    CharsetNormalizerMatch,
-    CharsetNormalizerMatches,
-    detect,
-)
+from .api import from_bytes, from_fp, from_path
+from .legacy import detect
 from .models import CharsetMatch, CharsetMatches
 from .utils import set_logging_handler
 from .version import VERSION, __version__

@@ -37,14 +31,9 @@ __all__ = (
     "from_fp",
     "from_path",
     "from_bytes",
-    "normalize",
     "detect",
     "CharsetMatch",
     "CharsetMatches",
-    "CharsetNormalizerMatch",
-    "CharsetNormalizerMatches",
-    "CharsetDetector",
-    "CharsetDoctor",
     "__version__",
     "VERSION",
     "set_logging_handler",
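The removed imports and __all__ entries (normalize, CharsetNormalizerMatch, CharsetNormalizerMatches, CharsetDetector, CharsetDoctor) were deprecated 2.x aliases; what remains exported is from_bytes/from_fp/from_path plus the chardet-style detect. A minimal sketch of the surviving API (the sample payload is made up):

```python
# Minimal sketch of the API that charset_normalizer 3.x still exports.
from charset_normalizer import from_bytes

payload = "Всеки човек има право на образование.".encode("cp1251")  # made-up sample
best_guess = from_bytes(payload).best()
if best_guess is not None:
    print(best_guess.encoding)  # e.g. "cp1251"
    print(str(best_guess))      # the decoded text
```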
@@ -1,7 +1,5 @@
 import logging
-import warnings
 from os import PathLike
-from os.path import basename, splitext
 from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (

@@ -41,11 +39,12 @@ def from_bytes(
     cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
+    language_threshold: float = 0.1,
 ) -> CharsetMatches:
     """
     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
     If there is no results, it is a strong indicator that the source is binary/not text.
-    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
+    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
     And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
 
     The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
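The only signature change here is the new language_threshold keyword (default 0.1), which is threaded through to the coherence check further down. A hedged example of passing it explicitly (the input file name is hypothetical):

```python
# Hedged example of the language_threshold argument added in charset_normalizer 3.x.
from charset_normalizer import from_bytes

with open("some_file.txt", "rb") as fh:  # hypothetical input
    data = fh.read()

results = from_bytes(
    data,
    steps=5,
    chunk_size=512,
    threshold=0.20,
    language_threshold=0.1,  # minimum coherence ratio a language must reach to be kept
)
print(results.best())
```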
@@ -197,7 +196,14 @@ def from_bytes(
         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.log(
                 TRACE,
-                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                encoding_iana,
+            )
+            continue
+        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                 encoding_iana,
             )
             continue
@@ -297,7 +303,13 @@ def from_bytes(
             ):
                 md_chunks.append(chunk)
 
-                md_ratios.append(mess_ratio(chunk, threshold))
+                md_ratios.append(
+                    mess_ratio(
+                        chunk,
+                        threshold,
+                        explain is True and 1 <= len(cp_isolation) <= 2,
+                    )
+                )
 
                 if md_ratios[-1] >= threshold:
                     early_stop_count += 1
@@ -389,7 +401,9 @@ def from_bytes(
             if encoding_iana != "ascii":
                 for chunk in md_chunks:
                     chunk_languages = coherence_ratio(
-                        chunk, 0.1, ",".join(target_languages) if target_languages else None
+                        chunk,
+                        language_threshold,
+                        ",".join(target_languages) if target_languages else None,
                     )
 
                     cd_ratios.append(chunk_languages)
@ -491,6 +505,7 @@ def from_fp(
|
||||||
cp_exclusion: Optional[List[str]] = None,
|
cp_exclusion: Optional[List[str]] = None,
|
||||||
preemptive_behaviour: bool = True,
|
preemptive_behaviour: bool = True,
|
||||||
explain: bool = False,
|
explain: bool = False,
|
||||||
|
language_threshold: float = 0.1,
|
||||||
) -> CharsetMatches:
|
) -> CharsetMatches:
|
||||||
"""
|
"""
|
||||||
Same thing than the function from_bytes but using a file pointer that is already ready.
|
Same thing than the function from_bytes but using a file pointer that is already ready.
|
||||||
|
@ -505,6 +520,7 @@ def from_fp(
|
||||||
cp_exclusion,
|
cp_exclusion,
|
||||||
preemptive_behaviour,
|
preemptive_behaviour,
|
||||||
explain,
|
explain,
|
||||||
|
language_threshold,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -517,6 +533,7 @@ def from_path(
     cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
+    language_threshold: float = 0.1,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.

@@ -532,53 +549,5 @@ def from_path(
         cp_exclusion,
         preemptive_behaviour,
         explain,
+        language_threshold,
     )
-
-
-def normalize(
-    path: "PathLike[Any]",
-    steps: int = 5,
-    chunk_size: int = 512,
-    threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
-    preemptive_behaviour: bool = True,
-) -> CharsetMatch:
-    """
-    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
-    """
-    warnings.warn(
-        "normalize is deprecated and will be removed in 3.0",
-        DeprecationWarning,
-    )
-
-    results = from_path(
-        path,
-        steps,
-        chunk_size,
-        threshold,
-        cp_isolation,
-        cp_exclusion,
-        preemptive_behaviour,
-    )
-
-    filename = basename(path)
-    target_extensions = list(splitext(filename))
-
-    if len(results) == 0:
-        raise IOError(
-            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
-                filename
-            )
-        )
-
-    result = results.best()
-
-    target_extensions[0] += "-" + result.encoding  # type: ignore
-
-    with open(
-        "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
-    ) as fp:
-        fp.write(result.output())  # type: ignore
-
-    return result  # type: ignore
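The deprecated normalize() helper is deleted outright. Callers that depended on it can approximate the old behaviour with from_path() plus a manual write; a rough stand-in sketch (the output naming is an assumption, not a guaranteed match for the removed code):

```python
# Rough, hedged stand-in for the removed normalize() helper.
from os.path import splitext
from charset_normalizer import from_path

def normalize_to_utf8(path: str) -> str:
    best_guess = from_path(path).best()
    if best_guess is None:
        raise IOError(f'Unable to normalize "{path}", no encoding charset seems to fit.')

    stem, ext = splitext(path)
    target = f"{stem}-{best_guess.encoding}{ext}"  # e.g. notes-cp1252.txt (assumed naming)
    with open(target, "wb") as fp:
        fp.write(best_guess.output())  # output() re-encodes as UTF-8 by default
    return target
```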
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from typing import Dict, List
 
+# Language label that contain the em dash "—"
+# character are to be considered alternative seq to origin
 FREQUENCIES: Dict[str, List[str]] = {
     "English": [
         "e",

@@ -30,6 +32,34 @@ FREQUENCIES: Dict[str, List[str]] = {
         "z",
         "q",
     ],
+    "English—": [
+        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "m",
+        "u", "f", "p", "g", "w", "b", "y", "v", "k", "j", "x", "z", "q",
+    ],
     "German": [
         "e",
         "n",
@@ -226,33 +256,303 @@ FREQUENCIES: Dict[str, List[str]] = {
         "ж",
         "ц",
     ],
+    # Jap-Kanji
     "Japanese": [
+        "人", "一", "大", "亅", "丁", "丨", "竹", "笑", "口", "日", "今", "二", "彳",
+        "行", "十", "土", "丶", "寸", "寺", "時", "乙", "丿", "乂", "气", "気", "冂",
+        "巾", "亠", "市", "目", "儿", "見", "八", "小", "凵", "県", "月", "彐", "門",
+        "間", "木", "東", "山", "出", "本", "中", "刀", "分", "耳", "又", "取", "最",
+        "言", "田", "心", "思", "刂", "前", "京", "尹", "事", "生", "厶", "云", "会",
+        "未", "来", "白", "冫", "楽", "灬", "馬", "尸", "尺", "駅", "明", "耂", "者",
+        "了", "阝", "都", "高", "卜", "占", "厂", "广", "店", "子", "申", "奄", "亻",
+        "俺", "上", "方", "冖", "学", "衣", "艮", "食", "自",
+    ],
+    # Jap-Katakana
+    "Japanese—": [
+        "ー", "ン", "ス", "・", "ル", "ト", "リ", "イ", "ア", "ラ", "ッ", "ク", "ド",
+        "シ", "レ", "ジ", "タ", "フ", "ロ", "カ", "テ", "マ", "ィ", "グ", "バ", "ム",
+        "プ", "オ", "コ", "デ", "ニ", "ウ", "メ", "サ", "ビ", "ナ", "ブ", "ャ", "エ",
+        "ュ", "チ", "キ", "ズ", "ダ", "パ", "ミ", "ェ", "ョ", "ハ", "セ", "ベ", "ガ",
+        "モ", "ツ", "ネ", "ボ", "ソ", "ノ", "ァ", "ヴ", "ワ", "ポ", "ペ", "ピ", "ケ",
+        "ゴ", "ギ", "ザ", "ホ", "ゲ", "ォ", "ヤ", "ヒ", "ユ", "ヨ", "ヘ", "ゼ", "ヌ",
+        "ゥ", "ゾ", "ヶ", "ヂ", "ヲ", "ヅ", "ヵ", "ヱ", "ヰ", "ヮ", "ヽ", "゠", "ヾ",
+        "ヷ", "ヿ", "ヸ", "ヹ", "ヺ",
+    ],
+    # Jap-Hiragana
+    "Japanese——": [
         "の",
         "に",
         "る",
         "た",
-        "は",
-        "ー",
         "と",
+        "は",
         "し",
+        "い",
         "を",
         "で",
         "て",
         "が",
-        "い",
-        "ン",
-        "れ",
         "な",
-        "年",
-        "ス",
-        "っ",
-        "ル",
+        "れ",
         "か",
         "ら",
-        "あ",
         "さ",
-        "も",
+        "っ",
         "り",
+        "す", "あ", "も", "こ", "ま", "う", "く", "よ", "き", "ん", "め", "お", "け",
+        "そ", "つ", "だ", "や", "え", "ど", "わ", "ち", "み", "せ", "じ", "ば", "へ",
+        "び", "ず", "ろ", "ほ", "げ", "む", "べ", "ひ", "ょ", "ゆ", "ぶ", "ご", "ゃ",
+        "ね", "ふ", "ぐ", "ぎ", "ぼ", "ゅ", "づ", "ざ", "ぞ", "ぬ", "ぜ", "ぱ", "ぽ",
+        "ぷ", "ぴ", "ぃ", "ぁ", "ぇ", "ぺ", "ゞ", "ぢ", "ぉ", "ぅ", "ゐ", "ゝ", "ゑ",
+        "゛", "゜", "ゎ", "ゔ", "゚", "ゟ", "゙", "ゕ", "ゖ",
     ],
     "Portuguese": [
         "a",
@@ -340,6 +640,77 @@ FREQUENCIES: Dict[str, List[str]] = {
         "就",
         "出",
         "会",
+        "可", "也", "你", "对", "生", "能", "而", "子", "那", "得", "于", "着", "下",
+        "自", "之", "年", "过", "发", "后", "作", "里", "用", "道", "行", "所", "然",
+        "家", "种", "事", "成", "方", "多", "经", "么", "去", "法", "学", "如", "都",
+        "同", "现", "当", "没", "动", "面", "起", "看", "定", "天", "分", "还", "进",
+        "好", "小", "部", "其", "些", "主", "样", "理", "心", "她", "本", "前", "开",
+        "但", "因", "只", "从", "想", "实",
     ],
     "Ukrainian": [
         "о",
@@ -956,34 +1327,6 @@ FREQUENCIES: Dict[str, List[str]] = {
         "ö",
         "y",
     ],
-    "Simple English": [
-        "e", "a", "t", "i", "o", "n", "s", "r", "h", "l", "d", "c", "m",
-        "u", "f", "p", "g", "w", "b", "y", "v", "k", "j", "x", "z", "q",
-    ],
     "Thai": [
         "า",
         "น",

@@ -1066,31 +1409,6 @@ FREQUENCIES: Dict[str, List[str]] = {
         "ஒ",
         "ஸ",
     ],
-    "Classical Chinese": [
-        "之", "年", "為", "也", "以", "一", "人", "其", "者", "國", "有", "二",
-        "十", "於", "曰", "三", "不", "大", "而", "子", "中", "五", "四",
-    ],
     "Kazakh": [
         "а",
         "ы",
@@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]:
     ):
         return ["Japanese"]
     if iana_name.startswith("gb") or iana_name in ZH_NAMES:
-        return ["Chinese", "Classical Chinese"]
+        return ["Chinese"]
     if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
         return ["Korean"]
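With the "Classical Chinese" frequency table gone, GB-family code pages now map to a single language. A hedged check against the internal helper (it is not part of the public API and may change):

```python
# Hedged illustration of the narrowed language mapping.
from charset_normalizer.cd import mb_encoding_languages

print(mb_encoding_languages("gb2312"))  # ["Chinese"] instead of ["Chinese", "Classical Chinese"]
print(mb_encoding_languages("cp932"))   # still ["Japanese"]
```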
@@ -179,22 +179,45 @@ def characters_popularity_compare(
     character_approved_count: int = 0
     FREQUENCIES_language_set = set(FREQUENCIES[language])
 
-    for character in ordered_characters:
+    ordered_characters_count: int = len(ordered_characters)
+    target_language_characters_count: int = len(FREQUENCIES[language])
+
+    large_alphabet: bool = target_language_characters_count > 26
+
+    for character, character_rank in zip(
+        ordered_characters, range(0, ordered_characters_count)
+    ):
         if character not in FREQUENCIES_language_set:
             continue
 
+        character_rank_in_language: int = FREQUENCIES[language].index(character)
+        expected_projection_ratio: float = (
+            target_language_characters_count / ordered_characters_count
+        )
+        character_rank_projection: int = int(character_rank * expected_projection_ratio)
+
+        if (
+            large_alphabet is False
+            and abs(character_rank_projection - character_rank_in_language) > 4
+        ):
+            continue
+
+        if (
+            large_alphabet is True
+            and abs(character_rank_projection - character_rank_in_language)
+            < target_language_characters_count / 3
+        ):
+            character_approved_count += 1
+            continue
+
         characters_before_source: List[str] = FREQUENCIES[language][
-            0 : FREQUENCIES[language].index(character)
+            0:character_rank_in_language
         ]
         characters_after_source: List[str] = FREQUENCIES[language][
-            FREQUENCIES[language].index(character) :
+            character_rank_in_language:
         ]
-        characters_before: List[str] = ordered_characters[
-            0 : ordered_characters.index(character)
-        ]
-        characters_after: List[str] = ordered_characters[
-            ordered_characters.index(character) :
-        ]
+        characters_before: List[str] = ordered_characters[0:character_rank]
+        characters_after: List[str] = ordered_characters[character_rank:]
 
         before_match_count: int = len(
             set(characters_before) & set(characters_before_source)
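The rewritten loop swaps the plain membership walk for a rank-projection gate: each observed character's rank is projected onto the language table, and for small alphabets only characters whose projected rank lands close to their table rank are scored in detail (large alphabets are approved outright when the projection is near enough). A toy sketch of the projection arithmetic, with made-up values:

```python
# Toy sketch of the rank-projection gate (all values are made up).
ordered_characters = ["e", "t", "a", "o", "n"]        # ranks observed in the analysed text
language_table = ["e", "a", "t", "i", "o", "n", "s"]  # FREQUENCIES-style table for the language

expected_projection_ratio = len(language_table) / len(ordered_characters)  # 7 / 5 = 1.4

for character_rank, character in enumerate(ordered_characters):
    if character not in language_table:
        continue
    rank_in_language = language_table.index(character)
    rank_projection = int(character_rank * expected_projection_ratio)
    # Small alphabet (26 symbols or fewer): skip characters whose projection is far off.
    if abs(rank_projection - rank_in_language) > 4:
        continue
    print(character, character_rank, rank_projection, rank_in_language)
```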
@@ -289,6 +312,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
     return sorted(merge, key=lambda x: x[1], reverse=True)
 
 
+def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+    """
+    We shall NOT return "English—" in CoherenceMatches because it is an alternative
+    of "English". This function only keeps the best match and remove the em-dash in it.
+    """
+    index_results: Dict[str, List[float]] = dict()
+
+    for result in results:
+        language, ratio = result
+        no_em_name: str = language.replace("—", "")
+
+        if no_em_name not in index_results:
+            index_results[no_em_name] = []
+
+        index_results[no_em_name].append(ratio)
+
+    if any(len(index_results[e]) > 1 for e in index_results):
+        filtered_results: CoherenceMatches = []
+
+        for language in index_results:
+            filtered_results.append((language, max(index_results[language])))
+
+        return filtered_results
+
+    return results
+
+
 @lru_cache(maxsize=2048)
 def coherence_ratio(
     decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None

@@ -336,4 +386,6 @@ def coherence_ratio(
         if sufficient_match_count >= 3:
             break
 
-    return sorted(results, key=lambda x: x[1], reverse=True)
+    return sorted(
+        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
+    )
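The new filter_alt_coherence_matches() folds the em-dash "alternative" tables (for example "English—") back into their base language before coherence results are returned, keeping only the best ratio. A hedged illustration (the ratios are invented):

```python
# Hedged illustration of folding alternative language labels into the base name.
from charset_normalizer.cd import filter_alt_coherence_matches

matches = [("English", 0.42), ("English—", 0.55), ("German", 0.10)]
print(filter_alt_coherence_matches(matches))
# roughly: [("English", 0.55), ("German", 0.10)]; the best ratio wins and the suffix is dropped
```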
@@ -1,15 +1,12 @@
 import argparse
 import sys
 from json import dumps
-from os.path import abspath
+from os.path import abspath, basename, dirname, join, realpath
 from platform import python_version
 from typing import List, Optional
+from unicodedata import unidata_version
 
-try:
-    from unicodedata2 import unidata_version
-except ImportError:
-    from unicodedata import unidata_version
-
+import charset_normalizer.md as md_module
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
 from charset_normalizer.version import __version__

@@ -124,8 +121,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
     parser.add_argument(
         "--version",
         action="version",
-        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
-            __version__, python_version(), unidata_version
+        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
+            __version__,
+            python_version(),
+            unidata_version,
+            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
         ),
         help="Show version information and exit.",
     )
@@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                 my_file.close()
                 continue
 
-            o_: List[str] = my_file.name.split(".")
+            dir_path = dirname(realpath(my_file.name))
+            file_name = basename(realpath(my_file.name))
+
+            o_: List[str] = file_name.split(".")
 
             if args.replace is False:
                 o_.insert(-1, best_guess.encoding)

@@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                     continue
 
                 try:
-                    x_[0].unicode_path = abspath("./{}".format(".".join(o_)))
+                    x_[0].unicode_path = join(dir_path, ".".join(o_))
 
                     with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
                         fp.write(str(best_guess))
@@ -489,9 +489,7 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = {
 KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
 ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
 
-NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
-
 LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
 
-# Logging LEVEL bellow DEBUG
+# Logging LEVEL below DEBUG
 TRACE: int = 5
@@ -1,9 +1,7 @@
-import warnings
 from typing import Dict, Optional, Union
 
-from .api import from_bytes, from_fp, from_path, normalize
+from .api import from_bytes
 from .constant import CHARDET_CORRESPONDENCE
-from .models import CharsetMatch, CharsetMatches
 
 
 def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:

@@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
         "language": language,
         "confidence": confidence,
     }
-
-
-class CharsetNormalizerMatch(CharsetMatch):
-    pass
-
-
-class CharsetNormalizerMatches(CharsetMatches):
-    @staticmethod
-    def from_fp(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_fp(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def from_bytes(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_bytes(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def from_path(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_path(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def normalize(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return normalize(*args, **kwargs)  # pragma: nocover
-
-
-class CharsetDetector(CharsetNormalizerMatches):
-    pass
-
-
-class CharsetDoctor(CharsetNormalizerMatches):
-    pass
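legacy.py now keeps only the chardet-compatible detect() shim; the removed CharsetNormalizerMatch/CharsetNormalizerMatches/CharsetDetector/CharsetDoctor classes were thin deprecated wrappers around the main API. A small sketch of the entry point that stays (the sample bytes are made up):

```python
# Minimal sketch of the chardet-style shim that survives in legacy.py.
from charset_normalizer import detect

result = detect("Übergröße und Straße".encode("cp1252"))  # made-up sample
print(result)  # a dict with "encoding", "language" and "confidence" keys
```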
@@ -1,7 +1,12 @@
 from functools import lru_cache
+from logging import getLogger
 from typing import List, Optional
 
-from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
+from .constant import (
+    COMMON_SAFE_ASCII_CHARACTERS,
+    TRACE,
+    UNICODE_SECONDARY_RANGE_KEYWORD,
+)
 from .utils import (
     is_accentuated,
     is_ascii,

@@ -123,7 +128,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin):
 
     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count == 0 or self._character_count < 8:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

@@ -547,7 +552,20 @@ def mess_ratio(
             break
 
     if debug:
+        logger = getLogger("charset_normalizer")
+
+        logger.log(
+            TRACE,
+            "Mess-detector extended-analysis start. "
+            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
+            f"maximum_threshold={maximum_threshold}",
+        )
+
+        if len(decoded_sequence) > 16:
+            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
+            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
+
         for dt in detectors:  # pragma: nocover
-            print(dt.__class__, dt.ratio)
+            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
 
     return round(mean_mess_ratio, 3)
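The mess detector now reports its per-plugin ratios through the TRACE logger instead of print(), so the extra detail shows up alongside the existing explain output. A hedged way to surface it (TRACE is the numeric level 5 defined in constant.py):

```python
# Hedged example: surfacing the TRACE-level mess-detector output.
from charset_normalizer import from_bytes
from charset_normalizer.utils import set_logging_handler

set_logging_handler(level=5)  # TRACE sits below DEBUG
from_bytes("garbled £$ sample".encode("utf-8"), explain=True)
```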
@@ -1,22 +1,9 @@
-import warnings
-from collections import Counter
 from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
-from re import sub
-from typing import (
-    Any,
-    Counter as TypeCounter,
-    Dict,
-    Iterator,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
-from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
-from .md import mess_ratio
+from .constant import TOO_BIG_SEQUENCE
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
 
 

@@ -65,7 +52,7 @@ class CharsetMatch:
         chaos_difference: float = abs(self.chaos - other.chaos)
         coherence_difference: float = abs(self.coherence - other.coherence)
 
-        # Bellow 1% difference --> Use Coherence
+        # Below 1% difference --> Use Coherence
         if chaos_difference < 0.01 and coherence_difference > 0.02:
             # When having a tough decision, use the result that decoded as many multi-byte as possible.
             if chaos_difference == 0.0 and self.coherence == other.coherence:

@@ -78,45 +65,6 @@ class CharsetMatch:
     def multi_byte_usage(self) -> float:
         return 1.0 - len(str(self)) / len(self.raw)
 
-    @property
-    def chaos_secondary_pass(self) -> float:
-        """
-        Check once again chaos in decoded text, except this time, with full content.
-        Use with caution, this can be very slow.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "chaos_secondary_pass is deprecated and will be removed in 3.0",
-            DeprecationWarning,
-        )
-        return mess_ratio(str(self), 1.0)
-
-    @property
-    def coherence_non_latin(self) -> float:
-        """
-        Coherence ratio on the first non-latin language detected if ANY.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "coherence_non_latin is deprecated and will be removed in 3.0",
-            DeprecationWarning,
-        )
-        return 0.0
-
-    @property
-    def w_counter(self) -> TypeCounter[str]:
-        """
-        Word counter instance on decoded text.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
-        )
-
-        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
-
-        return Counter(string_printable_only.split())
-
     def __str__(self) -> str:
         # Lazy Str Loading
         if self._string is None:

@@ -252,18 +200,6 @@ class CharsetMatch:
         """
         return [self._encoding] + [m.encoding for m in self._leaves]
 
-    def first(self) -> "CharsetMatch":
-        """
-        Kept for BC reasons. Will be removed in 3.0.
-        """
-        return self
-
-    def best(self) -> "CharsetMatch":
-        """
-        Kept for BC reasons. Will be removed in 3.0.
-        """
-        return self
-
     def output(self, encoding: str = "utf_8") -> bytes:
         """
         Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
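Only per-match conveniences disappear here: CharsetMatch.first()/best() (which just returned self) and the deprecated chaos_secondary_pass/coherence_non_latin/w_counter properties. The container-level CharsetMatches.best(), which is not touched by this diff, still drives the usual pattern:

```python
# The container-level best() is unaffected by these removals.
from charset_normalizer import from_bytes

matches = from_bytes(b"hello world")
best_guess = matches.best()           # still a CharsetMatch or None
if best_guess is not None:
    print(best_guess.encoding, best_guess.output()[:20])
```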
@@ -1,12 +1,6 @@
-try:
-    # WARNING: unicodedata2 support is going to be removed in 3.0
-    # Python is quickly catching up.
-    import unicodedata2 as unicodedata
-except ImportError:
-    import unicodedata  # type: ignore[no-redef]
-
 import importlib
 import logging
+import unicodedata
 from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache

@@ -402,7 +396,7 @@ def cut_sequence_chunks(
 
         # multi-byte bad cutting detector and adjustment
         # not the cleanest way to perform that fix but clever enough for now.
-        if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
+        if is_multi_byte_decoder and i > 0:
 
             chunk_partial_size_chk: int = min(chunk_size, 16)
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.1.1"
+__version__ = "3.0.1"
 VERSION = __version__.split(".")
@@ -80,8 +80,8 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver
     elif charset_normalizer_version:
         major, minor, patch = charset_normalizer_version.split(".")[:3]
         major, minor, patch = int(major), int(minor), int(patch)
-        # charset_normalizer >= 2.0.0 < 3.0.0
-        assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0)
+        # charset_normalizer >= 2.0.0 < 4.0.0
+        assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)
     else:
         raise Exception("You need either charset_normalizer or chardet installed")
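requests' compatibility assert simply widens the accepted charset_normalizer range to anything below 4.0.0; the check is a plain tuple comparison:

```python
# Sketch of the widened version gate, mirroring check_compatibility().
charset_normalizer_version = "3.0.1"
major, minor, patch = (int(x) for x in charset_normalizer_version.split(".")[:3])
assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)  # 3.x is now accepted
```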
@@ -5,10 +5,10 @@
 __title__ = "requests"
 __description__ = "Python HTTP for Humans."
 __url__ = "https://requests.readthedocs.io"
-__version__ = "2.28.1"
-__build__ = 0x022801
+__version__ = "2.28.2"
+__build__ = 0x022802
 __author__ = "Kenneth Reitz"
 __author_email__ = "me@kennethreitz.org"
 __license__ = "Apache 2.0"
-__copyright__ = "Copyright 2022 Kenneth Reitz"
+__copyright__ = "Copyright Kenneth Reitz"
 __cake__ = "\u2728 \U0001f370 \u2728"
@@ -438,7 +438,7 @@ class PreparedRequest(RequestEncodingMixin, RequestHooksMixin):
         if not scheme:
             raise MissingSchema(
                 f"Invalid URL {url!r}: No scheme supplied. "
-                f"Perhaps you meant http://{url}?"
+                f"Perhaps you meant https://{url}?"
             )
 
         if not host:
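Only the hint text changes here (http:// becomes https://); the exception type and trigger are unchanged. A hedged reproduction:

```python
# Hedged reproduction of the updated MissingSchema hint.
import requests
from requests.exceptions import MissingSchema

try:
    requests.get("example.com")  # scheme deliberately omitted
except MissingSchema as exc:
    print(exc)  # "... Perhaps you meant https://example.com?"
```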
@@ -1,2 +1,2 @@
 # This file is protected via CODEOWNERS
-__version__ = "1.26.13"
+__version__ = "1.26.14"
@@ -224,7 +224,7 @@ class AppEngineManager(RequestMethods):
         )
 
         # Check if we should retry the HTTP response.
-        has_retry_after = bool(http_response.getheader("Retry-After"))
+        has_retry_after = bool(http_response.headers.get("Retry-After"))
         if retries.is_retry(method, http_response.status, has_retry_after):
             retries = retries.increment(method, url, response=http_response, _pool=self)
             log.debug("Retry: %s", url)
@@ -69,7 +69,7 @@ class NTLMConnectionPool(HTTPSConnectionPool):
         log.debug("Request headers: %s", headers)
         conn.request("GET", self.authurl, None, headers)
         res = conn.getresponse()
-        reshdr = dict(res.getheaders())
+        reshdr = dict(res.headers)
         log.debug("Response status: %s %s", res.status, res.reason)
         log.debug("Response headers: %s", reshdr)
         log.debug("Response data: %s [...]", res.read(100))

@@ -101,7 +101,7 @@ class NTLMConnectionPool(HTTPSConnectionPool):
         conn.request("GET", self.authurl, None, headers)
         res = conn.getresponse()
         log.debug("Response status: %s %s", res.status, res.reason)
-        log.debug("Response headers: %s", dict(res.getheaders()))
+        log.debug("Response headers: %s", dict(res.headers))
         log.debug("Response data: %s [...]", res.read()[:100])
         if res.status != 200:
             if res.status == 401:
@@ -666,7 +666,7 @@ class HTTPResponse(io.IOBase):
     def getheaders(self):
         warnings.warn(
             "HTTPResponse.getheaders() is deprecated and will be removed "
-            "in urllib3 v2.1.0. Instead access HTTResponse.headers directly.",
+            "in urllib3 v2.1.0. Instead access HTTPResponse.headers directly.",
             category=DeprecationWarning,
             stacklevel=2,
         )

@@ -675,7 +675,7 @@ class HTTPResponse(io.IOBase):
     def getheader(self, name, default=None):
         warnings.warn(
             "HTTPResponse.getheader() is deprecated and will be removed "
-            "in urllib3 v2.1.0. Instead use HTTResponse.headers.get(name, default).",
+            "in urllib3 v2.1.0. Instead use HTTPResponse.headers.get(name, default).",
             category=DeprecationWarning,
             stacklevel=2,
         )
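Both the contrib modules and the deprecation messages now point at HTTPResponse.headers. A hedged example of the preferred access pattern (the URL is illustrative):

```python
# Preferred header access on urllib3 responses; getheader()/getheaders() emit DeprecationWarning.
import urllib3

http = urllib3.PoolManager()
resp = http.request("GET", "https://www.example.com")  # illustrative URL
retry_after = resp.headers.get("Retry-After")           # replaces resp.getheader("Retry-After")
all_headers = dict(resp.headers)                         # replaces dict(res.getheaders())
print(retry_after, len(all_headers))
```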
@@ -63,7 +63,7 @@ IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
 BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
 ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
 
-_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*([0-9]{0,5}))?$") % (
+_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
     REG_NAME_PAT,
     IPV4_PAT,
     IPV6_ADDRZ_PAT,
@@ -36,7 +36,7 @@ pyparsing==3.0.9
 python-dateutil==2.8.2
 python-twitter==3.5
 pytz==2022.7
-requests==2.28.1
+requests==2.28.2
 requests-oauthlib==1.3.1
 rumps==0.4.0; platform_system == "Darwin"
 simplejson==3.18.0