diff --git a/lib/charset_normalizer/__init__.py b/lib/charset_normalizer/__init__.py index 2dcaf56f..ebb5da89 100644 --- a/lib/charset_normalizer/__init__.py +++ b/lib/charset_normalizer/__init__.py @@ -21,14 +21,8 @@ at . """ import logging -from .api import from_bytes, from_fp, from_path, normalize -from .legacy import ( - CharsetDetector, - CharsetDoctor, - CharsetNormalizerMatch, - CharsetNormalizerMatches, - detect, -) +from .api import from_bytes, from_fp, from_path +from .legacy import detect from .models import CharsetMatch, CharsetMatches from .utils import set_logging_handler from .version import VERSION, __version__ @@ -37,14 +31,9 @@ __all__ = ( "from_fp", "from_path", "from_bytes", - "normalize", "detect", "CharsetMatch", "CharsetMatches", - "CharsetNormalizerMatch", - "CharsetNormalizerMatches", - "CharsetDetector", - "CharsetDoctor", "__version__", "VERSION", "set_logging_handler", diff --git a/lib/charset_normalizer/api.py b/lib/charset_normalizer/api.py index 72907f94..6c7e8983 100644 --- a/lib/charset_normalizer/api.py +++ b/lib/charset_normalizer/api.py @@ -1,7 +1,5 @@ import logging -import warnings from os import PathLike -from os.path import basename, splitext from typing import Any, BinaryIO, List, Optional, Set from .cd import ( @@ -41,11 +39,12 @@ def from_bytes( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. If there is no results, it is a strong indicator that the source is binary/not text. - By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence. + By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence. And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will. The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page @@ -197,7 +196,14 @@ def from_bytes( if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available: logger.log( TRACE, - "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", + "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.", + encoding_iana, + ) + continue + if encoding_iana in {"utf_7"} and not bom_or_sig_available: + logger.log( + TRACE, + "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.", encoding_iana, ) continue @@ -297,7 +303,13 @@ def from_bytes( ): md_chunks.append(chunk) - md_ratios.append(mess_ratio(chunk, threshold)) + md_ratios.append( + mess_ratio( + chunk, + threshold, + explain is True and 1 <= len(cp_isolation) <= 2, + ) + ) if md_ratios[-1] >= threshold: early_stop_count += 1 @@ -389,7 +401,9 @@ def from_bytes( if encoding_iana != "ascii": for chunk in md_chunks: chunk_languages = coherence_ratio( - chunk, 0.1, ",".join(target_languages) if target_languages else None + chunk, + language_threshold, + ",".join(target_languages) if target_languages else None, ) cd_ratios.append(chunk_languages) @@ -491,6 +505,7 @@ def from_fp( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but using a file pointer that is already ready. 
@@ -505,6 +520,7 @@ def from_fp( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) @@ -517,6 +533,7 @@ def from_path( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. @@ -532,53 +549,5 @@ def from_path( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) - - -def normalize( - path: "PathLike[Any]", - steps: int = 5, - chunk_size: int = 512, - threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, - preemptive_behaviour: bool = True, -) -> CharsetMatch: - """ - Take a (text-based) file path and try to create another file next to it, this time using UTF-8. - """ - warnings.warn( - "normalize is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - - results = from_path( - path, - steps, - chunk_size, - threshold, - cp_isolation, - cp_exclusion, - preemptive_behaviour, - ) - - filename = basename(path) - target_extensions = list(splitext(filename)) - - if len(results) == 0: - raise IOError( - 'Unable to normalize "{}", no encoding charset seems to fit.'.format( - filename - ) - ) - - result = results.best() - - target_extensions[0] += "-" + result.encoding # type: ignore - - with open( - "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb" - ) as fp: - fp.write(result.output()) # type: ignore - - return result # type: ignore diff --git a/lib/charset_normalizer/assets/__init__.py b/lib/charset_normalizer/assets/__init__.py index 3c33ba30..9075930d 100644 --- a/lib/charset_normalizer/assets/__init__.py +++ b/lib/charset_normalizer/assets/__init__.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from typing import Dict, List +# Language label that contain the em dash "—" +# character are to be considered alternative seq to origin FREQUENCIES: Dict[str, List[str]] = { "English": [ "e", @@ -30,6 +32,34 @@ FREQUENCIES: Dict[str, List[str]] = { "z", "q", ], + "English—": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], "German": [ "e", "n", @@ -226,33 +256,303 @@ FREQUENCIES: Dict[str, List[str]] = { "ж", "ц", ], + # Jap-Kanji "Japanese": [ + "人", + "一", + "大", + "亅", + "丁", + "丨", + "竹", + "笑", + "口", + "日", + "今", + "二", + "彳", + "行", + "十", + "土", + "丶", + "寸", + "寺", + "時", + "乙", + "丿", + "乂", + "气", + "気", + "冂", + "巾", + "亠", + "市", + "目", + "儿", + "見", + "八", + "小", + "凵", + "県", + "月", + "彐", + "門", + "間", + "木", + "東", + "山", + "出", + "本", + "中", + "刀", + "分", + "耳", + "又", + "取", + "最", + "言", + "田", + "心", + "思", + "刂", + "前", + "京", + "尹", + "事", + "生", + "厶", + "云", + "会", + "未", + "来", + "白", + "冫", + "楽", + "灬", + "馬", + "尸", + "尺", + "駅", + "明", + "耂", + "者", + "了", + "阝", + "都", + "高", + "卜", + "占", + "厂", + "广", + "店", + "子", + "申", + "奄", + "亻", + "俺", + "上", + "方", + "冖", + "学", + "衣", + "艮", + "食", + "自", + ], + # Jap-Katakana + "Japanese—": [ + "ー", + "ン", + "ス", + "・", + "ル", + "ト", + "リ", + "イ", + "ア", + "ラ", + "ッ", + "ク", + "ド", + "シ", + "レ", + "ジ", + "タ", + "フ", + "ロ", + "カ", + "テ", + "マ", + "ィ", + "グ", + "バ", + "ム", + "プ", + "オ", + "コ", + "デ", + "ニ", + "ウ", + "メ", + "サ", + "ビ", + "ナ", + "ブ", + "ャ", + "エ", + "ュ", + "チ", + "キ", + "ズ", + "ダ", + "パ", + "ミ", + "ェ", + "ョ", + "ハ", + "セ", + "ベ", + 
"ガ", + "モ", + "ツ", + "ネ", + "ボ", + "ソ", + "ノ", + "ァ", + "ヴ", + "ワ", + "ポ", + "ペ", + "ピ", + "ケ", + "ゴ", + "ギ", + "ザ", + "ホ", + "ゲ", + "ォ", + "ヤ", + "ヒ", + "ユ", + "ヨ", + "ヘ", + "ゼ", + "ヌ", + "ゥ", + "ゾ", + "ヶ", + "ヂ", + "ヲ", + "ヅ", + "ヵ", + "ヱ", + "ヰ", + "ヮ", + "ヽ", + "゠", + "ヾ", + "ヷ", + "ヿ", + "ヸ", + "ヹ", + "ヺ", + ], + # Jap-Hiragana + "Japanese——": [ "の", "に", "る", "た", - "は", - "ー", "と", + "は", "し", + "い", "を", "で", "て", "が", - "い", - "ン", - "れ", "な", - "年", - "ス", - "っ", - "ル", + "れ", "か", "ら", - "あ", "さ", - "も", + "っ", "り", + "す", + "あ", + "も", + "こ", + "ま", + "う", + "く", + "よ", + "き", + "ん", + "め", + "お", + "け", + "そ", + "つ", + "だ", + "や", + "え", + "ど", + "わ", + "ち", + "み", + "せ", + "じ", + "ば", + "へ", + "び", + "ず", + "ろ", + "ほ", + "げ", + "む", + "べ", + "ひ", + "ょ", + "ゆ", + "ぶ", + "ご", + "ゃ", + "ね", + "ふ", + "ぐ", + "ぎ", + "ぼ", + "ゅ", + "づ", + "ざ", + "ぞ", + "ぬ", + "ぜ", + "ぱ", + "ぽ", + "ぷ", + "ぴ", + "ぃ", + "ぁ", + "ぇ", + "ぺ", + "ゞ", + "ぢ", + "ぉ", + "ぅ", + "ゐ", + "ゝ", + "ゑ", + "゛", + "゜", + "ゎ", + "ゔ", + "゚", + "ゟ", + "゙", + "ゕ", + "ゖ", ], "Portuguese": [ "a", @@ -340,6 +640,77 @@ FREQUENCIES: Dict[str, List[str]] = { "就", "出", "会", + "可", + "也", + "你", + "对", + "生", + "能", + "而", + "子", + "那", + "得", + "于", + "着", + "下", + "自", + "之", + "年", + "过", + "发", + "后", + "作", + "里", + "用", + "道", + "行", + "所", + "然", + "家", + "种", + "事", + "成", + "方", + "多", + "经", + "么", + "去", + "法", + "学", + "如", + "都", + "同", + "现", + "当", + "没", + "动", + "面", + "起", + "看", + "定", + "天", + "分", + "还", + "进", + "好", + "小", + "部", + "其", + "些", + "主", + "样", + "理", + "心", + "她", + "本", + "前", + "开", + "但", + "因", + "只", + "从", + "想", + "实", ], "Ukrainian": [ "о", @@ -956,34 +1327,6 @@ FREQUENCIES: Dict[str, List[str]] = { "ö", "y", ], - "Simple English": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], "Thai": [ "า", "น", @@ -1066,31 +1409,6 @@ FREQUENCIES: Dict[str, List[str]] = { "ஒ", "ஸ", ], - "Classical Chinese": [ - "之", - "年", - "為", - "也", - "以", - "一", - "人", - "其", - "者", - "國", - "有", - "二", - "十", - "於", - "曰", - "三", - "不", - "大", - "而", - "子", - "中", - "五", - "四", - ], "Kazakh": [ "а", "ы", diff --git a/lib/charset_normalizer/cd.py b/lib/charset_normalizer/cd.py index ee4b7424..ae2813fb 100644 --- a/lib/charset_normalizer/cd.py +++ b/lib/charset_normalizer/cd.py @@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]: ): return ["Japanese"] if iana_name.startswith("gb") or iana_name in ZH_NAMES: - return ["Chinese", "Classical Chinese"] + return ["Chinese"] if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: return ["Korean"] @@ -179,22 +179,45 @@ def characters_popularity_compare( character_approved_count: int = 0 FREQUENCIES_language_set = set(FREQUENCIES[language]) - for character in ordered_characters: + ordered_characters_count: int = len(ordered_characters) + target_language_characters_count: int = len(FREQUENCIES[language]) + + large_alphabet: bool = target_language_characters_count > 26 + + for character, character_rank in zip( + ordered_characters, range(0, ordered_characters_count) + ): if character not in FREQUENCIES_language_set: continue + character_rank_in_language: int = FREQUENCIES[language].index(character) + expected_projection_ratio: float = ( + target_language_characters_count / ordered_characters_count + ) + character_rank_projection: int = int(character_rank * expected_projection_ratio) + + if ( + 
large_alphabet is False + and abs(character_rank_projection - character_rank_in_language) > 4 + ): + continue + + if ( + large_alphabet is True + and abs(character_rank_projection - character_rank_in_language) + < target_language_characters_count / 3 + ): + character_approved_count += 1 + continue + characters_before_source: List[str] = FREQUENCIES[language][ - 0 : FREQUENCIES[language].index(character) + 0:character_rank_in_language ] characters_after_source: List[str] = FREQUENCIES[language][ - FREQUENCIES[language].index(character) : - ] - characters_before: List[str] = ordered_characters[ - 0 : ordered_characters.index(character) - ] - characters_after: List[str] = ordered_characters[ - ordered_characters.index(character) : + character_rank_in_language: ] + characters_before: List[str] = ordered_characters[0:character_rank] + characters_after: List[str] = ordered_characters[character_rank:] before_match_count: int = len( set(characters_before) & set(characters_before_source) @@ -289,6 +312,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: return sorted(merge, key=lambda x: x[1], reverse=True) +def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: + """ + We shall NOT return "English—" in CoherenceMatches because it is an alternative + of "English". This function only keeps the best match and remove the em-dash in it. + """ + index_results: Dict[str, List[float]] = dict() + + for result in results: + language, ratio = result + no_em_name: str = language.replace("—", "") + + if no_em_name not in index_results: + index_results[no_em_name] = [] + + index_results[no_em_name].append(ratio) + + if any(len(index_results[e]) > 1 for e in index_results): + filtered_results: CoherenceMatches = [] + + for language in index_results: + filtered_results.append((language, max(index_results[language]))) + + return filtered_results + + return results + + @lru_cache(maxsize=2048) def coherence_ratio( decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None @@ -336,4 +386,6 @@ def coherence_ratio( if sufficient_match_count >= 3: break - return sorted(results, key=lambda x: x[1], reverse=True) + return sorted( + filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True + ) diff --git a/lib/charset_normalizer/cli/normalizer.py b/lib/charset_normalizer/cli/normalizer.py index b8b652a5..ad26b4d0 100644 --- a/lib/charset_normalizer/cli/normalizer.py +++ b/lib/charset_normalizer/cli/normalizer.py @@ -1,15 +1,12 @@ import argparse import sys from json import dumps -from os.path import abspath +from os.path import abspath, basename, dirname, join, realpath from platform import python_version from typing import List, Optional +from unicodedata import unidata_version -try: - from unicodedata2 import unidata_version -except ImportError: - from unicodedata import unidata_version - +import charset_normalizer.md as md_module from charset_normalizer import from_fp from charset_normalizer.models import CliDetectionResult from charset_normalizer.version import __version__ @@ -124,8 +121,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: parser.add_argument( "--version", action="version", - version="Charset-Normalizer {} - Python {} - Unicode {}".format( - __version__, python_version(), unidata_version + version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format( + __version__, + python_version(), + unidata_version, + "OFF" if md_module.__file__.lower().endswith(".py") else "ON", ), 
help="Show version information and exit.", ) @@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: my_file.close() continue - o_: List[str] = my_file.name.split(".") + dir_path = dirname(realpath(my_file.name)) + file_name = basename(realpath(my_file.name)) + + o_: List[str] = file_name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) @@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: continue try: - x_[0].unicode_path = abspath("./{}".format(".".join(o_))) + x_[0].unicode_path = join(dir_path, ".".join(o_)) with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: fp.write(str(best_guess)) diff --git a/lib/charset_normalizer/constant.py b/lib/charset_normalizer/constant.py index ac840c46..3188108d 100644 --- a/lib/charset_normalizer/constant.py +++ b/lib/charset_normalizer/constant.py @@ -489,9 +489,7 @@ COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} -NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+") - LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) -# Logging LEVEL bellow DEBUG +# Logging LEVEL below DEBUG TRACE: int = 5 diff --git a/lib/charset_normalizer/legacy.py b/lib/charset_normalizer/legacy.py index cdebe2b8..b266d176 100644 --- a/lib/charset_normalizer/legacy.py +++ b/lib/charset_normalizer/legacy.py @@ -1,9 +1,7 @@ -import warnings from typing import Dict, Optional, Union -from .api import from_bytes, from_fp, from_path, normalize +from .api import from_bytes from .constant import CHARDET_CORRESPONDENCE -from .models import CharsetMatch, CharsetMatches def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: @@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: "language": language, "confidence": confidence, } - - -class CharsetNormalizerMatch(CharsetMatch): - pass - - -class CharsetNormalizerMatches(CharsetMatches): - @staticmethod - def from_fp(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_fp(*args, **kwargs) # pragma: nocover - - @staticmethod - def from_bytes(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_bytes(*args, **kwargs) # pragma: nocover - - @staticmethod - def from_path(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_path(*args, **kwargs) # pragma: nocover - - @staticmethod - def normalize(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return normalize(*args, **kwargs) # pragma: nocover - - -class CharsetDetector(CharsetNormalizerMatches): - pass - - -class CharsetDoctor(CharsetNormalizerMatches): - pass diff --git a/lib/charset_normalizer/md.py b/lib/charset_normalizer/md.py index 31808af8..56e9321a 100644 --- a/lib/charset_normalizer/md.py +++ b/lib/charset_normalizer/md.py @@ -1,7 +1,12 @@ from functools import lru_cache +from logging import 
getLogger from typing import List, Optional -from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD +from .constant import ( + COMMON_SAFE_ASCII_CHARACTERS, + TRACE, + UNICODE_SECONDARY_RANGE_KEYWORD, +) from .utils import ( is_accentuated, is_ascii, @@ -123,7 +128,7 @@ class TooManyAccentuatedPlugin(MessDetectorPlugin): @property def ratio(self) -> float: - if self._character_count == 0: + if self._character_count == 0 or self._character_count < 8: return 0.0 ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 @@ -547,7 +552,20 @@ def mess_ratio( break if debug: + logger = getLogger("charset_normalizer") + + logger.log( + TRACE, + "Mess-detector extended-analysis start. " + f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} " + f"maximum_threshold={maximum_threshold}", + ) + + if len(decoded_sequence) > 16: + logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") + logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") + for dt in detectors: # pragma: nocover - print(dt.__class__, dt.ratio) + logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3) diff --git a/lib/charset_normalizer/models.py b/lib/charset_normalizer/models.py index ccb0d475..7f8ca389 100644 --- a/lib/charset_normalizer/models.py +++ b/lib/charset_normalizer/models.py @@ -1,22 +1,9 @@ -import warnings -from collections import Counter from encodings.aliases import aliases from hashlib import sha256 from json import dumps -from re import sub -from typing import ( - Any, - Counter as TypeCounter, - Dict, - Iterator, - List, - Optional, - Tuple, - Union, -) +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union -from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE -from .md import mess_ratio +from .constant import TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -65,7 +52,7 @@ class CharsetMatch: chaos_difference: float = abs(self.chaos - other.chaos) coherence_difference: float = abs(self.coherence - other.coherence) - # Bellow 1% difference --> Use Coherence + # Below 1% difference --> Use Coherence if chaos_difference < 0.01 and coherence_difference > 0.02: # When having a tough decision, use the result that decoded as many multi-byte as possible. if chaos_difference == 0.0 and self.coherence == other.coherence: @@ -78,45 +65,6 @@ class CharsetMatch: def multi_byte_usage(self) -> float: return 1.0 - len(str(self)) / len(self.raw) - @property - def chaos_secondary_pass(self) -> float: - """ - Check once again chaos in decoded text, except this time, with full content. - Use with caution, this can be very slow. - Notice: Will be removed in 3.0 - """ - warnings.warn( - "chaos_secondary_pass is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - return mess_ratio(str(self), 1.0) - - @property - def coherence_non_latin(self) -> float: - """ - Coherence ratio on the first non-latin language detected if ANY. - Notice: Will be removed in 3.0 - """ - warnings.warn( - "coherence_non_latin is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - return 0.0 - - @property - def w_counter(self) -> TypeCounter[str]: - """ - Word counter instance on decoded text. 
- Notice: Will be removed in 3.0 - """ - warnings.warn( - "w_counter is deprecated and will be removed in 3.0", DeprecationWarning - ) - - string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower()) - - return Counter(string_printable_only.split()) - def __str__(self) -> str: # Lazy Str Loading if self._string is None: @@ -252,18 +200,6 @@ class CharsetMatch: """ return [self._encoding] + [m.encoding for m in self._leaves] - def first(self) -> "CharsetMatch": - """ - Kept for BC reasons. Will be removed in 3.0. - """ - return self - - def best(self) -> "CharsetMatch": - """ - Kept for BC reasons. Will be removed in 3.0. - """ - return self - def output(self, encoding: str = "utf_8") -> bytes: """ Method to get re-encoded bytes payload using given target encoding. Default to UTF-8. diff --git a/lib/charset_normalizer/utils.py b/lib/charset_normalizer/utils.py index 859f212b..e3536267 100644 --- a/lib/charset_normalizer/utils.py +++ b/lib/charset_normalizer/utils.py @@ -1,12 +1,6 @@ -try: - # WARNING: unicodedata2 support is going to be removed in 3.0 - # Python is quickly catching up. - import unicodedata2 as unicodedata -except ImportError: - import unicodedata # type: ignore[no-redef] - import importlib import logging +import unicodedata from codecs import IncrementalDecoder from encodings.aliases import aliases from functools import lru_cache @@ -402,7 +396,7 @@ def cut_sequence_chunks( # multi-byte bad cutting detector and adjustment # not the cleanest way to perform that fix but clever enough for now. - if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80: + if is_multi_byte_decoder and i > 0: chunk_partial_size_chk: int = min(chunk_size, 16) diff --git a/lib/charset_normalizer/version.py b/lib/charset_normalizer/version.py index 64c0dbde..cb503673 100644 --- a/lib/charset_normalizer/version.py +++ b/lib/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.1.1" +__version__ = "3.0.1" VERSION = __version__.split(".") diff --git a/lib/requests/__init__.py b/lib/requests/__init__.py index 7ac8e297..22db3c1d 100644 --- a/lib/requests/__init__.py +++ b/lib/requests/__init__.py @@ -80,8 +80,8 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver elif charset_normalizer_version: major, minor, patch = charset_normalizer_version.split(".")[:3] major, minor, patch = int(major), int(minor), int(patch) - # charset_normalizer >= 2.0.0 < 3.0.0 - assert (2, 0, 0) <= (major, minor, patch) < (3, 0, 0) + # charset_normalizer >= 2.0.0 < 4.0.0 + assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0) else: raise Exception("You need either charset_normalizer or chardet installed") diff --git a/lib/requests/__version__.py b/lib/requests/__version__.py index e725ada6..69be3dec 100644 --- a/lib/requests/__version__.py +++ b/lib/requests/__version__.py @@ -5,10 +5,10 @@ __title__ = "requests" __description__ = "Python HTTP for Humans." 
__url__ = "https://requests.readthedocs.io" -__version__ = "2.28.1" -__build__ = 0x022801 +__version__ = "2.28.2" +__build__ = 0x022802 __author__ = "Kenneth Reitz" __author_email__ = "me@kennethreitz.org" __license__ = "Apache 2.0" -__copyright__ = "Copyright 2022 Kenneth Reitz" +__copyright__ = "Copyright Kenneth Reitz" __cake__ = "\u2728 \U0001f370 \u2728" diff --git a/lib/requests/models.py b/lib/requests/models.py index 3cd49f5b..617a4134 100644 --- a/lib/requests/models.py +++ b/lib/requests/models.py @@ -438,7 +438,7 @@ class PreparedRequest(RequestEncodingMixin, RequestHooksMixin): if not scheme: raise MissingSchema( f"Invalid URL {url!r}: No scheme supplied. " - f"Perhaps you meant http://{url}?" + f"Perhaps you meant https://{url}?" ) if not host: diff --git a/lib/urllib3/_version.py b/lib/urllib3/_version.py index 308d7f28..7c031661 100644 --- a/lib/urllib3/_version.py +++ b/lib/urllib3/_version.py @@ -1,2 +1,2 @@ # This file is protected via CODEOWNERS -__version__ = "1.26.13" +__version__ = "1.26.14" diff --git a/lib/urllib3/contrib/appengine.py b/lib/urllib3/contrib/appengine.py index f91bdd6e..a5a6d910 100644 --- a/lib/urllib3/contrib/appengine.py +++ b/lib/urllib3/contrib/appengine.py @@ -224,7 +224,7 @@ class AppEngineManager(RequestMethods): ) # Check if we should retry the HTTP response. - has_retry_after = bool(http_response.getheader("Retry-After")) + has_retry_after = bool(http_response.headers.get("Retry-After")) if retries.is_retry(method, http_response.status, has_retry_after): retries = retries.increment(method, url, response=http_response, _pool=self) log.debug("Retry: %s", url) diff --git a/lib/urllib3/contrib/ntlmpool.py b/lib/urllib3/contrib/ntlmpool.py index 41a8fd17..47166575 100644 --- a/lib/urllib3/contrib/ntlmpool.py +++ b/lib/urllib3/contrib/ntlmpool.py @@ -69,7 +69,7 @@ class NTLMConnectionPool(HTTPSConnectionPool): log.debug("Request headers: %s", headers) conn.request("GET", self.authurl, None, headers) res = conn.getresponse() - reshdr = dict(res.getheaders()) + reshdr = dict(res.headers) log.debug("Response status: %s %s", res.status, res.reason) log.debug("Response headers: %s", reshdr) log.debug("Response data: %s [...]", res.read(100)) @@ -101,7 +101,7 @@ class NTLMConnectionPool(HTTPSConnectionPool): conn.request("GET", self.authurl, None, headers) res = conn.getresponse() log.debug("Response status: %s %s", res.status, res.reason) - log.debug("Response headers: %s", dict(res.getheaders())) + log.debug("Response headers: %s", dict(res.headers)) log.debug("Response data: %s [...]", res.read()[:100]) if res.status != 200: if res.status == 401: diff --git a/lib/urllib3/response.py b/lib/urllib3/response.py index 8f1b4fa8..0bd13d40 100644 --- a/lib/urllib3/response.py +++ b/lib/urllib3/response.py @@ -666,7 +666,7 @@ class HTTPResponse(io.IOBase): def getheaders(self): warnings.warn( "HTTPResponse.getheaders() is deprecated and will be removed " - "in urllib3 v2.1.0. Instead access HTTResponse.headers directly.", + "in urllib3 v2.1.0. Instead access HTTPResponse.headers directly.", category=DeprecationWarning, stacklevel=2, ) @@ -675,7 +675,7 @@ class HTTPResponse(io.IOBase): def getheader(self, name, default=None): warnings.warn( "HTTPResponse.getheader() is deprecated and will be removed " - "in urllib3 v2.1.0. Instead use HTTResponse.headers.get(name, default).", + "in urllib3 v2.1.0. 
Instead use HTTPResponse.headers.get(name, default).", category=DeprecationWarning, stacklevel=2, ) diff --git a/lib/urllib3/util/url.py b/lib/urllib3/util/url.py index 94f1b8d4..3a169a43 100644 --- a/lib/urllib3/util/url.py +++ b/lib/urllib3/util/url.py @@ -63,7 +63,7 @@ IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$") BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$") ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$") -_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*([0-9]{0,5}))?$") % ( +_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % ( REG_NAME_PAT, IPV4_PAT, IPV6_ADDRZ_PAT, diff --git a/requirements.txt b/requirements.txt index 80de6d04..63e238ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -36,7 +36,7 @@ pyparsing==3.0.9 python-dateutil==2.8.2 python-twitter==3.5 pytz==2022.7 -requests==2.28.1 +requests==2.28.2 requests-oauthlib==1.3.1 rumps==0.4.0; platform_system == "Darwin" simplejson==3.18.0
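The api.py hunks above add a `language_threshold` keyword to `from_bytes`, `from_fp` and `from_path`, and drop the deprecated `normalize()` helper. A minimal usage sketch, assuming a text file at the example path `sample.txt` and a hypothetical helper name `normalize_to_utf8` that rebuilds the removed behaviour on top of `from_path()`:

```python
# Minimal sketch: pass the new language_threshold keyword and reproduce the
# removed normalize() helper on top of from_path(). The helper name and the
# input path are illustrative only.
from os.path import basename, splitext

from charset_normalizer import from_path


def normalize_to_utf8(path: str, language_threshold: float = 0.1) -> str:
    """Write a UTF-8 copy of `path` next to it, tagged with the detected encoding."""
    results = from_path(path, language_threshold=language_threshold)
    best_guess = results.best()

    if best_guess is None:
        raise IOError(f'Unable to normalize "{path}", no charset seems to fit.')

    root, ext = splitext(basename(path))
    target = path.replace(basename(path), f"{root}-{best_guess.encoding}{ext}")

    with open(target, "wb") as fp:
        fp.write(best_guess.output())  # output() re-encodes to UTF-8 by default

    return target


print(normalize_to_utf8("sample.txt"))
```

Callers that relied on `normalize()` writing a `<name>-<encoding>` sibling file can keep that convention with a few lines like these; the detection itself is unchanged apart from the now-configurable coherence threshold.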
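legacy.py now keeps only the chardet-compatible `detect()` function; the `CharsetDetector`, `CharsetDoctor`, `CharsetNormalizerMatch` and `CharsetNormalizerMatches` aliases are removed. A short migration sketch, assuming a Cyrillic sample payload chosen purely for illustration:

```python
# The chardet-style shim survives the cleanup: detect() still returns a dict
# with "encoding", "language" and "confidence". The removed class-based
# entry points map onto the module-level from_bytes().
from charset_normalizer import detect, from_bytes

payload = "Всеки човек има право на образование.".encode("cp1251")

legacy_result = detect(payload)
print(legacy_result["encoding"], legacy_result["confidence"])

# Preferred replacement for the dropped CharsetNormalizerMatches.from_bytes():
best_guess = from_bytes(payload).best()
if best_guess is not None:
    print(best_guess.encoding, str(best_guess))
```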
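cd.py introduces `filter_alt_coherence_matches()` so that alternative labels such as "English—" (the em-dash-suffixed entries added to FREQUENCIES) never surface in coherence results. A standalone sketch of the folding rule, using a hypothetical `fold_alternative_languages` helper and made-up ratios:

```python
# Standalone sketch of the folding behaviour: labels carrying the em dash
# character are collapsed back onto their base language, keeping the best ratio.
from typing import Dict, List, Tuple

CoherenceMatches = List[Tuple[str, float]]


def fold_alternative_languages(results: CoherenceMatches) -> CoherenceMatches:
    merged: Dict[str, float] = {}
    for language, ratio in results:
        base_name = language.replace("—", "")
        merged[base_name] = max(ratio, merged.get(base_name, 0.0))
    return sorted(merged.items(), key=lambda item: item[1], reverse=True)


print(fold_alternative_languages([("English", 0.88), ("English—", 0.91), ("German", 0.40)]))
# [('English', 0.91), ('German', 0.4)]
```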
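The rewritten loop in `characters_popularity_compare()` projects each character's observed frequency rank onto the target language's alphabet size before comparing ranks. A simplified sketch of just that tolerance decision (the helper name and return labels are illustrative; in the real code the "neighbourhood-check" case falls through to the before/after set comparison):

```python
# Sketch of the rank-projection tolerance added in characters_popularity_compare().
# A Latin-like alphabet (<= 26 entries) gets a tight 4-rank tolerance before the
# finer neighbourhood comparison; a large alphabet approves outright when the
# projected rank lands within a third of the alphabet.
def projection_outcome(
    observed_rank: int,
    observed_total: int,
    language_rank: int,
    language_total: int,
) -> str:
    expected_projection_ratio = language_total / observed_total
    projection = int(observed_rank * expected_projection_ratio)
    large_alphabet = language_total > 26

    if not large_alphabet and abs(projection - language_rank) > 4:
        return "skip"             # too far off for a small alphabet
    if large_alphabet and abs(projection - language_rank) < language_total / 3:
        return "approve"          # close enough for a large (e.g. CJK) alphabet
    return "neighbourhood-check"  # compare surrounding characters next


# 'e' observed as the 2nd most frequent of 20 characters, English has 26 entries:
print(projection_outcome(1, 20, 0, 26))  # neighbourhood-check
```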
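On the requests side, `check_compatibility()` widens its accepted charset_normalizer range so the 3.0.1 bump above no longer trips the assertion. A small sketch of the version gate, with a hypothetical wrapper name:

```python
# Sketch of the widened compatibility bound: charset_normalizer 2.x and 3.x are
# both accepted now, whereas the old bound rejected anything >= 3.0.0.
def is_supported_charset_normalizer(version: str) -> bool:
    major, minor, patch = (int(part) for part in version.split(".")[:3])
    return (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)


assert is_supported_charset_normalizer("2.1.1") is True
assert is_supported_charset_normalizer("3.0.1") is True
assert is_supported_charset_normalizer("4.0.0") is False
```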