Bump requests from 2.27.1 to 2.28.1 (#1781)

* Bump requests from 2.27.1 to 2.28.1

Bumps [requests](https://github.com/psf/requests) from 2.27.1 to 2.28.1.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.27.1...v2.28.1)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.1

* Update urllib3==1.26.12

* Update certifi==2022.9.24

* Update idna==3.4

* Update charset-normalizer==2.1.1

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
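For reference, the five pins touched by this commit correspond to requests
2.28.1 plus its dependency chain, and would look like this in a
requirements.txt (a minimal fragment; the repository's actual file lists many
more packages):

    # requests and the libraries it pulls in, as pinned by this commit
    requests==2.28.1
    urllib3==1.26.12
    certifi==2022.9.24
    idna==3.4
    charset-normalizer==2.1.1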
Authored by dependabot[bot] on 2022-11-12 17:12:19 -08:00, committed by GitHub
parent baa0e08c2a
commit af1aed0b6b
Signature: no known key found for this signature in database (GPG key ID: 4AEE18F83AFDEB23)
46 changed files with 3295 additions and 2709 deletions

Diff of charset_normalizer/api.py (the remaining changed files are not shown):

@@ -1,11 +1,8 @@
 import logging
+import warnings
+from os import PathLike
 from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
-
-try:
-    from os import PathLike
-except ImportError:  # pragma: no cover
-    PathLike = str  # type: ignore
+from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
     coherence_ratio,
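A note on the import hunk above: the try/except fallback for os.PathLike is
gone because every Python version charset-normalizer 2.1 supports provides it,
and typing.Any is pulled in for the quoted "PathLike[Any]" annotations that
appear later in this diff. A minimal sketch of why the quotes matter (this
example is illustrative, not taken from the library):

    from os import PathLike
    from typing import Any

    # Fine on any supported Python: the string annotation is only evaluated
    # by type checkers, never at runtime.
    def read_anything(path: "PathLike[Any]") -> None:
        ...

    # Without the quotes, PathLike[Any] raises TypeError at import time on
    # Python < 3.9, where os.PathLike is not yet subscriptable at runtime.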
@@ -18,6 +15,7 @@ from .md import mess_ratio
 from .models import CharsetMatch, CharsetMatches
 from .utils import (
     any_specified_encoding,
+    cut_sequence_chunks,
     iana_name,
     identify_sig_or_bom,
     is_cp_similar,
@@ -39,8 +37,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
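The two cp_isolation/cp_exclusion changes above (repeated for from_fp,
from_path, and normalize below) remove "implicit Optional" from the
signatures: under PEP 484, an argument annotated List[str] should not
silently accept a None default, and recent type checkers reject it (mypy
turned no_implicit_optional on by default in 0.990). A minimal illustration
with hypothetical function names:

    from typing import List, Optional

    # Before: flagged by strict type checkers.
    def detect_old(cp_isolation: List[str] = None) -> None:  # type: ignore[assignment]
        ...

    # After: the None default is declared in the annotation.
    def detect_new(cp_isolation: Optional[List[str]] = None) -> None:
        ...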
@@ -70,11 +68,11 @@ def from_bytes(
         )
 
     if explain:
-        previous_logger_level = logger.level  # type: int
+        previous_logger_level: int = logger.level
         logger.addHandler(explain_handler)
         logger.setLevel(TRACE)
 
-    length = len(sequences)  # type: int
+    length: int = len(sequences)
 
     if length == 0:
         logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
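This hunk begins a pattern repeated through the rest of the diff: "# type:"
comments are replaced with PEP 526 variable annotations, which exist on every
interpreter the 2.1 series supports (2.1.0 dropped Python 3.5). Both spellings
mean the same thing to a type checker; a small self-contained example:

    sequences = b"hello"

    # Old style: comment-based hint, required only on Python < 3.6.
    length = len(sequences)  # type: int

    # New style: PEP 526 annotation, checked identically by mypy.
    length_annotated: int = len(sequences)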
@@ -119,8 +117,8 @@ def from_bytes(
     if steps > 1 and length / steps < chunk_size:
         chunk_size = int(length / steps)
 
-    is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE  # type: bool
-    is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE  # type: bool
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
 
     if is_too_small_sequence:
         logger.log(
@@ -137,11 +135,11 @@ def from_bytes(
             ),
         )
 
-    prioritized_encodings = []  # type: List[str]
+    prioritized_encodings: List[str] = []
 
-    specified_encoding = (
+    specified_encoding: Optional[str] = (
         any_specified_encoding(sequences) if preemptive_behaviour else None
-    )  # type: Optional[str]
+    )
 
     if specified_encoding is not None:
         prioritized_encodings.append(specified_encoding)
@@ -151,15 +149,15 @@ def from_bytes(
             specified_encoding,
         )
 
-    tested = set()  # type: Set[str]
-    tested_but_hard_failure = []  # type: List[str]
-    tested_but_soft_failure = []  # type: List[str]
+    tested: Set[str] = set()
+    tested_but_hard_failure: List[str] = []
+    tested_but_soft_failure: List[str] = []
 
-    fallback_ascii = None  # type: Optional[CharsetMatch]
-    fallback_u8 = None  # type: Optional[CharsetMatch]
-    fallback_specified = None  # type: Optional[CharsetMatch]
+    fallback_ascii: Optional[CharsetMatch] = None
+    fallback_u8: Optional[CharsetMatch] = None
+    fallback_specified: Optional[CharsetMatch] = None
 
-    results = CharsetMatches()  # type: CharsetMatches
+    results: CharsetMatches = CharsetMatches()
 
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
@@ -190,11 +188,11 @@ def from_bytes(
         tested.add(encoding_iana)
 
-        decoded_payload = None  # type: Optional[str]
-        bom_or_sig_available = sig_encoding == encoding_iana  # type: bool
-        strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
+        decoded_payload: Optional[str] = None
+        bom_or_sig_available: bool = sig_encoding == encoding_iana
+        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
             encoding_iana
-        )  # type: bool
+        )
 
         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.log(
@@ -205,7 +203,7 @@ def from_bytes(
             continue
 
         try:
-            is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana)  # type: bool
+            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
         except (ModuleNotFoundError, ImportError):
             logger.log(
                 TRACE,
@@ -240,7 +238,7 @@ def from_bytes(
             tested_but_hard_failure.append(encoding_iana)
             continue
 
-        similar_soft_failure_test = False  # type: bool
+        similar_soft_failure_test: bool = False
 
         for encoding_soft_failed in tested_but_soft_failure:
             if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -262,11 +260,11 @@ def from_bytes(
             int(length / steps),
         )
 
-        multi_byte_bonus = (
+        multi_byte_bonus: bool = (
             is_multi_byte_decoder
             and decoded_payload is not None
             and len(decoded_payload) < length
-        )  # type: bool
+        )
 
         if multi_byte_bonus:
             logger.log(
@@ -276,72 +274,47 @@ def from_bytes(
                 encoding_iana,
             )
 
-        max_chunk_gave_up = int(len(r_) / 4)  # type: int
+        max_chunk_gave_up: int = int(len(r_) / 4)
 
         max_chunk_gave_up = max(max_chunk_gave_up, 2)
-        early_stop_count = 0  # type: int
+        early_stop_count: int = 0
         lazy_str_hard_failure = False
 
-        md_chunks = []  # type: List[str]
+        md_chunks: List[str] = []
         md_ratios = []
 
-        for i in r_:
-            if i + chunk_size > length + 8:
-                continue
-
-            cut_sequence = sequences[i : i + chunk_size]
-
-            if bom_or_sig_available and strip_sig_or_bom is False:
-                cut_sequence = sig_payload + cut_sequence
-
-            try:
-                chunk = cut_sequence.decode(
-                    encoding_iana,
-                    errors="ignore" if is_multi_byte_decoder else "strict",
-                )  # type: str
-            except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
-                logger.log(
-                    TRACE,
-                    "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
-                    encoding_iana,
-                    str(e),
-                )
-                early_stop_count = max_chunk_gave_up
-                lazy_str_hard_failure = True
-                break
-
-            # multi-byte bad cutting detector and adjustment
-            # not the cleanest way to perform that fix but clever enough for now.
-            if is_multi_byte_decoder and i > 0 and sequences[i] >= 0x80:
-                chunk_partial_size_chk = min(chunk_size, 16)  # type: int
-
-                if (
-                    decoded_payload
-                    and chunk[:chunk_partial_size_chk] not in decoded_payload
-                ):
-                    for j in range(i, i - 4, -1):
-                        cut_sequence = sequences[j : i + chunk_size]
-
-                        if bom_or_sig_available and strip_sig_or_bom is False:
-                            cut_sequence = sig_payload + cut_sequence
-
-                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
-
-                        if chunk[:chunk_partial_size_chk] in decoded_payload:
-                            break
-
-            md_chunks.append(chunk)
-            md_ratios.append(mess_ratio(chunk, threshold))
-
-            if md_ratios[-1] >= threshold:
-                early_stop_count += 1
-
-            if (early_stop_count >= max_chunk_gave_up) or (
-                bom_or_sig_available and strip_sig_or_bom is False
-            ):
-                break
+        try:
+            for chunk in cut_sequence_chunks(
+                sequences,
+                encoding_iana,
+                r_,
+                chunk_size,
+                bom_or_sig_available,
+                strip_sig_or_bom,
+                sig_payload,
+                is_multi_byte_decoder,
+                decoded_payload,
+            ):
+                md_chunks.append(chunk)
+                md_ratios.append(mess_ratio(chunk, threshold))
+
+                if md_ratios[-1] >= threshold:
+                    early_stop_count += 1
+
+                if (early_stop_count >= max_chunk_gave_up) or (
+                    bom_or_sig_available and strip_sig_or_bom is False
+                ):
+                    break
+        except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
+            logger.log(
+                TRACE,
+                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                encoding_iana,
+                str(e),
+            )
+            early_stop_count = max_chunk_gave_up
+            lazy_str_hard_failure = True
 
         # We might want to check the sequence again with the whole content
         # Only if initial MD tests passes
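The hunk above is the one real refactor in this file: the inline
chunk-cutting loop moves into a cut_sequence_chunks() generator in
charset_normalizer's utils module (added to the imports earlier in this
diff), so a single try/except around the iteration replaces the per-chunk
error handling. A simplified sketch of the generator's shape, inferred from
the removed inline code (the real helper also keeps the multi-byte bad-cut
readjustment that the deleted lines performed):

    from typing import Generator, Optional

    def cut_sequence_chunks(
        sequences: bytes,
        encoding_iana: str,
        offsets: range,
        chunk_size: int,
        bom_or_sig_available: bool,
        strip_sig_or_bom: bool,
        sig_payload: bytes,
        is_multi_byte_decoder: bool,
        decoded_payload: Optional[str] = None,  # used by the omitted bad-cut fix
    ) -> Generator[str, None, None]:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i:chunk_end]

            # Re-attach the BOM/signature when one was detected but not
            # stripped, exactly as the removed inline code did.
            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            # Multi-byte decoders tolerate a bad cut ("ignore"); single-byte
            # decoders fail loudly, and the caller now catches the
            # UnicodeDecodeError around the whole loop instead of per chunk.
            yield cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )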
@@ -362,9 +335,7 @@ def from_bytes(
             tested_but_hard_failure.append(encoding_iana)
             continue
 
-        mean_mess_ratio = (
-            sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
-        )  # type: float
+        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
             logger.log(
@@ -399,7 +370,7 @@ def from_bytes(
             )
 
         if not is_multi_byte_decoder:
-            target_languages = encoding_languages(encoding_iana)  # type: List[str]
+            target_languages: List[str] = encoding_languages(encoding_iana)
         else:
             target_languages = mb_encoding_languages(encoding_iana)
@@ -516,8 +487,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -538,12 +509,12 @@ def from_fp(
 
 def from_path(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -565,17 +536,22 @@ def from_path(
 
 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
     Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
     """
+    warnings.warn(
+        "normalize is deprecated and will be removed in 3.0",
+        DeprecationWarning,
+    )
+
     results = from_path(
         path,
         steps,
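Since normalize() now emits a DeprecationWarning and is slated for removal in
3.0, callers can get the same effect from the public from_path() API. A
minimal sketch (from_path and CharsetMatches.best() are real public API; the
file names here are only illustrative):

    from charset_normalizer import from_path

    results = from_path("./subtitle.srt")
    best_guess = results.best()

    if best_guess is not None:
        # Write a UTF-8 re-encoding of the detected payload next to the
        # original file, which is roughly what normalize() did.
        with open("./subtitle.utf8.srt", "wb") as fp:
            fp.write(str(best_guess).encode("utf_8"))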