Bump requests from 2.28.1 to 2.28.2 (#1968)

* Bump requests from 2.28.1 to 2.28.2

Bumps [requests](https://github.com/psf/requests) from 2.28.1 to 2.28.2.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.28.1...v2.28.2)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.28.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
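Note: the bump itself amounts to a one-line change to the project's pin. A sketch of the corresponding hunk, assuming the pin lives in a file like `requirements.txt` (the changed-file list is not shown in this view):

```diff
 # requirements.txt (assumed location of the pin)
-requests==2.28.1
+requests==2.28.2
```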
Authored by dependabot[bot] on 2023-03-02 20:53:15 -08:00; committed by GitHub
parent 70e09582da
commit cc78f17be5
20 changed files with 527 additions and 302 deletions


@@ -1,7 +1,5 @@
 import logging
-import warnings
 from os import PathLike
-from os.path import basename, splitext
 from typing import Any, BinaryIO, List, Optional, Set

 from .cd import (
@@ -41,11 +39,12 @@ def from_bytes(
     cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
+    language_threshold: float = 0.1,
 ) -> CharsetMatches:
     """
     Given a raw bytes sequence, return the best possibles charset usable to render str objects.
     If there is no results, it is a strong indicator that the source is binary/not text.
-    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
+    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
     And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

     The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
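Note: this hunk surfaces a new `language_threshold` knob on `from_bytes`. A minimal usage sketch against the public API; the payload and the 0.2 value are illustrative, not taken from this diff:

```python
from charset_normalizer import from_bytes

# Illustrative payload; cp1252 gives a realistic non-UTF-8 input.
payload = "La mer et le soleil, été comme hiver.".encode("cp1252")

# language_threshold is the minimum coherence ratio a language must reach
# before it is attached to a match (default 0.1, per this diff).
best_guess = from_bytes(payload, language_threshold=0.2).best()
if best_guess is not None:
    print(best_guess.encoding)  # expected: a cp1252-compatible codec
```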
@@ -197,7 +196,14 @@ def from_bytes(
         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
             logger.log(
                 TRACE,
-                "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                 encoding_iana,
             )
             continue
+        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+                encoding_iana,
+            )
+            continue
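Note: the added `utf_7` guard skips a codec that byte statistics cannot identify. A quick standard-library illustration of why:

```python
# UTF-7 encodes everything into plain ASCII bytes, so without a BOM/SIG
# there is no byte-level signal separating it from genuine ASCII text.
data = "café".encode("utf_7")
print(data)            # b'caf+AOk-'
print(data.isascii())  # True -- indistinguishable from ordinary ASCII
```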
@@ -297,7 +303,13 @@ def from_bytes(
             ):
                 md_chunks.append(chunk)

-                md_ratios.append(mess_ratio(chunk, threshold))
+                md_ratios.append(
+                    mess_ratio(
+                        chunk,
+                        threshold,
+                        explain is True and 1 <= len(cp_isolation) <= 2,
+                    )
+                )

                 if md_ratios[-1] >= threshold:
                     early_stop_count += 1
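Note: `mess_ratio` now receives a debug flag derived from `explain` and a narrow `cp_isolation`. For orientation, a sketch of the scoring call itself; the `charset_normalizer.md` import path matches this codebase, but treat the exact output values as an assumption:

```python
from charset_normalizer.md import mess_ratio

# Scores how "suspicious" a decoded string looks; chunks scoring at or
# above the threshold count against the candidate encoding.
print(mess_ratio("A perfectly ordinary sentence."))       # should be ~0.0
print(mess_ratio("ÃƒÂ©Ã‚Â» classic mojibake ÃƒÂ¨", 0.2))  # should be higher
```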
@@ -389,7 +401,9 @@ def from_bytes(
         if encoding_iana != "ascii":
             for chunk in md_chunks:
                 chunk_languages = coherence_ratio(
-                    chunk, 0.1, ",".join(target_languages) if target_languages else None
+                    chunk,
+                    language_threshold,
+                    ",".join(target_languages) if target_languages else None,
                 )

                 cd_ratios.append(chunk_languages)
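Note: `coherence_ratio` now takes the user-facing `language_threshold` instead of a hard-coded 0.1. A sketch of what it computes, assuming the internal `charset_normalizer.cd` helper keeps this shape:

```python
from charset_normalizer.cd import coherence_ratio

# Ranks languages by how well the text matches their letter frequencies;
# entries under the threshold are dropped. Output values are illustrative.
matches = coherence_ratio("Ceci est un petit texte en français.", 0.1)
print(matches)  # e.g. [("French", 0.57), ...]
```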
@@ -491,6 +505,7 @@ def from_fp(
     cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
+    language_threshold: float = 0.1,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -505,6 +520,7 @@ def from_fp(
         cp_exclusion,
         preemptive_behaviour,
         explain,
+        language_threshold,
     )
@@ -517,6 +533,7 @@ def from_path(
     cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
+    language_threshold: float = 0.1,
 ) -> CharsetMatches:
     """
     Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -532,53 +549,5 @@ def from_path(
         cp_exclusion,
         preemptive_behaviour,
         explain,
+        language_threshold,
     )
-
-
-def normalize(
-    path: "PathLike[Any]",
-    steps: int = 5,
-    chunk_size: int = 512,
-    threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
-    preemptive_behaviour: bool = True,
-) -> CharsetMatch:
-    """
-    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
-    """
-    warnings.warn(
-        "normalize is deprecated and will be removed in 3.0",
-        DeprecationWarning,
-    )
-
-    results = from_path(
-        path,
-        steps,
-        chunk_size,
-        threshold,
-        cp_isolation,
-        cp_exclusion,
-        preemptive_behaviour,
-    )
-
-    filename = basename(path)
-    target_extensions = list(splitext(filename))
-
-    if len(results) == 0:
-        raise IOError(
-            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
-                filename
-            )
-        )
-
-    result = results.best()
-
-    target_extensions[0] += "-" + result.encoding  # type: ignore
-
-    with open(
-        "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
-    ) as fp:
-        fp.write(result.output())  # type: ignore
-
-    return result  # type: ignore
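Note: this release removes the deprecated `normalize` helper outright, so callers need a local replacement. A minimal sketch built on the surviving `from_path` API; the output-file naming here is a simplification of what `normalize` used to do:

```python
from charset_normalizer import from_path

def normalize_to_utf8(path: str) -> None:
    # from_path detects the encoding; best() returns None when nothing
    # fits, mirroring the IOError the removed helper raised.
    result = from_path(path).best()
    if result is None:
        raise IOError('Unable to normalize "{}", no charset seems to fit.'.format(path))
    # output() re-encodes the decoded payload, UTF-8 by default.
    with open(path + ".utf-8", "wb") as fp:
        fp.write(result.output())
```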