Bump requests from 2.26.0 to 2.27.0 (#1602)

* Bump requests from 2.26.0 to 2.27.0

Bumps [requests](https://github.com/psf/requests) from 2.26.0 to 2.27.0.
- [Release notes](https://github.com/psf/requests/releases)
- [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md)
- [Commits](https://github.com/psf/requests/compare/v2.26.0...v2.27.0)

---
updated-dependencies:
- dependency-name: requests
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update requests==2.27.0

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>
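The pin change itself is a one-line edit. As an illustrative sketch (the commit's file list is summarized below, but the individual requirement files are not shown), the update to a requirements file would read:

-requests==2.26.0
+requests==2.27.0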
dependabot[bot] authored on 2022-01-04 13:20:40 -08:00, committed by GitHub
parent 2c7a3934cb, commit bb5ebe0fa5
11 changed files with 156 additions and 83 deletions


@@ -68,20 +68,21 @@ def from_bytes(
     )
 
     if explain:
+        previous_logger_level = logger.level  # type: int
         logger.addHandler(explain_handler)
         logger.setLevel(logging.DEBUG)
 
     length = len(sequences)  # type: int
 
     if length == 0:
-        logger.warning(
-            "Given content is empty, stopping the process very early, returning empty utf_8 str match"
-        )
+        logger.warning("Encoding detection on empty bytes, assuming utf_8 intention.")
         if explain:
             logger.removeHandler(explain_handler)
-            logger.setLevel(logging.WARNING)
+            logger.setLevel(previous_logger_level or logging.WARNING)
         return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
 
     if cp_isolation is not None:
-        logger.warning(
+        logger.debug(
             "cp_isolation is set. use this flag for debugging purpose. "
             "limited list of encoding allowed : %s.",
             ", ".join(cp_isolation),
@@ -91,7 +92,7 @@ def from_bytes(
         cp_isolation = []
 
     if cp_exclusion is not None:
-        logger.warning(
+        logger.debug(
             "cp_exclusion is set. use this flag for debugging purpose. "
             "limited list of encoding excluded : %s.",
             ", ".join(cp_exclusion),
@@ -101,7 +102,7 @@ def from_bytes(
         cp_exclusion = []
 
     if length <= (chunk_size * steps):
-        logger.warning(
+        logger.debug(
             "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
             steps,
             chunk_size,
@@ -187,7 +188,7 @@ def from_bytes(
         )  # type: bool
 
         if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
-            logger.info(
+            logger.debug(
                 "Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
                 encoding_iana,
             )
@@ -218,7 +219,7 @@ def from_bytes(
                 )
         except (UnicodeDecodeError, LookupError) as e:
             if not isinstance(e, LookupError):
-                logger.warning(
+                logger.debug(
                     "Code page %s does not fit given bytes sequence at ALL. %s",
                     encoding_iana,
                     str(e),
@@ -234,7 +235,7 @@ def from_bytes(
                 break
 
         if similar_soft_failure_test:
-            logger.warning(
+            logger.debug(
                 "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
                 encoding_iana,
                 encoding_soft_failed,
@@ -254,7 +255,7 @@ def from_bytes(
         )  # type: bool
 
         if multi_byte_bonus:
-            logger.info(
+            logger.debug(
                 "Code page %s is a multi byte encoding table and it appear that at least one character "
                 "was encoded using n-bytes.",
                 encoding_iana,
@@ -264,6 +265,7 @@ def from_bytes(
         max_chunk_gave_up = max(max_chunk_gave_up, 2)
         early_stop_count = 0  # type: int
+        lazy_str_hard_failure = False
 
         md_chunks = []  # type: List[str]
         md_ratios = []
@@ -283,12 +285,13 @@ def from_bytes(
                     errors="ignore" if is_multi_byte_decoder else "strict",
                 )  # type: str
             except UnicodeDecodeError as e:  # Lazy str loading may have missed something there
-                logger.warning(
+                logger.debug(
                     "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
                     encoding_iana,
                     str(e),
                 )
                 early_stop_count = max_chunk_gave_up
+                lazy_str_hard_failure = True
                 break
 
             # multi-byte bad cutting detector and adjustment
@@ -324,12 +327,30 @@ def from_bytes(
             ):
                 break
 
+        # We might want to check the sequence again with the whole content
+        # Only if initial MD tests passes
+        if (
+            not lazy_str_hard_failure
+            and is_too_large_sequence
+            and not is_multi_byte_decoder
+        ):
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.debug(
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
         mean_mess_ratio = (
             sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
         )  # type: float
 
         if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
             tested_but_soft_failure.append(encoding_iana)
-            logger.warning(
+            logger.info(
                 "%s was excluded because of initial chaos probing. Gave up %i time(s). "
                 "Computed mean chaos is %f %%.",
                 encoding_iana,
@@ -337,7 +358,10 @@ def from_bytes(
                 round(mean_mess_ratio * 100, ndigits=3),
             )
             # Preparing those fallbacks in case we got nothing.
-            if encoding_iana in ["ascii", "utf_8", specified_encoding]:
+            if (
+                encoding_iana in ["ascii", "utf_8", specified_encoding]
+                and not lazy_str_hard_failure
+            ):
                 fallback_entry = CharsetMatch(
                     sequences, encoding_iana, threshold, False, [], decoded_payload
                 )
@@ -361,7 +385,7 @@ def from_bytes(
             target_languages = mb_encoding_languages(encoding_iana)
 
         if target_languages:
-            logger.info(
+            logger.debug(
                 "{} should target any language(s) of {}".format(
                     encoding_iana, str(target_languages)
                 )
@@ -369,12 +393,15 @@ def from_bytes(
         cd_ratios = []
 
-        for chunk in md_chunks:
-            chunk_languages = coherence_ratio(
-                chunk, 0.1, ",".join(target_languages) if target_languages else None
-            )
+        # We shall skip the CD when its about ASCII
+        # Most of the time its not relevant to run "language-detection" on it.
+        if encoding_iana != "ascii":
+            for chunk in md_chunks:
+                chunk_languages = coherence_ratio(
+                    chunk, 0.1, ",".join(target_languages) if target_languages else None
+                )
 
-            cd_ratios.append(chunk_languages)
+                cd_ratios.append(chunk_languages)
 
         cd_ratios_merged = merge_coherence_ratios(cd_ratios)
@@ -385,20 +412,6 @@ def from_bytes(
             )
         )
 
-        # We might want to check the sequence again with the whole content
-        # Only if initial MD/CD tests passes
-        if is_too_large_sequence and not is_multi_byte_decoder:
-            try:
-                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
-            except UnicodeDecodeError as e:
-                logger.warning(
-                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
-                    encoding_iana,
-                    str(e),
-                )
-                tested_but_hard_failure.append(encoding_iana)
-                continue
-
         results.append(
             CharsetMatch(
                 sequences,
@@ -419,6 +432,7 @@ def from_bytes(
             )
             if explain:
                 logger.removeHandler(explain_handler)
+                logger.setLevel(previous_logger_level)
             return CharsetMatches([results[encoding_iana]])
 
         if encoding_iana == sig_encoding:
@@ -428,16 +442,17 @@ def from_bytes(
             )
             if explain:
                 logger.removeHandler(explain_handler)
+                logger.setLevel(previous_logger_level)
             return CharsetMatches([results[encoding_iana]])
 
     if len(results) == 0:
         if fallback_u8 or fallback_ascii or fallback_specified:
-            logger.warning(
+            logger.debug(
                 "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback."
            )
 
         if fallback_specified:
-            logger.warning(
+            logger.debug(
                 "%s will be used as a fallback match", fallback_specified.encoding
             )
             results.append(fallback_specified)
@@ -458,6 +473,7 @@ def from_bytes(
     if explain:
         logger.removeHandler(explain_handler)
+        logger.setLevel(previous_logger_level)
 
     return results
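
The bundled charset_normalizer changes above do three things: demote chatty warning/info calls to debug, track hard failures during lazy str loading via the new lazy_str_hard_failure flag, and restore the caller's logger level on every return path when explain=True. A minimal sketch of the affected behavior, assuming the charset_normalizer 2.0.x API shipped alongside requests 2.27.0 (from_bytes, CharsetMatches.best(), and the "charset_normalizer" logger name come from that library; the sample payloads are arbitrary):

import logging

from charset_normalizer import from_bytes

# Empty input short-circuits with a single warning and an assumed
# utf_8 match (first hunk above).
assert from_bytes(b"").best().encoding == "utf_8"

# explain=True attaches a verbose handler and forces the logger to DEBUG
# for the duration of the call; the new previous_logger_level bookkeeping
# restores the original level afterwards instead of leaving it altered.
logger = logging.getLogger("charset_normalizer")
logger.setLevel(logging.INFO)

best_guess = from_bytes("Héllo wörld, ça va ?".encode("utf_8"), explain=True).best()
print(best_guess.encoding)  # likely utf_8 for this accented payload
assert logger.level == logging.INFO  # level restored on return

Before this update, the explain=True return paths removed the handler but never reset the level, which is what the added logger.setLevel(previous_logger_level) lines correct.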