Update vendored beautifulsoup4 to 4.11.1
Adds soupsieve 2.3.2.post1
parent 2226a74ef8
commit 968ec8a1d8
33 changed files with 10852 additions and 3694 deletions
@@ -6,70 +6,185 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

import codecs
from html.entities import codepoint2name
from collections import defaultdict
import codecs
import re
import logging
import string

# Import a library to autodetect character encodings.
chardet_type = None
# Import a library to autodetect character encodings. We'll support
# any of a number of libraries that all support the same API:
#
# * cchardet
# * chardet
# * charset-normalizer
chardet_module = None
try:
    # First try the fast C implementation.
    # PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
    import cchardet as chardet_module
except ImportError:
    try:
        # Fall back to the pure Python implementation
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
        import chardet as chardet_module
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
        try:
            # PyPI package: charset-normalizer
            import charset_normalizer as chardet_module
        except ImportError:
            # No chardet available.
            chardet_module = None

if chardet_module:
    def chardet_dammit(s):
        if isinstance(s, str):
            return None
        return chardet_module.detect(s)['encoding']
else:
    def chardet_dammit(s):
        return None
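For orientation, a minimal sketch of what the fallback chain above amounts to (illustrative only, separate from the patch; it assumes the vendored package is importable as bs4): whichever of cchardet, chardet, or charset-normalizer imports first becomes chardet_module, and chardet_dammit() only attempts detection on bytestrings.

from bs4.dammit import chardet_dammit

# str input is never passed to a detector; the new code returns None immediately.
print(chardet_dammit("already unicode"))
# bytes input is handed to whichever detector library was imported; expect a
# codec name such as 'utf-8', or None if no detector library is installed.
print(chardet_dammit("caf\u00e9 \u2603".encode("utf-8")))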

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
    'html' : re.compile(html_meta.encode("ascii"), re.I),
    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    'html' : re.compile(html_meta, re.I),
    'xml' : re.compile(xml_encoding, re.I)
}

xml_encoding_re = re.compile(
    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
from html.entities import html5

class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like "|" into named entities like
                # "&verbar;". The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
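As a quick illustration of the three class variables assigned just above (a sketch under the assumption that bs4 4.11.1 is importable; the outputs are noted as expected values only):

from bs4.dammit import EntitySubstitution

# Character-to-entity-name and entity-name-to-character tables built by
# _populate_class_variables().
print(EntitySubstitution.CHARACTER_TO_HTML_ENTITY["\u00e9"])    # expected: 'eacute'
print(EntitySubstitution.HTML_ENTITY_TO_CHARACTER["eacute"])    # expected: 'é'
# The compiled regular expression locates characters that could become named entities.
print(EntitySubstitution.CHARACTER_TO_HTML_ENTITY_RE.findall("caf\u00e9"))  # expected: ['é']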
@@ -89,13 +204,15 @@ class EntitySubstitution(object):
    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity
@@ -190,6 +307,8 @@ class EntitySubstitution(object):
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
@@ -201,23 +320,65 @@ class EncodingDetector:
    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).
    (the known_definite_encodings argument to the constructor).

    2. An encoding declared within the bytestring itself, either in an
    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    3. An encoding detected through textual analysis by chardet,
    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    4. UTF-8.

    5. Windows-1252.

    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        self.override_encodings = override_encodings or []
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
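To make the precedence list above concrete, a short sketch (illustrative, separate from the patch; the byte string and the encoding choices are made up for the example):

from bs4.dammit import EncodingDetector

data = b'<?xml version="1.0" encoding="iso-8859-1"?><doc>caf\xe9</doc>'
detector = EncodingDetector(
    data,
    known_definite_encodings=["utf-8"],  # step 1: tried before anything else
    user_encodings=["big5"],             # step 3: tried once BOM sniffing finds nothing
)
# Roughly: 'utf-8', then 'big5', then the declared 'iso-8859-1', then any
# chardet/cchardet/charset-normalizer guess, then the final fallbacks;
# duplicates are skipped via the internal 'tried' set.
print(list(detector.encodings))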
@@ -228,6 +389,12 @@ class EncodingDetector:
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
@@ -239,9 +406,14 @@ class EncodingDetector:
    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e
@@ -250,6 +422,12 @@ class EncodingDetector:
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
@@ -272,7 +450,11 @@ class EncodingDetector:
    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies."""
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
@@ -304,6 +486,13 @@ class EncodingDetector:
        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
@@ -311,14 +500,22 @@ class EncodingDetector:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
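A brief sketch of the classmethod above (illustrative, separate from the patch); with the encoding_res table keyed by input type, it accepts either bytes or str markup:

from bs4.dammit import EncodingDetector

html = b'<html><head><meta charset="shift_jis"></head><body></body></html>'
print(EncodingDetector.find_declared_encoding(html, is_html=True))                  # expected: 'shift_jis'
print(EncodingDetector.find_declared_encoding(html.decode("ascii"), is_html=True))  # str markup works too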
@@ -341,15 +538,53 @@ class UnicodeDammit:
        "iso-8859-2",
    ]

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
    def __init__(self, markup, known_definite_encodings=[],
                 smart_quotes_to=None, is_html=False, exclude_encodings=[],
                 user_encodings=None, override_encodings=None
    ):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
            to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
            Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
            will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.

        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)
            markup, known_definite_encodings, is_html, exclude_encodings,
            user_encodings, override_encodings
        )

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
@@ -409,6 +644,10 @@ class UnicodeDammit:
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
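For context, a short sketch of the public entry point that ends up calling _convert_from(), using the constructor arguments documented earlier (illustrative, separate from the patch):

from bs4.dammit import UnicodeDammit

data = "Sacr\u00e9 bleu!".encode("utf-8")
dammit = UnicodeDammit(data, known_definite_encodings=["utf-8"], user_encodings=["latin-1"])
print(dammit.unicode_markup)     # expected: 'Sacré bleu!'
print(dammit.original_encoding)  # expected: 'utf-8'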
@@ -423,30 +662,40 @@ class UnicodeDammit:
        markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            #print "Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors)
            #print("Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors))
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception as e:
            #print "That didn't work!"
            #print e
            #print("That didn't work!")
            #print(e)
            return None
        #print "Correct encoding: %s" % proposed
        #print("Correct encoding: %s" % proposed)
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        """Given a string and its encoding, decodes the string into Unicode.

        :param encoding: The name of an encoding.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        :param charset: The name of a character set.
        :return: The name of a codec.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
                 or (charset and self._codec(charset.replace("-", "")))
                 or (charset and self._codec(charset.replace("-", "_")))
@@ -785,12 +1034,16 @@ class UnicodeDammit:
        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.
        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
                'windows-1252', 'windows_1252'):
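Finally, a sketch of detwingle(), whose docstring appears above (essentially the example from the Beautiful Soup documentation; separate from the patch). The sample document embeds Windows-1252 bytes inside UTF-8, which is the one supported case:

from bs4.dammit import UnicodeDammit

snowmen = "\N{SNOWMAN}" * 3
quote = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}"
doc = snowmen.encode("utf8") + quote.encode("windows_1252")

# doc.decode("utf8") would raise UnicodeDecodeError; detwingle() first converts
# the embedded Windows-1252 bytes to their UTF-8 equivalents.
fixed = UnicodeDammit.detwingle(doc)
print(fixed.decode("utf8"))      # expected: '☃☃☃“Hi!”'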