Update beautifulsoup4-4.10.0

parent b581460b51
commit ab8fa4d5b3

16 changed files with 4599 additions and 743 deletions
@@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
        soup = self.soup(markup, store_line_numbers=False)
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

    def test_special_string_containers(self):
        # The html5lib tree builder doesn't support this standard feature,
        # because there's no way of knowing, when a string is created,
        # where in the tree it will eventually end up.
        pass

    def test_html5_attributes(self):
        # The html5lib TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
        #
        # This is a copy of the same test from
        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
        # because the lxml HTML TreeBuilder _doesn't_ work this way.
        for input_element, output_unicode, output_element in (
                ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
                ('&models;', '\u22a7', b'&models;'),
                ('&Nfr;', '\U0001d511', b'&Nfr;'),
                ('&ngE;', '\u2267\u0338', b'&ngE;'),
                ('&not;', '\xac', b'&not;'),
                ('&Not;', '\u2aec', b'&Not;'),
                ('&quot;', '"', b'"'),
                ('&there4;', '\u2234', b'&there4;'),
                ('&therefore;', '\u2234', b'&there4;'),
                ('&Therefore;', '\u2234', b'&there4;'),
                ("&fjlig;", 'fj', b'fj'),
                ("&sqcup;", '\u2294', b'&sqcup;'),
                ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
                ("&apos;", "'", b"'"),
                ("&verbar;", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div
            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            self.assertEqual(without_element, expect)

            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            self.assertEqual(with_element, expect)
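The following standalone sketch (not part of the diff) shows the round trip this test drives; it assumes the stock html.parser builder, which the HTMLParserTreeBuilderSmokeTest version below exercises the same way, and reuses the &models; row from the table:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<div>&models;</div>", "html.parser")
    # On the way in, the named entity becomes its Unicode expansion.
    assert soup.div.string == "\u22a7"
    # Plain encode() emits the raw UTF-8 character...
    assert soup.div.encode() == "<div>\u22a7</div>".encode("utf8")
    # ...while formatter="html" re-substitutes a named entity on the way out.
    assert soup.div.encode(formatter="html") == b"<div>&models;</div>"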
@@ -3,6 +3,7 @@ trees."""

from pdb import set_trace
import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.

        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        self.assertEqual("url3", soup.a['href'])
        self.assertEqual(["cls"], soup.a['class'])
        self.assertEqual("id", soup.a['id'])

        # You can also get this behavior explicitly.
        def assert_attribute(on_duplicate_attribute, expected):
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            self.assertEqual(expected, soup.a['href'])

            # Verify that non-duplicate attributes are treated normally.
            self.assertEqual(["cls"], soup.a['class'])
            self.assertEqual("id", soup.a['id'])
        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)
        assert_attribute(accumulate, ["url1", "url2", "url3"])
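Outside the test harness, on_duplicate_attribute is passed straight through the BeautifulSoup constructor; a minimal sketch of the three strategies covered above:

    from bs4 import BeautifulSoup

    markup = '<a href="url1" href="url2">'
    # Default: later values replace earlier ones.
    assert BeautifulSoup(markup, "html.parser").a["href"] == "url2"
    # "ignore": the first value wins.
    soup = BeautifulSoup(markup, "html.parser", on_duplicate_attribute="ignore")
    assert soup.a["href"] == "url1"
    # A callable receives (attrs, key, value) and can accumulate everything.
    def accumulate(attrs, key, value):
        if not isinstance(attrs[key], list):
            attrs[key] = [attrs[key]]
        attrs[key].append(value)
    soup = BeautifulSoup(markup, "html.parser", on_duplicate_attribute=accumulate)
    assert soup.a["href"] == ["url1", "url2"]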
    def test_html5_attributes(self):
        # The html.parser TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
        for input_element, output_unicode, output_element in (
                ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
                ('&models;', '\u22a7', b'&models;'),
                ('&Nfr;', '\U0001d511', b'&Nfr;'),
                ('&ngE;', '\u2267\u0338', b'&ngE;'),
                ('&not;', '\xac', b'&not;'),
                ('&Not;', '\u2aec', b'&Not;'),
                ('&quot;', '"', b'"'),
                ('&there4;', '\u2234', b'&there4;'),
                ('&therefore;', '\u2234', b'&there4;'),
                ('&Therefore;', '\u2234', b'&there4;'),
                ("&fjlig;", 'fj', b'fj'),
                ("&sqcup;", '\u2294', b'&sqcup;'),
                ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
                ("&apos;", "'", b"'"),
                ("&verbar;", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div
            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            self.assertEqual(without_element, expect)

            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            self.assertEqual(with_element, expect)


class TestHTMLParserSubclass(SoupTest):
    def test_error(self):
        """Verify that our HTMLParser subclass implements error() in a way
        that doesn't cause a crash.
        """
        parser = BeautifulSoupHTMLParser()
-        parser.error("don't crash")
+        with warnings.catch_warnings(record=True) as warns:
+            parser.error("don't crash")
+        [warning] = warns
+        assert "don't crash" == str(warning.message)
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
            "<p>foo�bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo�bar</p>", "<p>foobar</p>")

    def test_entities_in_foreign_document_encoding(self):
        # We can't implement this case correctly because by the time we
        # hear about markup like "&#147;", it's been (incorrectly) converted into
@@ -3,6 +3,7 @@

from pdb import set_trace
import logging
import os
import unittest
import sys
import tempfile
@@ -10,6 +11,8 @@ import tempfile

from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
    GuessedAtParserWarning,
    MarkupResemblesLocatorWarning,
)
from bs4.builder import (
    TreeBuilder,
@@ -29,7 +32,6 @@ import bs4.dammit

from bs4.dammit import (
    EntitySubstitution,
    UnicodeDammit,
    EncodingDetector,
)
from bs4.testing import (
    default_builder,
@@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
        self.store_line_numbers = False
        self.cdata_list_attributes = []
        self.preserve_whitespace_tags = []
        self.string_containers = {}
    def initialize_soup(self, soup):
        pass
    def feed(self, markup):
@@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
            isinstance(x, (TagPlus, StringPlus, CommentPlus))
            for x in soup.recursiveChildGenerator()
        )

    def test_alternate_string_containers(self):
        # Test the ability to customize the string containers for
        # different types of tags.
        class PString(NavigableString):
            pass

        class BString(NavigableString):
            pass

        soup = self.soup(
            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
            string_containers={
                'b': BString,
                'p': PString,
            }
        )

        # The string before the <p> tag is a regular NavigableString.
        assert isinstance(soup.div.contents[0], NavigableString)

        # The string inside the <p> tag, but not inside the <i> tag,
        # is a PString.
        assert isinstance(soup.p.contents[0], PString)

        # Every string inside the <b> tag is a BString, even the one that
        # was also inside an <i> tag.
        for s in soup.b.strings:
            assert isinstance(s, BString)

        # Now that parsing is complete, the string_container_stack
        # (where this information was kept) has been cleared out.
        self.assertEqual([], soup.string_container_stack)

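The same string_containers hook, sketched standalone (PString is a stand-in subclass as in the test above):

    from bs4 import BeautifulSoup
    from bs4.element import NavigableString

    class PString(NavigableString):
        pass

    soup = BeautifulSoup(
        "<div>plain<p>inside p</p></div>", "html.parser",
        string_containers={"p": PString},
    )
    # Strings outside <p> stay plain NavigableStrings...
    assert type(soup.div.contents[0]) is NavigableString
    # ...while strings inside <p> are stored in the custom subclass.
    assert isinstance(soup.p.string, PString)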
class TestWarnings(SoupTest):

    def _no_parser_specified(self, s, is_there=True):
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)
    def _assert_warning(self, warnings, cls):
        for w in warnings:
            if isinstance(w.message, cls):
                return w
        raise Exception("%s warning not found in %r" % (cls, warnings))

    def _assert_no_parser_specified(self, w):
        warning = self._assert_warning(w, GuessedAtParserWarning)
        message = str(warning.message)
        self.assertTrue(
            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
        )

    def test_warning_if_no_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>")
+        self._assert_no_parser_specified(w)

    def test_warning_if_parser_specified_too_vague(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>", "html")
+        self._assert_no_parser_specified(w)

    def test_no_warning_if_explicit_parser_specified(self):
        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html.parser")
+            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

class TestWarnings(SoupTest):

    def test_disk_file_warning(self):
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
-            msg = str(w[0].message)
-            self.assertTrue("looks like a filename" in msg)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a filename" in str(warning.message))
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
-        self.assertEqual(0, len(w))
+        self.assertEqual([], w)

    def test_directory_warning(self):
        try:
            filename = tempfile.mkdtemp()
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
            self.assertTrue("looks like a directory" in str(warning.message))
        finally:
            os.rmdir(filename)

        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual([], w)

    def test_url_warning_with_bytes_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/")
        # Be aware this isn't the only warning that can be raised during
        # execution..
-        self.assertTrue(any("looks like a URL" in str(w.message)
-                            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_unicode_url(self):
        with warnings.catch_warnings(record=True) as warning_list:
            # note - this url must differ from the bytes one otherwise
            # python's warnings system swallows the second warning
            soup = self.soup("http://www.crummyunicode.com/")
-        self.assertTrue(any("looks like a URL" in str(w.message)
-                            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))

    def test_url_warning_with_bytes_and_space(self):
        # Here the markup contains something besides a URL, so no warning
        # is issued.
        with warnings.catch_warnings(record=True) as warning_list:
            soup = self.soup(b"http://www.crummybytes.com/ is great")
        self.assertFalse(any("looks like a URL" in str(w.message)
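The warning these tests assert on can be observed directly; a small sketch (not part of the diff) reusing the suite's URL:

    import warnings
    from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        BeautifulSoup("http://www.crummyunicode.com/", "html.parser")
    assert any(isinstance(x.message, MarkupResemblesLocatorWarning) for x in w)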
@@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")

    def test_html5_entity(self):
        # Some HTML5 entities correspond to single- or multi-character
        # Unicode sequences.

        for entity, u in (
            # A few spot checks of our ability to recognize
            # special character sequences and convert them
            # to named entities.
            ('&models;', '\u22a7'),
            ('&Nfr;', '\U0001d511'),
            ('&ngE;', '\u2267\u0338'),
            ('&not;', '\xac'),
            ('&Not;', '\u2aec'),

            # We _could_ convert | to &verbarr;, but we don't, because
            # | is an ASCII character.
            ('|', '|'),

            # Similarly for the fj ligature, which we could convert to
            # &fjlig;, but we don't.
            ("fj", "fj"),

            # We do convert _these_ ASCII characters to HTML entities,
            # because that's required to generate valid HTML.
            ('&gt;', '>'),
            ('&lt;', '<'),
            ('&amp;', '&'),
        ):
            template = '3 %s 4'
            raw = template % u
            with_entities = template % entity
            self.assertEqual(self.sub.substitute_html(raw), with_entities)

    def test_html5_entity_with_variation_selector(self):
        # Some HTML5 entities correspond either to a single-character
        # Unicode sequence _or_ to the same character plus U+FE00,
        # VARIATION SELECTOR 1. We can handle this.
        data = "fjords \u2294 penguins"
        markup = "fjords &sqcup; penguins"
        self.assertEqual(self.sub.substitute_html(data), markup)

        data = "fjords \u2294\ufe00 penguins"
        markup = "fjords &sqcups; penguins"
        self.assertEqual(self.sub.substitute_html(data), markup)

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)
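A standalone sketch of the substitution API driven above (expected values taken from the test data):

    from bs4.dammit import EntitySubstitution

    # Multi-character sequences map to single named entities...
    assert EntitySubstitution.substitute_html("\u2294\ufe00") == "&sqcups;"
    # ...while ASCII stays literal except where HTML requires escaping.
    assert EntitySubstitution.substitute_html("3 < 4 & 5") == "3 &lt; 4 &amp; 5"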
@@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit."""

    def test_unicode_input(self):
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")

    def test_smart_quotes_to_html_entities(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")

    def test_smart_quotes_to_ascii(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')

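The smart_quotes_to modes, exercised standalone (fixtures copied from the tests above):

    from bs4.dammit import UnicodeDammit

    markup = b"<foo>\x91\x92\x93\x94</foo>"  # Windows-1252 smart quotes
    assert UnicodeDammit(markup).unicode_markup == "<foo>\u2018\u2019\u201c\u201d</foo>"
    assert UnicodeDammit(markup, smart_quotes_to="ascii").unicode_markup == """<foo>''""</foo>"""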
    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
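The exclude_encodings knob in isolation:

    from bs4.dammit import UnicodeDammit

    data = "Räksmörgås".encode("utf-8")
    dammit = UnicodeDammit(data, exclude_encodings=["utf-8"])
    assert dammit.original_encoding.lower() == "windows-1252"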
    def test_detect_html5_style_meta_tag(self):

        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:
        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
        ):
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
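detwingle() outside the harness, with a shorter embedded Windows-1252 span than the fixture above:

    from bs4.dammit import UnicodeDammit

    utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
    win = "\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
    doc = utf8 + win + utf8  # mixed encodings; undecodable as UTF-8
    fixed = UnicodeDammit.detwingle(doc)
    assert fixed.decode("utf8") == "\u2603\u2603\u2603\u201cHi!\u201d\u2603\u2603\u2603"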
    def test_find_declared_encoding(self):
        # Test our ability to find a declared encoding inside an
        # XML or HTML document.
        #
        # Even if the document comes in as Unicode, it may be
        # interesting to know what encoding was claimed
        # originally.

        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
        html_bytes = html_unicode.encode("ascii")

        xml_unicode = '<?xml version="1.0" encoding="ISO-8859-1" ?>'
        xml_bytes = xml_unicode.encode("ascii")

        m = EncodingDetector.find_declared_encoding
        self.assertEqual(None, m(html_unicode, is_html=False))
        self.assertEqual("utf-8", m(html_unicode, is_html=True))
        self.assertEqual("utf-8", m(html_bytes, is_html=True))

        self.assertEqual("iso-8859-1", m(xml_unicode))
        self.assertEqual("iso-8859-1", m(xml_bytes))

        # Normally, only the first few kilobytes of a document are checked for
        # an encoding.
        spacer = b' ' * 5000
        self.assertEqual(None, m(spacer + html_bytes))
        self.assertEqual(None, m(spacer + xml_bytes))

        # But you can tell find_declared_encoding to search an entire
        # HTML document.
        self.assertEqual(
            "utf-8",
            m(spacer + html_bytes, is_html=True, search_entire_document=True)
        )

        # The XML encoding declaration has to be the very first thing
        # in the document. We'll allow whitespace before the document
        # starts, but nothing else.
        self.assertEqual(
            "iso-8859-1",
            m(xml_bytes, search_entire_document=True)
        )
        self.assertEqual(
            None, m(b'a' + xml_bytes, search_entire_document=True)
        )
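The same classmethod used directly, with the documents from the test:

    from bs4.dammit import EncodingDetector

    find = EncodingDetector.find_declared_encoding
    assert find('<?xml version="1.0" encoding="ISO-8859-1" ?>') == "iso-8859-1"
    assert find('<html><head><meta charset="utf-8"></head></html>', is_html=True) == "utf-8"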
class TestNamedspacedAttribute(SoupTest):

    def test_name_may_be_none_or_missing(self):
        a = NamespacedAttribute("xmlns", None)
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns", "")
        self.assertEqual(a, "xmlns")

        a = NamespacedAttribute("xmlns")
        self.assertEqual(a, "xmlns")

    def test_namespace_may_be_none_or_missing(self):
        a = NamespacedAttribute(None, "tag")
        self.assertEqual(a, "tag")

        a = NamespacedAttribute("", "tag")
        self.assertEqual(a, "tag")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        a = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", a)
@@ -27,13 +27,17 @@ from bs4.element import (
    Doctype,
    Formatter,
    NavigableString,
    Script,
    SoupStrainer,
    Stylesheet,
    Tag,
    TemplateString,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)
from soupsieve import SelectorSyntaxError

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
        soup.a.extend(l)
        self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())

    def test_extend_with_another_tags_contents(self):
        data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
        soup = self.soup(data)
        d1 = soup.find('div', id='d1')
        d2 = soup.find('div', id='d2')
        d2.extend(d1)
        self.assertEqual('<div id="d1"></div>', d1.decode())
        self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
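Tag.extend() accepting another tag, per the new test above, in isolation:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div id="d1"><a>1</a><a>2</a></div><div id="d2"></div>', "html.parser")
    d1, d2 = soup.find_all("div")
    d2.extend(d1)  # moves d1's children into d2
    assert d1.decode() == '<div id="d1"></div>'
    assert d2.decode() == '<div id="d2"><a>1</a><a>2</a></div>'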
@@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_with_errors(self):
        # Can't replace a tag that's not part of a tree.
        a_tag = Tag(name="a")
        self.assertRaises(ValueError, a_tag.replace_with, "won't work")

        # Can't replace a tag with its parent.
        a_tag = self.soup("<a><b></b></a>").a
        self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)

        # Or with a list that includes its parent.
        self.assertRaises(ValueError, a_tag.b.replace_with,
                          "string1", a_tag, "string2")

    def test_replace_with_multiple(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        d_tag = soup.new_tag("d")
        d_tag.string = "Text In D Tag"
        e_tag = soup.new_tag("e")
        f_tag = soup.new_tag("f")
        a_string = "Random Text"
        soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
        self.assertEqual(
            "<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
            soup.decode()
        )
        assert soup.b.next_element == d_tag
        assert d_tag.string.next_element == e_tag
        assert e_tag.next_element.string == a_string
        assert e_tag.next_element.next_element == f_tag

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
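replace_with() taking multiple replacements, per the test above:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a><b></b><c></c></a>", "html.parser")
    d_tag = soup.new_tag("d")
    soup.c.replace_with(d_tag, "trailing text")
    assert soup.decode() == "<a><b></b><d></d>trailing text</a>"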
@@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))


    def test_decompose(self):
        # Test PageElement.decompose() and PageElement.decomposed
        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
        p1, p2 = soup.find_all('p')
        a = p1.a
        text = p1.em.string
        for i in [p1, p2, a, text]:
            self.assertEqual(False, i.decomposed)

        # This sets p1 and everything beneath it to decomposed.
        p1.decompose()
        for i in [p1, a, text]:
            self.assertEqual(True, i.decomposed)
        # p2 is unaffected.
        self.assertEqual(False, p2.decomposed)

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
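The decomposed property, outside the harness:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>One</p><p>Two</p>", "html.parser")
    p1, p2 = soup.find_all("p")
    p1.decompose()
    assert p1.decomposed and not p2.decomposed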
@@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

-    def test_get_text_ignores_comments(self):
+    def test_get_text_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")
@@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

-    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(soup.get_text(), "foobar")
+
+    def test_all_strings_ignores_special_string_containers(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))

+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))

    def test_string_methods_inside_special_string_container_tags(self):
        # Strings inside tags like <script> are generally ignored by
        # methods like get_text, because they're not what humans
        # consider 'text'. But if you call get_text on the <script>
        # tag itself, those strings _are_ considered to be 'text',
        # because there's nothing else you might be looking for.

        style = self.soup("<div>a<style>Some CSS</style></div>")
        template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
        script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")

        self.assertEqual(style.div.get_text(), "a")
        self.assertEqual(list(style.div.strings), ["a"])
        self.assertEqual(style.div.style.get_text(), "Some CSS")
        self.assertEqual(list(style.div.style.strings),
                         ['Some CSS'])

        # The comment is not picked up here. That's because it was
        # parsed into a Comment object, which is not considered
        # interesting by template.strings.
        self.assertEqual(template.div.get_text(), "a")
        self.assertEqual(list(template.div.strings), ["a"])
        self.assertEqual(template.div.template.get_text(), "Templated text.")
        self.assertEqual(list(template.div.template.strings),
                         ["Templated ", "text", "."])

        # The comment is included here, because it didn't get parsed
        # into a Comment object--it's part of the Script string.
        self.assertEqual(script.div.get_text(), "a")
        self.assertEqual(list(script.div.strings), ["a"])
        self.assertEqual(script.div.script.get_text(),
                         "<!--a comment-->Some text")
        self.assertEqual(list(script.div.script.strings),
                         ['<!--a comment-->Some text'])


class TestCDAtaListAttributes(SoupTest):

    """Testing cdata-list attributes like 'class'.
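The container-sensitive string behavior, in plain terms:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("foo<style>Some CSS</style>bar", "html.parser")
    # Stylesheet text is skipped at the document level...
    assert soup.get_text() == "foobar"
    # ...but is still 'text' when you ask the tag itself.
    assert soup.style.get_text() == "Some CSS"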
@@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest):
        else:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))

class TestFormatter(SoupTest):

    def test_sort_attributes(self):
        # Test the ability to override Formatter.attributes() to,
        # e.g., disable the normal sorting of attributes.
        class UnsortedFormatter(Formatter):
            def attributes(self, tag):
                self.called_with = tag
                for k, v in sorted(tag.attrs.items()):
                    if k == 'ignore':
                        continue
                    yield k, v

        soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
        formatter = UnsortedFormatter()
        decoded = soup.decode(formatter=formatter)

        # attributes() was called on the <p> tag. It filtered out one
        # attribute and sorted the other two.
        self.assertEqual(formatter.called_with, soup.p)
        self.assertEqual('<p aval="2" cval="1"></p>', decoded)


class TestNavigableStringSubclasses(SoupTest):

    def test_cdata(self):
        # None of the current builders turn CDATA sections into CData
        # objects, but you can create them manually.
        soup = self.soup("")
        cdata = CData("foo")
        soup.insert(1, cdata)
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        self.assertEqual(soup.find(text="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0
        def increment(*args):
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        cdata = CData("<><><>")
        soup.insert(1, cdata)
        self.assertEqual(
            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        # Unlike other NavigableString subclasses, a DOCTYPE always ends
        # in a newline.
        doctype = Doctype("foo")
        soup = self.soup("")
        soup.insert(1, doctype)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        d = Declaration("foo")
        self.assertEqual("<?foo?>", d.output_ready())


class TestSoupSelector(TreeTest):

    HTML = """
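A standalone version of the attributes() override; HTMLFormatter from bs4.formatter is the public entry point for this hook:

    from bs4 import BeautifulSoup
    from bs4.formatter import HTMLFormatter

    class UnsortedAttributes(HTMLFormatter):
        # Emit attributes in source order instead of the default sorted order.
        def attributes(self, tag):
            for k, v in tag.attrs.items():
                yield k, v

    soup = BeautifulSoup('<p z="1" a="2"></p>', "html.parser")
    assert soup.p.decode(formatter=UnsortedAttributes()) == '<p z="1" a="2"></p>'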
@@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
-        self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
@@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest):
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
-            SyntaxError, self.soup.select, "a:nth-of-type(a)")
+            SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
@@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest):
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
-        self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
@@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest):
        self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
-        self.assertRaises(SyntaxError, self.soup.select, ',x, y')
-        self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y')
+        self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
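These selector hunks swap the old SyntaxError for soupsieve's SelectorSyntaxError; catching it directly:

    from bs4 import BeautifulSoup
    from soupsieve import SelectorSyntaxError

    soup = BeautifulSoup("<h1>hi</h1>", "html.parser")
    try:
        soup.select("h1 >")  # dangling combinator
    except SelectorSyntaxError as e:
        print("invalid selector:", e)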