Bump beautifulsoup4 from 4.11.2 to 4.12.2 (#2037)

* Bump beautifulsoup4 from 4.11.2 to 4.12.2

Bumps [beautifulsoup4](https://www.crummy.com/software/BeautifulSoup/bs4/) from 4.11.2 to 4.12.2.

---
updated-dependencies:
- dependency-name: beautifulsoup4
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>

* Update beautifulsoup4==4.12.2

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com>

[skip ci]
This commit is contained in:
dependabot[bot] 2023-08-23 21:38:49 -07:00 committed by GitHub
commit e70e08c3f5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
32 changed files with 1439 additions and 755 deletions

View file

@ -8,14 +8,8 @@ except ImportError as e:
import re
import sys
import warnings
try:
import soupsieve
except ImportError as e:
soupsieve = None
warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
from bs4.css import CSS
from bs4.formatter import (
Formatter,
HTMLFormatter,
@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
"string-escape",
"string_escape",
])
class NamespacedAttribute(str):
"""A namespaced string (e.g. 'xml:lang') that remembers the namespace
('xml') and the name ('lang') that were used to create it.
"""
def __new__(cls, prefix, name=None, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
class PageElement(object):
"""Contains the navigational information for some part of the page:
that is, its current location in the parse tree.
NavigableString, Tag, etc. are all subclasses of PageElement.
"""
# In general, we can't tell just by looking at an element whether
# it's contained in an XML document or an HTML document. But for
# Tags (q.v.) we can store this information at parse time.
known_xml = None
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
@ -163,7 +162,7 @@ class PageElement(object):
:param previous_element: The element parsed immediately before
this one.
:param next_element: The element parsed immediately before
this one.
@ -257,11 +256,11 @@ class PageElement(object):
default = object()
def _all_strings(self, strip=False, types=default):
"""Yield all strings of certain classes, possibly stripping them.
This is implemented differently in Tag and NavigableString.
"""
raise NotImplementedError()
@property
def stripped_strings(self):
"""Yield all strings in this PageElement, stripping them first.
@ -294,11 +293,11 @@ class PageElement(object):
strip, types=types)])
getText = get_text
text = property(get_text)
def replace_with(self, *args):
"""Replace this PageElement with one or more PageElements, keeping the
"""Replace this PageElement with one or more PageElements, keeping the
rest of the tree the same.
:param args: One or more PageElements.
:return: `self`, no longer part of the tree.
"""
@ -410,7 +409,7 @@ class PageElement(object):
This works the same way as `list.insert`.
:param position: The numeric position that should be occupied
in `self.children` by the new PageElement.
in `self.children` by the new PageElement.
:param new_child: A PageElement.
"""
if new_child is None:
@ -546,7 +545,7 @@ class PageElement(object):
"Element has no parent, so 'after' has no meaning.")
if any(x is self for x in args):
raise ValueError("Can't insert an element after itself.")
offset = 0
for successor in args:
# Extract first so that the index won't be screwed up if they
@ -912,7 +911,7 @@ class PageElement(object):
:rtype: bool
"""
return getattr(self, '_decomposed', False) or False
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@ -936,16 +935,11 @@ class NavigableString(str, PageElement):
When Beautiful Soup parses the markup <b>penguin</b>, it will
create a NavigableString for the string "penguin".
"""
"""
PREFIX = ''
SUFFIX = ''
# We can't tell just by looking at a string whether it's contained
# in an XML document or an HTML document.
known_xml = None
def __new__(cls, value):
"""Create a new NavigableString.
@ -961,12 +955,22 @@ class NavigableString(str, PageElement):
u.setup()
return u
def __copy__(self):
def __deepcopy__(self, memo, recursive=False):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
:param recursive: This parameter is ignored; it's only defined
so that NavigableString.__deepcopy__ implements the same
signature as Tag.__deepcopy__.
"""
return type(self)(self)
def __copy__(self):
"""A copy of a NavigableString can only be a deep copy, because
only one PageElement can occupy a given place in a parse tree.
"""
return self.__deepcopy__({})
def __getnewargs__(self):
return (str(self),)
@ -1059,10 +1063,10 @@ class PreformattedString(NavigableString):
as comments (the Comment class) and CDATA blocks (the CData
class).
"""
PREFIX = ''
SUFFIX = ''
def output_ready(self, formatter=None):
"""Make this string ready for output by adding any subclass-specific
prefix or suffix.
@ -1144,7 +1148,7 @@ class Stylesheet(NavigableString):
"""
pass
class Script(NavigableString):
"""A NavigableString representing an executable script (probably
Javascript).
@ -1250,7 +1254,7 @@ class Tag(PageElement):
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline
self.sourcepos = sourcepos
self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
@ -1308,13 +1312,49 @@ class Tag(PageElement):
self.interesting_string_types = builder.string_containers[self.name]
else:
self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3
def __copy__(self):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
def __deepcopy__(self, memo, recursive=True):
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
clone = self._clone()
if recursive:
# Clone this tag's descendants recursively, but without
# making any recursive function calls.
tag_stack = [clone]
for event, element in self._event_stream(self.descendants):
if event is Tag.END_ELEMENT_EVENT:
# Stop appending incoming Tags to the Tag that was
# just closed.
tag_stack.pop()
else:
descendant_clone = element.__deepcopy__(
memo, recursive=False
)
# Add to its parent's .contents
tag_stack[-1].append(descendant_clone)
if event is Tag.START_ELEMENT_EVENT:
# Add the Tag itself to the stack so that its
# children will be .appended to it.
tag_stack.append(descendant_clone)
return clone
def __copy__(self):
"""A copy of a Tag must always be a deep copy, because a Tag's
children can only have one parent at a time.
"""
return self.__deepcopy__({})
def _clone(self):
"""Create a new Tag just like this one, but with no
contents and unattached to any parse tree.
This is the first step in the deepcopy process.
"""
clone = type(self)(
None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
@ -1326,8 +1366,6 @@ class Tag(PageElement):
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
clone.append(child.__copy__())
return clone
@property
@ -1433,7 +1471,7 @@ class Tag(PageElement):
i.contents = []
i._decomposed = True
i = n
def clear(self, decompose=False):
"""Wipe out all children of this PageElement by calling extract()
on them.
@ -1521,7 +1559,7 @@ class Tag(PageElement):
if not isinstance(value, list):
value = [value]
return value
def has_attr(self, key):
"""Does this PageElement have an attribute with the given name?"""
return key in self.attrs
@ -1608,7 +1646,7 @@ class Tag(PageElement):
def __repr__(self, encoding="unicode-escape"):
"""Renders this PageElement as a string.
:param encoding: The encoding to use (Python 2 only).
:param encoding: The encoding to use (Python 2 only).
TODO: This is now ignored and a warning should be issued
if a value is provided.
:return: A (Unicode) string.
@ -1650,106 +1688,212 @@ class Tag(PageElement):
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Render a Unicode representation of this PageElement and its
contents.
:param indent_level: Each line of the rendering will be
indented this many spaces. Used internally in
recursive calls while pretty-printing.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
"""
formatter="minimal",
iterator=None):
pieces = []
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
if indent_level is True:
indent_level = 0
# The currently active tag that put us into string literal
# mode. Until this element is closed, children will be treated
# as string literals and not pretty-printed. String literal
# mode is turned on immediately after this tag begins, and
# turned off immediately before it's closed. This means there
# will be whitespace before and after the tag itself.
string_literal_tag = None
for event, element in self._event_stream(iterator):
if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
piece = element._format_tag(
eventual_encoding, formatter, opening=True
)
elif event is Tag.END_ELEMENT_EVENT:
piece = element._format_tag(
eventual_encoding, formatter, opening=False
)
if indent_level is not None:
indent_level -= 1
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
piece = element.output_ready(formatter)
text = formatter.attribute_value(val)
decoded = (
str(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
# Now we need to apply the 'prettiness' -- extra
# whitespace before and/or after this tag. This can get
# complicated because certain tags, like <pre> and
# <script>, can't be prettified, since adding whitespace would
# change the meaning of the content.
# The default behavior is to add whitespace before and
# after an element when string literal mode is off, and to
# leave things as they are when string literal mode is on.
if string_literal_tag:
indent_before = indent_after = False
else:
indent_before = indent_after = True
# The only time the behavior is more complex than that is
# when we encounter an opening or closing tag that might
# put us into or out of string literal mode.
if (event is Tag.START_ELEMENT_EVENT
and not string_literal_tag
and not element._should_pretty_print()):
# We are about to enter string literal mode. Add
# whitespace before this tag, but not after. We
# will stay in string literal mode until this tag
# is closed.
indent_before = True
indent_after = False
string_literal_tag = element
elif (event is Tag.END_ELEMENT_EVENT
and element is string_literal_tag):
# We are about to exit string literal mode by closing
# the tag that sent us into that mode. Add whitespace
# after this tag, but not before.
indent_before = False
indent_after = True
string_literal_tag = None
# Now we know whether to add whitespace before and/or
# after this element.
if indent_level is not None:
if (indent_before or indent_after):
if isinstance(element, NavigableString):
piece = piece.strip()
if piece:
piece = self._indent_string(
piece, indent_level, formatter,
indent_before, indent_after
)
if event == Tag.START_ELEMENT_EVENT:
indent_level += 1
pieces.append(piece)
return "".join(pieces)
# Names for the different events yielded by _event_stream
START_ELEMENT_EVENT = object()
END_ELEMENT_EVENT = object()
EMPTY_ELEMENT_EVENT = object()
STRING_ELEMENT_EVENT = object()
def _event_stream(self, iterator=None):
"""Yield a sequence of events that can be used to reconstruct the DOM
for this element.
This lets us recreate the nested structure of this element
(e.g. when formatting it as a string) without using recursive
method calls.
This is similar in concept to the SAX API, but it's a simpler
interface designed for internal use. The events are different
from SAX and the arguments associated with the events are Tags
and other Beautiful Soup objects.
:param iterator: An alternate iterator to use when traversing
the tree.
"""
tag_stack = []
iterator = iterator or self.self_and_descendants
for c in iterator:
# If the parent of the element we're about to yield is not
# the tag currently on the stack, it means that the tag on
# the stack closed before this element appeared.
while tag_stack and c.parent != tag_stack[-1]:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
if isinstance(c, Tag):
if c.is_empty_element:
yield Tag.EMPTY_ELEMENT_EVENT, c
else:
yield Tag.START_ELEMENT_EVENT, c
tag_stack.append(c)
continue
else:
yield Tag.STRING_ELEMENT_EVENT, c
while tag_stack:
now_closed_tag = tag_stack.pop()
yield Tag.END_ELEMENT_EVENT, now_closed_tag
def _indent_string(self, s, indent_level, formatter,
indent_before, indent_after):
"""Add indentation whitespace before and/or after a string.
:param s: The string to amend with whitespace.
:param indent_level: The indentation level; affects how much
whitespace goes before the string.
:param indent_before: Whether or not to add whitespace
before the string.
:param indent_after: Whether or not to add whitespace
(a newline) after the string.
"""
space_before = ''
if indent_before and indent_level:
space_before = (formatter.indent * indent_level)
space_after = ''
if indent_after:
space_after = "\n"
return space_before + s + space_after
def _format_tag(self, eventual_encoding, formatter, opening):
# A tag starts with the < character (see below).
# Then the / character, if this is a closing tag.
closing_slash = ''
if not opening:
closing_slash = '/'
# Then an optional namespace prefix.
prefix = ''
if self.prefix:
prefix = self.prefix + ":"
if self.is_empty_element:
close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
# Then a list of attribute values, if this is an opening tag.
attribute_string = ''
if opening:
attributes = formatter.attributes(self)
attrs = []
for key, val in attributes:
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
pretty_print = self._should_pretty_print(indent_level)
space = ''
indent_space = ''
if indent_level is not None:
indent_space = (formatter.indent * (indent_level - 1))
if pretty_print:
space = indent_space
indent_contents = indent_level + 1
else:
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter
)
if self.hidden:
# This is the 'document root' object.
s = contents
else:
s = []
attribute_string = ''
text = formatter.attribute_value(val)
decoded = (
str(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
if attrs:
attribute_string = ' ' + ' '.join(attrs)
if indent_level is not None:
# Even if this particular tag is not pretty-printed,
# we should indent up to the start of the tag.
s.append(indent_space)
s.append('<%s%s%s%s>' % (
prefix, self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
if pretty_print and contents and contents[-1] != "\n":
s.append("\n")
if pretty_print and closeTag:
s.append(space)
s.append(closeTag)
if indent_level is not None and closeTag and self.next_sibling:
# Even if this particular tag is not pretty-printed,
# we're now done with the tag, and we should add a
# newline if appropriate.
s.append("\n")
s = ''.join(s)
return s
def _should_pretty_print(self, indent_level):
# Then an optional closing slash (for a void element in an
# XML document).
void_element_closing_slash = ''
if self.is_empty_element:
void_element_closing_slash = formatter.void_element_close_prefix or ''
# Put it all together.
return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
def _should_pretty_print(self, indent_level=1):
"""Should this tag be pretty-printed?
Most of them should, but some (such as <pre> in HTML
@ -1770,7 +1914,7 @@ class Tag(PageElement):
a Unicode string will be returned.
:param formatter: A Formatter object, or a string naming one of
the standard formatters.
:return: A Unicode string (if encoding==None) or a bytestring
:return: A Unicode string (if encoding==None) or a bytestring
(otherwise).
"""
if encoding is None:
@ -1800,33 +1944,9 @@ class Tag(PageElement):
the standard Formatters.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
return self.decode(indent_level, eventual_encoding, formatter,
iterator=self.descendants)
pretty_print = (indent_level is not None)
s = []
for c in self:
text = None
if isinstance(c, NavigableString):
text = c.output_ready(formatter)
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
if pretty_print and not preserve_whitespace:
s.append(formatter.indent * (indent_level - 1))
s.append(text)
if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@ -1922,6 +2042,18 @@ class Tag(PageElement):
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
def self_and_descendants(self):
"""Iterate over this PageElement and its children in a
breadth-first sequence.
:yield: A sequence of PageElements.
"""
if not self.hidden:
yield self
for i in self.descendants:
yield i
@property
def descendants(self):
"""Iterate over all children of this PageElement in a
@ -1948,16 +2080,13 @@ class Tag(PageElement):
Beautiful Soup will use the prefixes it encountered while
parsing the document.
:param kwargs: Keyword arguments to be passed into SoupSieve's
:param kwargs: Keyword arguments to be passed into Soup Sieve's
soupsieve.select() method.
:return: A Tag.
:rtype: bs4.element.Tag
"""
value = self.select(selector, namespaces, 1, **kwargs)
if value:
return value[0]
return None
return self.css.select_one(selector, namespaces, **kwargs)
def select(self, selector, namespaces=None, limit=None, **kwargs):
"""Perform a CSS selection operation on the current element.
@ -1973,27 +2102,18 @@ class Tag(PageElement):
:param limit: After finding this number of results, stop looking.
:param kwargs: Keyword arguments to be passed into SoupSieve's
:param kwargs: Keyword arguments to be passed into SoupSieve's
soupsieve.select() method.
:return: A ResultSet of Tags.
:rtype: bs4.element.ResultSet
"""
if namespaces is None:
namespaces = self._namespaces
if limit is None:
limit = 0
if soupsieve is None:
raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
)
results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
return self.css.select(selector, namespaces, limit, **kwargs)
# We do this because it's more consistent and because
# ResultSet.__getattr__ has a helpful error message.
return ResultSet(None, results)
@property
def css(self):
"""Return an interface to the CSS selector API."""
return CSS(self)
# Old names for backwards compatibility
def childGenerator(self):
@ -2038,7 +2158,7 @@ class SoupStrainer(object):
:param attrs: A dictionary of filters on attribute values.
:param string: A filter for a NavigableString with specific text.
:kwargs: A dictionary of filters on attribute values.
"""
"""
if string is None and 'text' in kwargs:
string = kwargs.pop('text')
warnings.warn(
@ -2137,7 +2257,7 @@ class SoupStrainer(object):
# looking at a tag with a different name.
if markup and not markup.prefix and self.name != markup.name:
return False
call_function_with_tag_data = (
isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
@ -2223,7 +2343,7 @@ class SoupStrainer(object):
if self._matches(' '.join(markup), match_against):
return True
return False
if match_against is True:
# True matches any non-None value.
return markup is not None
@ -2267,11 +2387,11 @@ class SoupStrainer(object):
return True
else:
return False
# Beyond this point we might need to run the test twice: once against
# the tag's name and once against its prefixed name.
match = False
if not match and isinstance(match_against, str):
# Exact string match
match = markup == match_against