mirror of
https://github.com/clinton-hall/nzbToMedia.git
synced 2025-08-21 05:43:16 -07:00
Update vendored beautifulsoup4 to 4.11.1
Adds soupsieve 2.3.2.post1
This commit is contained in:
parent
2226a74ef8
commit
968ec8a1d8
33 changed files with 10852 additions and 3694 deletions
|
@ -1,6 +1,5 @@
|
|||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".
|
||||
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
|
@ -8,32 +7,38 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
|||
provides methods and Pythonic idioms that make it easy to navigate,
|
||||
search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||
Beautiful Soup works with Python 3.5 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
|
||||
documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
"""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.6.3"
|
||||
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
|
||||
__version__ = "4.11.1"
|
||||
__copyright__ = "Copyright (c) 2004-2022 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
from collections import Counter
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry, ParserRejectedMarkup
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 2.
|
||||
if sys.version_info.major < 3:
|
||||
raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')
|
||||
|
||||
from .builder import (
|
||||
builder_registry,
|
||||
ParserRejectedMarkup,
|
||||
XMLParsedAsHTMLWarning,
|
||||
)
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
|
@ -44,28 +49,49 @@ from .element import (
|
|||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
PYTHON_SPECIFIC_ENCODINGS,
|
||||
ResultSet,
|
||||
Script,
|
||||
Stylesheet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
TemplateString,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
# Define some custom warnings.
|
||||
class GuessedAtParserWarning(UserWarning):
|
||||
"""The warning issued when BeautifulSoup has to guess what parser to
|
||||
use -- probably because no parser was specified in the constructor.
|
||||
"""
|
||||
This class defines the basic interface called by the tree builders.
|
||||
|
||||
These methods will be called by the parser:
|
||||
reset()
|
||||
feed(markup)
|
||||
class MarkupResemblesLocatorWarning(UserWarning):
|
||||
"""The warning issued when BeautifulSoup is given 'markup' that
|
||||
actually looks like a resource locator -- a URL or a path to a file
|
||||
on disk.
|
||||
"""
|
||||
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""A data structure representing a parsed HTML or XML document.
|
||||
|
||||
Most of the methods you'll call on a BeautifulSoup object are inherited from
|
||||
PageElement or Tag.
|
||||
|
||||
Internally, this class defines the basic interface called by the
|
||||
tree builders when converting an HTML/XML document into a data
|
||||
structure. The interface abstracts away the differences between
|
||||
parsers. To write a new tree builder, you'll need to understand
|
||||
these methods as a whole.
|
||||
|
||||
These methods will be called by the BeautifulSoup constructor:
|
||||
* reset()
|
||||
* feed(markup)
|
||||
|
||||
The tree builder may call these methods from its feed() implementation:
|
||||
handle_starttag(name, attrs) # See note about return value
|
||||
handle_endtag(name)
|
||||
handle_data(data) # Appends to the current data node
|
||||
endData(containerClass=NavigableString) # Ends the current data node
|
||||
* handle_starttag(name, attrs) # See note about return value
|
||||
* handle_endtag(name)
|
||||
* handle_data(data) # Appends to the current data node
|
||||
* endData(containerClass) # Ends the current data node
|
||||
|
||||
No matter how complicated the underlying parser is, you should be
|
||||
able to build a tree using 'start tag' events, 'end tag' events,
|
||||
|
@ -75,56 +101,77 @@ class BeautifulSoup(Tag):
|
|||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
|
||||
# Since BeautifulSoup subclasses Tag, it's possible to treat it as
|
||||
# a Tag with a .name. This name makes it clear the BeautifulSoup
|
||||
# object isn't a real markup tag.
|
||||
ROOT_TAG_NAME = '[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||
|
||||
# A string containing all ASCII whitespace characters, used in
|
||||
# endData() to detect data chunks that seem 'empty'.
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||
**kwargs):
|
||||
element_classes=None, **kwargs):
|
||||
"""Constructor.
|
||||
|
||||
:param markup: A string or a file-like object representing
|
||||
markup to be parsed.
|
||||
markup to be parsed.
|
||||
|
||||
:param features: Desirable features of the parser to be used. This
|
||||
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||
"html.parser", or "html5lib") or it may be the type of markup
|
||||
to be used ("html", "html5", "xml"). It's recommended that you
|
||||
name a specific parser, so that Beautiful Soup gives you the
|
||||
same results across platforms and virtual environments.
|
||||
:param features: Desirable features of the parser to be
|
||||
used. This may be the name of a specific parser ("lxml",
|
||||
"lxml-xml", "html.parser", or "html5lib") or it may be the
|
||||
type of markup to be used ("html", "html5", "xml"). It's
|
||||
recommended that you name a specific parser, so that
|
||||
Beautiful Soup gives you the same results across platforms
|
||||
and virtual environments.
|
||||
|
||||
:param builder: A specific TreeBuilder to use instead of looking one
|
||||
up based on `features`. You shouldn't need to use this.
|
||||
:param builder: A TreeBuilder subclass to instantiate (or
|
||||
instance to use) instead of looking one up based on
|
||||
`features`. You only need to use this if you've implemented a
|
||||
custom TreeBuilder.
|
||||
|
||||
:param parse_only: A SoupStrainer. Only parts of the document
|
||||
matching the SoupStrainer will be considered. This is useful
|
||||
when parsing part of a document that would otherwise be too
|
||||
large to fit into memory.
|
||||
matching the SoupStrainer will be considered. This is useful
|
||||
when parsing part of a document that would otherwise be too
|
||||
large to fit into memory.
|
||||
|
||||
:param from_encoding: A string indicating the encoding of the
|
||||
document to be parsed. Pass this in if Beautiful Soup is
|
||||
guessing wrongly about the document's encoding.
|
||||
document to be parsed. Pass this in if Beautiful Soup is
|
||||
guessing wrongly about the document's encoding.
|
||||
|
||||
:param exclude_encodings: A list of strings indicating
|
||||
encodings known to be wrong. Pass this in if you don't know
|
||||
the document's encoding but you know Beautiful Soup's guess is
|
||||
wrong.
|
||||
encodings known to be wrong. Pass this in if you don't know
|
||||
the document's encoding but you know Beautiful Soup's guess is
|
||||
wrong.
|
||||
|
||||
:param element_classes: A dictionary mapping BeautifulSoup
|
||||
classes like Tag and NavigableString, to other classes you'd
|
||||
like to be instantiated instead as the parse tree is
|
||||
built. This is useful for subclassing Tag or NavigableString
|
||||
to modify default behavior.
|
||||
|
||||
:param kwargs: For backwards compatibility purposes, the
|
||||
constructor accepts certain keyword arguments used in
|
||||
Beautiful Soup 3. None of these arguments do anything in
|
||||
Beautiful Soup 4 and there's no need to actually pass keyword
|
||||
arguments into the constructor.
|
||||
constructor accepts certain keyword arguments used in
|
||||
Beautiful Soup 3. None of these arguments do anything in
|
||||
Beautiful Soup 4; they will result in a warning and then be
|
||||
ignored.
|
||||
|
||||
Apart from this, any keyword arguments passed into the
|
||||
BeautifulSoup constructor are propagated to the TreeBuilder
|
||||
constructor. This makes it possible to configure a
|
||||
TreeBuilder by passing in arguments, not just by saying which
|
||||
one to use.
|
||||
"""
|
||||
|
||||
if 'convertEntities' in kwargs:
|
||||
del kwargs['convertEntities']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the convertEntities argument to the "
|
||||
"BeautifulSoup constructor. Entities are always converted "
|
||||
|
@ -163,10 +210,10 @@ class BeautifulSoup(Tag):
|
|||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
'has been renamed to "%s."' % (old_name, new_name),
|
||||
DeprecationWarning
|
||||
)
|
||||
return kwargs.pop(old_name)
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
|
@ -179,13 +226,19 @@ class BeautifulSoup(Tag):
|
|||
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||
from_encoding = None
|
||||
|
||||
if len(kwargs) > 0:
|
||||
arg = list(kwargs.keys()).pop()
|
||||
raise TypeError(
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
self.element_classes = element_classes or dict()
|
||||
|
||||
if builder is None:
|
||||
original_features = features
|
||||
# We need this information to track whether or not the builder
|
||||
# was specified well enough that we can omit the 'you need to
|
||||
# specify a parser' warning.
|
||||
original_builder = builder
|
||||
original_features = features
|
||||
|
||||
if isinstance(builder, type):
|
||||
# A builder class was passed in; it needs to be instantiated.
|
||||
builder_class = builder
|
||||
builder = None
|
||||
elif builder is None:
|
||||
if isinstance(features, str):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
|
@ -196,9 +249,18 @@ class BeautifulSoup(Tag):
|
|||
"Couldn't find a tree builder with the features you "
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
builder = builder_class()
|
||||
if not (original_features == builder.NAME or
|
||||
original_features in builder.ALTERNATE_NAMES):
|
||||
|
||||
# At this point either we have a TreeBuilder instance in
|
||||
# builder, or we have a builder_class that we can instantiate
|
||||
# with the remaining **kwargs.
|
||||
if builder is None:
|
||||
builder = builder_class(**kwargs)
|
||||
if not original_builder and not (
|
||||
original_features == builder.NAME or
|
||||
original_features in builder.ALTERNATE_NAMES
|
||||
) and markup:
|
||||
# The user did not tell us which TreeBuilder to use,
|
||||
# and we had to guess. Issue a warning.
|
||||
if builder.is_xml:
|
||||
markup_type = "XML"
|
||||
else:
|
||||
|
@ -232,13 +294,18 @@ class BeautifulSoup(Tag):
|
|||
parser=builder.NAME,
|
||||
markup_type=markup_type
|
||||
)
|
||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||
|
||||
warnings.warn(
|
||||
self.NO_PARSER_SPECIFIED_WARNING % values,
|
||||
GuessedAtParserWarning, stacklevel=2
|
||||
)
|
||||
else:
|
||||
if kwargs:
|
||||
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.known_xml = self.is_xml
|
||||
self.builder.soup = self
|
||||
|
||||
self._namespaces = dict()
|
||||
self.parse_only = parse_only
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
|
@ -247,49 +314,42 @@ class BeautifulSoup(Tag):
|
|||
(isinstance(markup, bytes) and not b'<' in markup)
|
||||
or (isinstance(markup, str) and not '<' in markup)
|
||||
):
|
||||
# Print out warnings for a couple beginner problems
|
||||
# Issue warnings for a couple beginner problems
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, str)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
possible_filename = markup
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception as e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
if isinstance(markup, str):
|
||||
markup = markup.encode("utf8")
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should'
|
||||
' probably open this file and pass the filehandle into'
|
||||
' Beautiful Soup.' % markup)
|
||||
self._check_markup_is_url(markup)
|
||||
# since that is sometimes the intended behavior.
|
||||
if not self._markup_is_url(markup):
|
||||
self._markup_resembles_filename(markup)
|
||||
|
||||
rejections = []
|
||||
success = False
|
||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) in (
|
||||
self.builder.prepare_markup(
|
||||
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||
self.reset()
|
||||
self.builder.initialize_soup(self)
|
||||
try:
|
||||
self._feed()
|
||||
success = True
|
||||
break
|
||||
except ParserRejectedMarkup:
|
||||
except ParserRejectedMarkup as e:
|
||||
rejections.append(e)
|
||||
pass
|
||||
|
||||
if not success:
|
||||
other_exceptions = [str(e) for e in rejections]
|
||||
raise ParserRejectedMarkup(
|
||||
"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
|
||||
)
|
||||
|
||||
# Clear out the markup and remove the builder's circular
|
||||
# reference to this object.
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def __copy__(self):
|
||||
"""Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
|
||||
copy = type(self)(
|
||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||
)
|
||||
|
@ -304,15 +364,31 @@ class BeautifulSoup(Tag):
|
|||
def __getstate__(self):
|
||||
# Frequently a tree builder can't be pickled.
|
||||
d = dict(self.__dict__)
|
||||
if 'builder' in d and not self.builder.picklable:
|
||||
if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
|
||||
d['builder'] = None
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
def _decode_markup(cls, markup):
|
||||
"""Ensure `markup` is bytes so it's safe to send into warnings.warn.
|
||||
|
||||
@staticmethod
|
||||
def _check_markup_is_url(markup):
|
||||
"""
|
||||
Check if markup looks like it's actually a url and raise a warning
|
||||
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||
TODO: warnings.warn had this problem back in 2010 but it might not
|
||||
anymore.
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
decoded = markup.decode('utf-8', 'replace')
|
||||
else:
|
||||
decoded = markup
|
||||
return decoded
|
||||
|
||||
@classmethod
|
||||
def _markup_is_url(cls, markup):
|
||||
"""Error-handling method to raise a warning if incoming markup looks
|
||||
like a URL.
|
||||
|
||||
:param markup: A string.
|
||||
:return: Whether or not the markup resembles a URL
|
||||
closely enough to justify a warning.
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
space = b' '
|
||||
|
@ -321,22 +397,54 @@ class BeautifulSoup(Tag):
|
|||
space = ' '
|
||||
cant_start_with = ("http:", "https:")
|
||||
else:
|
||||
return
|
||||
return False
|
||||
|
||||
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||
if not space in markup:
|
||||
if isinstance(markup, bytes):
|
||||
decoded_markup = markup.decode('utf-8', 'replace')
|
||||
else:
|
||||
decoded_markup = markup
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||
' HTTP client. You should probably use an HTTP client like'
|
||||
' requests to get the document behind the URL, and feed'
|
||||
' that document to Beautiful Soup.' % decoded_markup
|
||||
'The input looks more like a URL than markup. You may want to use'
|
||||
' an HTTP client like requests to get the document behind'
|
||||
' the URL, and feed that document to Beautiful Soup.',
|
||||
MarkupResemblesLocatorWarning
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _markup_resembles_filename(cls, markup):
|
||||
"""Error-handling method to raise a warning if incoming markup
|
||||
resembles a filename.
|
||||
|
||||
:param markup: A bytestring or string.
|
||||
:return: Whether or not the markup resembles a filename
|
||||
closely enough to justify a warning.
|
||||
"""
|
||||
path_characters = '/\\'
|
||||
extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
|
||||
if isinstance(markup, bytes):
|
||||
path_characters = path_characters.encode("utf8")
|
||||
extensions = [x.encode('utf8') for x in extensions]
|
||||
filelike = False
|
||||
if any(x in markup for x in path_characters):
|
||||
filelike = True
|
||||
else:
|
||||
lower = markup.lower()
|
||||
if any(lower.endswith(ext) for ext in extensions):
|
||||
filelike = True
|
||||
if filelike:
|
||||
warnings.warn(
|
||||
'The input looks more like a filename than markup. You may'
|
||||
' want to open this file and pass the filehandle into'
|
||||
' Beautiful Soup.',
|
||||
MarkupResemblesLocatorWarning
|
||||
)
|
||||
return True
|
||||
return False
|
||||
|
||||
def _feed(self):
|
||||
"""Internal method that parses previously set markup, creating a large
|
||||
number of Tag and NavigableString objects.
|
||||
"""
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
|
||||
|
@ -347,49 +455,110 @@ class BeautifulSoup(Tag):
|
|||
self.popTag()
|
||||
|
||||
def reset(self):
|
||||
"""Reset this object to a state as though it had never parsed any
|
||||
markup.
|
||||
"""
|
||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||
self.hidden = 1
|
||||
self.builder.reset()
|
||||
self.current_data = []
|
||||
self.currentTag = None
|
||||
self.tagStack = []
|
||||
self.open_tag_counter = Counter()
|
||||
self.preserve_whitespace_tag_stack = []
|
||||
self.string_container_stack = []
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
|
||||
"""Create a new tag associated with this soup."""
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
|
||||
sourceline=None, sourcepos=None, **kwattrs):
|
||||
"""Create a new Tag associated with this BeautifulSoup object.
|
||||
|
||||
:param name: The name of the new Tag.
|
||||
:param namespace: The URI of the new Tag's XML namespace, if any.
|
||||
:param prefix: The prefix for the new Tag's XML namespace, if any.
|
||||
:param attrs: A dictionary of this Tag's attribute values; can
|
||||
be used instead of `kwattrs` for attributes like 'class'
|
||||
that are reserved words in Python.
|
||||
:param sourceline: The line number where this tag was
|
||||
(purportedly) found in its source document.
|
||||
:param sourcepos: The character position within `sourceline` where this
|
||||
tag was (purportedly) found.
|
||||
:param kwattrs: Keyword arguments for the new Tag's attribute values.
|
||||
|
||||
"""
|
||||
kwattrs.update(attrs)
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
|
||||
return self.element_classes.get(Tag, Tag)(
|
||||
None, self.builder, name, namespace, nsprefix, kwattrs,
|
||||
sourceline=sourceline, sourcepos=sourcepos
|
||||
)
|
||||
|
||||
def new_string(self, s, subclass=NavigableString):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
return subclass(s)
|
||||
def string_container(self, base_class=None):
|
||||
container = base_class or NavigableString
|
||||
|
||||
# There may be a general override of NavigableString.
|
||||
container = self.element_classes.get(
|
||||
container, container
|
||||
)
|
||||
|
||||
def insert_before(self, successor):
|
||||
# On top of that, we may be inside a tag that needs a special
|
||||
# container class.
|
||||
if self.string_container_stack and container is NavigableString:
|
||||
container = self.builder.string_containers.get(
|
||||
self.string_container_stack[-1].name, container
|
||||
)
|
||||
return container
|
||||
|
||||
def new_string(self, s, subclass=None):
|
||||
"""Create a new NavigableString associated with this BeautifulSoup
|
||||
object.
|
||||
"""
|
||||
container = self.string_container(subclass)
|
||||
return container(s)
|
||||
|
||||
def insert_before(self, *args):
|
||||
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
|
||||
it because there is nothing before or after it in the parse tree.
|
||||
"""
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||
|
||||
def insert_after(self, successor):
|
||||
def insert_after(self, *args):
|
||||
"""This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
|
||||
it because there is nothing before or after it in the parse tree.
|
||||
"""
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||
|
||||
def popTag(self):
|
||||
"""Internal method called by _popToTag when a tag is closed."""
|
||||
tag = self.tagStack.pop()
|
||||
if tag.name in self.open_tag_counter:
|
||||
self.open_tag_counter[tag.name] -= 1
|
||||
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
||||
self.preserve_whitespace_tag_stack.pop()
|
||||
#print "Pop", tag.name
|
||||
if self.string_container_stack and tag == self.string_container_stack[-1]:
|
||||
self.string_container_stack.pop()
|
||||
#print("Pop", tag.name)
|
||||
if self.tagStack:
|
||||
self.currentTag = self.tagStack[-1]
|
||||
return self.currentTag
|
||||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag:
|
||||
"""Internal method called by handle_starttag when a tag is opened."""
|
||||
#print("Push", tag.name)
|
||||
if self.currentTag is not None:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
if tag.name != self.ROOT_TAG_NAME:
|
||||
self.open_tag_counter[tag.name] += 1
|
||||
if tag.name in self.builder.preserve_whitespace_tags:
|
||||
self.preserve_whitespace_tag_stack.append(tag)
|
||||
if tag.name in self.builder.string_containers:
|
||||
self.string_container_stack.append(tag)
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
def endData(self, containerClass=None):
|
||||
"""Method called by the TreeBuilder when the end of a data segment
|
||||
occurs.
|
||||
"""
|
||||
if self.current_data:
|
||||
current_data = ''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
|
@ -416,72 +585,93 @@ class BeautifulSoup(Tag):
|
|||
not self.parse_only.search(current_data)):
|
||||
return
|
||||
|
||||
containerClass = self.string_container(containerClass)
|
||||
o = containerClass(current_data)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
parent = parent or self.currentTag
|
||||
previous_element = most_recent_element or self._most_recent_element
|
||||
"""Method called by the TreeBuilder to integrate an object into the parse tree."""
|
||||
if parent is None:
|
||||
parent = self.currentTag
|
||||
if most_recent_element is not None:
|
||||
previous_element = most_recent_element
|
||||
else:
|
||||
previous_element = self._most_recent_element
|
||||
|
||||
next_element = previous_sibling = next_sibling = None
|
||||
if isinstance(o, Tag):
|
||||
next_element = o.next_element
|
||||
next_sibling = o.next_sibling
|
||||
previous_sibling = o.previous_sibling
|
||||
if not previous_element:
|
||||
if previous_element is None:
|
||||
previous_element = o.previous_element
|
||||
|
||||
fix = parent.next_element is not None
|
||||
|
||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
if parent.next_sibling:
|
||||
# This node is being inserted into an element that has
|
||||
# already been parsed. Deal with any dangling references.
|
||||
index = len(parent.contents)-1
|
||||
while index >= 0:
|
||||
if parent.contents[index] is o:
|
||||
break
|
||||
index -= 1
|
||||
else:
|
||||
raise ValueError(
|
||||
"Error building tree: supposedly %r was inserted "
|
||||
"into %r after the fact, but I don't see it!" % (
|
||||
o, parent
|
||||
)
|
||||
)
|
||||
if index == 0:
|
||||
previous_element = parent
|
||||
previous_sibling = None
|
||||
else:
|
||||
previous_element = previous_sibling = parent.contents[index-1]
|
||||
if index == len(parent.contents)-1:
|
||||
next_element = parent.next_sibling
|
||||
next_sibling = None
|
||||
else:
|
||||
next_element = next_sibling = parent.contents[index+1]
|
||||
# Check if we are inserting into an already parsed node.
|
||||
if fix:
|
||||
self._linkage_fixer(parent)
|
||||
|
||||
o.previous_element = previous_element
|
||||
if previous_element:
|
||||
previous_element.next_element = o
|
||||
o.next_element = next_element
|
||||
if next_element:
|
||||
next_element.previous_element = o
|
||||
o.next_sibling = next_sibling
|
||||
if next_sibling:
|
||||
next_sibling.previous_sibling = o
|
||||
o.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
previous_sibling.next_sibling = o
|
||||
def _linkage_fixer(self, el):
|
||||
"""Make sure linkage of this fragment is sound."""
|
||||
|
||||
first = el.contents[0]
|
||||
child = el.contents[-1]
|
||||
descendant = child
|
||||
|
||||
if child is first and el.parent is not None:
|
||||
# Parent should be linked to first child
|
||||
el.next_element = child
|
||||
# We are no longer linked to whatever this element is
|
||||
prev_el = child.previous_element
|
||||
if prev_el is not None and prev_el is not el:
|
||||
prev_el.next_element = None
|
||||
# First child should be linked to the parent, and no previous siblings.
|
||||
child.previous_element = el
|
||||
child.previous_sibling = None
|
||||
|
||||
# We have no sibling as we've been appended as the last.
|
||||
child.next_sibling = None
|
||||
|
||||
# This index is a tag, dig deeper for a "last descendant"
|
||||
if isinstance(child, Tag) and child.contents:
|
||||
descendant = child._last_descendant(False)
|
||||
|
||||
# As the final step, link last descendant. It should be linked
|
||||
# to the parent's next sibling (if found), else walk up the chain
|
||||
# and find a parent with a sibling. It should have no next sibling.
|
||||
descendant.next_element = None
|
||||
descendant.next_sibling = None
|
||||
target = el
|
||||
while True:
|
||||
if target is None:
|
||||
break
|
||||
elif target.next_sibling is not None:
|
||||
descendant.next_element = target.next_sibling
|
||||
target.next_sibling.previous_element = child
|
||||
break
|
||||
target = target.parent
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
stack up to but *not* including the most recent instqance of
|
||||
the given tag."""
|
||||
#print "Popping to %s" % name
|
||||
instance of the given tag.
|
||||
|
||||
If there are no open tags with the given name, nothing will be
|
||||
popped.
|
||||
|
||||
:param name: Pop up to the most recent tag with this name.
|
||||
:param nsprefix: The namespace prefix that goes with `name`.
|
||||
:param inclusivePop: It this is false, pops the tag stack up
|
||||
to but *not* including the most recent instqance of the
|
||||
given tag.
|
||||
|
||||
"""
|
||||
#print("Popping to %s" % name)
|
||||
if name == self.ROOT_TAG_NAME:
|
||||
# The BeautifulSoup object itself can never be popped.
|
||||
return
|
||||
|
@ -490,6 +680,8 @@ class BeautifulSoup(Tag):
|
|||
|
||||
stack_size = len(self.tagStack)
|
||||
for i in range(stack_size - 1, 0, -1):
|
||||
if not self.open_tag_counter.get(name):
|
||||
break
|
||||
t = self.tagStack[i]
|
||||
if (name == t.name and nsprefix == t.prefix):
|
||||
if inclusivePop:
|
||||
|
@ -499,16 +691,26 @@ class BeautifulSoup(Tag):
|
|||
|
||||
return most_recently_popped
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||
"""Push a start tag on to the stack.
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
|
||||
sourcepos=None, namespaces=None):
|
||||
"""Called by the tree builder when a new tag is encountered.
|
||||
|
||||
If this method returns None, the tag was rejected by the
|
||||
:param name: Name of the tag.
|
||||
:param nsprefix: Namespace prefix for the tag.
|
||||
:param attrs: A dictionary of attribute values.
|
||||
:param sourceline: The line number where this tag was found in its
|
||||
source document.
|
||||
:param sourcepos: The character position within `sourceline` where this
|
||||
tag was found.
|
||||
:param namespaces: A dictionary of all namespace prefix mappings
|
||||
currently in scope in the document.
|
||||
|
||||
If this method returns None, the tag was rejected by an active
|
||||
SoupStrainer. You should proceed as if the tag had not occurred
|
||||
in the document. For instance, if this was a self-closing tag,
|
||||
don't call handle_endtag.
|
||||
"""
|
||||
|
||||
# print "Start tag %s: %s" % (name, attrs)
|
||||
# print("Start tag %s: %s" % (name, attrs))
|
||||
self.endData()
|
||||
|
||||
if (self.parse_only and len(self.tagStack) <= 1
|
||||
|
@ -516,33 +718,53 @@ class BeautifulSoup(Tag):
|
|||
or not self.parse_only.search_tag(name, attrs))):
|
||||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element)
|
||||
tag = self.element_classes.get(Tag, Tag)(
|
||||
self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element,
|
||||
sourceline=sourceline, sourcepos=sourcepos,
|
||||
namespaces=namespaces
|
||||
)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self._most_recent_element:
|
||||
if self._most_recent_element is not None:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
def handle_endtag(self, name, nsprefix=None):
|
||||
#print "End tag: " + name
|
||||
"""Called by the tree builder when an ending tag is encountered.
|
||||
|
||||
:param name: Name of the tag.
|
||||
:param nsprefix: Namespace prefix for the tag.
|
||||
"""
|
||||
#print("End tag: " + name)
|
||||
self.endData()
|
||||
self._popToTag(name, nsprefix)
|
||||
|
||||
|
||||
def handle_data(self, data):
|
||||
"""Called by the tree builder when a chunk of textual data is encountered."""
|
||||
self.current_data.append(data)
|
||||
|
||||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Returns a string or Unicode representation of this document.
|
||||
To get Unicode, pass None for encoding."""
|
||||
"""Returns a string or Unicode representation of the parse tree
|
||||
as an HTML or XML document.
|
||||
|
||||
:param pretty_print: If this is True, indentation will be used to
|
||||
make the document more readable.
|
||||
:param eventual_encoding: The encoding of the final document.
|
||||
If this is None, the document will be a Unicode string.
|
||||
"""
|
||||
if self.is_xml:
|
||||
# Print the XML declaration
|
||||
encoding_part = ''
|
||||
if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
|
||||
# This is a special Python encoding; it can't actually
|
||||
# go into an XML document because it means nothing
|
||||
# outside of Python.
|
||||
eventual_encoding = None
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||
|
@ -555,7 +777,7 @@ class BeautifulSoup(Tag):
|
|||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
_soup = BeautifulSoup
|
||||
|
||||
|
@ -566,19 +788,25 @@ class BeautifulStoneSoup(BeautifulSoup):
|
|||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.',
|
||||
DeprecationWarning
|
||||
)
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
"""Exception raised by a TreeBuilder if it's unable to continue parsing."""
|
||||
pass
|
||||
|
||||
class FeatureNotFound(ValueError):
|
||||
"""Exception raised by the BeautifulSoup constructor if no parser with the
|
||||
requested features is found.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
|
||||
#If this file is run as a script, act as an HTML pretty-printer.
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print(soup.prettify())
|
||||
print((soup.prettify()))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue