diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index db71cc7c..3d2ab09a 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -15,7 +15,7 @@ documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.11.2" +__version__ = "4.12.2" __copyright__ = "Copyright (c) 2004-2023 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" @@ -38,11 +38,13 @@ from .builder import ( builder_registry, ParserRejectedMarkup, XMLParsedAsHTMLWarning, + HTMLParserTreeBuilder ) from .dammit import UnicodeDammit from .element import ( CData, Comment, + CSS, DEFAULT_OUTPUT_ENCODING, Declaration, Doctype, @@ -116,7 +118,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs): @@ -348,25 +350,49 @@ class BeautifulSoup(Tag): self.markup = None self.builder.soup = None - def __copy__(self): - """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" - copy = type(self)( - self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' - ) + def _clone(self): + """Create a new BeautifulSoup object with the same TreeBuilder, + but not associated with any markup. - # Although we encoded the tree to UTF-8, that may not have - # been the encoding of the original markup. Set the copy's - # .original_encoding to reflect the original object's - # .original_encoding. - copy.original_encoding = self.original_encoding - return copy + This is the first step of the deepcopy process. + """ + clone = type(self)("", None, self.builder) + # Keep track of the encoding of the original document, + # since we won't be parsing it again. + clone.original_encoding = self.original_encoding + return clone + def __getstate__(self): # Frequently a tree builder can't be pickled. d = dict(self.__dict__) if 'builder' in d and d['builder'] is not None and not self.builder.picklable: - d['builder'] = None + d['builder'] = type(self.builder) + # Store the contents as a Unicode string. + d['contents'] = [] + d['markup'] = self.decode() + + # If _most_recent_element is present, it's a Tag object left + # over from initial parse. It might not be picklable and we + # don't need it. + if '_most_recent_element' in d: + del d['_most_recent_element'] return d + + def __setstate__(self, state): + # If necessary, restore the TreeBuilder by looking it up. + self.__dict__ = state + if isinstance(self.builder, type): + self.builder = self.builder() + elif not self.builder: + # We don't know which builder was used to build this + # parse tree, so use a default we know is always available. 
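+        # A minimal sketch of the round trip this enables (pickle is
+        # the standard-library module; the markup is illustrative):
+        #
+        #   import pickle
+        #   soup = BeautifulSoup("<p>some markup</p>", "html.parser")
+        #   clone = pickle.loads(pickle.dumps(soup))
+        #   clone.p.string  # 'some markup'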
+ self.builder = HTMLParserTreeBuilder() + self.builder.soup = self + self.reset() + self._feed() + return state + @classmethod def _decode_markup(cls, markup): @@ -468,6 +494,7 @@ class BeautifulSoup(Tag): self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] self.string_container_stack = [] + self._most_recent_element = None self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, @@ -749,7 +776,7 @@ class BeautifulSoup(Tag): def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): + formatter="minimal", iterator=None): """Returns a string or Unicode representation of the parse tree as an HTML or XML document. @@ -776,7 +803,7 @@ class BeautifulSoup(Tag): else: indent_level = 0 return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) + indent_level, eventual_encoding, formatter, iterator) # Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' _s = BeautifulSoup diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index e48b6a0e..e065096b 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -24,6 +24,7 @@ from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.builder import ( DetectsXMLParsedAsHTML, + ParserRejectedMarkup, HTML, HTMLTreeBuilder, STRICT, @@ -70,6 +71,22 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): self._initialize_xml_detector() + def error(self, message): + # NOTE: This method is required so long as Python 3.9 is + # supported. The corresponding code is removed from HTMLParser + # in 3.5, but not removed from ParserBase until 3.10. + # https://github.com/python/cpython/issues/76025 + # + # The original implementation turned the error into a warning, + # but in every case I discovered, this made HTMLParser + # immediately crash with an error message that was less + # helpful than the warning. The new implementation makes it + # more clear that html.parser just can't parse this + # markup. The 3.10 implementation does the same, though it + # raises AssertionError rather than calling a method. (We + # catch this error and wrap it in a ParserRejectedMarkup.) + raise ParserRejectedMarkup(message) + def handle_startendtag(self, name, attrs): """Handle an incoming empty-element tag. @@ -359,6 +376,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): args, kwargs = self.parser_args parser = BeautifulSoupHTMLParser(*args, **kwargs) parser.soup = self.soup - parser.feed(markup) + try: + parser.feed(markup) + except AssertionError as e: + # html.parser raises AssertionError in rare cases to + # indicate a fatal problem with the markup, especially + # when there's an error in the doctype declaration. + raise ParserRejectedMarkup(e) parser.close() parser.already_closed_empty_element = [] diff --git a/lib/bs4/css.py b/lib/bs4/css.py new file mode 100644 index 00000000..245ac601 --- /dev/null +++ b/lib/bs4/css.py @@ -0,0 +1,280 @@ +"""Integration code for CSS selectors using Soup Sieve (pypi: soupsieve).""" + +import warnings +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + + +class CSS(object): + """A proxy object against the soupsieve library, to simplify its + CSS selector API. 
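+
+    A short usage sketch (the markup and selector here are chosen
+    purely for illustration):
+
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup("<div><p>one</p></div>", "html.parser")
+        soup.css.select("div p")       # same results as soup.select("div p")
+        soup.div.css.select_one("p")   # scoped to the <div> tag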
+
+    Acquire this object through the .css attribute on the
+    BeautifulSoup object, or on the Tag you want to use as the
+    starting point for a CSS selector.
+
+    The main advantage of doing this is that the tag to be selected
+    against doesn't need to be explicitly specified in the function
+    calls, since it's already scoped to a tag.
+    """
+
+    def __init__(self, tag, api=soupsieve):
+        """Constructor.
+
+        You don't need to instantiate this class yourself; instead,
+        access the .css attribute on the BeautifulSoup object, or on
+        the Tag you want to use as the starting point for your CSS
+        selector.
+
+        :param tag: All CSS selectors will use this as their starting
+        point.
+
+        :param api: A plug-in replacement for the soupsieve module,
+        designed mainly for use in tests.
+        """
+        if api is None:
+            raise NotImplementedError(
+                "Cannot execute CSS selectors because the soupsieve package is not installed."
+            )
+        self.api = api
+        self.tag = tag
+
+    def escape(self, ident):
+        """Escape a CSS identifier.
+
+        This is a simple wrapper around soupsieve.escape(). See the
+        documentation for that function for more information.
+        """
+        if soupsieve is None:
+            raise NotImplementedError(
+                "Cannot escape CSS identifiers because the soupsieve package is not installed."
+            )
+        return self.api.escape(ident)
+
+    def _ns(self, ns, select):
+        """Normalize a dictionary of namespaces."""
+        if not isinstance(select, self.api.SoupSieve) and ns is None:
+            # If the selector is a precompiled pattern, it already has
+            # a namespace context compiled in, which cannot be
+            # replaced.
+            ns = self.tag._namespaces
+        return ns
+
+    def _rs(self, results):
+        """Normalize a list of results to a ResultSet.
+
+        A ResultSet is more consistent with the rest of Beautiful
+        Soup's API, and ResultSet.__getattr__ has a helpful error
+        message if you try to treat a list of results as a single
+        result (a common mistake).
+        """
+        # Import here to avoid circular import
+        from bs4.element import ResultSet
+        return ResultSet(None, results)
+
+    def compile(self, select, namespaces=None, flags=0, **kwargs):
+        """Pre-compile a selector and return the compiled object.
+
+        :param select: A CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will use the prefixes it encountered while
+        parsing the document.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.compile() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.compile() method.
+
+        :return: A precompiled selector object.
+        :rtype: soupsieve.SoupSieve
+        """
+        return self.api.compile(
+            select, self._ns(namespaces, select), flags, **kwargs
+        )
+
+    def select_one(self, select, namespaces=None, flags=0, **kwargs):
+        """Perform a CSS selection operation on the current Tag and return the
+        first result.
+
+        This uses the Soup Sieve library. For more information, see
+        that library's documentation for the soupsieve.select_one()
+        method.
+
+        :param select: A CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will use the prefixes it encountered while
+        parsing the document.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.select_one() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.select_one() method.
+
+        :return: A Tag, or None if the selector has no match.
+        :rtype: bs4.element.Tag
+
+        """
+        return self.api.select_one(
+            select, self.tag, self._ns(namespaces, select), flags, **kwargs
+        )
+
+    def select(self, select, namespaces=None, limit=0, flags=0, **kwargs):
+        """Perform a CSS selection operation on the current Tag.
+
+        This uses the Soup Sieve library. For more information, see
+        that library's documentation for the soupsieve.select()
+        method.
+
+        :param select: A string containing a CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will pass in the prefixes it encountered while
+        parsing the document.
+
+        :param limit: After finding this number of results, stop looking.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.select() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.select() method.
+
+        :return: A ResultSet of Tag objects.
+        :rtype: bs4.element.ResultSet
+
+        """
+        if limit is None:
+            limit = 0
+
+        return self._rs(
+            self.api.select(
+                select, self.tag, self._ns(namespaces, select), limit, flags,
+                **kwargs
+            )
+        )
+
+    def iselect(self, select, namespaces=None, limit=0, flags=0, **kwargs):
+        """Perform a CSS selection operation on the current Tag.
+
+        This uses the Soup Sieve library. For more information, see
+        that library's documentation for the soupsieve.iselect()
+        method. It is the same as select(), but it returns a generator
+        instead of a list.
+
+        :param select: A string containing a CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will pass in the prefixes it encountered while
+        parsing the document.
+
+        :param limit: After finding this number of results, stop looking.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.iselect() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.iselect() method.
+
+        :return: A generator.
+        :rtype: types.GeneratorType
+        """
+        return self.api.iselect(
+            select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
+        )
+
+    def closest(self, select, namespaces=None, flags=0, **kwargs):
+        """Find the Tag closest to this one that matches the given selector.
+
+        This uses the Soup Sieve library. For more information, see
+        that library's documentation for the soupsieve.closest()
+        method.
+
+        :param select: A string containing a CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will pass in the prefixes it encountered while
+        parsing the document.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.closest() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.closest() method.
+
+        :return: A Tag, or None if there is no match.
+        :rtype: bs4.Tag
+
+        """
+        return self.api.closest(
+            select, self.tag, self._ns(namespaces, select), flags, **kwargs
+        )
+
+    def match(self, select, namespaces=None, flags=0, **kwargs):
+        """Check whether this Tag matches the given CSS selector.
+
+        This uses the Soup Sieve library. For more information, see
+        that library's documentation for the soupsieve.match()
+        method.
+
+        :param select: A CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs.
+        By default, Beautiful Soup will pass in the prefixes it
+        encountered while parsing the document.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.match() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.match() method.
+
+        :return: True if this Tag matches the selector; False otherwise.
+        :rtype: bool
+        """
+        return self.api.match(
+            select, self.tag, self._ns(namespaces, select), flags, **kwargs
+        )
+
+    def filter(self, select, namespaces=None, flags=0, **kwargs):
+        """Filter this Tag's direct children based on the given CSS selector.
+
+        This uses the Soup Sieve library. It works the same way as
+        passing this Tag into that library's soupsieve.filter()
+        method. For more information, see the documentation for
+        soupsieve.filter().
+
+        :param select: A string containing a CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+        used in the CSS selector to namespace URIs. By default,
+        Beautiful Soup will pass in the prefixes it encountered while
+        parsing the document.
+
+        :param flags: Flags to be passed into Soup Sieve's
+        soupsieve.filter() method.
+
+        :param kwargs: Keyword arguments to be passed into Soup
+        Sieve's soupsieve.filter() method.
+
+        :return: A ResultSet of Tag objects.
+        :rtype: bs4.element.ResultSet
+
+        """
+        return self._rs(
+            self.api.filter(
+                select, self.tag, self._ns(namespaces, select), flags, **kwargs
+            )
+        )
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index 3bf583f5..e079772e 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -59,21 +59,6 @@ def diagnose(data):
     if hasattr(data, 'read'):
         data = data.read()
-    elif data.startswith("http:") or data.startswith("https:"):
-        print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
-        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
-        return
-    else:
-        try:
-            if os.path.exists(data):
-                print(('"%s" looks like a filename. Reading data from the file.' % data))
-                with open(data) as fp:
-                    data = fp.read()
-        except ValueError:
-            # This can happen on some platforms when the 'filename' is
-            # too long. Assume it's data and not a filename.
-            pass
-
     print("")
     for parser in basic_parsers:
         print(("Trying to parse your markup with %s" % parser))
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index 583d0e8a..9c73957c 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -8,14 +8,8 @@ except ImportError as e:
 import re
 import sys
 import warnings
-try:
-    import soupsieve
-except ImportError as e:
-    soupsieve = None
-    warnings.warn(
-        'The soupsieve package is not installed. CSS selectors cannot be used.'
-    )
+from bs4.css import CSS
 from bs4.formatter import (
     Formatter,
     HTMLFormatter,
@@ -69,13 +63,13 @@ PYTHON_SPECIFIC_ENCODINGS = set([
     "string-escape",
     "string_escape",
 ])
-    
+
 class NamespacedAttribute(str):
     """A namespaced string (e.g. 'xml:lang') that remembers the namespace
     ('xml') and the name ('lang') that were used to create it.
     """
-    
+
     def __new__(cls, prefix, name=None, namespace=None):
         if not name:
             # This is the default namespace. Its name "has no value"
@@ -146,14 +140,19 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
             return match.group(1) + encoding
         return self.CHARSET_RE.sub(rewrite, self.original_value)
-    
+
 class PageElement(object):
     """Contains the navigational information for some part of the page:
     that is, its current location in the parse tree.
 
     NavigableString, Tag, etc. are all subclasses of PageElement.
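 
     A quick sketch of that navigational information in use (the
     markup here is chosen purely for illustration):
 
         from bs4 import BeautifulSoup
         soup = BeautifulSoup("<p>one</p><p>two</p>", "html.parser")
         soup.p.next_sibling   # the second <p> Tag
         soup.p.next_element   # 'one', the string parsed right after the first <p>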
""" - + + # In general, we can't tell just by looking at an element whether + # it's contained in an XML document or an HTML document. But for + # Tags (q.v.) we can store this information at parse time. + known_xml = None + def setup(self, parent=None, previous_element=None, next_element=None, previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and @@ -163,7 +162,7 @@ class PageElement(object): :param previous_element: The element parsed immediately before this one. - + :param next_element: The element parsed immediately before this one. @@ -257,11 +256,11 @@ class PageElement(object): default = object() def _all_strings(self, strip=False, types=default): """Yield all strings of certain classes, possibly stripping them. - + This is implemented differently in Tag and NavigableString. """ raise NotImplementedError() - + @property def stripped_strings(self): """Yield all strings in this PageElement, stripping them first. @@ -294,11 +293,11 @@ class PageElement(object): strip, types=types)]) getText = get_text text = property(get_text) - + def replace_with(self, *args): - """Replace this PageElement with one or more PageElements, keeping the + """Replace this PageElement with one or more PageElements, keeping the rest of the tree the same. - + :param args: One or more PageElements. :return: `self`, no longer part of the tree. """ @@ -410,7 +409,7 @@ class PageElement(object): This works the same way as `list.insert`. :param position: The numeric position that should be occupied - in `self.children` by the new PageElement. + in `self.children` by the new PageElement. :param new_child: A PageElement. """ if new_child is None: @@ -546,7 +545,7 @@ class PageElement(object): "Element has no parent, so 'after' has no meaning.") if any(x is self for x in args): raise ValueError("Can't insert an element after itself.") - + offset = 0 for successor in args: # Extract first so that the index won't be screwed up if they @@ -912,7 +911,7 @@ class PageElement(object): :rtype: bool """ return getattr(self, '_decomposed', False) or False - + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -936,16 +935,11 @@ class NavigableString(str, PageElement): When Beautiful Soup parses the markup penguin, it will create a NavigableString for the string "penguin". - """ + """ PREFIX = '' SUFFIX = '' - # We can't tell just by looking at a string whether it's contained - # in an XML document or an HTML document. - - known_xml = None - def __new__(cls, value): """Create a new NavigableString. @@ -961,12 +955,22 @@ class NavigableString(str, PageElement): u.setup() return u - def __copy__(self): + def __deepcopy__(self, memo, recursive=False): """A copy of a NavigableString has the same contents and class as the original, but it is not connected to the parse tree. + + :param recursive: This parameter is ignored; it's only defined + so that NavigableString.__deepcopy__ implements the same + signature as Tag.__deepcopy__. """ return type(self)(self) + def __copy__(self): + """A copy of a NavigableString can only be a deep copy, because + only one PageElement can occupy a given place in a parse tree. + """ + return self.__deepcopy__({}) + def __getnewargs__(self): return (str(self),) @@ -1059,10 +1063,10 @@ class PreformattedString(NavigableString): as comments (the Comment class) and CDATA blocks (the CData class). 
""" - + PREFIX = '' SUFFIX = '' - + def output_ready(self, formatter=None): """Make this string ready for output by adding any subclass-specific prefix or suffix. @@ -1144,7 +1148,7 @@ class Stylesheet(NavigableString): """ pass - + class Script(NavigableString): """A NavigableString representing an executable script (probably Javascript). @@ -1250,7 +1254,7 @@ class Tag(PageElement): if ((not builder or builder.store_line_numbers) and (sourceline is not None or sourcepos is not None)): self.sourceline = sourceline - self.sourcepos = sourcepos + self.sourcepos = sourcepos if attrs is None: attrs = {} elif attrs: @@ -1308,13 +1312,49 @@ class Tag(PageElement): self.interesting_string_types = builder.string_containers[self.name] else: self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES - + parserClass = _alias("parser_class") # BS3 - def __copy__(self): - """A copy of a Tag is a new Tag, unconnected to the parse tree. + def __deepcopy__(self, memo, recursive=True): + """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. """ + clone = self._clone() + + if recursive: + # Clone this tag's descendants recursively, but without + # making any recursive function calls. + tag_stack = [clone] + for event, element in self._event_stream(self.descendants): + if event is Tag.END_ELEMENT_EVENT: + # Stop appending incoming Tags to the Tag that was + # just closed. + tag_stack.pop() + else: + descendant_clone = element.__deepcopy__( + memo, recursive=False + ) + # Add to its parent's .contents + tag_stack[-1].append(descendant_clone) + + if event is Tag.START_ELEMENT_EVENT: + # Add the Tag itself to the stack so that its + # children will be .appended to it. + tag_stack.append(descendant_clone) + return clone + + def __copy__(self): + """A copy of a Tag must always be a deep copy, because a Tag's + children can only have one parent at a time. + """ + return self.__deepcopy__({}) + + def _clone(self): + """Create a new Tag just like this one, but with no + contents and unattached to any parse tree. + + This is the first step in the deepcopy process. + """ clone = type(self)( None, self.builder, self.name, self.namespace, self.prefix, self.attrs, is_xml=self._is_xml, @@ -1326,8 +1366,6 @@ class Tag(PageElement): ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) return clone @property @@ -1433,7 +1471,7 @@ class Tag(PageElement): i.contents = [] i._decomposed = True i = n - + def clear(self, decompose=False): """Wipe out all children of this PageElement by calling extract() on them. @@ -1521,7 +1559,7 @@ class Tag(PageElement): if not isinstance(value, list): value = [value] return value - + def has_attr(self, key): """Does this PageElement have an attribute with the given name?""" return key in self.attrs @@ -1608,7 +1646,7 @@ class Tag(PageElement): def __repr__(self, encoding="unicode-escape"): """Renders this PageElement as a string. - :param encoding: The encoding to use (Python 2 only). + :param encoding: The encoding to use (Python 2 only). TODO: This is now ignored and a warning should be issued if a value is provided. :return: A (Unicode) string. @@ -1650,106 +1688,212 @@ class Tag(PageElement): def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. 
- - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - """ - + formatter="minimal", + iterator=None): + pieces = [] # First off, turn a non-Formatter `formatter` into a Formatter # object. This will stop the lookup from happening over and # over again. if not isinstance(formatter, Formatter): formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key + + if indent_level is True: + indent_level = 0 + + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + + for event, element in self._event_stream(iterator): + if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): + piece = element._format_tag( + eventual_encoding, formatter, opening=True + ) + elif event is Tag.END_ELEMENT_EVENT: + piece = element._format_tag( + eventual_encoding, formatter, opening=False + ) + if indent_level is not None: + indent_level -= 1 else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) + piece = element.output_ready(formatter) - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like
<pre> and
+            # <textarea>, can't be prettified, since adding whitespace inside
+            # them changes the meaning of the content.
diff --git a/lib/bs4/tests/test_pageelement.py b/lib/bs4/tests/test_pageelement.py
+    def test_prettify_handles_nested_string_literal_tags(self):
+        # Everything inside the <pre> tag is a string literal: it is
+        # rendered exactly as it was parsed, with no prettification.
+        markup = """<div><pre><code>some
+<textarea> for you </textarea>
+</code></pre></div>"""
+
+        expect = """<div>
+ <pre><code>some
+<textarea> for you </textarea>
+</code></pre>
+</div>
+"""
+        soup = self.soup(markup)
+        assert expect == soup.div.prettify()

     def test_prettify_accepts_formatter_function(self):
         soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
@@ -216,429 +249,6 @@ class TestFormatters(SoupTest):
         assert soup.contents[0].name == 'pre'

-@pytest.mark.skipif(not SOUP_SIEVE_PRESENT, reason="Soup Sieve not installed")
-class TestCSSSelectors(SoupTest):
-    """Test basic CSS selector functionality.
-
-    This functionality is implemented in soupsieve, which has a much
-    more comprehensive test suite, so this is basically an extra check
-    that soupsieve works as expected.
-    """
-
-    HTML = """
-<html>
-<head>
-<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
-<title>The title</title>
-<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
-</head>
-<body>
-<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
-<div id="main" class="fancy">
-<div id="inner">
-<h1 id="header1">An H1</h1>
-<p>Some text</p>
-<p class="onep" id="p1">Some more text</p>
-<h2 id="header2">An H2</h2>
-<p class="class1 class2 class3" id="pmulti">Another</p>
-<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
-<h2 id="header3">Another H2</h2>
-<a id="me" href="http://simonwillison.net/" rel="me">me</a>
-<span class="s1">
-<a href="#" id="s1a1">span1a1</a>
-<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
-<span class="span2">
-<a href="#" id="s2a1">span2a1</a>
-</span>
-<span class="span3"></span>
-<custom-dashed-tag class="dashed" id="dash2"/>
-<div data-tag="dashedvalue" id="data1"/>
-</span>
-</div>
-<x id="xid">
-<z id="zida"/>
-<z id="zidab"/>
-<z id="zidac"/>
-</x>
-<y id="yid">
-<z id="zidb"/>
-</y>
-<p lang="en" id="lang-en">English</p>
-<p lang="en-gb" id="lang-en-gb">English UK</p>
-<p lang="en-us" id="lang-en-us">English US</p>
-<p lang="fr" id="lang-fr">French</p>
-</div>
-
-<div id="footer">
- - -""" - - def setup_method(self): - self.soup = BeautifulSoup(self.HTML, 'html.parser') - - def assert_selects(self, selector, expected_ids, **kwargs): - el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)] - el_ids.sort() - expected_ids.sort() - assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % ( - selector, ', '.join(expected_ids), ', '.join(el_ids) - ) - - assertSelect = assert_selects - - def assert_select_multiple(self, *tests): - for selector, expected_ids in tests: - self.assert_selects(selector, expected_ids) - - def test_one_tag_one(self): - els = self.soup.select('title') - assert len(els) == 1 - assert els[0].name == 'title' - assert els[0].contents == ['The title'] - - def test_one_tag_many(self): - els = self.soup.select('div') - assert len(els) == 4 - for div in els: - assert div.name == 'div' - - el = self.soup.select_one('div') - assert 'main' == el['id'] - - def test_select_one_returns_none_if_no_match(self): - match = self.soup.select_one('nonexistenttag') - assert None == match - - - def test_tag_in_tag_one(self): - els = self.soup.select('div div') - self.assert_selects('div div', ['inner', 'data1']) - - def test_tag_in_tag_many(self): - for selector in ('html div', 'html body div', 'body div'): - self.assert_selects(selector, ['data1', 'main', 'inner', 'footer']) - - - def test_limit(self): - self.assert_selects('html div', ['main'], limit=1) - self.assert_selects('html body div', ['inner', 'main'], limit=2) - self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'], - limit=10) - - def test_tag_no_match(self): - assert len(self.soup.select('del')) == 0 - - def test_invalid_tag(self): - with pytest.raises(SelectorSyntaxError): - self.soup.select('tag%t') - - def test_select_dashed_tag_ids(self): - self.assert_selects('custom-dashed-tag', ['dash1', 'dash2']) - - def test_select_dashed_by_id(self): - dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]') - assert dashed[0].name == 'custom-dashed-tag' - assert dashed[0]['id'] == 'dash2' - - def test_dashed_tag_text(self): - assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.' 
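The equivalence asserted just below still holds through 4.12's new CSS
proxy, since Tag.select() now delegates to the soupsieve-backed .css
attribute; a quick sketch against the test document above:

    soup = BeautifulSoup(TestCSSSelectors.HTML, 'html.parser')
    soup.select('custom-dashed-tag') == soup.find_all('custom-dashed-tag')    # True
    soup.css.select('custom-dashed-tag') == soup.select('custom-dashed-tag')  # True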
- - def test_select_dashed_matches_find_all(self): - assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag') - - def test_header_tags(self): - self.assert_select_multiple( - ('h1', ['header1']), - ('h2', ['header2', 'header3']), - ) - - def test_class_one(self): - for selector in ('.onep', 'p.onep', 'html p.onep'): - els = self.soup.select(selector) - assert len(els) == 1 - assert els[0].name == 'p' - assert els[0]['class'] == ['onep'] - - def test_class_mismatched_tag(self): - els = self.soup.select('div.onep') - assert len(els) == 0 - - def test_one_id(self): - for selector in ('div#inner', '#inner', 'div div#inner'): - self.assert_selects(selector, ['inner']) - - def test_bad_id(self): - els = self.soup.select('#doesnotexist') - assert len(els) == 0 - - def test_items_in_id(self): - els = self.soup.select('div#inner p') - assert len(els) == 3 - for el in els: - assert el.name == 'p' - assert els[1]['class'] == ['onep'] - assert not els[0].has_attr('class') - - def test_a_bunch_of_emptys(self): - for selector in ('div#main del', 'div#main div.oops', 'div div#main'): - assert len(self.soup.select(selector)) == 0 - - def test_multi_class_support(self): - for selector in ('.class1', 'p.class1', '.class2', 'p.class2', - '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): - self.assert_selects(selector, ['pmulti']) - - def test_multi_class_selection(self): - for selector in ('.class1.class3', '.class3.class2', - '.class1.class2.class3'): - self.assert_selects(selector, ['pmulti']) - - def test_child_selector(self): - self.assert_selects('.s1 > a', ['s1a1', 's1a2']) - self.assert_selects('.s1 > a span', ['s1a2s1']) - - def test_child_selector_id(self): - self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1']) - - def test_attribute_equals(self): - self.assert_select_multiple( - ('p[class="onep"]', ['p1']), - ('p[id="p1"]', ['p1']), - ('[class="onep"]', ['p1']), - ('[id="p1"]', ['p1']), - ('link[rel="stylesheet"]', ['l1']), - ('link[type="text/css"]', ['l1']), - ('link[href="blah.css"]', ['l1']), - ('link[href="no-blah.css"]', []), - ('[rel="stylesheet"]', ['l1']), - ('[type="text/css"]', ['l1']), - ('[href="blah.css"]', ['l1']), - ('[href="no-blah.css"]', []), - ('p[href="no-blah.css"]', []), - ('[href="no-blah.css"]', []), - ) - - def test_attribute_tilde(self): - self.assert_select_multiple( - ('p[class~="class1"]', ['pmulti']), - ('p[class~="class2"]', ['pmulti']), - ('p[class~="class3"]', ['pmulti']), - ('[class~="class1"]', ['pmulti']), - ('[class~="class2"]', ['pmulti']), - ('[class~="class3"]', ['pmulti']), - ('a[rel~="friend"]', ['bob']), - ('a[rel~="met"]', ['bob']), - ('[rel~="friend"]', ['bob']), - ('[rel~="met"]', ['bob']), - ) - - def test_attribute_startswith(self): - self.assert_select_multiple( - ('[rel^="style"]', ['l1']), - ('link[rel^="style"]', ['l1']), - ('notlink[rel^="notstyle"]', []), - ('[rel^="notstyle"]', []), - ('link[rel^="notstyle"]', []), - ('link[href^="bla"]', ['l1']), - ('a[href^="http://"]', ['bob', 'me']), - ('[href^="http://"]', ['bob', 'me']), - ('[id^="p"]', ['pmulti', 'p1']), - ('[id^="m"]', ['me', 'main']), - ('div[id^="m"]', ['main']), - ('a[id^="m"]', ['me']), - ('div[data-tag^="dashed"]', ['data1']) - ) - - def test_attribute_endswith(self): - self.assert_select_multiple( - ('[href$=".css"]', ['l1']), - ('link[href$=".css"]', ['l1']), - ('link[id$="1"]', ['l1']), - ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']), - ('div[id$="1"]', ['data1']), - ('[id$="noending"]', []), - ) - 
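A compact sketch of the attribute-selector forms exercised above, run
against the same test document (illustrative only):

    soup.select_one('link[href$=".css"]')['id']   # 'l1'
    soup.select_one('a[href^="http://"]')['id']   # 'bob'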
- def test_attribute_contains(self): - self.assert_select_multiple( - # From test_attribute_startswith - ('[rel*="style"]', ['l1']), - ('link[rel*="style"]', ['l1']), - ('notlink[rel*="notstyle"]', []), - ('[rel*="notstyle"]', []), - ('link[rel*="notstyle"]', []), - ('link[href*="bla"]', ['l1']), - ('[href*="http://"]', ['bob', 'me']), - ('[id*="p"]', ['pmulti', 'p1']), - ('div[id*="m"]', ['main']), - ('a[id*="m"]', ['me']), - # From test_attribute_endswith - ('[href*=".css"]', ['l1']), - ('link[href*=".css"]', ['l1']), - ('link[id*="1"]', ['l1']), - ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']), - ('div[id*="1"]', ['data1']), - ('[id*="noending"]', []), - # New for this test - ('[href*="."]', ['bob', 'me', 'l1']), - ('a[href*="."]', ['bob', 'me']), - ('link[href*="."]', ['l1']), - ('div[id*="n"]', ['main', 'inner']), - ('div[id*="nn"]', ['inner']), - ('div[data-tag*="edval"]', ['data1']) - ) - - def test_attribute_exact_or_hypen(self): - self.assert_select_multiple( - ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), - ('p[lang|="fr"]', ['lang-fr']), - ('p[lang|="gb"]', []), - ) - - def test_attribute_exists(self): - self.assert_select_multiple( - ('[rel]', ['l1', 'bob', 'me']), - ('link[rel]', ['l1']), - ('a[rel]', ['bob', 'me']), - ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), - ('p[class]', ['p1', 'pmulti']), - ('[blah]', []), - ('p[blah]', []), - ('div[data-tag]', ['data1']) - ) - - def test_quoted_space_in_selector_name(self): - html = """
<div style="display: wrong">nope</div>
-<div style="display: right">yes</div>
- """ - soup = BeautifulSoup(html, 'html.parser') - [chosen] = soup.select('div[style="display: right"]') - assert "yes" == chosen.string - - def test_unsupported_pseudoclass(self): - with pytest.raises(NotImplementedError): - self.soup.select("a:no-such-pseudoclass") - - with pytest.raises(SelectorSyntaxError): - self.soup.select("a:nth-of-type(a)") - - def test_nth_of_type(self): - # Try to select first paragraph - els = self.soup.select('div#inner p:nth-of-type(1)') - assert len(els) == 1 - assert els[0].string == 'Some text' - - # Try to select third paragraph - els = self.soup.select('div#inner p:nth-of-type(3)') - assert len(els) == 1 - assert els[0].string == 'Another' - - # Try to select (non-existent!) fourth paragraph - els = self.soup.select('div#inner p:nth-of-type(4)') - assert len(els) == 0 - - # Zero will select no tags. - els = self.soup.select('div p:nth-of-type(0)') - assert len(els) == 0 - - def test_nth_of_type_direct_descendant(self): - els = self.soup.select('div#inner > p:nth-of-type(1)') - assert len(els) == 1 - assert els[0].string == 'Some text' - - def test_id_child_selector_nth_of_type(self): - self.assert_selects('#inner > p:nth-of-type(2)', ['p1']) - - def test_select_on_element(self): - # Other tests operate on the tree; this operates on an element - # within the tree. - inner = self.soup.find("div", id="main") - selected = inner.select("div") - # The
<div id="inner"> tag was selected. The