diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index fcc27457..2a436d34 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -1,6 +1,5 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" +"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". + http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup uses a pluggable XML or HTML parser to parse a @@ -8,29 +7,34 @@ Beautiful Soup uses a pluggable XML or HTML parser to parse a provides methods and Pythonic idioms that make it easy to navigate, search, and modify the parse tree. -Beautiful Soup works with Python 2.7 and up. It works better if lxml +Beautiful Soup works with Python 3.5 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/bs4/doc/ - +documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.8.1" -__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +__version__ = "4.10.0" +__copyright__ = "Copyright (c) 2004-2021 Leonard Richardson" # Use of this source code is governed by the MIT license. __license__ = "MIT" __all__ = ['BeautifulSoup'] + +from collections import Counter import os import re import sys import traceback import warnings +# The very first thing we do is give a useful error if someone is +# running this code under Python 2. +if sys.version_info.major < 3: + raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') + from .builder import builder_registry, ParserRejectedMarkup from .dammit import UnicodeDammit from .element import ( @@ -42,28 +46,49 @@ from .element import ( NavigableString, PageElement, ProcessingInstruction, + PYTHON_SPECIFIC_ENCODINGS, ResultSet, + Script, + Stylesheet, SoupStrainer, Tag, + TemplateString, ) -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +# Define some custom warnings. +class GuessedAtParserWarning(UserWarning): + """The warning issued when BeautifulSoup has to guess what parser to + use -- probably because no parser was specified in the constructor. + """ + +class MarkupResemblesLocatorWarning(UserWarning): + """The warning issued when BeautifulSoup is given 'markup' that + actually looks like a resource locator -- a URL or a path to a file + on disk. + """ + class BeautifulSoup(Tag): - """ - This class defines the basic interface called by the tree builders. + """A data structure representing a parsed HTML or XML document. - These methods will be called by the parser: - reset() - feed(markup) + Most of the methods you'll call on a BeautifulSoup object are inherited from + PageElement or Tag. + + Internally, this class defines the basic interface called by the + tree builders when converting an HTML/XML document into a data + structure. The interface abstracts away the differences between + parsers. To write a new tree builder, you'll need to understand + these methods as a whole. 
+ + These methods will be called by the BeautifulSoup constructor: + * reset() + * feed(markup) The tree builder may call these methods from its feed() implementation: - handle_starttag(name, attrs) # See note about return value - handle_endtag(name) - handle_data(data) # Appends to the current data node - endData(containerClass) # Ends the current data node + * handle_starttag(name, attrs) # See note about return value + * handle_endtag(name) + * handle_data(data) # Appends to the current data node + * endData(containerClass) # Ends the current data node No matter how complicated the underlying parser is, you should be able to build a tree using 'start tag' events, 'end tag' events, @@ -73,68 +98,75 @@ class BeautifulSoup(Tag): like HTML's
tag), call handle_starttag and then handle_endtag. """ + + # Since BeautifulSoup subclasses Tag, it's possible to treat it as + # a Tag with a .name. This name makes it clear the BeautifulSoup + # object isn't a real markup tag. ROOT_TAG_NAME = '[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. DEFAULT_BUILDER_FEATURES = ['html', 'fast'] - + + # A string containing all ASCII whitespace characters, used in + # endData() to detect data chunks that seem 'empty'. ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - + def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, element_classes=None, **kwargs): """Constructor. :param markup: A string or a file-like object representing - markup to be parsed. + markup to be parsed. - :param features: Desirable features of the parser to be used. This - may be the name of a specific parser ("lxml", "lxml-xml", - "html.parser", or "html5lib") or it may be the type of markup - to be used ("html", "html5", "xml"). It's recommended that you - name a specific parser, so that Beautiful Soup gives you the - same results across platforms and virtual environments. + :param features: Desirable features of the parser to be + used. This may be the name of a specific parser ("lxml", + "lxml-xml", "html.parser", or "html5lib") or it may be the + type of markup to be used ("html", "html5", "xml"). It's + recommended that you name a specific parser, so that + Beautiful Soup gives you the same results across platforms + and virtual environments. :param builder: A TreeBuilder subclass to instantiate (or - instance to use) instead of looking one up based on - `features`. You only need to use this if you've implemented a - custom TreeBuilder. + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. :param parse_only: A SoupStrainer. Only parts of the document - matching the SoupStrainer will be considered. This is useful - when parsing part of a document that would otherwise be too - large to fit into memory. + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. :param from_encoding: A string indicating the encoding of the - document to be parsed. Pass this in if Beautiful Soup is - guessing wrongly about the document's encoding. + document to be parsed. Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. :param exclude_encodings: A list of strings indicating - encodings known to be wrong. Pass this in if you don't know - the document's encoding but you know Beautiful Soup's guess is - wrong. + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. 
:param element_classes: A dictionary mapping BeautifulSoup - classes like Tag and NavigableString to other classes you'd - like to be instantiated instead as the parse tree is - built. This is useful for using subclasses to modify the - default behavior of Tag or NavigableString. + classes like Tag and NavigableString, to other classes you'd + like to be instantiated instead as the parse tree is + built. This is useful for subclassing Tag or NavigableString + to modify default behavior. :param kwargs: For backwards compatibility purposes, the - constructor accepts certain keyword arguments used in - Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4; they will result in a warning and then be ignored. - - Apart from this, any keyword arguments passed into the BeautifulSoup - constructor are propagated to the TreeBuilder constructor. This - makes it possible to configure a TreeBuilder beyond saying - which one to use. - + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be + ignored. + + Apart from this, any keyword arguments passed into the + BeautifulSoup constructor are propagated to the TreeBuilder + constructor. This makes it possible to configure a + TreeBuilder by passing in arguments, not just by saying which + one to use. """ - if 'convertEntities' in kwargs: del kwargs['convertEntities'] warnings.warn( @@ -223,7 +255,9 @@ class BeautifulSoup(Tag): if not original_builder and not ( original_features == builder.NAME or original_features in builder.ALTERNATE_NAMES - ): + ) and markup: + # The user did not tell us which TreeBuilder to use, + # and we had to guess. Issue a warning. if builder.is_xml: markup_type = "XML" else: @@ -257,7 +291,10 @@ class BeautifulSoup(Tag): parser=builder.NAME, markup_type=markup_type ) - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + warnings.warn( + self.NO_PARSER_SPECIFIED_WARNING % values, + GuessedAtParserWarning, stacklevel=2 + ) else: if kwargs: warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") @@ -286,20 +323,32 @@ class BeautifulSoup(Tag): else: possible_filename = markup is_file = False + is_directory = False try: is_file = os.path.exists(possible_filename) + if is_file: + is_directory = os.path.isdir(possible_filename) except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass - if is_file: - if isinstance(markup, str): - markup = markup.encode("utf8") + if is_directory: + warnings.warn( + '"%s" looks like a directory name, not markup. You may' + ' want to open a file found in this directory and pass' + ' the filehandle into Beautiful Soup.' % ( + self._decode_markup(markup) + ), + MarkupResemblesLocatorWarning + ) + elif is_file: warnings.warn( '"%s" looks like a filename, not markup. You should' ' probably open this file and pass the filehandle into' - ' Beautiful Soup.' % markup) + ' Beautiful Soup.' 
% self._decode_markup(markup), + MarkupResemblesLocatorWarning + ) self._check_markup_is_url(markup) rejections = [] @@ -329,6 +378,7 @@ class BeautifulSoup(Tag): self.builder.soup = None def __copy__(self): + """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" copy = type(self)( self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' ) @@ -347,11 +397,25 @@ class BeautifulSoup(Tag): d['builder'] = None return d - @staticmethod - def _check_markup_is_url(markup): - """ - Check if markup looks like it's actually a url and raise a warning - if so. Markup can be unicode or str (py2) / bytes (py3). + @classmethod + def _decode_markup(cls, markup): + """Ensure `markup` is bytes so it's safe to send into warnings.warn. + + TODO: warnings.warn had this problem back in 2010 but it might not + anymore. + """ + if isinstance(markup, bytes): + decoded = markup.decode('utf-8', 'replace') + else: + decoded = markup + return decoded + + @classmethod + def _check_markup_is_url(cls, markup): + """Error-handling method to raise a warning if incoming markup looks + like a URL. + + :param markup: A string. """ if isinstance(markup, bytes): space = b' ' @@ -364,18 +428,20 @@ class BeautifulSoup(Tag): if any(markup.startswith(prefix) for prefix in cant_start_with): if not space in markup: - if isinstance(markup, bytes): - decoded_markup = markup.decode('utf-8', 'replace') - else: - decoded_markup = markup warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an' ' HTTP client. You should probably use an HTTP client like' ' requests to get the document behind the URL, and feed' - ' that document to Beautiful Soup.' % decoded_markup + ' that document to Beautiful Soup.' % cls._decode_markup( + markup + ), + MarkupResemblesLocatorWarning ) def _feed(self): + """Internal method that parses previously set markup, creating a large + number of Tag and NavigableString objects. + """ # Convert the document to Unicode. self.builder.reset() @@ -386,66 +452,110 @@ class BeautifulSoup(Tag): self.popTag() def reset(self): + """Reset this object to a state as though it had never parsed any + markup. + """ Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) self.hidden = 1 self.builder.reset() self.current_data = [] self.currentTag = None self.tagStack = [] + self.open_tag_counter = Counter() self.preserve_whitespace_tag_stack = [] + self.string_container_stack = [] self.pushTag(self) def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, sourceline=None, sourcepos=None, **kwattrs): - """Create a new tag associated with this soup.""" + """Create a new Tag associated with this BeautifulSoup object. + + :param name: The name of the new Tag. + :param namespace: The URI of the new Tag's XML namespace, if any. + :param prefix: The prefix for the new Tag's XML namespace, if any. + :param attrs: A dictionary of this Tag's attribute values; can + be used instead of `kwattrs` for attributes like 'class' + that are reserved words in Python. + :param sourceline: The line number where this tag was + (purportedly) found in its source document. + :param sourcepos: The character position within `sourceline` where this + tag was (purportedly) found. + :param kwattrs: Keyword arguments for the new Tag's attribute values. 
+ + """ kwattrs.update(attrs) return self.element_classes.get(Tag, Tag)( None, self.builder, name, namespace, nsprefix, kwattrs, sourceline=sourceline, sourcepos=sourcepos ) - def new_string(self, s, subclass=None): - """Create a new NavigableString associated with this soup.""" - subclass = subclass or self.element_classes.get( - NavigableString, NavigableString + def string_container(self, base_class=None): + container = base_class or NavigableString + + # There may be a general override of NavigableString. + container = self.element_classes.get( + container, container ) - return subclass(s) - def insert_before(self, successor): + # On top of that, we may be inside a tag that needs a special + # container class. + if self.string_container_stack and container is NavigableString: + container = self.builder.string_containers.get( + self.string_container_stack[-1].name, container + ) + return container + + def new_string(self, s, subclass=None): + """Create a new NavigableString associated with this BeautifulSoup + object. + """ + container = self.string_container(subclass) + return container(s) + + def insert_before(self, *args): + """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement + it because there is nothing before or after it in the parse tree. + """ raise NotImplementedError("BeautifulSoup objects don't support insert_before().") - def insert_after(self, successor): + def insert_after(self, *args): + """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement + it because there is nothing before or after it in the parse tree. + """ raise NotImplementedError("BeautifulSoup objects don't support insert_after().") def popTag(self): + """Internal method called by _popToTag when a tag is closed.""" tag = self.tagStack.pop() + if tag.name in self.open_tag_counter: + self.open_tag_counter[tag.name] -= 1 if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: self.preserve_whitespace_tag_stack.pop() - #print "Pop", tag.name + if self.string_container_stack and tag == self.string_container_stack[-1]: + self.string_container_stack.pop() + #print("Pop", tag.name) if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): - #print "Push", tag.name + """Internal method called by handle_starttag when a tag is opened.""" + #print("Push", tag.name) if self.currentTag is not None: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] + if tag.name != self.ROOT_TAG_NAME: + self.open_tag_counter[tag.name] += 1 if tag.name in self.builder.preserve_whitespace_tags: self.preserve_whitespace_tag_stack.append(tag) + if tag.name in self.builder.string_containers: + self.string_container_stack.append(tag) def endData(self, containerClass=None): - - # Default container is NavigableString. - containerClass = containerClass or NavigableString - - # The user may want us to instantiate some alias for the - # container class. - containerClass = self.element_classes.get( - containerClass, containerClass - ) - + """Method called by the TreeBuilder when the end of a data segment + occurs. 
+        """
         if self.current_data:
             current_data = ''.join(self.current_data)
             # If whitespace is not preserved, and this string contains
@@ -472,11 +582,12 @@ class BeautifulSoup(Tag):
                         not self.parse_only.search(current_data)):
                 return

+        containerClass = self.string_container(containerClass)
         o = containerClass(current_data)
         self.object_was_parsed(o)

     def object_was_parsed(self, o, parent=None, most_recent_element=None):
-        """Add an object to the parse tree."""
+        """Method called by the TreeBuilder to integrate an object into the parse tree."""
         if parent is None:
             parent = self.currentTag
         if most_recent_element is not None:
@@ -545,10 +656,19 @@ class BeautifulSoup(Tag):

     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
         """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
+        instance of the given tag.
+
+        If there are no open tags with the given name, nothing will be
+        popped.
+
+        :param name: Pop up to the most recent tag with this name.
+        :param nsprefix: The namespace prefix that goes with `name`.
+        :param inclusivePop: If this is false, pops the tag stack up
+            to but *not* including the most recent instance of the
+            given tag.
+
+        """
+        #print("Popping to %s" % name)
         if name == self.ROOT_TAG_NAME:
             # The BeautifulSoup object itself can never be popped.
             return
@@ -557,6 +677,8 @@ class BeautifulSoup(Tag):
         stack_size = len(self.tagStack)
         for i in range(stack_size - 1, 0, -1):
+            if not self.open_tag_counter.get(name):
+                break
             t = self.tagStack[i]
             if (name == t.name and nsprefix == t.prefix):
                 if inclusivePop:
@@ -568,15 +690,22 @@ class BeautifulSoup(Tag):

     def handle_starttag(self, name, namespace, nsprefix, attrs,
                         sourceline=None, sourcepos=None):
-        """Push a start tag on to the stack.
+        """Called by the tree builder when a new tag is encountered.

-        If this method returns None, the tag was rejected by the
+        :param name: Name of the tag.
+        :param namespace: The URI of the tag's XML namespace, if any.
+        :param nsprefix: Namespace prefix for the tag.
+        :param attrs: A dictionary of attribute values.
+        :param sourceline: The line number where this tag was found in its
+            source document.
+        :param sourcepos: The character position within `sourceline` where this
+            tag was found.
+
+        If this method returns None, the tag was rejected by an active
         SoupStrainer. You should proceed as if the tag had not occurred
         in the document. For instance, if this was a self-closing tag,
         don't call handle_endtag.
         """
-
-        # print "Start tag %s: %s" % (name, attrs)
+        # print("Start tag %s: %s" % (name, attrs))
         self.endData()

         if (self.parse_only and len(self.tagStack) <= 1
@@ -598,22 +727,38 @@ class BeautifulSoup(Tag):
         return tag

     def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
+        """Called by the tree builder when an ending tag is encountered.
+
+        :param name: Name of the tag.
+        :param nsprefix: Namespace prefix for the tag.
+        """
+        #print("End tag: " + name)
         self.endData()
         self._popToTag(name, nsprefix)

     def handle_data(self, data):
+        """Called by the tree builder when a chunk of textual data is encountered."""
         self.current_data.append(data)
-
     def decode(self, pretty_print=False,
                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
+        """Returns a string or Unicode representation of the parse tree
+        as an HTML or XML document.
+        :param pretty_print: If this is True, indentation will be used to
+           make the document more readable.
+        :param eventual_encoding: The encoding of the final document.
+           If this is None, the document will be a Unicode string.
+        """
         if self.is_xml:
             # Print the XML declaration
             encoding_part = ''
+            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
+                # This is a special Python encoding; it can't actually
+                # go into an XML document because it means nothing
+                # outside of Python.
+                eventual_encoding = None
             if eventual_encoding != None:
                 encoding_part = ' encoding="%s"' % eventual_encoding
             prefix = '<?xml version="1.0"%s?>\n' % encoding_part
@@ -626,7 +771,7 @@ class BeautifulSoup(Tag):
         return prefix + super(BeautifulSoup, self).decode(
             indent_level, eventual_encoding, formatter)

-# Alias to make it easier to type import: 'from bs4 import _soup'
+# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup

@@ -642,14 +787,18 @@ class BeautifulStoneSoup(BeautifulSoup):

 class StopParsing(Exception):
+    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
     pass

 class FeatureNotFound(ValueError):
+    """Exception raised by the BeautifulSoup constructor if no parser with the
+    requested features is found.
+    """
     pass

-#By default, act as an HTML pretty-printer.
+#If this file is run as a script, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print(soup.prettify())
+    print((soup.prettify()))
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index 03a4c1e0..bd44905e 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -7,8 +7,11 @@ import sys
 from bs4.element import (
     CharsetMetaAttributeValue,
     ContentMetaAttributeValue,
+    Stylesheet,
+    Script,
+    TemplateString,
     nonwhitespace_re
-    )
+)

 __all__ = [
     'HTMLTreeBuilder',
@@ -27,18 +30,33 @@ HTML_5 = 'html5'

 class TreeBuilderRegistry(object):
-
+    """A way of looking up TreeBuilder subclasses by their name or by desired
+    features.
+    """
+
     def __init__(self):
         self.builders_for_feature = defaultdict(list)
         self.builders = []

     def register(self, treebuilder_class):
-        """Register a treebuilder based on its advertised features."""
+        """Register a treebuilder based on its advertised features.
+
+        :param treebuilder_class: A subclass of TreeBuilder. Its .features
+           attribute should list its features.
+        """
         for feature in treebuilder_class.features:
             self.builders_for_feature[feature].insert(0, treebuilder_class)
         self.builders.insert(0, treebuilder_class)

     def lookup(self, *features):
+        """Look up a TreeBuilder subclass with the desired features.
+
+        :param features: A list of features to look for. If none are
+            provided, the most recently registered TreeBuilder subclass
+            will be used.
+        :return: A TreeBuilder subclass, or None if there's no
+            registered subclass with all the requested features.
+        """
         if len(self.builders) == 0:
             # There are no builders at all.
             return None
@@ -81,7 +99,7 @@ class TreeBuilderRegistry(object):
 builder_registry = TreeBuilderRegistry()

 class TreeBuilder(object):
-    """Turn a document into a Beautiful Soup object tree."""
+    """Turn a textual document into a Beautiful Soup object tree."""

     NAME = "[Unknown tree builder]"
     ALTERNATE_NAMES = []
@@ -96,7 +114,12 @@ class TreeBuilder(object):
     # comma-separated list of CDATA, rather than a single CDATA.
     DEFAULT_CDATA_LIST_ATTRIBUTES = {}

+    # Whitespace should be preserved inside these tags.
DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + # The textual contents of tags with these names should be + # instantiated with some class other than NavigableString. + DEFAULT_STRING_CONTAINERS = {} USE_DEFAULT = object() @@ -105,30 +128,39 @@ class TreeBuilder(object): def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT): + store_line_numbers=USE_DEFAULT, + string_containers=USE_DEFAULT, + ): """Constructor. :param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this do a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this to a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. :param preserve_whitespace_tags: A list of tags to treat - the way
<pre> tags are treated in HTML. Tags in this list
-        will have 
+         the way <pre> tags are treated in HTML. Tags in this list
+         are immune from pretty-printing; their contents will always be
+         output as-is.
+
+        :param string_containers: A dictionary mapping tag names to
+        the classes that should be instantiated to contain the textual
+        contents of those tags. The default is to use NavigableString
+        for every tag, no matter what the name. You can override the
+        default by changing DEFAULT_STRING_CONTAINERS.
 
         :param store_line_numbers: If the parser keeps track of the
-        line numbers and positions of the original markup, that
-        information will, by default, be stored in each corresponding
-        `Tag` object. You can turn this off by passing
-        store_line_numbers=False. If the parser you're using doesn't 
-        keep track of this information, then setting store_line_numbers=True
-        will do nothing.
+         line numbers and positions of the original markup, that
+         information will, by default, be stored in each corresponding
+         `Tag` object. You can turn this off by passing
+         store_line_numbers=False. If the parser you're using doesn't 
+         keep track of this information, then setting store_line_numbers=True
+         will do nothing.
         """
         self.soup = None
         if multi_valued_attributes is self.USE_DEFAULT:
@@ -139,15 +171,25 @@ class TreeBuilder(object):
         self.preserve_whitespace_tags = preserve_whitespace_tags
         if store_line_numbers == self.USE_DEFAULT:
             store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers
+        self.store_line_numbers = store_line_numbers 
+        if string_containers == self.USE_DEFAULT:
+            string_containers = self.DEFAULT_STRING_CONTAINERS
+        self.string_containers = string_containers
         
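A quick illustration of how these options are reached in practice: most users never instantiate a TreeBuilder directly, because the BeautifulSoup constructor forwards unrecognized keyword arguments to it (see the constructor docstring earlier in this patch). This is a sketch, not part of the patch; the `<listing>` tag and markup strings are made up for the example, and it assumes the html.parser builder, which accepts these arguments:

```python
from bs4 import BeautifulSoup

# multi_valued_attributes=None: 'class' stays a single string instead
# of being split into a list of CDATA values.
soup = BeautifulSoup('<a class="foo bar"></a>', 'html.parser',
                     multi_valued_attributes=None)
assert soup.a['class'] == 'foo bar'

# preserve_whitespace_tags: treat <listing> the way <pre> is treated,
# i.e. exempt its contents from pretty-printing.
soup = BeautifulSoup('<listing>  keep  this  </listing>', 'html.parser',
                     preserve_whitespace_tags={'pre', 'textarea', 'listing'})
```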
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
+
+        :param soup: A BeautifulSoup object.
         """
         self.soup = soup
         
     def reset(self):
+        """Do any work necessary to reset the underlying parser
+        for a new document.
+
+        By default, this does nothing.
+        """
         pass
 
     def can_be_empty_element(self, tag_name):
@@ -159,23 +201,57 @@ class TreeBuilder(object):
         For instance: an HTMLBuilder does not consider a <p> tag to be
         an empty-element tag (it's not in
         HTMLBuilder.empty_element_tags). This means an empty <p> tag
-        will be presented as "<p></p>", not "<p/>".
+        will be presented as "<p></p>", not "<p/>" or "<p>".

         The default implementation has no opinion about which tags are
         empty-element tags, so a tag will be presented as an
-        empty-element tag if and only if it has no contents.
-        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
+        empty-element tag if and only if it has no children.
+        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
         be left alone.
+
+        :param tag_name: The name of a markup tag.
         """
         if self.empty_element_tags is None:
             return True
         return tag_name in self.empty_element_tags

     def feed(self, markup):
+        """Run some incoming markup through some parsing process,
+        populating the `BeautifulSoup` object in self.soup.
+
+        This method is not implemented in TreeBuilder; it must be
+        implemented in subclasses.
+
+        :return: None.
+        """
         raise NotImplementedError()

     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
+        """Run any preliminary steps necessary to make incoming markup
+        acceptable to the parser.
+
+        :param markup: Some markup -- probably a bytestring.
+        :param user_specified_encoding: The user asked to try this encoding.
+        :param document_declared_encoding: The markup itself claims to be
+            in this encoding. NOTE: This argument is not used by the
+            calling code and can probably be removed.
+        :param exclude_encodings: The user asked _not_ to try any of
+            these encodings.
+
+        :yield: A series of 4-tuples:
+            (markup, encoding, declared encoding,
+             has undergone character replacement)
+
+        Each 4-tuple represents a strategy for converting the
+        document to Unicode and parsing it. Each strategy will be tried
+        in turn.
+
+        By default, the only strategy is to parse the markup
+        as-is. See `LXMLTreeBuilderForXML` and
+        `HTMLParserTreeBuilder` for implementations that take into
+        account the quirks of particular parsers.
+        """
         yield markup, None, None, False

     def test_fragment_to_document(self, fragment):
@@ -188,16 +264,36 @@ class TreeBuilder(object):
         results against other HTML fragments.

         This method should not be used outside of tests.
+
+        :param fragment: A string -- fragment of HTML.
+        :return: A string -- a full HTML document.
         """
         return fragment

     def set_up_substitutions(self, tag):
+        """Set up any substitutions that will need to be performed on
+        a `Tag` when it's output as a string.
+
+        By default, this does nothing. See `HTMLTreeBuilder` for a
+        case where this is used.
+
+        :param tag: A `Tag`
+        :return: Whether or not a substitution was performed.
+        """
         return False

     def _replace_cdata_list_attribute_values(self, tag_name, attrs):
-        """Replaces class="foo bar" with class=["foo", "bar"]
+        """When an attribute value is associated with a tag that can
+        have multiple values for that attribute, convert the string
+        value to a list of strings.

-        Modifies its input in place.
+        Basically, replaces class="foo bar" with class=["foo", "bar"]
+
+        NOTE: This method modifies its input in place.
+
+        :param tag_name: The name of a tag.
+        :param attrs: A dictionary containing the tag's attributes.
+            Any appropriate attribute values will be modified in place.
         """
         if not attrs:
             return attrs
@@ -225,7 +321,11 @@ class TreeBuilder(object):
         return attrs

 class SAXTreeBuilder(TreeBuilder):
-    """A Beautiful Soup treebuilder that listens for SAX events."""
+    """A Beautiful Soup treebuilder that listens for SAX events.
+
+    This is not currently used for anything, but it demonstrates
+    how a simple TreeBuilder would work.
+ """ def feed(self, markup): raise NotImplementedError() @@ -235,11 +335,11 @@ class SAXTreeBuilder(TreeBuilder): def startElement(self, name, attrs): attrs = dict((key[1], value) for key, value in list(attrs.items())) - #print "Start %s, %r" % (name, attrs) + #print("Start %s, %r" % (name, attrs)) self.soup.handle_starttag(name, attrs) def endElement(self, name): - #print "End %s" % name + #print("End %s" % name) self.soup.handle_endtag(name) def startElementNS(self, nsTuple, nodeName, attrs): @@ -289,6 +389,22 @@ class HTMLTreeBuilder(TreeBuilder): # but it may do so eventually, and this information is available if # you need to use it. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines an unusual content model for these tags. + # We represent this by using a string class other than NavigableString + # inside these tags. + # + # I made this list by going through the HTML spec + # (https://html.spec.whatwg.org/#metadata-content) and looking for + # "metadata content" elements that can contain strings. + # + # TODO: Arguably
<noscript> could go here but it seems
+    # qualitatively different from the other tags.
+    DEFAULT_STRING_CONTAINERS = {
+        'style': Stylesheet,
+        'script': Script,
+        'template': TemplateString,
+    }

[... remainder of the lib/bs4/builder/__init__.py diff and the start of the lib/bs4/element.py diff lost in extraction ...]

+        """Should this tag be pretty-printed?
+
+        Most of them should, but some (such as <pre> in HTML
+        documents) should not.
+        """
         return (
             indent_level is not None
             and (
@@ -1196,6 +1729,15 @@ class Tag(PageElement):
         )
 
     def prettify(self, encoding=None, formatter="minimal"):
+        """Pretty-print this PageElement as a string.
+
+        :param encoding: The eventual encoding of the string. If this is None,
+            a Unicode string will be returned.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard formatters.
+        :return: A Unicode string (if encoding==None) or a bytestring 
+            (otherwise).
+        """
         if encoding is None:
             return self.decode(True, formatter=formatter)
         else:
@@ -1207,7 +1749,8 @@ class Tag(PageElement):
         """Renders the contents of this tag as a Unicode string.
 
         :param indent_level: Each line of the rendering will be
-           indented this many spaces.
+           indented this many spaces. Used internally in
+           recursive calls while pretty-printing.
 
         :param eventual_encoding: The tag is destined to be
            encoded into this encoding. decode_contents() is _not_
@@ -1249,23 +1792,26 @@ class Tag(PageElement):
     def encode_contents(
         self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
         formatter="minimal"):
-        """Renders the contents of this tag as a bytestring.
+        """Renders the contents of this PageElement as a bytestring.
 
         :param indent_level: Each line of the rendering will be
-           indented this many spaces.
+           indented this many spaces. Used internally in
+           recursive calls while pretty-printing.
 
         :param eventual_encoding: The bytestring will be in this encoding.
 
-        :param formatter: The output formatter responsible for converting
-           entities to Unicode characters.
-        """
+        :param formatter: A Formatter object, or a string naming one of
+            the standard Formatters.
 
+        :return: A bytestring.
+        """
         contents = self.decode_contents(indent_level, encoding, formatter)
         return contents.encode(encoding)
 
     # Old method for BS3 compatibility
     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                        prettyPrint=False, indentLevel=0):
+        """Deprecated method for BS3 compatibility."""
         if not prettyPrint:
             indentLevel = None
         return self.encode_contents(
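For orientation, this is roughly how the output methods documented in this hunk relate to one another. An illustrative sketch, not part of the patch; the markup string is made up:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Hello</p>', 'html.parser')

assert soup.decode() == '<p>Hello</p>'          # Unicode string
assert soup.encode('utf-8') == b'<p>Hello</p>'  # bytestring
print(soup.prettify())                          # indented Unicode string

# The *_contents variants render only what is inside the tag,
# without the tag itself.
assert soup.p.decode_contents() == 'Hello'
assert soup.p.encode_contents() == b'Hello'
```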
@@ -1275,27 +1821,47 @@ class Tag(PageElement):
 
     def find(self, name=None, attrs={}, recursive=True, text=None,
              **kwargs):
-        """Return only the first child of this Tag matching the given
-        criteria."""
+        """Look in the children of this PageElement and find the first
+        PageElement that matches the given criteria.
+
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
+
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param recursive: If this is True, find() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: The first matching PageElement, or None if there is no match.
+        :rtype: bs4.element.Tag | bs4.element.NavigableString
+        """
         r = None
         l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
         if l:
             r = l[0]
         return r
-    findChild = find
+    findChild = find #BS2
 
     def find_all(self, name=None, attrs={}, recursive=True, text=None,
                  limit=None, **kwargs):
-        """Extracts a list of Tag objects that match the given
-        criteria.  You can specify the name of the Tag and any
-        attributes you want the Tag to have.
+        """Look in the children of this PageElement and find all
+        PageElements that match the given criteria.
 
-        The value of a key-value pair in the 'attrs' map can be a
-        string, a list of strings, a regular expression object, or a
-        callable that takes a string and returns whether or not the
-        string matches for some custom definition of 'matches'. The
-        same is true of the tag name."""
+        All find_* methods take a common set of arguments. See the online
+        documentation for detailed explanations.
 
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param recursive: If this is True, find_all() will perform a
+            recursive search of this PageElement's children. Otherwise,
+            only the direct children will be considered.
+        :param limit: Stop looking after finding this many results.
+        :kwargs: A dictionary of filters on attribute values.
+        :return: A ResultSet of PageElements.
+        :rtype: bs4.element.ResultSet
+        """
         generator = self.descendants
         if not recursive:
             generator = self.children
@@ -1306,11 +1872,20 @@ class Tag(PageElement):
     #Generator methods
     @property
     def children(self):
+        """Iterate over all direct children of this PageElement.
+
+        :yield: A sequence of PageElements.
+        """
         # return iter() to make the purpose of the method clear
         return iter(self.contents)  # XXX This seems to be untested.
 
     @property
     def descendants(self):
+        """Iterate over all children of this PageElement in a
+        depth-first sequence (i.e. all descendants, in document order).
+
+        :yield: A sequence of PageElements.
+        """
         if not len(self.contents):
             return
         stopNode = self._last_descendant().next_element
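The practical difference between the two generators, per the corrected docstring above: `children` yields only direct children, while `descendants` walks the whole subtree in document order. A sketch (markup invented for the example):

```python
from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup('<div><p>one <b>two</b></p></div>', 'html.parser')

assert [child.name for child in soup.div.children] == ['p']

walk = [n.name if isinstance(n, Tag) else str(n)
        for n in soup.div.descendants]
assert walk == ['p', 'one ', 'b', 'two']   # document order
```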
@@ -1321,7 +1896,21 @@ class Tag(PageElement):
 
     # CSS selector code
     def select_one(self, selector, namespaces=None, **kwargs):
-        """Perform a CSS selection operation on the current element."""
+        """Perform a CSS selection operation on the current element.
+
+        :param selector: A CSS selector.
+
+        :param namespaces: A dictionary mapping namespace prefixes
+           used in the CSS selector to namespace URIs. By default,
+           Beautiful Soup will use the prefixes it encountered while
+           parsing the document.
+
+        :param kwargs: Keyword arguments to be passed into SoupSieve's 
+           soupsieve.select() method.
+
+        :return: The first matching Tag, or None if nothing matches.
+        :rtype: bs4.element.Tag
+        """
         value = self.select(selector, namespaces, 1, **kwargs)
         if value:
             return value[0]
@@ -1335,14 +1924,17 @@ class Tag(PageElement):
         :param selector: A string containing a CSS selector.
 
         :param namespaces: A dictionary mapping namespace prefixes
-        used in the CSS selector to namespace URIs. By default,
-        Beautiful Soup will use the prefixes it encountered while
-        parsing the document.
+           used in the CSS selector to namespace URIs. By default,
+           Beautiful Soup will use the prefixes it encountered while
+           parsing the document.
 
         :param limit: After finding this number of results, stop looking.
 
-        :param kwargs: Any extra arguments you'd like to pass in to
-        soupsieve.select().
+        :param kwargs: Keyword arguments to be passed into SoupSieve's 
+           soupsieve.select() method.
+
+        :return: A ResultSet of Tags.
+        :rtype: bs4.element.ResultSet
         """
         if namespaces is None:
             namespaces = self._namespaces
@@ -1354,19 +1946,27 @@ class Tag(PageElement):
                 "Cannot execute CSS selectors because the soupsieve package is not installed."
             )
             
-        return soupsieve.select(selector, self, namespaces, limit, **kwargs)
+        results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
+
+        # We do this because it's more consistent and because
+        # ResultSet.__getattr__ has a helpful error message.
+        return ResultSet(None, results)
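Callers see no API change from this wrapping; select() results simply gain ResultSet's behavior. For example (requires the soupsieve package, as checked above; markup invented):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<ul><li class="a">1</li><li class="a">2</li></ul>', 'html.parser'
)
assert soup.select_one('li.a').get_text() == '1'
assert [li.get_text() for li in soup.select('li.a')] == ['1', '2']
```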
 
     # Old names for backwards compatibility
     def childGenerator(self):
+        """Deprecated generator."""
         return self.children
 
     def recursiveChildGenerator(self):
+        """Deprecated generator."""
         return self.descendants
 
     def has_key(self, key):
-        """This was kind of misleading because has_key() (attributes)
-        was different from __in__ (contents). has_key() is gone in
-        Python 3, anyway."""
+        """Deprecated method. This was kind of misleading because has_key()
+        (attributes) was different from __contains__ (contents).
+
+        has_key() is gone in Python 3, anyway.
+        """
         warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
                 key))
         return self.has_attr(key)
@@ -1374,9 +1974,26 @@ class Tag(PageElement):
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer(object):
     """Encapsulates a number of ways of matching a markup element (tag or
-    text)."""
+    string).
+
+    This is primarily used to underpin the find_* methods, but you can
+    create one yourself and pass it in as `parse_only` to the
+    `BeautifulSoup` constructor, to parse a subset of a large
+    document.
+    """
 
     def __init__(self, name=None, attrs={}, text=None, **kwargs):
+        """Constructor.
+
+        The SoupStrainer constructor takes the same arguments passed
+        into the find_* methods. See the online documentation for
+        detailed explanations.
+
+        :param name: A filter on tag name.
+        :param attrs: A dictionary of filters on attribute values.
+        :param text: A filter for a NavigableString with specific text.
+        :kwargs: A dictionary of filters on attribute values.
+        """        
         self.name = self._normalize_search_value(name)
         if not isinstance(attrs, dict):
             # Treat a non-dict value for attrs as a search for the 'class'
@@ -1434,17 +2051,38 @@ class SoupStrainer(object):
         return str(str(value))
 
     def __str__(self):
+        """A human-readable representation of this SoupStrainer."""
         if self.text:
             return self.text
         else:
             return "%s|%s" % (self.name, self.attrs)
 
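A typical use of the `parse_only` pattern mentioned in the class docstring — only matching elements are kept while parsing. A sketch (markup invented; html.parser supports `parse_only`, html5lib does not):

```python
from bs4 import BeautifulSoup, SoupStrainer

only_links = SoupStrainer('a', href=True)
soup = BeautifulSoup(
    '<html><body><a href="/x">x</a><p>skipped</p></body></html>',
    'html.parser', parse_only=only_links
)
assert [a['href'] for a in soup.find_all('a')] == ['/x']
assert soup.find('p') is None   # never entered the tree
```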
     def search_tag(self, markup_name=None, markup_attrs={}):
+        """Check whether a Tag with the given name and attributes would
+        match this SoupStrainer.
+
+        Used prospectively to decide whether to even bother creating a Tag
+        object.
+
+        :param markup_name: A tag name as found in some markup.
+        :param markup_attrs: A dictionary of attributes as found in some markup.
+
+        :return: True if the prospective tag would match this SoupStrainer;
+            False otherwise.
+        """
         found = None
         markup = None
         if isinstance(markup_name, Tag):
             markup = markup_name
             markup_attrs = markup
+
+        if isinstance(self.name, str):
+            # Optimization for a very common case where the user is
+            # searching for a tag with one specific name, and we're
+            # looking at a tag with a different name.
+            if markup and not markup.prefix and self.name != markup.name:
+                return False
+            
         call_function_with_tag_data = (
             isinstance(self.name, Callable)
             and not isinstance(markup_name, Tag))
@@ -1478,10 +2116,19 @@ class SoupStrainer(object):
         if found and self.text and not self._matches(found.string, self.text):
             found = None
         return found
+
+    # For BS3 compatibility.
     searchTag = search_tag
 
     def search(self, markup):
-        # print 'looking for %s in %s' % (self, markup)
+        """Find all items in `markup` that match this SoupStrainer.
+
+        Used by the core _find_all() method, which is ultimately
+        called by all find_* methods.
+
+        :param markup: A PageElement or a list of them.
+        """
+        # print('looking for %s in %s' % (self, markup))
         found = None
         # If given a list of items, scan it for a text element that
         # matches.
@@ -1507,7 +2154,7 @@ class SoupStrainer(object):
         return found
 
     def _matches(self, markup, match_against, already_tried=None):
-        # print u"Matching %s against %s" % (markup, match_against)
+        # print(u"Matching %s against %s" % (markup, match_against))
         result = False
         if isinstance(markup, list) or isinstance(markup, tuple):
             # This should only happen when searching a multi-valued attribute
@@ -1593,10 +2240,16 @@ class ResultSet(list):
     """A ResultSet is just a list that keeps track of the SoupStrainer
     that created it."""
     def __init__(self, source, result=()):
+        """Constructor.
+
+        :param source: A SoupStrainer.
+        :param result: A list of PageElements.
+        """
         super(ResultSet, self).__init__(result)
         self.source = source
 
     def __getattr__(self, key):
+        """Raise a helpful exception to explain a common code fix."""
         raise AttributeError(
-            "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
+            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
         )
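The reworded message targets the most common misuse — treating the list returned by find_all() as if it were a single element. A short demonstration (markup invented):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/x">x</a>', 'html.parser')
links = soup.find_all('a')       # a ResultSet, i.e. a list

try:
    links.href                   # wrong: attribute access on the list
except AttributeError as e:
    print(e)                     # suggests find() instead of find_all()

assert soup.find('a')['href'] == '/x'
```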
diff --git a/lib/bs4/formatter.py b/lib/bs4/formatter.py
index 7dbaa385..3bd9f859 100644
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@@ -5,6 +5,28 @@ class Formatter(EntitySubstitution):
 
     Some parts of this strategy come from the distinction between
     HTML4, HTML5, and XML. Others are configurable by the user.
+
+    Formatters are passed in as the `formatter` argument to methods
+    like `PageElement.encode`. Most people won't need to think about
+    formatters, and most people who need to think about them can pass
+    in one of these predefined strings as `formatter` rather than
+    making a new Formatter object:
+
+    For HTML documents:
+     * 'html' - HTML entity substitution for generic HTML documents. (default)
+     * 'html5' - HTML entity substitution for HTML5 documents, as
+                 well as some optimizations in the way tags are rendered.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid HTML.
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
+
+    For XML documents:
+     * 'html' - Entity substitution for XHTML documents.
+     * 'minimal' - Only make the substitutions necessary to guarantee
+                   valid XML. (default)
+     * None - Do not perform any substitution. This will be faster
+              but may result in invalid markup.
     """
     # Registries of XML and HTML formatters.
     XML_FORMATTERS = {}
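In practice most callers pass one of the strings above straight to an output method; a Formatter object is only needed for custom behavior. A sketch of both styles — the custom-object call assumes the HTMLFormatter subclass defined later in this file, and no entity_substitution function is passed, so none is performed:

```python
from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter

soup = BeautifulSoup('<br/>R&amp;D', 'html.parser')

print(soup.decode(formatter='minimal'))  # <br/>R&amp;D
print(soup.decode(formatter=None))       # <br/>R&D  -- no substitution

# Custom Formatter: render void elements HTML-style, as <br>.
no_slash = HTMLFormatter(void_element_close_prefix='')
print(soup.decode(formatter=no_slash))   # <br>R&D
```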
@@ -27,11 +49,26 @@ class Formatter(EntitySubstitution):
     def __init__(
             self, language=None, entity_substitution=None,
             void_element_close_prefix='/', cdata_containing_tags=None,
+            empty_attributes_are_booleans=False,
     ):
-        """
+        """Constructor.
 
-        :param void_element_close_prefix: By default, represent void
-        elements as <br/> rather than <br>
+        :param language: This should be Formatter.XML if you are formatting
+           XML markup and Formatter.HTML if you are formatting HTML markup.
+
+        :param entity_substitution: A function to call to replace special
+           characters with XML/HTML entities. For examples, see 
+           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
+        :param void_element_close_prefix: By default, void elements
+           are represented as <br/> (XML rules) rather than <br>
+           (HTML rules). To get <br>, pass in the empty string.
+        :param cdata_containing_tags: The list of tags that are defined
+           as containing CDATA in this dialect. For example, in HTML,
+           "<script>" and "<style>" tags are defined as containing
+           CDATA, and their contents should not be formatted.

[... remainder of the lib/bs4/formatter.py diff and the start of the lib/bs4/testing.py diff lost in extraction ...]

+    def test_special_string_containers(self):
+        soup = self.soup(
+            "<style>Some CSS</style><script>Some Javascript</script>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        assert isinstance(soup.script.string, Script)
+
+        soup = self.soup(
+            "<style><!--Some CSS--></style>"
+        )
+        assert isinstance(soup.style.string, Stylesheet)
+        # The contents of the style tag resemble an HTML comment, but
+        # it's not treated as a comment.
+        self.assertEqual("<!--Some CSS-->", soup.style.string)
+        assert isinstance(soup.style.string, Stylesheet)
+        
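This test exercises the user-visible effect of the string_containers machinery added in builder/__init__.py: strings inside `<style>` and `<script>` come back as Stylesheet and Script objects. Both subclass NavigableString, so existing string-handling code keeps working. A usage sketch (markup invented, html.parser builder assumed):

```python
from bs4 import BeautifulSoup
from bs4.element import Script, Stylesheet

soup = BeautifulSoup(
    '<style>p { margin: 0 }</style><script>alert(1)</script>',
    'html.parser'
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
assert soup.script.string == 'alert(1)'   # still compares like a str
```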
     def test_pickle_and_unpickle_identity(self):
         # Pickling a tree, then unpickling it, yields a tree identical
         # to the original.
@@ -250,18 +318,21 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual(doctype.__class__, Doctype)
         self.assertEqual(doctype, doctype_fragment)
-        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
+        self.assertEqual(
+            soup.encode("utf8")[:len(doctype_str)],
+            doctype_str
+        )
 
         # Make sure that the doctype was correctly associated with the
         # parse tree and that the rest of the document parsed.
         self.assertEqual(soup.p.contents[0], 'foo')
 
-    def _document_with_doctype(self, doctype_fragment):
+    def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
         """Generate and parse a document with the given doctype."""
-        doctype = '<!DOCTYPE %s>' % doctype_fragment
+        doctype = '<!%s %s>' % (doctype_string, doctype_fragment)
         markup = doctype + '\n<p>foo</p>'
         soup = self.soup(markup)
-        return doctype, soup
+        return doctype.encode("utf8"), soup

     def test_normal_doctypes(self):
         """Make sure normal, everyday HTML doctypes are handled correctly."""
@@ -274,6 +345,27 @@ class HTMLTreeBuilderSmokeTest(object):
         doctype = soup.contents[0]
         self.assertEqual("", doctype.strip())

+    def test_mixed_case_doctype(self):
+        # A lowercase or mixed-case doctype becomes a Doctype.
+        for doctype_fragment in ("doctype", "DocType"):
+            doctype_str, soup = self._document_with_doctype(
+                "html", doctype_fragment
+            )
+
+            # Make sure a Doctype object was created and that the DOCTYPE
+            # is uppercase.
+            doctype = soup.contents[0]
+            self.assertEqual(doctype.__class__, Doctype)
+            self.assertEqual(doctype, "html")
+            self.assertEqual(
+                soup.encode("utf8")[:len(doctype_str)],
+                b"<!DOCTYPE html>"
+            )
+
+            # Make sure that the doctype was correctly associated with the
+            # parse tree and that the rest of the document parsed.
+            self.assertEqual(soup.p.contents[0], 'foo')
+
     def test_public_doctype_with_url(self):
         doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
         self.assertDoctypeHandled(doctype)
@@ -532,7 +624,7 @@ Hello, world!
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
         self.assertSoupEquals("�", expect)
-        
+
     def test_multipart_strings(self):
         "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
         soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
@@ -594,7 +686,7 @@ Hello, world!
         markup = b'<a class="foo bar">'
         soup = self.soup(markup)
         self.assertEqual(['foo', 'bar'], soup.a['class'])
-        
+
 #
 # Generally speaking, tests below this point are more tests of
 # Beautiful Soup than tests of the tree builders. But parsers are
@@ -779,11 +871,44 @@ Hello, world!
         # encoding.
         self.assertEqual('utf8', charset.encode("utf8"))

+    def test_python_specific_encodings_not_used_in_charset(self):
+        # You can encode an HTML document using a Python-specific
+        # encoding, but that encoding won't be mentioned _inside_ the
+        # resulting document. Instead, the document will appear to
+        # have no encoding.
+        for markup in [
+            b'<meta charset="utf8"></meta>'
+            b'<meta id="encoding" charset="utf8" />'
+        ]:
+            soup = self.soup(markup)
+            for encoding in PYTHON_SPECIFIC_ENCODINGS:
+                if encoding in (
+                    'idna', 'mbcs', 'oem', 'undefined',
+                    'string_escape', 'string-escape'
+                ):
+                    # For one reason or another, these will raise an
+                    # exception if we actually try to use them, so don't
+                    # bother.
+                    continue
+                encoded = soup.encode(encoding)
+                assert b'meta charset=""' in encoded
+                assert encoding.encode("ascii") not in encoded
+
     def test_tag_with_no_attributes_can_have_attributes_added(self):
         data = self.soup("<a>text</a>")
         data.a['foo'] = 'bar'
         self.assertEqual('<a foo="bar">text</a>', data.a.decode())

+    def test_closing_tag_with_no_opening_tag(self):
+        # Without BeautifulSoup.open_tag_counter, the </span> tag will
+        # cause _popToTag to be called over and over again as we look
+        # for a <span> tag that wasn't there. The result is that 'text2'
+        # will show up outside the body of the document.
+        soup = self.soup("<body><div><p>text1</p></span>text2</div></body>
") + self.assertEqual( + "

text1

text2
", soup.body.decode() + ) + def test_worst_case(self): """Test the worst case (currently) for linking issues.""" @@ -791,7 +916,7 @@ Hello, world! self.linkage_validator(soup) -class XMLTreeBuilderSmokeTest(object): +class XMLTreeBuilderSmokeTest(TreeBuilderSmokeTest): def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical @@ -812,6 +937,25 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(markup, soup.encode("utf8")) + def test_python_specific_encodings_not_used_in_xml_declaration(self): + # You can encode an XML document using a Python-specific + # encoding, but that encoding won't be mentioned _inside_ the + # resulting document. + markup = b"""\n""" + soup = self.soup(markup) + for encoding in PYTHON_SPECIFIC_ENCODINGS: + if encoding in ( + 'idna', 'mbcs', 'oem', 'undefined', + 'string_escape', 'string-escape' + ): + # For one reason or another, these will raise an + # exception if we actually try to use them, so don't + # bother. + continue + encoded = soup.encode(encoding) + assert b'' in encoded + assert encoding.encode("ascii") not in encoded + def test_processing_instruction(self): markup = b"""\n""" soup = self.soup(markup) @@ -828,7 +972,7 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual( soup.encode("utf-8"), markup) - + def test_nested_namespaces(self): doc = b""" diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py index d7a0b298..f8902ad7 100644 --- a/lib/bs4/tests/test_html5lib.py +++ b/lib/bs4/tests/test_html5lib.py @@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): soup = self.soup(markup, store_line_numbers=False) self.assertEqual("sourceline", soup.p.sourceline.name) self.assertEqual("sourcepos", soup.p.sourcepos.name) + + def test_special_string_containers(self): + # The html5lib tree builder doesn't support this standard feature, + # because there's no way of knowing, when a string is created, + # where in the tree it will eventually end up. + pass + + def test_html5_attributes(self): + # The html5lib TreeBuilder can convert any entity named in + # the HTML5 spec to a sequence of Unicode characters, and + # convert those Unicode characters to a (potentially + # different) named entity on the way out. + # + # This is a copy of the same test from + # HTMLParserTreeBuilderSmokeTest. It's not in the superclass + # because the lxml HTML TreeBuilder _doesn't_ work this way. + for input_element, output_unicode, output_element in ( + ("⇄", '\u21c4', b'⇄'), + ('⊧', '\u22a7', b'⊧'), + ('𝔑', '\U0001d511', b'𝔑'), + ('≧̸', '\u2267\u0338', b'≧̸'), + ('¬', '\xac', b'¬'), + ('⫬', '\u2aec', b'⫬'), + ('"', '"', b'"'), + ('∴', '\u2234', b'∴'), + ('∴', '\u2234', b'∴'), + ('∴', '\u2234', b'∴'), + ("fj", 'fj', b'fj'), + ("⊔", '\u2294', b'⊔'), + ("⊔︀", '\u2294\ufe00', b'⊔︀'), + ("'", "'", b"'"), + ("|", "|", b"|"), + ): + markup = '
diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py
index d7a0b298..f8902ad7 100644
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@@ -182,3 +182,45 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         soup = self.soup(markup, store_line_numbers=False)
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+    def test_special_string_containers(self):
+        # The html5lib tree builder doesn't support this standard feature,
+        # because there's no way of knowing, when a string is created,
+        # where in the tree it will eventually end up.
+        pass
+
+    def test_html5_attributes(self):
+        # The html5lib TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        #
+        # This is a copy of the same test from
+        # HTMLParserTreeBuilderSmokeTest. It's not in the superclass
+        # because the lxml HTML TreeBuilder _doesn't_ work this way.
+        for input_element, output_unicode, output_element in (
+            ("&RightLeftArrows;", '\u21c4', b'&rlarr;'),
+            ('&models;', '\u22a7', b'&models;'),
+            ('&Nfr;', '\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
+            ('&not;', '\xac', b'&not;'),
+            ('&Not;', '\u2aec', b'&Not;'),
+            ('&quot;', '"', b'"'),
+            ('&there4;', '\u2234', b'&there4;'),
+            ('&therefore;', '\u2234', b'&there4;'),
+            ('&Therefore;', '\u2234', b'&there4;'),
+            ("&fjlig;", 'fj', b'fj'),
+            ("&sqcup;", '\u2294', b'&sqcup;'),
+            ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", "'", b"'"),
+            ("&verbar;", "|", b"|"),
+        ):
+            markup = '<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
diff --git a/lib/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py
index 7be64935..0d8161ef 100644
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@@ -3,6 +3,7 @@ trees."""
 
 from pdb import set_trace
 import pickle
+import warnings
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
 from bs4.builder._htmlparser import BeautifulSoupHTMLParser
@@ -51,11 +52,83 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
 
+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        self.assertEqual("url3", soup.a['href'])
+        self.assertEqual(["cls"], soup.a['class'])
+        self.assertEqual("id", soup.a['id'])
+
+        # You can also get this behavior explicitly.
+        def assert_attribute(on_duplicate_attribute, expected):
+            soup = self.soup(
+                markup, on_duplicate_attribute=on_duplicate_attribute
+            )
+            self.assertEqual(expected, soup.a['href'])
+
+            # Verify that non-duplicate attributes are treated normally.
+            self.assertEqual(["cls"], soup.a['class'])
+            self.assertEqual("id", soup.a['id'])
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+        assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+    def test_html5_attributes(self):
+        # The html.parser TreeBuilder can convert any entity named in
+        # the HTML5 spec to a sequence of Unicode characters, and
+        # convert those Unicode characters to a (potentially
+        # different) named entity on the way out.
+        for input_element, output_unicode, output_element in (
+            ("&RightLeftArrows;", '\u21c4', b'&rlarr;'),
+            ('&models;', '\u22a7', b'&models;'),
+            ('&Nfr;', '\U0001d511', b'&Nfr;'),
+            ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
+            ('&not;', '\xac', b'&not;'),
+            ('&Not;', '\u2aec', b'&Not;'),
+            ('&quot;', '"', b'"'),
+            ('&there4;', '\u2234', b'&there4;'),
+            ('&therefore;', '\u2234', b'&there4;'),
+            ('&Therefore;', '\u2234', b'&there4;'),
+            ("&fjlig;", 'fj', b'fj'),
+            ("&sqcup;", '\u2294', b'&sqcup;'),
+            ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
+            ("&apos;", "'", b"'"),
+            ("&verbar;", "|", b"|"),
+        ):
+            markup = '<div>%s</div>' % input_element
+            div = self.soup(markup).div
+            without_element = div.encode()
+            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
+            self.assertEqual(without_element, expect)
+
+            with_element = div.encode(formatter="html")
+            expect = b"<div>%s</div>" % output_element
+            self.assertEqual(with_element, expect)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
         that doesn't cause a crash.
         """
         parser = BeautifulSoupHTMLParser()
-        parser.error("don't crash")
+        with warnings.catch_warnings(record=True) as warns:
+            parser.error("don't crash")
+        [warning] = warns
+        assert "don't crash" == str(warning.message)
diff --git a/lib/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py
index 3d0c75fa..71931ffe 100644
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@@ -45,7 +45,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
             "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
         self.assertSoupEquals(
             "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
-    
+
     def test_entities_in_foreign_document_encoding(self):
         # We can't implement this case correctly because by the time we
         # hear about markup like "&#147;", it's been (incorrectly) converted into
diff --git a/lib/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py
index 0e7dac11..4d00845d 100644
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@@ -3,6 +3,7 @@
 
 from pdb import set_trace
 import logging
+import os
 import unittest
 import sys
 import tempfile
@@ -10,6 +11,8 @@ import tempfile
 from bs4 import (
     BeautifulSoup,
     BeautifulStoneSoup,
+    GuessedAtParserWarning,
+    MarkupResemblesLocatorWarning,
 )
 from bs4.builder import (
     TreeBuilder,
@@ -29,7 +32,6 @@ import bs4.dammit
 from bs4.dammit import (
     EntitySubstitution,
     UnicodeDammit,
-    EncodingDetector,
 )
 from bs4.testing import (
     default_builder,
@@ -73,6 +75,7 @@ class TestConstructor(SoupTest):
             self.store_line_numbers = False
             self.cdata_list_attributes = []
             self.preserve_whitespace_tags = []
+            self.string_containers = {}
         def initialize_soup(self, soup):
             pass
         def feed(self, markup):
@@ -186,28 +189,69 @@ class TestConstructor(SoupTest):
             isinstance(x, (TagPlus, StringPlus, CommentPlus))
             for x in soup.recursiveChildGenerator()
         )
+
+    def test_alternate_string_containers(self):
+        # Test the ability to customize the string containers for
+        # different types of tags.
+        class PString(NavigableString):
+            pass
+
+        class BString(NavigableString):
+            pass
+
+        soup = self.soup(
+            "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
+            string_containers = {
+                'b': BString,
+                'p': PString,
+            }
+        )
+
+        # The string before the <p> tag is a regular NavigableString.
+        assert isinstance(soup.div.contents[0], NavigableString)
+
+        # The string inside the <p> tag, but not inside the <b> tag,
+        # is a PString.
+        assert isinstance(soup.p.contents[0], PString)
+
+        # Every string inside the <b> tag is a BString, even the one that
+        # was also inside an <i> tag.
+        for s in soup.b.strings:
+            assert isinstance(s, BString)
+
+        # Now that parsing was complete, the string_container_stack
+        # (where this information was kept) has been cleared out.
+        self.assertEqual([], soup.string_container_stack)
+
+
 class TestWarnings(SoupTest):
 
-    def _no_parser_specified(self, s, is_there=True):
-        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
-        self.assertTrue(v)
+    def _assert_warning(self, warnings, cls):
+        for w in warnings:
+            if isinstance(w.message, cls):
+                return w
+        raise Exception("%s warning not found in %r" % (cls, warnings))
+
+    def _assert_no_parser_specified(self, w):
+        warning = self._assert_warning(w, GuessedAtParserWarning)
+        message = str(warning.message)
+        self.assertTrue(
+            message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
+        )
 
     def test_warning_if_no_parser_specified(self):
         with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>")
+        self._assert_no_parser_specified(w)
 
     def test_warning_if_parser_specified_too_vague(self):
         with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html")
-        msg = str(w[0].message)
-        self._assert_no_parser_specified(msg)
+            soup = BeautifulSoup("<a><b></b></a>", "html")
+        self._assert_no_parser_specified(w)
 
     def test_no_warning_if_explicit_parser_specified(self):
         with warnings.catch_warnings(record=True) as w:
-            soup = self.soup("<a><b></b></a>", "html.parser")
+            soup = BeautifulSoup("<a><b></b></a>", "html.parser")
         self.assertEqual([], w)
 
     def test_parseOnlyThese_renamed_to_parse_only(self):
@@ -231,41 +275,58 @@ class TestWarnings(SoupTest):
         self.assertRaises(
             TypeError, self.soup, "<a>", no_such_argument=True)
 
-class TestWarnings(SoupTest):
-
     def test_disk_file_warning(self):
         filehandle = tempfile.NamedTemporaryFile()
         filename = filehandle.name
         try:
             with warnings.catch_warnings(record=True) as w:
                 soup = self.soup(filename)
-            msg = str(w[0].message)
-            self.assertTrue("looks like a filename" in msg)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a filename" in str(warning.message))
         finally:
             filehandle.close()
 
         # The file no longer exists, so Beautiful Soup will no longer issue the warning.
         with warnings.catch_warnings(record=True) as w:
             soup = self.soup(filename)
-        self.assertEqual(0, len(w))
+        self.assertEqual([], w)
 
+    def test_directory_warning(self):
+        try:
+            filename = tempfile.mkdtemp()
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
+            self.assertTrue("looks like a directory" in str(warning.message))
+        finally:
+            os.rmdir(filename)
+
+        # The directory no longer exists, so Beautiful Soup will no longer issue the warning.
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(filename)
+        self.assertEqual([], w)
+
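Because these heuristics still fire on legitimate one-word markup, the dedicated warning classes make them easy to silence without matching on message text. A sketch, assuming bs4 4.10+ (the URL is a placeholder):

    import warnings
    from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', MarkupResemblesLocatorWarning)
        # Would normally warn that the markup looks like a URL.
        soup = BeautifulSoup('http://www.example.com/', 'html.parser')

GuessedAtParserWarning can be filtered the same way, though passing an explicit parser is the better fix.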
     def test_url_warning_with_bytes_url(self):
         with warnings.catch_warnings(record=True) as warning_list:
             soup = self.soup(b"http://www.crummybytes.com/")
-        # Be aware this isn't the only warning that can be raised during
-        # execution..
-        self.assertTrue(any("looks like a URL" in str(w.message)
-            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))
 
     def test_url_warning_with_unicode_url(self):
         with warnings.catch_warnings(record=True) as warning_list:
             # note - this url must differ from the bytes one otherwise
             # python's warnings system swallows the second warning
             soup = self.soup("http://www.crummyunicode.com/")
-        self.assertTrue(any("looks like a URL" in str(w.message)
-            for w in warning_list))
+        warning = self._assert_warning(
+            warning_list, MarkupResemblesLocatorWarning
+        )
+        self.assertTrue("looks like a URL" in str(warning.message))
 
     def test_url_warning_with_bytes_and_space(self):
+        # Here the markup contains something besides a URL, so no warning
+        # is issued.
         with warnings.catch_warnings(record=True) as warning_list:
             soup = self.soup(b"http://www.crummybytes.com/ is great")
         self.assertFalse(any("looks like a URL" in str(w.message)
@@ -307,6 +368,51 @@ class TestEntitySubstitution(unittest.TestCase):
         self.assertEqual(self.sub.substitute_html(dammit.markup),
                          "&lsquo;&rsquo;foo&ldquo;&rdquo;")
 
+    def test_html5_entity(self):
+        # Some HTML5 entities correspond to single- or multi-character
+        # Unicode sequences.
+
+        for entity, u in (
+            # A few spot checks of our ability to recognize
+            # special character sequences and convert them
+            # to named entities.
+            ('&models;', '\u22a7'),
+            ('&Nfr;', '\U0001d511'),
+            ('&ngeqq;', '\u2267\u0338'),
+            ('&not;', '\xac'),
+            ('&Not;', '\u2aec'),
+
+            # We _could_ convert | to &verbar;, but we don't, because
+            # | is an ASCII character.
+            ('|', '|'),
+
+            # Similarly for the fj ligature, which we could convert to
+            # &fjlig;, but we don't.
+            ("fj", "fj"),
+
+            # We do convert _these_ ASCII characters to HTML entities,
+            # because that's required to generate valid HTML.
+            ('&gt;', '>'),
+            ('&lt;', '<'),
+            ('&amp;', '&'),
+        ):
+            template = '3 %s 4'
+            raw = template % u
+            with_entities = template % entity
+            self.assertEqual(self.sub.substitute_html(raw), with_entities)
+
+    def test_html5_entity_with_variation_selector(self):
+        # Some HTML5 entities correspond either to a single-character
+        # Unicode sequence _or_ to the same character plus U+FE00,
+        # VARIATION SELECTOR 1. We can handle this.
+        data = "fjords \u2294 penguins"
+        markup = "fjords &sqcup; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+
+        data = "fjords \u2294\ufe00 penguins"
+        markup = "fjords &sqcups; penguins"
+        self.assertEqual(self.sub.substitute_html(data), markup)
+
     def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
         s = 'Welcome to "my bar"'
         self.assertEqual(self.sub.substitute_xml(s, False), s)
@@ -416,235 +522,26 @@ class TestEncodingConversion(SoupTest):
         markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
         self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 
-class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of UnicodeDammit."""
-
-    def test_unicode_input(self):
-        markup = "I'm already Unicode! \N{SNOWMAN}"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(dammit.unicode_markup, markup)
-
-    def test_smart_quotes_to_unicode(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup)
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
-
-    def test_smart_quotes_to_xml_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
-
-    def test_smart_quotes_to_html_entities(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="html")
-        self.assertEqual(
-            dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
-
-    def test_smart_quotes_to_ascii(self):
-        markup = b"<foo>\x91\x92\x93\x94</foo>"
-        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
-        self.assertEqual(
-            dammit.unicode_markup, """<foo>''""</foo>""")
-
-    def test_detect_utf8(self):
-        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
-        dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
-
-    def test_convert_hebrew(self):
-        hebrew = b"\xed\xe5\xec\xf9"
-        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
-        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
-
-    def test_dont_see_smart_quotes_where_there_are_none(self):
-        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
-        dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
-
-    def test_ignore_inappropriate_codecs(self):
-        utf8_data = "Räksmörgås".encode("utf-8")
-        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_ignore_invalid_codecs(self):
-        utf8_data = "Räksmörgås".encode("utf-8")
-        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
-            dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
-
-    def test_exclude_encodings(self):
-        # This is UTF-8.
-        utf8_data = "Räksmörgås".encode("utf-8")
-
-        # But if we exclude UTF-8 from consideration, the guess is
-        # Windows-1252.
-        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
-        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
-
-        # And if we exclude that, there is no valid guess at all.
-        dammit = UnicodeDammit(
-            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
-        self.assertEqual(dammit.original_encoding, None)
-
-    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
-        detected = EncodingDetector(
-            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
-        encodings = list(detected.encodings)
-        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
-
-    def test_detect_html5_style_meta_tag(self):
-
-        for data in (
-            b'<html><meta charset="euc-jp" /></html>',
-            b"<html><meta charset='euc-jp' /></html>",
-            b"<html><meta charset=euc-jp /></html>",
-            b"<html><meta charset=euc-jp/></html>"):
-            dammit = UnicodeDammit(data, is_html=True)
-            self.assertEqual(
-                "euc-jp", dammit.original_encoding)
-
-    def test_last_ditch_entity_replacement(self):
-        # This is a UTF-8 document that contains bytestrings
-        # completely incompatible with UTF-8 (ie. encoded with some other
-        # encoding).
-        #
-        # Since there is no consistent encoding for the document,
-        # Unicode, Dammit will eventually encode the document as UTF-8
-        # and encode the incompatible characters as REPLACEMENT
-        # CHARACTER.
-        #
-        # If chardet is installed, it will detect that the document
-        # can be converted into ISO-8859-1 without errors. This happens
-        # to be the wrong encoding, but it is a consistent encoding, so the
-        # code we're testing here won't run.
-        #
-        # So we temporarily disable chardet if it's present.
-        doc = b"""\357\273\277<?xml version="1.0" encoding="utf-8"?>
-<html><b>\330\250\330\252\330\261</b>
-<i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet_dammit
-        logging.disable(logging.WARNING)
-        try:
-            def noop(str):
-                return None
-            bs4.dammit.chardet_dammit = noop
-            dammit = UnicodeDammit(doc)
-            self.assertEqual(True, dammit.contains_replacement_characters)
-            self.assertTrue("\ufffd" in dammit.unicode_markup)
-
-            soup = BeautifulSoup(doc, "html.parser")
-            self.assertTrue(soup.contains_replacement_characters)
-        finally:
-            logging.disable(logging.NOTSET)
-            bs4.dammit.chardet_dammit = chardet
-
-    def test_byte_order_mark_removed(self):
-        # A document written in UTF-16LE will have its byte order marker stripped.
-        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
-        dammit = UnicodeDammit(data)
-        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
-        self.assertEqual("utf-16le", dammit.original_encoding)
-
-    def test_detwingle(self):
-        # Here's a UTF8 document.
-        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
-
-        # Here's a Windows-1252 document.
-        windows_1252 = (
-            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
-            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
-
-        # Through some unholy alchemy, they've been stuck together.
-        doc = utf8 + windows_1252 + utf8
-
-        # The document can't be turned into UTF-8:
-        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
-
-        # Unicode, Dammit thinks the whole document is Windows-1252,
-        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
-
-        # But if we run it through fix_embedded_windows_1252, it's fixed:
-        fixed = UnicodeDammit.detwingle(doc)
-        self.assertEqual(
-            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
-
-    def test_detwingle_ignores_multibyte_characters(self):
-        # Each of these characters has a UTF-8 representation ending
-        # in \x93. \x93 is a smart quote if interpreted as
-        # Windows-1252. But our code knows to skip over multibyte
-        # UTF-8 characters, so they'll survive the process unscathed.
-        for tricky_unicode_char in (
-            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
-            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
-            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
-        ):
-            input = tricky_unicode_char.encode("utf8")
-            self.assertTrue(input.endswith(b'\x93'))
-            output = UnicodeDammit.detwingle(input)
-            self.assertEqual(output, input)
-
-    def test_find_declared_encoding(self):
-        # Test our ability to find a declared encoding inside an
-        # XML or HTML document.
-        #
-        # Even if the document comes in as Unicode, it may be
-        # interesting to know what encoding was claimed
-        # originally.
-
-        html_unicode = '<html><head><meta charset="utf-8"></head></html>'
-        html_bytes = html_unicode.encode("ascii")
-
-        xml_unicode= '<?xml version="1.0" encoding="iso-8859-1"?>'
-        xml_bytes = xml_unicode.encode("ascii")
-
-        m = EncodingDetector.find_declared_encoding
-        self.assertEqual(None, m(html_unicode, is_html=False))
-        self.assertEqual("utf-8", m(html_unicode, is_html=True))
-        self.assertEqual("utf-8", m(html_bytes, is_html=True))
-
-        self.assertEqual("iso-8859-1", m(xml_unicode))
-        self.assertEqual("iso-8859-1", m(xml_bytes))
-
-        # Normally, only the first few kilobytes of a document are checked for
-        # an encoding.
-        spacer = b' ' * 5000
-        self.assertEqual(None, m(spacer + html_bytes))
-        self.assertEqual(None, m(spacer + xml_bytes))
-
-        # But you can tell find_declared_encoding to search an entire
-        # HTML document.
-        self.assertEqual(
-            "utf-8",
-            m(spacer + html_bytes, is_html=True, search_entire_document=True)
-        )
-
-        # The XML encoding declaration has to be the very first thing
-        # in the document. We'll allow whitespace before the document
-        # starts, but nothing else.
-        self.assertEqual(
-            "iso-8859-1",
-            m(xml_bytes, search_entire_document=True)
-        )
-        self.assertEqual(
-            None, m(b'a' + xml_bytes, search_entire_document=True)
-        )
-
 class TestNamedspacedAttribute(SoupTest):
 
     def test_name_may_be_none_or_missing(self):
         a = NamespacedAttribute("xmlns", None)
         self.assertEqual(a, "xmlns")
 
+        a = NamespacedAttribute("xmlns", "")
+        self.assertEqual(a, "xmlns")
+
         a = NamespacedAttribute("xmlns")
         self.assertEqual(a, "xmlns")
 
+    def test_namespace_may_be_none_or_missing(self):
+        a = NamespacedAttribute(None, "tag")
+        self.assertEqual(a, "tag")
+
+        a = NamespacedAttribute("", "tag")
+        self.assertEqual(a, "tag")
+
     def test_attribute_is_equivalent_to_colon_separated_string(self):
         a = NamespacedAttribute("a", "b")
         self.assertEqual("a:b", a)
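A sketch of the substitute_html behavior the new entity-substitution tests pin down, assuming bs4 4.10+ (earlier versions cannot map a multi-character sequence onto a single named entity):

    from bs4.dammit import EntitySubstitution

    # Two code points collapse to one entity.
    assert EntitySubstitution.substitute_html('\u2267\u0338') == '&ngeqq;'
    # Plain ASCII is left alone...
    assert EntitySubstitution.substitute_html('fj') == 'fj'
    # ...except the characters required for valid HTML.
    assert EntitySubstitution.substitute_html('a & b') == 'a &amp; b'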
diff --git a/lib/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py
index e69afdf9..59b51d0b 100644
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@@ -27,13 +27,17 @@ from bs4.element import (
     Doctype,
     Formatter,
     NavigableString,
+    Script,
     SoupStrainer,
+    Stylesheet,
     Tag,
+    TemplateString,
 )
 from bs4.testing import (
     SoupTest,
     skipIf,
 )
+from soupsieve import SelectorSyntaxError
 
 XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
 LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1005,6 +1009,15 @@ class TestTreeModification(SoupTest):
         soup.a.extend(l)
         self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
 
+    def test_extend_with_another_tags_contents(self):
+        data = '<body><div id="d1"><a>1</a><a>2</a><a>3</a><a>4</a></div><div id="d2"></div></body>'
+        soup = self.soup(data)
+        d1 = soup.find('div', id='d1')
+        d2 = soup.find('div', id='d2')
+        d2.extend(d1)
+        self.assertEqual('<div id="d1"></div>', d1.decode())
+        self.assertEqual('<div id="d2"><a>1</a><a>2</a><a>3</a><a>4</a></div>', d2.decode())
+
     def test_move_tag_to_beginning_of_parent(self):
         data = "<a><b></b><c></c><d></d></a>"
         soup = self.soup(data)
@@ -1117,6 +1130,37 @@ class TestTreeModification(SoupTest):
         self.assertEqual(no.next_element, "no")
         self.assertEqual(no.next_sibling, " business")
 
+    def test_replace_with_errors(self):
+        # Can't replace a tag that's not part of a tree.
+        a_tag = Tag(name="a")
+        self.assertRaises(ValueError, a_tag.replace_with, "won't work")
+
+        # Can't replace a tag with its parent.
+        a_tag = self.soup("<a><b></b></a>").a
+        self.assertRaises(ValueError, a_tag.b.replace_with, a_tag)
+
+        # Or with a list that includes its parent.
+        self.assertRaises(ValueError, a_tag.b.replace_with,
+                          "string1", a_tag, "string2")
+
+    def test_replace_with_multiple(self):
+        data = "<a><b></b><c></c></a>"
+        soup = self.soup(data)
+        d_tag = soup.new_tag("d")
+        d_tag.string = "Text In D Tag"
+        e_tag = soup.new_tag("e")
+        f_tag = soup.new_tag("f")
+        a_string = "Random Text"
+        soup.c.replace_with(d_tag, e_tag, a_string, f_tag)
+        self.assertEqual(
+            "<a><b></b><d>Text In D Tag</d><e></e>Random Text<f></f></a>",
+            soup.decode()
+        )
+        assert soup.b.next_element == d_tag
+        assert d_tag.string.next_element == e_tag
+        assert e_tag.next_element.string == a_string
+        assert e_tag.next_element.next_element == f_tag
+
     def test_replace_first_child(self):
         data = "<a><b></b><c></c></a>"
         soup = self.soup(data)
@@ -1275,6 +1319,23 @@ class TestTreeModification(SoupTest):
         a.clear(decompose=True)
         self.assertEqual(0, len(em.contents))
 
+
+    def test_decompose(self):
+        # Test PageElement.decompose() and PageElement.decomposed
+        soup = self.soup("<p><a>String <em>Italicized</em></a></p><p>Another para</p>")
+        p1, p2 = soup.find_all('p')
+        a = p1.a
+        text = p1.em.string
+        for i in [p1, p2, a, text]:
+            self.assertEqual(False, i.decomposed)
+
+        # This sets p1 and everything beneath it to decomposed.
+        p1.decompose()
+        for i in [p1, a, text]:
+            self.assertEqual(True, i.decomposed)
+
+        # p2 is unaffected.
+        self.assertEqual(False, p2.decomposed)
+
     def test_string_set(self):
         """Tag.string = 'string'"""
         soup = self.soup("<a></a> <b></b>")
@@ -1391,7 +1452,7 @@ class TestElementObjects(SoupTest):
         self.assertEqual(soup.a.get_text(","), "a,r, , t ")
         self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
 
-    def test_get_text_ignores_comments(self):
+    def test_get_text_ignores_special_string_containers(self):
         soup = self.soup("foo<!--IGNORE-->bar")
         self.assertEqual(soup.get_text(), "foobar")
 
@@ -1400,10 +1461,51 @@ class TestElementObjects(SoupTest):
         self.assertEqual(
             soup.get_text(types=None), "fooIGNOREbar")
 
-    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(soup.get_text(), "foobar")
+
+    def test_all_strings_ignores_special_string_containers(self):
         soup = self.soup("foo<!--IGNORE-->bar")
         self.assertEqual(['foo', 'bar'], list(soup.strings))
 
+        soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
") + + self.assertEqual(style.div.get_text(), "a") + self.assertEqual(list(style.div.strings), ["a"]) + self.assertEqual(style.div.style.get_text(), "Some CSS") + self.assertEqual(list(style.div.style.strings), + ['Some CSS']) + + # The comment is not picked up here. That's because it was + # parsed into a Comment object, which is not considered + # interesting by template.strings. + self.assertEqual(template.div.get_text(), "a") + self.assertEqual(list(template.div.strings), ["a"]) + self.assertEqual(template.div.template.get_text(), "Templated text.") + self.assertEqual(list(template.div.template.strings), + ["Templated ", "text", "."]) + + # The comment is included here, because it didn't get parsed + # into a Comment object--it's part of the Script string. + self.assertEqual(script.div.get_text(), "a") + self.assertEqual(list(script.div.strings), ["a"]) + self.assertEqual(script.div.script.get_text(), + "Some text") + self.assertEqual(list(script.div.script.strings), + ['Some text']) + class TestCDAtaListAttributes(SoupTest): """Testing cdata-list attributes like 'class'. @@ -1775,71 +1877,7 @@ class TestEncoding(SoupTest): else: self.assertEqual(b'\\u2603', repr(soup)) -class TestFormatter(SoupTest): - - def test_sort_attributes(self): - # Test the ability to override Formatter.attributes() to, - # e.g., disable the normal sorting of attributes. - class UnsortedFormatter(Formatter): - def attributes(self, tag): - self.called_with = tag - for k, v in sorted(tag.attrs.items()): - if k == 'ignore': - continue - yield k,v - - soup = self.soup('

') - formatter = UnsortedFormatter() - decoded = soup.decode(formatter=formatter) - - # attributes() was called on the

tag. It filtered out one - # attribute and sorted the other two. - self.assertEqual(formatter.called_with, soup.p) - self.assertEqual('

', decoded) - - -class TestNavigableStringSubclasses(SoupTest): - - def test_cdata(self): - # None of the current builders turn CDATA sections into CData - # objects, but you can create them manually. - soup = self.soup("") - cdata = CData("foo") - soup.insert(1, cdata) - self.assertEqual(str(soup), "") - self.assertEqual(soup.find(text="foo"), "foo") - self.assertEqual(soup.contents[0], "foo") - - def test_cdata_is_never_formatted(self): - """Text inside a CData object is passed into the formatter. - - But the return value is ignored. - """ - - self.count = 0 - def increment(*args): - self.count += 1 - return "BITTER FAILURE" - - soup = self.soup("") - cdata = CData("<><><>") - soup.insert(1, cdata) - self.assertEqual( - b"<><>]]>", soup.encode(formatter=increment)) - self.assertEqual(1, self.count) - - def test_doctype_ends_in_newline(self): - # Unlike other NavigableString subclasses, a DOCTYPE always ends - # in a newline. - doctype = Doctype("foo") - soup = self.soup("") - soup.insert(1, doctype) - self.assertEqual(soup.encode(), b"\n") - - def test_declaration(self): - d = Declaration("foo") - self.assertEqual("", d.output_ready()) - + class TestSoupSelector(TreeTest): HTML = """ @@ -1949,7 +1987,7 @@ class TestSoupSelector(TreeTest): self.assertEqual(len(self.soup.select('del')), 0) def test_invalid_tag(self): - self.assertRaises(SyntaxError, self.soup.select, 'tag%t') + self.assertRaises(SelectorSyntaxError, self.soup.select, 'tag%t') def test_select_dashed_tag_ids(self): self.assertSelects('custom-dashed-tag', ['dash1', 'dash2']) @@ -2140,7 +2178,7 @@ class TestSoupSelector(TreeTest): NotImplementedError, self.soup.select, "a:no-such-pseudoclass") self.assertRaises( - SyntaxError, self.soup.select, "a:nth-of-type(a)") + SelectorSyntaxError, self.soup.select, "a:nth-of-type(a)") def test_nth_of_type(self): # Try to select first paragraph @@ -2196,7 +2234,7 @@ class TestSoupSelector(TreeTest): self.assertEqual([], self.soup.select('#inner ~ h2')) def test_dangling_combinator(self): - self.assertRaises(SyntaxError, self.soup.select, 'h1 >') + self.assertRaises(SelectorSyntaxError, self.soup.select, 'h1 >') def test_sibling_combinator_wont_select_same_tag_twice(self): self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) @@ -2227,8 +2265,8 @@ class TestSoupSelector(TreeTest): self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac']) def test_invalid_multiple_select(self): - self.assertRaises(SyntaxError, self.soup.select, ',x, y') - self.assertRaises(SyntaxError, self.soup.select, 'x,,y') + self.assertRaises(SelectorSyntaxError, self.soup.select, ',x, y') + self.assertRaises(SelectorSyntaxError, self.soup.select, 'x,,y') def test_multiple_select_attrs(self): self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
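A sketch of the failure mode behind the SyntaxError to SelectorSyntaxError changes above, assuming soupsieve is installed (it is the selector backend these tests now import from):

    from bs4 import BeautifulSoup
    from soupsieve import SelectorSyntaxError

    soup = BeautifulSoup('<p>x</p>', 'html.parser')
    try:
        soup.select('h1 >')  # dangling combinator, as in the test
    except SelectorSyntaxError as e:
        print('rejected:', e)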