diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py index 7ba34269..aa818ae4 100644 --- a/libs/bs4/__init__.py +++ b/libs/bs4/__init__.py @@ -5,26 +5,31 @@ http://www.crummy.com/software/BeautifulSoup/ Beautiful Soup uses a pluggable XML or HTML parser to parse a (possibly invalid) document into a tree representation. Beautiful Soup -provides provides methods and Pythonic idioms that make it easy to -navigate, search, and modify the parse tree. +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. -Beautiful Soup works with Python 2.6 and up. It works better if lxml +Beautiful Soup works with Python 2.7 and up. It works better if lxml and/or html5lib is installed. For more than you ever wanted to know about Beautiful Soup, see the documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.3.2" -__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" +__version__ = "4.5.1" +__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] import os import re +import traceback import warnings from .builder import builder_registry, ParserRejectedMarkup @@ -45,7 +50,7 @@ from .element import ( # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -77,8 +82,11 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, **kwargs): + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" @@ -114,9 +122,9 @@ class BeautifulSoup(Tag): del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. You can pass in features='html' " - "or features='xml' to get a builder capable of handling " - "one or the other.") + "BeautifulSoup constructor. 
Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") def deprecated_argument(old_name, new_name): if old_name in kwargs: @@ -134,12 +142,17 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: + original_features = features if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -151,15 +164,35 @@ class BeautifulSoup(Tag): "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() + if not (original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + caller = traceback.extract_stack()[0] + filename = caller[0] + line_number = caller[1] + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type)) + self.builder = builder self.is_xml = builder.is_xml + self.known_xml = self.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - elif len(markup) <= 256: + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, @@ -178,19 +211,18 @@ class BeautifulSoup(Tag): # system. Just let it go. pass if is_file: + if isinstance(markup, unicode): + markup = markup.encode("utf8") warnings.warn( - '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) - if markup[:5] == "http:" or markup[:6] == "https:": - # TODO: This is ugly but I couldn't get it to work in - # Python 3 otherwise. - if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( - self.builder.prepare_markup(markup, from_encoding)): + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() try: self._feed() @@ -203,6 +235,53 @@ class BeautifulSoup(Tag): self.markup = None self.builder.soup = None + def __copy__(self): + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. 
+ copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + def _feed(self): # Convert the document to Unicode. self.builder.reset() @@ -229,9 +308,7 @@ class BeautifulSoup(Tag): def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" - navigable = subclass(s) - navigable.setup() - return navigable + return subclass(s) def insert_before(self, successor): raise NotImplementedError("BeautifulSoup objects don't support insert_before().") @@ -290,14 +367,60 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" parent = parent or self.currentTag - most_recent_element = most_recent_element or self._most_recent_element - o.setup(parent, most_recent_element) + previous_element = most_recent_element or self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if not previous_element: + previous_element = o.previous_element + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) - if most_recent_element is not None: - most_recent_element.next_element = o self._most_recent_element = o parent.contents.append(o) + if parent.next_sibling: + # This node is being inserted into an element that has + # already been parsed. Deal with any dangling references. + index = len(parent.contents)-1 + while index >= 0: + if parent.contents[index] is o: + break + index -= 1 + else: + raise ValueError( + "Error building tree: supposedly %r was inserted " + "into %r after the fact, but I don't see it!" 
% ( + o, parent + ) + ) + if index == 0: + previous_element = parent + previous_sibling = None + else: + previous_element = previous_sibling = parent.contents[index-1] + if index == len(parent.contents)-1: + next_element = parent.next_sibling + next_sibling = None + else: + next_element = next_sibling = parent.contents[index+1] + + o.previous_element = previous_element + if previous_element: + previous_element.next_element = o + o.next_element = next_element + if next_element: + next_element.previous_element = o + o.next_sibling = next_sibling + if next_sibling: + next_sibling.previous_sibling = o + o.previous_sibling = previous_sibling + if previous_sibling: + previous_sibling.next_sibling = o + def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag @@ -325,7 +448,7 @@ class BeautifulSoup(Tag): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the - SoupStrainer. You should proceed as if the tag had not occured + SoupStrainer. You should proceed as if the tag had not occurred in the document. For instance, if this was a self-closing tag, don't call handle_endtag. """ diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index 740f5f29..601979bf 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,9 +1,13 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + HTMLAwareEntitySubstitution, whitespace_re ) @@ -80,9 +84,12 @@ builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -224,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags. """ - preserve_whitespace_tags = set(['pre', 'textarea']) + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 7de36ae7..c46f8823 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,3 +1,6 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ __all__ = [ 'HTML5TreeBuilder', ] @@ -9,7 +12,10 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -19,14 +25,32 @@ from bs4.element import ( Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML_5, HTML] + NAME = "html5lib" - def prepare_markup(self, markup, user_specified_encoding): + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") yield (markup, None, None, False) # These methods are defined by Beautiful Soup. @@ -34,7 +58,14 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if self.soup.parse_only is not None: warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. if isinstance(markup, unicode): @@ -42,7 +73,13 @@ # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( @@ -54,7 +91,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): return u'<html><head></head><body>%s</body></html>' % fragment -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup @@ -92,7 +129,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): return self.soup def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element class AttrList(object): def __init__(self, element): @@ -101,7 +138,16 @@ class AttrList(object): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -115,9 +161,9 @@ class AttrList(object): return name in list(self.attrs.keys()) -class Element(html5lib.treebuilders._base.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ -161,6 +207,12 @@ class Element(html5lib.treebuilders._base.Node): # immediately after the parent, if it has no children.) if self.element.contents: most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree.
+ most_recent_element = self.soup._last_descendant() else: most_recent_element = self.element @@ -172,6 +224,7 @@ class Element(html5lib.treebuilders._base.Node): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] @@ -218,6 +271,9 @@ class Element(html5lib.treebuilders._base.Node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -236,17 +292,28 @@ class Element(html5lib.treebuilders._base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent.element.contents + append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - first_child.previous_element = new_parents_last_descendant + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child # Fix the last child's next_element and next_sibling last_child = to_append[-1] last_child.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child last_child.next_sibling = None for child in to_append: @@ -257,6 +324,10 @@ class Element(html5lib.treebuilders._base.Node): element.contents = [] element.next_element = final_next_element + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) @@ -277,7 +348,7 @@ class Element(html5lib.treebuilders._base.Node): class TextNode(Element): def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index ca8d8b89..823ca15a 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -1,13 +1,22 @@ """Use the HTMLParser library to parse HTML files that aren't too bad.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -19,10 +28,10 @@ import warnings # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. 
major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, @@ -63,7 +72,8 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -113,14 +123,6 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_pi(self, data): self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. - data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) @@ -128,15 +130,19 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be @@ -147,7 +153,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index fa5d4987..d2ca2872 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,3 +1,5 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -7,7 +9,13 @@ from io import BytesIO from StringIO import StringIO import collections from lxml import etree -from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) from bs4.builder import ( FAST, HTML, @@ -24,9 +32,13 @@ class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] # Well, it's permissive by XML parser standards. 
- features = [LXML, XML, FAST, PERMISSIVE] + features = [NAME, LXML, XML, FAST, PERMISSIVE] CHUNK_SIZE = 512 @@ -70,6 +82,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. @@ -78,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + if isinstance(markup, unicode): # We were given Unicode. Maybe lxml can parse Unicode on # this system? @@ -89,13 +112,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): yield (markup.encode("utf8"), "utf8", document_declared_encoding, False) - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. - is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector(markup, try_encodings, is_html) + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) @@ -189,7 +208,9 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.pop() def pi(self, target, data): - pass + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) def data(self, content): self.soup.handle_data(content) @@ -212,8 +233,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST, PERMISSIVE] + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False + processing_instruction_class = ProcessingInstruction def default_parser(self, encoding): return etree.HTMLParser diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py index 59640b7c..2bf67f7f 100644 --- a/libs/bs4/dammit.py +++ b/libs/bs4/dammit.py @@ -3,9 +3,12 @@ This library converts a bytestream to Unicode through any means necessary. It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It works best on XML and XML, but it does not rewrite the +Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +__license__ = "MIT" import codecs from htmlentitydefs import codepoint2name @@ -212,8 +215,11 @@ class EncodingDetector: 5. Windows-1252. 
""" - def __init__(self, markup, override_encodings=None, is_html=False): + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None @@ -224,6 +230,8 @@ class EncodingDetector: def _usable(self, encoding, tried): if encoding is not None: encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False if encoding not in tried: tried.add(encoding) return True @@ -266,6 +274,9 @@ class EncodingDetector: def strip_byte_order_mark(cls, data): """If a byte-order mark is present, strip it and return the encoding it implies.""" encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. + return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' @@ -306,7 +317,7 @@ class EncodingDetector: declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii') + 'ascii', 'replace') if declared_encoding: return declared_encoding.lower() return None @@ -331,13 +342,14 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): + smart_quotes_to=None, is_html=False, exclude_encodings=[]): self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - - self.detector = EncodingDetector(markup, override_encodings, is_html) + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. if isinstance(markup, unicode) or markup == '': @@ -365,9 +377,10 @@ class UnicodeDammit: if encoding != "ascii": u = self._convert_from(encoding, "replace") if u is not None: - logging.warning( + self.log.warning( "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.") + "replaced with REPLACEMENT CHARACTER." + ) self.contains_replacement_characters = True break diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 4d0b00af..8768332f 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -1,4 +1,9 @@ """Diagnostic functions, mainly for use when doing tech support.""" + +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+__license__ = "MIT" + import cProfile from StringIO import StringIO from HTMLParser import HTMLParser @@ -33,18 +38,28 @@ def diagnose(data): if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) - from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + try: + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + except ImportError, e: + print ( + "lxml is not installed or couldn't be imported.") + if 'html5lib' in basic_parsers: - import html5lib - print "Found html5lib version %s" % html5lib.__version__ + try: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + except ImportError, e: + print ( + "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data - data = open(data).read() + with open(data) as fp: + data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." diff --git a/libs/bs4/element.py b/libs/bs4/element.py index da9afdf4..b100d18b 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -1,5 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +__license__ = "MIT" + import collections import re +import shlex import sys import warnings from bs4.dammit import EntitySubstitution @@ -96,6 +101,8 @@ class HTMLAwareEntitySubstitution(EntitySubstitution): preformatted_tags = set(["pre"]) + preserve_whitespace_tags = set(['pre', 'textarea']) + @classmethod def _substitute_if_appropriate(cls, ns, f): if (isinstance(ns, NavigableString) @@ -166,11 +173,19 @@ class PageElement(object): This is used when mapping a formatter name ("minimal") to an appropriate function (one that performs entity-substitution on - the contents of + +Hello, world! + +