updated bs4 lib. Fixes #1128

2025-08-14 02:26:53 -07:00 · 2016-11-09 15:02:24 +10:30 · 2016-11-09 15:02:24 +10:30 · c14618ebdd
commit c14618ebdd
parent c94731bcbb
15 changed files with 1077 additions and 237 deletions
--- a/libs/bs4/builder/_lxml.py
+++ b/libs/bs4/builder/_lxml.py
@@ -1,3 +1,5 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
 __all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
@@ -7,7 +9,13 @@ from io import BytesIO
 from StringIO import StringIO
 import collections
 from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+    XMLProcessingInstruction,
+)
 from bs4.builder import (
    FAST,
    HTML,
@@ -24,9 +32,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
+
+    NAME = "lxml-xml"
+    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
-    features = [LXML, XML, FAST, PERMISSIVE]
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

@@ -70,6 +82,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
                       document_declared_encoding=None):
        """
        :yield: A series of 4-tuples.
@@ -78,6 +91,16 @@ class LXMLTreeBuilderForXML(TreeBuilder):

        Each 4-tuple represents a strategy for parsing the document.
        """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
        if isinstance(markup, unicode):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?
@@ -89,13 +112,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

-        # Instead of using UnicodeDammit to convert the bytestring to
-        # Unicode using different encodings, use EncodingDetector to
-        # iterate over the encodings, and tell lxml to try to parse
-        # the document as each one in turn.
-        is_html = not self.is_xml
        try_encodings = [user_specified_encoding, document_declared_encoding]
-        detector = EncodingDetector(markup, try_encodings, is_html)
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

@@ -189,7 +208,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.nsmaps.pop()

    def pi(self, target, data):
-        pass
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        self.soup.handle_data(content)
@@ -212,8 +233,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):

 class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

-    features = [LXML, HTML, FAST, PERMISSIVE]
+    NAME = LXML
+    ALTERNATE_NAMES = ["lxml-html"]
+
+    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
+    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        return etree.HTMLParser