Update bs4 to 4.8.1 (with 2to3)

2025-08-14 02:26:58 -07:00 · 2019-11-23 18:54:24 -08:00 · 2019-11-23 18:54:24 -08:00 · f28e741ad7
commit f28e741ad7
parent 23c4e5b09d
19 changed files with 5487 additions and 792 deletions
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@ -1,7 +1,11 @@
 """Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
 import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
@ -17,8 +21,8 @@ import cProfile

 def diagnose(data):
    """Diagnostic suite for isolating common problems."""
-    print "Diagnostic running on Beautiful Soup %s" % __version__
-    print "Python version %s" % sys.version
+    print("Diagnostic running on Beautiful Soup %s" % __version__)
+    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in basic_parsers:
@ -27,44 +31,60 @@ def diagnose(data):
                break
        else:
            basic_parsers.remove(name)
-            print (
+            print((
                "I noticed that %s is not installed. Installing it may help." %
-                name)
+                name))

    if 'lxml' in basic_parsers:
-        basic_parsers.append(["lxml", "xml"])
-        from lxml import etree
-        print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        basic_parsers.append("lxml-xml")
+        try:
+            from lxml import etree
+            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+        except ImportError as e:
+            print (
+                "lxml is not installed or couldn't be imported.")
+

    if 'html5lib' in basic_parsers:
-        import html5lib
-        print "Found html5lib version %s" % html5lib.__version__
+        try:
+            import html5lib
+            print("Found html5lib version %s" % html5lib.__version__)
+        except ImportError as e:
+            print (
+                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
-    elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
-        data = open(data).read()
    elif data.startswith("http:") or data.startswith("https:"):
-        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
-        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
-    print
+    else:
+        try:
+            if os.path.exists(data):
+                print('"%s" looks like a filename. Reading data from the file.' % data)
+                with open(data) as fp:
+                    data = fp.read()
+        except ValueError:
+            # This can happen on some platforms when the 'filename' is
+            # too long. Assume it's data and not a filename.
+            pass
+        print()

    for parser in basic_parsers:
-        print "Trying to parse your markup with %s" % parser
+        print("Trying to parse your markup with %s" % parser)
        success = False
        try:
-            soup = BeautifulSoup(data, parser)
+            soup = BeautifulSoup(data, features=parser)
            success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
-            print "Here's what %s did with the markup:" % parser
-            print soup.prettify()
+            print("Here's what %s did with the markup:" % parser)
+            print(soup.prettify())

-        print "-" * 80
+        print("-" * 80)

 def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.
@ -74,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
    """
    from lxml import etree
    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-        print("%s, %4s, %s" % (event, element.tag, element.text))
+        print(("%s, %4s, %s" % (event, element.tag, element.text)))

 class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""
@ -135,7 +155,7 @@ def rword(length=5):
 def rsentence(length=4):
    "Generate a random sentence-like string."
    return " ".join(rword(random.randint(4,9)) for i in range(length))
-
+        
 def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
@ -156,10 +176,10 @@ def rdoc(num_elements=1000):

 def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
-    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
-    print "Generated a large invalid HTML document (%d bytes)." % len(data)
-
+    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
@ -167,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
-            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
-    print "Raw lxml parsed the markup in %.2fs." % (b-a)
+    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
-    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+    print("Raw html5lib parsed the markup in %.2fs." % (b-a))

 def profile(num_elements=100000, parser="lxml"):