mirror of
https://github.com/Tautulli/Tautulli.git
synced 2025-07-16 02:02:58 -07:00
Update beautifulsoup4-4.10.0
This commit is contained in:
parent
b581460b51
commit
ab8fa4d5b3
16 changed files with 4599 additions and 743 deletions
|
@ -20,9 +20,13 @@ import sys
|
|||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||
print("Python version %s" % sys.version)
|
||||
"""Diagnostic suite for isolating common problems.
|
||||
|
||||
:param data: A string containing markup that needs to be explained.
|
||||
:return: None; diagnostics are printed to standard output.
|
||||
"""
|
||||
print(("Diagnostic running on Beautiful Soup %s" % __version__))
|
||||
print(("Python version %s" % sys.version))
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
|
@ -39,65 +43,76 @@ def diagnose(data):
|
|||
basic_parsers.append("lxml-xml")
|
||||
try:
|
||||
from lxml import etree
|
||||
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||
print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
|
||||
except ImportError as e:
|
||||
print (
|
||||
print(
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
try:
|
||||
import html5lib
|
||||
print("Found html5lib version %s" % html5lib.__version__)
|
||||
print(("Found html5lib version %s" % html5lib.__version__))
|
||||
except ImportError as e:
|
||||
print (
|
||||
print(
|
||||
"html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||
print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
|
||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
if os.path.exists(data):
|
||||
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||
print(('"%s" looks like a filename. Reading data from the file.' % data))
|
||||
with open(data) as fp:
|
||||
data = fp.read()
|
||||
except ValueError:
|
||||
# This can happen on some platforms when the 'filename' is
|
||||
# too long. Assume it's data and not a filename.
|
||||
pass
|
||||
print()
|
||||
print("")
|
||||
|
||||
for parser in basic_parsers:
|
||||
print("Trying to parse your markup with %s" % parser)
|
||||
print(("Trying to parse your markup with %s" % parser))
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, features=parser)
|
||||
success = True
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
print(("%s could not parse the markup." % parser))
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print("Here's what %s did with the markup:" % parser)
|
||||
print(soup.prettify())
|
||||
print(("Here's what %s did with the markup:" % parser))
|
||||
print((soup.prettify()))
|
||||
|
||||
print("-" * 80)
|
||||
print(("-" * 80))
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running. You can use this to determine whether
    an lxml-specific problem is in Beautiful Soup's lxml tree builders
    or in lxml itself.

    :param data: Some markup, as a string or a bytestring.
    :param html: If True, markup will be parsed with lxml's HTML parser.
       If False, lxml's XML parser will be used.
    :param kwargs: Additional keyword arguments, passed through
       unchanged to `etree.iterparse`.
    :return: None; one line per parse event is printed to standard output.
    """
    from io import BytesIO
    from lxml import etree

    # etree.iterparse() pulls its input through read() and rejects
    # anything that is not bytes, so text markup has to be encoded
    # before it is wrapped in a file-like object.
    if isinstance(data, str):
        data = data.encode("utf8")
    for event, element in etree.iterparse(BytesIO(data), html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Announces HTMLParser parse events, without doing anything else."""
|
||||
"""Subclass of HTMLParser that announces parse events, without doing
|
||||
anything else.
|
||||
|
||||
You can use this to get a picture of how html.parser sees a given
|
||||
document. The easiest way to do this is to call `htmlparser_trace`.
|
||||
"""
|
||||
|
||||
def _p(self, s):
|
||||
print(s)
|
||||
|
@ -134,6 +149,8 @@ def htmlparser_trace(data):
|
|||
|
||||
This lets you see how HTMLParser parses a document when no
|
||||
Beautiful Soup code is running.
|
||||
|
||||
:param data: Some markup.
|
||||
"""
|
||||
parser = AnnouncingParser()
|
||||
parser.feed(data)
|
||||
|
@ -176,9 +193,9 @@ def rdoc(num_elements=1000):
|
|||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||
print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
|
||||
data = rdoc(num_elements)
|
||||
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||
print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
|
@ -188,26 +205,26 @@ def benchmark_parsers(num_elements=100000):
|
|||
b = time.time()
|
||||
success = True
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
print(("%s could not parse the markup." % parser))
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||
print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||
print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||
print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
"""Use Python's profiler on a randomly generated document."""
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
|
||||
|
@ -220,5 +237,6 @@ def profile(num_elements=100000, parser="lxml"):
|
|||
stats.sort_stats("cumulative")
|
||||
stats.print_stats('_html5lib|bs4', 50)
|
||||
|
||||
# When executed directly as a script, run the diagnostic suite on
# whatever markup arrives on standard input.
if __name__ == '__main__':
    markup = sys.stdin.read()
    diagnose(markup)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue