Update subliminal to 2.0.5

Also updates:
- appdirs-1.4.3
- babelfish-0.5.5
- beautifulsoup4-4.6.3
- certifi-2018.11.29
- chardet-3.0.4
- click-7.0
- decorator-4.3.0
- dogpile.cache-0.7.1
- enzyme-0.4.1
- guessit-3.0.3
- idna-2.8
- pbr-5.1.1
- pysrt-1.1.1
- python-dateutil-2.7.5
- pytz-2018.7
- rarfile-3.0
- rebulk-1.0.0
- requests-2.21.0
- six-1.12.0
- stevedore-1.30.0
- urllib3-1.24.1
This commit is contained in:
Labrys of Knossos 2018-12-15 01:12:12 -05:00
commit f3fcb47427
761 changed files with 29015 additions and 1843 deletions

View file

@ -6,6 +6,7 @@ __all__ = [
]
import warnings
import re
from bs4.builder import (
PERMISSIVE,
HTML,
@ -17,7 +18,10 @@ from bs4.element import (
whitespace_re,
)
import html5lib
from html5lib.constants import namespaces
from html5lib.constants import (
namespaces,
prefixes,
)
from bs4.element import (
Comment,
Doctype,
@ -29,7 +33,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
except ImportError, e:
except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@ -60,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
if not isinstance(markup, unicode):
if not isinstance(markup, str):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@ -68,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
if not isinstance(original_encoding, basestring):
if not isinstance(original_encoding, str):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@ -83,18 +87,22 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
namespaceHTMLElements, self.soup)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
def __init__(self, namespaceHTMLElements, soup=None):
if soup:
self.soup = soup
else:
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
self.soup = BeautifulSoup("")
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def getFragment(self):
return treebuilder_base.TreeBuilder.getFragment(self).element
def testSerializer(self, element):
from bs4 import BeautifulSoup
rv = []
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
def serializeElement(element, indent=0):
if isinstance(element, BeautifulSoup):
pass
if isinstance(element, Doctype):
m = doctype_re.match(element)
if m:
name = m.group(1)
if m.lastindex > 1:
publicId = m.group(2) or ""
systemId = m.group(3) or m.group(4) or ""
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
(' ' * indent, name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
else:
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif isinstance(element, Comment):
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
elif isinstance(element, NavigableString):
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
if element.namespace:
name = "%s %s" % (prefixes[element.namespace],
element.name)
else:
name = element.name
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
for name, value in list(element.attrs.items()):
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
value = " ".join(value)
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2
for child in element.children:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
class AttrList(object):
def __init__(self, element):
self.element = element
@ -170,7 +229,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
if isinstance(node, basestring):
if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@ -182,10 +241,12 @@ class Element(treebuilder_base.Node):
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
node.parent = self
else:
child = node.element
node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
if not isinstance(child, str) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
@ -198,7 +259,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@ -221,6 +282,8 @@ class Element(treebuilder_base.Node):
most_recent_element=most_recent_element)
def getAttributes(self):
if isinstance(self.element, Comment):
return {}
return AttrList(self.element)
def setAttributes(self, attributes):
@ -236,7 +299,7 @@ class Element(treebuilder_base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
@ -248,11 +311,11 @@ class Element(treebuilder_base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
text = TextNode(self.soup.new_string(data), self.soup)
self.insertBefore(data, insertBefore)
self.insertBefore(text, insertBefore)
else:
self.appendChild(data)
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@ -274,6 +337,7 @@ class Element(treebuilder_base.Node):
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@ -292,7 +356,6 @@ class Element(treebuilder_base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
append_after = new_parent_element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
@ -309,12 +372,19 @@ class Element(treebuilder_base.Node):
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
# Fix the last child's next_element and next_sibling
last_child = to_append[-1]
last_child.next_element = new_parents_last_descendant_next_element
# Find the very last element being moved. It is now the
# parent's last descendant. It has no .next_sibling and
# its .next_element is whatever the previous last
# descendant had.
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
new_parents_last_descendant_next_element.previous_element = last_child
last_child.next_sibling = None
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element