diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 644c9c34..82decb5d 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -13,8 +13,6 @@ jobs:
vmImage: 'Ubuntu-latest'
strategy:
matrix:
- Python27:
- python.version: '2.7'
Python36:
python.version: '3.6'
Python37:
@@ -32,7 +30,7 @@ jobs:
sudo apt-get update
sudo apt-get install ffmpeg
displayName: 'Install ffmpeg'
-
+
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
diff --git a/cleanup.py b/cleanup.py
index bb4caf98..9628d55c 100644
--- a/cleanup.py
+++ b/cleanup.py
@@ -18,7 +18,6 @@ FOLDER_STRUCTURE = {
'libs': [
'common',
'custom',
- 'py2',
'win',
],
'core': [
diff --git a/libs/py2/_yaml.pyd b/libs/py2/_yaml.pyd
deleted file mode 100644
index 3a505435..00000000
Binary files a/libs/py2/_yaml.pyd and /dev/null differ
diff --git a/libs/py2/backports/__init__.py b/libs/py2/backports/__init__.py
deleted file mode 100644
index 69e3be50..00000000
--- a/libs/py2/backports/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__path__ = __import__('pkgutil').extend_path(__path__, __name__)
diff --git a/libs/py2/backports/functools_lru_cache.py b/libs/py2/backports/functools_lru_cache.py
deleted file mode 100644
index 707c6c76..00000000
--- a/libs/py2/backports/functools_lru_cache.py
+++ /dev/null
@@ -1,184 +0,0 @@
-from __future__ import absolute_import
-
-import functools
-from collections import namedtuple
-from threading import RLock
-
-_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])
-
-
-@functools.wraps(functools.update_wrapper)
-def update_wrapper(wrapper,
- wrapped,
- assigned = functools.WRAPPER_ASSIGNMENTS,
- updated = functools.WRAPPER_UPDATES):
- """
- Patch two bugs in functools.update_wrapper.
- """
- # workaround for http://bugs.python.org/issue3445
- assigned = tuple(attr for attr in assigned if hasattr(wrapped, attr))
- wrapper = functools.update_wrapper(wrapper, wrapped, assigned, updated)
- # workaround for https://bugs.python.org/issue17482
- wrapper.__wrapped__ = wrapped
- return wrapper
-
-
-class _HashedSeq(list):
- __slots__ = 'hashvalue'
-
- def __init__(self, tup, hash=hash):
- self[:] = tup
- self.hashvalue = hash(tup)
-
- def __hash__(self):
- return self.hashvalue
-
-
-def _make_key(args, kwds, typed,
- kwd_mark=(object(),),
- fasttypes=set([int, str, frozenset, type(None)]),
- sorted=sorted, tuple=tuple, type=type, len=len):
- 'Make a cache key from optionally typed positional and keyword arguments'
- key = args
- if kwds:
- sorted_items = sorted(kwds.items())
- key += kwd_mark
- for item in sorted_items:
- key += item
- if typed:
- key += tuple(type(v) for v in args)
- if kwds:
- key += tuple(type(v) for k, v in sorted_items)
- elif len(key) == 1 and type(key[0]) in fasttypes:
- return key[0]
- return _HashedSeq(key)
-
-
-def lru_cache(maxsize=100, typed=False):
- """Least-recently-used cache decorator.
-
- If *maxsize* is set to None, the LRU features are disabled and the cache
- can grow without bound.
-
- If *typed* is True, arguments of different types will be cached separately.
- For example, f(3.0) and f(3) will be treated as distinct calls with
- distinct results.
-
- Arguments to the cached function must be hashable.
-
- View the cache statistics named tuple (hits, misses, maxsize, currsize) with
- f.cache_info(). Clear the cache and statistics with f.cache_clear().
- Access the underlying function with f.__wrapped__.
-
- See: http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used
-
- """
-
- # Users should only access the lru_cache through its public API:
- # cache_info, cache_clear, and f.__wrapped__
- # The internals of the lru_cache are encapsulated for thread safety and
- # to allow the implementation to change (including a possible C version).
-
- def decorating_function(user_function):
-
- cache = dict()
- stats = [0, 0] # make statistics updateable non-locally
- HITS, MISSES = 0, 1 # names for the stats fields
- make_key = _make_key
- cache_get = cache.get # bound method to lookup key or return None
- _len = len # localize the global len() function
- lock = RLock() # because linkedlist updates aren't threadsafe
- root = [] # root of the circular doubly linked list
- root[:] = [root, root, None, None] # initialize by pointing to self
- nonlocal_root = [root] # make updateable non-locally
- PREV, NEXT, KEY, RESULT = 0, 1, 2, 3 # names for the link fields
-
- if maxsize == 0:
-
- def wrapper(*args, **kwds):
- # no caching, just do a statistics update after a successful call
- result = user_function(*args, **kwds)
- stats[MISSES] += 1
- return result
-
- elif maxsize is None:
-
- def wrapper(*args, **kwds):
- # simple caching without ordering or size limit
- key = make_key(args, kwds, typed)
- result = cache_get(key, root) # root used here as a unique not-found sentinel
- if result is not root:
- stats[HITS] += 1
- return result
- result = user_function(*args, **kwds)
- cache[key] = result
- stats[MISSES] += 1
- return result
-
- else:
-
- def wrapper(*args, **kwds):
- # size limited caching that tracks accesses by recency
- key = make_key(args, kwds, typed) if kwds or typed else args
- with lock:
- link = cache_get(key)
- if link is not None:
- # record recent use of the key by moving it to the front of the list
- root, = nonlocal_root
- link_prev, link_next, key, result = link
- link_prev[NEXT] = link_next
- link_next[PREV] = link_prev
- last = root[PREV]
- last[NEXT] = root[PREV] = link
- link[PREV] = last
- link[NEXT] = root
- stats[HITS] += 1
- return result
- result = user_function(*args, **kwds)
- with lock:
- root, = nonlocal_root
- if key in cache:
- # getting here means that this same key was added to the
- # cache while the lock was released. since the link
- # update is already done, we need only return the
- # computed result and update the count of misses.
- pass
- elif _len(cache) >= maxsize:
- # use the old root to store the new key and result
- oldroot = root
- oldroot[KEY] = key
- oldroot[RESULT] = result
- # empty the oldest link and make it the new root
- root = nonlocal_root[0] = oldroot[NEXT]
- oldkey = root[KEY]
- root[KEY] = root[RESULT] = None
- # now update the cache dictionary for the new links
- del cache[oldkey]
- cache[key] = oldroot
- else:
- # put result in a new link at the front of the list
- last = root[PREV]
- link = [last, root, key, result]
- last[NEXT] = root[PREV] = cache[key] = link
- stats[MISSES] += 1
- return result
-
- def cache_info():
- """Report cache statistics"""
- with lock:
- return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))
-
- def cache_clear():
- """Clear the cache and cache statistics"""
- with lock:
- cache.clear()
- root = nonlocal_root[0]
- root[:] = [root, root, None, None]
- stats[:] = [0, 0]
-
- wrapper.__wrapped__ = user_function
- wrapper.cache_info = cache_info
- wrapper.cache_clear = cache_clear
- return update_wrapper(wrapper, user_function)
-
- return decorating_function
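
The file deleted above re-implements `functools.lru_cache` for Python 2. On the Python 3.6+ interpreters that remain in the CI matrix, the standard library ships the same decorator with the same public API (`cache_info()`, `cache_clear()`, `__wrapped__`), so any remaining callers can simply switch imports. A minimal sketch (the `fib` function here is only an illustration):

```python
# Python 3 stdlib equivalent of the deleted backport; same public API.
from functools import lru_cache

@lru_cache(maxsize=100)
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

fib(30)
print(fib.cache_info())  # CacheInfo(hits=28, misses=31, maxsize=100, currsize=31)
fib.cache_clear()        # clears both the cache and the statistics
```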
diff --git a/libs/py2/bs4/__init__.py b/libs/py2/bs4/__init__.py
deleted file mode 100644
index 470177fd..00000000
--- a/libs/py2/bs4/__init__.py
+++ /dev/null
@@ -1,584 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a pluggable XML or HTML parser to parse a
-(possibly invalid) document into a tree representation. Beautiful Soup
-provides methods and Pythonic idioms that make it easy to navigate,
-search, and modify the parse tree.
-
-Beautiful Soup works with Python 2.7 and up. It works better if lxml
-and/or html5lib is installed.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
-"""
-
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.6.3"
-__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
-__license__ = "MIT"
-
-__all__ = ['BeautifulSoup']
-
-import os
-import re
-import sys
-import traceback
-import warnings
-
-from .builder import builder_registry, ParserRejectedMarkup
-from .dammit import UnicodeDammit
-from .element import (
- CData,
- Comment,
- DEFAULT_OUTPUT_ENCODING,
- Declaration,
- Doctype,
- NavigableString,
- PageElement,
- ProcessingInstruction,
- ResultSet,
- SoupStrainer,
- Tag,
- )
-
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
-class BeautifulSoup(Tag):
- """
- This class defines the basic interface called by the tree builders.
-
- These methods will be called by the parser:
- reset()
- feed(markup)
-
- The tree builder may call these methods from its feed() implementation:
- handle_starttag(name, attrs) # See note about return value
- handle_endtag(name)
- handle_data(data) # Appends to the current data node
- endData(containerClass=NavigableString) # Ends the current data node
-
- No matter how complicated the underlying parser is, you should be
- able to build a tree using 'start tag' events, 'end tag' events,
- 'data' events, and "done with data" events.
-
- If you encounter an empty-element tag (aka a self-closing tag,
-    like HTML's <br> tag), call handle_starttag and then
- handle_endtag.
- """
- ROOT_TAG_NAME = u'[document]'
-
- # If the end-user gives no indication which tree builder they
- # want, look for one with these features.
- DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
- ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
-
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
- def __init__(self, markup="", features=None, builder=None,
- parse_only=None, from_encoding=None, exclude_encodings=None,
- **kwargs):
- """Constructor.
-
- :param markup: A string or a file-like object representing
- markup to be parsed.
-
- :param features: Desirable features of the parser to be used. This
- may be the name of a specific parser ("lxml", "lxml-xml",
- "html.parser", or "html5lib") or it may be the type of markup
- to be used ("html", "html5", "xml"). It's recommended that you
- name a specific parser, so that Beautiful Soup gives you the
- same results across platforms and virtual environments.
-
- :param builder: A specific TreeBuilder to use instead of looking one
- up based on `features`. You shouldn't need to use this.
-
- :param parse_only: A SoupStrainer. Only parts of the document
- matching the SoupStrainer will be considered. This is useful
- when parsing part of a document that would otherwise be too
- large to fit into memory.
-
- :param from_encoding: A string indicating the encoding of the
- document to be parsed. Pass this in if Beautiful Soup is
- guessing wrongly about the document's encoding.
-
- :param exclude_encodings: A list of strings indicating
- encodings known to be wrong. Pass this in if you don't know
- the document's encoding but you know Beautiful Soup's guess is
- wrong.
-
- :param kwargs: For backwards compatibility purposes, the
- constructor accepts certain keyword arguments used in
- Beautiful Soup 3. None of these arguments do anything in
- Beautiful Soup 4 and there's no need to actually pass keyword
- arguments into the constructor.
- """
-
- if 'convertEntities' in kwargs:
- warnings.warn(
- "BS4 does not respect the convertEntities argument to the "
- "BeautifulSoup constructor. Entities are always converted "
- "to Unicode characters.")
-
- if 'markupMassage' in kwargs:
- del kwargs['markupMassage']
- warnings.warn(
- "BS4 does not respect the markupMassage argument to the "
- "BeautifulSoup constructor. The tree builder is responsible "
- "for any necessary markup massage.")
-
- if 'smartQuotesTo' in kwargs:
- del kwargs['smartQuotesTo']
- warnings.warn(
- "BS4 does not respect the smartQuotesTo argument to the "
- "BeautifulSoup constructor. Smart quotes are always converted "
- "to Unicode characters.")
-
- if 'selfClosingTags' in kwargs:
- del kwargs['selfClosingTags']
- warnings.warn(
- "BS4 does not respect the selfClosingTags argument to the "
- "BeautifulSoup constructor. The tree builder is responsible "
- "for understanding self-closing tags.")
-
- if 'isHTML' in kwargs:
- del kwargs['isHTML']
- warnings.warn(
- "BS4 does not respect the isHTML argument to the "
- "BeautifulSoup constructor. Suggest you use "
- "features='lxml' for HTML and features='lxml-xml' for "
- "XML.")
-
- def deprecated_argument(old_name, new_name):
- if old_name in kwargs:
- warnings.warn(
- 'The "%s" argument to the BeautifulSoup constructor '
- 'has been renamed to "%s."' % (old_name, new_name))
- value = kwargs[old_name]
- del kwargs[old_name]
- return value
- return None
-
- parse_only = parse_only or deprecated_argument(
- "parseOnlyThese", "parse_only")
-
- from_encoding = from_encoding or deprecated_argument(
- "fromEncoding", "from_encoding")
-
- if from_encoding and isinstance(markup, unicode):
- warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
- from_encoding = None
-
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
-
- if builder is None:
- original_features = features
- if isinstance(features, basestring):
- features = [features]
- if features is None or len(features) == 0:
- features = self.DEFAULT_BUILDER_FEATURES
- builder_class = builder_registry.lookup(*features)
- if builder_class is None:
- raise FeatureNotFound(
- "Couldn't find a tree builder with the features you "
- "requested: %s. Do you need to install a parser library?"
- % ",".join(features))
- builder = builder_class()
- if not (original_features == builder.NAME or
- original_features in builder.ALTERNATE_NAMES):
- if builder.is_xml:
- markup_type = "XML"
- else:
- markup_type = "HTML"
-
- # This code adapted from warnings.py so that we get the same line
- # of code as our warnings.warn() call gets, even if the answer is wrong
- # (as it may be in a multithreading situation).
- caller = None
- try:
- caller = sys._getframe(1)
- except ValueError:
- pass
- if caller:
- globals = caller.f_globals
- line_number = caller.f_lineno
- else:
- globals = sys.__dict__
- line_number= 1
- filename = globals.get('__file__')
- if filename:
- fnl = filename.lower()
- if fnl.endswith((".pyc", ".pyo")):
- filename = filename[:-1]
- if filename:
- # If there is no filename at all, the user is most likely in a REPL,
- # and the warning is not necessary.
- values = dict(
- filename=filename,
- line_number=line_number,
- parser=builder.NAME,
- markup_type=markup_type
- )
- warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
-
- self.builder = builder
- self.is_xml = builder.is_xml
- self.known_xml = self.is_xml
- self.builder.soup = self
-
- self.parse_only = parse_only
-
- if hasattr(markup, 'read'): # It's a file-type object.
- markup = markup.read()
- elif len(markup) <= 256 and (
- (isinstance(markup, bytes) and not b'<' in markup)
- or (isinstance(markup, unicode) and not u'<' in markup)
- ):
- # Print out warnings for a couple beginner problems
- # involving passing non-markup to Beautiful Soup.
- # Beautiful Soup will still parse the input as markup,
- # just in case that's what the user really wants.
- if (isinstance(markup, unicode)
- and not os.path.supports_unicode_filenames):
- possible_filename = markup.encode("utf8")
- else:
- possible_filename = markup
- is_file = False
- try:
- is_file = os.path.exists(possible_filename)
- except Exception, e:
- # This is almost certainly a problem involving
- # characters not valid in filenames on this
- # system. Just let it go.
- pass
- if is_file:
- if isinstance(markup, unicode):
- markup = markup.encode("utf8")
- warnings.warn(
- '"%s" looks like a filename, not markup. You should'
- ' probably open this file and pass the filehandle into'
- ' Beautiful Soup.' % markup)
- self._check_markup_is_url(markup)
-
- for (self.markup, self.original_encoding, self.declared_html_encoding,
- self.contains_replacement_characters) in (
- self.builder.prepare_markup(
- markup, from_encoding, exclude_encodings=exclude_encodings)):
- self.reset()
- try:
- self._feed()
- break
- except ParserRejectedMarkup:
- pass
-
- # Clear out the markup and remove the builder's circular
- # reference to this object.
- self.markup = None
- self.builder.soup = None
-
- def __copy__(self):
- copy = type(self)(
- self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
- )
-
- # Although we encoded the tree to UTF-8, that may not have
- # been the encoding of the original markup. Set the copy's
- # .original_encoding to reflect the original object's
- # .original_encoding.
- copy.original_encoding = self.original_encoding
- return copy
-
- def __getstate__(self):
- # Frequently a tree builder can't be pickled.
- d = dict(self.__dict__)
- if 'builder' in d and not self.builder.picklable:
- d['builder'] = None
- return d
-
- @staticmethod
- def _check_markup_is_url(markup):
- """
- Check if markup looks like it's actually a url and raise a warning
- if so. Markup can be unicode or str (py2) / bytes (py3).
- """
- if isinstance(markup, bytes):
- space = b' '
- cant_start_with = (b"http:", b"https:")
- elif isinstance(markup, unicode):
- space = u' '
- cant_start_with = (u"http:", u"https:")
- else:
- return
-
- if any(markup.startswith(prefix) for prefix in cant_start_with):
- if not space in markup:
- if isinstance(markup, bytes):
- decoded_markup = markup.decode('utf-8', 'replace')
- else:
- decoded_markup = markup
- warnings.warn(
- '"%s" looks like a URL. Beautiful Soup is not an'
- ' HTTP client. You should probably use an HTTP client like'
- ' requests to get the document behind the URL, and feed'
- ' that document to Beautiful Soup.' % decoded_markup
- )
-
- def _feed(self):
- # Convert the document to Unicode.
- self.builder.reset()
-
- self.builder.feed(self.markup)
- # Close out any unfinished strings and close all the open tags.
- self.endData()
- while self.currentTag.name != self.ROOT_TAG_NAME:
- self.popTag()
-
- def reset(self):
- Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
- self.hidden = 1
- self.builder.reset()
- self.current_data = []
- self.currentTag = None
- self.tagStack = []
- self.preserve_whitespace_tag_stack = []
- self.pushTag(self)
-
- def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
- """Create a new tag associated with this soup."""
- kwattrs.update(attrs)
- return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
-
- def new_string(self, s, subclass=NavigableString):
- """Create a new NavigableString associated with this soup."""
- return subclass(s)
-
- def insert_before(self, successor):
- raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
-
- def insert_after(self, successor):
- raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
-
- def popTag(self):
- tag = self.tagStack.pop()
- if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
- self.preserve_whitespace_tag_stack.pop()
- #print "Pop", tag.name
- if self.tagStack:
- self.currentTag = self.tagStack[-1]
- return self.currentTag
-
- def pushTag(self, tag):
- #print "Push", tag.name
- if self.currentTag:
- self.currentTag.contents.append(tag)
- self.tagStack.append(tag)
- self.currentTag = self.tagStack[-1]
- if tag.name in self.builder.preserve_whitespace_tags:
- self.preserve_whitespace_tag_stack.append(tag)
-
- def endData(self, containerClass=NavigableString):
- if self.current_data:
- current_data = u''.join(self.current_data)
- # If whitespace is not preserved, and this string contains
- # nothing but ASCII spaces, replace it with a single space
- # or newline.
- if not self.preserve_whitespace_tag_stack:
- strippable = True
- for i in current_data:
- if i not in self.ASCII_SPACES:
- strippable = False
- break
- if strippable:
- if '\n' in current_data:
- current_data = '\n'
- else:
- current_data = ' '
-
- # Reset the data collector.
- self.current_data = []
-
- # Should we add this string to the tree at all?
- if self.parse_only and len(self.tagStack) <= 1 and \
- (not self.parse_only.text or \
- not self.parse_only.search(current_data)):
- return
-
- o = containerClass(current_data)
- self.object_was_parsed(o)
-
- def object_was_parsed(self, o, parent=None, most_recent_element=None):
- """Add an object to the parse tree."""
- parent = parent or self.currentTag
- previous_element = most_recent_element or self._most_recent_element
-
- next_element = previous_sibling = next_sibling = None
- if isinstance(o, Tag):
- next_element = o.next_element
- next_sibling = o.next_sibling
- previous_sibling = o.previous_sibling
- if not previous_element:
- previous_element = o.previous_element
-
- o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
-
- self._most_recent_element = o
- parent.contents.append(o)
-
- if parent.next_sibling:
- # This node is being inserted into an element that has
- # already been parsed. Deal with any dangling references.
- index = len(parent.contents)-1
- while index >= 0:
- if parent.contents[index] is o:
- break
- index -= 1
- else:
- raise ValueError(
- "Error building tree: supposedly %r was inserted "
- "into %r after the fact, but I don't see it!" % (
- o, parent
- )
- )
- if index == 0:
- previous_element = parent
- previous_sibling = None
- else:
- previous_element = previous_sibling = parent.contents[index-1]
- if index == len(parent.contents)-1:
- next_element = parent.next_sibling
- next_sibling = None
- else:
- next_element = next_sibling = parent.contents[index+1]
-
- o.previous_element = previous_element
- if previous_element:
- previous_element.next_element = o
- o.next_element = next_element
- if next_element:
- next_element.previous_element = o
- o.next_sibling = next_sibling
- if next_sibling:
- next_sibling.previous_sibling = o
- o.previous_sibling = previous_sibling
- if previous_sibling:
- previous_sibling.next_sibling = o
-
- def _popToTag(self, name, nsprefix=None, inclusivePop=True):
- """Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
-    stack up to but *not* including the most recent instance of
- the given tag."""
- #print "Popping to %s" % name
- if name == self.ROOT_TAG_NAME:
- # The BeautifulSoup object itself can never be popped.
- return
-
- most_recently_popped = None
-
- stack_size = len(self.tagStack)
- for i in range(stack_size - 1, 0, -1):
- t = self.tagStack[i]
- if (name == t.name and nsprefix == t.prefix):
- if inclusivePop:
- most_recently_popped = self.popTag()
- break
- most_recently_popped = self.popTag()
-
- return most_recently_popped
-
- def handle_starttag(self, name, namespace, nsprefix, attrs):
- """Push a start tag on to the stack.
-
- If this method returns None, the tag was rejected by the
- SoupStrainer. You should proceed as if the tag had not occurred
- in the document. For instance, if this was a self-closing tag,
- don't call handle_endtag.
- """
-
- # print "Start tag %s: %s" % (name, attrs)
- self.endData()
-
- if (self.parse_only and len(self.tagStack) <= 1
- and (self.parse_only.text
- or not self.parse_only.search_tag(name, attrs))):
- return None
-
- tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self._most_recent_element)
- if tag is None:
- return tag
- if self._most_recent_element:
- self._most_recent_element.next_element = tag
- self._most_recent_element = tag
- self.pushTag(tag)
- return tag
-
- def handle_endtag(self, name, nsprefix=None):
- #print "End tag: " + name
- self.endData()
- self._popToTag(name, nsprefix)
-
- def handle_data(self, data):
- self.current_data.append(data)
-
- def decode(self, pretty_print=False,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
- """Returns a string or Unicode representation of this document.
- To get Unicode, pass None for encoding."""
-
- if self.is_xml:
- # Print the XML declaration
- encoding_part = ''
- if eventual_encoding != None:
- encoding_part = ' encoding="%s"' % eventual_encoding
-        prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
- else:
- prefix = u''
- if not pretty_print:
- indent_level = None
- else:
- indent_level = 0
- return prefix + super(BeautifulSoup, self).decode(
- indent_level, eventual_encoding, formatter)
-
-# Alias to make it easier to type import: 'from bs4 import _soup'
-_s = BeautifulSoup
-_soup = BeautifulSoup
-
-class BeautifulStoneSoup(BeautifulSoup):
- """Deprecated interface to an XML parser."""
-
- def __init__(self, *args, **kwargs):
- kwargs['features'] = 'xml'
- warnings.warn(
- 'The BeautifulStoneSoup class is deprecated. Instead of using '
- 'it, pass features="xml" into the BeautifulSoup constructor.')
- super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
-
-
-class StopParsing(Exception):
- pass
-
-class FeatureNotFound(ValueError):
- pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
- import sys
- soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
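
The constructor deleted above emits `NO_PARSER_SPECIFIED_WARNING` whenever no parser is named, because the "best available" parser differs between systems and virtual environments. A minimal sketch of the explicit form its docstring recommends, using only the stdlib parser:

```python
# Naming the parser pins behavior across platforms and virtualenvs,
# which avoids NO_PARSER_SPECIFIED_WARNING entirely.
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Hello, <b>world</b></p>", features="html.parser")
print(soup.p.get_text())  # Hello, world
```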
diff --git a/libs/py2/bs4/builder/__init__.py b/libs/py2/bs4/builder/__init__.py
deleted file mode 100644
index c9e3f3d3..00000000
--- a/libs/py2/bs4/builder/__init__.py
+++ /dev/null
@@ -1,339 +0,0 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-from collections import defaultdict
-import itertools
-import sys
-from bs4.element import (
- CharsetMetaAttributeValue,
- ContentMetaAttributeValue,
- HTMLAwareEntitySubstitution,
- whitespace_re
- )
-
-__all__ = [
- 'HTMLTreeBuilder',
- 'SAXTreeBuilder',
- 'TreeBuilder',
- 'TreeBuilderRegistry',
- ]
-
-# Some useful features for a TreeBuilder to have.
-FAST = 'fast'
-PERMISSIVE = 'permissive'
-STRICT = 'strict'
-XML = 'xml'
-HTML = 'html'
-HTML_5 = 'html5'
-
-
-class TreeBuilderRegistry(object):
-
- def __init__(self):
- self.builders_for_feature = defaultdict(list)
- self.builders = []
-
- def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
- for feature in treebuilder_class.features:
- self.builders_for_feature[feature].insert(0, treebuilder_class)
- self.builders.insert(0, treebuilder_class)
-
- def lookup(self, *features):
- if len(self.builders) == 0:
- # There are no builders at all.
- return None
-
- if len(features) == 0:
- # They didn't ask for any features. Give them the most
- # recently registered builder.
- return self.builders[0]
-
- # Go down the list of features in order, and eliminate any builders
- # that don't match every feature.
- features = list(features)
- features.reverse()
- candidates = None
- candidate_set = None
- while len(features) > 0:
- feature = features.pop()
- we_have_the_feature = self.builders_for_feature.get(feature, [])
- if len(we_have_the_feature) > 0:
- if candidates is None:
- candidates = we_have_the_feature
- candidate_set = set(candidates)
- else:
- # Eliminate any candidates that don't have this feature.
- candidate_set = candidate_set.intersection(
- set(we_have_the_feature))
-
- # The only valid candidates are the ones in candidate_set.
- # Go through the original list of candidates and pick the first one
- # that's in candidate_set.
- if candidate_set is None:
- return None
- for candidate in candidates:
- if candidate in candidate_set:
- return candidate
- return None
-
-# The BeautifulSoup class will take feature lists from developers and use them
-# to look up builders in this registry.
-builder_registry = TreeBuilderRegistry()
-
-class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
-
- NAME = "[Unknown tree builder]"
- ALTERNATE_NAMES = []
- features = []
-
- is_xml = False
- picklable = False
- preserve_whitespace_tags = set()
- empty_element_tags = None # A tag will be considered an empty-element
- # tag when and only when it has no contents.
-
- # A value for these tag/attribute combinations is a space- or
- # comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
-
-
- def __init__(self):
- self.soup = None
-
- def reset(self):
- pass
-
- def can_be_empty_element(self, tag_name):
- """Might a tag with this name be an empty-element tag?
-
- The final markup may or may not actually present this tag as
- self-closing.
-
-    For instance: an HTMLBuilder does not consider a <p> tag to be
-    an empty-element tag (it's not in
-    HTMLBuilder.empty_element_tags). This means an empty <p> tag
-    will be presented as "<p></p>", not "<p/>".
-
- The default implementation has no opinion about which tags are
- empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- "" will become "", and "bar" will
- be left alone.
- """
- if self.empty_element_tags is None:
- return True
- return tag_name in self.empty_element_tags
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- return markup, None, None, False
-
- def test_fragment_to_document(self, fragment):
- """Wrap an HTML fragment to make it look like a document.
-
- Different parsers do this differently. For instance, lxml
-    introduces an empty <head> tag, and html5lib
- doesn't. Abstracting this away lets us write simple tests
- which run HTML fragments through the parser and compare the
- results against other HTML fragments.
-
- This method should not be used outside of tests.
- """
- return fragment
-
- def set_up_substitutions(self, tag):
- return False
-
- def _replace_cdata_list_attribute_values(self, tag_name, attrs):
- """Replaces class="foo bar" with class=["foo", "bar"]
-
- Modifies its input in place.
- """
- if not attrs:
- return attrs
- if self.cdata_list_attributes:
- universal = self.cdata_list_attributes.get('*', [])
- tag_specific = self.cdata_list_attributes.get(
- tag_name.lower(), None)
- for attr in attrs.keys():
- if attr in universal or (tag_specific and attr in tag_specific):
- # We have a "class"-type attribute whose string
- # value is a whitespace-separated list of
- # values. Split it into a list.
- value = attrs[attr]
- if isinstance(value, basestring):
- values = whitespace_re.split(value)
- else:
- # html5lib sometimes calls setAttributes twice
- # for the same tag when rearranging the parse
- # tree. On the second call the attribute value
- # here is already a list. If this happens,
- # leave the value alone rather than trying to
- # split it again.
- values = value
- attrs[attr] = values
- return attrs
-
-class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def close(self):
- pass
-
- def startElement(self, name, attrs):
- attrs = dict((key[1], value) for key, value in list(attrs.items()))
- #print "Start %s, %r" % (name, attrs)
- self.soup.handle_starttag(name, attrs)
-
- def endElement(self, name):
- #print "End %s" % name
- self.soup.handle_endtag(name)
-
- def startElementNS(self, nsTuple, nodeName, attrs):
- # Throw away (ns, nodeName) for now.
- self.startElement(nodeName, attrs)
-
- def endElementNS(self, nsTuple, nodeName):
- # Throw away (ns, nodeName) for now.
- self.endElement(nodeName)
- #handler.endElementNS((ns, node.nodeName), node.nodeName)
-
- def startPrefixMapping(self, prefix, nodeValue):
- # Ignore the prefix for now.
- pass
-
- def endPrefixMapping(self, prefix):
- # Ignore the prefix for now.
- # handler.endPrefixMapping(prefix)
- pass
-
- def characters(self, content):
- self.soup.handle_data(content)
-
- def startDocument(self):
- pass
-
- def endDocument(self):
- pass
-
-
-class HTMLTreeBuilder(TreeBuilder):
- """This TreeBuilder knows facts about HTML.
-
- Such as which tags are empty-element tags.
- """
-
- preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- empty_element_tags = set([
- # These are from HTML5.
- 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
- # These are from earlier versions of HTML and are removed in HTML5.
- 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
- ])
-
- # The HTML standard defines these as block-level elements. Beautiful
- # Soup does not treat these elements differently from other elements,
- # but it may do so eventually, and this information is available if
- # you need to use it.
- block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
-
- # The HTML standard defines these attributes as containing a
- # space-separated list of values, not a single value. That is,
- # class="foo bar" means that the 'class' attribute has two values,
- # 'foo' and 'bar', not the single value 'foo bar'. When we
- # encounter one of these attributes, we will parse its value into
- # a list of values if possible. Upon output, the list will be
- # converted back into a string.
- cdata_list_attributes = {
- "*" : ['class', 'accesskey', 'dropzone'],
- "a" : ['rel', 'rev'],
- "link" : ['rel', 'rev'],
- "td" : ["headers"],
- "th" : ["headers"],
- "td" : ["headers"],
- "form" : ["accept-charset"],
- "object" : ["archive"],
-
- # These are HTML5 specific, as are *.accesskey and *.dropzone above.
- "area" : ["rel"],
- "icon" : ["sizes"],
- "iframe" : ["sandbox"],
- "output" : ["for"],
- }
-
- def set_up_substitutions(self, tag):
-        # We are only interested in <meta> tags
- if tag.name != 'meta':
- return False
-
- http_equiv = tag.get('http-equiv')
- content = tag.get('content')
- charset = tag.get('charset')
-
-        # We are interested in <meta> tags that say what encoding the
-        # document was originally in. This means HTML 5-style <meta>
-        # tags that provide the "charset" attribute. It also means
-        # HTML 4-style <meta> tags that provide the "content"
-        # attribute and have "http-equiv" set to "content-type".
- #
- # In both cases we will replace the value of the appropriate
- # attribute with a standin object that can take on any
- # encoding.
- meta_encoding = None
- if charset is not None:
- # HTML 5 style:
-            # <meta charset="utf-8">
- meta_encoding = charset
- tag['charset'] = CharsetMetaAttributeValue(charset)
-
- elif (content is not None and http_equiv is not None
- and http_equiv.lower() == 'content-type'):
- # HTML 4 style:
-            # <meta http-equiv="content-type" content="text/html; charset=utf-8">
- tag['content'] = ContentMetaAttributeValue(content)
-
- return (meta_encoding is not None)
-
-def register_treebuilders_from(module):
- """Copy TreeBuilders from the given module into this module."""
- # I'm fairly sure this is not the best way to do this.
- this_module = sys.modules['bs4.builder']
- for name in module.__all__:
- obj = getattr(module, name)
-
- if issubclass(obj, TreeBuilder):
- setattr(this_module, name, obj)
- this_module.__all__.append(name)
- # Register the builder while we're at it.
- this_module.builder_registry.register(obj)
-
-class ParserRejectedMarkup(Exception):
- pass
-
-# Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want lxml
-# to take precedence over html5lib, because it's faster. And we only
-# want to use HTMLParser as a last resort.
-from . import _htmlparser
-register_treebuilders_from(_htmlparser)
-try:
- from . import _html5lib
- register_treebuilders_from(_html5lib)
-except ImportError:
- # They don't have html5lib installed.
- pass
-try:
- from . import _lxml
- register_treebuilders_from(_lxml)
-except ImportError:
- # They don't have lxml installed.
- pass
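
`TreeBuilderRegistry.lookup()` above intersects the candidate sets feature by feature and returns the most recently registered builder that matches; features no builder advertises are silently ignored. A small sketch of querying the module-level registry the way `BeautifulSoup.__init__` does with `DEFAULT_BUILDER_FEATURES`:

```python
# Look up a tree builder by feature, as BeautifulSoup.__init__ does.
from bs4.builder import builder_registry

builder_class = builder_registry.lookup('html', 'fast')
if builder_class is None:
    print("no tree builder matches; install a parser library")
else:
    print(builder_class.NAME)  # e.g. 'lxml', 'html5lib', or 'html.parser'
```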
diff --git a/libs/py2/bs4/builder/_html5lib.py b/libs/py2/bs4/builder/_html5lib.py
deleted file mode 100644
index 5f548935..00000000
--- a/libs/py2/bs4/builder/_html5lib.py
+++ /dev/null
@@ -1,426 +0,0 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-__all__ = [
- 'HTML5TreeBuilder',
- ]
-
-import warnings
-import re
-from bs4.builder import (
- PERMISSIVE,
- HTML,
- HTML_5,
- HTMLTreeBuilder,
- )
-from bs4.element import (
- NamespacedAttribute,
- whitespace_re,
-)
-import html5lib
-from html5lib.constants import (
- namespaces,
- prefixes,
- )
-from bs4.element import (
- Comment,
- Doctype,
- NavigableString,
- Tag,
- )
-
-try:
- # Pre-0.99999999
- from html5lib.treebuilders import _base as treebuilder_base
- new_html5lib = False
-except ImportError, e:
- # 0.99999999 and up
- from html5lib.treebuilders import base as treebuilder_base
- new_html5lib = True
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
-
- NAME = "html5lib"
-
- features = [NAME, PERMISSIVE, HTML_5, HTML]
-
- def prepare_markup(self, markup, user_specified_encoding,
- document_declared_encoding=None, exclude_encodings=None):
- # Store the user-specified encoding for use later on.
- self.user_specified_encoding = user_specified_encoding
-
- # document_declared_encoding and exclude_encodings aren't used
- # ATM because the html5lib TreeBuilder doesn't use
- # UnicodeDammit.
- if exclude_encodings:
- warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
- yield (markup, None, None, False)
-
- # These methods are defined by Beautiful Soup.
- def feed(self, markup):
- if self.soup.parse_only is not None:
- warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
- parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-
- extra_kwargs = dict()
- if not isinstance(markup, unicode):
- if new_html5lib:
- extra_kwargs['override_encoding'] = self.user_specified_encoding
- else:
- extra_kwargs['encoding'] = self.user_specified_encoding
- doc = parser.parse(markup, **extra_kwargs)
-
- # Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
- # We need to special-case this because html5lib sets
- # charEncoding to UTF-8 if it gets Unicode input.
- doc.original_encoding = None
- else:
- original_encoding = parser.tokenizer.stream.charEncoding[0]
- if not isinstance(original_encoding, basestring):
- # In 0.99999999 and up, the encoding is an html5lib
- # Encoding object. We want to use a string for compatibility
- # with other tree builders.
- original_encoding = original_encoding.name
- doc.original_encoding = original_encoding
-
- def create_treebuilder(self, namespaceHTMLElements):
- self.underlying_builder = TreeBuilderForHtml5lib(
- namespaceHTMLElements, self.soup)
- return self.underlying_builder
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
-
- def __init__(self, namespaceHTMLElements, soup=None):
- if soup:
- self.soup = soup
- else:
- from bs4 import BeautifulSoup
- self.soup = BeautifulSoup("", "html.parser")
- super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
- def documentClass(self):
- self.soup.reset()
- return Element(self.soup, self.soup, None)
-
- def insertDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
-
- doctype = Doctype.for_name_and_ids(name, publicId, systemId)
- self.soup.object_was_parsed(doctype)
-
- def elementClass(self, name, namespace):
- tag = self.soup.new_tag(name, namespace)
- return Element(tag, self.soup, namespace)
-
- def commentClass(self, data):
- return TextNode(Comment(data), self.soup)
-
- def fragmentClass(self):
- from bs4 import BeautifulSoup
- self.soup = BeautifulSoup("", "html.parser")
- self.soup.name = "[document_fragment]"
- return Element(self.soup, self.soup, None)
-
- def appendChild(self, node):
- # XXX This code is not covered by the BS4 tests.
- self.soup.append(node.element)
-
- def getDocument(self):
- return self.soup
-
- def getFragment(self):
- return treebuilder_base.TreeBuilder.getFragment(self).element
-
- def testSerializer(self, element):
- from bs4 import BeautifulSoup
- rv = []
- doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
-
- def serializeElement(element, indent=0):
- if isinstance(element, BeautifulSoup):
- pass
- if isinstance(element, Doctype):
- m = doctype_re.match(element)
- if m:
- name = m.group(1)
- if m.lastindex > 1:
- publicId = m.group(2) or ""
- systemId = m.group(3) or m.group(4) or ""
-                    rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
-                              (' ' * indent, name, publicId, systemId))
-                else:
-                    rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
-            else:
-                rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
- elif isinstance(element, Comment):
-                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
- elif isinstance(element, NavigableString):
- rv.append("|%s\"%s\"" % (' ' * indent, element))
- else:
- if element.namespace:
- name = "%s %s" % (prefixes[element.namespace],
- element.name)
- else:
- name = element.name
- rv.append("|%s<%s>" % (' ' * indent, name))
- if element.attrs:
- attributes = []
- for name, value in element.attrs.items():
- if isinstance(name, NamespacedAttribute):
- name = "%s %s" % (prefixes[name.namespace], name.name)
- if isinstance(value, list):
- value = " ".join(value)
- attributes.append((name, value))
-
- for name, value in sorted(attributes):
- rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
- indent += 2
- for child in element.children:
- serializeElement(child, indent)
- serializeElement(element, 0)
-
- return "\n".join(rv)
-
-class AttrList(object):
- def __init__(self, element):
- self.element = element
- self.attrs = dict(self.element.attrs)
- def __iter__(self):
- return list(self.attrs.items()).__iter__()
- def __setitem__(self, name, value):
- # If this attribute is a multi-valued attribute for this element,
- # turn its value into a list.
- list_attr = HTML5TreeBuilder.cdata_list_attributes
- if (name in list_attr['*']
- or (self.element.name in list_attr
- and name in list_attr[self.element.name])):
- # A node that is being cloned may have already undergone
- # this procedure.
- if not isinstance(value, list):
- value = whitespace_re.split(value)
- self.element[name] = value
- def items(self):
- return list(self.attrs.items())
- def keys(self):
- return list(self.attrs.keys())
- def __len__(self):
- return len(self.attrs)
- def __getitem__(self, name):
- return self.attrs[name]
- def __contains__(self, name):
- return name in list(self.attrs.keys())
-
-
-class Element(treebuilder_base.Node):
- def __init__(self, element, soup, namespace):
- treebuilder_base.Node.__init__(self, element.name)
- self.element = element
- self.soup = soup
- self.namespace = namespace
-
- def appendChild(self, node):
- string_child = child = None
- if isinstance(node, basestring):
- # Some other piece of code decided to pass in a string
- # instead of creating a TextElement object to contain the
- # string.
- string_child = child = node
- elif isinstance(node, Tag):
- # Some other piece of code decided to pass in a Tag
- # instead of creating an Element object to contain the
- # Tag.
- child = node
- elif node.element.__class__ == NavigableString:
- string_child = child = node.element
- node.parent = self
- else:
- child = node.element
- node.parent = self
-
- if not isinstance(child, basestring) and child.parent is not None:
- node.element.extract()
-
- if (string_child and self.element.contents
- and self.element.contents[-1].__class__ == NavigableString):
- # We are appending a string onto another string.
- # TODO This has O(n^2) performance, for input like
- # "aaa..."
- old_element = self.element.contents[-1]
- new_element = self.soup.new_string(old_element + string_child)
- old_element.replace_with(new_element)
- self.soup._most_recent_element = new_element
- else:
- if isinstance(node, basestring):
- # Create a brand new NavigableString from this string.
- child = self.soup.new_string(node)
-
- # Tell Beautiful Soup to act as if it parsed this element
- # immediately after the parent's last descendant. (Or
- # immediately after the parent, if it has no children.)
- if self.element.contents:
- most_recent_element = self.element._last_descendant(False)
- elif self.element.next_element is not None:
- # Something from further ahead in the parse tree is
- # being inserted into this earlier element. This is
- # very annoying because it means an expensive search
- # for the last element in the tree.
- most_recent_element = self.soup._last_descendant()
- else:
- most_recent_element = self.element
-
- self.soup.object_was_parsed(
- child, parent=self.element,
- most_recent_element=most_recent_element)
-
- def getAttributes(self):
- if isinstance(self.element, Comment):
- return {}
- return AttrList(self.element)
-
- def setAttributes(self, attributes):
-
- if attributes is not None and len(attributes) > 0:
-
- converted_attributes = []
- for name, value in list(attributes.items()):
- if isinstance(name, tuple):
- new_name = NamespacedAttribute(*name)
- del attributes[name]
- attributes[new_name] = value
-
- self.soup.builder._replace_cdata_list_attribute_values(
- self.name, attributes)
- for name, value in attributes.items():
- self.element[name] = value
-
- # The attributes may contain variables that need substitution.
- # Call set_up_substitutions manually.
- #
- # The Tag constructor called this method when the Tag was created,
- # but we just set/changed the attributes, so call it again.
- self.soup.builder.set_up_substitutions(self.element)
- attributes = property(getAttributes, setAttributes)
-
- def insertText(self, data, insertBefore=None):
- text = TextNode(self.soup.new_string(data), self.soup)
- if insertBefore:
- self.insertBefore(text, insertBefore)
- else:
- self.appendChild(text)
-
- def insertBefore(self, node, refNode):
- index = self.element.index(refNode.element)
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[index-1].__class__ == NavigableString):
- # (See comments in appendChild)
- old_node = self.element.contents[index-1]
- new_str = self.soup.new_string(old_node + node.element)
- old_node.replace_with(new_str)
- else:
- self.element.insert(index, node.element)
- node.parent = self
-
- def removeChild(self, node):
- node.element.extract()
-
- def reparentChildren(self, new_parent):
- """Move all of this tag's children into another tag."""
- # print "MOVE", self.element.contents
- # print "FROM", self.element
- # print "TO", new_parent.element
-
- element = self.element
- new_parent_element = new_parent.element
- # Determine what this tag's next_element will be once all the children
- # are removed.
- final_next_element = element.next_sibling
-
- new_parents_last_descendant = new_parent_element._last_descendant(False, False)
- if len(new_parent_element.contents) > 0:
- # The new parent already contains children. We will be
- # appending this tag's children to the end.
- new_parents_last_child = new_parent_element.contents[-1]
- new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
- else:
- # The new parent contains no children.
- new_parents_last_child = None
- new_parents_last_descendant_next_element = new_parent_element.next_element
-
- to_append = element.contents
- if len(to_append) > 0:
- # Set the first child's previous_element and previous_sibling
- # to elements within the new parent
- first_child = to_append[0]
- if new_parents_last_descendant:
- first_child.previous_element = new_parents_last_descendant
- else:
- first_child.previous_element = new_parent_element
- first_child.previous_sibling = new_parents_last_child
- if new_parents_last_descendant:
- new_parents_last_descendant.next_element = first_child
- else:
- new_parent_element.next_element = first_child
- if new_parents_last_child:
- new_parents_last_child.next_sibling = first_child
-
- # Find the very last element being moved. It is now the
- # parent's last descendant. It has no .next_sibling and
- # its .next_element is whatever the previous last
- # descendant had.
- last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
-
- last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
- if new_parents_last_descendant_next_element:
- # TODO: This code has no test coverage and I'm not sure
- # how to get html5lib to go through this path, but it's
- # just the other side of the previous line.
- new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
- last_childs_last_descendant.next_sibling = None
-
- for child in to_append:
- child.parent = new_parent_element
- new_parent_element.contents.append(child)
-
- # Now that this element has no children, change its .next_element.
- element.contents = []
- element.next_element = final_next_element
-
- # print "DONE WITH MOVE"
- # print "FROM", self.element
- # print "TO", new_parent_element
-
- def cloneNode(self):
- tag = self.soup.new_tag(self.element.name, self.namespace)
- node = Element(tag, self.soup, self.namespace)
- for key,value in self.attributes:
- node.attributes[key] = value
- return node
-
- def hasContent(self):
- return self.element.contents
-
- def getNameTuple(self):
- if self.namespace == None:
- return namespaces["html"], self.name
- else:
- return self.namespace, self.name
-
- nameTuple = property(getNameTuple)
-
-class TextNode(Element):
- def __init__(self, element, soup):
- treebuilder_base.Node.__init__(self, None)
- self.element = element
- self.soup = soup
-
- def cloneNode(self):
- raise NotImplementedError
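
The deleted `HTML5TreeBuilder` registers the features `['html5lib', 'permissive', 'html5', 'html']`, so `features="html5lib"` selects it when html5lib is importable (and raises `FeatureNotFound` when it is not). A sketch, assuming html5lib is installed:

```python
# html5lib implements the HTML5 parsing algorithm, so it repairs this
# fragment the way a browser would (adding <html>, <head>, <body>, tbody).
from bs4 import BeautifulSoup

soup = BeautifulSoup("<table><tr><td>cell", features="html5lib")
print(soup.body.table.td.get_text())  # cell
```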
diff --git a/libs/py2/bs4/builder/_htmlparser.py b/libs/py2/bs4/builder/_htmlparser.py
deleted file mode 100644
index ee6c685d..00000000
--- a/libs/py2/bs4/builder/_htmlparser.py
+++ /dev/null
@@ -1,347 +0,0 @@
-# encoding: utf-8
-"""Use the HTMLParser library to parse HTML files that aren't too bad."""
-
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-__all__ = [
- 'HTMLParserTreeBuilder',
- ]
-
-from HTMLParser import HTMLParser
-
-try:
- from HTMLParser import HTMLParseError
-except ImportError, e:
- # HTMLParseError is removed in Python 3.5. Since it can never be
- # thrown in 3.5, we can just define our own class as a placeholder.
- class HTMLParseError(Exception):
- pass
-
-import sys
-import warnings
-
-# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
-# argument, which we'd like to set to False. Unfortunately,
-# http://bugs.python.org/issue13273 makes strict=True a better bet
-# before Python 3.2.3.
-#
-# At the end of this file, we monkeypatch HTMLParser so that
-# strict=True works well on Python 3.2.2.
-major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
-CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
-CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
-
-
-from bs4.element import (
- CData,
- Comment,
- Declaration,
- Doctype,
- ProcessingInstruction,
- )
-from bs4.dammit import EntitySubstitution, UnicodeDammit
-
-from bs4.builder import (
- HTML,
- HTMLTreeBuilder,
- STRICT,
- )
-
-
-HTMLPARSER = 'html.parser'
-
-class BeautifulSoupHTMLParser(HTMLParser):
-
- def __init__(self, *args, **kwargs):
- HTMLParser.__init__(self, *args, **kwargs)
-
- # Keep a list of empty-element tags that were encountered
- # without an explicit closing tag. If we encounter a closing tag
- # of this type, we'll associate it with one of those entries.
- #
- # This isn't a stack because we don't care about the
- # order. It's a list of closing tags we've already handled and
- # will ignore, assuming they ever show up.
- self.already_closed_empty_element = []
-
- def error(self, msg):
- """In Python 3, HTMLParser subclasses must implement error(), although this
- requirement doesn't appear to be documented.
-
- In Python 2, HTMLParser implements error() as raising an exception.
-
- In any event, this method is called only on very strange markup and our best strategy
- is to pretend it didn't happen and keep going.
- """
- warnings.warn(msg)
-
- def handle_startendtag(self, name, attrs):
- # This is only called when the markup looks like
-        # <tag/>.
-
- # is_startend() tells handle_starttag not to close the tag
- # just because its name matches a known empty-element tag. We
- # know that this is an empty-element tag and we want to call
- # handle_endtag ourselves.
- tag = self.handle_starttag(name, attrs, handle_empty_element=False)
- self.handle_endtag(name)
-
- def handle_starttag(self, name, attrs, handle_empty_element=True):
- # XXX namespace
- attr_dict = {}
- for key, value in attrs:
- # Change None attribute values to the empty string
- # for consistency with the other tree builders.
- if value is None:
- value = ''
- attr_dict[key] = value
- attrvalue = '""'
- #print "START", name
- tag = self.soup.handle_starttag(name, None, None, attr_dict)
- if tag and tag.is_empty_element and handle_empty_element:
- # Unlike other parsers, html.parser doesn't send separate end tag
- # events for empty-element tags. (It's handled in
- # handle_startendtag, but only if the original markup looked like
-            # <tag/>.)
- #
- # So we need to call handle_endtag() ourselves. Since we
- # know the start event is identical to the end event, we
- # don't want handle_endtag() to cross off any previous end
- # events for tags of this name.
- self.handle_endtag(name, check_already_closed=False)
-
- # But we might encounter an explicit closing tag for this tag
- # later on. If so, we want to ignore it.
- self.already_closed_empty_element.append(name)
-
- def handle_endtag(self, name, check_already_closed=True):
- #print "END", name
- if check_already_closed and name in self.already_closed_empty_element:
- # This is a redundant end tag for an empty-element tag.
- # We've already called handle_endtag() for it, so just
- # check it off the list.
- # print "ALREADY CLOSED", name
- self.already_closed_empty_element.remove(name)
- else:
- self.soup.handle_endtag(name)
-
- def handle_data(self, data):
- self.soup.handle_data(data)
-
- def handle_charref(self, name):
- # XXX workaround for a bug in HTMLParser. Remove this once
- # it's fixed in all supported versions.
- # http://bugs.python.org/issue13633
- if name.startswith('x'):
- real_name = int(name.lstrip('x'), 16)
- elif name.startswith('X'):
- real_name = int(name.lstrip('X'), 16)
- else:
- real_name = int(name)
-
- data = None
- if real_name < 256:
- # HTML numeric entities are supposed to reference Unicode
- # code points, but sometimes they reference code points in
-            # some other encoding (ahem, Windows-1252). E.g. &#147;
-            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
- # code tries to detect this situation and compensate.
- for encoding in (self.soup.original_encoding, 'windows-1252'):
- if not encoding:
- continue
- try:
- data = bytearray([real_name]).decode(encoding)
- except UnicodeDecodeError, e:
- pass
- if not data:
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- pass
- data = data or u"\N{REPLACEMENT CHARACTER}"
- self.handle_data(data)
-
- def handle_entityref(self, name):
- character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
- if character is not None:
- data = character
- else:
- # If this were XML, it would be ambiguous whether "&foo"
- # was an character entity reference with a missing
- # semicolon or the literal string "&foo". Since this is
- # HTML, we have a complete list of all character entity references,
- # and this one wasn't found, so assume it's the literal string "&foo".
- data = "&%s" % name
- self.handle_data(data)
-
- def handle_comment(self, data):
- self.soup.endData()
- self.soup.handle_data(data)
- self.soup.endData(Comment)
-
- def handle_decl(self, data):
- self.soup.endData()
- if data.startswith("DOCTYPE "):
- data = data[len("DOCTYPE "):]
- elif data == 'DOCTYPE':
- # i.e. ""
- data = ''
- self.soup.handle_data(data)
- self.soup.endData(Doctype)
-
- def unknown_decl(self, data):
- if data.upper().startswith('CDATA['):
- cls = CData
- data = data[len('CDATA['):]
- else:
- cls = Declaration
- self.soup.endData()
- self.soup.handle_data(data)
- self.soup.endData(cls)
-
- def handle_pi(self, data):
- self.soup.endData()
- self.soup.handle_data(data)
- self.soup.endData(ProcessingInstruction)
-
-
-class HTMLParserTreeBuilder(HTMLTreeBuilder):
-
- is_xml = False
- picklable = True
- NAME = HTMLPARSER
- features = [NAME, HTML, STRICT]
-
- def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
- kwargs['strict'] = False
- if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
- kwargs['convert_charrefs'] = False
- self.parser_args = (args, kwargs)
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None, exclude_encodings=None):
- """
- :return: A 4-tuple (markup, original encoding, encoding
- declared within markup, whether any characters had to be
- replaced with REPLACEMENT CHARACTER).
- """
- if isinstance(markup, unicode):
- yield (markup, None, None, False)
- return
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True,
- exclude_encodings=exclude_encodings)
- yield (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
-
- def feed(self, markup):
- args, kwargs = self.parser_args
- parser = BeautifulSoupHTMLParser(*args, **kwargs)
- parser.soup = self.soup
- try:
- parser.feed(markup)
- parser.close()
- except HTMLParseError, e:
- warnings.warn(RuntimeWarning(
- "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
- raise e
- parser.already_closed_empty_element = []
-
-# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
-# 3.2.3 code. This ensures they don't treat start-tag markup with unusual
-# attribute values as a string.
-#
-# XXX This code can be removed once most Python 3 users are on 3.2.3.
-if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
- import re
- attrfind_tolerant = re.compile(
- r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
- HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
-
- locatestarttagend = re.compile(r"""
- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
- (?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
- )
- )?
- )
- )*
- \s* # trailing whitespace
-""", re.VERBOSE)
- BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
-
- from html.parser import tagfind, attrfind
-
- def parse_starttag(self, i):
- self.__starttag_text = None
- endpos = self.check_for_whole_start_tag(i)
- if endpos < 0:
- return endpos
- rawdata = self.rawdata
- self.__starttag_text = rawdata[i:endpos]
-
- # Now parse the data between i+1 and j into a tag and attrs
- attrs = []
- match = tagfind.match(rawdata, i+1)
- assert match, 'unexpected call to parse_starttag()'
- k = match.end()
- self.lasttag = tag = rawdata[i+1:k].lower()
- while k < endpos:
- if self.strict:
- m = attrfind.match(rawdata, k)
- else:
- m = attrfind_tolerant.match(rawdata, k)
- if not m:
- break
- attrname, rest, attrvalue = m.group(1, 2, 3)
- if not rest:
- attrvalue = None
- elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
- attrvalue[:1] == '"' == attrvalue[-1:]:
- attrvalue = attrvalue[1:-1]
- if attrvalue:
- attrvalue = self.unescape(attrvalue)
- attrs.append((attrname.lower(), attrvalue))
- k = m.end()
-
- end = rawdata[k:endpos].strip()
- if end not in (">", "/>"):
- lineno, offset = self.getpos()
- if "\n" in self.__starttag_text:
- lineno = lineno + self.__starttag_text.count("\n")
- offset = len(self.__starttag_text) \
- - self.__starttag_text.rfind("\n")
- else:
- offset = offset + len(self.__starttag_text)
- if self.strict:
- self.error("junk characters in start tag: %r"
- % (rawdata[k:endpos][:20],))
- self.handle_data(rawdata[i:endpos])
- return endpos
- if end.endswith('/>'):
- # XHTML-style empty tag:
- self.handle_startendtag(tag, attrs)
- else:
- self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
- return endpos
-
- def set_cdata_mode(self, elem):
- self.cdata_elem = elem.lower()
- self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
-
- BeautifulSoupHTMLParser.parse_starttag = parse_starttag
- BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
-
- CONSTRUCTOR_TAKES_STRICT = True
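
The handle_charref() workaround deleted above is the heart of this tree builder: numeric references below 256 are first tried as Windows-1252 bytes before falling back to the Unicode code point. A minimal Python 3 sketch of that logic, kept here for reference once the py2 copy is gone (resolve_charref and doc_encoding are illustrative names, not bs4 API):

    def resolve_charref(name, doc_encoding=None):
        # Decode a numeric character reference such as '147' or 'x201C'.
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        if codepoint < 256:
            # References below 256 sometimes name Windows-1252 bytes
            # rather than Unicode code points; try that reading first.
            for encoding in (doc_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    return bytes([codepoint]).decode(encoding)
                except UnicodeDecodeError:
                    pass
        try:
            return chr(codepoint)
        except (ValueError, OverflowError):
            return "\N{REPLACEMENT CHARACTER}"

    assert resolve_charref('147') == '\u201c'  # &#147; read as Windows-1252
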
diff --git a/libs/py2/bs4/builder/_lxml.py b/libs/py2/bs4/builder/_lxml.py
deleted file mode 100644
index 4a0f7de4..00000000
--- a/libs/py2/bs4/builder/_lxml.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-__all__ = [
- 'LXMLTreeBuilderForXML',
- 'LXMLTreeBuilder',
- ]
-
-try:
- from collections.abc import Callable # Python 3.6
-except ImportError, e:
- from collections import Callable
-
-from io import BytesIO
-from StringIO import StringIO
-from lxml import etree
-from bs4.element import (
- Comment,
- Doctype,
- NamespacedAttribute,
- ProcessingInstruction,
- XMLProcessingInstruction,
-)
-from bs4.builder import (
- FAST,
- HTML,
- HTMLTreeBuilder,
- PERMISSIVE,
- ParserRejectedMarkup,
- TreeBuilder,
- XML)
-from bs4.dammit import EncodingDetector
-
-LXML = 'lxml'
-
-class LXMLTreeBuilderForXML(TreeBuilder):
- DEFAULT_PARSER_CLASS = etree.XMLParser
-
- is_xml = True
- processing_instruction_class = XMLProcessingInstruction
-
- NAME = "lxml-xml"
- ALTERNATE_NAMES = ["xml"]
-
- # Well, it's permissive by XML parser standards.
- features = [NAME, LXML, XML, FAST, PERMISSIVE]
-
- CHUNK_SIZE = 512
-
- # This namespace mapping is specified in the XML Namespace
- # standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
-
- def default_parser(self, encoding):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
- if self._default_parser is not None:
- return self._default_parser
- return etree.XMLParser(
- target=self, strip_cdata=False, recover=True, encoding=encoding)
-
- def parser_for(self, encoding):
- # Use the default parser.
- parser = self.default_parser(encoding)
-
- if isinstance(parser, Callable):
- # Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False, encoding=encoding)
- return parser
-
- def __init__(self, parser=None, empty_element_tags=None):
- # TODO: Issue a warning if parser is present but not a
- # callable, since that means there's no way to create new
- # parsers for different encodings.
- self._default_parser = parser
- if empty_element_tags is not None:
- self.empty_element_tags = set(empty_element_tags)
- self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
- def _getNsTag(self, tag):
- # Split the namespace URL out of a fully-qualified lxml tag
- # name. Copied from lxml's src/lxml/sax.py.
- if tag[0] == '{':
- return tuple(tag[1:].split('}', 1))
- else:
- return (None, tag)
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- exclude_encodings=None,
- document_declared_encoding=None):
- """
- :yield: A series of 4-tuples.
- (markup, encoding, declared encoding,
- has undergone character replacement)
-
- Each 4-tuple represents a strategy for parsing the document.
- """
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
- is_html = not self.is_xml
- if is_html:
- self.processing_instruction_class = ProcessingInstruction
- else:
- self.processing_instruction_class = XMLProcessingInstruction
-
- if isinstance(markup, unicode):
- # We were given Unicode. Maybe lxml can parse Unicode on
- # this system?
- yield markup, None, document_declared_encoding, False
-
- if isinstance(markup, unicode):
- # No, apparently not. Convert the Unicode to UTF-8 and
- # tell lxml to parse it as UTF-8.
- yield (markup.encode("utf8"), "utf8",
- document_declared_encoding, False)
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- detector = EncodingDetector(
- markup, try_encodings, is_html, exclude_encodings)
- for encoding in detector.encodings:
- yield (detector.markup, encoding, document_declared_encoding, False)
-
- def feed(self, markup):
- if isinstance(markup, bytes):
- markup = BytesIO(markup)
- elif isinstance(markup, unicode):
- markup = StringIO(markup)
-
- # Call feed() at least once, even if the markup is empty,
- # or the parser won't be initialized.
- data = markup.read(self.CHUNK_SIZE)
- try:
- self.parser = self.parser_for(self.soup.original_encoding)
- self.parser.feed(data)
- while len(data) != 0:
- # Now call feed() on the rest of the data, chunk by chunk.
- data = markup.read(self.CHUNK_SIZE)
- if len(data) != 0:
- self.parser.feed(data)
- self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
- raise ParserRejectedMarkup(str(e))
-
- def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
- def start(self, name, attrs, nsmap={}):
- # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
- attrs = dict(attrs)
- nsprefix = None
- # Invert each namespace map as it comes in.
- if len(nsmap) == 0 and len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
- elif len(nsmap) > 0:
- # A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
- # Also treat the namespace mapping as a set of attributes on the
- # tag, so we can recreate it later.
- attrs = attrs.copy()
- for prefix, namespace in nsmap.items():
- attribute = NamespacedAttribute(
- "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
- attrs[attribute] = namespace
-
- # Namespaces are in play. Find any attributes that came in
- # from lxml with namespaces attached to their names, and
- # turn them into NamespacedAttribute objects.
- new_attrs = {}
- for attr, value in attrs.items():
- namespace, attr = self._getNsTag(attr)
- if namespace is None:
- new_attrs[attr] = value
- else:
- nsprefix = self._prefix_for_namespace(namespace)
- attr = NamespacedAttribute(nsprefix, attr, namespace)
- new_attrs[attr] = value
- attrs = new_attrs
-
- namespace, name = self._getNsTag(name)
- nsprefix = self._prefix_for_namespace(namespace)
- self.soup.handle_starttag(name, namespace, nsprefix, attrs)
-
- def _prefix_for_namespace(self, namespace):
- """Find the currently active prefix for the given namespace."""
- if namespace is None:
- return None
- for inverted_nsmap in reversed(self.nsmaps):
- if inverted_nsmap is not None and namespace in inverted_nsmap:
- return inverted_nsmap[namespace]
- return None
-
- def end(self, name):
- self.soup.endData()
- completed_tag = self.soup.tagStack[-1]
- namespace, name = self._getNsTag(name)
- nsprefix = None
- if namespace is not None:
- for inverted_nsmap in reversed(self.nsmaps):
- if inverted_nsmap is not None and namespace in inverted_nsmap:
- nsprefix = inverted_nsmap[namespace]
- break
- self.soup.handle_endtag(name, nsprefix)
- if len(self.nsmaps) > 1:
- # This tag, or one of its parents, introduced a namespace
- # mapping, so pop it off the stack.
- self.nsmaps.pop()
-
- def pi(self, target, data):
- self.soup.endData()
- self.soup.handle_data(target + ' ' + data)
- self.soup.endData(self.processing_instruction_class)
-
- def data(self, content):
- self.soup.handle_data(content)
-
- def doctype(self, name, pubid, system):
- self.soup.endData()
- doctype = Doctype.for_name_and_ids(name, pubid, system)
- self.soup.object_was_parsed(doctype)
-
- def comment(self, content):
- "Handle comments as Comment objects."
- self.soup.endData()
- self.soup.handle_data(content)
- self.soup.endData(Comment)
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
-
-
-class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
-
- NAME = LXML
- ALTERNATE_NAMES = ["lxml-html"]
-
- features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
- is_xml = False
- processing_instruction_class = ProcessingInstruction
-
- def default_parser(self, encoding):
- return etree.HTMLParser
-
- def feed(self, markup):
- encoding = self.soup.original_encoding
- try:
- self.parser = self.parser_for(encoding)
- self.parser.feed(markup)
- self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
- raise ParserRejectedMarkup(str(e))
-
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
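
Most of the churn above is the namespace bookkeeping in start() and end(): every element that declares namespaces pushes an inverted {uri: prefix} map, and prefix lookups walk the stack innermost-first. A simplified, self-contained Python 3 sketch of that stack discipline (NamespaceStack is an illustrative name, not bs4 API; unlike the original, it pushes a placeholder for every scope so push and pop always balance):

    class NamespaceStack:
        def __init__(self):
            # The xml prefix is predeclared by the XML Namespaces spec.
            self._maps = [{'http://www.w3.org/XML/1998/namespace': 'xml'}]

        def push(self, nsmap):
            # lxml hands start() a {prefix: uri} mapping; store it inverted,
            # or None when the element declares no namespaces.
            if nsmap:
                self._maps.append({uri: prefix for prefix, uri in nsmap.items()})
            else:
                self._maps.append(None)

        def pop(self):
            self._maps.pop()

        def prefix_for(self, uri):
            # Innermost scope wins, as in _prefix_for_namespace() above.
            for inverted in reversed(self._maps):
                if inverted and uri in inverted:
                    return inverted[uri]
            return None

    stack = NamespaceStack()
    stack.push({'dc': 'http://purl.org/dc/elements/1.1/'})
    assert stack.prefix_for('http://purl.org/dc/elements/1.1/') == 'dc'
    stack.pop()
    assert stack.prefix_for('http://purl.org/dc/elements/1.1/') is None
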
diff --git a/libs/py2/bs4/dammit.py b/libs/py2/bs4/dammit.py
deleted file mode 100644
index be46b394..00000000
--- a/libs/py2/bs4/dammit.py
+++ /dev/null
@@ -1,842 +0,0 @@
-# -*- coding: utf-8 -*-
-"""Beautiful Soup bonus library: Unicode, Dammit
-
-This library converts a bytestream to Unicode through any means
-necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and HTML, but it does not rewrite the
-XML or HTML to reflect a new encoding; that's the tree builder's job.
-"""
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-__license__ = "MIT"
-
-import codecs
-from htmlentitydefs import codepoint2name
-import re
-import logging
-import string
-
-# Import a library to autodetect character encodings.
-chardet_type = None
-try:
- # First try the fast C implementation.
- # PyPI package: cchardet
- import cchardet
- def chardet_dammit(s):
- return cchardet.detect(s)['encoding']
-except ImportError:
- try:
- # Fall back to the pure Python implementation
- # Debian package: python-chardet
- # PyPI package: chardet
- import chardet
- def chardet_dammit(s):
- return chardet.detect(s)['encoding']
- #import chardet.constants
- #chardet.constants._debug = 1
- except ImportError:
- # No chardet available.
- def chardet_dammit(s):
- return None
-
-# Available from http://cjkpython.i18n.org/.
-try:
- import iconv_codec
-except ImportError:
- pass
-
-xml_encoding_re = re.compile(
- '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
- '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
-
-class EntitySubstitution(object):
-
- """Substitute XML or HTML entities for the corresponding characters."""
-
- def _populate_class_variables():
- lookup = {}
- reverse_lookup = {}
- characters_for_re = []
- for codepoint, name in list(codepoint2name.items()):
- character = unichr(codepoint)
- if codepoint != 34:
- # There's no point in turning the quotation mark into
- # ", unless it happens within an attribute value, which
- # is handled elsewhere.
- characters_for_re.append(character)
- lookup[character] = name
- # But we do want to turn &quot; into the quotation mark.
- reverse_lookup[name] = character
- re_definition = "[%s]" % "".join(characters_for_re)
- return lookup, reverse_lookup, re.compile(re_definition)
- (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
- CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
-
- CHARACTER_TO_XML_ENTITY = {
- "'": "apos",
- '"': "quot",
- "&": "amp",
- "<": "lt",
- ">": "gt",
- }
-
- BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
- ")")
-
- AMPERSAND_OR_BRACKET = re.compile("([<>&])")
-
- @classmethod
- def _substitute_html_entity(cls, matchobj):
- entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
- return "&%s;" % entity
-
- @classmethod
- def _substitute_xml_entity(cls, matchobj):
- """Used with a regular expression to substitute the
- appropriate XML entity for an XML special character."""
- entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
- return "&%s;" % entity
-
- @classmethod
- def quoted_attribute_value(self, value):
- """Make a value into a quoted XML attribute, possibly escaping it.
-
- Most strings will be quoted using double quotes.
-
- Bob's Bar -> "Bob's Bar"
-
- If a string contains double quotes, it will be quoted using
- single quotes.
-
- Welcome to "my bar" -> 'Welcome to "my bar"'
-
- If a string contains both single and double quotes, the
- double quotes will be escaped, and the string will be quoted
- using double quotes.
-
- Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
- """
- quote_with = '"'
- if '"' in value:
- if "'" in value:
- # The string contains both single and double
- # quotes. Turn the double quotes into
- # entities. We quote the double quotes rather than
- # the single quotes because the entity name is
- # """ whether this is HTML or XML. If we
- # quoted the single quotes, we'd have to decide
- # between ' and &squot;.
- replace_with = """
- value = value.replace('"', replace_with)
- else:
- # There are double quotes but no single quotes.
- # We can use single quotes to quote the attribute.
- quote_with = "'"
- return quote_with + value + quote_with
-
- @classmethod
- def substitute_xml(cls, value, make_quoted_attribute=False):
- """Substitute XML entities for special XML characters.
-
- :param value: A string to be substituted. The less-than sign
- will become &lt;, the greater-than sign will become &gt;,
- and any ampersands will become &amp;. If you want ampersands
- that appear to be part of an entity definition to be left
- alone, use substitute_xml_containing_entities() instead.
-
- :param make_quoted_attribute: If True, then the string will be
- quoted, as befits an attribute value.
- """
- # Escape angle brackets and ampersands.
- value = cls.AMPERSAND_OR_BRACKET.sub(
- cls._substitute_xml_entity, value)
-
- if make_quoted_attribute:
- value = cls.quoted_attribute_value(value)
- return value
-
- @classmethod
- def substitute_xml_containing_entities(
- cls, value, make_quoted_attribute=False):
- """Substitute XML entities for special XML characters.
-
- :param value: A string to be substituted. The less-than sign will
- become &lt;, the greater-than sign will become &gt;, and any
- ampersands that are not part of an entity definition will
- become &amp;.
-
- :param make_quoted_attribute: If True, then the string will be
- quoted, as befits an attribute value.
- """
- # Escape angle brackets, and ampersands that aren't part of
- # entities.
- value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
- cls._substitute_xml_entity, value)
-
- if make_quoted_attribute:
- value = cls.quoted_attribute_value(value)
- return value
-
- @classmethod
- def substitute_html(cls, s):
- """Replace certain Unicode characters with named HTML entities.
-
- This differs from data.encode(encoding, 'xmlcharrefreplace')
- in that the goal is to make the result more readable (to those
- with ASCII displays) rather than to recover from
- errors. There's absolutely nothing wrong with a UTF-8 string
- containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
- character with "&eacute;" will make it more readable to some
- people.
- """
- return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
- cls._substitute_html_entity, s)
-
-
-class EncodingDetector:
- """Suggests a number of possible encodings for a bytestring.
-
- Order of precedence:
-
- 1. Encodings you specifically tell EncodingDetector to try first
- (the override_encodings argument to the constructor).
-
- 2. An encoding declared within the bytestring itself, either in an
- XML declaration (if the bytestring is to be interpreted as an XML
- document), or in a tag (if the bytestring is to be
- interpreted as an HTML document.)
-
- 3. An encoding detected through textual analysis by chardet,
- cchardet, or a similar external library.
-
- 4. UTF-8.
-
- 5. Windows-1252.
- """
- def __init__(self, markup, override_encodings=None, is_html=False,
- exclude_encodings=None):
- self.override_encodings = override_encodings or []
- exclude_encodings = exclude_encodings or []
- self.exclude_encodings = set([x.lower() for x in exclude_encodings])
- self.chardet_encoding = None
- self.is_html = is_html
- self.declared_encoding = None
-
- # First order of business: strip a byte-order mark.
- self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
-
- def _usable(self, encoding, tried):
- if encoding is not None:
- encoding = encoding.lower()
- if encoding in self.exclude_encodings:
- return False
- if encoding not in tried:
- tried.add(encoding)
- return True
- return False
-
- @property
- def encodings(self):
- """Yield a number of encodings that might work for this markup."""
- tried = set()
- for e in self.override_encodings:
- if self._usable(e, tried):
- yield e
-
- # Did the document originally start with a byte-order mark
- # that indicated its encoding?
- if self._usable(self.sniffed_encoding, tried):
- yield self.sniffed_encoding
-
- # Look within the document for an XML or HTML encoding
- # declaration.
- if self.declared_encoding is None:
- self.declared_encoding = self.find_declared_encoding(
- self.markup, self.is_html)
- if self._usable(self.declared_encoding, tried):
- yield self.declared_encoding
-
- # Use third-party character set detection to guess at the
- # encoding.
- if self.chardet_encoding is None:
- self.chardet_encoding = chardet_dammit(self.markup)
- if self._usable(self.chardet_encoding, tried):
- yield self.chardet_encoding
-
- # As a last-ditch effort, try utf-8 and windows-1252.
- for e in ('utf-8', 'windows-1252'):
- if self._usable(e, tried):
- yield e
-
- @classmethod
- def strip_byte_order_mark(cls, data):
- """If a byte-order mark is present, strip it and return the encoding it implies."""
- encoding = None
- if isinstance(data, unicode):
- # Unicode data cannot have a byte-order mark.
- return data, encoding
- if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == b'\xef\xbb\xbf':
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == b'\x00\x00\xfe\xff':
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == b'\xff\xfe\x00\x00':
- encoding = 'utf-32le'
- data = data[4:]
- return data, encoding
-
- @classmethod
- def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
- """Given a document, tries to find its declared encoding.
-
- An XML encoding is declared at the beginning of the document.
-
- An HTML encoding is declared in a <meta> tag, hopefully near the
- beginning of the document.
- """
- if search_entire_document:
- xml_endpos = html_endpos = len(markup)
- else:
- xml_endpos = 1024
- html_endpos = max(2048, int(len(markup) * 0.05))
-
- declared_encoding = None
- declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
- if not declared_encoding_match and is_html:
- declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
- if declared_encoding_match is not None:
- declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii', 'replace')
- if declared_encoding:
- return declared_encoding.lower()
- return None
-
-class UnicodeDammit:
- """A class for detecting the encoding of a *ML document and
- converting it to a Unicode string. If the source encoding is
- windows-1252, can replace MS smart quotes with their HTML or XML
- equivalents."""
-
- # This dictionary maps commonly seen values for "charset" in HTML
- # meta tags to the corresponding Python codec names. It only covers
- # values that aren't in Python's aliases and can't be determined
- # by the heuristics in find_codec.
- CHARSET_ALIASES = {"macintosh": "mac-roman",
- "x-sjis": "shift-jis"}
-
- ENCODINGS_WITH_SMART_QUOTES = [
- "windows-1252",
- "iso-8859-1",
- "iso-8859-2",
- ]
-
- def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False, exclude_encodings=[]):
- self.smart_quotes_to = smart_quotes_to
- self.tried_encodings = []
- self.contains_replacement_characters = False
- self.is_html = is_html
- self.log = logging.getLogger(__name__)
- self.detector = EncodingDetector(
- markup, override_encodings, is_html, exclude_encodings)
-
- # Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
- self.markup = markup
- self.unicode_markup = unicode(markup)
- self.original_encoding = None
- return
-
- # The encoding detector may have stripped a byte-order mark.
- # Use the stripped markup from this point on.
- self.markup = self.detector.markup
-
- u = None
- for encoding in self.detector.encodings:
- markup = self.detector.markup
- u = self._convert_from(encoding)
- if u is not None:
- break
-
- if not u:
- # None of the encodings worked. As an absolute last resort,
- # try them again with character replacement.
-
- for encoding in self.detector.encodings:
- if encoding != "ascii":
- u = self._convert_from(encoding, "replace")
- if u is not None:
- self.log.warning(
- "Some characters could not be decoded, and were "
- "replaced with REPLACEMENT CHARACTER."
- )
- self.contains_replacement_characters = True
- break
-
- # If none of that worked, we could at this point force it to
- # ASCII, but that would destroy so much data that I think
- # giving up is better.
- self.unicode_markup = u
- if not u:
- self.original_encoding = None
-
- def _sub_ms_char(self, match):
- """Changes a MS smart quote character to an XML or HTML
- entity, or an ASCII character."""
- orig = match.group(1)
- if self.smart_quotes_to == 'ascii':
- sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
- else:
- sub = self.MS_CHARS.get(orig)
- if type(sub) == tuple:
- if self.smart_quotes_to == 'xml':
- sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
- else:
- sub = '&'.encode() + sub[0].encode() + ';'.encode()
- else:
- sub = sub.encode()
- return sub
-
- def _convert_from(self, proposed, errors="strict"):
- proposed = self.find_codec(proposed)
- if not proposed or (proposed, errors) in self.tried_encodings:
- return None
- self.tried_encodings.append((proposed, errors))
- markup = self.markup
- # Convert smart quotes to HTML if coming from an encoding
- # that might have them.
- if (self.smart_quotes_to is not None
- and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
- smart_quotes_re = b"([\x80-\x9f])"
- smart_quotes_compiled = re.compile(smart_quotes_re)
- markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
-
- try:
- #print "Trying to convert document to %s (errors=%s)" % (
- # proposed, errors)
- u = self._to_unicode(markup, proposed, errors)
- self.markup = u
- self.original_encoding = proposed
- except Exception as e:
- #print "That didn't work!"
- #print e
- return None
- #print "Correct encoding: %s" % proposed
- return self.markup
-
- def _to_unicode(self, data, encoding, errors="strict"):
- '''Given a string and its encoding, decodes the string into Unicode.
- %encoding is a string recognized by encodings.aliases'''
- return unicode(data, encoding, errors)
-
- @property
- def declared_html_encoding(self):
- if not self.is_html:
- return None
- return self.detector.declared_encoding
-
- def find_codec(self, charset):
- value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
- or (charset and self._codec(charset.replace("-", "")))
- or (charset and self._codec(charset.replace("-", "_")))
- or (charset and charset.lower())
- or charset
- )
- if value:
- return value.lower()
- return None
-
- def _codec(self, charset):
- if not charset:
- return charset
- codec = None
- try:
- codecs.lookup(charset)
- codec = charset
- except (LookupError, ValueError):
- pass
- return codec
-
-
- # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
- MS_CHARS = {b'\x80': ('euro', '20AC'),
- b'\x81': ' ',
- b'\x82': ('sbquo', '201A'),
- b'\x83': ('fnof', '192'),
- b'\x84': ('bdquo', '201E'),
- b'\x85': ('hellip', '2026'),
- b'\x86': ('dagger', '2020'),
- b'\x87': ('Dagger', '2021'),
- b'\x88': ('circ', '2C6'),
- b'\x89': ('permil', '2030'),
- b'\x8A': ('Scaron', '160'),
- b'\x8B': ('lsaquo', '2039'),
- b'\x8C': ('OElig', '152'),
- b'\x8D': '?',
- b'\x8E': ('#x17D', '17D'),
- b'\x8F': '?',
- b'\x90': '?',
- b'\x91': ('lsquo', '2018'),
- b'\x92': ('rsquo', '2019'),
- b'\x93': ('ldquo', '201C'),
- b'\x94': ('rdquo', '201D'),
- b'\x95': ('bull', '2022'),
- b'\x96': ('ndash', '2013'),
- b'\x97': ('mdash', '2014'),
- b'\x98': ('tilde', '2DC'),
- b'\x99': ('trade', '2122'),
- b'\x9a': ('scaron', '161'),
- b'\x9b': ('rsaquo', '203A'),
- b'\x9c': ('oelig', '153'),
- b'\x9d': '?',
- b'\x9e': ('#x17E', '17E'),
- b'\x9f': ('Yuml', ''),}
-
- # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
- # horrors like stripping diacritical marks to turn á into a, but also
- # contains non-horrors like turning “ into ".
- MS_CHARS_TO_ASCII = {
- b'\x80' : 'EUR',
- b'\x81' : ' ',
- b'\x82' : ',',
- b'\x83' : 'f',
- b'\x84' : ',,',
- b'\x85' : '...',
- b'\x86' : '+',
- b'\x87' : '++',
- b'\x88' : '^',
- b'\x89' : '%',
- b'\x8a' : 'S',
- b'\x8b' : '<',
- b'\x8c' : 'OE',
- b'\x8d' : '?',
- b'\x8e' : 'Z',
- b'\x8f' : '?',
- b'\x90' : '?',
- b'\x91' : "'",
- b'\x92' : "'",
- b'\x93' : '"',
- b'\x94' : '"',
- b'\x95' : '*',
- b'\x96' : '-',
- b'\x97' : '--',
- b'\x98' : '~',
- b'\x99' : '(TM)',
- b'\x9a' : 's',
- b'\x9b' : '>',
- b'\x9c' : 'oe',
- b'\x9d' : '?',
- b'\x9e' : 'z',
- b'\x9f' : 'Y',
- b'\xa0' : ' ',
- b'\xa1' : '!',
- b'\xa2' : 'c',
- b'\xa3' : 'GBP',
- b'\xa4' : '$', #This approximation is especially parochial--this is the
- #generic currency symbol.
- b'\xa5' : 'YEN',
- b'\xa6' : '|',
- b'\xa7' : 'S',
- b'\xa8' : '..',
- b'\xa9' : '',
- b'\xaa' : '(th)',
- b'\xab' : '<<',
- b'\xac' : '!',
- b'\xad' : ' ',
- b'\xae' : '(R)',
- b'\xaf' : '-',
- b'\xb0' : 'o',
- b'\xb1' : '+-',
- b'\xb2' : '2',
- b'\xb3' : '3',
- b'\xb4' : ("'", 'acute'),
- b'\xb5' : 'u',
- b'\xb6' : 'P',
- b'\xb7' : '*',
- b'\xb8' : ',',
- b'\xb9' : '1',
- b'\xba' : '(th)',
- b'\xbb' : '>>',
- b'\xbc' : '1/4',
- b'\xbd' : '1/2',
- b'\xbe' : '3/4',
- b'\xbf' : '?',
- b'\xc0' : 'A',
- b'\xc1' : 'A',
- b'\xc2' : 'A',
- b'\xc3' : 'A',
- b'\xc4' : 'A',
- b'\xc5' : 'A',
- b'\xc6' : 'AE',
- b'\xc7' : 'C',
- b'\xc8' : 'E',
- b'\xc9' : 'E',
- b'\xca' : 'E',
- b'\xcb' : 'E',
- b'\xcc' : 'I',
- b'\xcd' : 'I',
- b'\xce' : 'I',
- b'\xcf' : 'I',
- b'\xd0' : 'D',
- b'\xd1' : 'N',
- b'\xd2' : 'O',
- b'\xd3' : 'O',
- b'\xd4' : 'O',
- b'\xd5' : 'O',
- b'\xd6' : 'O',
- b'\xd7' : '*',
- b'\xd8' : 'O',
- b'\xd9' : 'U',
- b'\xda' : 'U',
- b'\xdb' : 'U',
- b'\xdc' : 'U',
- b'\xdd' : 'Y',
- b'\xde' : 'b',
- b'\xdf' : 'B',
- b'\xe0' : 'a',
- b'\xe1' : 'a',
- b'\xe2' : 'a',
- b'\xe3' : 'a',
- b'\xe4' : 'a',
- b'\xe5' : 'a',
- b'\xe6' : 'ae',
- b'\xe7' : 'c',
- b'\xe8' : 'e',
- b'\xe9' : 'e',
- b'\xea' : 'e',
- b'\xeb' : 'e',
- b'\xec' : 'i',
- b'\xed' : 'i',
- b'\xee' : 'i',
- b'\xef' : 'i',
- b'\xf0' : 'o',
- b'\xf1' : 'n',
- b'\xf2' : 'o',
- b'\xf3' : 'o',
- b'\xf4' : 'o',
- b'\xf5' : 'o',
- b'\xf6' : 'o',
- b'\xf7' : '/',
- b'\xf8' : 'o',
- b'\xf9' : 'u',
- b'\xfa' : 'u',
- b'\xfb' : 'u',
- b'\xfc' : 'u',
- b'\xfd' : 'y',
- b'\xfe' : 'b',
- b'\xff' : 'y',
- }
-
- # A map used when removing rogue Windows-1252/ISO-8859-1
- # characters in otherwise UTF-8 documents.
- #
- # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
- # Windows-1252.
- WINDOWS_1252_TO_UTF8 = {
- 0x80 : b'\xe2\x82\xac', # €
- 0x82 : b'\xe2\x80\x9a', # ‚
- 0x83 : b'\xc6\x92', # ƒ
- 0x84 : b'\xe2\x80\x9e', # „
- 0x85 : b'\xe2\x80\xa6', # …
- 0x86 : b'\xe2\x80\xa0', # †
- 0x87 : b'\xe2\x80\xa1', # ‡
- 0x88 : b'\xcb\x86', # ˆ
- 0x89 : b'\xe2\x80\xb0', # ‰
- 0x8a : b'\xc5\xa0', # Š
- 0x8b : b'\xe2\x80\xb9', # ‹
- 0x8c : b'\xc5\x92', # Œ
- 0x8e : b'\xc5\xbd', # Ž
- 0x91 : b'\xe2\x80\x98', # ‘
- 0x92 : b'\xe2\x80\x99', # ’
- 0x93 : b'\xe2\x80\x9c', # “
- 0x94 : b'\xe2\x80\x9d', # ”
- 0x95 : b'\xe2\x80\xa2', # •
- 0x96 : b'\xe2\x80\x93', # –
- 0x97 : b'\xe2\x80\x94', # —
- 0x98 : b'\xcb\x9c', # ˜
- 0x99 : b'\xe2\x84\xa2', # ™
- 0x9a : b'\xc5\xa1', # š
- 0x9b : b'\xe2\x80\xba', # ›
- 0x9c : b'\xc5\x93', # œ
- 0x9e : b'\xc5\xbe', # ž
- 0x9f : b'\xc5\xb8', # Ÿ
- 0xa0 : b'\xc2\xa0', #
- 0xa1 : b'\xc2\xa1', # ¡
- 0xa2 : b'\xc2\xa2', # ¢
- 0xa3 : b'\xc2\xa3', # £
- 0xa4 : b'\xc2\xa4', # ¤
- 0xa5 : b'\xc2\xa5', # ¥
- 0xa6 : b'\xc2\xa6', # ¦
- 0xa7 : b'\xc2\xa7', # §
- 0xa8 : b'\xc2\xa8', # ¨
- 0xa9 : b'\xc2\xa9', # ©
- 0xaa : b'\xc2\xaa', # ª
- 0xab : b'\xc2\xab', # «
- 0xac : b'\xc2\xac', # ¬
- 0xad : b'\xc2\xad', #
- 0xae : b'\xc2\xae', # ®
- 0xaf : b'\xc2\xaf', # ¯
- 0xb0 : b'\xc2\xb0', # °
- 0xb1 : b'\xc2\xb1', # ±
- 0xb2 : b'\xc2\xb2', # ²
- 0xb3 : b'\xc2\xb3', # ³
- 0xb4 : b'\xc2\xb4', # ´
- 0xb5 : b'\xc2\xb5', # µ
- 0xb6 : b'\xc2\xb6', # ¶
- 0xb7 : b'\xc2\xb7', # ·
- 0xb8 : b'\xc2\xb8', # ¸
- 0xb9 : b'\xc2\xb9', # ¹
- 0xba : b'\xc2\xba', # º
- 0xbb : b'\xc2\xbb', # »
- 0xbc : b'\xc2\xbc', # ¼
- 0xbd : b'\xc2\xbd', # ½
- 0xbe : b'\xc2\xbe', # ¾
- 0xbf : b'\xc2\xbf', # ¿
- 0xc0 : b'\xc3\x80', # À
- 0xc1 : b'\xc3\x81', # Á
- 0xc2 : b'\xc3\x82', # Â
- 0xc3 : b'\xc3\x83', # Ã
- 0xc4 : b'\xc3\x84', # Ä
- 0xc5 : b'\xc3\x85', # Å
- 0xc6 : b'\xc3\x86', # Æ
- 0xc7 : b'\xc3\x87', # Ç
- 0xc8 : b'\xc3\x88', # È
- 0xc9 : b'\xc3\x89', # É
- 0xca : b'\xc3\x8a', # Ê
- 0xcb : b'\xc3\x8b', # Ë
- 0xcc : b'\xc3\x8c', # Ì
- 0xcd : b'\xc3\x8d', # Í
- 0xce : b'\xc3\x8e', # Î
- 0xcf : b'\xc3\x8f', # Ï
- 0xd0 : b'\xc3\x90', # Ð
- 0xd1 : b'\xc3\x91', # Ñ
- 0xd2 : b'\xc3\x92', # Ò
- 0xd3 : b'\xc3\x93', # Ó
- 0xd4 : b'\xc3\x94', # Ô
- 0xd5 : b'\xc3\x95', # Õ
- 0xd6 : b'\xc3\x96', # Ö
- 0xd7 : b'\xc3\x97', # ×
- 0xd8 : b'\xc3\x98', # Ø
- 0xd9 : b'\xc3\x99', # Ù
- 0xda : b'\xc3\x9a', # Ú
- 0xdb : b'\xc3\x9b', # Û
- 0xdc : b'\xc3\x9c', # Ü
- 0xdd : b'\xc3\x9d', # Ý
- 0xde : b'\xc3\x9e', # Þ
- 0xdf : b'\xc3\x9f', # ß
- 0xe0 : b'\xc3\xa0', # à
- 0xe1 : b'\xc3\xa1', # á
- 0xe2 : b'\xc3\xa2', # â
- 0xe3 : b'\xc3\xa3', # ã
- 0xe4 : b'\xc3\xa4', # ä
- 0xe5 : b'\xc3\xa5', # å
- 0xe6 : b'\xc3\xa6', # æ
- 0xe7 : b'\xc3\xa7', # ç
- 0xe8 : b'\xc3\xa8', # è
- 0xe9 : b'\xc3\xa9', # é
- 0xea : b'\xc3\xaa', # ê
- 0xeb : b'\xc3\xab', # ë
- 0xec : b'\xc3\xac', # ì
- 0xed : b'\xc3\xad', # í
- 0xee : b'\xc3\xae', # î
- 0xef : b'\xc3\xaf', # ï
- 0xf0 : b'\xc3\xb0', # ð
- 0xf1 : b'\xc3\xb1', # ñ
- 0xf2 : b'\xc3\xb2', # ò
- 0xf3 : b'\xc3\xb3', # ó
- 0xf4 : b'\xc3\xb4', # ô
- 0xf5 : b'\xc3\xb5', # õ
- 0xf6 : b'\xc3\xb6', # ö
- 0xf7 : b'\xc3\xb7', # ÷
- 0xf8 : b'\xc3\xb8', # ø
- 0xf9 : b'\xc3\xb9', # ù
- 0xfa : b'\xc3\xba', # ú
- 0xfb : b'\xc3\xbb', # û
- 0xfc : b'\xc3\xbc', # ü
- 0xfd : b'\xc3\xbd', # ý
- 0xfe : b'\xc3\xbe', # þ
- }
-
- MULTIBYTE_MARKERS_AND_SIZES = [
- (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
- (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
- (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
- ]
-
- FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
- LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
-
- @classmethod
- def detwingle(cls, in_bytes, main_encoding="utf8",
- embedded_encoding="windows-1252"):
- """Fix characters from one encoding embedded in some other encoding.
-
- Currently the only situation supported is Windows-1252 (or its
- subset ISO-8859-1), embedded in UTF-8.
-
- The input must be a bytestring. If you've already converted
- the document to Unicode, you're too late.
-
- The output is a bytestring in which `embedded_encoding`
- characters have been converted to their `main_encoding`
- equivalents.
- """
- if embedded_encoding.replace('_', '-').lower() not in (
- 'windows-1252', 'windows_1252'):
- raise NotImplementedError(
- "Windows-1252 and ISO-8859-1 are the only currently supported "
- "embedded encodings.")
-
- if main_encoding.lower() not in ('utf8', 'utf-8'):
- raise NotImplementedError(
- "UTF-8 is the only currently supported main encoding.")
-
- byte_chunks = []
-
- chunk_start = 0
- pos = 0
- while pos < len(in_bytes):
- byte = in_bytes[pos]
- if not isinstance(byte, int):
- # Python 2.x
- byte = ord(byte)
- if (byte >= cls.FIRST_MULTIBYTE_MARKER
- and byte <= cls.LAST_MULTIBYTE_MARKER):
- # This is the start of a UTF-8 multibyte character. Skip
- # to the end.
- for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
- if byte >= start and byte <= end:
- pos += size
- break
- elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
- # We found a Windows-1252 character!
- # Save the string up to this point as a chunk.
- byte_chunks.append(in_bytes[chunk_start:pos])
-
- # Now translate the Windows-1252 character into UTF-8
- # and add it as another, one-byte chunk.
- byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
- pos += 1
- chunk_start = pos
- else:
- # Go on to the next character.
- pos += 1
- if chunk_start == 0:
- # The string is unchanged.
- return in_bytes
- else:
- # Store the final chunk.
- byte_chunks.append(in_bytes[chunk_start:])
- return b''.join(byte_chunks)
-
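
Deleting this copy loses nothing: detwingle() ships in bs4 proper and behaves identically under Python 3. A short usage sketch, assuming bs4 is installed (adapted from the example in the Beautiful Soup documentation):

    from bs4 import UnicodeDammit

    snowmen = '\N{SNOWMAN}' * 3
    quote = '\N{LEFT DOUBLE QUOTATION MARK}Hi!\N{RIGHT DOUBLE QUOTATION MARK}'

    # One bytestring, two encodings: UTF-8 snowmen followed by
    # Windows-1252 smart quotes.
    doc = snowmen.encode('utf8') + quote.encode('windows_1252')

    # detwingle() rewrites the Windows-1252 bytes as UTF-8, after which
    # the whole document decodes cleanly.
    fixed = UnicodeDammit.detwingle(doc)
    print(fixed.decode('utf8'))
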
diff --git a/libs/py2/bs4/diagnose.py b/libs/py2/bs4/diagnose.py
deleted file mode 100644
index 7a28c09a..00000000
--- a/libs/py2/bs4/diagnose.py
+++ /dev/null
@@ -1,225 +0,0 @@
-"""Diagnostic functions, mainly for use when doing tech support."""
-
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-__license__ = "MIT"
-
-import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
-import bs4
-from bs4 import BeautifulSoup, __version__
-from bs4.builder import builder_registry
-
-import os
-import pstats
-import random
-import tempfile
-import time
-import traceback
-import sys
-import cProfile
-
-def diagnose(data):
- """Diagnostic suite for isolating common problems."""
- print "Diagnostic running on Beautiful Soup %s" % __version__
- print "Python version %s" % sys.version
-
- basic_parsers = ["html.parser", "html5lib", "lxml"]
- for name in basic_parsers:
- for builder in builder_registry.builders:
- if name in builder.features:
- break
- else:
- basic_parsers.remove(name)
- print (
- "I noticed that %s is not installed. Installing it may help." %
- name)
-
- if 'lxml' in basic_parsers:
- basic_parsers.append("lxml-xml")
- try:
- from lxml import etree
- print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
- except ImportError, e:
- print (
- "lxml is not installed or couldn't be imported.")
-
-
- if 'html5lib' in basic_parsers:
- try:
- import html5lib
- print "Found html5lib version %s" % html5lib.__version__
- except ImportError, e:
- print (
- "html5lib is not installed or couldn't be imported.")
-
- if hasattr(data, 'read'):
- data = data.read()
- elif data.startswith("http:") or data.startswith("https:"):
- print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
- print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
- return
- else:
- try:
- if os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
- with open(data) as fp:
- data = fp.read()
- except ValueError:
- # This can happen on some platforms when the 'filename' is
- # too long. Assume it's data and not a filename.
- pass
- print
-
- for parser in basic_parsers:
- print "Trying to parse your markup with %s" % parser
- success = False
- try:
- soup = BeautifulSoup(data, features=parser)
- success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
- traceback.print_exc()
- if success:
- print "Here's what %s did with the markup:" % parser
- print soup.prettify()
-
- print "-" * 80
-
-def lxml_trace(data, html=True, **kwargs):
- """Print out the lxml events that occur during parsing.
-
- This lets you see how lxml parses a document when no Beautiful
- Soup code is running.
- """
- from lxml import etree
- for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
- print("%s, %4s, %s" % (event, element.tag, element.text))
-
-class AnnouncingParser(HTMLParser):
- """Announces HTMLParser parse events, without doing anything else."""
-
- def _p(self, s):
- print(s)
-
- def handle_starttag(self, name, attrs):
- self._p("%s START" % name)
-
- def handle_endtag(self, name):
- self._p("%s END" % name)
-
- def handle_data(self, data):
- self._p("%s DATA" % data)
-
- def handle_charref(self, name):
- self._p("%s CHARREF" % name)
-
- def handle_entityref(self, name):
- self._p("%s ENTITYREF" % name)
-
- def handle_comment(self, data):
- self._p("%s COMMENT" % data)
-
- def handle_decl(self, data):
- self._p("%s DECL" % data)
-
- def unknown_decl(self, data):
- self._p("%s UNKNOWN-DECL" % data)
-
- def handle_pi(self, data):
- self._p("%s PI" % data)
-
-def htmlparser_trace(data):
- """Print out the HTMLParser events that occur during parsing.
-
- This lets you see how HTMLParser parses a document when no
- Beautiful Soup code is running.
- """
- parser = AnnouncingParser()
- parser.feed(data)
-
-_vowels = "aeiou"
-_consonants = "bcdfghjklmnpqrstvwxyz"
-
-def rword(length=5):
- "Generate a random word-like string."
- s = ''
- for i in range(length):
- if i % 2 == 0:
- t = _consonants
- else:
- t = _vowels
- s += random.choice(t)
- return s
-
-def rsentence(length=4):
- "Generate a random sentence-like string."
- return " ".join(rword(random.randint(4,9)) for i in range(length))
-
-def rdoc(num_elements=1000):
- """Randomly generate an invalid HTML document."""
- tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
- elements = []
- for i in range(num_elements):
- choice = random.randint(0,3)
- if choice == 0:
- # New tag.
- tag_name = random.choice(tag_names)
- elements.append("<%s>" % tag_name)
- elif choice == 1:
- elements.append(rsentence(random.randint(1,4)))
- elif choice == 2:
- # Close a tag.
- tag_name = random.choice(tag_names)
- elements.append("%s>" % tag_name)
- return "" + "\n".join(elements) + ""
-
-def benchmark_parsers(num_elements=100000):
- """Very basic head-to-head performance benchmark."""
- print "Comparative parser benchmark on Beautiful Soup %s" % __version__
- data = rdoc(num_elements)
- print "Generated a large invalid HTML document (%d bytes)." % len(data)
-
- for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
- success = False
- try:
- a = time.time()
- soup = BeautifulSoup(data, parser)
- b = time.time()
- success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
- traceback.print_exc()
- if success:
- print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
-
- from lxml import etree
- a = time.time()
- etree.HTML(data)
- b = time.time()
- print "Raw lxml parsed the markup in %.2fs." % (b-a)
-
- import html5lib
- parser = html5lib.HTMLParser()
- a = time.time()
- parser.parse(data)
- b = time.time()
- print "Raw html5lib parsed the markup in %.2fs." % (b-a)
-
-def profile(num_elements=100000, parser="lxml"):
-
- filehandle = tempfile.NamedTemporaryFile()
- filename = filehandle.name
-
- data = rdoc(num_elements)
- vars = dict(bs4=bs4, data=data, parser=parser)
- cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
-
- stats = pstats.Stats(filename)
- # stats.strip_dirs()
- stats.sort_stats("cumulative")
- stats.print_stats('_html5lib|bs4', 50)
-
-if __name__ == '__main__':
- diagnose(sys.stdin.read())
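
Aside from the py2 imports and print statements, these diagnostics live on unchanged in bs4 for Python 3. A usage sketch, assuming bs4 is installed (page.html is a hypothetical input file):

    from bs4.diagnose import diagnose

    # Tries every installed parser on the markup, printing each parser's
    # output plus lxml/html5lib version information.
    with open('page.html', 'rb') as fp:
        diagnose(fp.read())
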
diff --git a/libs/py2/bs4/element.py b/libs/py2/bs4/element.py
deleted file mode 100644
index 886eb91f..00000000
--- a/libs/py2/bs4/element.py
+++ /dev/null
@@ -1,1885 +0,0 @@
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-__license__ = "MIT"
-
-try:
- from collections.abc import Callable # Python 3.6
-except ImportError, e:
- from collections import Callable
-import re
-import shlex
-import sys
-import warnings
-from bs4.dammit import EntitySubstitution
-
-DEFAULT_OUTPUT_ENCODING = "utf-8"
-PY3K = (sys.version_info[0] > 2)
-
-whitespace_re = re.compile(r"\s+")
-
-def _alias(attr):
- """Alias one attribute name to another for backward compatibility"""
- @property
- def alias(self):
- return getattr(self, attr)
-
- @alias.setter
- def alias(self, value):
- return setattr(self, attr, value)
- return alias
-
-
-class NamespacedAttribute(unicode):
-
- def __new__(cls, prefix, name, namespace=None):
- if name is None:
- obj = unicode.__new__(cls, prefix)
- elif prefix is None:
- # Not really namespaced.
- obj = unicode.__new__(cls, name)
- else:
- obj = unicode.__new__(cls, prefix + ":" + name)
- obj.prefix = prefix
- obj.name = name
- obj.namespace = namespace
- return obj
-
-class AttributeValueWithCharsetSubstitution(unicode):
- """A stand-in object for a character encoding specified in HTML."""
-
-class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
- """A generic stand-in for the value of a meta tag's 'charset' attribute.
-
- When Beautiful Soup parses the markup '<meta charset="utf8">', the
- value of the 'charset' attribute will be one of these objects.
- """
-
- def __new__(cls, original_value):
- obj = unicode.__new__(cls, original_value)
- obj.original_value = original_value
- return obj
-
- def encode(self, encoding):
- return encoding
-
-
-class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
- """A generic stand-in for the value of a meta tag's 'content' attribute.
-
- When Beautiful Soup parses the markup:
-
- <meta http-equiv="content-type" content="text/html; charset=x-sjis">
-
- The value of the 'content' attribute will be one of these objects.
- """
-
- CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
-
- def __new__(cls, original_value):
- match = cls.CHARSET_RE.search(original_value)
- if match is None:
- # No substitution necessary.
- return unicode.__new__(unicode, original_value)
-
- obj = unicode.__new__(cls, original_value)
- obj.original_value = original_value
- return obj
-
- def encode(self, encoding):
- def rewrite(match):
- return match.group(1) + encoding
- return self.CHARSET_RE.sub(rewrite, self.original_value)
-
-class HTMLAwareEntitySubstitution(EntitySubstitution):
-
- """Entity substitution rules that are aware of some HTML quirks.
-
- Specifically, the contents of <script> and <style> tags should not
- undergo entity substitution.
-
-Hello, world!
-//]]>
-</script>
-'''
- soup = self.soup(html)
- self.assertEqual("text/javascript", soup.find('script')['type'])
-
- def test_comment(self):
- # Comments are represented as Comment objects.
- markup = "foobaz
"
- self.assertSoupEquals(markup)
-
- soup = self.soup(markup)
- comment = soup.find(text="foobar")
- self.assertEqual(comment.__class__, Comment)
-
- # The comment is properly integrated into the tree.
- foo = soup.find(text="foo")
- self.assertEqual(comment, foo.next_element)
- baz = soup.find(text="baz")
- self.assertEqual(comment, baz.previous_element)
-
- def test_preserved_whitespace_in_pre_and_textarea(self):
- """Whitespace must be preserved in and