diff --git a/lib/feedparser.py b/lib/feedparser.py index 15fdc95b..f6fc651a 100644 --- a/lib/feedparser.py +++ b/lib/feedparser.py @@ -1,17 +1,19 @@ -#!/usr/bin/env python """Universal feed parser Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds -Visit http://feedparser.org/ for the latest version -Visit http://feedparser.org/docs/ for the latest documentation +Visit https://code.google.com/p/feedparser/ for the latest version +Visit http://packages.python.org/feedparser/ for the latest documentation Required: Python 2.4 or later -Recommended: CJKCodecs and iconv_codec +Recommended: iconv_codec """ -__version__ = "5.0.1" -__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. +__version__ = "5.2.1" +__license__ = """ +Copyright 2010-2015 Kurt McKee +Copyright 2002-2008 Mark Pilgrim +All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -42,13 +44,13 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>", "Sam Ruby <http://intertwingly.net/>", "Ade Oshineye <http://blog.oshineye.com/>", "Martin Pool <http://sourcefrog.net/>", - "Kurt McKee <http://kurtmckee.org/>"] -_debug = 0 + "Kurt McKee <http://kurtmckee.org/>", + "Bernd Schlapsi ",] # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should # change this to your application name and URL. -USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ +USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__ # HTTP "Accept" header to send to servers when downloading feeds. If you don't # want to send an Accept header, set this to None. @@ -59,15 +61,6 @@ ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,ap # of pre-installed parsers until it finds one that supports everything we need. PREFERRED_XML_PARSERS = ["drv_libxml2"] -# If you want feedparser to automatically run HTML markup through HTML Tidy, set -# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html> -# or utidylib <http://utidylib.berlios.de/>. -TIDY_MARKUP = 0 - -# List of Python interfaces for HTML Tidy, in order of preference. Only useful -# if TIDY_MARKUP = 1 -PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] - # If you want feedparser to automatically resolve all relative URIs, set this # to 1.
RESOLVE_RELATIVE_URIS = 1 @@ -93,30 +86,31 @@ except (NameError, AttributeError): # base64 support for Atom feeds that contain embedded binary data try: import base64, binascii +except ImportError: + base64 = binascii = None +else: # Python 3.1 deprecates decodestring in favor of decodebytes _base64decode = getattr(base64, 'decodebytes', base64.decodestring) -except: - base64 = binascii = None -def _s2bytes(s): - # Convert a UTF-8 str to bytes if the interpreter is Python 3 - try: - return bytes(s, 'utf8') - except (NameError, TypeError): - # In Python 2.5 and below, bytes doesn't exist (NameError) - # In Python 2.6 and above, bytes and str are the same (TypeError) - return s - -def _l2bytes(l): - # Convert a list of ints to bytes if the interpreter is Python 3 - try: - if bytes is not str: - # In Python 2.6 and above, this call won't raise an exception - # but it will return bytes([65]) as '[65]' instead of 'A' - return bytes(l) - raise NameError - except NameError: - return ''.join(map(chr, l)) +# _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3 +# _l2bytes: convert a list of ints to bytes if the interpreter is Python 3 +try: + if bytes is str: + # In Python 2.5 and below, bytes doesn't exist (NameError) + # In Python 2.6 and above, bytes and str are the same type + raise NameError +except NameError: + # Python 2 + def _s2bytes(s): + return s + def _l2bytes(l): + return ''.join(map(chr, l)) +else: + # Python 3 + def _s2bytes(s): + return bytes(s, 'utf8') + def _l2bytes(l): + return bytes(l) # If you want feedparser to allow all URL schemes, set this to () # List culled from Python's urlparse documentation at: @@ -125,9 +119,10 @@ def _l2bytes(l): # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme # Many more will likely need to be added! 
ACCEPTABLE_URI_SCHEMES = ( - 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'mailto', - 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', 'sftp', - 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais', + 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet', + 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu', + 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', + 'wais', # Additional common-but-unofficial schemes 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs', 'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg', @@ -135,37 +130,50 @@ ACCEPTABLE_URI_SCHEMES = ( #ACCEPTABLE_URI_SCHEMES = () # ---------- required modules (should come with any Python distribution) ---------- -import sgmllib, re, sys, copy, urlparse, time, types, cgi, urllib, urllib2, datetime +import cgi +import codecs +import copy +import datetime +import itertools +import re +import struct +import time +import types +import urllib.request, urllib.parse, urllib.error +import warnings + +from html.entities import name2codepoint, codepoint2name, entitydefs + try: from io import BytesIO as _StringIO except ImportError: - try: - from cStringIO import StringIO as _StringIO - except: - from StringIO import StringIO as _StringIO + from io import StringIO as _StringIO # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- # gzip is included with most Python distributions, but may not be available if you compiled your own try: import gzip -except: +except ImportError: gzip = None try: import zlib -except: +except ImportError: zlib = None # If a real XML parser is available, feedparser will attempt to use it. feedparser has -# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the +# been tested with the built-in SAX parser and libxml2. On platforms where the # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. try: import xml.sax - xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers from xml.sax.saxutils import escape as _xmlescape - _XML_AVAILABLE = 1 -except: +except ImportError: _XML_AVAILABLE = 0 def _xmlescape(data,entities={}): data = data.replace('&', '&amp;') @@ -174,69 +182,60 @@ except: for char, entity in entities: data = data.replace(char, entity) return data +else: + try: + xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers + except xml.sax.SAXReaderNotAvailable: + _XML_AVAILABLE = 0 + else: + _XML_AVAILABLE = 1 -# cjkcodecs and iconv_codec provide support for more character encodings.
-# Both are available from http://cjkpython.i18n.org/ +# sgmllib is not available by default in Python 3; if the end user doesn't have +# it available then we'll lose ill-formed XML parsing and content sanitizing try: - import cjkcodecs.aliases -except: - pass -try: - import iconv_codec -except: - pass + import sgmllib +except ImportError: + # This is probably Python 3, which doesn't include sgmllib anymore + _SGML_AVAILABLE = 0 -# chardet library auto-detects character encodings -# Download from http://chardet.feedparser.org/ -try: - import chardet - if _debug: - import chardet.constants - chardet.constants._debug = 1 -except: - chardet = None + # Mock sgmllib enough to allow subclassing later on + class sgmllib(object): + class SGMLParser(object): + def goahead(self, i): + pass + def parse_starttag(self, i): + pass +else: + _SGML_AVAILABLE = 1 -# reversable htmlentitydefs mappings for Python 2.2 -try: - from htmlentitydefs import name2codepoint, codepoint2name -except: - import htmlentitydefs - name2codepoint={} - codepoint2name={} - for (name,codepoint) in htmlentitydefs.entitydefs.iteritems(): - if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1])) - name2codepoint[name]=ord(codepoint) - codepoint2name[ord(codepoint)]=name + # sgmllib defines a number of module-level regular expressions that are + # insufficient for the XML parsing feedparser needs. Rather than modify + # the variables directly in sgmllib, they're defined here using the same + # names, and the compiled code objects of several sgmllib.SGMLParser + # methods are copied into _BaseHTMLProcessor so that they execute in + # feedparser's scope instead of sgmllib's scope. + charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);') + tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') + attrfind = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?' + ) -# BeautifulSoup parser used for parsing microformats from embedded HTML content -# http://www.crummy.com/software/BeautifulSoup/ -# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the -# older 2.x series. If it doesn't, and you can figure out why, I'll accept a -# patch and modify the compatibility statement accordingly. -try: - import BeautifulSoup -except: - BeautifulSoup = None + # Unfortunately, these must be copied over to prevent NameError exceptions + entityref = sgmllib.entityref + incomplete = sgmllib.incomplete + interesting = sgmllib.interesting + shorttag = sgmllib.shorttag + shorttagopen = sgmllib.shorttagopen + starttagopen = sgmllib.starttagopen -# ---------- don't touch these ---------- -class ThingsNobodyCaresAboutButMe(Exception): pass -class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass -class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass -class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass -class UndeclaredNamespace(Exception): pass -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') -sgmllib.special = re.compile('<!') -sgmllib.charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);') -if sgmllib.endbracket.search(' <').start(0): - class EndBracketRegEx: + class _EndBracketRegEx: def __init__(self): # Overriding the built-in sgmllib.endbracket regex allows the # parser to find angle brackets embedded in element attributes. self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') - def search(self,string,index=0): - match = self.endbracket.match(string,index) + def search(self, target, index=0): + match = self.endbracket.match(target, index) if match is not None: # Returning a new object in the calling thread's context # resolves a thread-safety issue.
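For context on the hunk above: sgmllib's stock endbracket regex stops at the first ">" it sees, even one buried inside a quoted attribute value, which is why both the old and new versions override it. A standalone sketch of the behavior, reusing the override pattern from this hunk (the sample string and variable names are illustrative only, not feedparser API):

    import re

    # The override pattern copied from the patch above.
    endbracket = re.compile(
        r'''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])'''
        r'''|.*?(?=[<>])'''
    )

    # Hypothetical tag body containing a literal '>' inside an attribute value.
    sample = 'a href="/x" title="5 > 3">link text'
    match = endbracket.match(sample)
    # The quoted '>' is skipped; the match stops at the tag's real closing bracket.
    print(sample[:match.end()])  # a href="/x" title="5 > 3"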
@@ -247,7 +246,29 @@ if sgmllib.endbracket.search(' <').start(0): self.match = match def start(self, n): return self.match.end(n) - sgmllib.endbracket = EndBracketRegEx() + endbracket = _EndBracketRegEx() + + +# iconv_codec provides support for more character encodings. +# It's available from http://cjkpython.i18n.org/ +try: + import iconv_codec +except ImportError: + pass + +# chardet library auto-detects character encodings +# Download from http://chardet.feedparser.org/ +try: + import chardet +except ImportError: + chardet = None + +# ---------- don't touch these ---------- +class ThingsNobodyCaresAboutButMe(Exception): pass +class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass +class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass +class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass +class UndeclaredNamespace(Exception): pass SUPPORTED_VERSIONS = {'': 'unknown', 'rss090': 'RSS 0.90', @@ -265,27 +286,16 @@ SUPPORTED_VERSIONS = {'': 'unknown', 'atom10': 'Atom 1.0', 'atom': 'Atom (unknown version)', 'cdf': 'CDF', - 'hotrss': 'Hot RSS' } -try: - UserDict = dict -except NameError: - # Python 2.1 does not have dict - from UserDict import UserDict - def dict(aList): - rc = {} - for k, v in aList: - rc[k] = v - return rc - -class FeedParserDict(UserDict): +class FeedParserDict(dict): keymap = {'channel': 'feed', 'items': 'entries', 'guid': 'id', 'date': 'updated', 'date_parsed': 'updated_parsed', 'description': ['summary', 'subtitle'], + 'description_detail': ['summary_detail', 'subtitle_detail'], 'url': ['href'], 'modified': 'updated', 'modified_parsed': 'updated_parsed', @@ -296,223 +306,224 @@ class FeedParserDict(UserDict): 'tagline': 'subtitle', 'tagline_detail': 'subtitle_detail'} def __getitem__(self, key): + ''' + :return: A :class:`FeedParserDict`. 
+ ''' if key == 'category': - return UserDict.__getitem__(self, 'tags')[0]['term'] - if key == 'enclosures': - norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) - return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] - if key == 'license': - for link in UserDict.__getitem__(self, 'links'): - if link['rel']=='license' and link.has_key('href'): + try: + return dict.__getitem__(self, 'tags')[0]['term'] + except IndexError: + raise KeyError("object doesn't have key 'category'") + elif key == 'enclosures': + norel = lambda link: FeedParserDict([(name,value) for (name,value) in list(link.items()) if name!='rel']) + return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']=='enclosure'] + elif key == 'license': + for link in dict.__getitem__(self, 'links'): + if link['rel']=='license' and 'href' in link: return link['href'] - if key == 'categories': - return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] - realkey = self.keymap.get(key, key) - if type(realkey) == types.ListType: - for k in realkey: - if UserDict.__contains__(self, k): - return UserDict.__getitem__(self, k) - if UserDict.__contains__(self, key): - return UserDict.__getitem__(self, key) - return UserDict.__getitem__(self, realkey) - - def __setitem__(self, key, value): - for k in self.keymap.keys(): - if key == k: - key = self.keymap[k] - if type(key) == types.ListType: - key = key[0] - return UserDict.__setitem__(self, key, value) - - def get(self, key, default=None): - if self.has_key(key): - return self[key] + elif key == 'updated': + # Temporarily help developers out by keeping the old + # broken behavior that was reported in issue 310. + # This fix was proposed in issue 328. + if not dict.__contains__(self, 'updated') and \ + dict.__contains__(self, 'published'): + warnings.warn("To avoid breaking existing software while " + "fixing issue 310, a temporary mapping has been created " + "from `updated` to `published` if `updated` doesn't " + "exist. This fallback will be removed in a future version " + "of feedparser.", DeprecationWarning) + return dict.__getitem__(self, 'published') + return dict.__getitem__(self, 'updated') + elif key == 'updated_parsed': + if not dict.__contains__(self, 'updated_parsed') and \ + dict.__contains__(self, 'published_parsed'): + warnings.warn("To avoid breaking existing software while " + "fixing issue 310, a temporary mapping has been created " + "from `updated_parsed` to `published_parsed` if " + "`updated_parsed` doesn't exist. 
This fallback will be " + "removed in a future version of feedparser.", + DeprecationWarning) + return dict.__getitem__(self, 'published_parsed') + return dict.__getitem__(self, 'updated_parsed') else: - return default - - def setdefault(self, key, value): - if not self.has_key(key): - self[key] = value - return self[key] - - def has_key(self, key): - try: - return hasattr(self, key) or UserDict.__contains__(self, key) - except AttributeError: - return False - # This alias prevents the 2to3 tool from changing the semantics of the - # __contains__ function below and exhausting the maximum recursion depth - __has_key = has_key - - def __getattr__(self, key): - try: - return self.__dict__[key] - except KeyError: - pass - try: - assert not key.startswith('_') - return self.__getitem__(key) - except: - raise AttributeError, "object has no attribute '%s'" % key - - def __setattr__(self, key, value): - if key.startswith('_') or key == 'data': - self.__dict__[key] = value - else: - return self.__setitem__(key, value) + realkey = self.keymap.get(key, key) + if isinstance(realkey, list): + for k in realkey: + if dict.__contains__(self, k): + return dict.__getitem__(self, k) + elif dict.__contains__(self, realkey): + return dict.__getitem__(self, realkey) + return dict.__getitem__(self, key) def __contains__(self, key): - return self.__has_key(key) + if key in ('updated', 'updated_parsed'): + # Temporarily help developers out by keeping the old + # broken behavior that was reported in issue 310. + # This fix was proposed in issue 328. + return dict.__contains__(self, key) + try: + self.__getitem__(key) + except KeyError: + return False + else: + return True -def zopeCompatibilityHack(): - global FeedParserDict - del FeedParserDict - def FeedParserDict(aDict=None): - rc = {} - if aDict: - rc.update(aDict) - return rc + has_key = __contains__ -_ebcdic_to_ascii_map = None -def _ebcdic_to_ascii(s): - global _ebcdic_to_ascii_map - if not _ebcdic_to_ascii_map: - emap = ( - 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201, - 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208, - 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231, - 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237, - 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243, - 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, - 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 - ) - _ebcdic_to_ascii_map = _maketrans( \ - _l2bytes(range(256)), _l2bytes(emap)) - return s.translate(_ebcdic_to_ascii_map) + def get(self, key, default=None): + ''' + :return: A :class:`FeedParserDict`. 
+ ''' + try: + return self.__getitem__(key) + except KeyError: + return default + + def __setitem__(self, key, value): + key = self.keymap.get(key, key) + if isinstance(key, list): + key = key[0] + return dict.__setitem__(self, key, value) + + def setdefault(self, key, value): + if key not in self: + self[key] = value + return value + return self[key] + + def __getattr__(self, key): + # __getattribute__() is called first; this will be called + # only if an attribute was not already found + try: + return self.__getitem__(key) + except KeyError: + raise AttributeError("object has no attribute '%s'" % key) + + def __hash__(self): + return id(self) _cp1252 = { - unichr(128): unichr(8364), # euro sign - unichr(130): unichr(8218), # single low-9 quotation mark - unichr(131): unichr( 402), # latin small letter f with hook - unichr(132): unichr(8222), # double low-9 quotation mark - unichr(133): unichr(8230), # horizontal ellipsis - unichr(134): unichr(8224), # dagger - unichr(135): unichr(8225), # double dagger - unichr(136): unichr( 710), # modifier letter circumflex accent - unichr(137): unichr(8240), # per mille sign - unichr(138): unichr( 352), # latin capital letter s with caron - unichr(139): unichr(8249), # single left-pointing angle quotation mark - unichr(140): unichr( 338), # latin capital ligature oe - unichr(142): unichr( 381), # latin capital letter z with caron - unichr(145): unichr(8216), # left single quotation mark - unichr(146): unichr(8217), # right single quotation mark - unichr(147): unichr(8220), # left double quotation mark - unichr(148): unichr(8221), # right double quotation mark - unichr(149): unichr(8226), # bullet - unichr(150): unichr(8211), # en dash - unichr(151): unichr(8212), # em dash - unichr(152): unichr( 732), # small tilde - unichr(153): unichr(8482), # trade mark sign - unichr(154): unichr( 353), # latin small letter s with caron - unichr(155): unichr(8250), # single right-pointing angle quotation mark - unichr(156): unichr( 339), # latin small ligature oe - unichr(158): unichr( 382), # latin small letter z with caron - unichr(159): unichr( 376)} # latin capital letter y with diaeresis + 128: chr(8364), # euro sign + 130: chr(8218), # single low-9 quotation mark + 131: chr( 402), # latin small letter f with hook + 132: chr(8222), # double low-9 quotation mark + 133: chr(8230), # horizontal ellipsis + 134: chr(8224), # dagger + 135: chr(8225), # double dagger + 136: chr( 710), # modifier letter circumflex accent + 137: chr(8240), # per mille sign + 138: chr( 352), # latin capital letter s with caron + 139: chr(8249), # single left-pointing angle quotation mark + 140: chr( 338), # latin capital ligature oe + 142: chr( 381), # latin capital letter z with caron + 145: chr(8216), # left single quotation mark + 146: chr(8217), # right single quotation mark + 147: chr(8220), # left double quotation mark + 148: chr(8221), # right double quotation mark + 149: chr(8226), # bullet + 150: chr(8211), # en dash + 151: chr(8212), # em dash + 152: chr( 732), # small tilde + 153: chr(8482), # trade mark sign + 154: chr( 353), # latin small letter s with caron + 155: chr(8250), # single right-pointing angle quotation mark + 156: chr( 339), # latin small ligature oe + 158: chr( 382), # latin small letter z with caron + 159: chr( 376), # latin capital letter y with diaeresis +} _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') def _urljoin(base, uri): uri = _urifixer.sub(r'\1\3', uri) + if not isinstance(uri, str): + uri = uri.decode('utf-8', 'ignore') try: - 
return urlparse.urljoin(base, uri) - except: - uri = urlparse.urlunparse([urllib.quote(part) for part in urlparse.urlparse(uri)]) - return urlparse.urljoin(base, uri) + uri = urllib.parse.urljoin(base, uri) + except ValueError: + uri = '' + if not isinstance(uri, str): + return uri.decode('utf-8', 'ignore') + return uri class _FeedParserMixin: - namespaces = {'': '', - 'http://backend.userland.com/rss': '', - 'http://blogs.law.harvard.edu/tech/rss': '', - 'http://purl.org/rss/1.0/': '', - 'http://my.netscape.com/rdf/simple/0.9/': '', - 'http://example.com/newformat#': '', - 'http://example.com/necho': '', - 'http://purl.org/echo/': '', - 'uri/of/echo/namespace#': '', - 'http://purl.org/pie/': '', - 'http://purl.org/atom/ns#': '', - 'http://www.w3.org/2005/Atom': '', - 'http://purl.org/rss/1.0/modules/rss091#': '', + namespaces = { + '': '', + 'http://backend.userland.com/rss': '', + 'http://blogs.law.harvard.edu/tech/rss': '', + 'http://purl.org/rss/1.0/': '', + 'http://my.netscape.com/rdf/simple/0.9/': '', + 'http://example.com/newformat#': '', + 'http://example.com/necho': '', + 'http://purl.org/echo/': '', + 'uri/of/echo/namespace#': '', + 'http://purl.org/pie/': '', + 'http://purl.org/atom/ns#': '', + 'http://www.w3.org/2005/Atom': '', + 'http://purl.org/rss/1.0/modules/rss091#': '', - 'http://webns.net/mvcb/': 'admin', - 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', - 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', - 'http://media.tangent.org/rss/1.0/': 'audio', - 'http://backend.userland.com/blogChannelModule': 'blogChannel', - 'http://web.resource.org/cc/': 'cc', - 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', - 'http://purl.org/rss/1.0/modules/company': 'co', - 'http://purl.org/rss/1.0/modules/content/': 'content', - 'http://my.theinfo.org/changed/1.0/rss/': 'cp', - 'http://purl.org/dc/elements/1.1/': 'dc', - 'http://purl.org/dc/terms/': 'dcterms', - 'http://purl.org/rss/1.0/modules/email/': 'email', - 'http://purl.org/rss/1.0/modules/event/': 'ev', - 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', - 'http://freshmeat.net/rss/fm/': 'fm', - 'http://xmlns.com/foaf/0.1/': 'foaf', - 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', - 'http://postneo.com/icbm/': 'icbm', - 'http://purl.org/rss/1.0/modules/image/': 'image', - 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', - 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', - 'http://purl.org/rss/1.0/modules/link/': 'l', - 'http://search.yahoo.com/mrss': 'media', - #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace - 'http://search.yahoo.com/mrss/': 'media', - 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', - 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', - 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', - 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', - 'http://purl.org/rss/1.0/modules/reference/': 'ref', - 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', - 'http://purl.org/rss/1.0/modules/search/': 'search', - 'http://purl.org/rss/1.0/modules/slash/': 'slash', - 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', - 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', - 'http://hacks.benhammersley.com/rss/streaming/': 'str', - 'http://purl.org/rss/1.0/modules/subscription/': 'sub', - 'http://purl.org/rss/1.0/modules/syndication/': 'sy', - 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', - 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', - 
'http://purl.org/rss/1.0/modules/threading/': 'thr', - 'http://purl.org/rss/1.0/modules/textinput/': 'ti', - 'http://madskills.com/public/xml/rss/module/trackback/':'trackback', - 'http://wellformedweb.org/commentAPI/': 'wfw', - 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', - 'http://www.w3.org/1999/xhtml': 'xhtml', - 'http://www.w3.org/1999/xlink': 'xlink', - 'http://www.w3.org/XML/1998/namespace': 'xml' -} + 'http://webns.net/mvcb/': 'admin', + 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', + 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', + 'http://media.tangent.org/rss/1.0/': 'audio', + 'http://backend.userland.com/blogChannelModule': 'blogChannel', + 'http://web.resource.org/cc/': 'cc', + 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', + 'http://purl.org/rss/1.0/modules/company': 'co', + 'http://purl.org/rss/1.0/modules/content/': 'content', + 'http://my.theinfo.org/changed/1.0/rss/': 'cp', + 'http://purl.org/dc/elements/1.1/': 'dc', + 'http://purl.org/dc/terms/': 'dcterms', + 'http://purl.org/rss/1.0/modules/email/': 'email', + 'http://purl.org/rss/1.0/modules/event/': 'ev', + 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', + 'http://freshmeat.net/rss/fm/': 'fm', + 'http://xmlns.com/foaf/0.1/': 'foaf', + 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', + 'http://www.georss.org/georss': 'georss', + 'http://www.opengis.net/gml': 'gml', + 'http://postneo.com/icbm/': 'icbm', + 'http://purl.org/rss/1.0/modules/image/': 'image', + 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', + 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', + 'http://purl.org/rss/1.0/modules/link/': 'l', + 'http://search.yahoo.com/mrss': 'media', + # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace + 'http://search.yahoo.com/mrss/': 'media', + 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', + 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', + 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', + 'http://purl.org/rss/1.0/modules/reference/': 'ref', + 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', + 'http://purl.org/rss/1.0/modules/search/': 'search', + 'http://purl.org/rss/1.0/modules/slash/': 'slash', + 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', + 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', + 'http://hacks.benhammersley.com/rss/streaming/': 'str', + 'http://purl.org/rss/1.0/modules/subscription/': 'sub', + 'http://purl.org/rss/1.0/modules/syndication/': 'sy', + 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', + 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', + 'http://purl.org/rss/1.0/modules/threading/': 'thr', + 'http://purl.org/rss/1.0/modules/textinput/': 'ti', + 'http://madskills.com/public/xml/rss/module/trackback/': 'trackback', + 'http://wellformedweb.org/commentAPI/': 'wfw', + 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', + 'http://www.w3.org/1999/xhtml': 'xhtml', + 'http://www.w3.org/1999/xlink': 'xlink', + 'http://www.w3.org/XML/1998/namespace': 'xml', + 'http://podlove.org/simple-chapters': 'psc', + } _matchnamespaces = {} - can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] - can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] - can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 
'rights', 'description'] + can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']) + can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']) + can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']) html_types = ['text/html', 'application/xhtml+xml'] def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): - if _debug: sys.stderr.write('initializing FeedParser\n') if not self._matchnamespaces: - for k, v in self.namespaces.items(): + for k, v in list(self.namespaces.items()): self._matchnamespaces[k.lower()] = v self.feeddata = FeedParserDict() # feed-level data self.encoding = encoding # character encoding @@ -531,6 +542,10 @@ class _FeedParserMixin: self.incontributor = 0 self.inpublisher = 0 self.insource = 0 + + # georss + self.ingeometry = 0 + self.sourcedata = FeedParserDict() self.contentparams = FeedParserDict() self._summaryKey = None @@ -541,28 +556,49 @@ class _FeedParserMixin: self.baseuri = baseuri or '' self.lang = baselang or None self.svgOK = 0 - self.hasTitle = 0 + self.title_depth = -1 + self.depth = 0 + # psc_chapters_flag prevents multiple psc_chapters from being + # captured in a single entry or item. The transition states are + # None -> True -> False. psc_chapter elements will only be + # captured while it is True. + self.psc_chapters_flag = None if baselang: self.feeddata['language'] = baselang.replace('_','-') - def unknown_starttag(self, tag, attrs): - if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) - # normalize attrs - attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] - # the sgml parser doesn't handle entities in attributes, but + # A map of the following form: + # { + # object_that_value_is_set_on: { + # property_name: depth_of_node_property_was_extracted_from, + # other_property: depth_of_node_property_was_extracted_from, + # }, + # } + self.property_depth_map = {} + + def _normalize_attributes(self, kv): + k = kv[0].lower() + v = k in ('rel', 'type') and kv[1].lower() or kv[1] + # the sgml parser doesn't handle entities in attributes, nor + # does it pass the attribute values through as unicode, while # strict xml parsers do -- account for this difference if isinstance(self, _LooseFeedParser): - attrs = [(k, v.replace('&amp;', '&')) for k, v in attrs] + v = v.replace('&amp;', '&') + if not isinstance(v, str): + v = v.decode('utf-8') + return (k, v) + + def unknown_starttag(self, tag, attrs): + # increment depth counter + self.depth += 1 + + # normalize attrs + attrs = list(map(self._normalize_attributes, attrs)) # track xml:base and xml:lang attrsD = dict(attrs) baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri - if type(baseuri) != type(u''): - try: - baseuri = unicode(baseuri, self.encoding) - except: - baseuri = unicode(baseuri, 'iso-8859-1') + if not isinstance(baseuri, str): + baseuri = baseuri.decode(self.encoding, 'ignore') # ensure that self.baseuri is always an absolute URI that # uses a whitelisted URI scheme (e.g.
not `javascript:`) if self.baseuri: @@ -591,23 +627,25 @@ class _FeedParserMixin: self.trackNamespace(None, uri) # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): - if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 + if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): + if tag in ('xhtml:div', 'div'): + return # typepad does this 10/2007 # element declared itself as escaped markup, but it isn't really self.contentparams['type'] = 'application/xhtml+xml' if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': - if tag.find(':') <> -1: + if tag.find(':') != -1: prefix, tag = tag.split(':', 1) namespace = self.namespacesInUse.get(prefix, '') if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': attrs.append(('xmlns',namespace)) if tag=='svg' and namespace=='http://www.w3.org/2000/svg': attrs.append(('xmlns',namespace)) - if tag == 'svg': self.svgOK += 1 + if tag == 'svg': + self.svgOK += 1 return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) # match namespaces - if tag.find(':') <> -1: + if tag.find(':') != -1: prefix, suffix = tag.split(':', 1) else: prefix, suffix = '', tag @@ -638,30 +676,32 @@ class _FeedParserMixin: context[unknown_tag] = attrsD def unknown_endtag(self, tag): - if _debug: sys.stderr.write('end %s\n' % tag) # match namespaces - if tag.find(':') <> -1: + if tag.find(':') != -1: prefix, suffix = tag.split(':', 1) else: prefix, suffix = '', tag prefix = self.namespacemap.get(prefix, prefix) if prefix: prefix = prefix + '_' - if suffix == 'svg' and self.svgOK: self.svgOK -= 1 + if suffix == 'svg' and self.svgOK: + self.svgOK -= 1 # call special handler (if defined) or default handler methodname = '_end_' + prefix + suffix try: - if self.svgOK: raise AttributeError() + if self.svgOK: + raise AttributeError() method = getattr(self, methodname) method() except AttributeError: self.pop(prefix + suffix) # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): + if self.incontent and not self.contentparams.get('type', 'xml').endswith('xml'): # element declared itself as escaped markup, but it isn't really - if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 + if tag in ('xhtml:div', 'div'): + return # typepad does this 10/2007 self.contentparams['type'] = 'application/xhtml+xml' if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': tag = tag.split(':')[-1] @@ -677,9 +717,12 @@ class _FeedParserMixin: if self.langstack: # and (self.langstack[-1] is not None): self.lang = self.langstack[-1] + self.depth -= 1 + def handle_charref(self, ref): # called for each character reference, e.g. for '&#160;', ref will be '160' - if not self.elementstack: return + if not self.elementstack: + return ref = ref.lower() if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'): text = '&#%s;' % ref @@ -688,29 +731,33 @@ class _FeedParserMixin: c = int(ref[1:], 16) else: c = int(ref) - text = unichr(c).encode('utf-8') + text = chr(c).encode('utf-8') self.elementstack[-1][2].append(text) def handle_entityref(self, ref): # called for each entity reference, e.g.
for '&copy;', ref will be 'copy' - if not self.elementstack: return - if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) + if not self.elementstack: + return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref - elif ref in self.entities.keys(): + elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: - try: name2codepoint[ref] - except KeyError: text = '&%s;' % ref - else: text = unichr(name2codepoint[ref]).encode('utf-8') + try: + name2codepoint[ref] + except KeyError: + text = '&%s;' % ref + else: + text = chr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text) def handle_data(self, text, escape=1): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references - if not self.elementstack: return + if not self.elementstack: + return if escape and self.contentparams.get('type') == 'application/xhtml+xml': text = _xmlescape(text) self.elementstack[-1][2].append(text) @@ -728,7 +775,6 @@ class _FeedParserMixin: def parse_declaration(self, i): # override internal declaration handler to handle CDATA blocks - if _debug: sys.stderr.write('entering parse_declaration\n') if self.rawdata[i:i+9] == '<![CDATA[': k = self.rawdata.find(']]>', i) if k == -1: @@ -757,17 +803,18 @@ class _FeedParserMixin: def trackNamespace(self, prefix, uri): loweruri = uri.lower() - if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version: - self.version = 'rss090' - if loweruri == 'http://purl.org/rss/1.0/' and not self.version: - self.version = 'rss10' - if loweruri == 'http://www.w3.org/2005/atom' and not self.version: - self.version = 'atom10' - if loweruri.find('backend.userland.com/rss') <> -1: + if not self.version: + if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'): + self.version = 'rss090' + elif loweruri == 'http://purl.org/rss/1.0/': + self.version = 'rss10' + elif loweruri == 'http://www.w3.org/2005/atom': + self.version = 'atom10' + if loweruri.find('backend.userland.com/rss') != -1: # match any backend.userland.com namespace uri = 'http://backend.userland.com/rss' loweruri = uri - if self._matchnamespaces.has_key(loweruri): + if loweruri in self._matchnamespaces: self.namespacemap[prefix] = self._matchnamespaces[loweruri] self.namespacesInUse[self._matchnamespaces[loweruri]] = uri else: @@ -786,12 +833,14 @@ class _FeedParserMixin: self.elementstack.append([element, expectingText, []]) def pop(self, element, stripWhitespace=1): - if not self.elementstack: return - if self.elementstack[-1][0] != element: return + if not self.elementstack: + return + if self.elementstack[-1][0] != element: + return element, expectingText, pieces = self.elementstack.pop() - if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': + if self.version == 'atom10' and self.contentparams.get('type', 'text') == 'application/xhtml+xml': # remove enclosing child element, but only if it is a <div>
and # only if all the remaining content is nested underneath it. # This means that the divs would be retained in the following: @@ -805,7 +854,8 @@ class _FeedParserMixin: for piece in pieces[:-1]: if piece.startswith('</'): depth -= 1 if depth == 0: break elif piece.startswith('<') and not piece.endswith('/>'): depth += 1 else: @@ -813,13 +863,14 @@ class _FeedParserMixin: # Ensure each piece is a str for Python 3 for (i, v) in enumerate(pieces): - if not isinstance(v, basestring): + if not isinstance(v, str): pieces[i] = v.decode('utf-8') output = ''.join(pieces) if stripWhitespace: output = output.strip() - if not expectingText: return output + if not expectingText: + return output # decode base64 content if base64 and self.contentparams.get('base64', 0): @@ -836,14 +887,19 @@ class _FeedParserMixin: # resolve relative URIs if (element in self.can_be_relative_uri) and output: - output = self.resolveURI(output) + # do not resolve guid elements with isPermalink="false" + if not element == 'id' or self.guidislink: + output = self.resolveURI(output) # decode entities within embedded markup if not self.contentparams.get('base64', 0): output = self.decodeEntities(element, output) - if self.lookslikehtml(output): - self.contentparams['type']='text/html' + # some feed formats require consumers to guess + # whether the content is html or plain text + if not self.version.startswith('atom') and self.contentparams.get('type') == 'text/plain': + if self.lookslikehtml(output): + self.contentparams['type'] = 'text/html' # remove temporary cruft from contentparams try: @@ -861,50 +917,31 @@ class _FeedParserMixin: if element in self.can_contain_relative_uris: output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) - # parse microformats - # (must do this before sanitizing because some microformats - # rely on elements that we sanitize) - if is_htmlish and element in ['content', 'description', 'summary']: - mfresults = _parseMicroformats(output, self.baseuri, self.encoding) - if mfresults: - for tag in mfresults.get('tags', []): - self._addTag(tag['term'], tag['scheme'], tag['label']) - for enclosure in mfresults.get('enclosures', []): - self._start_enclosure(enclosure) - for xfn in mfresults.get('xfn', []): - self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) - vcard = mfresults.get('vcard') - if vcard: - self._getContext()['vcard'] = vcard - # sanitize embedded markup if is_htmlish and SANITIZE_HTML: if element in self.can_contain_dangerous_markup: output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) - if self.encoding and type(output) != type(u''): - try: - output = unicode(output, self.encoding) - except: - pass + if self.encoding and not isinstance(output, str): + output = output.decode(self.encoding, 'ignore') # address common error where people take data that is already # utf-8, presume that it is iso-8859-1, and re-encode it.
- if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and type(output) == type(u''): + if self.encoding in ('utf-8', 'utf-8_INVALID_PYTHON_3') and isinstance(output, str): try: - output = unicode(output.encode('iso-8859-1'), 'utf-8') - except: + output = output.encode('iso-8859-1').decode('utf-8') + except (UnicodeEncodeError, UnicodeDecodeError): pass # map win-1252 extensions to the proper code points - if type(output) == type(u''): - output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) + if isinstance(output, str): + output = output.translate(_cp1252) - # categories/tags/keywords/whatever are handled in _end_category - if element == 'category': + # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords + if element in ('category', 'tags', 'itunes_keywords'): return output - if element == 'title' and self.hasTitle: + if element == 'title' and -1 < self.title_depth <= self.depth: return output # store output in appropriate place(s) @@ -919,6 +956,7 @@ class _FeedParserMixin: # query variables in urls in link elements are improperly # converted from `?a=1&b=2` to `?a=1&amp;b;=2` as if they're # unhandled character references. fix this special case. + output = output.replace('&amp;', '&') output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output) self.entries[-1][element] = output if output: @@ -926,7 +964,10 @@ class _FeedParserMixin: else: if element == 'description': element = 'summary' - self.entries[-1][element] = output + old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element) + if old_value_depth is None or self.depth <= old_value_depth: + self.property_depth_map[self.entries[-1]][element] = self.depth + self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) contentparams['value'] = output @@ -949,7 +990,8 @@ class _FeedParserMixin: def pushContent(self, tag, attrsD, defaultContentType, expectingText): self.incontent += 1 - if self.lang: self.lang=self.lang.replace('_','-') + if self.lang: + self.lang=self.lang.replace('_','-') self.contentparams = FeedParserDict({ 'type': self.mapContentType(attrsD.get('type', defaultContentType)), 'language': self.lang, @@ -967,27 +1009,25 @@ class _FeedParserMixin: # text, but this is routinely ignored. This is an attempt to detect # the most common cases. As false positives often result in silent # data loss, this function errs on the conservative side.
- def lookslikehtml(self, s): - if self.version.startswith('atom'): return - if self.contentparams.get('type','text/html') != 'text/plain': return - - # must have a close tag or a entity reference to qualify - if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): return + @staticmethod + def lookslikehtml(s): + # must have a close tag or an entity reference to qualify + if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)): + return # all tags must be in a restricted subset of valid HTML tags - if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements, - re.findall(r'</?(\w+)',s)): return - - # all entities must have been defined as valid HTML entities - from htmlentitydefs import entitydefs - if filter(lambda e: e not in entitydefs.keys(), - re.findall(r'&(\w+);',s)): return - - return 1 + if [t for t in re.findall(r'</?(\w+)', s) if t.lower() not in _HTMLSanitizer.acceptable_elements]: + return + # all entities must have been defined as valid HTML entities + if [e for e in re.findall(r'&(\w+);', s) if e not in entitydefs]: + return + return 1 def _mapToStandardPrefix(self, name): colonpos = name.find(':') - if colonpos <> -1: + if colonpos != -1: prefix = name[:colonpos] suffix = name[colonpos+1:] prefix = self.namespacemap.get(prefix, prefix) @@ -1047,20 +1087,16 @@ class _FeedParserMixin: else: self.version = 'rss' - def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' def _start_channel(self, attrsD): self.infeed = 1 self._cdf_common(attrsD) - _start_feedinfo = _start_channel def _cdf_common(self, attrsD): - if attrsD.has_key('lastmod'): + if 'lastmod' in attrsD: self._start_modified({}) self.elementstack[-1][-1] = attrsD['lastmod'] self._end_modified() - if attrsD.has_key('href'): + if 'href' in attrsD: self._start_link({}) self.elementstack[-1][-1] = attrsD['href'] self._end_link() @@ -1087,7 +1123,7 @@ class _FeedParserMixin: if not self.inentry: context.setdefault('image', FeedParserDict()) self.inimage = 1 - self.hasTitle = 0 + self.title_depth = -1 self.push('image', 0) def _end_image(self): @@ -1098,7 +1134,7 @@ class _FeedParserMixin: context = self._getContext() context.setdefault('textinput', FeedParserDict()) self.intextinput = 1 - self.hasTitle = 0 + self.title_depth = -1 self.push('textinput', 0) _start_textInput = _start_textinput @@ -1183,7 +1219,7 @@ class _FeedParserMixin: value = self.pop('width') try: value = int(value) - except: + except ValueError: value = 0 if self.inimage: context = self._getContext() @@ -1196,7 +1232,7 @@ class _FeedParserMixin: value = self.pop('height') try: value = int(value) - except: + except ValueError: value = 0 if self.inimage: context = self._getContext() @@ -1233,7 +1269,7 @@ class _FeedParserMixin: def _getContext(self): if self.insource: context = self.sourcedata - elif self.inimage and self.feeddata.has_key('image'): + elif self.inimage and 'image' in self.feeddata: context = self.feeddata['image'] elif self.intextinput: context = self.feeddata['textinput'] @@ -1258,7 +1294,7 @@ class _FeedParserMixin: def _sync_author_detail(self, key='author'): context = self._getContext() - detail = context.get('%s_detail' % key) + detail = context.get('%ss' % key, [FeedParserDict()])[-1] if detail: name = detail.get('name') email = detail.get('email') @@ -1270,7 +1306,8 @@ class _FeedParserMixin: context[key] = email else: author, email = context.get(key), None - if not author: return + if not author: + return emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) if emailmatch: email = emailmatch.group(0) @@ -1286,11 +1323,11 @@ class _FeedParserMixin: author = author[:-1] author = author.strip() if author or email: - context.setdefault('%s_detail' % key, FeedParserDict()) + context.setdefault('%s_detail' % key, detail) if author: - context['%s_detail' % key]['name'] = author + detail['name'] = author if email: - context['%s_detail' % key]['email'] = email + detail['email'] = email def _start_subtitle(self, attrsD): self.pushContent('subtitle', attrsD,
'text/plain', 1) @@ -1317,14 +1354,14 @@ class _FeedParserMixin: self.push('item', 0) self.inentry = 1 self.guidislink = 0 - self.hasTitle = 0 + self.title_depth = -1 + self.psc_chapters_flag = None id = self._getAttribute(attrsD, 'rdf:about') if id: context = self._getContext() context['id'] = id self._cdf_common(attrsD) _start_entry = _start_item - _start_product = _start_item def _end_item(self): self.pop('item') @@ -1348,22 +1385,37 @@ class _FeedParserMixin: self._sync_author_detail('publisher') _end_webmaster = _end_dc_publisher + def _start_dcterms_valid(self, attrsD): + self.push('validity', 1) + + def _end_dcterms_valid(self): + for validity_detail in self.pop('validity').split(';'): + if '=' in validity_detail: + key, value = validity_detail.split('=', 1) + if key == 'start': + self._save('validity_start', value, overwrite=True) + self._save('validity_start_parsed', _parse_date(value), overwrite=True) + elif key == 'end': + self._save('validity_end', value, overwrite=True) + self._save('validity_end_parsed', _parse_date(value), overwrite=True) + def _start_published(self, attrsD): self.push('published', 1) _start_dcterms_issued = _start_published _start_issued = _start_published + _start_pubdate = _start_published def _end_published(self): value = self.pop('published') self._save('published_parsed', _parse_date(value), overwrite=True) _end_dcterms_issued = _end_published _end_issued = _end_published + _end_pubdate = _end_published def _start_updated(self, attrsD): self.push('updated', 1) _start_modified = _start_updated _start_dcterms_modified = _start_updated - _start_pubdate = _start_updated _start_dc_date = _start_updated _start_lastbuilddate = _start_updated @@ -1373,7 +1425,6 @@ class _FeedParserMixin: self._save('updated_parsed', parsed_value, overwrite=True) _end_modified = _end_updated _end_dcterms_modified = _end_updated - _end_pubdate = _end_updated _end_dc_date = _end_updated _end_lastbuilddate = _end_updated @@ -1392,12 +1443,135 @@ class _FeedParserMixin: def _end_expirationdate(self): self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True) + # geospatial location, or "where", from georss.org + + def _start_georssgeom(self, attrsD): + self.push('geometry', 0) + context = self._getContext() + context['where'] = FeedParserDict() + + _start_georss_point = _start_georssgeom + _start_georss_line = _start_georssgeom + _start_georss_polygon = _start_georssgeom + _start_georss_box = _start_georssgeom + + def _save_where(self, geometry): + context = self._getContext() + context['where'].update(geometry) + + def _end_georss_point(self): + geometry = _parse_georss_point(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_line(self): + geometry = _parse_georss_line(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _end_georss_polygon(self): + this = self.pop('geometry') + geometry = _parse_georss_polygon(this) + if geometry: + self._save_where(geometry) + + def _end_georss_box(self): + geometry = _parse_georss_box(self.pop('geometry')) + if geometry: + self._save_where(geometry) + + def _start_where(self, attrsD): + self.push('where', 0) + context = self._getContext() + context['where'] = FeedParserDict() + _start_georss_where = _start_where + + def _parse_srs_attrs(self, attrsD): + srsName = attrsD.get('srsname') + try: + srsDimension = int(attrsD.get('srsdimension', '2')) + except ValueError: + srsDimension = 2 + context = self._getContext() + context['where']['srsName'] = srsName + 
context['where']['srsDimension'] = srsDimension + + def _start_gml_point(self, attrsD): + self._parse_srs_attrs(attrsD) + self.ingeometry = 1 + self.push('geometry', 0) + + def _start_gml_linestring(self, attrsD): + self._parse_srs_attrs(attrsD) + self.ingeometry = 'linestring' + self.push('geometry', 0) + + def _start_gml_polygon(self, attrsD): + self._parse_srs_attrs(attrsD) + self.push('geometry', 0) + + def _start_gml_exterior(self, attrsD): + self.push('geometry', 0) + + def _start_gml_linearring(self, attrsD): + self.ingeometry = 'polygon' + self.push('geometry', 0) + + def _start_gml_pos(self, attrsD): + self.push('pos', 0) + + def _end_gml_pos(self): + this = self.pop('pos') + context = self._getContext() + srsName = context['where'].get('srsName') + srsDimension = context['where'].get('srsDimension', 2) + swap = True + if srsName and "EPSG" in srsName: + epsg = int(srsName.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_georss_point(this, swap=swap, dims=srsDimension) + if geometry: + self._save_where(geometry) + + def _start_gml_poslist(self, attrsD): + self.push('pos', 0) + + def _end_gml_poslist(self): + this = self.pop('pos') + context = self._getContext() + srsName = context['where'].get('srsName') + srsDimension = context['where'].get('srsDimension', 2) + swap = True + if srsName and "EPSG" in srsName: + epsg = int(srsName.split(":")[-1]) + swap = bool(epsg in _geogCS) + geometry = _parse_poslist( + this, self.ingeometry, swap=swap, dims=srsDimension) + if geometry: + self._save_where(geometry) + + def _end_geom(self): + self.ingeometry = 0 + self.pop('geometry') + _end_gml_point = _end_geom + _end_gml_linestring = _end_geom + _end_gml_linearring = _end_geom + _end_gml_exterior = _end_geom + _end_gml_polygon = _end_geom + + def _end_where(self): + self.pop('where') + _end_georss_where = _end_where + + # end geospatial + def _start_cc_license(self, attrsD): context = self._getContext() value = self._getAttribute(attrsD, 'rdf:resource') attrsD = FeedParserDict() - attrsD['rel']='license' - if value: attrsD['href']=value + attrsD['rel'] = 'license' + if value: + attrsD['href']=value context.setdefault('links', []).append(attrsD) def _start_creativecommons_license(self, attrsD): @@ -1408,29 +1582,33 @@ class _FeedParserMixin: value = self.pop('license') context = self._getContext() attrsD = FeedParserDict() - attrsD['rel']='license' - if value: attrsD['href']=value + attrsD['rel'] = 'license' + if value: + attrsD['href'] = value context.setdefault('links', []).append(attrsD) del context['license'] _end_creativeCommons_license = _end_creativecommons_license - def _addXFN(self, relationships, href, name): - context = self._getContext() - xfn = context.setdefault('xfn', []) - value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) - if value not in xfn: - xfn.append(value) - def _addTag(self, term, scheme, label): context = self._getContext() tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) + if (not term) and (not scheme) and (not label): + return + value = FeedParserDict(term=term, scheme=scheme, label=label) if value not in tags: tags.append(value) + def _start_tags(self, attrsD): + # This is a completely-made up element. Its semantics are determined + # only by a single feed that precipitated bug report 392 on Google Code. + # In short, this is junk code. 
+ self.push('tags', 1) + + def _end_tags(self): + for term in self.pop('tags').split(','): + self._addTag(term.strip(), None, None) + def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) term = attrsD.get('term') scheme = attrsD.get('scheme', attrsD.get('domain')) label = attrsD.get('label') @@ -1444,8 +1622,14 @@ class _FeedParserMixin: self._start_category(attrsD) def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) + for term in self.pop('itunes_keywords').split(','): + if term.strip(): + self._addTag(term.strip(), 'http://www.itunes.com/', None) + + def _end_media_keywords(self): + for term in self.pop('media_keywords').split(','): + if term.strip(): + self._addTag(term.strip(), None, None) def _start_itunes_category(self, attrsD): self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) @@ -1453,7 +1637,8 @@ class _FeedParserMixin: def _end_category(self): value = self.pop('category') - if not value: return + if not value: + return context = self._getContext() tags = context['tags'] if value and len(tags) and not tags[-1]['term']: @@ -1476,64 +1661,66 @@ class _FeedParserMixin: attrsD.setdefault('type', 'text/html') context = self._getContext() attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): + if 'href' in attrsD: attrsD['href'] = self.resolveURI(attrsD['href']) expectingText = self.infeed or self.inentry or self.insource context.setdefault('links', []) if not (self.inentry and self.inimage): context['links'].append(FeedParserDict(attrsD)) - if attrsD.has_key('href'): + if 'href' in attrsD: expectingText = 0 if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): context['link'] = attrsD['href'] else: self.push('link', expectingText) - _start_producturl = _start_link def _end_link(self): value = self.pop('link') - context = self._getContext() - _end_producturl = _end_link def _start_guid(self, attrsD): self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') self.push('id', 1) + _start_id = _start_guid def _end_guid(self): value = self.pop('id') - self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) + self._save('guidislink', self.guidislink and 'link' not in self._getContext()) if self.guidislink: # guid acts as link, but only if 'ispermalink' is not present or is 'true', # and only if the item doesn't already have a link element self._save('link', value) + _end_id = _end_guid def _start_title(self, attrsD): - if self.svgOK: return self.unknown_starttag('title', attrsD.items()) + if self.svgOK: + return self.unknown_starttag('title', list(attrsD.items())) self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) _start_dc_title = _start_title _start_media_title = _start_title def _end_title(self): - if self.svgOK: return + if self.svgOK: + return value = self.popContent('title') - if not value: return - context = self._getContext() - self.hasTitle = 1 + if not value: + return + self.title_depth = self.depth _end_dc_title = _end_title def _end_media_title(self): - hasTitle = self.hasTitle + title_depth = self.title_depth self._end_title() - self.hasTitle = hasTitle + self.title_depth = title_depth def _start_description(self, attrsD): context = self._getContext() - if context.has_key('summary'): + if 'summary' in context: self._summaryKey = 'content' self._start_content(attrsD) else: 
self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) _start_dc_description = _start_description + _start_media_description = _start_description def _start_abstract(self, attrsD): self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) @@ -1546,6 +1733,7 @@ class _FeedParserMixin: self._summaryKey = None _end_abstract = _end_description _end_dc_description = _end_description + _end_media_description = _end_description def _start_info(self, attrsD): self.pushContent('info', attrsD, 'text/plain', 1) @@ -1558,7 +1746,7 @@ class _FeedParserMixin: def _start_generator(self, attrsD): if attrsD: attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): + if 'href' in attrsD: attrsD['href'] = self.resolveURI(attrsD['href']) self._getContext()['generator_detail'] = FeedParserDict(attrsD) self.push('generator', 1) @@ -1566,7 +1754,7 @@ class _FeedParserMixin: def _end_generator(self): value = self.pop('generator') context = self._getContext() - if context.has_key('generator_detail'): + if 'generator_detail' in context: context['generator_detail']['name'] = value def _start_admin_generatoragent(self, attrsD): @@ -1586,7 +1774,7 @@ class _FeedParserMixin: def _start_summary(self, attrsD): context = self._getContext() - if context.has_key('summary'): + if 'summary' in context: self._summaryKey = 'content' self._start_content(attrsD) else: @@ -1605,22 +1793,22 @@ class _FeedParserMixin: def _start_enclosure(self, attrsD): attrsD = self._itsAnHrefDamnIt(attrsD) context = self._getContext() - attrsD['rel']='enclosure' + attrsD['rel'] = 'enclosure' context.setdefault('links', []).append(FeedParserDict(attrsD)) def _start_source(self, attrsD): if 'url' in attrsD: - # This means that we're processing a source element from an RSS 2.0 feed - self.sourcedata['href'] = attrsD[u'url'] + # This means that we're processing a source element from an RSS 2.0 feed + self.sourcedata['href'] = attrsD['url'] self.push('source', 1) self.insource = 1 - self.hasTitle = 0 + self.title_depth = -1 def _end_source(self): self.insource = 0 value = self.pop('source') if value: - self.sourcedata['title'] = value + self.sourcedata['title'] = value self._getContext()['source'] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() @@ -1631,9 +1819,6 @@ class _FeedParserMixin: self.contentparams['src'] = src self.push('content', 1) - def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) - def _start_body(self, attrsD): self.pushContent('content', attrsD, 'application/xhtml+xml', 1) _start_xhtml_body = _start_body @@ -1652,12 +1837,13 @@ class _FeedParserMixin: _end_xhtml_body = _end_content _end_content_encoded = _end_content _end_fullitem = _end_content - _end_prodlink = _end_content def _start_itunes_image(self, attrsD): self.push('itunes_image', 0) if attrsD.get('href'): self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) + elif attrsD.get('url'): + self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')}) _start_itunes_link = _start_itunes_image def _end_itunes_block(self): @@ -1671,6 +1857,55 @@ class _FeedParserMixin: # by applications that only need to know if the content is explicit. 
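+        # (The tuple index below evaluates to 2 for 'yes', 1 for 'clean',
+        # and 0 for anything else.)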
self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0] + def _start_media_group(self, attrsD): + # don't do anything, but don't break the enclosed tags either + pass + + def _start_media_rating(self, attrsD): + context = self._getContext() + context.setdefault('media_rating', attrsD) + self.push('rating', 1) + + def _end_media_rating(self): + rating = self.pop('rating') + if rating is not None and rating.strip(): + context = self._getContext() + context['media_rating']['content'] = rating + + def _start_media_credit(self, attrsD): + context = self._getContext() + context.setdefault('media_credit', []) + context['media_credit'].append(attrsD) + self.push('credit', 1) + + def _end_media_credit(self): + credit = self.pop('credit') + if credit != None and len(credit.strip()) != 0: + context = self._getContext() + context['media_credit'][-1]['content'] = credit + + def _start_media_restriction(self, attrsD): + context = self._getContext() + context.setdefault('media_restriction', attrsD) + self.push('restriction', 1) + + def _end_media_restriction(self): + restriction = self.pop('restriction') + if restriction != None and len(restriction.strip()) != 0: + context = self._getContext() + context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')] + + def _start_media_license(self, attrsD): + context = self._getContext() + context.setdefault('media_license', attrsD) + self.push('license', 1) + + def _end_media_license(self): + license = self.pop('license') + if license != None and len(license.strip()) != 0: + context = self._getContext() + context['media_license']['content'] = license + def _start_media_content(self, attrsD): context = self._getContext() context.setdefault('media_content', []) @@ -1686,7 +1921,7 @@ class _FeedParserMixin: url = self.pop('url') context = self._getContext() if url != None and len(url.strip()) != 0: - if not context['media_thumbnail'][-1].has_key('url'): + if 'url' not in context['media_thumbnail'][-1]: context['media_thumbnail'][-1]['url'] = url def _start_media_player(self, attrsD): @@ -1709,10 +1944,29 @@ class _FeedParserMixin: return context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip()) + def _start_psc_chapters(self, attrsD): + if self.psc_chapters_flag is None: + # Transition from None -> True + self.psc_chapters_flag = True + attrsD['chapters'] = [] + self._getContext()['psc_chapters'] = FeedParserDict(attrsD) + + def _end_psc_chapters(self): + # Transition from True -> False + self.psc_chapters_flag = False + + def _start_psc_chapter(self, attrsD): + if self.psc_chapters_flag: + start = self._getAttribute(attrsD, 'start') + attrsD['start_parsed'] = _parse_psc_chapter_start(start) + + context = self._getContext()['psc_chapters'] + context['chapters'].append(FeedParserDict(attrsD)) + + if _XML_AVAILABLE: class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 @@ -1720,14 +1974,18 @@ if _XML_AVAILABLE: self.decls = {} def startPrefixMapping(self, prefix, uri): + if not uri: + return + # Jython uses '' instead of None; standardize on None + prefix = prefix or None self.trackNamespace(prefix, uri) - if uri == 'http://www.w3.org/1999/xlink': - self.decls['xmlns:'+prefix] = uri + if prefix and uri == 
'http://www.w3.org/1999/xlink': + self.decls['xmlns:' + prefix] = uri def startElementNS(self, name, qname, attrs): namespace, localname = name lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') <> -1: + if lowernamespace.find('backend.userland.com/rss') != -1: # match any backend.userland.com namespace namespace = 'http://backend.userland.com/rss' lowernamespace = namespace @@ -1736,8 +1994,8 @@ if _XML_AVAILABLE: else: givenprefix = None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): - raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix + if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse: + raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) localname = str(localname).lower() # qname implementation is horribly broken in Python 2.1 (it @@ -1756,13 +2014,12 @@ if _XML_AVAILABLE: if prefix: localname = prefix.lower() + ':' + localname elif namespace and not qname: #Expat - for name,value in self.namespacesInUse.items(): - if name and value == namespace: - localname = name + ':' + localname - break - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) + for name,value in list(self.namespacesInUse.items()): + if name and value == namespace: + localname = name + ':' + localname + break - for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): + for (namespace, attrlocalname), attrvalue in list(attrs.items()): lowernamespace = (namespace or '').lower() prefix = self._matchnamespaces.get(lowernamespace, '') if prefix: @@ -1770,7 +2027,8 @@ if _XML_AVAILABLE: attrsD[str(attrlocalname).lower()] = attrvalue for qname in attrs.getQNames(): attrsD[str(qname).lower()] = attrs.getValueByQName(qname) - self.unknown_starttag(localname, attrsD.items()) + localname = str(localname).lower() + self.unknown_starttag(localname, list(attrsD.items())) def characters(self, text): self.handle_data(text) @@ -1786,10 +2044,10 @@ if _XML_AVAILABLE: if prefix: localname = prefix + ':' + localname elif namespace and not qname: #Expat - for name,value in self.namespacesInUse.items(): - if name and value == namespace: - localname = name + ':' + localname - break + for name,value in list(self.namespacesInUse.items()): + if name and value == namespace: + localname = name + ':' + localname + break localname = str(localname).lower() self.unknown_endtag(localname) @@ -1797,6 +2055,9 @@ if _XML_AVAILABLE: self.bozo = 1 self.exc = exc + # drv_libxml2 calls warning() in some cases + warning = error + def fatalError(self, exc): self.error(exc) raise exc @@ -1804,16 +2065,15 @@ if _XML_AVAILABLE: class _BaseHTMLProcessor(sgmllib.SGMLParser): special = re.compile('''[<>'"]''') bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") - elements_no_end_tag = [ + elements_no_end_tag = set([ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' - ] + ]) def __init__(self, encoding, _type): self.encoding = encoding self._type = _type - if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) 
sgmllib.SGMLParser.__init__(self) def reset(self): @@ -1827,8 +2087,21 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): else: return '<' + tag + '>' + # By declaring these methods and overriding their compiled code + # with the code from sgmllib, the original code will execute in + # feedparser's scope instead of sgmllib's. This means that the + # `tagfind` and `charref` regular expressions will be found as + # they're declared above, not as they're declared in sgmllib. + def goahead(self, i): + pass + goahead.__code__ = sgmllib.SGMLParser.goahead.__code__ + + def __parse_starttag(self, i): + pass + __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__ + def parse_starttag(self,i): - j=sgmllib.SGMLParser.parse_starttag(self, i) + j = self.__parse_starttag(i) if self._type == 'application/xhtml+xml': if j>2 and self.rawdata[j-2:j]=='/>': self.unknown_endtag(self.lasttag) @@ -1836,7 +2109,6 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): def feed(self, data): data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) data = data.replace(''', "'") data = data.replace('"', '"') @@ -1846,15 +2118,16 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): raise NameError self.encoding = self.encoding + '_INVALID_PYTHON_3' except NameError: - if self.encoding and type(data) == type(u''): + if self.encoding and isinstance(data, str): data = data.encode(self.encoding) sgmllib.SGMLParser.feed(self, data) sgmllib.SGMLParser.close(self) def normalize_attrs(self, attrs): - if not attrs: return attrs + if not attrs: + return attrs # utility method to be called by descendants - attrs = dict([(k.lower(), v) for k, v in attrs]).items() + attrs = list(dict([(k.lower(), v) for k, v in attrs]).items()) attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] attrs.sort() return attrs @@ -1863,7 +2136,6 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for
<pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
         uattrs = []
         strattrs=''
         if attrs:
@@ -1871,77 +2143,74 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
                 value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
                 value = self.bare_ampersand.sub("&amp;", value)
                 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-                if type(value) != type(u''):
-                    try:
-                        value = unicode(value, self.encoding)
-                    except:
-                        value = unicode(value, 'iso-8859-1')
+                if not isinstance(value, str):
+                    value = value.decode(self.encoding, 'ignore')
                 try:
                     # Currently, in Python 3 the key is already a str, and cannot be decoded again
-                    uattrs.append((unicode(key, self.encoding), value))
+                    uattrs.append((str(key, self.encoding), value))
                 except TypeError:
                     uattrs.append((key, value))
-            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
+            strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs])
             if self.encoding:
                 try:
-                    strattrs=strattrs.encode(self.encoding)
-                except:
+                    strattrs = strattrs.encode(self.encoding)
+                except (UnicodeEncodeError, LookupError):
                     pass
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+            self.pieces.append('<%s%s />' % (tag, strattrs))
         else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+            self.pieces.append('<%s%s>' % (tag, strattrs))
 
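+    # Worked example of the attribute escaping above (an assumed input, not
+    # part of the original patch): attrs = [('alt', 'x < y & "z"')] comes out
+    # as alt="x &lt; y &amp; &quot;z&quot;", while an already-escaped value
+    # such as 'x &amp; y' passes through unchanged, because bare_ampersand
+    # only rewrites ampersands that do not start an entity.
+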
     def unknown_endtag(self, tag):
         # called for each end tag, e.g. for </pre>
, tag will be 'pre' # Reconstruct the original end tag. if tag not in self.elements_no_end_tag: - self.pieces.append("" % locals()) + self.pieces.append("" % tag) def handle_charref(self, ref): # called for each character reference, e.g. for ' ', ref will be '160' # Reconstruct the original character reference. + ref = ref.lower() if ref.startswith('x'): - value = unichr(int(ref[1:],16)) + value = int(ref[1:], 16) else: - value = unichr(int(ref)) + value = int(ref) - if value in _cp1252.keys(): + if value in _cp1252: self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:]) else: - self.pieces.append('&#%(ref)s;' % locals()) + self.pieces.append('&#%s;' % ref) def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' # Reconstruct the original entity reference. - if name2codepoint.has_key(ref): - self.pieces.append('&%(ref)s;' % locals()) + if ref in name2codepoint or ref == 'apos': + self.pieces.append('&%s;' % ref) else: - self.pieces.append('&%(ref)s' % locals()) + self.pieces.append('&%s' % ref) def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. - if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_data, text=%s\n' % text) self.pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g. # Reconstruct the original comment. - self.pieces.append('' % locals()) + self.pieces.append('' % text) def handle_pi(self, text): # called for each processing instruction, e.g. # Reconstruct original processing instruction. - self.pieces.append('' % locals()) + self.pieces.append('' % text) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. 
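         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         #     "http://www.w3.org/TR/html4/loose.dtd">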
# # Reconstruct original DOCTYPE - self.pieces.append('' % locals()) + self.pieces.append('' % text) _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match def _scan_name(self, i, declstartpos): @@ -1999,439 +2268,24 @@ class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor): data = data.replace('"', '"') data = data.replace(''', ''') data = data.replace(''', ''') - if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): + if not self.contentparams.get('type', 'xml').endswith('xml'): data = data.replace('<', '<') data = data.replace('>', '>') data = data.replace('&', '&') data = data.replace('"', '"') data = data.replace(''', "'") + data = data.replace('/', '/') + data = data.replace('/', '/') return data def strattrs(self, attrs): return ''.join([' %s="%s"' % (n,v.replace('"','"')) for n,v in attrs]) -class _MicroformatsParser: - STRING = 1 - DATE = 2 - URI = 3 - NODE = 4 - EMAIL = 5 - - known_xfn_relationships = ['contact', 'acquaintance', 'friend', 'met', 'co-worker', 'coworker', 'colleague', 'co-resident', 'coresident', 'neighbor', 'child', 'parent', 'sibling', 'brother', 'sister', 'spouse', 'wife', 'husband', 'kin', 'relative', 'muse', 'crush', 'date', 'sweetheart', 'me'] - known_binary_extensions = ['zip','rar','exe','gz','tar','tgz','tbz2','bz2','z','7z','dmg','img','sit','sitx','hqx','deb','rpm','bz2','jar','rar','iso','bin','msi','mp2','mp3','ogg','ogm','mp4','m4v','m4a','avi','wma','wmv'] - - def __init__(self, data, baseuri, encoding): - self.document = BeautifulSoup.BeautifulSoup(data) - self.baseuri = baseuri - self.encoding = encoding - if type(data) == type(u''): - data = data.encode(encoding) - self.tags = [] - self.enclosures = [] - self.xfn = [] - self.vcard = None - - def vcardEscape(self, s): - if type(s) in (type(''), type(u'')): - s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n') - return s - - def vcardFold(self, s): - s = re.sub(';+$', '', s) - sFolded = '' - iMax = 75 - sPrefix = '' - while len(s) > iMax: - sFolded += sPrefix + s[:iMax] + '\n' - s = s[iMax:] - sPrefix = ' ' - iMax = 74 - sFolded += sPrefix + s - return sFolded - - def normalize(self, s): - return re.sub(r'\s+', ' ', s).strip() - - def unique(self, aList): - results = [] - for element in aList: - if element not in results: - results.append(element) - return results - - def toISO8601(self, dt): - return time.strftime('%Y-%m-%dT%H:%M:%SZ', dt) - - def getPropertyValue(self, elmRoot, sProperty, iPropertyType=4, bAllowMultiple=0, bAutoEscape=0): - all = lambda x: 1 - sProperty = sProperty.lower() - bFound = 0 - bNormalize = 1 - propertyMatch = {'class': re.compile(r'\b%s\b' % sProperty)} - if bAllowMultiple and (iPropertyType != self.NODE): - snapResults = [] - containers = elmRoot(['ul', 'ol'], propertyMatch) - for container in containers: - snapResults.extend(container('li')) - bFound = (len(snapResults) != 0) - if not bFound: - snapResults = elmRoot(all, propertyMatch) - bFound = (len(snapResults) != 0) - if (not bFound) and (sProperty == 'value'): - snapResults = elmRoot('pre') - bFound = (len(snapResults) != 0) - bNormalize = not bFound - if not bFound: - snapResults = [elmRoot] - bFound = (len(snapResults) != 0) - arFilter = [] - if sProperty == 'vcard': - snapFilter = elmRoot(all, propertyMatch) - for node in snapFilter: - if node.findParent(all, propertyMatch): - arFilter.append(node) - arResults = [] - for node in snapResults: - if node not in arFilter: - arResults.append(node) - bFound = (len(arResults) != 0) - 
if not bFound: - if bAllowMultiple: return [] - elif iPropertyType == self.STRING: return '' - elif iPropertyType == self.DATE: return None - elif iPropertyType == self.URI: return '' - elif iPropertyType == self.NODE: return None - else: return None - arValues = [] - for elmResult in arResults: - sValue = None - if iPropertyType == self.NODE: - if bAllowMultiple: - arValues.append(elmResult) - continue - else: - return elmResult - sNodeName = elmResult.name.lower() - if (iPropertyType == self.EMAIL) and (sNodeName == 'a'): - sValue = (elmResult.get('href') or '').split('mailto:').pop().split('?')[0] - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'abbr'): - sValue = elmResult.get('title') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (iPropertyType == self.URI): - if sNodeName == 'a': sValue = elmResult.get('href') - elif sNodeName == 'img': sValue = elmResult.get('src') - elif sNodeName == 'object': sValue = elmResult.get('data') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if (not sValue) and (sNodeName == 'img'): - sValue = elmResult.get('alt') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if not sValue: - sValue = elmResult.renderContents() - sValue = re.sub(r'<\S[^>]*>', '', sValue) - sValue = sValue.replace('\r\n', '\n') - sValue = sValue.replace('\r', '\n') - if sValue: - sValue = bNormalize and self.normalize(sValue) or sValue.strip() - if not sValue: continue - if iPropertyType == self.DATE: - sValue = _parse_date_iso8601(sValue) - if bAllowMultiple: - arValues.append(bAutoEscape and self.vcardEscape(sValue) or sValue) - else: - return bAutoEscape and self.vcardEscape(sValue) or sValue - return arValues - - def findVCards(self, elmRoot, bAgentParsing=0): - sVCards = '' - - if not bAgentParsing: - arCards = self.getPropertyValue(elmRoot, 'vcard', bAllowMultiple=1) - else: - arCards = [elmRoot] - - for elmCard in arCards: - arLines = [] - - def processSingleString(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.STRING, bAutoEscape=1).decode(self.encoding) - if sValue: - arLines.append(self.vcardFold(sProperty.upper() + ':' + sValue)) - return sValue or u'' - - def processSingleURI(sProperty): - sValue = self.getPropertyValue(elmCard, sProperty, self.URI) - if sValue: - sContentType = '' - sEncoding = '' - sValueKey = '' - if sValue.startswith('data:'): - sEncoding = ';ENCODING=b' - sContentType = sValue.split(';')[0].split('/').pop() - sValue = sValue.split(',', 1).pop() - else: - elmValue = self.getPropertyValue(elmCard, sProperty) - if elmValue: - if sProperty != 'url': - sValueKey = ';VALUE=uri' - sContentType = elmValue.get('type', '').strip().split('/').pop().strip() - sContentType = sContentType.upper() - if sContentType == 'OCTET-STREAM': - sContentType = '' - if sContentType: - sContentType = ';TYPE=' + sContentType.upper() - arLines.append(self.vcardFold(sProperty.upper() + sEncoding + sContentType + sValueKey + ':' + sValue)) - - def processTypeValue(sProperty, arDefaultType, arForceType=None): - arResults = self.getPropertyValue(elmCard, sProperty, bAllowMultiple=1) - for elmResult in arResults: - arType = self.getPropertyValue(elmResult, 'type', self.STRING, 1, 1) - if arForceType: - arType = self.unique(arForceType + arType) - if not arType: - arType = arDefaultType - sValue = self.getPropertyValue(elmResult, 'value', self.EMAIL, 0) - if sValue: - 
arLines.append(self.vcardFold(sProperty.upper() + ';TYPE=' + ','.join(arType) + ':' + sValue)) - - # AGENT - # must do this before all other properties because it is destructive - # (removes nested class="vcard" nodes so they don't interfere with - # this vcard's other properties) - arAgent = self.getPropertyValue(elmCard, 'agent', bAllowMultiple=1) - for elmAgent in arAgent: - if re.compile(r'\bvcard\b').search(elmAgent.get('class')): - sAgentValue = self.findVCards(elmAgent, 1) + '\n' - sAgentValue = sAgentValue.replace('\n', '\\n') - sAgentValue = sAgentValue.replace(';', '\\;') - if sAgentValue: - arLines.append(self.vcardFold('AGENT:' + sAgentValue)) - # Completely remove the agent element from the parse tree - elmAgent.extract() - else: - sAgentValue = self.getPropertyValue(elmAgent, 'value', self.URI, bAutoEscape=1); - if sAgentValue: - arLines.append(self.vcardFold('AGENT;VALUE=uri:' + sAgentValue)) - - # FN (full name) - sFN = processSingleString('fn') - - # N (name) - elmName = self.getPropertyValue(elmCard, 'n') - if elmName: - sFamilyName = self.getPropertyValue(elmName, 'family-name', self.STRING, bAutoEscape=1) - sGivenName = self.getPropertyValue(elmName, 'given-name', self.STRING, bAutoEscape=1) - arAdditionalNames = self.getPropertyValue(elmName, 'additional-name', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'additional-names', self.STRING, 1, 1) - arHonorificPrefixes = self.getPropertyValue(elmName, 'honorific-prefix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-prefixes', self.STRING, 1, 1) - arHonorificSuffixes = self.getPropertyValue(elmName, 'honorific-suffix', self.STRING, 1, 1) + self.getPropertyValue(elmName, 'honorific-suffixes', self.STRING, 1, 1) - arLines.append(self.vcardFold('N:' + sFamilyName + ';' + - sGivenName + ';' + - ','.join(arAdditionalNames) + ';' + - ','.join(arHonorificPrefixes) + ';' + - ','.join(arHonorificSuffixes))) - elif sFN: - # implied "N" optimization - # http://microformats.org/wiki/hcard#Implied_.22N.22_Optimization - arNames = self.normalize(sFN).split() - if len(arNames) == 2: - bFamilyNameFirst = (arNames[0].endswith(',') or - len(arNames[1]) == 1 or - ((len(arNames[1]) == 2) and (arNames[1].endswith('.')))) - if bFamilyNameFirst: - arLines.append(self.vcardFold('N:' + arNames[0] + ';' + arNames[1])) - else: - arLines.append(self.vcardFold('N:' + arNames[1] + ';' + arNames[0])) - - # SORT-STRING - sSortString = self.getPropertyValue(elmCard, 'sort-string', self.STRING, bAutoEscape=1) - if sSortString: - arLines.append(self.vcardFold('SORT-STRING:' + sSortString)) - - # NICKNAME - arNickname = self.getPropertyValue(elmCard, 'nickname', self.STRING, 1, 1) - if arNickname: - arLines.append(self.vcardFold('NICKNAME:' + ','.join(arNickname))) - - # PHOTO - processSingleURI('photo') - - # BDAY - dtBday = self.getPropertyValue(elmCard, 'bday', self.DATE) - if dtBday: - arLines.append(self.vcardFold('BDAY:' + self.toISO8601(dtBday))) - - # ADR (address) - arAdr = self.getPropertyValue(elmCard, 'adr', bAllowMultiple=1) - for elmAdr in arAdr: - arType = self.getPropertyValue(elmAdr, 'type', self.STRING, 1, 1) - if not arType: - arType = ['intl','postal','parcel','work'] # default adr types, see RFC 2426 section 3.2.1 - sPostOfficeBox = self.getPropertyValue(elmAdr, 'post-office-box', self.STRING, 0, 1) - sExtendedAddress = self.getPropertyValue(elmAdr, 'extended-address', self.STRING, 0, 1) - sStreetAddress = self.getPropertyValue(elmAdr, 'street-address', self.STRING, 0, 1) - sLocality = 
self.getPropertyValue(elmAdr, 'locality', self.STRING, 0, 1) - sRegion = self.getPropertyValue(elmAdr, 'region', self.STRING, 0, 1) - sPostalCode = self.getPropertyValue(elmAdr, 'postal-code', self.STRING, 0, 1) - sCountryName = self.getPropertyValue(elmAdr, 'country-name', self.STRING, 0, 1) - arLines.append(self.vcardFold('ADR;TYPE=' + ','.join(arType) + ':' + - sPostOfficeBox + ';' + - sExtendedAddress + ';' + - sStreetAddress + ';' + - sLocality + ';' + - sRegion + ';' + - sPostalCode + ';' + - sCountryName)) - - # LABEL - processTypeValue('label', ['intl','postal','parcel','work']) - - # TEL (phone number) - processTypeValue('tel', ['voice']) - - # EMAIL - processTypeValue('email', ['internet'], ['internet']) - - # MAILER - processSingleString('mailer') - - # TZ (timezone) - processSingleString('tz') - - # GEO (geographical information) - elmGeo = self.getPropertyValue(elmCard, 'geo') - if elmGeo: - sLatitude = self.getPropertyValue(elmGeo, 'latitude', self.STRING, 0, 1) - sLongitude = self.getPropertyValue(elmGeo, 'longitude', self.STRING, 0, 1) - arLines.append(self.vcardFold('GEO:' + sLatitude + ';' + sLongitude)) - - # TITLE - processSingleString('title') - - # ROLE - processSingleString('role') - - # LOGO - processSingleURI('logo') - - # ORG (organization) - elmOrg = self.getPropertyValue(elmCard, 'org') - if elmOrg: - sOrganizationName = self.getPropertyValue(elmOrg, 'organization-name', self.STRING, 0, 1) - if not sOrganizationName: - # implied "organization-name" optimization - # http://microformats.org/wiki/hcard#Implied_.22organization-name.22_Optimization - sOrganizationName = self.getPropertyValue(elmCard, 'org', self.STRING, 0, 1) - if sOrganizationName: - arLines.append(self.vcardFold('ORG:' + sOrganizationName)) - else: - arOrganizationUnit = self.getPropertyValue(elmOrg, 'organization-unit', self.STRING, 1, 1) - arLines.append(self.vcardFold('ORG:' + sOrganizationName + ';' + ';'.join(arOrganizationUnit))) - - # CATEGORY - arCategory = self.getPropertyValue(elmCard, 'category', self.STRING, 1, 1) + self.getPropertyValue(elmCard, 'categories', self.STRING, 1, 1) - if arCategory: - arLines.append(self.vcardFold('CATEGORIES:' + ','.join(arCategory))) - - # NOTE - processSingleString('note') - - # REV - processSingleString('rev') - - # SOUND - processSingleURI('sound') - - # UID - processSingleString('uid') - - # URL - processSingleURI('url') - - # CLASS - processSingleString('class') - - # KEY - processSingleURI('key') - - if arLines: - arLines = [u'BEGIN:vCard',u'VERSION:3.0'] + arLines + [u'END:vCard'] - sVCards += u'\n'.join(arLines) + u'\n' - - return sVCards.strip() - - def isProbablyDownloadable(self, elm): - attrsD = elm.attrMap - if not attrsD.has_key('href'): return 0 - linktype = attrsD.get('type', '').strip() - if linktype.startswith('audio/') or \ - linktype.startswith('video/') or \ - (linktype.startswith('application/') and not linktype.endswith('xml')): - return 1 - path = urlparse.urlparse(attrsD['href'])[2] - if path.find('.') == -1: return 0 - fileext = path.split('.').pop().lower() - return fileext in self.known_binary_extensions - - def findTags(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile(r'\btag\b')}): - href = elm.get('href') - if not href: continue - urlscheme, domain, path, params, query, fragment = \ - urlparse.urlparse(_urljoin(self.baseuri, href)) - segments = path.split('/') - tag = segments.pop() - if not tag: - tag = segments.pop() - tagscheme = urlparse.urlunparse((urlscheme, domain, '/'.join(segments), '', 
'', '')) - if not tagscheme.endswith('/'): - tagscheme += '/' - self.tags.append(FeedParserDict({"term": tag, "scheme": tagscheme, "label": elm.string or ''})) - - def findEnclosures(self): - all = lambda x: 1 - enclosure_match = re.compile(r'\benclosure\b') - for elm in self.document(all, {'href': re.compile(r'.+')}): - if not enclosure_match.search(elm.get('rel', '')) and not self.isProbablyDownloadable(elm): continue - if elm.attrMap not in self.enclosures: - self.enclosures.append(elm.attrMap) - if elm.string and not elm.get('title'): - self.enclosures[-1]['title'] = elm.string - - def findXFN(self): - all = lambda x: 1 - for elm in self.document(all, {'rel': re.compile('.+'), 'href': re.compile('.+')}): - rels = elm.get('rel', '').split() - xfn_rels = [] - for rel in rels: - if rel in self.known_xfn_relationships: - xfn_rels.append(rel) - if xfn_rels: - self.xfn.append({"relationships": xfn_rels, "href": elm.get('href', ''), "name": elm.string}) - -def _parseMicroformats(htmlSource, baseURI, encoding): - if not BeautifulSoup: return - if _debug: sys.stderr.write('entering _parseMicroformats\n') - try: - p = _MicroformatsParser(htmlSource, baseURI, encoding) - except UnicodeEncodeError: - # sgmllib throws this exception when performing lookups of tags - # with non-ASCII characters in them. - return - p.vcard = p.findVCards(p.document) - p.findTags() - p.findEnclosures() - p.findXFN() - return {"tags": p.tags, "enclosures": p.enclosures, "xfn": p.xfn, "vcard": p.vcard} - class _RelativeURIResolver(_BaseHTMLProcessor): - relative_uris = [('a', 'href'), + relative_uris = set([('a', 'href'), ('applet', 'codebase'), ('area', 'href'), + ('audio', 'src'), ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'), @@ -2453,25 +2307,26 @@ class _RelativeURIResolver(_BaseHTMLProcessor): ('object', 'data'), ('object', 'usemap'), ('q', 'cite'), - ('script', 'src')] + ('script', 'src'), + ('source', 'src'), + ('video', 'poster'), + ('video', 'src')]) def __init__(self, baseuri, encoding, _type): _BaseHTMLProcessor.__init__(self, encoding, _type) self.baseuri = baseuri def resolveURI(self, uri): - return _makeSafeAbsoluteURI(_urljoin(self.baseuri, uri.strip())) + return _makeSafeAbsoluteURI(self.baseuri, uri.strip()) def unknown_starttag(self, tag, attrs): - if _debug: - sys.stderr.write('tag: [%s] with attributes: [%s]\n' % (tag, str(attrs))) attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): - if _debug: - sys.stderr.write('entering _resolveRelativeURIs\n') + if not _SGML_AVAILABLE: + return htmlSource p = _RelativeURIResolver(baseURI, encoding, _type) p.feed(htmlSource) @@ -2480,21 +2335,24 @@ def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type): def _makeSafeAbsoluteURI(base, rel=None): # bail if ACCEPTABLE_URI_SCHEMES is empty if not ACCEPTABLE_URI_SCHEMES: - return _urljoin(base, rel or u'') + return _urljoin(base, rel or '') if not base: - return rel or u'' + return rel or '' if not rel: - scheme = urlparse.urlparse(base)[0] + try: + scheme = urllib.parse.urlparse(base)[0] + except ValueError: + return '' if not scheme or scheme in ACCEPTABLE_URI_SCHEMES: return base - return u'' + return '' uri = _urljoin(base, rel) if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES: - return u'' + return '' return uri class _HTMLSanitizer(_BaseHTMLProcessor): - 
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', + acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', @@ -2506,9 +2364,9 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', - 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'] + 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']) - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', + acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', 'background', 'balance', 'bgcolor', 'bgproperties', 'border', 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', @@ -2523,17 +2381,17 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', - 'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max', - 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows', - 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', - 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template', - 'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign', - 'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap', - 'xml:lang'] + 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel', + 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', + 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', + 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', + 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', + 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', + 'width', 'wrap', 'xml:lang']) - unacceptable_elements_with_end_tag = ['script', 'applet', 'style'] + unacceptable_elements_with_end_tag = set(['script', 'applet', 'style']) - acceptable_css_properties = ['azimuth', 'background-color', + acceptable_css_properties = set(['azimuth', 'background-color', 'border-bottom-color', 'border-collapse', 'border-color', 'border-left-color', 'border-right-color', 'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', @@ -2543,45 +2401,178 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align', 'voice-family', 'volume', - 'white-space', 'width'] + 'white-space', 'width']) # survey of common keywords found in feeds - acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', + acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', - 
'transparent', 'underline', 'white', 'yellow'] + 'transparent', 'underline', 'white', 'yellow']) valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') - mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math', - 'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', - 'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', - 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', - 'munderover', 'none', 'semantics'] + mathml_elements = set([ + 'annotation', + 'annotation-xml', + 'maction', + 'maligngroup', + 'malignmark', + 'math', + 'menclose', + 'merror', + 'mfenced', + 'mfrac', + 'mglyph', + 'mi', + 'mlabeledtr', + 'mlongdiv', + 'mmultiscripts', + 'mn', + 'mo', + 'mover', + 'mpadded', + 'mphantom', + 'mprescripts', + 'mroot', + 'mrow', + 'ms', + 'mscarries', + 'mscarry', + 'msgroup', + 'msline', + 'mspace', + 'msqrt', + 'msrow', + 'mstack', + 'mstyle', + 'msub', + 'msubsup', + 'msup', + 'mtable', + 'mtd', + 'mtext', + 'mtr', + 'munder', + 'munderover', + 'none', + 'semantics', + ]) - mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', - 'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth', - 'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows', - 'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', - 'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', - 'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign', - 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', - 'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href', - 'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink'] + mathml_attributes = set([ + 'accent', + 'accentunder', + 'actiontype', + 'align', + 'alignmentscope', + 'altimg', + 'altimg-height', + 'altimg-valign', + 'altimg-width', + 'alttext', + 'bevelled', + 'charalign', + 'close', + 'columnalign', + 'columnlines', + 'columnspacing', + 'columnspan', + 'columnwidth', + 'crossout', + 'decimalpoint', + 'denomalign', + 'depth', + 'dir', + 'display', + 'displaystyle', + 'edge', + 'encoding', + 'equalcolumns', + 'equalrows', + 'fence', + 'fontstyle', + 'fontweight', + 'form', + 'frame', + 'framespacing', + 'groupalign', + 'height', + 'href', + 'id', + 'indentalign', + 'indentalignfirst', + 'indentalignlast', + 'indentshift', + 'indentshiftfirst', + 'indentshiftlast', + 'indenttarget', + 'infixlinebreakstyle', + 'largeop', + 'length', + 'linebreak', + 'linebreakmultchar', + 'linebreakstyle', + 'lineleading', + 'linethickness', + 'location', + 'longdivstyle', + 'lquote', + 'lspace', + 'mathbackground', + 'mathcolor', + 'mathsize', + 'mathvariant', + 'maxsize', + 'minlabelspacing', + 'minsize', + 'movablelimits', + 'notation', + 'numalign', + 'open', + 'other', + 'overflow', + 'position', + 'rowalign', + 'rowlines', + 'rowspacing', + 'rowspan', + 'rquote', + 'rspace', + 'scriptlevel', + 'scriptminsize', + 'scriptsizemultiplier', + 'selection', + 'separator', + 'separators', + 'shift', + 'side', + 'src', + 'stackalign', + 'stretchy', + 'subscriptshift', + 'superscriptshift', + 'symmetric', + 'voffset', + 'width', + 'xlink:href', + 'xlink:show', + 'xlink:type', + 'xmlns', + 'xmlns:xlink', + ]) # svgtiny - foreignObject + linearGradient + radialGradient + stop - svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', + svg_elements = set(['a', 'animate', 'animateColor', 
'animateMotion', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', - 'svg', 'switch', 'text', 'title', 'tspan', 'use'] + 'svg', 'switch', 'text', 'title', 'tspan', 'use']) # svgtiny + class + opacity + offset + xmlns + xmlns:xlink - svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', + svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic', 'arabic-form', 'ascent', 'attributeName', 'attributeType', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', @@ -2607,14 +2598,14 @@ class _HTMLSanitizer(_BaseHTMLProcessor): 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', - 'y2', 'zoomAndPan'] + 'y2', 'zoomAndPan']) svg_attr_map = None svg_elem_map = None - acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', + acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule', 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', - 'stroke-opacity'] + 'stroke-opacity']) def reset(self): _BaseHTMLProcessor.reset(self) @@ -2667,7 +2658,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): # declare xlink namespace, if needed if self.mathmlOK or self.svgOK: - if filter(lambda (n,v): n.startswith('xlink:'),attrs): + if [n_v for n_v in attrs if n_v[0].startswith('xlink:')]: if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs: attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink')) @@ -2676,12 +2667,13 @@ class _HTMLSanitizer(_BaseHTMLProcessor): if key in acceptable_attributes: key=keymap.get(key,key) # make sure the uri uses an acceptable uri scheme - if key == u'href': + if key == 'href': value = _makeSafeAbsoluteURI(value) clean_attrs.append((key,value)) elif key=='style': clean_value = self.sanitize_style(value) - if clean_value: clean_attrs.append((key,clean_value)) + if clean_value: + clean_attrs.append((key,clean_value)) _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) def unknown_endtag(self, tag): @@ -2689,10 +2681,12 @@ class _HTMLSanitizer(_BaseHTMLProcessor): if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 if self.mathmlOK and tag in self.mathml_elements: - if tag == 'math' and self.mathmlOK: self.mathmlOK -= 1 + if tag == 'math' and self.mathmlOK: + self.mathmlOK -= 1 elif self.svgOK and tag in self.svg_elements: tag = self.svg_elem_map.get(tag,tag) - if tag == 'svg' and self.svgOK: self.svgOK -= 1 + if tag == 'svg' and self.svgOK: + self.svgOK -= 1 else: return _BaseHTMLProcessor.unknown_endtag(self, tag) @@ -2712,24 +2706,27 @@ class _HTMLSanitizer(_BaseHTMLProcessor): style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) # gauntlet - if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' + if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): + return '' # This replaced a regexp that used re.match and was prone to pathological back-tracking. 
- if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): return '' + if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): + return '' clean = [] for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): - if not value: continue - if prop.lower() in self.acceptable_css_properties: - clean.append(prop + ': ' + value + ';') - elif prop.split('-')[0].lower() in ['background','border','margin','padding']: - for keyword in value.split(): - if not keyword in self.acceptable_css_keywords and \ - not self.valid_css_values.match(keyword): - break - else: - clean.append(prop + ': ' + value + ';') - elif self.svgOK and prop.lower() in self.acceptable_svg_properties: - clean.append(prop + ': ' + value + ';') + if not value: + continue + if prop.lower() in self.acceptable_css_properties: + clean.append(prop + ': ' + value + ';') + elif prop.split('-')[0].lower() in ['background','border','margin','padding']: + for keyword in value.split(): + if not keyword in self.acceptable_css_keywords and \ + not self.valid_css_values.match(keyword): + break + else: + clean.append(prop + ': ' + value + ';') + elif self.svgOK and prop.lower() in self.acceptable_svg_properties: + clean.append(prop + ': ' + value + ';') return ' '.join(clean) @@ -2747,98 +2744,57 @@ class _HTMLSanitizer(_BaseHTMLProcessor): def _sanitizeHTML(htmlSource, encoding, _type): + if not _SGML_AVAILABLE: + return htmlSource p = _HTMLSanitizer(encoding, _type) htmlSource = htmlSource.replace(''): - data = data.split('>', 1)[1] - if data.count('= '2.3.3' - assert base64 != None - user, passw = _base64decode(req.headers['Authorization'].split(' ')[1]).split(':') - realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] - self.add_password(realm, host, user, passw) - retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) - self.reset_retry_count() - return retry - except: + host = urllib.parse.urlparse(req.get_full_url())[1] + if base64 is None or 'Authorization' not in req.headers \ + or 'WWW-Authenticate' not in headers: return self.http_error_default(req, fp, code, msg, headers) + auth = _base64decode(req.headers['Authorization'].split(' ')[1]) + user, passw = auth.split(':') + realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] + self.add_password(realm, host, user, passw) + retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) + self.reset_retry_count() + return retry def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers): """URL, filename, or string --> stream @@ -2870,15 +2826,15 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if request_headers is supplied it is a dictionary of HTTP request headers that will override the values generated by FeedParser. + + :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`. 
""" if hasattr(url_file_stream_or_string, 'read'): return url_file_stream_or_string - if url_file_stream_or_string == '-': - return sys.stdin - - if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): + if isinstance(url_file_stream_or_string, str) \ + and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'): # Deal with the feed URI scheme if url_file_stream_or_string.startswith('feed:http'): url_file_stream_or_string = url_file_stream_or_string[5:] @@ -2886,50 +2842,78 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:] if not agent: agent = USER_AGENT - # test for inline user:password for basic auth + # Test for inline user:password credentials for HTTP basic auth auth = None - if base64: - urltype, rest = urllib.splittype(url_file_stream_or_string) - realhost, rest = urllib.splithost(rest) + if base64 and not url_file_stream_or_string.startswith('ftp:'): + urltype, rest = urllib.parse.splittype(url_file_stream_or_string) + realhost, rest = urllib.parse.splithost(rest) if realhost: - user_passwd, realhost = urllib.splituser(realhost) + user_passwd, realhost = urllib.parse.splituser(realhost) if user_passwd: url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) auth = base64.standard_b64encode(user_passwd).strip() # iri support - try: - if isinstance(url_file_stream_or_string,unicode): - url_file_stream_or_string = url_file_stream_or_string.encode('idna').decode('utf-8') - else: - url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna').decode('utf-8') - except: - pass + if isinstance(url_file_stream_or_string, str): + url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string) # try to open with urllib2 (to use optional headers) request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers) - opener = apply(urllib2.build_opener, tuple(handlers + [_FeedURLHandler()])) + opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()])) opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent try: - return opener.open(request, timeout=15) + return opener.open(request) finally: opener.close() # JohnD # try to open with native open function (if url_file_stream_or_string is a filename) try: return open(url_file_stream_or_string, 'rb') - except: + except (IOError, UnicodeEncodeError, TypeError): + # if url_file_stream_or_string is a unicode object that + # cannot be converted to the encoding returned by + # sys.getfilesystemencoding(), a UnicodeEncodeError + # will be thrown + # If url_file_stream_or_string is a string that contains NULL + # (such as an XML document encoded in UTF-32), TypeError will + # be thrown. 
pass # treat url_file_stream_or_string as string - return _StringIO(str(url_file_stream_or_string)) + if isinstance(url_file_stream_or_string, str): + return _StringIO(url_file_stream_or_string.encode('utf-8')) + return _StringIO(url_file_stream_or_string) + +def _convert_to_idn(url): + """Convert a URL to IDN notation""" + # this function should only be called with a unicode string + # strategy: if the host cannot be encoded in ascii, then + # it'll be necessary to encode it in idn form + parts = list(urllib.parse.urlsplit(url)) + try: + parts[1].encode('ascii') + except UnicodeEncodeError: + # the url needs to be converted to idn notation + host = parts[1].rsplit(':', 1) + newhost = [] + port = '' + if len(host) == 2: + port = host.pop() + for h in host[0].split('.'): + newhost.append(h.encode('idna').decode('utf-8')) + parts[1] = '.'.join(newhost) + if port: + parts[1] += ':' + port + return urllib.parse.urlunsplit(parts) + else: + return url def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers): - request = urllib2.Request(url) + request = urllib.request.Request(url) request.add_header('User-Agent', agent) if etag: request.add_header('If-None-Match', etag) - if type(modified) == type(''): + if isinstance(modified, str): modified = _parse_date(modified) elif isinstance(modified, datetime.datetime): modified = modified.utctimetuple() @@ -2957,11 +2941,22 @@ def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_h request.add_header('Accept', ACCEPT_HEADER) # use this for whatever -- cookies, special headers, etc # [('Cookie','Something'),('x-special-header','Another Value')] - for header_name, header_value in request_headers.items(): + for header_name, header_value in list(request_headers.items()): request.add_header(header_name, header_value) request.add_header('A-IM', 'feed') # RFC 3229 support return request +def _parse_psc_chapter_start(start): + FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$' + + m = re.compile(FORMAT).match(start) + if m is None: + return None + + _, h, m, s, _, ms = m.groups() + h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0)) + return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000) + _date_handlers = [] def registerDateHandler(func): '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' @@ -3004,14 +2999,18 @@ try: del regex except NameError: pass + def _parse_date_iso8601(dateString): '''Parse a variety of ISO-8601-compatible formats like 20040105''' m = None for _iso8601_match in _iso8601_matches: m = _iso8601_match(dateString) - if m: break - if not m: return - if m.span() == (0, 0): return + if m: + break + if not m: + return + if m.span() == (0, 0): + return params = m.groupdict() ordinal = params.get('ordinal', 0) if ordinal: @@ -3049,7 +3048,7 @@ def _parse_date_iso8601(dateString): day = int(day) # special case of the century - is the first year of the 21st century # 2000 or 2001 ? The debate goes on... - if 'century' in params.keys(): + if 'century' in params: year = (int(params['century']) - 1) * 100 + 1 # in ISO 8601 most fields are optional for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: @@ -3081,34 +3080,35 @@ def _parse_date_iso8601(dateString): registerDateHandler(_parse_date_iso8601) # 8-bit date handling routines written by ytrewq1. 
-_korean_year = u'\ub144' # b3e2 in euc-kr -_korean_month = u'\uc6d4' # bff9 in euc-kr -_korean_day = u'\uc77c' # c0cf in euc-kr -_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr +_korean_year = '\ub144' # b3e2 in euc-kr +_korean_month = '\uc6d4' # bff9 in euc-kr +_korean_day = '\uc77c' # c0cf in euc-kr +_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr +_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr _korean_onblog_date_re = \ re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ (_korean_year, _korean_month, _korean_day)) _korean_nate_date_re = \ - re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ + re.compile('(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ (_korean_am, _korean_pm)) def _parse_date_onblog(dateString): '''Parse a string according to the OnBlog 8-bit date format''' m = _korean_onblog_date_re.match(dateString) - if not m: return + if not m: + return w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ 'zonediff': '+09:00'} - if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_onblog) def _parse_date_nate(dateString): '''Parse a string according to the Nate 8-bit date format''' m = _korean_nate_date_re.match(dateString) - if not m: return + if not m: + return hour = int(m.group(5)) ampm = m.group(4) if (ampm == _korean_pm): @@ -3120,521 +3120,726 @@ def _parse_date_nate(dateString): {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ 'zonediff': '+09:00'} - if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_nate) -_mssql_date_re = \ - re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') -def _parse_date_mssql(dateString): - '''Parse a string according to the MS SQL date format''' - m = _mssql_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) - return _parse_date_w3dtf(w3dtfdate) -registerDateHandler(_parse_date_mssql) - # Unicode strings for Greek date strings _greek_months = \ { \ - u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7 - u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7 - u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7 - u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7 - u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7 - u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7 - u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7 - u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7 - u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7 - u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7 - u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7 - u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7 - u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7 - u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7 - 
u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7 - u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7 - u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7 - u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7 + '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 + '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 + '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 + '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 + '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 + '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 + '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 + '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 + '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 + '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 + '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 + '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 + '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 + '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 + '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 + '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 + '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 + '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 + '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 } _greek_wdays = \ { \ - u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7 - u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7 - u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7 - u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7 - u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7 - u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7 - u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7 + '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 + '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 + '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 + '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 + '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 + '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 + '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 } _greek_date_format_re = \ - re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') + re.compile('([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') def _parse_date_greek(dateString): '''Parse a string according to a Greek 8-bit date format.''' m = _greek_date_format_re.match(dateString) - if not m: return - try: - wday = _greek_wdays[m.group(1)] - month = _greek_months[m.group(3)] - except: + if not m: return + wday = _greek_wdays[m.group(1)] + month = _greek_months[m.group(3)] rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ 'zonediff': m.group(8)} - if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) return _parse_date_rfc822(rfc822date) registerDateHandler(_parse_date_greek) # Unicode strings for Hungarian date strings _hungarian_months = \ { \ - u'janu\u00e1r': u'01', # e1 in iso-8859-2 - u'febru\u00e1ri': u'02', # e1 in iso-8859-2 - u'm\u00e1rcius': u'03', # e1 in iso-8859-2 - u'\u00e1prilis': u'04', # e1 in iso-8859-2 - u'm\u00e1ujus': u'05', # e1 in iso-8859-2 - u'j\u00fanius': u'06', # fa in iso-8859-2 - u'j\u00falius': u'07', # fa in iso-8859-2 - u'augusztus': u'08', - u'szeptember': u'09', - u'okt\u00f3ber': u'10', # f3 in iso-8859-2 - u'november': u'11', - u'december': u'12', + 'janu\u00e1r': 
'01', # e1 in iso-8859-2 + 'febru\u00e1ri': '02', # e1 in iso-8859-2 + 'm\u00e1rcius': '03', # e1 in iso-8859-2 + '\u00e1prilis': '04', # e1 in iso-8859-2 + 'm\u00e1ujus': '05', # e1 in iso-8859-2 + 'j\u00fanius': '06', # fa in iso-8859-2 + 'j\u00falius': '07', # fa in iso-8859-2 + 'augusztus': '08', + 'szeptember': '09', + 'okt\u00f3ber': '10', # f3 in iso-8859-2 + 'november': '11', + 'december': '12', } _hungarian_date_format_re = \ - re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') + re.compile('(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') def _parse_date_hungarian(dateString): '''Parse a string according to a Hungarian 8-bit date format.''' m = _hungarian_date_format_re.match(dateString) - if not m: return - try: - month = _hungarian_months[m.group(2)] - day = m.group(3) - if len(day) == 1: - day = '0' + day - hour = m.group(4) - if len(hour) == 1: - hour = '0' + hour - except: - return + if not m or m.group(2) not in _hungarian_months: + return None + month = _hungarian_months[m.group(2)] + day = m.group(3) + if len(day) == 1: + day = '0' + day + hour = m.group(4) + if len(hour) == 1: + hour = '0' + hour w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ {'year': m.group(1), 'month': month, 'day': day,\ 'hour': hour, 'minute': m.group(5),\ 'zonediff': m.group(6)} - if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) return _parse_date_w3dtf(w3dtfdate) registerDateHandler(_parse_date_hungarian) -# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by -# Drake and licensed under the Python license. Removed all range checking -# for month, day, hour, minute, and second, since mktime will normalize -# these later -def _parse_date_w3dtf(dateString): - def __extract_date(m): - year = int(m.group('year')) - if year < 100: - year = 100 * int(time.gmtime()[0] / 100) + int(year) - if year < 1000: - return 0, 0, 0 - julian = m.group('julian') - if julian: - julian = int(julian) - month = julian / 30 + 1 - day = julian % 30 + 1 - jday = None - while jday != julian: - t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) - jday = time.gmtime(t)[-2] - diff = abs(jday - julian) - if jday > julian: - if diff < day: - day = day - diff - else: - month = month - 1 - day = 31 - elif jday < julian: - if day + diff < 28: - day = day + diff - else: - month = month + 1 - return year, month, day - month = m.group('month') - day = 1 - if month is None: - month = 1 - else: - month = int(month) - day = m.group('day') - if day: - day = int(day) - else: - day = 1 - return year, month, day +timezonenames = { + 'ut': 0, 'gmt': 0, 'z': 0, + 'adt': -3, 'ast': -4, 'at': -4, + 'edt': -4, 'est': -5, 'et': -5, + 'cdt': -5, 'cst': -6, 'ct': -6, + 'mdt': -6, 'mst': -7, 'mt': -7, + 'pdt': -7, 'pst': -8, 'pt': -8, + 'a': -1, 'n': 1, + 'm': -12, 'y': 12, +} +# W3 date and time format parser +# http://www.w3.org/TR/NOTE-datetime +# Also supports MSSQL-style datetimes as defined at: +# http://msdn.microsoft.com/en-us/library/ms186724.aspx +# (basically, allow a space as a date/time/timezone separator) +def _parse_date_w3dtf(datestr): + if not datestr.strip(): + return None + parts = datestr.lower().split('t') + if len(parts) == 1: + # This may be a date only, or may be an MSSQL-style date + parts = parts[0].split() + if len(parts) == 1: + # Treat this as a date only + parts.append('00:00:00z') + elif len(parts) > 2: + return None + date = parts[0].split('-', 2) + if not date or len(date[0]) != 4: + return None + 
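    # Editor's worked examples for the branches above (illustrative):
+    #   '2003-12-31T10:14:55Z'  -> parts == ['2003-12-31', '10:14:55z']
+    #   '2003-12-31 10:14:55'   -> MSSQL-style; split on whitespace instead
+    #   '2003-12-31'            -> date only; '00:00:00z' is appended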
+    # Ensure that `date` has 3 elements. Using '1' sets the default
+    # month to January and the default day to the 1st of the month.
+    date.extend(['1'] * (3 - len(date)))
+    try:
+        year, month, day = [int(i) for i in date]
+    except ValueError:
+        # `date` may have more than 3 elements or may contain
+        # non-integer strings.
+        return None
+    if parts[1].endswith('z'):
+        parts[1] = parts[1][:-1]
+        parts.append('z')
+    # Append the numeric timezone offset, if any, to parts.
+    # If this is an MSSQL-style date then parts[2] already contains
+    # the timezone information, so `append()` will not affect it.
+    # Add 1 to each value so that if `find()` returns -1 it will be
+    # treated as False.
+    loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
+    loc = loc - 1
+    parts.append(parts[1][loc:])
+    parts[1] = parts[1][:loc]
+    time = parts[1].split(':', 2)
+    # Ensure that time has 3 elements. Using '0' means that the
+    # minutes and seconds, if missing, will default to 0.
+    time.extend(['0'] * (3 - len(time)))
+    tzhour = 0
+    tzmin = 0
+    if parts[2][:1] in ('-', '+'):
+        try:
+            tzhour = int(parts[2][1:3])
+            tzmin = int(parts[2][4:])
+        except ValueError:
+            return None
+        if parts[2].startswith('-'):
+            tzhour = tzhour * -1
+            tzmin = tzmin * -1
+    else:
+        tzhour = timezonenames.get(parts[2], 0)
+    try:
+        hour, minute, second = [int(float(i)) for i in time]
+    except ValueError:
+        return None
+    # Create the datetime object and timezone delta objects
+    try:
+        stamp = datetime.datetime(year, month, day, hour, minute, second)
+    except ValueError:
+        return None
+    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
+    # Return the date and timestamp in a UTC 9-tuple
+    try:
+        return (stamp - delta).utctimetuple()
+    except (OverflowError, ValueError):
+        # IronPython throws ValueErrors instead of OverflowErrors
+        return None

-    def __extract_time(m):
-        if not m:
-            return 0, 0, 0
-        hours = m.group('hours')
-        if not hours:
-            return 0, 0, 0
-        hours = int(hours)
-        minutes = int(m.group('minutes'))
-        seconds = m.group('seconds')
-        if seconds:
-            seconds = int(seconds)
-        else:
-            seconds = 0
-        return hours, minutes, seconds
-
-    def __extract_tzd(m):
-        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
-        if not m:
-            return 0
-        tzd = m.group('tzd')
-        if not tzd:
-            return 0
-        if tzd == 'Z':
-            return 0
-        hours = int(m.group('tzdhours'))
-        minutes = m.group('tzdminutes')
-        if minutes:
-            minutes = int(minutes)
-        else:
-            minutes = 0
-        offset = (hours*60 + minutes) * 60
-        if tzd[0] == '+':
-            return -offset
-        return offset
-
-    __date_re = ('(?P<year>\d\d\d\d)'
-                 '(?:(?P<dsep>-|)'
-                 '(?:(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?'
-                 '|(?P<julian>\d\d\d)))?')
-    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
-    __tzd_rx = re.compile(__tzd_re)
-    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
-                 '(?:(?P=tsep)(?P<seconds>\d\d)(?:[.,]\d+)?)?'
-                 + __tzd_re)
-    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
-    __datetime_rx = re.compile(__datetime_re)
-    m = __datetime_rx.match(dateString)
-    if (m is None) or (m.group() != dateString): return
-    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
-    if gmt[0] == 0: return
-    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
 registerDateHandler(_parse_date_w3dtf)

-def _parse_date_rfc822(dateString):
-    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
-    data = dateString.split()
-    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
-        del data[0]
-    if len(data) == 4:
-        s = data[3]
-        i = s.find('+')
-        if i > 0:
-            data[3:] = [s[:i], s[i+1:]]
+def _parse_date_rfc822(date):
+    """Parse RFC 822 dates and times
+    http://tools.ietf.org/html/rfc822#section-5
+
+    There are some formatting differences that are accounted for:
+    1. Years may be two or four digits.
+    2. The month and day can be swapped.
+    3. Additional timezone names are supported.
+    4. A default time and timezone are assumed if only a date is present.
+    """
+    daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
+    months = {
+        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
+        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
+    }
+
+    parts = date.lower().split()
+    if len(parts) < 5:
+        # Assume that the time and timezone are missing
+        parts.extend(('00:00:00', '0000'))
+    # Remove the day name
+    if parts[0][:3] in daynames:
+        parts = parts[1:]
+    if len(parts) < 5:
+        # If there are still fewer than five parts, there's not enough
+        # information to interpret this
+        return None
+    try:
+        day = int(parts[0])
+    except ValueError:
+        # Check if the day and month are swapped
+        if months.get(parts[0][:3]):
+            try:
+                day = int(parts[1])
+            except ValueError:
+                return None
+            else:
+                parts[1] = parts[0]
         else:
-            data.append('')
-        dateString = " ".join(data)
-    # Account for the Etc/GMT timezone by stripping 'Etc/'
-    elif len(data) == 5 and data[4].lower().startswith('etc/'):
-        data[4] = data[4][4:]
-        dateString = " ".join(data)
-    if len(data) < 5:
-        dateString += ' 00:00:00 GMT'
+            return None
+    month = months.get(parts[1][:3])
+    if not month:
+        return None
+    try:
+        year = int(parts[2])
+    except ValueError:
+        return None
+    # Normalize two-digit years:
+    # Anything in the 90's is interpreted as 1990 and on
+    # Anything 89 or less is interpreted as 2089 or before
+    if len(parts[2]) <= 2:
+        year += (1900, 2000)[year < 90]
+    timeparts = parts[3].split(':')
+    timeparts = timeparts + ([0] * (3 - len(timeparts)))
+    try:
+        (hour, minute, second) = list(map(int, timeparts))
+    except ValueError:
+        return None
+    tzhour = 0
+    tzmin = 0
+    # Strip 'Etc/' from the timezone
+    if parts[4].startswith('etc/'):
+        parts[4] = parts[4][4:]
+    # Normalize timezones that start with 'gmt':
+    # GMT-05:00 => -0500
+    # GMT => GMT
+    if parts[4].startswith('gmt'):
+        parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
+    # Handle timezones like '-0500', '+0500', and 'EST'
+    if parts[4] and parts[4][0] in ('-', '+'):
+        try:
+            tzhour = int(parts[4][1:3])
+            tzmin = int(parts[4][3:])
+        except ValueError:
+            return None
+        if parts[4].startswith('-'):
+            tzhour = tzhour * -1
+            tzmin = tzmin * -1
+    else:
+        tzhour = timezonenames.get(parts[4], 0)
+    # Create the datetime object and timezone delta objects
+    try:
+        stamp = datetime.datetime(year, month, day, hour, minute, second)
+    except ValueError:
+        return None
+    delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
+    # Return the date and timestamp in a UTC 9-tuple
+    try:
+        return (stamp - delta).utctimetuple()
+    except (OverflowError, ValueError):
+        # IronPython throws ValueErrors instead of OverflowErrors
+        return None
+registerDateHandler(_parse_date_rfc822)
+
+_months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
+def _parse_date_asctime(dt):
+    """Parse asctime-style dates.
+
+    Converts asctime to RFC822-compatible dates and uses the RFC822 parser
+    to do the actual parsing.
+
+    Supported formats (format is standardized to the first one listed):
+
+    * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
+    * {weekday name} {month name} dd hh:mm:ss yyyy
+    """
+
+    parts = dt.split()
+
+    # Insert a GMT timezone, if needed.
+    if len(parts) == 5:
+        parts.insert(4, '+0000')
+
+    # Exit if there are not six parts.
+    if len(parts) != 6:
+        return None
+
+    # Reassemble the parts in an RFC822-compatible order and parse them.
+    return _parse_date_rfc822(' '.join([
+        parts[0], parts[2], parts[1], parts[5], parts[3], parts[4],
+    ]))
+registerDateHandler(_parse_date_asctime)
+
+def _parse_date_perforce(aDateString):
+    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
+    # Fri, 2006/09/15 08:19:53 EDT
+    _my_date_pattern = re.compile( \
+        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
+
+    m = _my_date_pattern.search(aDateString)
+    if m is None:
+        return None
+    dow, year, month, day, hour, minute, second, tz = m.groups()
+    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
+    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
     tm = rfc822.parsedate_tz(dateString)
     if tm:
         return time.gmtime(rfc822.mktime_tz(tm))
-# rfc822.py defines several time zones, but we define some extra ones.
-# 'ET' is equivalent to 'EST', etc.
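# Editor's illustration (not part of the patch): the replacement handlers
# above normalize everything to a UTC 9-tuple, for example:
#
#     _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     # -> (2004, 1, 1, 19, 48, 21)
#     _parse_date_asctime('Sun Jan  4 16:29:06 PST 2004')[:6]
#     # -> (2004, 1, 5, 0, 29, 6)   # PST input, UTC output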
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
-rfc822._timezones.update(_additional_timezones)
-registerDateHandler(_parse_date_rfc822)
-
-def _parse_date_perforce(aDateString):
-    """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
-    # Fri, 2006/09/15 08:19:53 EDT
-    _my_date_pattern = re.compile( \
-        r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
-
-    dow, year, month, day, hour, minute, second, tz = \
-        _my_date_pattern.search(aDateString).groups()
-    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
-    dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
-    tm = rfc822.parsedate_tz(dateString)
-    if tm:
-        return time.gmtime(rfc822.mktime_tz(tm))
 registerDateHandler(_parse_date_perforce)

 def _parse_date(dateString):
     '''Parses a variety of date formats into a 9-tuple in GMT'''
+    if not dateString:
+        return None
     for handler in _date_handlers:
         try:
             date9tuple = handler(dateString)
-            if not date9tuple: continue
-            if len(date9tuple) != 9:
-                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
-                raise ValueError
-            map(int, date9tuple)
-            return date9tuple
-        except Exception, e:
-            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
-            pass
+        except (KeyError, OverflowError, ValueError):
+            continue
+        if not date9tuple:
+            continue
+        if len(date9tuple) != 9:
+            continue
+        return date9tuple
     return None

-def _getCharacterEncoding(http_headers, xml_data):
-    '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?x') in different encodings.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
+def convert_to_utf8(http_headers, data):
+    '''Detect and convert the character encoding to UTF-8.

     http_headers is a dictionary
-    xml_data is a raw string (not Unicode)
+    data is a raw string (not Unicode)'''

-    This is so much trickier than it sounds, it's not even funny.
-    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
-    is application/xml, application/*+xml,
-    application/xml-external-parsed-entity, or application/xml-dtd,
-    the encoding given in the charset parameter of the HTTP Content-Type
-    takes precedence over the encoding given in the XML prefix within the
-    document, and defaults to 'utf-8' if neither are specified.  But, if
-    the HTTP Content-Type is text/xml, text/*+xml, or
-    text/xml-external-parsed-entity, the encoding given in the XML prefix
-    within the document is ALWAYS IGNORED and only the encoding given in
-    the charset parameter of the HTTP Content-Type header should be
-    respected, and it defaults to 'us-ascii' if not specified.
+    # This is so much trickier than it sounds, it's not even funny.
+    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+    # is application/xml, application/*+xml,
+    # application/xml-external-parsed-entity, or application/xml-dtd,
+    # the encoding given in the charset parameter of the HTTP Content-Type
+    # takes precedence over the encoding given in the XML prefix within the
+    # document, and defaults to 'utf-8' if neither are specified.
But, if + # the HTTP Content-Type is text/xml, text/*+xml, or + # text/xml-external-parsed-entity, the encoding given in the XML prefix + # within the document is ALWAYS IGNORED and only the encoding given in + # the charset parameter of the HTTP Content-Type header should be + # respected, and it defaults to 'us-ascii' if not specified. - Furthermore, discussion on the atom-syntax mailing list with the - author of RFC 3023 leads me to the conclusion that any document - served with a Content-Type of text/* and no charset parameter - must be treated as us-ascii. (We now do this.) And also that it - must always be flagged as non-well-formed. (We now do this too.) + # Furthermore, discussion on the atom-syntax mailing list with the + # author of RFC 3023 leads me to the conclusion that any document + # served with a Content-Type of text/* and no charset parameter + # must be treated as us-ascii. (We now do this.) And also that it + # must always be flagged as non-well-formed. (We now do this too.) - If Content-Type is unspecified (input was local file or non-HTTP source) - or unrecognized (server just got it totally wrong), then go by the - encoding given in the XML prefix of the document and default to - 'iso-8859-1' as per the HTTP specification (RFC 2616). + # If Content-Type is unspecified (input was local file or non-HTTP source) + # or unrecognized (server just got it totally wrong), then go by the + # encoding given in the XML prefix of the document and default to + # 'iso-8859-1' as per the HTTP specification (RFC 2616). - Then, assuming we didn't find a character encoding in the HTTP headers - (and the HTTP Content-type allowed us to look in the body), we need - to sniff the first few bytes of the XML data and try to determine - whether the encoding is ASCII-compatible. Section F of the XML - specification shows the way here: - http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info - - If the sniffed encoding is not ASCII-compatible, we need to make it - ASCII compatible so that we can sniff further into the XML declaration - to find the encoding attribute, which will tell us the true encoding. - - Of course, none of this guarantees that we will be able to parse the - feed in the declared character encoding (assuming it was declared - correctly, which many are not). CJKCodecs and iconv_codec help a lot; - you should definitely install them if you can. - http://cjkpython.i18n.org/ - ''' - - def _parseHTTPContentType(content_type): - '''takes HTTP Content-Type header and returns (content type, charset) - - If no charset is specified, returns (content type, '') - If no content type is specified, returns ('', '') - Both return parameters are guaranteed to be lowercase strings - ''' - content_type = content_type or '' - content_type, params = cgi.parse_header(content_type) - return content_type, params.get('charset', '').replace("'", '') - - sniffed_xml_encoding = '' - xml_encoding = '' - true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type', http_headers.get('Content-type'))) - # Must sniff for non-ASCII-compatible character encodings before - # searching for XML declaration. This heuristic is defined in - # section F of the XML specification: + # Then, assuming we didn't find a character encoding in the HTTP headers + # (and the HTTP Content-type allowed us to look in the body), we need + # to sniff the first few bytes of the XML data and try to determine + # whether the encoding is ASCII-compatible. 
Section F of the XML
+    # specification shows the way here:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+    # If the sniffed encoding is not ASCII-compatible, we need to make it
+    # ASCII compatible so that we can sniff further into the XML declaration
+    # to find the encoding attribute, which will tell us the true encoding.
+
+    # Of course, none of this guarantees that we will be able to parse the
+    # feed in the declared character encoding (assuming it was declared
+    # correctly, which many are not). iconv_codec can help a lot;
+    # you should definitely install it if you can.
+    # http://cjkpython.i18n.org/
+
+    bom_encoding = ''
+    xml_encoding = ''
+    rfc3023_encoding = ''
+
+    # Look at the first few bytes of the document to guess what
+    # its encoding may be. We only need to decode enough of the
+    # document that we can use an ASCII-compatible regular
+    # expression to search for an XML encoding declaration.
+    # The heuristic follows the XML specification, section F:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    # Check for BOMs first.
+    if data[:4] == codecs.BOM_UTF32_BE:
+        bom_encoding = 'utf-32be'
+        data = data[4:]
+    elif data[:4] == codecs.BOM_UTF32_LE:
+        bom_encoding = 'utf-32le'
+        data = data[4:]
+    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+        bom_encoding = 'utf-16be'
+        data = data[2:]
+    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+        bom_encoding = 'utf-16le'
+        data = data[2:]
+    elif data[:3] == codecs.BOM_UTF8:
+        bom_encoding = 'utf-8'
+        data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = 'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = 'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = 'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = 'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = 'utf-32le'
+
+    tempdata = data
     try:
-        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
-            # EBCDIC
-            xml_data = _ebcdic_to_ascii(xml_data)
-        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
-            # UTF-16BE
-            sniffed_xml_encoding = 'utf-16be'
-            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16BE with BOM
-            sniffed_xml_encoding = 'utf-16be'
-            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
-            # UTF-16LE
-            sniffed_xml_encoding = 'utf-16le'
-            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16LE with BOM
-            sniffed_xml_encoding = 'utf-16le'
-            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
-            # UTF-32BE
-            sniffed_xml_encoding = 'utf-32be'
-            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
-            # UTF-32LE
-            sniffed_xml_encoding = 'utf-32le'
-            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-            # UTF-32BE with BOM
-            sniffed_xml_encoding = 'utf-32be'
-            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-            # UTF-32LE with BOM
-            sniffed_xml_encoding = 'utf-32le'
-            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-            # UTF-8 with BOM
-            sniffed_xml_encoding = 'utf-8'
-            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-        else:
-            # ASCII-compatible
-            pass
-        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
-    except:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
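+        # (Editor's note: on those interpreters, decoding with a 'utf-32*'
+        # codec raises LookupError, so the declaration match is skipped.)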
xml_encoding_match = None + else: + xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata) + if xml_encoding_match: xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower() - if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding + # Normalize the xml_encoding if necessary. + if bom_encoding and (xml_encoding in ( + 'u16', 'utf-16', 'utf16', 'utf_16', + 'u32', 'utf-32', 'utf32', 'utf_32', + 'iso-10646-ucs-2', 'iso-10646-ucs-4', + 'csucs4', 'csunicode', 'ucs-2', 'ucs-4' + )): + xml_encoding = bom_encoding + + # Find the HTTP Content-Type and, hopefully, a character + # encoding provided by the server. The Content-Type is used + # to choose the "correct" encoding among the BOM encoding, + # XML declaration encoding, and HTTP encoding, following the + # heuristic defined in RFC 3023. + http_content_type = http_headers.get('content-type') or '' + http_content_type, params = cgi.parse_header(http_content_type) + http_encoding = params.get('charset', '').replace("'", "") + if not isinstance(http_encoding, str): + http_encoding = http_encoding.decode('utf-8', 'ignore') + acceptable_content_type = 0 - application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity') + application_content_types = ('application/xml', 'application/xml-dtd', + 'application/xml-external-parsed-entity') text_content_types = ('text/xml', 'text/xml-external-parsed-entity') if (http_content_type in application_content_types) or \ - (http_content_type.startswith('application/') and http_content_type.endswith('+xml')): + (http_content_type.startswith('application/') and + http_content_type.endswith('+xml')): acceptable_content_type = 1 - true_encoding = http_encoding or xml_encoding or 'utf-8' + rfc3023_encoding = http_encoding or xml_encoding or 'utf-8' elif (http_content_type in text_content_types) or \ - (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'): + (http_content_type.startswith('text/') and + http_content_type.endswith('+xml')): acceptable_content_type = 1 - true_encoding = http_encoding or 'us-ascii' + rfc3023_encoding = http_encoding or 'us-ascii' elif http_content_type.startswith('text/'): - true_encoding = http_encoding or 'us-ascii' - elif http_headers and (not (http_headers.has_key('content-type') or http_headers.has_key('Content-type'))): - true_encoding = xml_encoding or 'iso-8859-1' + rfc3023_encoding = http_encoding or 'us-ascii' + elif http_headers and 'content-type' not in http_headers: + rfc3023_encoding = xml_encoding or 'iso-8859-1' else: - true_encoding = xml_encoding or 'utf-8' - # some feeds claim to be gb2312 but are actually gb18030. - # apparently MSIE and Firefox both do the following switch: - if true_encoding.lower() == 'gb2312': - true_encoding = 'gb18030' - return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type + rfc3023_encoding = xml_encoding or 'utf-8' + # gb18030 is a superset of gb2312, so always replace gb2312 + # with gb18030 for greater compatibility. 
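+    # (Editor's illustration: a feed declared as gb2312 that actually uses
+    # gb18030-only code points still decodes, since gb18030 is a superset.)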
+    if rfc3023_encoding.lower() == 'gb2312':
+        rfc3023_encoding = 'gb18030'
+    if xml_encoding.lower() == 'gb2312':
+        xml_encoding = 'gb18030'

-def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
+    # there are four encodings to keep track of:
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+    error = None

-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16be':
-                sys.stderr.write('trying utf-16be instead\n')
-        encoding = 'utf-16be'
-        data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16le':
-                sys.stderr.write('trying utf-16le instead\n')
-        encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-8':
-                sys.stderr.write('trying utf-8 instead\n')
-        encoding = 'utf-8'
-        data = data[3:]
-    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32be':
-                sys.stderr.write('trying utf-32be instead\n')
-        encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-        if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32le':
-                sys.stderr.write('trying utf-32le instead\n')
-        encoding = 'utf-32le'
-        data = data[4:]
-    newdata = unicode(data, encoding)
-    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
-    declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + u'\n' + newdata
-    return newdata.encode('utf-8')
+    if http_headers and (not acceptable_content_type):
+        if 'content-type' in http_headers:
+            msg = '%s is not an XML media type' % http_headers['content-type']
+        else:
+            msg = 'no Content-type specified'
+        error = NonXMLContentType(msg)

-def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+    # determine character encoding
+    known_encoding = 0
+    lazy_chardet_encoding = None
+    tried_encodings = []
+    if chardet:
+        def lazy_chardet_encoding():
+            chardet_encoding = chardet.detect(data)['encoding']
+            if not chardet_encoding:
+                chardet_encoding = ''
+            if not isinstance(chardet_encoding, str):
+                chardet_encoding = str(chardet_encoding, 'ascii', 'ignore')
+            return chardet_encoding
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+                              lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
+        if callable(proposed_encoding):
+            proposed_encoding = proposed_encoding()
+        if not proposed_encoding:
+            continue
+        if proposed_encoding in tried_encodings:
+            continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = data.decode(proposed_encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            known_encoding = 1
+            # Update the encoding in the opening XML processing instruction.
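+            # (Editor's illustration: an existing <?xml version='1.0'
+            # encoding='iso-8859-2'?> declaration is rewritten to declare
+            # utf-8, matching the bytes produced by the encode() below.)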
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + '\n' + data
+            data = data.encode('utf-8')
+            break
+    # if still no luck, give up
+    if not known_encoding:
+        error = CharacterEncodingUnknown(
+            'document encoding unknown, I tried ' +
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+            (rfc3023_encoding, xml_encoding))
+        rfc3023_encoding = ''
+    elif proposed_encoding != rfc3023_encoding:
+        error = CharacterEncodingOverride(
+            'document declared as %s, but parsed as %s' %
+            (rfc3023_encoding, proposed_encoding))
+        rfc3023_encoding = proposed_encoding
+
+    return data, rfc3023_encoding, error
+
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: cubed "&#179;"
+# Example: copyright "(C)"
+# Forbidden: explode1 "&explode2;&explode2;"
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
+def replace_doctype(data):
+    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)

     rss_version may be 'rss091n' or None
-    stripped_data is the same XML document, minus the DOCTYPE
+    stripped_data is the same XML document with a replaced DOCTYPE
     '''
+
+    # Divide the document into two groups by finding the location
+    # of the first element that doesn't begin with '<?' or '<!'.
     start = re.search(_s2bytes('<\w'), data)
     start = start and start.start() or -1
-    head,data = data[:start+1], data[start+1:]
+    head, data = data[:start+1], data[start+1:]

-    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
-    entity_results=entity_pattern.findall(head)
-    head = entity_pattern.sub(_s2bytes(''), head)
-    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
-    doctype_results = doctype_pattern.findall(head)
+    # Save and then remove all of the ENTITY declarations.
+    entity_results = RE_ENTITY_PATTERN.findall(head)
+    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
+
+    # Find the DOCTYPE declaration and check the feed type.
+    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
     doctype = doctype_results and doctype_results[0] or _s2bytes('')
-    if doctype.lower().count(_s2bytes('netscape')):
+    if _s2bytes('netscape') in doctype.lower():
         version = 'rss091n'
     else:
         version = None

-    # only allow in 'safe' inline entity definitions
-    replacement=_s2bytes('')
-    if len(doctype_results)==1 and entity_results:
-        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
-        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
-        if safe_entities:
-            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
-    data = doctype_pattern.sub(replacement, head) + data
+    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
+    replacement = _s2bytes('')
+    if len(doctype_results) == 1 and entity_results:
+        match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+        safe_entities = list(filter(match_safe_entities, entity_results))
+        if safe_entities:
+            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') + _s2bytes('>\n<!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
+    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

-    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+    # Precompute the safe entities for the loose parser.
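+    # (Editor's note: given the example declarations above, this yields a
+    # mapping such as {'copyright': '(C)'}, letting the loose parser expand
+    # &copyright; without re-reading the DOCTYPE.)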
+    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+                         for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+    return version, data, safe_entities

-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[], request_headers={}, response_headers={}):
+
+# GeoRSS geometry parsers. Each returns a dict with 'type' and 'coordinates'
+# items, or None in the case of a parsing error.
+
+def _parse_poslist(value, geom_type, swap=True, dims=2):
+    if geom_type == 'linestring':
+        return _parse_georss_line(value, swap, dims)
+    elif geom_type == 'polygon':
+        ring = _parse_georss_line(value, swap, dims)
+        return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)}
+    else:
+        return None
+
+def _gen_georss_coords(value, swap=True, dims=2):
+    # A generator of (lon, lat) pairs from a string of encoded GeoRSS
+    # coordinates. Converts to floats and swaps order.
+    latlons = map(float, value.strip().replace(',', ' ').split())
+    nxt = latlons.__next__
+    while True:
+        t = [nxt(), nxt()][::swap and -1 or 1]
+        if dims == 3:
+            t.append(nxt())
+        yield tuple(t)
+
+def _parse_georss_point(value, swap=True, dims=2):
+    # A point contains a single latitude-longitude pair, separated by
+    # whitespace. We'll also handle comma separators.
+    try:
+        coords = list(_gen_georss_coords(value, swap, dims))
+        return {'type': 'Point', 'coordinates': coords[0]}
+    except (IndexError, ValueError):
+        return None
+
+def _parse_georss_line(value, swap=True, dims=2):
+    # A line contains a space-separated list of latitude-longitude pairs in
+    # WGS84 coordinate reference system, with each pair separated by
+    # whitespace. There must be at least two pairs.
+    try:
+        coords = list(_gen_georss_coords(value, swap, dims))
+        return {'type': 'LineString', 'coordinates': coords}
+    except (IndexError, ValueError):
+        return None
+
+def _parse_georss_polygon(value, swap=True, dims=2):
+    # A polygon contains a space-separated list of latitude-longitude pairs,
+    # with each pair separated by whitespace. There must be at least four
+    # pairs, with the last being identical to the first (so a polygon has a
+    # minimum of three actual points).
+    try:
+        ring = list(_gen_georss_coords(value, swap, dims))
+    except (IndexError, ValueError):
+        return None
+    if len(ring) < 4:
+        return None
+    return {'type': 'Polygon', 'coordinates': (ring,)}
+
+def _parse_georss_box(value, swap=True, dims=2):
+    # A bounding box is a rectangular region, often used to define the extents
+    # of a map or a rough area of interest. A box contains two space-separated
+    # latitude-longitude pairs, with each pair separated by whitespace. The
+    # first pair is the lower corner, the second is the upper corner.
+    try:
+        coords = list(_gen_georss_coords(value, swap, dims))
+        return {'type': 'Box', 'coordinates': tuple(coords)}
+    except (IndexError, ValueError):
+        return None
+
+# end geospatial parsers
+
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.

     request_headers, if given, is a dict from http header name to value to add
     to the request; this overrides internally generated values.
+
+    :return: A :class:`FeedParserDict`.
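+
+    Example (editor's illustration; the URL is hypothetical)::
+
+        d = parse('http://example.org/feed.xml')
+        print(d.feed.get('title'), len(d.entries), d.bozo)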
''' + + if handlers is None: + handlers = [] + if request_headers is None: + request_headers = {} + if response_headers is None: + response_headers = {} + result = FeedParserDict() result['feed'] = FeedParserDict() result['entries'] = [] - if _XML_AVAILABLE: - result['bozo'] = 0 + result['bozo'] = 0 if not isinstance(handlers, list): handlers = [handlers] try: f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers) data = f.read() - except Exception, e: + except Exception as e: result['bozo'] = 1 result['bozo_exception'] = e data = None @@ -3648,148 +3853,88 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer elif response_headers: result['headers'] = copy.deepcopy(response_headers) + # lowercase all of the HTTP headers for comparisons per RFC 2616 + if 'headers' in result: + http_headers = dict((k.lower(), v) for k, v in list(result['headers'].items())) + else: + http_headers = {} + # if feed is gzip-compressed, decompress it - if f and data and 'headers' in result: - if gzip and result['headers'].get('content-encoding') == 'gzip': + if f and data and http_headers: + if gzip and 'gzip' in http_headers.get('content-encoding', ''): try: data = gzip.GzipFile(fileobj=_StringIO(data)).read() - except Exception, e: - # Some feeds claim to be gzipped but they're not, so - # we get garbage. Ideally, we should re-request the - # feed without the 'Accept-encoding: gzip' header, - # but we don't. + except (IOError, struct.error) as e: + # IOError can occur if the gzip header is bad. + # struct.error can occur if the data is damaged. result['bozo'] = 1 result['bozo_exception'] = e - data = '' - elif zlib and result['headers'].get('content-encoding') == 'deflate': + if isinstance(e, struct.error): + # A gzip header was found but the data is corrupt. + # Ideally, we should re-request the feed without the + # 'Accept-encoding: gzip' header, but we don't. + data = None + elif zlib and 'deflate' in http_headers.get('content-encoding', ''): try: - data = zlib.decompress(data, -zlib.MAX_WBITS) - except Exception, e: - result['bozo'] = 1 - result['bozo_exception'] = e - data = '' + data = zlib.decompress(data) + except zlib.error as e: + try: + # The data may have no headers and no checksum. 
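+                    # (Passing -15 as the second argument makes zlib expect
+                    # a raw deflate stream, without the zlib header and
+                    # checksum that some servers incorrectly omit.)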
+                    data = zlib.decompress(data, -15)
+                except zlib.error as e:
+                    result['bozo'] = 1
+                    result['bozo_exception'] = e

     # save HTTP headers
-    if 'headers' in result:
-        if 'etag' in result['headers'] or 'ETag' in result['headers']:
-            etag = result['headers'].get('etag', result['headers'].get('ETag'))
+    if http_headers:
+        if 'etag' in http_headers:
+            etag = http_headers.get('etag', '')
+            if not isinstance(etag, str):
+                etag = etag.decode('utf-8', 'ignore')
             if etag:
                 result['etag'] = etag
-        if 'last-modified' in result['headers'] or 'Last-Modified' in result['headers']:
-            modified = result['headers'].get('last-modified', result['headers'].get('Last-Modified'))
+        if 'last-modified' in http_headers:
+            modified = http_headers.get('last-modified', '')
             if modified:
-                result['modified'] = _parse_date(modified)
+                result['modified'] = modified
+                result['modified_parsed'] = _parse_date(modified)
     if hasattr(f, 'url'):
-        result['href'] = f.url
+        if not isinstance(f.url, str):
+            result['href'] = f.url.decode('utf-8', 'ignore')
+        else:
+            result['href'] = f.url
         result['status'] = 200
     if hasattr(f, 'status'):
         result['status'] = f.status
     if hasattr(f, 'close'):
         f.close()

-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration