Merge pull request #22070 from Chocobo1/py_html_decode

Improve Search engine
This commit is contained in:
Chocobo1 2024-12-29 14:39:11 +08:00 committed by GitHub
commit e740a42366
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,4 +1,4 @@
#VERSION: 1.49 #VERSION: 1.50
# Author: # Author:
# Christophe DUMEZ (chris@qbittorrent.org) # Christophe DUMEZ (chris@qbittorrent.org)
@ -29,12 +29,13 @@
import datetime import datetime
import gzip import gzip
import html.entities import html
import io import io
import os import os
import re import re
import socket import socket
import socks import socks
import ssl
import sys import sys
import tempfile import tempfile
import urllib.error import urllib.error
@ -72,29 +73,16 @@ if "sock_proxy" in os.environ and len(os.environ["sock_proxy"].strip()) > 0:
socket.socket = socks.socksocket # type: ignore[misc] socket.socket = socks.socksocket # type: ignore[misc]
def htmlentitydecode(s: str) -> str: # This is only provided for backward compatibility, new code should not use it
# First convert alpha entities (such as é) htmlentitydecode = html.unescape
# (Inspired from http://mail.python.org/pipermail/python-list/2007-June/443813.html)
def entity2char(m: re.Match[str]) -> str:
entity = m.group(1)
if entity in html.entities.name2codepoint:
return chr(html.entities.name2codepoint[entity])
return " " # Unknown entity: We replace with a space.
t = re.sub('&(%s);' % '|'.join(html.entities.name2codepoint), entity2char, s)
# Then convert numerical entities (such as é)
t = re.sub(r'&#(\d+);', lambda x: chr(int(x.group(1))), t)
# Then convert hexa entities (such as é)
return re.sub(r'&#x(\w+);', lambda x: chr(int(x.group(1), 16)), t)
def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None) -> str: def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data: Optional[Any] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
""" Return the content of the url page as a string """ """ Return the content of the url page as a string """
request = urllib.request.Request(url, request_data, {**headers, **custom_headers}) request = urllib.request.Request(url, request_data, {**headers, **custom_headers})
try: try:
response = urllib.request.urlopen(request) response = urllib.request.urlopen(request, context=ssl_context)
except urllib.error.URLError as errno: except urllib.error.URLError as errno:
print(f"Connection error: {errno.reason}", file=sys.stderr) print(f"Connection error: {errno.reason}", file=sys.stderr)
return "" return ""
@ -117,14 +105,14 @@ def retrieve_url(url: str, custom_headers: Mapping[str, Any] = {}, request_data:
return dataStr return dataStr
def download_file(url: str, referer: Optional[str] = None) -> str: def download_file(url: str, referer: Optional[str] = None, ssl_context: Optional[ssl.SSLContext] = None) -> str:
""" Download file at url and write it to a file, return the path to the file and the url """ """ Download file at url and write it to a file, return the path to the file and the url """
# Download url # Download url
request = urllib.request.Request(url, headers=headers) request = urllib.request.Request(url, headers=headers)
if referer is not None: if referer is not None:
request.add_header('referer', referer) request.add_header('referer', referer)
response = urllib.request.urlopen(request) response = urllib.request.urlopen(request, context=ssl_context)
data = response.read() data = response.read()
# Check if it is gzipped # Check if it is gzipped