Fix encoding in Inject plugin by using manual encoding detection; BeautifulSoup's own detection fails in some cases

This commit is contained in:
HAMIDx9 2015-08-23 20:57:18 +04:30
parent 24070afbd0
commit 7a5186750f
2 changed files with 21 additions and 2 deletions

View file

@ -18,6 +18,8 @@
import time
import sys
import re
import chardet
from bs4 import BeautifulSoup
from plugins.plugin import Plugin
@ -52,6 +54,7 @@ class Inject(Plugin):
self.dtable = {}
self.count = 0
def response(self, response, request, data):
ip = response.getClientIP()
@ -59,7 +62,22 @@ class Inject(Plugin):
mime = response.headers['Content-Type']
if self._should_inject(ip, hn) and self._ip_filter(ip) and self._host_filter(hn) and (hn not in self.ip) and ("text/html" in mime):
html = BeautifulSoup(data, "lxml")
if "charset" in mime:
match = re.search('charset=(.*)', mime)
if match:
encoding = match.group(1).strip().replace('"', "")
else:
try:
encoding = chardet.detect(data)["encoding"]
except:
encoding = None
if encoding:
html = BeautifulSoup(data.decode(encoding, "ignore"), "lxml")
else:
html = BeautifulSoup(data, "lxml") # let bs find the encoding
if html.body:
if self.html_url: