Handle content properly: accumulate response chunks as bytes, then decode as UTF-8

This commit is contained in:
dave 2015-10-31 16:55:33 -07:00
parent b07a6202b4
commit bdf0081ba8
1 changed file with 4 additions and 2 deletions

View File

@@ -121,12 +121,14 @@ class LinkTitler(ModuleBase):
resp = get(url=url, stream=True) resp = get(url=url, stream=True)
# Fetch no more than first 10kb # Fetch no more than first 10kb
# if the title isn't seen by then, you're doing it wrong # if the title isn't seen by then, you're doing it wrong
data = "" data = b""
for chunk in resp.iter_content(1024): for chunk in resp.iter_content(1024):
data += str(chunk) data += chunk
if len(data) > self.REQUEST_SIZE_LIMIT: if len(data) > self.REQUEST_SIZE_LIMIT:
break break
data = data.decode('utf-8', "ignore")
titleMatches = re.findall(r'<title>([^<]+)</title>', data, re.I) titleMatches = re.findall(r'<title>([^<]+)</title>', data, re.I)
if len(titleMatches)>0:# and resp.status_code==200: if len(titleMatches)>0:# and resp.status_code==200:
h = html.parser.HTMLParser() h = html.parser.HTMLParser()