diff --git a/pyircbot/irccore.py b/pyircbot/irccore.py
index ac47d51..d71bcde 100644
--- a/pyircbot/irccore.py
+++ b/pyircbot/irccore.py
@@ -114,8 +114,9 @@ class IRCCore(asynchat.async_chat):
socket_type = socket.AF_INET6
socketInfo = socket.getaddrinfo(self.server, self.port, socket_type)
self.create_socket(socket_type, socket.SOCK_STREAM)
-
+ self.log.debug("Socket created")
self.connect(socketInfo[0][4])
+ self.log.debug("Connection established")
self.asynmap[self._fileno] = self # http://willpython.blogspot.com/2010/08/multiple-event-loops-with-asyncore-and.html
def handle_connect(self):
diff --git a/pyircbot/modules/LinkTitler.py b/pyircbot/modules/LinkTitler.py
index 8a48ca9..db100bf 100755
--- a/pyircbot/modules/LinkTitler.py
+++ b/pyircbot/modules/LinkTitler.py
@@ -13,13 +13,14 @@ import re
import time
import praw #TODO: enable/disable modules
import datetime
-from requests import get
+from requests import get,head
import html.parser
from threading import Thread
class LinkTitler(ModuleBase):
def __init__(self, bot, moduleName):
- ModuleBase.__init__(self, bot, moduleName);
+ ModuleBase.__init__(self, bot, moduleName)
+ self.REQUEST_SIZE_LIMIT = 10*1024 # Cap (10 KiB) on how much page data url_htmltitle accumulates before it stops reading
self.hooks=[ModuleHook("PRIVMSG", self.searches)]
def searches(self, args, prefix, trailing):
@@ -83,15 +84,50 @@ class LinkTitler(ModuleBase):
if match[0] in done:
continue
done.append(match[0])
- d = get(match[0])
- titleMatches = re.findall(r'
([^<]+)', d.text, re.I)
- if len(titleMatches)>0 and d.status_code==200:
- h = html.parser.HTMLParser()
- title = h.unescape(titleMatches[0]).strip()
- if len(title)>0:
+
+ headers = self.url_headers(match[0])
+
+ # Don't mess with unknown content types
+ if not "Content-Type" in headers:
+ continue
+
+ if "text/html" in headers["Content-Type"]:
+ # Fetch HTML title
+ title = self.url_htmltitle(match[0])
+ if title:
self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02" % (sender.nick, title))
+
+ if "image/" in headers["Content-Type"]:
+ self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02, %s" % (sender.nick, headers["Content-Type"], str(int(int(headers["Content-Length"])/1024))+"KB" if "Content-Length" in headers else "unknown size"))
+
return
+    def url_headers(self, url):
+        "HEAD requests a url, following redirects, and returns the response headers (e.g. 'Content-Type', 'Content-Length' when the server sends them)"
+        self.log.debug("url_headers(%s)" % (url,))
+        resp = head(url=url, allow_redirects=True, timeout=10)  # bounded wait so an unresponsive host cannot hang the caller forever
+        return resp.headers
+
+    def url_htmltitle(self, url):
+        "Requests page html and returns its unescaped, stripped <title> text, or None if the status is not 200 or no non-empty title is found"
+        self.log.debug("url_htmltitle(%s)" % (url,))
+        resp = get(url=url, stream=True, timeout=10)
+        try:
+            if resp.status_code != 200:
+                return None
+            # Read at most REQUEST_SIZE_LIMIT bytes; a title that appears later than that is not worth the download
+            data = b""
+            for chunk in resp.iter_content(1024):
+                data += chunk
+                if len(data) > self.REQUEST_SIZE_LIMIT:
+                    break
+        finally:
+            resp.close()  # a streamed response keeps its connection checked out until it is closed
+        # Decode the raw bytes once; str() on a bytes chunk yields its "b'...'" repr with escaped newlines and non-ASCII
+        titleMatches = re.findall(r'<title>([^<]+)</title>', data.decode(resp.encoding or "utf-8", "replace"), re.I)
+        title = html.unescape(titleMatches[0]).strip() if titleMatches else ""  # HTMLParser.unescape was removed in Python 3.9
+        return title or None
+
# For youtube
def getISOdurationseconds(self, stamp):
ISO_8601_period_rx = re.compile(