Make LinkTitler a little safer

2015-10-31 15:12:22 -07:00 · 2015-10-31 15:12:22 -07:00 · 2643883a88
parent 80d99a8cb0
commit 2643883a88
2 changed files with 46 additions and 9 deletions
--- a/pyircbot/irccore.py
+++ b/pyircbot/irccore.py
@ -114,8 +114,9 @@ class IRCCore(asynchat.async_chat):
            socket_type = socket.AF_INET6
        socketInfo = socket.getaddrinfo(self.server, self.port, socket_type)
        self.create_socket(socket_type, socket.SOCK_STREAM)
-        
+        self.log.debug("Socket created")
        self.connect(socketInfo[0][4])
+        self.log.debug("Connection established")
        self.asynmap[self._fileno] = self # http://willpython.blogspot.com/2010/08/multiple-event-loops-with-asyncore-and.html
    
    def handle_connect(self):
--- a/pyircbot/modules/LinkTitler.py
+++ b/pyircbot/modules/LinkTitler.py
@ -13,13 +13,14 @@ import re
 import time
 import praw #TODO: enable/disable modules
 import datetime
-from requests import get
+from requests import get,head
 import html.parser
 from threading import Thread

 class LinkTitler(ModuleBase):
    def __init__(self, bot, moduleName):
-        ModuleBase.__init__(self, bot, moduleName);
+        ModuleBase.__init__(self, bot, moduleName)
+        self.REQUEST_SIZE_LIMIT = 10*1024
        self.hooks=[ModuleHook("PRIVMSG", self.searches)]
    
    def searches(self, args, prefix, trailing):
@ -83,15 +84,50 @@ class LinkTitler(ModuleBase):
                if match[0] in done:
                    continue
                done.append(match[0])
-                d = get(match[0])
-                titleMatches = re.findall(r'<title>([^<]+)</title>', d.text, re.I)
-                if len(titleMatches)>0 and d.status_code==200:
-                    h = html.parser.HTMLParser()
-                    title = h.unescape(titleMatches[0]).strip()
-                    if len(title)>0:
+                
+                headers = self.url_headers(match[0])
+                
+                # Don't mess with unknown content types
+                if not "Content-Type" in headers:
+                    continue
+                
+                if "text/html" in headers["Content-Type"]:
+                    # Fetch HTML title
+                    title = self.url_htmltitle(match[0])
+                    if title:
                        self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02" % (sender.nick, title))
+                
+                if "image/" in headers["Content-Type"]:
+                    self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02, %s" % (sender.nick, headers["Content-Type"], str(int(int(headers["Content-Length"])/1024))+"KB" if "Content-Length" in headers else "unknown size"))
+            
            return
    
+    def url_headers(self, url):
+        """HEAD-request url (following redirects) and return the headers
+        attribute of the resulting response, so callers can inspect fields
+        such as Content-Type and Content-Length without fetching the body."""
+        self.log.debug("url_headers(%s)" % (url,))
+        return head(url=url, allow_redirects=True).headers
+    
+    def url_htmltitle(self, url):
+        """Request a page and return the text of its HTML <title>.
+
+        The body is streamed in 1 KiB chunks and reading stops once more
+        than REQUEST_SIZE_LIMIT bytes have been collected, so an oversized
+        response is never pulled into memory in full.
+
+        :param url: address of the page to request
+        :returns: the unescaped, stripped title, or None when the status is
+            not 200 or no non-empty <title> appears in the bytes read
+        """
+        self.log.debug("url_htmltitle(%s)" % (url,))
+        resp = get(url=url, stream=True)
+        try:
+            # No point downloading the body of an error page
+            if resp.status_code != 200:
+                return None
+            # Keep raw bytes: str() on a bytes chunk produces a "b'...'" repr,
+            # which injects junk at every chunk boundary and mangles non-ASCII
+            data = bytearray()
+            for chunk in resp.iter_content(1024):
+                data.extend(chunk)
+                if len(data) > self.REQUEST_SIZE_LIMIT:
+                    break
+            encoding = resp.encoding or "utf-8"
+        finally:
+            # A streamed response keeps its connection checked out until closed
+            resp.close()
+
+        try:
+            text = data.decode(encoding, errors="replace")
+        except LookupError:
+            # The server advertised a charset this interpreter doesn't know
+            text = data.decode("utf-8", errors="replace")
+
+        found = re.search(r'<title>([^<]+)</title>', text, re.I)
+        if found is None:
+            return None
+        # html.unescape replaces the deprecated HTMLParser().unescape
+        title = html.unescape(found.group(1)).strip()
+        return title or None
+    
    # For youtube
    def getISOdurationseconds(self, stamp):
        ISO_8601_period_rx = re.compile(