#!/usr/bin/env python
"""
.. module:: LinkTitler
    :synopsis: Fetch titles from links.

.. moduleauthor:: Dave Pedu
"""

from pyircbot.modulebase import ModuleBase, hook
from requests import get
import re
import time
import praw  # TODO: enable/disable modules
import datetime
from requests import head
import html.parser
from threading import Thread


class LinkTitler(ModuleBase):
    """Watch channel messages for links and reply with information about them:
    youtube video details, reddit submission summaries, HTML page titles, or a
    content-type/size line for everything else."""

    def __init__(self, bot, moduleName):
        """
        :param bot: reference to the bot instance
        :param moduleName: name this module is registered under
        """
        ModuleBase.__init__(self, bot, moduleName)
        # Cap on how many bytes of a page body are downloaded while hunting
        # for a <title> tag.
        self.REQUEST_SIZE_LIMIT = 10 * 1024

    @hook("PRIVMSG")
    def searches(self, msg, cmd):
        """PRIVMSG hook: scan the message on a daemon thread so that slow
        HTTP requests never block the bot's main loop."""
        t = Thread(target=self.doLinkTitle, args=(msg.args, msg.prefix.nick, msg.trailing))
        t.daemon = True
        t.start()

    def doLinkTitle(self, args, sender, trailing):
        """Find youtube videos, reddit threads, or generic urls in ``trailing``
        (checked in that priority order; only the first category that matches
        is announced) and PRIVMSG info about each unique match.

        :param args: message args; ``args[0]`` is the reply target (channel)
        :param sender: nick of the message sender
        :param trailing: message body to scan
        """
        # Youtube
        matches = re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)', re.I) \
            .findall(trailing)
        if matches:
            done = []
            for item in matches:
                if item not in done:
                    vidinfo = self.get_video_description(item)
                    if vidinfo:
                        self.bot.act_PRIVMSG(args[0], vidinfo)
                    done.append(item)
            return
        # Reddit threads. BUGFIX: the '.' in redd.it is now escaped so it no
        # longer matches arbitrary characters.
        matches = re.compile(r'(?:reddit\.com/.*?comments/([a-zA-Z0-9]+)/|https?://(www\.)?redd\.it/([a-zA-Z0-9]+))') \
            .findall(trailing)
        # Either [('', '', '2ibrz7')] or [('2ibrz7', '', '')]
        if matches:
            done = []
            for match in matches:
                submissionId = match[0]
                if submissionId == "":
                    submissionId = match[-1]
                if submissionId in done:
                    continue
                done.append(submissionId)
                submission = self.get_reddit_submission(submissionId)
                msg = "šŸ‘½ \x02\x031,15REDDIT\x0f\x02 :: %(title)s \x02on \x02%(domain)s%(nsfw)s\x02 - points " \
                      "\x02%(points)s\x02 (%(percent)sā†‘) - comments \x02%(comments)s\x02 - by \x02%(author)s\x02 on " \
                      "\x02%(date)s\x02" % {
                          "title": submission.title,
                          "domain": submission.domain,
                          "nsfw": "[NSFW]" if submission.over_18 else "",
                          "points": submission.ups,
                          "percent": "%s%%" % int(submission.upvote_ratio * 100),
                          "comments": submission.num_comments,
                          "author": submission.author.name,
                          "date": datetime.datetime.fromtimestamp(submission.created).strftime("%Y.%m.%d")}
                self.bot.act_PRIVMSG(args[0], msg)
            return
        # reddit subscribers
        # subreddits
        # Generic urls. BUGFIX: the original character class contained an
        # unescaped "[]" which terminated the class early and turned the rest
        # of the pattern into literals; '[' and ']' are now escaped so the
        # intended RFC 3986 path characters all match.
        matches = re.compile(r'(https?://([a-zA-Z0-9_\-\.]+/([A-Za-z0-9\-\._~:/?#\[\]@!$&\'\(\)\*\+\,\;=]+)?))') \
            .findall(trailing)
        if matches:
            done = []
            for match in matches:
                if match[0] in done:
                    continue
                done.append(match[0])
                headers = self.url_headers(match[0])
                # Don't mess with unknown content types
                if "Content-Type" not in headers:
                    continue
                if "text/html" in headers["Content-Type"]:
                    # Fetch HTML title
                    title = self.url_htmltitle(match[0])
                    if title:
                        self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02" % (sender, title))
                else:
                    # Unknown types, just print type and size
                    self.bot.act_PRIVMSG(
                        args[0], "%s: \x02%s\x02, %s" %
                        (sender, headers["Content-Type"],
                         self.nicesize(int(headers["Content-Length"])) if "Content-Length" in headers else
                         "unknown size"))
            return

    def get_reddit_submission(self, subid):
        """Return the praw Submission object for a reddit submission id."""
        r = praw.Reddit(**self.config["reddit"])
        return r.submission(id=subid)

    def nicesize(self, numBytes):
        "Return kb or plain bytes"
        if numBytes > 1024:
            return "%skb" % str(int(numBytes / 1024))
        else:
            return "<1kb"

    def url_headers(self, url):
        """
        HEAD requests a url to check content type & length.
        Returns something like: {'type': 'image/jpeg', 'size': '90583'}
        """
        self.log.info("url_headers(%s)" % (url,))
        resp = head(url=url, allow_redirects=True)
        return resp.headers

    def url_htmltitle(self, url):
        "Requests page html and returns title in a safe way"
        self.log.info("url_htmltitle(%s)" % (url,))
        resp = get(url=url, stream=True)
        # Fetch no more than first 10kb
        # if the title isn't seen by then, you're doing it wrong
        data = b""
        for chunk in resp.iter_content(1024):
            data += chunk
            if len(data) > self.REQUEST_SIZE_LIMIT:
                break
        data = data.decode('utf-8', "ignore")
        titleMatches = re.findall(r'<title>([^<]+)', data, re.I)
        statusCodeWhitelist = self.config.get("status_code_whitelist", [200])
        if titleMatches and resp.status_code in statusCodeWhitelist:
            # BUGFIX: HTMLParser.unescape() was removed in Python 3.9; the
            # module-level html.unescape() is the supported replacement.
            title = html.unescape(titleMatches[0]).strip()
            if len(title) > 0:
                return title
        return None

    # For youtube
    def getISOdurationseconds(self, stamp):
        """Parse an ISO 8601 duration (e.g. 'PT4M13S') into a dict of named
        components; absent components are None.

        BUGFIX: the group names had been stripped from the pattern, leaving
        invalid '(?P\\d+)' constructs that raised re.error — restored the
        '(?P<name>...)' named groups the caller reads.
        """
        ISO_8601_period_rx = re.compile(
            'P'                        # designates a period
            r'(?:(?P<years>\d+)Y)?'    # years
            r'(?:(?P<months>\d+)M)?'   # months
            r'(?:(?P<weeks>\d+)W)?'    # weeks
            r'(?:(?P<days>\d+)D)?'     # days
            '(?:T'                     # time part must begin with a T
            r'(?:(?P<hours>\d+)H)?'    # hours
            r'(?:(?P<minutes>\d+)M)?'  # minutes
            r'(?:(?P<seconds>\d+)S)?'  # seconds
            ')?'                       # end of time part
        )
        # http://stackoverflow.com/a/16742742
        return ISO_8601_period_rx.match(stamp).groupdict()

    def _get_video_description_api(self, vid_id):
        """Fetch raw video metadata json from the youtube v3 API."""
        return get('https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s'
                   '&key=%s' % (vid_id, self.config["youtube_api_key"])).json()

    def get_video_description(self, vid_id):
        """Build a one-line IRC-formatted summary of a youtube video, or None
        if the id is unknown to the API."""
        apidata = self._get_video_description_api(vid_id)
        if not apidata['pageInfo']['totalResults']:
            return

        video = apidata['items'][0]
        snippet = video["snippet"]
        duration = self.getISOdurationseconds(video["contentDetails"]["duration"])

        out = '\x02\x031,0You\x0f\x030,4Tube\x02\x0f :: \x02%s\x02' % snippet["title"]

        out += ' - length \x02'
        if duration["hours"] is not None:
            out += '%dh ' % int(duration["hours"])
        if duration["minutes"] is not None:
            out += '%dm ' % int(duration["minutes"])
        if duration["seconds"] is not None:
            out += "%ds\x02" % int(duration["seconds"])

        # BUGFIX: the API may omit likeCount/dislikeCount (youtube removed
        # public dislike counts), and a video with zero votes previously
        # raised ZeroDivisionError — skip the rating segment in those cases.
        likes = float(video["statistics"].get("likeCount", 0))
        dislikes = float(video["statistics"].get("dislikeCount", 0))
        totalvotes = likes + dislikes
        if totalvotes > 0:
            rating = likes / totalvotes
            out += ' - rated \x02%.2f/5\x02' % round(rating * 5, 1)
        out += ' - \x02%s\x02 views' % self.group_int_digits(video["statistics"]["viewCount"])
        # publishedAt sometimes carries fractional seconds and sometimes not;
        # try both formats.
        try:
            upload_time = time.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%S.000Z")
        except ValueError:
            upload_time = time.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
        out += ' - by \x02%s\x02 on \x02%s\x02' % (snippet['channelTitle'],
                                                   time.strftime("%Y.%m.%d", upload_time))

        return out

    def group_int_digits(self, number, delimiter=',', grouping=3):
        """Insert ``delimiter`` every ``grouping`` digits from the right,
        e.g. 1234567 -> '1,234,567'."""
        base = str(number).strip()
        builder = []
        while base:
            builder.append(base[-grouping:])
            base = base[:-grouping]
        builder.reverse()
        return delimiter.join(builder)