pyircbot/pyircbot/modules/LinkTitler.py

207 lines
8.0 KiB
Python
Raw Normal View History

2014-10-05 00:57:20 -07:00
#!/usr/bin/env python
"""
.. module:: LinkTitler
2015-08-31 21:23:16 -07:00
:synopsis: Fetch titles from links.
2014-10-05 00:57:20 -07:00
.. moduleauthor:: Dave Pedu <dave@davepedu.com>
"""
2017-11-27 18:58:20 -08:00
from pyircbot.modulebase import ModuleBase, hook
2014-10-05 00:57:20 -07:00
from requests import get
import re
import time
2017-01-01 14:59:01 -08:00
import praw # TODO: enable/disable modules
2014-10-05 00:57:20 -07:00
import datetime
2017-01-01 14:59:01 -08:00
from requests import head
2014-10-05 00:57:20 -07:00
import html.parser
from threading import Thread
2014-10-05 00:57:20 -07:00
2017-01-01 14:59:01 -08:00
2014-10-05 00:57:20 -07:00
class LinkTitler(ModuleBase):
    """Announce information about links seen in chat.

    Recognizes youtube videos, reddit threads, and generic http(s) urls.
    For generic urls, the html <title> is announced when the resource is
    html; otherwise the content type and size are announced.
    """

    def __init__(self, bot, moduleName):
        ModuleBase.__init__(self, bot, moduleName)
        # Cap on how many page bytes are downloaded when looking for <title>
        self.REQUEST_SIZE_LIMIT = 10 * 1024

    @hook("PRIVMSG")
    def searches(self, msg, cmd):
        """Inspect every channel message for links.

        The lookups perform blocking http requests, so the work runs in a
        daemon thread to keep the bot's main loop responsive.
        """
        t = Thread(target=self.doLinkTitle, args=(msg.args, msg.prefix.nick, msg.trailing))
        t.daemon = True
        t.start()

    def doLinkTitle(self, args, sender, trailing):
        """Find all supported link types in one message and reply with info.

        :param args: PRIVMSG args; args[0] is the channel to reply to
        :param sender: nick of the user who sent the message
        :param trailing: message body to scan for links
        """
        # Youtube
        matches = re.compile(r'(?:youtube.*?(?:v=|/v/)|youtu\.be/|yooouuutuuube.*?id=)([-_a-z0-9]+)', re.I) \
            .findall(trailing)
        if matches:
            done = []
            for item in matches:
                if item not in done:
                    vidinfo = self.get_video_description(item)
                    if vidinfo:
                        self.bot.act_PRIVMSG(args[0], vidinfo)
                    done.append(item)
            return

        # reddit threads
        matches = re.compile(r'(?:reddit\.com/.*?comments/([a-zA-Z0-9]+)/|https?://(www\.)?redd.it/([a-zA-Z0-9]+))') \
            .findall(trailing)
        # Either [('', '', '2ibrz7')] or [('2ibrz7', '', '')]
        if matches:
            done = []
            for match in matches:
                submissionId = match[0]
                if submissionId == "":
                    submissionId = match[-1]
                if submissionId in done:
                    continue
                done.append(submissionId)
                submission = self.get_reddit_submission(submissionId)
                # Deleted accounts come back as author=None in praw
                author = submission.author.name if submission.author else "[deleted]"
                msg = "👽 \x02\x031,15REDDIT\x0f\x02 :: %(title)s \x02on \x02%(domain)s%(nsfw)s\x02 - points " \
                      "\x02%(points)s\x02 (%(percent)s↑) - comments \x02%(comments)s\x02 - by \x02%(author)s\x02 on " \
                      "\x02%(date)s\x02" % {
                          "title": submission.title,
                          "domain": submission.domain,
                          "nsfw": "[NSFW]" if submission.over_18 else "",
                          "points": submission.ups,
                          "percent": "%s%%" % int(submission.upvote_ratio * 100),
                          "comments": submission.num_comments,
                          "author": author,
                          "date": datetime.datetime.fromtimestamp(submission.created).strftime("%Y.%m.%d")
                      }
                self.bot.act_PRIVMSG(args[0], msg)
            return
        # reddit subscribers

        # subreddits

        # generic <title>
        # RFC 3986 pchar set. Note that '[' and ']' must be escaped inside
        # the character class — previously a bare ']' closed the class
        # early, which silently broke matching of URL paths.
        matches = re.compile(r'(https?://([a-zA-Z0-9_\-\.]+/([A-Za-z0-9\-\._~:/?#\[\]@!$&\'()*+,;=]+)?))') \
            .findall(trailing)
        if matches:
            done = []
            for match in matches:
                if match[0] in done:
                    continue
                done.append(match[0])

                headers = self.url_headers(match[0])

                # Don't mess with unknown content types
                if "Content-Type" not in headers:
                    continue

                if "text/html" in headers["Content-Type"]:
                    # Fetch HTML title
                    title = self.url_htmltitle(match[0])
                    if title:
                        self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02" % (sender, title))
                else:
                    # Unknown types, just print type and size
                    self.bot.act_PRIVMSG(args[0], "%s: \x02%s\x02, %s" %
                                         (sender, headers["Content-Type"],
                                          self.nicesize(int(headers["Content-Length"])) if
                                          "Content-Length" in headers else "unknown size"))
            return

    def get_reddit_submission(self, subid):
        """Fetch a reddit submission object by its base36 id."""
        r = praw.Reddit(**self.config["reddit"])
        return r.submission(id=subid)

    def nicesize(self, numBytes):
        """Return a human-readable size: whole kilobytes, or "<1kb"."""
        if numBytes > 1024:
            return "%skb" % str(int(numBytes / 1024))
        else:
            return "<1kb"

    def url_headers(self, url):
        """
        HEAD requests a url to check content type & length.
        Returns the response headers dict-like, e.g.
        {'Content-Type': 'image/jpeg', 'Content-Length': '90583'}
        """
        self.log.info("url_headers(%s)" % (url,))
        # timeout keeps a stalled server from pinning the worker thread forever
        resp = head(url=url, allow_redirects=True, timeout=10)
        return resp.headers

    def url_htmltitle(self, url):
        """Request page html and return its unescaped <title> text, or None."""
        self.log.info("url_htmltitle(%s)" % (url,))
        resp = get(url=url, stream=True, timeout=10)
        # Fetch no more than first 10kb
        # if the title isn't seen by then, you're doing it wrong
        data = b""
        for chunk in resp.iter_content(1024):
            data += chunk
            if len(data) > self.REQUEST_SIZE_LIMIT:
                break

        data = data.decode('utf-8', "ignore")

        titleMatches = re.findall(r'<title>([^<]+)</title>', data, re.I)
        statusCodeWhitelist = self.config.get("status_code_whitelist", [200])
        if titleMatches and resp.status_code in statusCodeWhitelist:
            # html.parser.HTMLParser().unescape() was removed in python 3.9;
            # html.unescape() is the supported replacement.
            title = html.unescape(titleMatches[0]).strip()
            if len(title) > 0:
                return title
        return None

    # For youtube
    def getISOdurationseconds(self, stamp):
        """Parse an ISO 8601 duration (e.g. "PT4M13S") into a dict of parts.

        Components absent from the stamp are None in the returned dict.
        Raw strings are used so the \\d escapes stay valid regex (non-raw
        '\\d' is a deprecated escape sequence in python source).
        """
        ISO_8601_period_rx = re.compile(
            r'P'                       # designates a period
            r'(?:(?P<years>\d+)Y)?'    # years
            r'(?:(?P<months>\d+)M)?'   # months
            r'(?:(?P<weeks>\d+)W)?'    # weeks
            r'(?:(?P<days>\d+)D)?'     # days
            r'(?:T'                    # time part must begin with a T
            r'(?:(?P<hours>\d+)H)?'    # hours
            r'(?:(?P<minutes>\d+)M)?'  # minutes
            r'(?:(?P<seconds>\d+)S)?'  # seconds
            r')?'                      # end of time part
        )  # http://stackoverflow.com/a/16742742
        return ISO_8601_period_rx.match(stamp).groupdict()

    def _get_video_description_api(self, vid_id):
        """Raw youtube data api v3 call for one video id; returns parsed json."""
        return get('https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s'
                   '&key=%s' % (vid_id, self.config["youtube_api_key"]), timeout=10).json()

    def get_video_description(self, vid_id):
        """Build the irc info line for a youtube video, or None if not found."""
        apidata = self._get_video_description_api(vid_id)
        if not apidata['pageInfo']['totalResults']:
            return

        video = apidata['items'][0]
        snippet = video["snippet"]
        duration = self.getISOdurationseconds(video["contentDetails"]["duration"])

        out = '\x02\x031,0You\x0f\x030,4Tube\x02\x0f :: \x02%s\x02' % snippet["title"]

        out += ' - length \x02'
        if duration["hours"] is not None:
            out += '%dh ' % int(duration["hours"])
        if duration["minutes"] is not None:
            out += '%dm ' % int(duration["minutes"])
        if duration["seconds"] is not None:
            out += "%ds\x02" % int(duration["seconds"])

        stats = video["statistics"]
        # likeCount/dislikeCount can be absent (ratings hidden or removed by
        # the api); also guard the previous ZeroDivisionError when a video
        # has no votes at all — in that case the rating segment is omitted.
        likes = float(stats.get("likeCount", 0))
        totalvotes = likes + float(stats.get("dislikeCount", 0))
        if totalvotes:
            rating = likes / totalvotes
            out += ' - rated \x02%.2f/5\x02' % round(rating * 5, 1)
        out += ' - \x02%s\x02 views' % self.group_int_digits(stats["viewCount"])
        # publishedAt appears both with and without fractional seconds
        # depending on api vintage; accept either form.
        published = snippet['publishedAt']
        date_format = "%Y-%m-%dT%H:%M:%S.%fZ" if "." in published else "%Y-%m-%dT%H:%M:%SZ"
        upload_time = datetime.datetime.strptime(published, date_format).timetuple()
        out += ' - by \x02%s\x02 on \x02%s\x02' % (snippet['channelTitle'], time.strftime("%Y.%m.%d", upload_time))

        return out

    def group_int_digits(self, number, delimiter=',', grouping=3):
        """Group digits of a number string, e.g. 1234567 -> '1,234,567'."""
        base = str(number).strip()
        builder = []
        while base:
            builder.append(base[-grouping:])
            base = base[:-grouping]
        builder.reverse()
        return delimiter.join(builder)