# pysonic/pysonic/podcast.py

import logging
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Thread, Timer

import feedparser
import requests


class PodcastSettings(object):
    """Tunables for podcast feed refreshing and episode downloading."""

    # seconds between updating podcasts
    refresh_interval = 3  # 60 * 60
    # how many seconds to wait after initialization to start refreshing podcasts (not referenced in this module)
    startup_delay = 30
    # how many podcasts can be scanned at once
    scan_threads = 4
    # root path of downloaded podcasts
    path = "podcasts"
    # how many of the most recent episodes to download
    download_episodes = 2
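
# The manager stores the PodcastSettings class itself (self.settings = PodcastSettings
# below), so a deployment could adjust these values as plain class attributes before
# constructing the thread. A minimal sketch -- the values shown are only illustrative,
# not project defaults:
#
#   PodcastSettings.refresh_interval = 60 * 60
#   PodcastSettings.path = "/var/lib/pysonic/podcasts"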


class PodcastManager(Thread):
    """
    Background thread that refreshes podcast feeds and downloads recent episodes.
    """

    def __init__(self, db):
        super().__init__()
        self.daemon = True  # don't block interpreter shutdown on this thread
        self.db = db
        self.settings = PodcastSettings
        self.q = Queue()
        self.start()  # the thread starts itself immediately

    def run(self):
        """
        Loop forever: wait for a rescan request (an item on the queue) as the signal
        to refresh all podcasts and scan for new episodes. The first interval scan is
        scheduled at startup.
        """
        self.schedule_rescan()
        while True:
            self.q.get()
            self.refresh_podcasts()

    def interval_scan(self):
        """
        Request that a scan be executed. Re-arming the timer for the next automated
        rescan is currently disabled.
        """
        self.request_rescan()
        # self.schedule_rescan()

    def schedule_rescan(self):
        """
        Arm a one-shot timer that triggers the next interval scan after
        refresh_interval seconds.
        """
        t = Timer(self.settings.refresh_interval, self.interval_scan)
        t.daemon = True
        t.start()

    def request_rescan(self):
        """
        Signal the run() loop to perform a rescan by putting an item on the queue.
        """
        self.q.put(None)

    def refresh_podcasts(self):
        """
        Refresh all the podcasts.
        """
        logging.info("rescanning podcasts")
        # If any episodes are still marked as "downloading", that status is stale state
        # left over from before a crash
        # TODO this should happen earlier than the scan
        for entry in self.db.get_podcast_episodes(status="downloading"):
            self.db.set_podcast_episode_status(entry['id'], "new")
        futures = []
        # TODO the TPE doesn't die as a daemon thread :| (concurrent.futures joins its
        # workers at interpreter exit, so an in-flight refresh can delay shutdown)
        with ThreadPoolExecutor(max_workers=self.settings.scan_threads) as pool:
            for item in self.db.get_podcasts():
                futures.append(pool.submit(self.refresh_podcast, item))
            for item in futures:
                e = item.exception()
                if e:
                    raise e
        # for item in self.db.get_podcasts():
        #     self.refresh_podcast(item)
        logging.info("podcast refresh complete")
        # TODO all episodes in 'new' status change to 'skipped'

    def refresh_podcast(self, podcast):
        """
        Refresh all metadata and episodes of a single podcast.
        """
        logging.info("updating podcast %s '%s'", podcast['id'], podcast['title'])
        feed = self.get_feed(podcast['url'])
        for entry in feed['entries']:
            self.refresh_podcast_entry(podcast['id'], entry)
        self.refresh_podcast_episodes(podcast['id'])
        # TODO update the feed's description
        # self.update_feed_meta(feed['feed'])
        # feed['feed'] looks like:
        # 'image': {'href': 'http://sysadministrivia.com/images/1.jpg',
        #           'link': 'http://sysadministrivia.com/',
        #           'links': [{'href': 'http://sysadministrivia.com/',
        #                      'rel': 'alternate',
        #                      'type': 'text/html'}],
        #           'title': 'The Sysadministrivia Podcast',
        #           'title_detail': {'base': '',
        #                            'language': 'en',
        #                            'type': 'text/plain',
        #                            'value': 'The Sysadministrivia Podcast'}},
        # 'link': 'http://sysadministrivia.com/',
        # 'subtitle': 'We podcast all things system administration/engineering/infosec, '
        #             'with a strong focus on GNU/Linux. We use F/OSS software whenever '
        #             'possible in the production of these podcasts. Please be sure to '
        #             'view our show notes on the site!',
        # 'title': 'The Sysadministrivia Podcast',

    def refresh_podcast_episodes(self, podcast_id):
        """
        Check that the most recent download_episodes episodes are downloaded; start
        downloads for any that are not.
        """
        for entry in self.db.get_podcast_episodes(podcast_id=podcast_id, limit=self.settings.download_episodes):
            if entry["status"] == "new":
                self.download_episode(entry)

    def download_episode(self, episode):
        """
        Download the episode:
        - mark status as downloading
        - clean up any tmp files from previous failures
        - create the dir
        - stream the url to a temp file
        - rename the temp file to its final location
        - mark the episode as downloaded
        """
        self.db.set_podcast_episode_status(episode['id'], "downloading")
        ep_dir = os.path.join(self.settings.path, str(episode['podcastid']))
        ep_path = os.path.join(ep_dir, "{}.mp3".format(episode['id']))
        ep_tmppath = os.path.join(ep_dir, ".{}.mp3".format(episode['id']))
        os.makedirs(ep_dir, exist_ok=True)
        if os.path.exists(ep_path):
            os.unlink(ep_path)  # previous failed download
        if os.path.exists(ep_tmppath):
            os.unlink(ep_tmppath)  # previous failed download
        logging.info("fetching %s", episode['url'])
        r = requests.get(episode['url'], stream=True)
        r.raise_for_status()
        # note: r.raw is the undecoded body stream; fine for mp3 enclosures, but a
        # response with a Content-Encoding would be written out still encoded
        with open(ep_tmppath, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
        os.rename(ep_tmppath, ep_path)
        # TODO verify or update MIME from that of the url
        self.db.set_podcast_episode_status(episode['id'], "completed")

    def get_feed(self, rss_url):
        """
        Download the given URL and return a parsed feed.
        """
        feed_body = requests.get(rss_url, timeout=30)
        return feedparser.parse(feed_body.text)
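
    # For reference, refresh_podcast_entry() below relies on only a few feedparser
    # entry fields. A rough sketch of the shape it expects (values are illustrative,
    # not taken from a real feed):
    #
    #   entry = {'title': 'Episode 1',
    #            'summary': 'Show notes ...',
    #            'published_parsed': time.struct_time(...),
    #            'links': [{'href': 'http://example.com/ep1.mp3',
    #                       'type': 'audio/mpeg'}]}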

    def refresh_podcast_entry(self, podcast_id, entry):
        """
        Update the database for the given podcast entry, adding it if it doesn't
        already exist. Note: the episode TITLE is used as the uniqueness check
        against the database.
        """
        existing = self.db.get_podcast_episodes(podcast_id=podcast_id, title=entry['title'])
        if existing:
            return
        # find the media file url
        url = None
        mime = None
        for link in entry['links']:
            if link['type'] in ["audio/mpeg", "audio/mp3"]:  # TODO more formats
                url = link['href']
                mime = link['type']
                break
        if not url:
            logging.warning("could not find url for episode in podcast %s", podcast_id)
            return
        # create the entry
        ep_id = self.db.add_podcast_episode(podcast_id,
                                            time.mktime(entry['published_parsed']),
                                            entry['title'],
                                            entry['summary'],
                                            url,
                                            mime)
        logging.info("added episode %s '%s'", ep_id, entry['title'])
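

# Usage sketch (not part of the original module): PodcastManager only needs a db
# object exposing the methods used above -- get_podcasts(), get_podcast_episodes(),
# set_podcast_episode_status() and add_podcast_episode(). The import path and
# database class below are assumptions for illustration, not confirmed names:
#
#   from pysonic.database import PysonicDatabase
#
#   db = PysonicDatabase("pysonic.sqlite")
#   manager = PodcastManager(db)   # the thread starts itself in __init__
#   manager.request_rescan()       # force an immediate refresh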