# pysonic/pysonic/podcast.py
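"""
Podcast management for pysonic: a background thread that periodically refreshes subscribed podcast
feeds and downloads the most recent episodes to local storage.
"""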
from threading import Thread, Timer
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import shutil
import logging
import os
import requests
import feedparser
import calendar
class PodcastSettings(object):
"""seconds between updating podcasts"""
    refresh_interval = 60 * 60
"""how many seconds to wait after initialization to start refreshing podcasts"""
startup_delay = 30
"""how many podcasts can be scanned at once"""
scan_threads = 4
"""root path of downloaded podcasts"""
path = "podcasts"
"""how many of the most recent episodes to download"""
download_episodes = 2
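
# PodcastSettings is consumed as a plain class-level namespace (PodcastManager keeps a reference to the
# class itself on self.settings), so values can be adjusted by assigning to the class, e.g.
# PodcastSettings.path = "/var/lib/pysonic/podcasts" (hypothetical path).
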
class PodcastManager(Thread):
def __init__(self, db):
super().__init__()
self.daemon = True
self.db = db
self.settings = PodcastSettings
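        # every item placed on this queue is a request to rescan all podcasts; run() blocks on it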
self.q = Queue()
self.start()
def run(self):
"""
        Loop forever: block on the queue, and treat each queued item as a request to rescan every podcast for new
        episodes.
"""
self.schedule_rescan()
while True:
self.q.get()
self.refresh_podcasts()
def interval_scan(self):
"""
Schedule the next automated rescan. Request a scan be executed.
"""
        self.request_rescan()
        self.schedule_rescan()
def schedule_rescan(self):
"""
Call the next interval scan later
"""
t = Timer(self.settings.refresh_interval, self.interval_scan)
t.daemon = True
t.start()
def request_rescan(self):
"""
Add an item to the queue
"""
self.q.put(None)
def refresh_podcasts(self):
"""
Refresh all the podcasts
"""
logging.info("rescanning podcasts")
        # Episodes still marked "downloading" are stale state left over from a previous run that crashed or was killed
# TODO this should happen earlier than the scan
for entry in self.db.get_podcast_episodes(status="downloading"):
self.db.set_podcast_episode_status(entry['id'], "new")
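        # fan the per-feed refreshes out to a thread pool; worker exceptions are re-raised below once all futures finish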
futures = []
        # TODO the ThreadPoolExecutor's workers aren't daemon threads, so they can keep the process alive at shutdown
with ThreadPoolExecutor(max_workers=self.settings.scan_threads) as pool:
for item in self.db.get_podcasts():
                futures.append(pool.submit(self.refresh_podcast, item))
for item in futures:
e = item.exception()
if e:
raise e
# for item in self.db.get_podcasts():
# self.refresh_podcast(item)
logging.info("podcast refresh complete")
#TODO all episodes in 'new' status change to 'skipped'
def refresh_podcast(self, podcast):
"""
Refresh all metadata and episodes of a single podcast
"""
logging.info("updating podcast %s '%s' ", podcast['id'], podcast['title'])
feed = self.get_feed(podcast['url'])
for entry in feed['entries']:
self.refresh_podcast_entry(podcast['id'], entry)
self.refresh_podcast_episodes(podcast['id'])
#TODO update the feed's description
        # self.update_feed_meta(feed['feed'])
# 'image': {'href': 'http://sysadministrivia.com/images/1.jpg',
# 'link': 'http://sysadministrivia.com/',
# 'links': [{'href': 'http://sysadministrivia.com/',
# 'rel': 'alternate',
# 'type': 'text/html'}],
# 'title': 'The Sysadministrivia Podcast',
# 'title_detail': {'base': '',
# 'language': 'en',
# 'type': 'text/plain',
# 'value': 'The Sysadministrivia Podcast'}},
# 'link': 'http://sysadministrivia.com/',
# 'subtitle': 'We podcast all things system administration/engineering/infosec, '
# 'with a strong focus on GNU/Linux. We use F/OSS software whenever '
# 'possible in the production of these podcasts. Please be sure to '
# 'view our show notes on the site!',
# 'title': 'The Sysadministrivia Podcast',
def refresh_podcast_episodes(self, podcast_id):
"""
        Check that the most recent `download_episodes` (per PodcastSettings) episodes are downloaded. Start downloads
        for any that are not.
"""
for entry in self.db.get_podcast_episodes(podcast_id=podcast_id, limit=self.settings.download_episodes):
if entry["status"] == "new":
self.download_episode(entry)
def download_episode(self, episode):
"""
Download the episode:
- mark status as downloading
- clean up any tmp files from previous failures
- create the dir
- stream the url to temp file
- rename the temp file to final location
- mark episode as downloaded
"""
self.db.set_podcast_episode_status(episode['id'], "downloading")
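        # layout: <settings.path>/<podcast id>/<episode id>.mp3, with a dot-prefixed temp file used while downloading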
ep_dir = os.path.join(self.settings.path, str(episode['podcastid']))
ep_path = os.path.join(ep_dir, "{}.mp3".format(episode['id']))
ep_tmppath = os.path.join(ep_dir, ".{}.mp3".format(episode['id']))
os.makedirs(ep_dir, exist_ok=True)
if os.path.exists(ep_path):
os.unlink(ep_path) # previous failed downloads
if os.path.exists(ep_tmppath):
os.unlink(ep_tmppath) # previous failed downloads
logging.info("fetching %s", episode['url'])
r = requests.get(episode['url'], stream=True)
r.raise_for_status()
with open(ep_tmppath, 'wb') as f:
shutil.copyfileobj(r.raw, f)
os.rename(ep_tmppath, ep_path)
# TODO verify or update MIME from that of the url
self.db.set_podcast_episode_status(episode['id'], "completed")
def get_feed(self, rss_url):
"""
Download the given URL and return a parsed feed
"""
feed_body = requests.get(rss_url, timeout=30)
return feedparser.parse(feed_body.text)
def refresh_podcast_entry(self, podcast_id, entry):
"""
Update the database for the given podcast entry. Add it to the database if it doesn't exist. Note: we use the
episode TITLE as the uniqueness check against the database
"""
existing = self.db.get_podcast_episodes(podcast_id=podcast_id, title=entry['title'])
if existing:
return
# find media file url
url = None
mime = None
for link in entry['links']:
if link['type'] in ["audio/mpeg", "audio/mp3"]: # TODO more formats
url = link['href']
mime = link['type']
break
if not url:
logging.warning("could not find url for episode in podcast %s", podcast_id)
return
# create entry
        # entry['published_parsed'] is a UTC struct_time, so convert it with calendar.timegm;
        # time.mktime would treat it as local time and skew the timestamp by the UTC offset
        ep_id = self.db.add_podcast_episode(podcast_id,
                                            calendar.timegm(entry['published_parsed']),
                                            entry['title'],
                                            entry['summary'],
                                            url,
                                            mime)
logging.info("added episode %s '%s'", ep_id, entry['title'])