import logging
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from threading import Thread, Timer

import feedparser
import requests


class PodcastSettings(object):
    """Static configuration for the podcast manager."""

    # seconds between automatic podcast refreshes
    refresh_interval = 3  # 60 * 60

    # how many seconds to wait after initialization to start refreshing podcasts
    # (note: not currently referenced elsewhere in this module)
    startup_delay = 30

    # how many podcasts can be scanned at once
    scan_threads = 4

    # root path of downloaded podcasts
    path = "podcasts"

    # how many of the most recent episodes of each podcast to download
    download_episodes = 2


class PodcastManager(Thread):
    """Background thread that periodically scans podcast feeds and downloads new episodes."""

    def __init__(self, db):
        super().__init__()
        self.daemon = True

        self.db = db
        self.settings = PodcastSettings
        self.q = Queue()
        self.start()

    def run(self):
        """
        Main loop: block until a scan is requested (an item appears on the queue), then refresh all
        podcasts. The first automatic scan is scheduled on startup.
        """
        self.schedule_rescan()
        while True:
            self.q.get()
            self.refresh_podcasts()

    def interval_scan(self):
        """
        Timer callback: request that a scan be executed. Re-scheduling the next automatic rescan is
        currently disabled.
        """
        self.request_rescan()
        #self.schedule_rescan()

    def schedule_rescan(self):
        """
        Schedule interval_scan to be called after refresh_interval seconds.
        """
        t = Timer(self.settings.refresh_interval, self.interval_scan)
        t.daemon = True
        t.start()

    def request_rescan(self):
        """
        Add an item to the queue
        """
        self.q.put(None)

    def refresh_podcasts(self):
        """
        Refresh all the podcasts
        """
        logging.info("rescanning podcasts")
        # Any episodes still marked as "downloading" are stale state left over from before a crash;
        # reset them so they are retried.
        # TODO this should happen earlier than the scan
        for entry in self.db.get_podcast_episodes(status="downloading"):
            self.db.set_podcast_episode_status(entry['id'], "new")

        futures = []
        # TODO the TPE doesn't die as a daemon thread :|
        with ThreadPoolExecutor(max_workers=self.settings.scan_threads) as pool:
            for item in self.db.get_podcasts():
                futures.append(pool.submit(self.refresh_podcast, item))
            for item in futures:
                e = item.exception()
                if e:
                    raise e
        # for item in self.db.get_podcasts():
        #     self.refresh_podcast(item)
        logging.info("podcast refresh complete")
        # TODO all episodes in 'new' status change to 'skipped'

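    # --- Sketch (not part of the original module) -------------------------------------------------
    # The TODO above notes that re-raising the first exception aborts the whole refresh. One possible
    # alternative, shown here only as an illustration, is to log failures per podcast using
    # concurrent.futures.as_completed so one bad feed doesn't stop the others. The method name
    # `refresh_podcasts_tolerant` is hypothetical; it reuses the same db calls as refresh_podcasts.
    def refresh_podcasts_tolerant(self):
        from concurrent.futures import as_completed  # local import to keep the sketch self-contained
        logging.info("rescanning podcasts")
        with ThreadPoolExecutor(max_workers=self.settings.scan_threads) as pool:
            # map each submitted future back to the podcast it refreshes
            futures = {pool.submit(self.refresh_podcast, item): item for item in self.db.get_podcasts()}
            for future in as_completed(futures):
                podcast = futures[future]
                try:
                    future.result()
                except Exception:
                    logging.exception("failed to refresh podcast %s", podcast['id'])
        logging.info("podcast refresh complete")
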
    def refresh_podcast(self, podcast):
        """
        Refresh all metadata and episodes of a single podcast
        """
        logging.info("updating podcast %s '%s'", podcast['id'], podcast['title'])
        feed = self.get_feed(podcast['url'])
        for entry in feed['entries']:
            self.refresh_podcast_entry(podcast['id'], entry)
        self.refresh_podcast_episodes(podcast['id'])

        # TODO update the feed's description
        # self.update_feed_meta(feed['feed'])
        # 'image': {'href': 'http://sysadministrivia.com/images/1.jpg',
        #           'link': 'http://sysadministrivia.com/',
        #           'links': [{'href': 'http://sysadministrivia.com/',
        #                      'rel': 'alternate',
        #                      'type': 'text/html'}],
        #           'title': 'The Sysadministrivia Podcast',
        #           'title_detail': {'base': '',
        #                            'language': 'en',
        #                            'type': 'text/plain',
        #                            'value': 'The Sysadministrivia Podcast'}},
        # 'link': 'http://sysadministrivia.com/',
        # 'subtitle': 'We podcast all things system administration/engineering/infosec, '
        #             'with a strong focus on GNU/Linux. We use F/OSS software whenever '
        #             'possible in the production of these podcasts. Please be sure to '
        #             'view our show notes on the site!',
        # 'title': 'The Sysadministrivia Podcast',

    def refresh_podcast_episodes(self, podcast_id):
        """
        Check that the most recent `download_episodes` episodes are downloaded, and start downloads
        for any that are not.
        """
        for entry in self.db.get_podcast_episodes(podcast_id=podcast_id, limit=self.settings.download_episodes):
            if entry["status"] == "new":
                self.download_episode(entry)

    def download_episode(self, episode):
        """
        Download the episode:
        - mark status as downloading
        - clean up any tmp files from previous failures
        - create the dir
        - stream the url to a temp file
        - rename the temp file to the final location
        - mark the episode as completed
        """
        self.db.set_podcast_episode_status(episode['id'], "downloading")

        ep_dir = os.path.join(self.settings.path, str(episode['podcastid']))
        ep_path = os.path.join(ep_dir, "{}.mp3".format(episode['id']))
        ep_tmppath = os.path.join(ep_dir, ".{}.mp3".format(episode['id']))

        os.makedirs(ep_dir, exist_ok=True)
        if os.path.exists(ep_path):
            os.unlink(ep_path)  # previous failed downloads
        if os.path.exists(ep_tmppath):
            os.unlink(ep_tmppath)  # previous failed downloads

        logging.info("fetching %s", episode['url'])
        r = requests.get(episode['url'], stream=True)
        r.raise_for_status()
        with open(ep_tmppath, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
        os.rename(ep_tmppath, ep_path)
        # TODO verify or update MIME from that of the url
        self.db.set_podcast_episode_status(episode['id'], "completed")

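    # --- Sketch (not part of the original module) -------------------------------------------------
    # shutil.copyfileobj(r.raw, f) above copies the raw socket stream, which bypasses requests'
    # content-encoding handling (e.g. gzip). A common alternative is to iterate the decoded body in
    # chunks with Response.iter_content, as sketched below. The helper name `_stream_to_file` and
    # the chunk size are illustrative, not part of the original code.
    def _stream_to_file(self, url, dest_path, chunk_size=256 * 1024):
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
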
    def get_feed(self, rss_url):
        """
        Download the given URL and return a parsed feed
        """
        feed_body = requests.get(rss_url, timeout=30)
        return feedparser.parse(feed_body.text)

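    # --- Sketch (not part of the original module) -------------------------------------------------
    # get_feed above does not check the HTTP status or whether feedparser hit a parse error. A more
    # defensive variant might look like the following; `get_feed_checked` is a hypothetical name,
    # and the bozo handling shown is just one option (feedparser sets `bozo` when parsing had
    # problems).
    def get_feed_checked(self, rss_url):
        resp = requests.get(rss_url, timeout=30)
        resp.raise_for_status()
        feed = feedparser.parse(resp.text)
        if feed.bozo:
            logging.warning("feed %s parsed with errors: %s", rss_url, feed.get("bozo_exception"))
        return feed
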
    def refresh_podcast_entry(self, podcast_id, entry):
        """
        Update the database for the given podcast entry, adding it if it doesn't already exist.
        Note: the episode TITLE is used as the uniqueness check against the database.
        """
        existing = self.db.get_podcast_episodes(podcast_id=podcast_id, title=entry['title'])
        if existing:
            return

        # find the media file url
        url = None
        mime = None
        for link in entry['links']:
            if link['type'] in ["audio/mpeg", "audio/mp3"]:  # TODO more formats
                url = link['href']
                mime = link['type']
                break

        if not url:
            logging.warning("could not find url for episode in podcast %s", podcast_id)
            return

        # create the entry
        ep_id = self.db.add_podcast_episode(podcast_id,
                                            time.mktime(entry['published_parsed']),
                                            entry['title'],
                                            entry['summary'],
                                            url,
                                            mime)
        logging.info("added episode %s '%s'", ep_id, entry['title'])
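
# --- Usage sketch (not part of the original module) ----------------------------------------------
# The manager expects a `db` object exposing get_podcasts, get_podcast_episodes,
# set_podcast_episode_status and add_podcast_episode (the calls used above). The stub below is a
# hypothetical, do-nothing stand-in purely to show how the thread is driven; a real deployment
# would pass whatever database wrapper the surrounding application provides.
if __name__ == "__main__":
    import sys

    class _StubDB(object):
        """Minimal stand-in for the real database layer (assumed interface)."""

        def get_podcasts(self):
            return []  # no podcasts configured in this demo

        def get_podcast_episodes(self, **kwargs):
            return []

        def set_podcast_episode_status(self, episode_id, status):
            pass

        def add_podcast_episode(self, *args):
            return None

    logging.basicConfig(level=logging.INFO, stream=sys.stdout)
    manager = PodcastManager(_StubDB())  # starts its own daemon thread
    manager.request_rescan()             # ask for an immediate scan instead of waiting for the timer
    time.sleep(2)                        # give the scan a moment to run before the process exits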