# pysonic/pysonic/podcast.py
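"""
Podcast management for pysonic: a background thread that periodically refreshes subscribed podcast
feeds and downloads the most recent episodes to local storage.
"""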
from threading import Thread, Timer
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
import shutil
import logging
import os
import requests
import feedparser
import calendar
class PodcastSettings(object):
"""seconds between updating podcasts"""
    refresh_interval = 60 * 60
"""how many seconds to wait after initialization to start refreshing podcasts"""
startup_delay = 30
"""how many podcasts can be scanned at once"""
scan_threads = 4
"""root path of downloaded podcasts"""
path = "podcasts"
"""how many of the most recent episodes to download"""
download_episodes = 2
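
# PodcastSettings is consumed as a plain class-level namespace (PodcastManager keeps a reference to the
# class itself on self.settings), so values can be adjusted by assigning to the class, e.g.
# PodcastSettings.path = "/var/lib/pysonic/podcasts" (hypothetical path).
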
class PodcastManager(Thread):
def __init__(self, db):
super().__init__()
self.daemon = True
self.db = db
self.settings = PodcastSettings
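        # every item placed on this queue is a request to rescan all podcasts; run() blocks on it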
self.q = Queue()
self.start()
def run(self):
"""
        Loop forever: block on the queue, and treat each queued item as a request to rescan every podcast for new
        episodes.
"""
self.schedule_rescan()
while True:
self.q.get()
self.refresh_podcasts()
def interval_scan(self):
"""
Schedule the next automated rescan. Request a scan be executed.
"""
        self.request_rescan()
        self.schedule_rescan()
def schedule_rescan(self):
"""
Call the next interval scan later
"""
t = Timer(self.settings.refresh_interval, self.interval_scan)
t.daemon = True
t.start()
def request_rescan(self):
"""
Add an item to the queue
"""
self.q.put(None)
def refresh_podcasts(self):
"""
Refresh all the podcasts
"""
logging.info("rescanning podcasts")
        # Episodes still marked "downloading" are stale state left over from a previous run that crashed or was killed
# TODO this should happen earlier than the scan
for entry in self.db.get_podcast_episodes(status="downloading"):
self.db.set_podcast_episode_status(entry['id'], "new")
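        # fan the per-feed refreshes out to a thread pool; worker exceptions are re-raised below once all futures finish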
futures = []
        # TODO the ThreadPoolExecutor's workers aren't daemon threads, so they can keep the process alive at shutdown
with ThreadPoolExecutor(max_workers=self.settings.scan_threads) as pool:
for item in self.db.get_podcasts():
                futures.append(pool.submit(self.refresh_podcast, item))
for item in futures:
e = item.exception()
if e:
raise e
# for item in self.db.get_podcasts():
# self.refresh_podcast(item)
logging.info("podcast refresh complete")
#TODO all episodes in 'new' status change to 'skipped'
def refresh_podcast(self, podcast):
"""
Refresh all metadata and episodes of a single podcast
"""
logging.info("updating podcast %s '%s' ", podcast['id'], podcast['title'])
feed = self.get_feed(podcast['url'])
for entry in feed['entries']:
self.refresh_podcast_entry(podcast['id'], entry)
self.refresh_podcast_episodes(podcast['id'])
#TODO update the feed's description
        # self.update_feed_meta(feed['feed'])
# 'image': {'href': 'http://sysadministrivia.com/images/1.jpg',
# 'link': 'http://sysadministrivia.com/',
# 'links': [{'href': 'http://sysadministrivia.com/',
# 'rel': 'alternate',
# 'type': 'text/html'}],
# 'title': 'The Sysadministrivia Podcast',
# 'title_detail': {'base': '',
# 'language': 'en',
# 'type': 'text/plain',
# 'value': 'The Sysadministrivia Podcast'}},
# 'link': 'http://sysadministrivia.com/',
# 'subtitle': 'We podcast all things system administration/engineering/infosec, '
# 'with a strong focus on GNU/Linux. We use F/OSS software whenever '
# 'possible in the production of these podcasts. Please be sure to '
# 'view our show notes on the site!',
# 'title': 'The Sysadministrivia Podcast',
def refresh_podcast_episodes(self, podcast_id):
"""
        Check that the most recent `download_episodes` (per PodcastSettings) episodes are downloaded. Start downloads
        for any that are not.
"""
for entry in self.db.get_podcast_episodes(podcast_id=podcast_id, limit=self.settings.download_episodes):
if entry["status"] == "new":
self.download_episode(entry)
def download_episode(self, episode):
"""
Download the episode:
- mark status as downloading
- clean up any tmp files from previous failures
- create the dir
- stream the url to temp file
- rename the temp file to final location
- mark episode as downloaded
"""
self.db.set_podcast_episode_status(episode['id'], "downloading")
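        # layout: <settings.path>/<podcast id>/<episode id>.mp3, with a dot-prefixed temp file used while downloading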
ep_dir = os.path.join(self.settings.path, str(episode['podcastid']))
ep_path = os.path.join(ep_dir, "{}.mp3".format(episode['id']))
ep_tmppath = os.path.join(ep_dir, ".{}.mp3".format(episode['id']))
os.makedirs(ep_dir, exist_ok=True)
if os.path.exists(ep_path):
os.unlink(ep_path) # previous failed downloads
if os.path.exists(ep_tmppath):
os.unlink(ep_tmppath) # previous failed downloads
logging.info("fetching %s", episode['url'])
r = requests.get(episode['url'], stream=True)
r.raise_for_status()
with open(ep_tmppath, 'wb') as f:
shutil.copyfileobj(r.raw, f)
os.rename(ep_tmppath, ep_path)
# TODO verify or update MIME from that of the url
self.db.set_podcast_episode_status(episode['id'], "completed")
def get_feed(self, rss_url):
"""
Download the given URL and return a parsed feed
"""
feed_body = requests.get(rss_url, timeout=30)
return feedparser.parse(feed_body.text)
def refresh_podcast_entry(self, podcast_id, entry):
"""
Update the database for the given podcast entry. Add it to the database if it doesn't exist. Note: we use the
episode TITLE as the uniqueness check against the database
"""
existing = self.db.get_podcast_episodes(podcast_id=podcast_id, title=entry['title'])
if existing:
return
# find media file url
url = None
mime = None
for link in entry['links']:
if link['type'] in ["audio/mpeg", "audio/mp3"]: # TODO more formats
url = link['href']
mime = link['type']
break
if not url:
logging.warning("could not find url for episode in podcast %s", podcast_id)
return
# create entry
        # entry['published_parsed'] is a UTC struct_time, so convert it with calendar.timegm;
        # time.mktime would treat it as local time and skew the timestamp by the UTC offset
        ep_id = self.db.add_podcast_episode(podcast_id,
                                            calendar.timegm(entry['published_parsed']),
                                            entry['title'],
                                            entry['summary'],
                                            url,
                                            mime)
logging.info("added episode %s '%s'", ep_id, entry['title'])