mediasort/mediaweb/shows.py

import os
import re
import string
import logging
from enum import Enum
from fuzzywuzzy import fuzz
from collections import namedtuple


# lifted from https://git.davepedu.com/dave/tvsort/src/branch/master/tvsort/


# Season marker in the "s01e02" style. The episode part is optional, so a bare "s01" matches too.
# Groups: (whole marker, "s01", "01", "e02" or None, "02" or None).
NORMAL_SEASON_EP_RE = re.compile(r'(([sS]([0-9]{2}))x?([eE]([0-9]{2}))?)')  # match something like s01e02
# Season marker in the "21x04" style. Groups: (whole marker, season, episode).
NORMAL_SEASON_EP_RE2 = re.compile(r'(([0-9]+)[xX]([0-9]{2}))')  # match something like 21x04
# Date marker for years 2010-2029 followed by optional month and day, each preceded by any single separator char.
# Groups: (whole marker, year, month or None, day or None).
# The decade class is [12]; the previous [1,2] also accepted a literal comma (e.g. "20,5").
DATE_SEASON_EP_RE = re.compile(r'((20[12][0-9]).([0-9]{1,2})?.([0-9]{1,2})?)')  # match something like 2017-08-3
# Case-insensitive patterns for release tags (resolution, codec, source, audio) that are stripped from file names
# before the remaining text is compared against show names.
COMMON_CRAP = [re.compile(i, flags=re.I) for i in
               [r'(720|1080)p',
                r'hdtv',
                r'(h.?)?x?264(.[a-z0-9]+)?',
                r'(ddp\d\.\d)?',
                r'web(\-?(dl|rip))?',
                r'[\.\-\s](amzn|amazon)[\.\-\s]',
                r'dd.5.\d',
                r'aac2.\d']]


class EpisodeParseException(Exception):
    """Raised when a file name contains no recognisable season/episode or date marker."""


class Seasoning(Enum):
    """
    Chronological scheme that a show or an episode file appears to use.

    Episodes are grouped into seasons (or season-like entities). A season may number its episodes by season and
    episode number, or by date. An episode may also belong to a season without following the regular naming
    scheme, for example a special episode.
    """
    # No scheme could be determined
    NONE = 0
    # Numbered by season and episode (e.g. s01e02)
    BY_SEASON = 1
    # Identified by air date (e.g. 2017-08-03)
    BY_DATE = 2
    # Tied to a season but without a regular episode number
    SPECIAL = 3


# Struct describing an in-library tv show
#     root    : path to the folder containing dir
#     dir     : name of the show's directory inside root (create_show stores the bare directory name)
#     name    : name of the show (create_show stores the lower-cased directory name)
#     mode    : Seasoning strategy of the show (cannot be 'special')
#     seasons : list of season ints (year ints for date-based shows)
Show = namedtuple("Show", ["root", "dir", "name", "mode", "seasons"])


# Struct for describing an episode file.
#     file  : file name of the episode file (parse_episode stores it lower-cased)
#     mode  : chronological scheme of the file naming (see Seasoning)
#     major : least granular chronological unit: season number, or year for date-based episodes
#     minor : medium granular unit: episode number, or month for date-based episodes; None when absent
#     extra : most granular unit: day of month, only set for date-based episodes; None otherwise
EpInfo = namedtuple("EpInfo", ["file", "mode", "major", "minor", "extra"])


# Struct describing the intent to sort an episode file into a location
#     root    : originally described as the abs path to the folder containing ep.file
#               NOTE(review): match_episode passes the file name it was given here - confirm against callers
#     ep      : associated EpInfo object
#     dest    : associated Show object
#     subdest : name of the sub-folder inside the show, as returned by sub_bucket_name ('' if none)
#     score   : fuzzy-match score of the show name against the cleaned file name
MatchedEpisode = namedtuple("MatchedEpisode", ["root", "ep", "dest", "subdest", "score"])


def create_show(root_path, dirname):
    """
    Build a Show record for one show directory by inspecting its immediate children.

    Every entry in the directory is classified as one of:
      * season-like : name starts with "season " (any case) and contains a number
      * year-like   : name is an integer strictly between 1900 and 2050
      * unrecognised: anything else

    The show is BY_DATE when year-like entries strictly outnumber both other kinds, BY_SEASON when season-like
    entries do, and NONE otherwise.

    :param root_path: path to the folder containing the show directory
    :param dirname: name of the show directory inside root_path
    :return: Show(root_path, dirname, lower-cased dirname, mode, list of season or year ints)
    :raises OSError: propagated from os.listdir when the show directory cannot be read
    """
    season_numbers = []
    years = []
    unrecognised = 0

    for item in os.listdir(os.path.join(root_path, dirname)):
        if item.lower().startswith("season "):  # todo flexible season dir detection
            # Take the first run of digits as the season number. Names with extra numbers or none at all
            # ("Season 1 part 2", "Season finale") must not abort indexing of the whole show.
            number = re.search(r'[0-9]+', item)
            if number:
                season_numbers.append(int(number.group()))
            else:
                unrecognised += 1
            continue

        try:
            year = int(item)
        except ValueError:
            year = None

        # Only plausible years are recorded, so stray numeric names never end up in Show.seasons
        if year is not None and 1900 < year < 2050:
            years.append(year)
        else:
            unrecognised += 1

    seasonish = len(season_numbers)
    yearish = len(years)

    if yearish > seasonish and yearish > unrecognised:
        mode = Seasoning.BY_DATE
        seasons = years
    elif seasonish > yearish and seasonish > unrecognised:
        mode = Seasoning.BY_SEASON
        seasons = season_numbers
    else:
        mode = Seasoning.NONE
        seasons = []

    return Show(root_path, dirname, dirname.lower(), mode, seasons)


def create_index(fs_paths):
    """
    Scan library folders and build a Show record for every sub-directory found.

    Show directories that cannot be read are logged and skipped so one bad directory does not abort the scan.

    :param fs_paths: iterable of library folder paths whose sub-directories are shows
    :return: list of Show objects, in listing order
    """
    shows = []
    for library in fs_paths:
        for entry in os.listdir(library):
            if not os.path.isdir(os.path.join(library, entry)):
                continue
            try:
                shows.append(create_show(library, entry))
            except PermissionError as pe:
                # Name the show directory that was skipped, not the whole library folder
                logging.warning("skipping %s due to %s", os.path.join(library, entry), pe)

    return shows


def _trim_episode_marker(item, marker):
    """Cut item off at marker, or only delete the marker text when cutting would leave too short a name."""
    pos = item.find(marker)
    if pos >= 10:
        # delete everything after the episode number
        return item[0:pos]
    # unless it makes it too short
    return item.replace(marker, "")


def parse_episode(fname):
    """
    Given a file name, parse out any information we can from the name

    The name is lower-cased, its extension is dropped, and a season/episode marker (s01e02, s01, 21x04) or a
    date marker (2017-08-3) is looked for. The marker and common release tags are then removed, and every
    character other than a-z and 0-9 is replaced by a space.

    :param fname: file name of the episode
    :return: tuple (EpInfo, cleaned name); EpInfo.file holds the lower-cased file name
    :raises EpisodeParseException: if neither a season/episode nor a date marker is found
    """
    fname = fname.lower()

    # Remove the file extension. A name without any "." has no extension and is kept whole; previously it was
    # reduced to an empty string and so could never be parsed.
    stem, dot, _ = fname.rpartition(".")
    item = stem if dot else fname

    # Extract season information
    match = NORMAL_SEASON_EP_RE.search(item) or NORMAL_SEASON_EP_RE2.search(item)
    if match:
        fields = match.groups()
        if len(fields) == 5:
            whole, _, season, _, episode = fields
        else:
            whole, season, episode = fields

        # Both patterns always capture a season; only the s01 form can lack an episode number
        if episode is None:
            epinfo = EpInfo(fname, Seasoning.SPECIAL, int(season), None, None)
        else:
            epinfo = EpInfo(fname, Seasoning.BY_SEASON, int(season), int(episode), None)
    else:
        match = DATE_SEASON_EP_RE.search(item)
        if not match:
            raise EpisodeParseException("Could not parse episode {}".format(repr(fname)))
        whole, year, month, day = match.groups()
        epinfo = EpInfo(fname, Seasoning.BY_DATE, int(year),
                        int(month) if month else None,
                        int(day) if day else None)

    # And remove seasons info chars from the working name
    item = _trim_episode_marker(item, whole)

    # Remove common torrenty names
    for crap in COMMON_CRAP:
        item = crap.sub("", item)

    # Remaining chars should be a show name and possibly an episode title. And random bs
    allowed_chars = string.ascii_lowercase + string.digits
    item = ''.join([i if i in allowed_chars else " " for i in item]).strip()

    return epinfo, item


def sub_bucket_name(show, major, minor, extra):
    """
    Name the sub-folder of a show that an episode belongs in.

    :param show: Show whose mode selects the naming scheme
    :param major: season number or year of the episode
    :param minor: unused
    :param extra: unused
    :return: str(major) for date-based shows, "Season <major>" for season-based shows, '' otherwise
    """
    scheme = show.mode
    if scheme == Seasoning.BY_DATE:
        return str(major)
    if scheme == Seasoning.BY_SEASON:
        return "Season {}".format(major)
    return ''


def match_episode(fname, shows, minscore=65):
    """
    Given a filename and a show library, determine which show and season is the best place to sort it to

    :param fname: file name of the episode
    :param shows: iterable of Show objects to match against
    :param minscore: lowest fuzzy-match score for a show to be kept as a candidate
    :return: list of MatchedEpisode sorted by score, best first; empty if the file name cannot be parsed
    """
    # Parse information from the episode file name
    try:
        epinfo, item = parse_episode(fname)
    except EpisodeParseException:
        return []

    # Find shows from the library matching this episode well enough
    wanted = item.lower()
    candidates = []
    for show in shows:
        score = fuzz.token_set_ratio(show.name.lower(), wanted)  #TODO add algorithm swap arg for snakeoil
        if score < minscore:
            continue
        bucket = sub_bucket_name(show, epinfo.major, epinfo.minor, epinfo.extra)
        candidates.append(MatchedEpisode(fname, epinfo, show, bucket, score))

    candidates.sort(key=lambda m: m.score, reverse=True)
    return candidates