"""
Helpers for indexing a TV show library and matching episode files against it.

The library is a set of root folders, each containing one folder per show.
Episode file names are parsed for season/episode or date information and the
remaining text is fuzzy-matched against the show folder names.
"""
import os
import re
import string
import logging
from enum import Enum
from collections import namedtuple

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # Only match_episode() needs fuzzywuzzy; indexing and file name parsing
    # remain usable without it.
    fuzz = None


# lifted from https://git.davepedu.com/dave/tvsort/src/branch/master/tvsort/

# match something like s01e02 (the episode part is optional, e.g. "s01")
NORMAL_SEASON_EP_RE = re.compile(r'(([sS]([0-9]{2}))x?([eE]([0-9]{2}))?)')
# match something like 21x04
NORMAL_SEASON_EP_RE2 = re.compile(r'(([0-9]+)[xX]([0-9]{2}))')
# match something like 2017-08-3 (any year 2000-2099; month and day optional)
DATE_SEASON_EP_RE = re.compile(r'((20[0-9]{2}).([0-9]{1,2})?.([0-9]{1,2})?)')

# Release-tag noise that is stripped from file names before fuzzy matching
COMMON_CRAP = [re.compile(i, flags=re.I) for i in
               [r'(720|1080)p',
                r'hdtv',
                r'(h.?)?x?264(.[a-z0-9]+)?',
                r'(ddp\d\.\d)?',
                r'web(\-?(dl|rip))?',
                r'[\.\-\s](amzn|amazon)[\.\-\s]',
                r'dd.5.\d',
                r'AAC2.\d']]

# First run of digits in a "Season ..." folder name
_SEASON_NUMBER_RE = re.compile(r'[0-9]+')

# Characters kept in the cleaned-up name; everything else becomes a space
_ALLOWED_NAME_CHARS = frozenset(string.ascii_lowercase + string.digits)


class EpisodeParseException(Exception):
    """Raised when no season/episode or date information is found in a file name."""
    pass


class Seasoning(Enum):
    """
    All episodes are categorized into seasons (or season-like entities). A season may number its episodes by date or by
    season and episode number. Thirdly, an episode may be associated with a season but not obey the regular naming
    scheme - such as a special episode. This enum is for describing what chronological scheme an episode appears to use.
    """
    NONE = 0
    BY_SEASON = 1
    BY_DATE = 2
    SPECIAL = 3


Show = namedtuple("Show", "root dir name mode seasons")
"""
Struct describing an in-library tv show
    root : abs path to the folder containing dir
    dir : absolute(?) file path to the show
    name : name of the show
    mode : Season strategy (cannot be 'special')
    seasons : list of season ints
"""


EpInfo = namedtuple("EpInfo", "file mode major minor extra")
"""
Struct for describing an episode file.
    file : file name of the episode file
    mode : chronological scheme of file naming (see Seasoning)
    major : least granular chronological unit. Typically season or year
    minor : medium granular unit. Episode number, or month for date-based episodes
    extra : most granular unit. Always day (only used for date-based episodes)
"""


MatchedEpisode = namedtuple("MatchedEpisode", "root ep dest subdest score")
"""
Struct describing the intent to sort an episode file into a location
    root : abs path to the folder containing ep.file
    ep : associated EpInfo object
    dest : associated Show object
    subdest : name of the sub-folder inside dest (see sub_bucket_name)
    score : scoring value Show::match returned
"""


def create_show(root_path, dirname):
    """
    Build a Show for the folder `dirname` inside `root_path`, guessing its naming scheme from its contents.

    Each entry of the show folder is classified as a season folder ("Season N..."), a year folder (an integer
    between 1901 and 2049) or unrecognised. The scheme with a strict majority over both others wins; otherwise the
    show gets Seasoning.NONE and an empty season list.

    :param root_path: folder containing the show folder
    :param dirname: name of the show folder
    :return: Show tuple; its name is the lower-cased folder name
    """
    dir_lower = dirname.lower()

    # Inspect contents of show directory and guess naming scheme
    yearish = 0
    seasonish = 0
    wtfish = 0
    buckets_season = []
    buckets_year = []

    for item in os.listdir(os.path.join(root_path, dirname)):
        if item.lower().startswith("season "):
            # Use the first number in the name so that folders such as
            # "Season 2 Extras (2019)" do not blow up the whole index.
            number = _SEASON_NUMBER_RE.search(item)
            if number:
                seasonish += 1
                buckets_season.append(int(number.group()))  # todo flexible season dir detection
            else:
                wtfish += 1
            continue

        try:
            year = int(item)
        except ValueError:
            year = None
        # Only plausible years count; other integers are treated as unrecognised
        if year is not None and 1900 < year < 2050:
            yearish += 1
            buckets_year.append(year)
            continue

        wtfish += 1

    if yearish > seasonish and yearish > wtfish:
        mode = Seasoning.BY_DATE
        episodes = buckets_year
    elif seasonish > yearish and seasonish > wtfish:
        mode = Seasoning.BY_SEASON
        episodes = buckets_season
    else:
        mode = Seasoning.NONE
        episodes = []

    return Show(root_path, dirname, dir_lower, mode, episodes)


def create_index(fs_paths):
    """
    Scan every library root in `fs_paths` and return a Show for each sub-folder found.

    Show folders that cannot be read are logged and skipped.

    :param fs_paths: iterable of library root folders
    :return: list of Show tuples
    """
    shows = []
    for d in fs_paths:
        for i in os.listdir(d):
            show_path = os.path.join(d, i)
            if not os.path.isdir(show_path):
                continue
            try:
                shows.append(create_show(d, i))
            except PermissionError as pe:
                # Name the show folder that was skipped, not the whole library root
                logging.warning("skipping %s due to %s", show_path, pe)
    return shows


def _strip_episode_marker(item, whole):
    """Drop the episode marker `whole` and what follows it, unless that leaves too little of the name."""
    pos = item.find(whole)
    if pos >= 10:
        # delete everything after the episode number
        return item[0:pos]
    # unless it makes it too short
    return item.replace(whole, "")


def parse_episode(fname):
    """
    Given a file name, parse out any information we can from the name

    :param fname: episode file name, including its extension
    :return: tuple of (EpInfo, cleaned lower-case name consisting of letters, digits and spaces)
    :raises EpisodeParseException: if neither season/episode nor date information is found
    """
    # Remove the file extension and normalise case. Lower-casing matters: the
    # final filter only keeps lowercase letters and digits, so without it every
    # uppercase letter in the show name would be replaced by a space.
    item = '.'.join(fname.split(".")[0:-1]).lower()

    # Extract season information
    # And remove seasons info chars from the working name
    match = NORMAL_SEASON_EP_RE.search(item) or NORMAL_SEASON_EP_RE2.search(item)
    if match:
        fields = match.groups()
        if len(fields) == 5:
            whole, _, season, _, episode = fields
        else:
            whole, season, episode = fields

        if episode:
            epinfo = EpInfo(fname, Seasoning.BY_SEASON, int(season), int(episode), None)
        else:
            # A season without an episode number, e.g. "s05"
            epinfo = EpInfo(fname, Seasoning.SPECIAL, int(season), None, None)
    else:
        match = DATE_SEASON_EP_RE.search(item)
        if not match:
            raise EpisodeParseException("Could not parse episode {}".format(repr(fname)))
        whole, year, month, day = match.groups()
        epinfo = EpInfo(fname,
                        Seasoning.BY_DATE,
                        int(year),
                        int(month) if month else month,
                        int(day) if day else day)

    item = _strip_episode_marker(item, whole)

    # Remove common torrenty names
    for crap in COMMON_CRAP:
        item = crap.sub("", item)

    # Remaining chars should be a show name and possibly an episode title. And random bs
    item = ''.join([i if i in _ALLOWED_NAME_CHARS else " " for i in item]).strip()

    return epinfo, item


def sub_bucket_name(show, major, minor, extra):
    """
    Return the name of the sub-folder inside `show` that an episode belongs in.

    :param show: Show tuple; its mode decides the naming
    :param major: season number or year of the episode
    :param minor: unused
    :param extra: unused
    :return: the year as a string for date-based shows, "Season N" for season-based shows, '' otherwise
    """
    if show.mode == Seasoning.BY_DATE:
        return str(major)
    elif show.mode == Seasoning.BY_SEASON:
        return "Season {}".format(major)
    else:
        return ''


def match_episode(fname, shows, thresh=65):
    """
    Given a filename and a show library, determine which show and season is the best place to sort it to

    :param fname: episode file name
    :param shows: iterable of Show tuples to match against
    :param thresh: only shows scoring strictly above this value are returned
    :return: list of MatchedEpisode sorted by descending score; empty if the file name cannot be parsed
    :raises ImportError: if fuzzywuzzy is not installed
    """
    # Parse information from the episode file name
    try:
        epinfo, item = parse_episode(fname)
    except EpisodeParseException:
        return []

    if fuzz is None:
        raise ImportError("fuzzywuzzy is required to match episodes against shows")

    # Find a show from the library best matching this episode
    matches = []
    for show in shows:
        value = fuzz.token_set_ratio(show.name.lower(), item.lower())  # TODO add algorithm swap arg for snakeoil
        if value > thresh:
            # NOTE(review): the `root` field receives the file name here, not a folder path - confirm with callers
            matches.append(
                MatchedEpisode(fname, epinfo, show,
                               sub_bucket_name(show, epinfo.major, epinfo.minor, epinfo.extra),
                               value))
    return sorted(matches, key=lambda x: x.score, reverse=True)