refactor into cli tool

2017-10-07 18:32:48 -07:00 · 2017-10-07 18:32:48 -07:00 · c6bb8b554f
parent e2edfabe28
commit c6bb8b554f
4 changed files with 297 additions and 115 deletions
--- a/tvsort/cli.py
+++ b/tvsort/cli.py
@ -1,28 +1,24 @@
 #!/usr/bin/env python3
 import argparse
 import os
-import re
-import string
-from pprint import pprint
+import pickle
+from tvsort import shows
+from tvsort.parser import EpisodeParseException, parse_episode, sub_bucket_name
+from appdirs import user_config_dir
 from fuzzywuzzy import fuzz
-from collections import namedtuple
 from tabulate import tabulate
+from pprint import pprint
+from collections import namedtuple


-class Season:
-    by_season = 0
-    by_date = 1
-    special = 3
-
-
-EpInfo = namedtuple("EpInfo", "file mode major minor extra")
-Show = namedtuple("Show", "dir name match")
-MatchedEpisode = namedtuple("MatchedEpisode", "ep dest score")
-
-
-def create_show(dirname):
-    dir_lower = dirname.lower()
-    return Show(dirname, dir_lower, lambda other: fuzz.token_set_ratio(dir_lower, other.lower()))
+MatchedEpisode = namedtuple("MatchedEpisode", "root ep dest subdest score")
+"""
+Struct describing the intent to sort and episode file into a location
+    root : abs path to the folder containing ep.file
+    ep : associated EpInfo object
+    dest : associated Show object
+    score : scoring value Show::match returned
+"""


 def main():
@ -30,132 +26,136 @@ def main():
    parser = argparse.ArgumentParser(description="sort tv shows")
    parser.add_argument("-s", "--src", nargs="+", help="", required=True)
    parser.add_argument("-d", "--dest", nargs="+", help="", required=True)
-    parser.add_argument("--soft", action="store_true", help="Soft link instead of hard link")#TODO
-    parser.add_argument("--match-thresh", type=int, default=65)  #0-100
+    parser.add_argument("--soft", action="store_true", help="Soft link instead of hard link")
+    parser.add_argument("-r", "--rescan", action="store_true", help="Rescan library instead of using cache")
+    parser.add_argument("--match-thresh", type=int, default=65)
    parser.add_argument("--mappings", nargs="+", default=[])  # many foo=bar transformations to help witch mapping
    args = parser.parse_args()

+    if args.match_thresh <= 0 or args.match_thresh > 100:
+        parser.error("--match-thresh must be 1-100")
+
+    # mappings allow simple string transforms as workarounds for poorly named episodes
    mappings = {}
    for item in args.mappings:
        key, value = item.split("=")
        mappings[key] = value

-    # Create index of shows
-    shows = []
-    for destdir in args.dest:
-        for i in os.listdir(destdir):
-            shows.append(create_show(i))
-
-    NORMAL_SEASON_EP_RE = re.compile(r'(([sS]([0-9]{2}))x?([eE]([0-9]{2}))?)')  # match something like s01e02
-    NORMAL_SEASON_EP_RE2 = re.compile(r'(([0-9]+)[xX]([0-9]{2}))')  # match something like 21x04
-    DATE_SEASON_EP_RE = re.compile(r'((201[0-9]).([0-9]{1,2})?.([0-9]{1,2})?)')  # match something like 2017-08-3
-    COMMON_CRAP = [re.compile(i, flags=re.I) for i in
-                   [r'(720|1080)p',
-                    r'hdtv',
-                    r'(h.?)?x?264(.[a-z0-9]+)?',
-                    r'(ddp\d\.\d)?',
-                    r'web(\-?(dl|rip))?',
-                    r'[\.\-\s](amzn|amazon)[\.\-\s]',
-                    r'dd.5.\d',
-                    r'AAC2.\d']]
+    # load the library, an index of shows already sorted. the dirnames will be compared to incoming files
+    cachedir = user_config_dir("tvsort")
+    os.makedirs(cachedir, exist_ok=True)
+    cache_file = os.path.join(cachedir, "library.cache")
+    library = None
+    if os.path.exists(cache_file) and not args.rescan:
+        with open(cache_file, "rb") as f:
+            try:
+                library = pickle.load(f)
+            except:
+                print("Failed to load library cache")
+    if not library:
+        library = shows.create_index(args.dest)
+        with open(cache_file, "wb") as f:
+            pickle.dump(library, f)

    failures = []
    results = []

+    # iterate through all children of the src dirs
    for srcdir in args.src:
-        for item in os.listdir(srcdir):
-            if not os.path.isfile(os.path.join(srcdir, item)):
-                # TODO go into subdirs too (we assume dir means season pack)
+        for fname in os.listdir(srcdir):
+            # TODO season dirs are ignored for now
+            if not os.path.isfile(os.path.join(srcdir, fname)):
                continue
-            fname = item

-            # Remove file extension
-            item = item.rstrip(".mkv").lower()#TODO make this better
-
-            # Apply manual transformations
+            # Apply manually specified transformations
+            item = fname
            for old, new in mappings.items():
                item = item.replace(old, new)

-            # Extract season information
-            # And remove seasons info chars from the working name
-            epinfo = None
-            match = NORMAL_SEASON_EP_RE.search(item) or NORMAL_SEASON_EP_RE2.search(item)
-            if match:
-                fields = match.groups()
-                if len(fields) == 5:
-                    whole, _, season, _, episode = fields
-                else:
-                    whole, season, episode = fields
-
-                if season and not episode:
-                    epinfo = EpInfo(fname, Season.special, int(season), None, None)
-                else:
-                    assert season and episode
-                    epinfo = EpInfo(fname, Season.by_season, int(season), int(episode), None)
-
-                # delete everything after the episode number
-                pos = item.find(whole)
-                if pos >= 10:
-                    item = item[0:pos]
-                else:
-                    # unless it makes it too short
-                    item = item.replace(whole, "")
-            else:
-                match = DATE_SEASON_EP_RE.search(item)
-                if match:
-                    whole, year, month, day = match.groups()
-                    assert year is not None
-                    if month:
-                        month = int(month)
-                    if day:
-                        day = int(day)
-                    epinfo = EpInfo(fname, Season.by_date, int(year), month, day)
-                    # delete everything after the episode number
-                    pos = item.find(whole)
-                    if pos >= 10:
-                        item = item[0:pos]
-                    else:
-                        # unless it makes it too short
-                        item = item.replace(whole, "")
-                else:
-                    # raise Exception("Could not parse episode: {}".format(repr(item)))
-                    failures.append(fname)
-                    continue
-
-            # Remove common torrenty names
-            for crap in COMMON_CRAP:
-                item = crap.sub("", item)
-
-            # print(epinfo, "->", item)
-
-            # Remaining chars should be a show name and possibly and episode title. And random bs
-            allowed_chars = string.ascii_lowercase + string.digits
-            item = ''.join([i if i in allowed_chars else " " for i in item]).strip()
+            # Parse information from the episode file name
+            try:
+                epinfo, item = parse_episode(item)
+            except EpisodeParseException:
+                failures.append(fname)

+            # Find a show from the library best matching this episode
            match_score = 0
            best_match_show = None
-            for show in shows:
-                value = show.match(item)
+            for show in library:
+                value = fuzz.token_set_ratio(show.name.lower(), item.lower())  #TODO add algorithm swap arg for snakeoil
                if value > match_score:
                    match_score = value
                    best_match_show = show
-
-            if match_score > args.match_thresh:
-                results.append(MatchedEpisode(epinfo, best_match_show, match_score))
+            if match_score >= args.match_thresh:
+                results.append(
+                    MatchedEpisode(srcdir, epinfo, best_match_show,
+                                   sub_bucket_name(best_match_show, epinfo.major, epinfo.minor, epinfo.extra),
+                                   match_score))
            else:
                failures.append(fname)

-    tab_rows = []
-    for item in sorted(results, key=lambda x: x.dest.dir):
-        row = [item.ep.file, item.ep.major, item.ep.minor, item.dest.dir, item.score]
-        tab_rows.append(row)
+    before = len(results)
+    results = list(
+        filter(
+            lambda r: not os.path.exists(os.path.join(r.dest.root, r.dest.dir, r.subdest, r.ep.file)),
+            results))
+    already_there = before - len(results)

-    print(tabulate(tab_rows, headers=["file", "season", "episode", "dest", "score"]))
+    go = False
+    while not go:
+        tab_rows = []
+        i = 0
+        results.sort(key=lambda x: x.dest.dir)
+        for item in results:
+            row = [i,
+                   os.path.join(item.root, item.ep.file),
+                   item.ep.major,
+                   item.ep.minor,
+                   os.path.join(item.dest.root, item.dest.dir, item.subdest) + "/",
+                   item.score,
+                   "soft" if args.soft else "hard"]
+            tab_rows.append(row)
+            i += 1

-    if failures:
-        print("\n\n")
-        print("Could not match:")
-        pprint(failures)
+        print(tabulate(tab_rows, headers=["number", "file", "season", "episode", "dest", "score", "link"]))
+
+        if already_there:
+            print("\n{} already in library and ignored".format(already_there))
+
+        if failures:
+            print("\n")
+            print("Could not match:")
+            pprint(failures)
+
+        if not results:
+            print("no candidates for linking found!")
+            return
+
+        resp = input("create links? [y/N/<lines to skip and print again>]: ").lower().strip()
+
+        if not resp or resp == "n":
+            return
+
+        if resp == "y":
+            break
+
+        exclude = []
+        for number in resp.split():
+            exclude.append(int(number))
+        exclude.sort(reverse=True)
+        for number in exclude:
+            results.pop(number)
+
+    link = os.symlink if args.soft else os.link
+
+    for item in results:
+        src = os.path.join(item.root, item.ep.file)
+        destdir = os.path.join(item.dest.root, item.dest.dir, item.subdest)
+        dest = os.path.join(destdir, item.ep.file)
+        # print("mkdir ", destdir)
+        os.makedirs(destdir, exist_ok=True)
+        # print(src, "   ->   ", dest)
+        link(src, dest)


 if __name__ == '__main__':
--- a/tvsort/common.py
+++ b/tvsort/common.py
@ -0,0 +1,35 @@
+from collections import namedtuple
+
+
+class Season:
+    """
+    All episodes are categorized into seasons (or season-like entities). A season may number it's episodes by date or by
+    season and episode number. Thirdly, an episode may be associated with a season but not obey the regular naming
+    scheme - such as a special episode. This enum is for describing what chronological scheme an episode appears to use.
+    """
+    none = 0
+    by_season = 1
+    by_date = 2
+    special = 3
+
+
+Show = namedtuple("Show", "root dir name mode seasons")
+"""
+Struct describing an in-library tv show
+    root : abs path to the folder containing dir
+    dir : absolute(?) file path to the show
+    name : name of the show
+    mode : Season strategy (cannot be 'special')
+    seasons : list of season ints
+"""
+
+
+EpInfo = namedtuple("EpInfo", "file mode major minor extra")
+"""
+Struct for describing an episode file.
+    file : file name of the episode file
+    mode : chronological scheme of file naming (see Season)
+    major : least granular chronological unit. Typically season or year
+    minor : medium granular unit. Always episode number
+    extra : most granular unit. Always day (only used for date-based episodes)
+"""
--- a/tvsort/parser.py
+++ b/tvsort/parser.py
@ -0,0 +1,94 @@
+import re
+import string
+from tvsort.common import Season, Show, EpInfo
+
+
+NORMAL_SEASON_EP_RE = re.compile(r'(([sS]([0-9]{2}))x?([eE]([0-9]{2}))?)')  # match something like s01e02
+NORMAL_SEASON_EP_RE2 = re.compile(r'(([0-9]+)[xX]([0-9]{2}))')  # match something like 21x04
+DATE_SEASON_EP_RE = re.compile(r'((201[0-9]).([0-9]{1,2})?.([0-9]{1,2})?)')  # match something like 2017-08-3
+COMMON_CRAP = [re.compile(i, flags=re.I) for i in
+               [r'(720|1080)p',
+                r'hdtv',
+                r'(h.?)?x?264(.[a-z0-9]+)?',
+                r'(ddp\d\.\d)?',
+                r'web(\-?(dl|rip))?',
+                r'[\.\-\s](amzn|amazon)[\.\-\s]',
+                r'dd.5.\d',
+                r'AAC2.\d']]
+
+
+class EpisodeParseException(Exception):
+    pass
+
+
+def parse_episode(fname):
+    """
+    Given a file name, parse out any information we can from the name
+    :return:
+    """
+
+    # Remove file extensions
+    item = fname.rstrip(".mkv").lower()#TODO make this better
+
+    # Extract season information
+    # And remove seasons info chars from the working name
+    epinfo = None
+    match = NORMAL_SEASON_EP_RE.search(item) or NORMAL_SEASON_EP_RE2.search(item)
+    if match:
+        fields = match.groups()
+        if len(fields) == 5:
+            whole, _, season, _, episode = fields
+        else:
+            whole, season, episode = fields
+
+        if season and not episode:
+            epinfo = EpInfo(fname, Season.special, int(season), None, None)
+        else:
+            assert season and episode
+            epinfo = EpInfo(fname, Season.by_season, int(season), int(episode), None)
+
+        # delete everything after the episode number
+        pos = item.find(whole)
+        if pos >= 10:
+            item = item[0:pos]
+        else:
+            # unless it makes it too short
+            item = item.replace(whole, "")
+    else:
+        match = DATE_SEASON_EP_RE.search(item)
+        if match:
+            whole, year, month, day = match.groups()
+            assert year is not None
+            if month:
+                month = int(month)
+            if day:
+                day = int(day)
+            epinfo = EpInfo(fname, Season.by_date, int(year), month, day)
+            # delete everything after the episode number
+            pos = item.find(whole)
+            if pos >= 10:
+                item = item[0:pos]
+            else:
+                # unless it makes it too short
+                item = item.replace(whole, "")
+        else:
+            raise EpisodeParseException("Could not parse episode {}".format(repr(fname)))
+
+    # Remove common torrenty names
+    for crap in COMMON_CRAP:
+        item = crap.sub("", item)
+
+    # Remaining chars should be a show name and possibly and episode title. And random bs
+    allowed_chars = string.ascii_lowercase + string.digits
+    item = ''.join([i if i in allowed_chars else " " for i in item]).strip()
+
+    return epinfo, item
+
+
+def sub_bucket_name(show, major, minor, extra):
+    if show.mode == Season.by_date:
+        return str(major)
+    elif show.mode == Season.by_season:
+        return "Season {}".format(major)
+    else:
+        return ''
--- a/tvsort/shows.py
+++ b/tvsort/shows.py
@ -0,0 +1,53 @@
+import os
+from tvsort.common import Show, Season
+import string
+
+
+def create_show(root_path, dirname):
+    dir_lower = dirname.lower()
+
+    # Inspect contents of show directory and guess naming scheme
+    yearish = 0
+    seasonish = 0
+    wtfish = 0
+    buckets_season = []
+    buckets_year = []
+    for item in os.listdir(os.path.join(root_path, dirname)):
+        if item.lower().startswith("season "):
+            seasonish += 1
+            buckets_season.append(int(''.join([i if i in string.digits else " " for i in item]).strip()))  # todo flexible season dir detection
+            continue
+        try:
+            year = int(item)
+            buckets_year.append(year)
+            if year > 1900 and year < 2050:
+                yearish += 1
+                continue
+        except ValueError:
+            pass
+        wtfish += 1
+
+    mode = None
+    episodes = None
+
+    if yearish > seasonish and yearish > wtfish:
+        mode = Season.by_date
+        episodes = buckets_year
+    elif seasonish > yearish and seasonish > wtfish:
+        mode = Season.by_season
+        episodes = buckets_season
+    else:
+        mode = Season.none
+        episodes = []
+
+    return Show(root_path, dirname, dir_lower, mode, episodes)
+
+
+def create_index(fs_paths):
+    shows = []
+    for d in fs_paths:
+        for i in os.listdir(d):
+            if os.path.isdir(os.path.join(d, i)):
+                shows.append(create_show(d, i))
+
+    return shows