refactor into cli tool

This commit is contained in:
dave 2017-10-07 18:32:48 -07:00
parent e2edfabe28
commit c6bb8b554f
4 changed files with 297 additions and 115 deletions

View File

@ -1,28 +1,24 @@
#!/usr/bin/env python3
import argparse
import os
import re
import string
from pprint import pprint
import pickle
from tvsort import shows
from tvsort.parser import EpisodeParseException, parse_episode, sub_bucket_name
from appdirs import user_config_dir
from fuzzywuzzy import fuzz
from collections import namedtuple
from tabulate import tabulate
from pprint import pprint
from collections import namedtuple
class Season:
    """
    Numbering scheme an episode file name appears to follow.
    """
    # by_season : file carries a season and an episode number
    # by_date   : file carries a year (optionally month and day)
    # special   : file carries a season but no episode number
    by_season, by_date, special = 0, 1, 3
# Parsed description of one episode file: its file name, the Season scheme it
# uses, and up to three chronological values (season/year, episode/month, day).
EpInfo = namedtuple("EpInfo", ["file", "mode", "major", "minor", "extra"])

# A library show: its directory name, the lower-cased name, and a callable
# scoring how well another string matches that name.
Show = namedtuple("Show", ["dir", "name", "match"])

# An episode paired with the show it matched best and the score of that match.
MatchedEpisode = namedtuple("MatchedEpisode", ["ep", "dest", "score"])
def create_show(dirname):
    """
    Build a Show for a library directory name.

    :param dirname: name of the show's directory
    :return: Show holding the name as given, its lower-cased form, and a callable that scores another string against
             the lower-cased name with fuzz.token_set_ratio
    """
    lowered = dirname.lower()

    def match(other):
        return fuzz.token_set_ratio(lowered, other.lower())

    return Show(dirname, lowered, match)
# Struct describing the intent to sort an episode file into a location
#   root    : path to the folder containing ep.file
#   ep      : associated EpInfo object
#   dest    : associated Show object
#   subdest : name of the sub-folder inside the show directory the file goes into ('' for none)
#   score   : fuzzy-match score between the show name and the episode name
MatchedEpisode = namedtuple("MatchedEpisode", ["root", "ep", "dest", "subdest", "score"])
def _parse_skip_numbers(resp, count):
    """
    Turn the user's "lines to skip" answer into row indexes to drop.

    :param resp: whitespace separated row numbers as typed at the prompt
    :param count: number of rows currently listed
    :return: unique indexes sorted high-to-low (so they can be popped in order), or None if any token is not a valid
             row number
    """
    try:
        numbers = {int(token) for token in resp.split()}
    except ValueError:
        return None
    if any(number < 0 or number >= count for number in numbers):
        return None
    return sorted(numbers, reverse=True)


def main():
    """
    CLI entry point. Scans the --src folders for episode files, matches each one against the show library found under
    the --dest folders, prints the proposed links and, once the user confirms, hard or soft links every file into its
    show/season folder.
    """
    parser = argparse.ArgumentParser(description="sort tv shows")
    parser.add_argument("-s", "--src", nargs="+", help="", required=True)
    parser.add_argument("-d", "--dest", nargs="+", help="", required=True)
    parser.add_argument("--soft", action="store_true", help="Soft link instead of hard link")
    parser.add_argument("-r", "--rescan", action="store_true", help="Rescan library instead of using cache")
    parser.add_argument("--match-thresh", type=int, default=65)
    parser.add_argument("--mappings", nargs="+", default=[])  # many foo=bar transformations to help with mapping
    args = parser.parse_args()

    if args.match_thresh <= 0 or args.match_thresh > 100:
        parser.error("--match-thresh must be 1-100")

    # mappings allow simple string transforms as workarounds for poorly named episodes
    mappings = {}
    for item in args.mappings:
        # split on the first "=" only so the replacement text may itself contain "="
        key, sep, value = item.partition("=")
        if not sep or not key:
            parser.error("--mappings entries must look like old=new, got {!r}".format(item))
        mappings[key] = value

    # load the library, an index of shows already sorted. the dirnames will be compared to incoming files
    # NOTE: the cache is not keyed on --dest; pass --rescan after changing the destination folders
    cachedir = user_config_dir("tvsort")
    os.makedirs(cachedir, exist_ok=True)
    cache_file = os.path.join(cachedir, "library.cache")

    library = None
    if os.path.exists(cache_file) and not args.rescan:
        # NOTE: unpickling can execute arbitrary code; this is only acceptable because the cache is written by this
        # tool into the user's own config dir
        try:
            with open(cache_file, "rb") as f:
                library = pickle.load(f)
        except Exception as exc:  # a corrupt or outdated cache is not fatal, the library is rebuilt below
            print("Failed to load library cache: {}".format(exc))

    if not library:
        library = shows.create_index(args.dest)
        with open(cache_file, "wb") as f:
            pickle.dump(library, f)

    failures = []
    results = []

    # iterate through all children of the src dirs
    for srcdir in args.src:
        # absolute, so that soft links created later do not dangle when --src was given as a relative path
        srcroot = os.path.abspath(srcdir)
        for fname in os.listdir(srcdir):
            # TODO season dirs are ignored for now
            if not os.path.isfile(os.path.join(srcdir, fname)):
                continue

            # Apply manually specified transformations
            item = fname
            for old, new in mappings.items():
                item = item.replace(old, new)

            # Parse information from the episode file name
            try:
                epinfo, item = parse_episode(item)
            except EpisodeParseException:
                failures.append(fname)
                continue  # nothing to match without episode info
            # parse_episode records the (possibly transformed) name it was given; keep the real on-disk name
            epinfo = epinfo._replace(file=fname)

            # Find a show from the library best matching this episode
            match_score = 0
            best_match_show = None
            for show in library:
                value = fuzz.token_set_ratio(show.name.lower(), item.lower())  # TODO add algorithm swap arg
                if value > match_score:
                    match_score = value
                    best_match_show = show

            if best_match_show is not None and match_score >= args.match_thresh:
                results.append(
                    MatchedEpisode(srcroot, epinfo, best_match_show,
                                   sub_bucket_name(best_match_show, epinfo.major, epinfo.minor, epinfo.extra),
                                   match_score))
            else:
                failures.append(fname)

    # drop episodes whose destination already exists (lexists also catches dangling soft links)
    before = len(results)
    results = [r for r in results
               if not os.path.lexists(os.path.join(r.dest.root, r.dest.dir, r.subdest, r.ep.file))]
    already_there = before - len(results)

    # show the plan and let the user strike rows until they accept or abort
    while True:
        results.sort(key=lambda x: x.dest.dir)
        tab_rows = []
        for i, item in enumerate(results):
            tab_rows.append([i,
                             os.path.join(item.root, item.ep.file),
                             item.ep.major,
                             item.ep.minor,
                             os.path.join(item.dest.root, item.dest.dir, item.subdest) + "/",
                             item.score,
                             "soft" if args.soft else "hard"])

        print(tabulate(tab_rows, headers=["number", "file", "season", "episode", "dest", "score", "link"]))

        if already_there:
            print("\n{} already in library and ignored".format(already_there))

        if failures:
            print("\n")
            print("Could not match:")
            pprint(failures)

        if not results:
            print("no candidates for linking found!")
            return

        resp = input("create links? [y/N/<lines to skip and print again>]: ").lower().strip()
        if not resp or resp == "n":
            return
        if resp == "y":
            break

        exclude = _parse_skip_numbers(resp, len(results))
        if exclude is None:
            print("Unrecognized response: {!r}".format(resp))
            continue
        # indexes come back highest first, so earlier pops do not shift later ones
        for number in exclude:
            results.pop(number)

    link = os.symlink if args.soft else os.link
    for item in results:
        src = os.path.join(item.root, item.ep.file)
        destdir = os.path.join(item.dest.root, item.dest.dir, item.subdest)
        dest = os.path.join(destdir, item.ep.file)
        os.makedirs(destdir, exist_ok=True)
        link(src, dest)
if __name__ == '__main__':

35
tvsort/common.py Normal file
View File

@ -0,0 +1,35 @@
from collections import namedtuple
class Season:
    """
    Enum-like set of chronological schemes. Every episode belongs to a season or a season-like entity, and that
    entity numbers its episodes either by season and episode number or by date. An episode can also belong to a season
    without following the regular naming scheme, e.g. a special. These values describe which scheme an episode (or a
    show) appears to use.
    """
    # none      : no scheme detected
    # by_season : season number + episode number
    # by_date   : year (+ month, + day)
    # special   : tied to a season but without an episode number
    none, by_season, by_date, special = 0, 1, 2, 3
# Struct describing an in-library tv show
#   root    : library folder that contains dir (as handed to shows.create_show)
#   dir     : name of the show's directory inside root (a bare directory name, not a full path)
#   name    : lower-cased dir, the text episode names are fuzzy-matched against
#   mode    : Season strategy guessed from the show's sub-folders (cannot be 'special')
#   seasons : list of ints taken from the sub-folder names - season numbers or years depending on mode
Show = namedtuple("Show", ["root", "dir", "name", "mode", "seasons"])
# Struct for describing an episode file.
#   file  : file name of the episode file
#   mode  : chronological scheme of file naming (see Season)
#   major : least granular chronological unit. Season number, or year for date-named episodes
#   minor : medium granular unit. Episode number, or month for date-named episodes; None when absent
#   extra : most granular unit. Day of month (only used for date-based episodes), otherwise None
EpInfo = namedtuple("EpInfo", ["file", "mode", "major", "minor", "extra"])

94
tvsort/parser.py Normal file
View File

@ -0,0 +1,94 @@
import re
import string
from tvsort.common import Season, Show, EpInfo
# match something like s01e02; the "e02" part is optional, so a bare "s01" matches too
# NOTE(review): there is no word boundary, so any "s" followed by two digits matches
NORMAL_SEASON_EP_RE = re.compile(r'(([sS]([0-9]{2}))x?([eE]([0-9]{2}))?)')

# match something like 21x04
NORMAL_SEASON_EP_RE2 = re.compile(r'(([0-9]+)[xX]([0-9]{2}))')

# match something like 2017-08-3 (years 2010-2019 only; month and day are optional groups)
DATE_SEASON_EP_RE = re.compile(r'((201[0-9]).([0-9]{1,2})?.([0-9]{1,2})?)')

# Release-name noise (resolution, codec, audio and source tags) removed before matching show names
_COMMON_CRAP_PATTERNS = (
    r'(720|1080)p',
    r'hdtv',
    r'(h.?)?x?264(.[a-z0-9]+)?',
    r'(ddp\d\.\d)?',
    r'web(\-?(dl|rip))?',
    r'[\.\-\s](amzn|amazon)[\.\-\s]',
    r'dd.5.\d',
    r'AAC2.\d',
)
COMMON_CRAP = [re.compile(pattern, flags=re.I) for pattern in _COMMON_CRAP_PATTERNS]
class EpisodeParseException(Exception):
    """
    Raised when no season/episode or date marker can be found in an episode file name
    """
# File extensions (without the dot) removed from the end of a name before parsing
_VIDEO_EXTENSIONS = ("mkv", "mp4", "m4v", "avi")


def _cut_marker(item, whole):
    """
    Remove the matched season/date marker `whole` from the working name `item`
    """
    pos = item.find(whole)
    if pos >= 10:
        # delete the marker and everything after it
        return item[0:pos]
    # unless it makes the name too short; then only the marker itself is dropped
    return item.replace(whole, "")


def parse_episode(fname):
    """
    Given a file name, parse out any information we can from the name
    :param fname: episode file name
    :return: tuple of (EpInfo, cleaned lower-case name); the EpInfo's file field is `fname` exactly as passed in, the
             name is what is left after the season/date marker and common release tags have been removed
    :raises EpisodeParseException: if the name holds neither a season/episode marker nor a date
    """
    # Remove the file extension. str.rstrip(".mkv") must not be used for this: it strips any trailing run of the
    # characters ".", "m", "k", "v" and so eats into the name itself
    item = fname.lower()
    stem, dot, ext = item.rpartition(".")
    if dot and ext in _VIDEO_EXTENSIONS:
        item = stem

    # Extract season information
    match = NORMAL_SEASON_EP_RE.search(item) or NORMAL_SEASON_EP_RE2.search(item)
    if match:
        fields = match.groups()
        if len(fields) == 5:
            # s01e02 style: the episode group is optional
            whole, _, season, _, episode = fields
        else:
            # 21x04 style: both numbers are always present
            whole, season, episode = fields
        if episode:
            epinfo = EpInfo(fname, Season.by_season, int(season), int(episode), None)
        else:
            # a season without an episode number is treated as a special
            epinfo = EpInfo(fname, Season.special, int(season), None, None)
    else:
        match = DATE_SEASON_EP_RE.search(item)
        if not match:
            raise EpisodeParseException("Could not parse episode {}".format(repr(fname)))
        whole, year, month, day = match.groups()
        epinfo = EpInfo(fname, Season.by_date, int(year),
                        int(month) if month else None,
                        int(day) if day else None)

    # And remove seasons info chars from the working name
    item = _cut_marker(item, whole)

    # Remove common torrenty names
    for crap in COMMON_CRAP:
        item = crap.sub("", item)

    # Remaining chars should be a show name and possibly an episode title. And random bs
    allowed_chars = string.ascii_lowercase + string.digits
    item = ''.join(ch if ch in allowed_chars else " " for ch in item).strip()

    return epinfo, item
def sub_bucket_name(show, major, minor, extra):
    """
    Name of the folder inside a show's directory that an episode belongs in
    :param show: Show the episode was matched to; its mode decides the naming
    :param major: season number or year of the episode
    :param minor: unused
    :param extra: unused
    :return: "Season <major>" for season-numbered shows, "<major>" for date-numbered shows, '' otherwise
    """
    scheme = show.mode
    if scheme == Season.by_season:
        return "Season {}".format(major)
    if scheme == Season.by_date:
        return str(major)
    return ''

53
tvsort/shows.py Normal file
View File

@ -0,0 +1,53 @@
import os
from tvsort.common import Show, Season
import string
def create_show(root_path, dirname):
    """
    Build the Show entry for one library directory, guessing its naming scheme from its contents
    :param root_path: library folder containing the show directory
    :param dirname: name of the show directory inside root_path
    :return: Show whose mode is Season.by_season or Season.by_date when "Season N" folders or year-named folders
             respectively outnumber every other kind of entry, else Season.none; seasons lists the season numbers or
             years found (empty for Season.none)
    """
    dir_lower = dirname.lower()

    # Inspect contents of show directory and guess naming scheme
    yearish = 0
    seasonish = 0
    wtfish = 0
    buckets_season = []
    buckets_year = []

    for item in os.listdir(os.path.join(root_path, dirname)):
        if item.lower().startswith("season "):
            # todo flexible season dir detection
            # take the first run of digits, so names like "Season 1 Part 2" or "Season finale" cannot crash int()
            numbers = ''.join(ch if ch in string.digits else " " for ch in item).split()
            if numbers:
                seasonish += 1
                buckets_season.append(int(numbers[0]))
                continue
        try:
            year = int(item)
        except ValueError:
            year = None
        # only plausible years count, and only those are recorded as buckets
        if year is not None and 1900 < year < 2050:
            yearish += 1
            buckets_year.append(year)
            continue
        wtfish += 1

    # a scheme wins only when it strictly outnumbers both alternatives
    if yearish > seasonish and yearish > wtfish:
        mode = Season.by_date
        episodes = buckets_year
    elif seasonish > yearish and seasonish > wtfish:
        mode = Season.by_season
        episodes = buckets_season
    else:
        mode = Season.none
        episodes = []

    return Show(root_path, dirname, dir_lower, mode, episodes)
def create_index(fs_paths):
    """
    Index every show found directly under the given library folders
    :param fs_paths: iterable of library folder paths
    :return: list with one create_show() result per sub-directory, in listing order; plain files are skipped
    """
    index = []
    for library_root in fs_paths:
        index.extend(
            create_show(library_root, entry)
            for entry in os.listdir(library_root)
            if os.path.isdir(os.path.join(library_root, entry)))
    return index