#!/usr/bin/env python3
# Provenance: reconstructed from the patch "initial commit"
# (41b83c1705b2d27acba685cead07489abc39c121, dave, 2017-06-25), which created
# this file as parser.py.
"""IRC log ascii art exporter toolkit.

Reads IRC logs, finds runs of lines that carry mIRC color codes, and writes
each run ("ascii") to its own numbered file. Also offers sub-commands to view
a file with its colors rendered and to split one ascii file into several.
"""

import os
import re
from collections import defaultdict

from colored import fg, bg, attr, style


IRC_BOLD = "\x02"
IRC_COLOR = "\x03"
IRC_ITALIC = "\x16"
IRC_UNDERLINE = "\x1f"

# See https://pypi.python.org/pypi/colored
WHITE = "white"
DARK_GREY = "grey_23"
BLUE = "blue"
GREEN = "green"
ORANGE = "light_red"
RED = "red"
PURPLE = "plum_4"
OFF_YELLOW = "gold_1"
YELLOW = "yellow"
LIGHT_GREEN = "light_green"
AQUAMARINE = "cyan"
CYAN = "light_cyan"
LIGHT_BLUE = "light_blue"
LIGHT_PURPLE = "light_magenta"
LIGHT_GREY = "dark_gray"
V_LIGHT_GREY = "grey_82"

# Index in this list == IRC color number (0-15)
ircolors = [WHITE, DARK_GREY, BLUE, GREEN, ORANGE, RED, PURPLE, OFF_YELLOW, YELLOW,
            LIGHT_GREEN, AQUAMARINE, CYAN, LIGHT_BLUE, LIGHT_PURPLE, LIGHT_GREY, V_LIGHT_GREY]

# Section spec for the "split" action: "<start>[-<end>]<tag>", e.g. "0a" or "10-20a".
# The group names are the ones main() reads ("start", "end", "tag").
TRANSFORM_RE = re.compile(r'(?P<start>[0-9]+)(\-(?P<end>[0-9]+))?(?P<tag>[a-z]+)')


def print_palette():
    """
    Prints out color palette for debugging colors
    """
    for num, color in enumerate(ircolors):
        print("{reset}{num}: {code} {num} {num} {num} {num} {num} "
              .format(reset=style.RESET, num=num, code=bg(color)))


def load_file(path):
    """
    Read a file and return its lines as a list of str, without line endings.

    The file is read as bytes and decoded line by line as UTF-8, so a single
    undecodable line is reported and dropped instead of failing the whole file.
    """
    all_lines = []
    with open(path, "rb") as f:
        for raw in f:
            try:
                # Strip "\r" as well so CRLF logs don't leak carriage returns
                all_lines.append(raw.decode("UTF-8").rstrip("\r\n"))
            except UnicodeDecodeError:
                print("Dropped line: {}".format(raw))
    return all_lines


def write_ascii(chatlines, output_dir):
    """
    Write the message text of each ChatLine to a new numbered file.

    Files in output_dir are named 1, 2, 3, ...; the new file gets the highest
    existing number plus one. Returns the path of the file written.
    """
    os.makedirs(output_dir, exist_ok=True)

    max_item = 0
    for item in os.listdir(output_dir):
        try:
            num = int(item)
        except ValueError:
            # Not one of our numbered files (e.g. ".DS_Store"); ignore it
            continue
        max_item = max(max_item, num)

    dest_file = os.path.join(output_dir, str(max_item + 1))
    with open(dest_file, "wb") as f:
        for line in chatlines:
            # message is None for log lines that are not chat; fall back to
            # the raw line rather than crashing on None.encode()
            text = line.message if line.message is not None else line.original
            f.write(text.encode("UTF-8") + b"\n")
    return dest_file


class ChatLine(object):
    """
    One line of an IRC log (or of an extracted ascii file).

    Assigning `original` (done by the constructor) parses the line:
      - is_chat:    True if the line looked like "[time] <nick> message"
      - nick:       sender nick for chat lines, else None
      - message:    the message text; the whole line if it did not look like
                    a log line at all; None for non-chat log lines
      - has_format: True if the message contained an IRC color code
      - formatted:  the message with IRC color codes replaced by terminal
                    escape sequences
    """

    # "[12:34:56] <nick> message" or "[12:34:56] anything else".
    # "nick" and "message" are read by parse(); "timestamp" and "other" are unused.
    line_format_re = re.compile(
        r'^(?P<timestamp>\[[0-9\:]+\]) ((<(?P<nick>[^>]+)>) (?P<message>.+)|(?P<other>.+))')
    # Capturing group so that re.split() keeps the control characters
    formatting_re = re.compile(r'([\x02\x03\x16\x1f])')

    def __init__(self, line):
        self._parts = []

        # Was the line a privmsg or something else?
        self.is_chat = False
        # Was there formatting in the line?
        self.has_format = False
        # Sender nick (if chat)
        self.nick = None
        # Message only
        self.message = None

        # Triggers parse() through the property setter
        self.original = line

    def parse(self):
        """
        (Re)compute all derived attributes from the current original line.
        """
        line = self._original

        # Start from a clean slate so re-assigning `original` does not
        # append to the parts of the previous line
        self._parts = []
        self.is_chat = False
        self.has_format = False
        self.nick = None
        self.message = None

        matches = ChatLine.line_format_re.match(line)
        if not matches:
            # Assume it's just a message ready to be formatted
            self.message = line
            self.parse_message_args(line)
            return

        line_parts = matches.groupdict()
        if line_parts["message"]:
            self.is_chat = True
            self.nick = line_parts["nick"]
            self.message = line_parts["message"]
            self.parse_message_args(self.message)

    def parse_message_args(self, message):
        """
        Split message on IRC control characters and append the rendered
        pieces to the internal parts list; sets has_format on a color code.
        """
        # With a capturing group, split() returns
        # [text, control, text, control, text, ...] - always an odd length
        pieces = ChatLine.formatting_re.split(message)
        # Anything before a format char isn't formatted
        self._parts.append(pieces[0])

        # Each pair is a formatting sequence and the following text
        for seperator, part in zip(pieces[1::2], pieces[2::2]):
            if seperator == IRC_COLOR:
                self.has_format = True
                # Try to parse foreground, background colors
                # Can look like:
                #   9,9DATA
                #   10,10DATA or 9,10DATA or 10,9DATA
                #   ,10DATA
                foreground, part = ChatLine._take_color(part)
                background = None
                # Is there a comma followed by a number?
                if len(part) >= 2 and part[0] == "," and ChatLine.is_numeric(part[1]):
                    background, part = ChatLine._take_color(part[1:])

                if foreground is not None:
                    self._parts.append(fg(ircolors[foreground]))
                if background is not None:
                    self._parts.append(bg(ircolors[background]))
                # A bare color code with no numbers resets all styles
                if foreground is None and background is None:
                    self._parts.append(style.RESET)
            # NOTE: bold, italic and underline control characters are removed
            # from the output but not rendered, and do not set has_format.

            self._parts.append(part)

    @staticmethod
    def _take_color(text):
        """
        Consume a 1 or 2 digit color number from the front of text.

        A second digit is only consumed when the two digits together are a
        valid palette index (<= 15); otherwise it is treated as message text.
        Returns (color, remaining_text); color is None if text does not start
        with a digit.
        """
        if not text or not ChatLine.is_numeric(text[0]):
            return None, text
        digits, rest = text[0], text[1:]
        if rest and ChatLine.is_numeric(rest[0]) and int(digits + rest[0]) <= 15:
            digits, rest = digits + rest[0], rest[1:]
        return int(digits), rest

    @property
    def formatted(self):
        return ''.join(self._parts)

    def __str__(self):
        return self.formatted

    @property
    def original(self):
        return self._original

    @original.setter
    def original(self, original):
        self._original = original
        self.parse()

    @staticmethod
    def is_numeric(char):
        """
        True if char is an ASCII digit. Deliberately not str.isdigit(), which
        also accepts non-ASCII digits that are not valid IRC color numbers.
        """
        return "0" <= char <= "9"


def parse_logfile(logfile, output_dir):
    """
    Extract asciis from one log file and write each to output_dir.

    A formatted line starts (or continues) an ascii for its sender. An ascii
    is considered finished once its sender has been silent for more than 16
    lines, or when the log ends. A new sender whose first formatted line
    directly follows someone else's formatted lines, and who speaks again
    soon, is treated as contributing to that existing ascii.
    """
    # print_palette()

    # Parse every line once up front; the look-behind / look-ahead windows
    # below would otherwise re-parse the same lines over and over
    all_lines = [ChatLine(raw) for raw in load_file(logfile)]

    idle_limit = 16   # lines of silence after which an ascii is complete
    look_behind = 3   # previous lines checked for someone else's formatting
    look_ahead = 15   # following lines checked for the sender speaking again

    # Mapping of sender nick to lines sent/meta
    class Ascii:
        def __init__(self, lastseen, lines, ref):
            # line number the chatter was last seen
            self.lastseen = lastseen
            # ChatLines collected for this ascii
            self.lines = list(lines)
            # if truthy: nick of the ascii this sender contributes to;
            # act on that one instead
            # NOTE: a target whose nick is None (formatted non-chat line)
            # is indistinguishable from "no ref" here
            self.ref = ref

        def __str__(self):
            return "Ascii(lastseen={}, len(lines)={}, ref={})".format(self.lastseen, len(self.lines), self.ref)

    watched_asciis = {}

    def resolve(input_nic):
        # Follow refs until reaching the nick that actually owns the lines
        n = watched_asciis[input_nic]
        if n.ref:
            return resolve(n.ref)
        return input_nic

    def clean_refs(input_nic):
        # Drop every slot that merely points at input_nic
        stale = [name for name, slot in watched_asciis.items() if slot.ref == input_nic]
        for name in stale:
            del watched_asciis[name]

    for lineno, line in enumerate(all_lines):
        # print(line.nick, line, style.RESET)

        # Do EOF completes (iterate a snapshot: entries are deleted below)
        for nickslot_name, nickslot in list(watched_asciis.items()):
            if not nickslot.ref and lineno - nickslot.lastseen > idle_limit:
                print("EOF-Completed {}".format(nickslot_name))
                for i in nickslot.lines:
                    print(i.nick, i, style.RESET)
                write_ascii(nickslot.lines, output_dir)
                del watched_asciis[nickslot_name]
                clean_refs(nickslot_name)

        if not line.has_format:  # TODO or smells like ascii art
            continue

        nickslot_name = line.nick
        nickslot = watched_asciis.get(nickslot_name)
        if nickslot is not None:
            nickslot.lastseen = lineno
            if nickslot.ref:
                # Refs are removed together with their target, so it exists
                nickslot = watched_asciis[nickslot.ref]
                nickslot.lastseen = lineno
            nickslot.lines.append(line)
            continue

        # If:
        # - any of the last 3 lines was formatted
        # - I speak again in the next X lines,
        # We assume multi-user ascii
        # (slice ends at lineno, not lineno - 1: the previous line counts, and
        # at lineno 0 a negative end would select almost the whole file)
        formatted_before = [i for i in all_lines[max(lineno - look_behind, 0):lineno]
                            if i.has_format]
        i_speak_again = any(i.nick == nickslot_name
                            for i in all_lines[lineno + 1:lineno + 1 + look_ahead])

        joined = False
        if formatted_before and i_speak_again:
            last_nick = formatted_before[-1].nick
            if last_nick in watched_asciis:
                # Ref onto existing ascii
                target = resolve(last_nick)
                watched_asciis[nickslot_name] = Ascii(lineno, [], target)
                target_slot = watched_asciis[target]
                target_slot.lines.append(line)
                # Keep the shared ascii alive while contributors are active
                target_slot.lastseen = lineno
                joined = True

        if not joined:
            # Start a new ascii
            watched_asciis[nickslot_name] = Ascii(lineno, [line], "")

    for nickslot_name, nickslot in watched_asciis.items():
        if nickslot.ref:
            # Ref slots own no lines; their lines live in the target
            continue
        print("EOL-Completed {}".format(nickslot_name))
        for line in nickslot.lines:
            print(line.nick, line, style.RESET)
        write_ascii(nickslot.lines, output_dir)


def main():
    """
    Command line entry point; see --help for the available actions.
    """
    import argparse

    parser = argparse.ArgumentParser(description="IRC log ascii art exporter toolkit")
    subparser_action = parser.add_subparsers(dest='action', help='action to take')

    parser_parse = subparser_action.add_parser('parse', help='Extract asciis from logs')
    parser_parse.add_argument('-o', '--output-dir', help="Output directory", default="./asciis")
    parser_parse.add_argument('files', nargs="+", help="Input log files")

    parser_insp = subparser_action.add_parser('inspect', help='View files with formatting')
    parser_insp.add_argument("-l", "--lines", action="store_true", help="Show line numbers")
    parser_insp.add_argument('file', nargs=1, help="Input file")

    parser_split = subparser_action.add_parser('split', help='Split asciis into multiple')
    parser_split.add_argument('file', nargs=1, help="Input file")
    parser_split.add_argument("-p", "--preview", action="store_true", help="Don't write anything")
    parser_split.add_argument('-o', '--output-dir', help="Output directory", default="./asciis")
    parser_split.add_argument("-t", "--sections", nargs="+", help="sections to modify", required=True)
    # TODO write docs for --sections
    # Format is:
    #   0a          associate line 0 with group a
    #   10-20a      associate lines 10-20, inclusive, with group a

    args = parser.parse_args()

    if args.action == "parse":
        for item in args.files:
            parse_logfile(item, args.output_dir)

    elif args.action == "inspect":
        for lineno, line in enumerate(load_file(args.file[0])):
            if args.lines:
                print("{}\t".format(lineno), end='')
            print(ChatLine(line).formatted + style.RESET)

    elif args.action == "split":
        # One [tag, text] pair per input line; tag stays None until assigned
        tagged = [[None, line] for line in load_file(args.file[0])]
        for t in args.sections:
            spec = TRANSFORM_RE.fullmatch(t)
            if spec is None:
                parser.error("Invalid section {!r}, expected e.g. 0a or 10-20a".format(t))

            start = int(spec.group("start"))
            end = int(spec.group("end")) if spec.group("end") else start
            if start > end or end >= len(tagged):
                parser.error("Section {!r} is outside lines 0-{}".format(t, len(tagged) - 1))

            for i in range(start, end + 1):
                tagged[i][0] = spec.group("tag")

        grouped = defaultdict(list)
        for tag, line in tagged:
            grouped[tag].append(line)

        if None in grouped:
            parser.error("Not all lines grouped")

        for tag, group_lines in grouped.items():
            print("Tag {}:".format(tag))
            for line in group_lines:
                print(ChatLine(line).formatted + style.RESET)
            if not args.preview:
                new_file = write_ascii([ChatLine(i) for i in group_lines], args.output_dir)
                print("Wrote {}".format(new_file))
            print("\n\n")

        # The pieces replace the source file
        if not args.preview:
            os.unlink(args.file[0])

    else:
        # No sub-command given
        parser.print_help()


if __name__ == '__main__':
    main()