renames and moves

dave 2019-05-24 19:46:06 -07:00
parent 9aedf2f53d
commit 62f3729a83
2 changed files with 42 additions and 41 deletions

View File

@@ -6,7 +6,7 @@ import logging
 import cherrypy
 from threading import Thread
 from jinja2 import Environment, FileSystemLoader, select_autoescape
-from dirview.dirtools import gen_db, gen_index, NodeType, NodeGroup
+from dirview.dirtools import gen_db, gen_node_index, NodeType, NodeGroup
 APPROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
@@ -24,7 +24,7 @@ class DbUpdater(Thread):
         logging.info("Updating database...")
         self.root = gen_db(self.root_path)
         logging.info("Generating index...")
-        self.index = gen_index(self.root)
+        self.index = gen_node_index(self.root)
         logging.info("Warming caches...")
         self.root.total_size  # calculating these requires recursing over all nodes
         self.root.total_children
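
The warm-up step works because total_size and total_children are computed lazily and cached on first access, so touching them once at the root recurses the whole tree and every later request gets the memoized value. A minimal sketch of that pattern, with the memoization details assumed rather than taken from the repo:

class Node:
    def __init__(self, name, size=0, children=None):
        self.name = name
        self.size = size
        self.children = children or []
        self._total_size = None  # memo slot, filled on first access

    @property
    def total_size(self):
        # Recurse once, then serve the cached value on every later access.
        if self._total_size is None:
            self._total_size = self.size + sum(c.total_size for c in self.children)
        return self._total_size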

View File

@@ -94,7 +94,7 @@ class Node:
     def __hash__(self):
         return id(self)
-    # def __str__(self): # TODO
+    # def __str__(self): # TODO, because the default str() shows all the children recursively
     # pass
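
One way to eventually satisfy that TODO without the recursive blow-up is to summarize the children instead of embedding them; a hypothetical sketch, not the repo's implementation:

    def __str__(self):
        # Summarize rather than recurse, so printing a node stays cheap.
        return f"<Node {self.name!r}: {len(self.children)} children, {self.size} bytes>"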
@@ -177,7 +177,7 @@ def serialize_db(db):
         yield node.serialize()
-def gen_index(db):
+def gen_node_index(db):
     index = {}
     for node in db.iter():
         index[id(node)] = node
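
The renamed gen_node_index walks the tree once and maps each node's id() back to the node itself, so an id handed out externally (for example, embedded in a URL) can be resolved to the live object in O(1). Assuming a loaded tree, usage would look like:

index = gen_node_index(root)
node = index[requested_id]  # requested_id is hypothetical, e.g. parsed from a request path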
@@ -189,6 +189,43 @@ def write_db(db, fobj):
         fobj.write(json.dumps(ob) + "\n")
+def load_db(fpath):
+    """
+    Loading the db:
+    1) parse all node objects and save them in a cache keyed by the embedded IDs
+    2) for each node in the cache, re-establish the child/parent pointers
+    Note that the cache is discarded and does NOT become the node id cache, because it is keyed by the serialized IDs.
+    On my i7-7920HQ CPU @ 3.10GHz, loading a 276M dump with 2.2M lines takes 22s.
+    """
+    nodecache = {}  # mapping of serialized_id -> object
+    root = None
+    with open(fpath) as f:
+        for line in f:  # stream the file; readlines() would hold the whole dump in memory
+            info = json.loads(line)
+            node = Node(name=info["name"],
+                        typ=NodeType(info["typ"]),
+                        children=info["children"],  # keep as IDs for now
+                        size=info["size"],
+                        parent=info["parent"])  # also kept as an ID; resolving via nodecache here would assume the dump is written parent-first
+            nodecache[info["id"]] = node
+    for oldid, node in nodecache.items():
+        node.children = [nodecache[child_old_id] for child_old_id in node.children]
+        if node.parent is not None:
+            node.parent = nodecache[node.parent]
+        else:
+            root = node  # only the root was serialized without a parent
+    return root
 def test_gen_write_db(path):
     path = os.path.normpath(os.path.abspath(path))
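
load_db is the inverse of write_db: one JSON object per line, with the child/parent pointers flattened to IDs. Judging only from the fields the loader reads, each record would look roughly like this (values illustrative, not from a real dump):

{"id": 139788, "name": "src", "typ": 2, "size": 4096, "parent": 139700, "children": [139801, 139802]}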
@@ -238,42 +275,6 @@ def test_gen_write_db(path):
     # pass
-def load_db(fpath):
-    """
-    Loading the db
-    1) parse all node objects and save them in a cache keyed by the embedded IDs
-    2) for each node in the cache:
-    3) re-establish child pointers
-    4) re-establish parent pointers
-    On my i7-7920HQ CPU @ 3.10GHz, loading a 276M dump with 2.2M lines takes 22s
-    """
-    nodecache = {}  # mapping of serialized_id->object
-    root = None
-    with open(fpath) as f:
-        for line in f.readlines():
-            info = json.loads(line)
-            node = Node(name=info["name"],
-                        typ=NodeType(info["typ"]),
-                        children=info["children"],  # keep as IDs for now
-                        size=info["size"],
-                        parent=nodecache[info["parent"]])
-            nodecache[info["id"]] = node
-            if node.parent is None:
-                root = node
-    # for oldid, node in nodecache.items():
-    #     node.children = [nodecache[child_old_id] for child_old_id in node.children]
-    #     if node.parent is not None:
-    #         node.parent = nodecache[node.parent]  # this may break on symlinks or other loops?
-    return root
 def test_load_db(fpath):
     print("ready")
     start = time()
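
The version removed above resolved parent=nodecache[info["parent"]] at parse time, which only works when every parent line precedes its children in the dump; the two-pass replacement drops that ordering assumption. A contrived two-line dump (IDs illustrative) shows the failure mode:

# Single-pass loader, child serialized before its parent:
#   {"id": 2, "name": "child", "parent": 1, ...}   -> nodecache[1] raises KeyError, id 1 not parsed yet
#   {"id": 1, "name": "root", "parent": null, ...}
# Parsing everything first and resolving pointers afterwards makes the order irrelevant.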
@@ -287,7 +288,7 @@ def test_load_db(fpath):
     print(f"counted {count} nodes in {round(time()-start, 2)}s")
     start = time()
-    index = gen_index(db)
+    index = gen_node_index(db)
     print(f"generated index with {len(index)} nodes in {round(time()-start, 2)}s")