purge dest files removed on src

This commit is contained in:
Dave Pedu 2016-06-09 14:48:37 -07:00
parent 4b4d14c832
commit a2922b90a9
1 changed files with 57 additions and 26 deletions

View File

@ -7,19 +7,20 @@ from b2.api import B2Api
import sys import sys
from itertools import islice, filterfalse from itertools import islice, filterfalse
from concurrent.futures import ThreadPoolExecutor, Future from concurrent.futures import ThreadPoolExecutor, Future
import logging
import re import re
#import logging
#logging.basicConfig(level=logging.INFO) #logging.basicConfig(level=logging.INFO)
FileInfo = namedtuple('FileInfo', ['abs_path', 'rel_path', 'size', 'mtime', ]) # 'fp'
""" """
How it works: How it works:
B2SyncManager manages the transfer B2SyncManager manages the transfer
It holes a src and dest object, src objects provide an iterable of FileInfos. It holds a src and dest object, src objects provide an iterable of FileInfos.
The manager will iterate the set of FileInfos, and pass each to the dest The manager will iterate the set of FileInfos, and pass each to the dest
@ -30,7 +31,7 @@ class B2SyncManager(object):
workers = 10 workers = 10
def __init__(self, source_module, dest_module): def __init__(self, source_module, dest_module, exclude_res=None):
self.src = source_module self.src = source_module
self.dest = dest_module self.dest = dest_module
self.db = sqlite3.connect('./sync.db', check_same_thread=False) self.db = sqlite3.connect('./sync.db', check_same_thread=False)
@ -38,7 +39,7 @@ class B2SyncManager(object):
self.db.isolation_level = None # TBD - does it hurt perf? self.db.isolation_level = None # TBD - does it hurt perf?
self.exclude_res = [ self.exclude_res = [
re.compile(r'.*\.(DS_Store|pyc|dropbox)$') re.compile(r'.*\.(DS_Store|pyc|dropbox)$')
] ] + (exclude_res if exclude_res else [])
self._init_db() self._init_db()
@staticmethod @staticmethod
@ -83,6 +84,13 @@ class B2SyncManager(object):
def sync_up(self): def sync_up(self):
print("Syncing from {} to {}".format(self.src, self.dest)) print("Syncing from {} to {}".format(self.src, self.dest))
# Mark all files as unseen
# Files will be marked as seen as they are processed
# Later, unseen files will be purged
c = self.db.cursor()
row = c.execute("UPDATE 'files' SET seen=0;")
c.close()
chunk_size = 1000 chunk_size = 1000
files_source = filterfalse( # if rel_path matches any of the REs, the filter is True and the file is skipped files_source = filterfalse( # if rel_path matches any of the REs, the filter is True and the file is skipped
@ -102,25 +110,19 @@ class B2SyncManager(object):
with ThreadPoolExecutor(max_workers=B2SyncManager.workers) as executor: with ThreadPoolExecutor(max_workers=B2SyncManager.workers) as executor:
upload_futures = [executor.submit(self.xfer_file, item) for item in chunk] upload_futures = [executor.submit(self.xfer_file, item) for item in chunk]
#print("Queued {} tasks".format(len(chunk)))
for i in upload_futures: for i in upload_futures:
assert i.result() assert i.result()
def canskip(self, f):
if f.rel_path.endswith('.DS_Store'):
return True
else:
return False
def xfer_file(self, f): def xfer_file(self, f):
c = self.db.cursor() c = self.db.cursor()
row = c.execute("SELECT * FROM 'files' WHERE `path` = ?;", (f.rel_path,)).fetchone() row = c.execute("SELECT * FROM 'files' WHERE `path` = ?;", (f.rel_path,)).fetchone()
if self.canskip(f) or not row or row['mtime'] < f.mtime:
print("Starting:", f.rel_path) if not row or row['mtime'] < f.mtime:
print("Uploading:", f.rel_path)
try: try:
self.dest.put_file(f) self.dest.put_file(f)
except: except:
@ -128,18 +130,31 @@ class B2SyncManager(object):
print("Unexpected error:", sys.exc_info()[0]) print("Unexpected error:", sys.exc_info()[0])
raise raise
#print("Ok: ", f.rel_path) #print("Ok: ", f.rel_path)
#f.fp.close()
# The file was uploaded, commit it to the db # The file was uploaded, commit it to the db
c.execute("REPLACE INTO 'files' VALUES(?, ?, ?, ?);", (f.rel_path, f.mtime, f.size, 1)) c.execute("REPLACE INTO 'files' VALUES(?, ?, ?, ?);", (f.rel_path, f.mtime, f.size, 1))
#print("Done: ", f.rel_path) #print("Done: ", f.rel_path)
else: else:
print("Skipping:", f.rel_path) c.execute("UPDATE 'files' SET seen=1 WHERE `path` = ?;", (f.rel_path,)).fetchone()
#print("Skipping:", f.rel_path)
c.close() c.close()
return True return True
def purge_remote(self):
c = self.db.cursor()
c_del = self.db.cursor()
for purge_file in c.execute("SELECT * FROM 'files' WHERE seen=0;"):
print("Delete on remote: ", purge_file["path"])
self.dest.purge_file(purge_file["path"])
c_del.execute("DELETE FROM 'files' WHERE path=?;", (purge_file["path"],))
c_del.close()
c.close()
class Provider(object): class Provider(object):
""" """
@ -148,7 +163,12 @@ class Provider(object):
def __init__(self): def __init__(self):
pass pass
FileInfo = namedtuple('FileInfo', ['abs_path', 'rel_path', 'size', 'mtime', ]) # 'fp' def __iter__(self):
return self
def __next__(self):
raise NotImplemented()
class LocalProvider(Provider): class LocalProvider(Provider):
""" """
@ -161,9 +181,6 @@ class LocalProvider(Provider):
self.current_set = (None, [], []) self.current_set = (None, [], [])
self.walker = os.walk(self.local_path) self.walker = os.walk(self.local_path)
def __iter__(self):
return self
def __next__(self): def __next__(self):
if len(self.current_set[2]) > 0: if len(self.current_set[2]) > 0:
file_abs_path = os.path.join(self.current_set[0], self.current_set[2].pop()) file_abs_path = os.path.join(self.current_set[0], self.current_set[2].pop())
@ -196,6 +213,11 @@ class Reciever(object):
""" """
Base class for destinations Base class for destinations
""" """
def put_file(self, file_info):
raise NotImplemented()
def purge_file(self, file_path):
raise NotImplemented()
class B2Reciever(Reciever): class B2Reciever(Reciever):
@ -203,14 +225,14 @@ class B2Reciever(Reciever):
def __init__(self, bucket, path, account_id, app_key): def __init__(self, bucket, path, account_id, app_key):
super(B2Reciever, self).__init__() super(B2Reciever, self).__init__()
self.bucket = bucket self.bucket_name = bucket
self.path = path self.path = path
self.account_id = account_id self.account_id = account_id
self.app_key = app_key self.app_key = app_key
self.api = B2Api(max_upload_workers=B2SyncManager.workers) self.api = B2Api(max_upload_workers=B2SyncManager.workers)
self.api.authorize_account('production', self.account_id, self.app_key) self.api.authorize_account('production', self.account_id, self.app_key)
self.bucket = self.api.get_bucket_by_name(self.bucket) self.bucket = self.api.get_bucket_by_name(self.bucket_name)
def put_file(self, file_info): def put_file(self, file_info):
#print(">>> {}".format(file_info.abs_path)) #print(">>> {}".format(file_info.abs_path))
@ -220,12 +242,23 @@ class B2Reciever(Reciever):
dest_path dest_path
) )
def purge_file(self, file_path):
dest_path = os.path.join(self.path, file_path).lstrip('/')
self.delete_by_path(dest_path)
def delete_by_path(self, file_path):
for f in self.bucket.list_file_versions(start_filename=file_path, max_entries=100)["files"]:
if f["fileName"] == file_path:
self.api.delete_file_version(f["fileId"], f["fileName"])
else:
return
def sync(source_uri, dest_uri, account_id, app_key): def sync(source_uri, dest_uri, account_id, app_key):
source = urlparse(source_uri) source = urlparse(source_uri)
dest = urlparse(dest_uri) dest = urlparse(dest_uri)
syncer = B2SyncManager(source_uri, dest_uri)
source_provider = None source_provider = None
dest_receiver = None dest_receiver = None
@ -239,8 +272,6 @@ def sync(source_uri, dest_uri, account_id, app_key):
else: else:
raise Exception("Dests other than B2 URIs not yet supported") raise Exception("Dests other than B2 URIs not yet supported")
assert source_provider is not None assert source_provider is not None
assert dest_receiver is not None assert dest_receiver is not None