import re
import sys
import sqlite3
from urllib.parse import urlparse
from itertools import islice, filterfalse
from concurrent.futures import ThreadPoolExecutor

from b2mirror.localplugin import LocalProvider
from b2mirror.b2plugin import B2Reciever
from b2mirror.common import Result, results_ok

# import logging
# logging.basicConfig(level=logging.INFO)

"""
|
|
|
|
How it works:
|
|
|
|
|
|
|
|
B2SyncManager manages the transfer
|
|
|
|
|
2016-06-09 14:48:37 -07:00
|
|
|
It holds a src and dest object, src objects provide an iterable of FileInfos.
|
2016-06-07 20:24:35 -07:00
|
|
|
|
|
|
|
The manager will iterate the set of FileInfos, and pass each to the dest
|
|
|
|
|
|
|
|
Dest will upload the file, and inform the manager it was completed
|
|
|
|
|
|
|
|
"""
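
# A minimal usage sketch of the flow described above; the paths, bucket name and
# credentials are illustrative placeholders. This is what the sync() helper at
# the bottom of this module wires together:
#
#   provider = LocalProvider("/home/user/photos")    # src: yields FileInfos
#   receiver = B2Reciever(bucket="my-bucket", path="/photos",
#                         account_id="YOUR_ACCOUNT_ID", app_key="YOUR_APP_KEY",
#                         workers=4)
#   B2SyncManager(provider, receiver, workers=4).sync()
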
class B2SyncManager(object):

    def __init__(self, source_module, dest_module, exclude_res=None, workers=10):
        """
        :param source_module: subclass instance of b2mirror.base.Provider acting as a file source
        :param dest_module: subclass instance of b2mirror.base.Receiver acting as a file destination
        :param exclude_res: list of compiled regular expression objects that file paths will be matched against.
                            Finding a match means skip the file (and delete it on the remote).
        :param workers: number of parallel transfers
        """
        self.src = source_module
        self.dest = dest_module
        self.db = sqlite3.connect('./sync.db', check_same_thread=False)
        self.db.row_factory = B2SyncManager.dict_factory
        self.db.isolation_level = None  # autocommit mode; TBD - does it hurt perf?
        self.exclude_res = [
            re.compile(r'.*\.(DS_Store|pyc|dropbox)$'),
            re.compile(r'.*__pycache__.*'),
            re.compile(r'.*\.dropbox\.cache.*')
        ] + (exclude_res if exclude_res else [])
        self.workers = workers
        self._init_db()
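
    # Hypothetical example: add one custom exclusion on top of the built-in defaults
    # (the .swp pattern is illustrative):
    #   B2SyncManager(src, dest, exclude_res=[re.compile(r'.*\.swp$')], workers=4)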

    @staticmethod
    def dict_factory(cursor, row):
        # sqlite3 row factory: return each row as a dict keyed by column name
        d = {}
        for idx, col in enumerate(cursor.description):
            d[col[0]] = row[idx]
        return d
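
    # With dict_factory installed as row_factory, queries yield dicts rather than
    # tuples, e.g. a row from the `files` table (values are illustrative):
    #   {'path': 'photos/img_001.jpg', 'mtime': 1465571198, 'size': 123456, 'seen': 1}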

    def _init_db(self):
        """
        Init the sqlite database. Creates any missing tables.
        """
        c = self.db.cursor()

        def table_exists(table_name):
            c.execute("SELECT * FROM SQLITE_MASTER WHERE `type`='table' AND `name`=?", (table_name,))
            return len(c.fetchall()) > 0

        tables = {
            "files": """
                CREATE TABLE `files` (
                    `path` varchar(1024) PRIMARY KEY,
                    `mtime` INTEGER,
                    `size` INTEGER,
                    `seen` BOOLEAN
                );"""
        }

        for table_name, table_create_query in tables.items():
            if not table_exists(table_name):
                c.execute(table_create_query)

        c.close()

    def sync(self):
        """
        Sync the source to the dest. First uploads new local files, then cleans dead files from the remote.
        """
        # Phase 1 - Upload all local files missing on the remote
        self.sync_up()
        # Phase 2 - Delete files on the remote missing locally
        self.purge_remote()

    def sync_up(self):
        """
        Sync local files to the remote. All files in the DB are first marked as unseen; when a file is found
        locally it is marked as seen again. This state is later used to clear deleted files from the destination.
        """
        # print("Syncing from {} to {}".format(self.src, self.dest))

        # Mark all files as unseen
        # Files will be marked as seen as they are processed
        # Later, unseen files will be purged
        c = self.db.cursor()
        c.execute("UPDATE 'files' SET seen=0;")
        c.close()

        chunk_size = 1000

        # if rel_path matches any of the exclusion REs, filterfalse drops the file
        files_source = filterfalse(lambda x: any(pattern.match(x.rel_path) for pattern in self.exclude_res),
                                   self.src)

        while True:
            chunk = list(islice(files_source, chunk_size))

            if len(chunk) == 0:
                break

            for item in chunk:
                # overly long path names can't be put in sqlite
                assert len(item.rel_path) < 512

            # leaving the with-block waits for all submitted transfers to finish
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                upload_futures = [executor.submit(self.xfer_file, item) for item in chunk]

            for i in upload_futures:
                assert i.result() in results_ok
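
    # The seen-flag lifecycle over one sync pass (row values are illustrative):
    #   after UPDATE 'files' SET seen=0:   {'path': 'a.txt', 'mtime': ..., 'seen': 0}
    #   after xfer_file() visits a.txt:    {'path': 'a.txt', 'mtime': ..., 'seen': 1}
    #   purge_remote() then deletes every file whose row still has seen=0.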

    def xfer_file(self, f):
        """
        Future-called function that handles a single file. The file's modification time is checked against the
        database to see whether the file has new content that should be uploaded or is untouched since the last sync.
        """
        result = Result.failed

        c = self.db.cursor()

        row = c.execute("SELECT * FROM 'files' WHERE `path` = ?;", (f.rel_path,)).fetchone()

        if not row or row['mtime'] < f.mtime:
            print("Uploading:", f.rel_path)
            try:
                result = self.dest.put_file(f, purge_historics=row is not None)
            except Exception:
                print("Failed:", f.rel_path)
                print("Unexpected error:", sys.exc_info()[0])
                raise
            # print("Ok:   ", f.rel_path)

            # The file was uploaded, commit it to the db
            c.execute("REPLACE INTO 'files' VALUES(?, ?, ?, ?);", (f.rel_path, f.mtime, f.size, 1))
            # print("Done: ", f.rel_path)

        else:
            c.execute("UPDATE 'files' SET seen=1 WHERE `path` = ?;", (f.rel_path,))
            # print("Skipping:", f.rel_path)
            result = Result.skipped

        c.close()

        return result
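
    # The mtime rule above, with hypothetical values:
    #   no db row for path                      -> upload (new file)
    #   db mtime 1465000000, local 1465099999   -> upload again, purging old versions
    #   db mtime 1465000000, local 1465000000   -> mark seen, skip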

    def purge_remote(self):
        """
        Delete files on the remote that were not found when scanning the local tree.
        """
        c = self.db.cursor()
        c_del = self.db.cursor()  # second cursor so deletes don't disturb the SELECT iteration

        for purge_file in c.execute("SELECT * FROM 'files' WHERE seen=0;"):
            print("Delete on remote: ", purge_file["path"])
            self.dest.purge_file(purge_file["path"])
            c_del.execute("DELETE FROM 'files' WHERE path=?;", (purge_file["path"],))

        c_del.close()
        c.close()


def sync(source_uri, dest_uri, account_id, app_key, workers=10, exclude=None):
    """
    Parse the source and dest URIs, build the matching provider/receiver pair, and run a full sync.
    """
    source = urlparse(source_uri)
    dest = urlparse(dest_uri)

    source_provider = None
    dest_receiver = None

    if source.scheme == '':  # Plain file path
        source_provider = LocalProvider(source.path)
    else:
        raise Exception("Sources other than local file paths are not supported")

    if dest.scheme == 'b2':  # B2 URI
        dest_receiver = B2Reciever(bucket=dest.netloc, path=dest.path, account_id=account_id, app_key=app_key,
                                   workers=workers)
    else:
        raise Exception("Dests other than B2 URIs are not yet supported")

    assert source_provider is not None
    assert dest_receiver is not None

    syncer = B2SyncManager(source_provider, dest_receiver, workers=workers, exclude_res=exclude)
    syncer.sync()
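
# Example invocation (bucket name and credentials are placeholders):
#
#   if __name__ == '__main__':
#       sync("/home/user/documents", "b2://my-backups/documents",
#            account_id="YOUR_ACCOUNT_ID", app_key="YOUR_APP_KEY",
#            workers=10, exclude=[re.compile(r'.*\.tmp$')])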