basic tool

This commit is contained in:
dave 2021-12-23 15:38:18 -08:00
parent 6c186bd3fb
commit 1130138f3e
5 changed files with 179 additions and 86 deletions

View File

@ -1,4 +1,3 @@
import os
import hashlib
@ -9,88 +8,3 @@ def hash_chunk(data):
h = hashlib.md5()
h.update(data)
return h.hexdigest()
class BaseChunkClient:
    """
    Abstract interface for treating a file as a sequence of fixed-size chunks.

    Nothing is implemented here: every operation raises NotImplementedError and
    must be provided by a subclass.
    """

    def __init__(self, fpath, chunk_size=CHUNK_SIZE):
        """
        record the path of the file being operated on and the size of one chunk
        """
        self.fpath = fpath
        self.chunk_size = chunk_size

    def get_hashes(self):
        """
        produce a stream of chunk hashes as (chunk_number, chunk_hash, ) tuples
        """
        raise NotImplementedError

    def get_chunk(self, chunk_number):
        """
        fetch the data for chunk_number. Documented contract: a file handle from which
        CHUNK_SIZE bytes of data can be read.
        NOTE(review): LocalChunkClient in this file returns the bytes themselves - confirm
        which of the two is intended.
        """
        raise NotImplementedError

    def put_chunk(self, chunk_number, contents):
        """
        store contents at chunk_number's position within the file. Documented contract:
        contents is a file-like object.
        NOTE(review): LocalChunkClient in this file hands contents straight to write(),
        i.e. it expects bytes - confirm which is intended.
        """
        raise NotImplementedError
class LocalChunkClient(BaseChunkClient):
    """
    Chunk client backed by a file on the local filesystem.

    The instance keeps one read handle open for its whole lifetime; call close()
    or use the instance as a context manager to release it.
    """

    def __init__(self, fpath, chunk_size=CHUNK_SIZE):
        """
        open fpath for reading; the file must already exist
        """
        super().__init__(fpath, chunk_size)
        # one shared handle serves every get_chunk call instead of doing lots of open/close
        self.file = open(self.fpath, "rb")

    def close(self):
        """
        release the shared read handle. Closing an already closed handle is a no-op.
        """
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_hashes(self):
        """
        yield (chunk_number, chunk_hash) for each chunk_size piece of the file, in order.
        The last chunk may be shorter than chunk_size; an empty file yields nothing.
        """
        # a separate handle is used so hashing does not disturb the position of self.file
        with open(self.fpath, "rb") as f:
            # iter() with a sentinel keeps calling read() until it returns b"" (EOF)
            for i, data in enumerate(iter(lambda: f.read(self.chunk_size), b"")):
                yield (i, hash_chunk(data))

    def get_chunk(self, chunk_number):
        """
        return the bytes of chunk chunk_number (at most chunk_size of them)

        raises Exception if the chunk would start at or after the end of the file
        """
        position = chunk_number * self.chunk_size
        # a chunk that starts exactly at EOF holds no data, so the bound is >= rather than >
        if position >= os.path.getsize(self.fpath):
            raise Exception("requested chunk {} is beyond EOF".format(chunk_number))
        self.file.seek(position)  # TODO not thread safe
        return self.file.read(self.chunk_size)

    def put_chunk(self, chunk_number, contents):
        """
        overwrite the file starting at chunk_number's position with contents.

        contents is passed directly to write() on a binary file, so it must be bytes
        (not a file-like object). Writing past the current end of the file extends it.
        """
        position = chunk_number * self.chunk_size
        # "rb+" updates in place without truncating; the file must already exist
        with open(self.fpath, "rb+") as f:
            f.seek(position)
            f.write(contents)
def main():
    """
    Dry run: compare test.zip against dest.zip chunk by chunk and print the number
    of every chunk that would have to be copied for dest.zip to match test.zip.
    Nothing is written.
    """
    src = LocalChunkClient("test.zip")
    src_hashes = list(src.get_hashes())
    dest = LocalChunkClient("dest.zip")
    dest_hashes = list(dest.get_hashes())

    # get_hashes already yields (chunk_number, chunk_hash) pairs, numbered from 0,
    # so the chunk number doubles as the index into dest_hashes
    for chunk_number, chunk_hash in src_hashes:
        # >= (not >): index len(dest_hashes) is already past the end of dest
        if chunk_number >= len(dest_hashes) or chunk_hash != dest_hashes[chunk_number][1]:
            print("would copy chunk", chunk_number)


if __name__ == '__main__':
    main()

51
blobsend/cli.py Normal file
View File

@ -0,0 +1,51 @@
import argparse
from urllib.parse import urlparse
from blobsend.client_file import FileChunkClient
# uri scheme -> chunk client class used for uris of that scheme
SCHEMES = {"file": FileChunkClient}
def get_args(argv=None):
    """
    parse the command line arguments

    argv is the list of argument strings to parse. When left as None, argparse
    reads them from sys.argv, which is what the command line entry point wants;
    passing a list allows the parser to be driven programmatically.

    returns a tuple of (parsed arguments with .src and .dest, the parser itself)
    """
    parser = argparse.ArgumentParser(description="file blob copy utility")
    parser.add_argument("src", help="source file uri")
    parser.add_argument("dest", help="dest file uri")
    return parser.parse_args(argv), parser
def get_client(uri):
    """
    build the chunk client for a parsed uri

    The client class is looked up in SCHEMES by uri.scheme; a uri without a
    scheme is treated as "file". A scheme missing from SCHEMES raises KeyError.
    """
    scheme = uri.scheme if uri.scheme else "file"
    client_class = SCHEMES[scheme]
    return client_class.from_uri(uri)
def main():
    """
    command line entry point: make dest match src, copying only the chunks whose
    hashes differ (or that dest does not have), then adjust dest's length to src's
    """
    args, parser = get_args()
    print(args)

    src = get_client(urlparse(args.src))
    dest = get_client(urlparse(args.dest))

    # dest's hashes are consumed in lockstep with src's, one per src chunk
    dest_hashes_iter = dest.get_hashes()
    for src_chunk_number, src_chunk_hash in src.get_hashes():
        # once dest has run out of chunks, (None, None) stands in for its entry
        dest_chunk_number, dest_chunk_hash = next(dest_hashes_iter, (None, None))

        dest_has_chunk = dest_chunk_number is not None
        if dest_has_chunk and src_chunk_number != dest_chunk_number:
            raise Exception("sequence mismatch?")

        if src_chunk_hash == dest_chunk_hash:
            continue  # chunk already identical, nothing to transfer

        blob = src.get_chunk(src_chunk_number)
        dest.put_chunk(src_chunk_number, blob)

    dest.set_length(src.get_length())


if __name__ == '__main__':
    main()

44
blobsend/client_base.py Normal file
View File

@ -0,0 +1,44 @@
from blobsend import CHUNK_SIZE
class BaseChunkClient:
    """
    Abstract interface for treating a file as a sequence of fixed-size chunks.

    Nothing is implemented here: every operation raises NotImplementedError and
    must be provided by a subclass.
    """

    def __init__(self, fpath, chunk_size=CHUNK_SIZE):
        """
        record the path of the file being operated on and the size of one chunk
        """
        self.fpath = fpath
        self.chunk_size = chunk_size

    def get_hashes(self):
        """
        produce a stream of chunk hashes as (chunk_number, chunk_hash, ) tuples
        """
        raise NotImplementedError

    def get_chunk(self, chunk_number):
        """
        fetch the data for chunk_number. Documented contract: a file handle from which
        CHUNK_SIZE bytes of data can be read.
        NOTE(review): confirm against the concrete clients whether a handle or the raw
        bytes are actually returned.
        """
        raise NotImplementedError

    def put_chunk(self, chunk_number, contents):
        """
        store contents at chunk_number's position within the file. Documented contract:
        contents is a file-like object.
        NOTE(review): confirm against the concrete clients whether a file-like object or
        raw bytes are actually expected.
        """
        raise NotImplementedError

    def get_length(self):
        """
        report the size of the file
        """
        raise NotImplementedError

    def set_length(self, length):
        """
        truncate or extend the file to length
        """
        raise NotImplementedError

    @staticmethod
    def from_uri(uri):
        """
        construct a client from the given uri
        """
        raise NotImplementedError

62
blobsend/client_file.py Normal file
View File

@ -0,0 +1,62 @@
import os
from blobsend.client_base import BaseChunkClient
from blobsend import CHUNK_SIZE, hash_chunk
class FileChunkClient(BaseChunkClient):
    """
    Chunk client backed by a file on the local filesystem.

    The instance keeps one file handle open for its whole lifetime; call close()
    or use the instance as a context manager to release it.
    """

    def __init__(self, fpath, chunk_size=CHUNK_SIZE):
        """
        open fpath, creating it if it does not exist
        """
        super().__init__(fpath, chunk_size)
        # "ab+" opens for reading and writing without truncating and creates a missing file
        # (presumably so that a destination which does not exist yet can be used).
        # This one handle serves get_chunk, get_length and set_length instead of doing
        # lots of open/close.
        self.file = open(self.fpath, "ab+")
        self.file.seek(0)

    def close(self):
        """
        release the shared file handle. Closing an already closed handle is a no-op.
        """
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_hashes(self):
        """
        yield (chunk_number, chunk_hash) for each chunk_size piece of the file, in order.
        The last chunk may be shorter than chunk_size; an empty file yields nothing.
        """
        # hashing only reads, so plain "rb" is enough; a separate handle is used so the
        # position of self.file is left alone
        with open(self.fpath, "rb") as f:
            # iter() with a sentinel keeps calling read() until it returns b"" (EOF)
            for i, data in enumerate(iter(lambda: f.read(self.chunk_size), b"")):
                yield (i, hash_chunk(data))

    def get_chunk(self, chunk_number):
        """
        return the bytes of chunk chunk_number (at most chunk_size of them)

        raises Exception if the chunk would start at or after the end of the file
        """
        position = chunk_number * self.chunk_size
        # a chunk that starts exactly at EOF holds no data, so the bound is >= rather than >
        if position >= os.path.getsize(self.fpath):
            raise Exception("requested chunk {} is beyond EOF".format(chunk_number))
        self.file.seek(position)  # TODO not thread safe
        return self.file.read(self.chunk_size)

    def put_chunk(self, chunk_number, contents):
        """
        overwrite the file starting at chunk_number's position with contents.

        contents is passed directly to write() on a binary file, so it must be bytes
        (not a file-like object). Writing past the current end of the file extends it.
        """
        position = chunk_number * self.chunk_size
        # a fresh "rb+" handle is used because self.file is in append mode, where writes
        # go to the end of the file regardless of the seek position
        with open(self.fpath, "rb+") as f:
            f.seek(position)
            f.write(contents)

    def get_length(self):
        """
        get the file size. Leaves the shared handle positioned at the end of the file.
        """
        self.file.seek(0, os.SEEK_END)
        return self.file.tell()

    def set_length(self, length):
        """
        shrink the file to length if it is currently longer
        """
        if length < self.get_length():
            self.file.truncate(length)
        # do nothing for the case of extending the file
        # put_chunk handles it

    @staticmethod
    def from_uri(uri):
        """
        instantiate a client for the file named by the path component of the given uri
        """
        return FileChunkClient(uri.path)

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python3
from setuptools import setup
__version__ = "0.0.0"


# package metadata; the console_scripts entry installs a `blobsend` command that
# runs blobsend.cli:main
setup(
    name="blobsend",
    version=__version__,
    description="tool for updating sub-portions of large files",
    url="",
    author="dpedu",
    author_email="dave@davepedu.com",
    packages=["blobsend"],
    install_requires=[],
    entry_points={
        "console_scripts": [
            "blobsend = blobsend.cli:main",
        ],
    },
    zip_safe=False,
)