From f759f47616cfa0d625b4cd472247c695fd1f1238 Mon Sep 17 00:00:00 2001 From: dave Date: Sat, 4 May 2019 18:20:17 -0700 Subject: [PATCH] wheel ingestion --- repobot/aptprovider.py | 62 ++++++----- repobot/pypiprovider.py | 239 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 273 insertions(+), 28 deletions(-) diff --git a/repobot/aptprovider.py b/repobot/aptprovider.py index 77d07a6..6170015 100644 --- a/repobot/aptprovider.py +++ b/repobot/aptprovider.py @@ -1,21 +1,20 @@ import cherrypy -from pydpkg import Dpkg -from repobot.tables import Base, db, get_engine +import gnupg +import hashlib +import json +import os +import queue import sqlalchemy -from sqlalchemy import Column, ForeignKey -from sqlalchemy.types import String, Integer, Text, BOOLEAN +import traceback +from datetime import datetime +from pydpkg import Dpkg +from sqlalchemy import Column, ForeignKey, UniqueConstraint from sqlalchemy.dialects.mysql import LONGTEXT from sqlalchemy.orm import relationship -from sqlalchemy import UniqueConstraint +from sqlalchemy.types import String, Integer, Text, BOOLEAN from tempfile import TemporaryDirectory from threading import Thread -import hashlib -import os -import gnupg -from datetime import datetime -import traceback -import json -import queue +from repobot.tables import Base, db, get_engine class AptRepo(Base): @@ -273,40 +272,49 @@ Description: Generated by Repobot tmppkgpath = os.path.join(tdir, "temp.deb") with open(tmppkgpath, "wb") as fdest: fhashes = copyhash(fobj.file, fdest) + fsize = os.path.getsize(tmppkgpath) p = Dpkg(tmppkgpath) pkgname = "{}_{}_{}.deb".format(p.message['Package'], p.message['Version'], p.message['Architecture']) - yield "package name: {}\n".format(pkgname) - yield "package size: {}\n".format(os.path.getsize(tmppkgpath)) - yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message) - yield "package hashes: {}".format(fhashes) + + #TODO keys can be duplicated in email.message.Message, does this 
cause any problems? + fields = {key: p.message[key] for key in p.message.keys()} # repos//packages/f/foo.deb dpath = os.path.join(self.basepath, "repos", repo.name, "packages", pkgname[0], pkgname) - - with open(tmppkgpath, "rb") as f: - response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath) - if response["ResponseMetadata"]["HTTPStatusCode"] != 200: - print(response) - raise Exception("failed to store package") - - fields = {key: p.message[key] for key in p.message.keys()} + files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents") + if files: + print(f"will overwrite: {files}") pkg = AptPackage(repo=repo, dist=dist, name=p.message['Package'], version=p.message['Version'], arch=p.message['Architecture'], fname=pkgname, - size=os.path.getsize(tmppkgpath), + size=fsize, **fhashes, fields=json.dumps(fields)) - - dist.dirty = True db().add(pkg) db().commit() + try: + with open(tmppkgpath, "rb") as f: + response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath) + assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}" + except Exception: + db().delete(pkg) + db().commit() + raise + + dist.dirty = True + db().commit() self.regen_dist(dist.id) + yield "package name: {}\n".format(pkgname) + yield "package size: {}\n".format(fsize) + yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message) + yield "package hashes: {}".format(fhashes) + def regen_dist(self, dist_id): self.queue.put((dist_id, )) diff --git a/repobot/pypiprovider.py b/repobot/pypiprovider.py index dfd08ba..8ee79e3 100644 --- a/repobot/pypiprovider.py +++ b/repobot/pypiprovider.py @@ -1,5 +1,242 @@ +import cherrypy +import hashlib +import json +import os +import queue +import re +from email import message_from_string +from sqlalchemy import Column, ForeignKey, UniqueConstraint +from sqlalchemy.orm import relationship +from sqlalchemy.types import String, Integer, Text +from tempfile import 
TemporaryDirectory +from wheel import wheelfile +from repobot.tables import Base, db + + +def parse_wheel(path): + fsize = os.path.getsize(path) + + # open up wheel file (it's actually a zip) + p = wheelfile.WheelFile(path) + + # look for the files we care about in the '.dist-info' directory + metadata_file = None + metadata_wheel = None + for zipfile in p.filelist: + parts = os.path.split(zipfile.filename) + if len(parts) == 2 and parts[0].endswith(".dist-info"): + if parts[1] == "METADATA": + metadata_file = zipfile + elif parts[1] == "WHEEL": + metadata_wheel = zipfile + + assert(metadata_file), "METADATA file not found" + assert(metadata_wheel), "WHEEL file not found" + + metadata_data = message_from_string(p.read(metadata_file.filename).decode("UTF-8")) + wheel_data = message_from_string(p.read(metadata_wheel.filename).decode("UTF-8")) + + # get version and whatnot from the pkginfo. there will be multiple Tags with the same python and api, but + # there can be varying platforms. + python_versions = set() + python_apis = set() + python_platforms = set() + + for tag in wheel_data.get_all("Tag"): + python_version, python_api, python_platform = tag.split("-") # ['py3', 'none', 'any'] + python_versions.update([python_version]) + python_apis.update([python_api]) + python_platforms.update([python_platform]) + + assert(len(python_apis) == 1), "wheel metadata python api list has other than 1 unique entry" + + # generate final platforming string + python_version = '.'.join(sorted(list(python_versions), key=natural_keys)) + python_api = python_apis.pop() + python_platform = '.'.join(sorted(list(python_platforms), key=natural_keys)) + + buildtag = wheel_data["Build"] + name_parts = [metadata_data["Name"], metadata_data["Version"], python_version, python_api, python_platform] + if buildtag: + name_parts.insert(2, buildtag) + + assert(None not in name_parts), "Required metadata field missing" + + # construct filename, verify it matches what was submitted + fname_parts = 
name_parts[:] + fname_parts[0] = fname_parts[0].replace("-", "_") # replaces dashes in dist name with underscore + wheelname = "-".join(fname_parts) + ".whl" + + return {"fields": {"dist": name_parts[0], + "version": name_parts[1], + "build": buildtag, + "python": python_version, + "api": python_api, + "platform": python_platform}, + "wheel": wheel_data.items(), + "metadata": metadata_data.items(), + "description": metadata_data.get_payload(), + "wheelname": wheelname, + "size": fsize} + + +# https://stackoverflow.com/a/5967539 +def sort_atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + """ + Sort keeping keys in "natural" order such that version names embedded in strings are ordered correctly such as: + - macosx_10_6_intel + - macosx_10_9_intel + - macosx_10_9_x86_64 + - macosx_10_10_intel + - macosx_10_10_x86_64 + """ + return [sort_atoi(c) for c in re.split(r'(\d+)', text)] + + +class PipRepo(Base): + __tablename__ = 'piprepo' + id = Column(Integer, primary_key=True) + name = Column(String(length=32), unique=True, nullable=False) + + +class PipPackage(Base): + __tablename__ = 'pippkg' + id = Column(Integer, primary_key=True) + + repo_id = Column(Integer, ForeignKey("piprepo.id"), nullable=False) + repo = relationship("PipRepo") + + # see https://github.com/pypa/wheel/blob/master/wheel/wheelfile.py + # {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl + dist = Column(String(length=128), nullable=False) # 'requests' + version = Column(String(length=64), nullable=False) # '2.14.2' + build = Column(String(length=64), nullable=True) # '1234' + python = Column(String(length=64), nullable=False) # 'cp37' + api = Column(String(length=64), nullable=False) # 'cp37m' + platform = Column(String(length=256), nullable=False) # 'manylinux1_x86_64' + + fname = Column(String(length=256), nullable=False) + + size = Column(Integer, nullable=False) + sha256 = Column(String(length=64)) + + fields = 
Column(Text()) + + __table_args__ = (UniqueConstraint('fname', 'repo_id', name='pip_unique_repopkg'), ) + + @property + def blobpath(self): + return os.path.join("repos", self.repo.name, "wheels", self.fname[0], self.fname) + + +def get_repo(_db, repo_name, create_ok=True): + """ + Fetch a repo from the database by name + """ + repo = _db.query(PipRepo).filter(PipRepo.name == repo_name).first() + if not repo and create_ok: + repo = PipRepo(name=repo_name) + _db.add(repo) + _db.commit() + return repo + + +def copysha256(fin, fout): + """ + Copy a file and calculate sha256 while doing so + """ + h = hashlib.sha256() + + while True: + data = fin.read(4096) + if not data: + break + h.update(data) + fout.write(data) + + return h.hexdigest() + class PypiProvider(object): - def __init__(self, dbcon, s3client): + def __init__(self, dbcon, s3client, bucket="aptprovider"): self.db = dbcon self.s3 = s3client + self.bucket = bucket + """base path within the s3 bucket""" + self.basepath = "data/provider/pip" + """queue entries are tuples containing the database id of the dist to regenerate indexes and signatures for""" + self.queue = queue.Queue() + + cherrypy.tree.mount(PipWeb(self), "/repo/pypi", {'/': {'tools.trailing_slash.on': False, + 'tools.db.on': True}}) + + # ensure bucket exists + #TODO bucket creation should happen in server.py + if bucket not in [b['Name'] for b in self.s3.list_buckets()['Buckets']]: + print("Creating bucket") + self.s3.create_bucket(Bucket=bucket) + + def web_addpkg(self, reponame, name, version, fobj): + repo = get_repo(db(), reponame) + + # write wheel to temp storage + with TemporaryDirectory() as tdir: + tmppkgpath = os.path.join(tdir, os.path.basename(fobj.filename)) # basename strips path components like ../../passwd + with open(tmppkgpath, "wb") as fdest: + shasum = copysha256(fobj.file, fdest) + + metadata = parse_wheel(tmppkgpath) + assert(version == metadata["fields"]["version"]), "wheel metadata version doesn't match supplied 
version" + assert(fobj.filename == metadata["wheelname"]), f"file name is invalid, wanted '{metadata['wheelname']}'" + + # s3 path - repos//wheels/f/foo.wheel + dpath = os.path.join(self.basepath, "repos", repo.name, "wheels", + metadata["wheelname"][0], metadata["wheelname"]) + + files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents") + if files: + print(f"will overwrite: {files}") + + # add to db + pkg = PipPackage(repo=repo, + dist=metadata["fields"]["dist"], + version=metadata["fields"]["version"], + build=metadata["fields"]["build"], + python=metadata["fields"]["python"], + api=metadata["fields"]["api"], + platform=metadata["fields"]["platform"], + fname=metadata["wheelname"], + size=metadata["size"], + sha256=shasum, + fields=json.dumps(metadata)) + db().add(pkg) + db().commit() + + try: + with open(tmppkgpath, "rb") as f: + response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath) + assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}" + except Exception: + db().delete(pkg) + db().commit() + raise + + yield json.dumps(metadata, indent=4) + + +@cherrypy.popargs("reponame") +class PipWeb(object): + def __init__(self, base): + self.base = base + # self.dists = AptDists(base) + # self.packages = AptFiles(base) + + @cherrypy.expose + def index(self, reponame=None): + yield "viewing repo {}".format(reponame)