import cherrypy
import hashlib
import json
import os
import queue
import re
from email import message_from_string
from sqlalchemy import Column, ForeignKey, UniqueConstraint
from sqlalchemy.orm import relationship
from sqlalchemy.types import String, Integer, Text
from tempfile import TemporaryDirectory
from wheel import wheelfile

from repobot.tables import Base, db


def _require(condition, message):
    """
    Raise AssertionError(message) unless condition is truthy.

    Used in place of a bare ``assert`` so that validation of uploaded data is not stripped when Python runs with
    ``-O``. The exception type is the same one ``assert`` raises.
    """
    if not condition:
        raise AssertionError(message)


def parse_wheel(path):
    """
    Read the METADATA and WHEEL files out of the wheel at `path` and derive its canonical file name.

    :param path: filesystem path of the wheel file
    :return: dict with the keys "fields" (dist, version, build, python, api, platform), "wheel" and "metadata"
             (header items of the respective files), "description" (payload of METADATA), "wheelname" (the file
             name computed from the metadata) and "size" (file size in bytes)
    :raises AssertionError: if METADATA or WHEEL is missing, the tags do not name exactly one api, or a required
                            metadata field is absent
    """
    fsize = os.path.getsize(path)

    metadata_name = None
    wheel_name = None

    # open up wheel file (it's actually a zip); the context manager closes it again once we've read what we need
    with wheelfile.WheelFile(path) as whl:
        # look for the files we care about in the '.dist-info' directory
        for info in whl.filelist:
            dirname, basename = os.path.split(info.filename)
            if not dirname.endswith(".dist-info"):
                continue
            if basename == "METADATA":
                metadata_name = info.filename
            elif basename == "WHEEL":
                wheel_name = info.filename

        _require(metadata_name is not None, "METADATA file not found")
        _require(wheel_name is not None, "WHEEL file not found")

        metadata_data = message_from_string(whl.read(metadata_name).decode("UTF-8"))
        wheel_data = message_from_string(whl.read(wheel_name).decode("UTF-8"))

    # get version and whatnot from the pkginfo. there will be multiple Tags with the same python and api, but
    # there can be varying platforms.
    python_versions = set()
    python_apis = set()
    python_platforms = set()

    # get_all() returns None when the header is absent; default to an empty list so a wheel without any Tag is
    # reported by the api check below instead of crashing with a TypeError
    for tag in wheel_data.get_all("Tag", []):
        python_version, python_api, python_platform = tag.split("-")  # ['py3', 'none', 'any']
        python_versions.add(python_version)
        python_apis.add(python_api)
        python_platforms.add(python_platform)

    _require(len(python_apis) == 1, "wheel metadata python api list has other than 1 unique entry")

    # generate final platforming string
    python_version = '.'.join(sorted(python_versions, key=natural_keys))
    python_api = python_apis.pop()
    python_platform = '.'.join(sorted(python_platforms, key=natural_keys))

    buildtag = wheel_data["Build"]  # None when the WHEEL file has no Build header

    name_parts = [metadata_data["Name"], metadata_data["Version"], python_version, python_api, python_platform]
    if buildtag:
        name_parts.insert(2, buildtag)

    _require(None not in name_parts, "Required metadata field missing")

    # construct the expected filename; the caller compares it with the name that was submitted
    fname_parts = name_parts[:]
    fname_parts[0] = fname_parts[0].replace("-", "_")  # replaces dashes in dist name with underscore
    wheelname = "-".join(fname_parts) + ".whl"

    return {"fields": {"dist": name_parts[0],
                       "version": name_parts[1],
                       "build": buildtag,
                       "python": python_version,
                       "api": python_api,
                       "platform": python_platform},
            "wheel": wheel_data.items(),
            "metadata": metadata_data.items(),
            "description": metadata_data.get_payload(),
            "wheelname": wheelname,
            "size": fsize}


# https://stackoverflow.com/a/5967539
def sort_atoi(text):
    """
    Return `text` as an int when it consists only of digits, otherwise return it unchanged.
    """
    return int(text) if text.isdigit() else text


def natural_keys(text):
    """
    Sort keeping keys in "natural" order such that version names embedded in strings are ordered correctly such as:
    - macosx_10_6_intel
    - macosx_10_9_intel
    - macosx_10_9_x86_64
    - macosx_10_10_intel
    - macosx_10_10_x86_64
    """
    return [sort_atoi(c) for c in re.split(r'(\d+)', text)]


class PipRepo(Base):
    """
    A named pip repository.
    """
    __tablename__ = 'piprepo'
    id = Column(Integer, primary_key=True)
    name = Column(String(length=32), unique=True, nullable=False)


class PipPackage(Base):
    """
    A single wheel file belonging to a PipRepo. The pair (fname, repo_id) is unique.
    """
    __tablename__ = 'pippkg'
    id = Column(Integer, primary_key=True)

    repo_id = Column(Integer, ForeignKey("piprepo.id"), nullable=False)
    repo = relationship("PipRepo")

    # see https://github.com/pypa/wheel/blob/master/wheel/wheelfile.py
    # {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl
    dist = Column(String(length=128), nullable=False)       # 'requests'
    version = Column(String(length=64), nullable=False)     # '2.14.2'
    build = Column(String(length=64), nullable=True)        # '1234'
    python = Column(String(length=64), nullable=False)      # 'cp37'
    api = Column(String(length=64), nullable=False)         # 'cp37m'
    platform = Column(String(length=256), nullable=False)   # 'manylinux1_x86_64'

    fname = Column(String(length=256), nullable=False)

    size = Column(Integer, nullable=False)
    sha256 = Column(String(length=64))

    fields = Column(Text())

    __table_args__ = (UniqueConstraint('fname', 'repo_id', name='pip_unique_repopkg'), )

    @property
    def blobpath(self):
        """
        Path of the wheel blob relative to the provider's base path.

        This model has no `name` column, so the path is built from `fname`, using the same
        repos/<repo>/wheels/<first char of file name>/<file name> layout that PypiProvider.web_addpkg uploads to.
        """
        return os.path.join("repos", self.repo.name, "wheels", self.fname[0], self.fname)


def get_repo(_db, repo_name, create_ok=True):
    """
    Fetch a repo from the database by name

    :param _db: database session to query (and, when creating, to commit on)
    :param repo_name: name of the repo to look up
    :param create_ok: when true, a missing repo is created and committed
    :return: the PipRepo, or None if it does not exist and create_ok is false
    """
    repo = _db.query(PipRepo).filter(PipRepo.name == repo_name).first()
    if not repo and create_ok:
        repo = PipRepo(name=repo_name)
        _db.add(repo)
        _db.commit()
    return repo


def copysha256(fin, fout):
    """
    Copy a file and calculate sha256 while doing so

    :param fin: binary file object to read from, consumed in 4096 byte chunks until exhausted
    :param fout: binary file object to write to
    :return: hex digest of the copied data
    """
    h = hashlib.sha256()
    while True:
        data = fin.read(4096)
        if not data:
            break
        h.update(data)
        fout.write(data)
    return h.hexdigest()


class PypiProvider(object):
    """
    Stores uploaded wheels in an S3 bucket and records them in the database.
    """

    def __init__(self, dbcon, s3client, bucket="aptprovider"):
        """
        :param dbcon: database connection, stored on the instance as `db`
        :param s3client: S3 client used for bucket and object operations
        :param bucket: name of the S3 bucket to store wheels in
        """
        self.db = dbcon
        self.s3 = s3client
        self.bucket = bucket
        # base path within the s3 bucket
        self.basepath = "data/provider/pip"
        # queue entries are tuples containing the database id of the dist to regenerate indexes and signatures for
        self.queue = queue.Queue()

        # NOTE(review): this early return makes everything below it in this method unreachable, so the web app is
        # never mounted and the bucket is never checked or created here. It is kept as found because removing it
        # would start mounting the app and calling S3 from the constructor - confirm whether it is deliberate.
        return

        cherrypy.tree.mount(PipWeb(self), "/repo/pop", {'/': {'tools.trailing_slash.on': False,
                                                              'tools.db.on': True}})

        # ensure bucket exists
        #TODO bucket creation should happen in server.py
        if bucket not in [b['Name'] for b in self.s3.list_buckets()['Buckets']]:
            print("Creating bucket")
            self.s3.create_bucket(Bucket=bucket)

    def web_addpkg(self, reponame, name, version, fobj):
        """
        Add an uploaded wheel to a repo: validate it, record it in the database and upload it to S3.

        This is a generator; nothing happens until it is iterated. It yields the parsed wheel metadata as a JSON
        string.

        :param reponame: name of the repo to add the wheel to; the repo is created if it does not exist
        :param name: currently unused by this method
        :param version: version the uploader claims; must equal the version in the wheel's metadata
        :param fobj: uploaded file; its `filename` and `file` attributes are read
        :raises AssertionError: if the file name contains path components, the version or file name does not match
                                the wheel's metadata, or the S3 upload does not return HTTP 200
        """
        repo = get_repo(db(), reponame)

        # The file name comes from the client. Reject anything that is not a bare file name before it is used to
        # build a path, so a name such as ../../passwd cannot be written outside of the temp dir.
        filename = fobj.filename
        _require(filename and filename not in (".", "..") and "/" not in filename and "\\" not in filename,
                 "file name is invalid, it must not contain path components")

        # write wheel to temp storage
        with TemporaryDirectory() as tdir:
            tmppkgpath = os.path.join(tdir, filename)
            with open(tmppkgpath, "wb") as fdest:
                shasum = copysha256(fobj.file, fdest)

            metadata = parse_wheel(tmppkgpath)
            _require(version == metadata["fields"]["version"],
                     "wheel metadata version doesn't match supplied version")
            _require(filename == metadata["wheelname"],
                     f"file name is invalid, wanted '{metadata['wheelname']}'")

            # s3 path - repos/<reponame>/wheels/f/foo.wheel
            dpath = os.path.join(self.basepath, "repos", repo.name, "wheels",
                                 metadata["wheelname"][0], metadata["wheelname"])

            files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
            if files:
                print(f"will overwrite: {files}")

            # add to db
            pkg = PipPackage(repo=repo,
                             dist=metadata["fields"]["dist"],
                             version=metadata["fields"]["version"],
                             build=metadata["fields"]["build"],
                             python=metadata["fields"]["python"],
                             api=metadata["fields"]["api"],
                             platform=metadata["fields"]["platform"],
                             fname=metadata["wheelname"],
                             size=metadata["size"],
                             sha256=shasum,
                             fields=json.dumps(metadata))

            db().add(pkg)
            db().commit()

            try:
                with open(tmppkgpath, "rb") as f:
                    response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
                    _require(response["ResponseMetadata"]["HTTPStatusCode"] == 200, f"Upload failed: {response}")
            except Exception:
                # the upload failed, so remove the database row again before passing the error on
                db().delete(pkg)
                db().commit()
                raise

        yield json.dumps(metadata, indent=4)


@cherrypy.popargs("reponame")
class PipWeb(object):
    """
    CherryPy handler for the pip repo pages; the first path segment is passed to handlers as `reponame`.
    """

    def __init__(self, base):
        """
        :param base: object stored on the handler as `base` (PypiProvider passes itself)
        """
        self.base = base

    @cherrypy.expose
    def index(self, reponame=None):
        """
        Yield a short text naming the repo being viewed.
        """
        yield "viewing repo {}".format(reponame)