Browse Source

wheel ingestion

master
dave 2 years ago
parent
commit
f759f47616
  1. 62
      repobot/aptprovider.py
  2. 239
      repobot/pypiprovider.py

62
repobot/aptprovider.py

@ -1,21 +1,20 @@
import cherrypy
from pydpkg import Dpkg
from repobot.tables import Base, db, get_engine
import gnupg
import hashlib
import json
import os
import queue
import sqlalchemy
from sqlalchemy import Column, ForeignKey
from sqlalchemy.types import String, Integer, Text, BOOLEAN
import traceback
from datetime import datetime
from pydpkg import Dpkg
from sqlalchemy import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.mysql import LONGTEXT
from sqlalchemy.orm import relationship
from sqlalchemy import UniqueConstraint
from sqlalchemy.types import String, Integer, Text, BOOLEAN
from tempfile import TemporaryDirectory
from threading import Thread
import hashlib
import os
import gnupg
from datetime import datetime
import traceback
import json
import queue
from repobot.tables import Base, db, get_engine
class AptRepo(Base):
@ -273,40 +272,49 @@ Description: Generated by Repobot
tmppkgpath = os.path.join(tdir, "temp.deb")
with open(tmppkgpath, "wb") as fdest:
fhashes = copyhash(fobj.file, fdest)
fsize = os.path.getsize(tmppkgpath)
p = Dpkg(tmppkgpath)
pkgname = "{}_{}_{}.deb".format(p.message['Package'], p.message['Version'], p.message['Architecture'])
yield "package name: {}\n".format(pkgname)
yield "package size: {}\n".format(os.path.getsize(tmppkgpath))
yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message)
yield "package hashes: {}".format(fhashes)
#TODO keys can be duplicated in email.message.Message, does this cause any problems?
fields = {key: p.message[key] for key in p.message.keys()}
# repos/<reponame>/packages/f/foo.deb
dpath = os.path.join(self.basepath, "repos", repo.name, "packages", pkgname[0], pkgname)
with open(tmppkgpath, "rb") as f:
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
print(response)
raise Exception("failed to store package")
fields = {key: p.message[key] for key in p.message.keys()}
files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
if files:
print(f"will overwrite: {files}")
pkg = AptPackage(repo=repo, dist=dist,
name=p.message['Package'],
version=p.message['Version'],
arch=p.message['Architecture'],
fname=pkgname,
size=os.path.getsize(tmppkgpath),
size=fsize,
**fhashes,
fields=json.dumps(fields))
dist.dirty = True
db().add(pkg)
db().commit()
try:
with open(tmppkgpath, "rb") as f:
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}"
except Exception:
db().delete(pkg)
db().commit()
raise
dist.dirty = True
db().commit()
self.regen_dist(dist.id)
yield "package name: {}\n".format(pkgname)
yield "package size: {}\n".format(fsize)
yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message)
yield "package hashes: {}".format(fhashes)
def regen_dist(self, dist_id):
self.queue.put((dist_id, ))

239
repobot/pypiprovider.py

@ -1,5 +1,242 @@
import cherrypy
import hashlib
import json
import os
import queue
import re
from email import message_from_string
from sqlalchemy import Column, ForeignKey, UniqueConstraint
from sqlalchemy.orm import relationship
from sqlalchemy.types import String, Integer, Text
from tempfile import TemporaryDirectory
from wheel import wheelfile
from repobot.tables import Base, db
def parse_wheel(path):
fsize = os.path.getsize(path)
# open up wheel file (it's actually a zip)
p = wheelfile.WheelFile(path)
# look for the files we care about in the '<wheelname>.dist-info' directory
metadata_file = None
metadata_wheel = None
for zipfile in p.filelist:
parts = os.path.split(zipfile.filename)
if len(parts) == 2 and parts[0].endswith(".dist-info"):
if parts[1] == "METADATA":
metadata_file = zipfile
elif parts[1] == "WHEEL":
metadata_wheel = zipfile
assert(metadata_file), "METADATA file not found"
assert(metadata_wheel), "WHEEL file not found"
metadata_data = message_from_string(p.read(metadata_file.filename).decode("UTF-8"))
wheel_data = message_from_string(p.read(metadata_wheel.filename).decode("UTF-8"))
# get version and whatnot from the pkginfo. there will be multiple Tags with the same python and api, but
# there can be varying platforms.
python_versions = set()
python_apis = set()
python_platforms = set()
for tag in wheel_data.get_all("Tag"):
python_version, python_api, python_platform = tag.split("-") # ['py3', 'none', 'any']
python_versions.update([python_version])
python_apis.update([python_api])
python_platforms.update([python_platform])
assert(len(python_apis) == 1), "wheel metadata python api list has other than 1 unique entry"
# generate final platforming string
python_version = '.'.join(sorted(list(python_versions), key=natural_keys))
python_api = python_apis.pop()
python_platform = '.'.join(sorted(list(python_platforms), key=natural_keys))
buildtag = wheel_data["Build"]
name_parts = [metadata_data["Name"], metadata_data["Version"], python_version, python_api, python_platform]
if buildtag:
name_parts.insert(2, buildtag)
assert(None not in name_parts), "Required metadata field missing"
# construct filename, verify it matches what was submitted
fname_parts = name_parts[:]
fname_parts[0] = fname_parts[0].replace("-", "_") # replaces dashes in dist name with underscore
wheelname = "-".join(fname_parts) + ".whl"
return {"fields": {"dist": name_parts[0],
"version": name_parts[1],
"build": buildtag,
"python": python_version,
"api": python_api,
"platform": python_platform},
"wheel": wheel_data.items(),
"metadata": metadata_data.items(),
"description": metadata_data.get_payload(),
"wheelname": wheelname,
"size": fsize}
# https://stackoverflow.com/a/5967539
def sort_atoi(text):
return int(text) if text.isdigit() else text
def natural_keys(text):
"""
Sort keeping keys in "natural" order such that version names embedded in strings are ordered correctly such as:
- macosx_10_6_intel
- macosx_10_9_intel
- macosx_10_9_x86_64
- macosx_10_10_intel
- macosx_10_10_x86_64
"""
return [sort_atoi(c) for c in re.split(r'(\d+)', text)]
class PipRepo(Base):
__tablename__ = 'piprepo'
id = Column(Integer, primary_key=True)
name = Column(String(length=32), unique=True, nullable=False)
class PipPackage(Base):
__tablename__ = 'pippkg'
id = Column(Integer, primary_key=True)
repo_id = Column(Integer, ForeignKey("piprepo.id"), nullable=False)
repo = relationship("PipRepo")
# see https://github.com/pypa/wheel/blob/master/wheel/wheelfile.py
# {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl
dist = Column(String(length=128), nullable=False) # 'requests'
version = Column(String(length=64), nullable=False) # '2.14.2'
build = Column(String(length=64), nullable=True) # '1234'
python = Column(String(length=64), nullable=False) # 'cp37'
api = Column(String(length=64), nullable=False) # 'cp37m'
platform = Column(String(length=256), nullable=False) # 'manylinux1_x86_64'
fname = Column(String(length=256), nullable=False)
size = Column(Integer, nullable=False)
sha256 = Column(String(length=64))
fields = Column(Text())
__table_args__ = (UniqueConstraint('fname', 'repo_id', name='pip_unique_repopkg'), )
@property
def blobpath(self):
return os.path.join("repos", self.repo.name, "packages", self.name[0], self.fname)
def get_repo(_db, repo_name, create_ok=True):
"""
Fetch a repo from the database by name
"""
repo = _db.query(PipRepo).filter(PipRepo.name == repo_name).first()
if not repo and create_ok:
repo = PipRepo(name=repo_name)
_db.add(repo)
_db.commit()
return repo
def copysha256(fin, fout):
"""
Copy a file and calculate sha256 while doing so
"""
h = hashlib.sha256()
while True:
data = fin.read(4096)
if not data:
break
h.update(data)
fout.write(data)
return h.hexdigest()
class PypiProvider(object):
def __init__(self, dbcon, s3client):
def __init__(self, dbcon, s3client, bucket="aptprovider"):
self.db = dbcon
self.s3 = s3client
self.bucket = bucket
"""base path within the s3 bucket"""
self.basepath = "data/provider/pip"
"""queue entries are tuples containing the database id of the dist to regenerate indexes and signatures for"""
self.queue = queue.Queue()
return
cherrypy.tree.mount(PipWeb(self), "/repo/pop", {'/': {'tools.trailing_slash.on': False,
'tools.db.on': True}})
# ensure bucket exists
#TODO bucket creation should happen in server.py
if bucket not in [b['Name'] for b in self.s3.list_buckets()['Buckets']]:
print("Creating bucket")
self.s3.create_bucket(Bucket=bucket)
def web_addpkg(self, reponame, name, version, fobj):
repo = get_repo(db(), reponame)
# write wheel to temp storage
with TemporaryDirectory() as tdir:
tmppkgpath = os.path.join(tdir, fobj.filename) #TODO verify filename doesnt have any nonsense like ../../passwd
with open(tmppkgpath, "wb") as fdest:
shasum = copysha256(fobj.file, fdest)
metadata = parse_wheel(tmppkgpath)
assert(version == metadata["fields"]["version"]), "wheel metadata version doesn't match supplied version"
assert(fobj.filename == metadata["wheelname"]), f"file name is invalid, wanted '{metadata['wheelname']}'"
# s3 path - repos/<reponame>/wheels/f/foo.wheel
dpath = os.path.join(self.basepath, "repos", repo.name, "wheels",
metadata["wheelname"][0], metadata["wheelname"])
files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
if files:
print(f"will overwrite: {files}")
# add to db
pkg = PipPackage(repo=repo,
dist=metadata["fields"]["dist"],
version=metadata["fields"]["version"],
build=metadata["fields"]["build"],
python=metadata["fields"]["python"],
api=metadata["fields"]["api"],
platform=metadata["fields"]["platform"],
fname=metadata["wheelname"],
size=metadata["size"],
sha256=shasum,
fields=json.dumps(metadata))
db().add(pkg)
db().commit()
try:
with open(tmppkgpath, "rb") as f:
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}"
except Exception:
db().delete(pkg)
db().commit()
raise
yield json.dumps(metadata, indent=4)
@cherrypy.popargs("reponame")
class PipWeb(object):
def __init__(self, base):
self.base = base
# self.dists = AptDists(base)
# self.packages = AptFiles(base)
@cherrypy.expose
def index(self, reponame=None):
yield "viewing repo {}".format(reponame)
Loading…
Cancel
Save