wheel ingestion
This commit is contained in:
parent
4f5966b415
commit
f759f47616
|
@ -1,21 +1,20 @@
|
|||
import cherrypy
|
||||
from pydpkg import Dpkg
|
||||
from repobot.tables import Base, db, get_engine
|
||||
import gnupg
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import queue
|
||||
import sqlalchemy
|
||||
from sqlalchemy import Column, ForeignKey
|
||||
from sqlalchemy.types import String, Integer, Text, BOOLEAN
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from pydpkg import Dpkg
|
||||
from sqlalchemy import Column, ForeignKey, UniqueConstraint
|
||||
from sqlalchemy.dialects.mysql import LONGTEXT
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy import UniqueConstraint
|
||||
from sqlalchemy.types import String, Integer, Text, BOOLEAN
|
||||
from tempfile import TemporaryDirectory
|
||||
from threading import Thread
|
||||
import hashlib
|
||||
import os
|
||||
import gnupg
|
||||
from datetime import datetime
|
||||
import traceback
|
||||
import json
|
||||
import queue
|
||||
from repobot.tables import Base, db, get_engine
|
||||
|
||||
|
||||
class AptRepo(Base):
|
||||
|
@ -273,40 +272,49 @@ Description: Generated by Repobot
|
|||
tmppkgpath = os.path.join(tdir, "temp.deb")
|
||||
with open(tmppkgpath, "wb") as fdest:
|
||||
fhashes = copyhash(fobj.file, fdest)
|
||||
fsize = os.path.getsize(tmppkgpath)
|
||||
|
||||
p = Dpkg(tmppkgpath)
|
||||
pkgname = "{}_{}_{}.deb".format(p.message['Package'], p.message['Version'], p.message['Architecture'])
|
||||
yield "package name: {}\n".format(pkgname)
|
||||
yield "package size: {}\n".format(os.path.getsize(tmppkgpath))
|
||||
yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message)
|
||||
yield "package hashes: {}".format(fhashes)
|
||||
|
||||
#TODO keys can be duplicated in email.message.Message, does this cause any problems?
|
||||
fields = {key: p.message[key] for key in p.message.keys()}
|
||||
|
||||
# repos/<reponame>/packages/f/foo.deb
|
||||
dpath = os.path.join(self.basepath, "repos", repo.name, "packages", pkgname[0], pkgname)
|
||||
|
||||
with open(tmppkgpath, "rb") as f:
|
||||
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
|
||||
if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
|
||||
print(response)
|
||||
raise Exception("failed to store package")
|
||||
|
||||
fields = {key: p.message[key] for key in p.message.keys()}
|
||||
files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
|
||||
if files:
|
||||
print(f"will overwrite: {files}")
|
||||
|
||||
pkg = AptPackage(repo=repo, dist=dist,
|
||||
name=p.message['Package'],
|
||||
version=p.message['Version'],
|
||||
arch=p.message['Architecture'],
|
||||
fname=pkgname,
|
||||
size=os.path.getsize(tmppkgpath),
|
||||
size=fsize,
|
||||
**fhashes,
|
||||
fields=json.dumps(fields))
|
||||
|
||||
dist.dirty = True
|
||||
db().add(pkg)
|
||||
db().commit()
|
||||
|
||||
try:
|
||||
with open(tmppkgpath, "rb") as f:
|
||||
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
|
||||
assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}"
|
||||
except Exception:
|
||||
db().delete(pkg)
|
||||
db().commit()
|
||||
raise
|
||||
|
||||
dist.dirty = True
|
||||
db().commit()
|
||||
self.regen_dist(dist.id)
|
||||
|
||||
yield "package name: {}\n".format(pkgname)
|
||||
yield "package size: {}\n".format(fsize)
|
||||
yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message)
|
||||
yield "package hashes: {}".format(fhashes)
|
||||
|
||||
def regen_dist(self, dist_id):
|
||||
self.queue.put((dist_id, ))
|
||||
|
||||
|
|
|
@ -1,5 +1,242 @@
|
|||
import cherrypy
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
from email import message_from_string
|
||||
from sqlalchemy import Column, ForeignKey, UniqueConstraint
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.types import String, Integer, Text
|
||||
from tempfile import TemporaryDirectory
|
||||
from wheel import wheelfile
|
||||
from repobot.tables import Base, db
|
||||
|
||||
|
||||
def parse_wheel(path):
|
||||
fsize = os.path.getsize(path)
|
||||
|
||||
# open up wheel file (it's actually a zip)
|
||||
p = wheelfile.WheelFile(path)
|
||||
|
||||
# look for the files we care about in the '<wheelname>.dist-info' directory
|
||||
metadata_file = None
|
||||
metadata_wheel = None
|
||||
for zipfile in p.filelist:
|
||||
parts = os.path.split(zipfile.filename)
|
||||
if len(parts) == 2 and parts[0].endswith(".dist-info"):
|
||||
if parts[1] == "METADATA":
|
||||
metadata_file = zipfile
|
||||
elif parts[1] == "WHEEL":
|
||||
metadata_wheel = zipfile
|
||||
|
||||
assert(metadata_file), "METADATA file not found"
|
||||
assert(metadata_wheel), "WHEEL file not found"
|
||||
|
||||
metadata_data = message_from_string(p.read(metadata_file.filename).decode("UTF-8"))
|
||||
wheel_data = message_from_string(p.read(metadata_wheel.filename).decode("UTF-8"))
|
||||
|
||||
# get version and whatnot from the pkginfo. there will be multiple Tags with the same python and api, but
|
||||
# there can be varying platforms.
|
||||
python_versions = set()
|
||||
python_apis = set()
|
||||
python_platforms = set()
|
||||
|
||||
for tag in wheel_data.get_all("Tag"):
|
||||
python_version, python_api, python_platform = tag.split("-") # ['py3', 'none', 'any']
|
||||
python_versions.update([python_version])
|
||||
python_apis.update([python_api])
|
||||
python_platforms.update([python_platform])
|
||||
|
||||
assert(len(python_apis) == 1), "wheel metadata python api list has other than 1 unique entry"
|
||||
|
||||
# generate final platforming string
|
||||
python_version = '.'.join(sorted(list(python_versions), key=natural_keys))
|
||||
python_api = python_apis.pop()
|
||||
python_platform = '.'.join(sorted(list(python_platforms), key=natural_keys))
|
||||
|
||||
buildtag = wheel_data["Build"]
|
||||
name_parts = [metadata_data["Name"], metadata_data["Version"], python_version, python_api, python_platform]
|
||||
if buildtag:
|
||||
name_parts.insert(2, buildtag)
|
||||
|
||||
assert(None not in name_parts), "Required metadata field missing"
|
||||
|
||||
# construct filename, verify it matches what was submitted
|
||||
fname_parts = name_parts[:]
|
||||
fname_parts[0] = fname_parts[0].replace("-", "_") # replaces dashes in dist name with underscore
|
||||
wheelname = "-".join(fname_parts) + ".whl"
|
||||
|
||||
return {"fields": {"dist": name_parts[0],
|
||||
"version": name_parts[1],
|
||||
"build": buildtag,
|
||||
"python": python_version,
|
||||
"api": python_api,
|
||||
"platform": python_platform},
|
||||
"wheel": wheel_data.items(),
|
||||
"metadata": metadata_data.items(),
|
||||
"description": metadata_data.get_payload(),
|
||||
"wheelname": wheelname,
|
||||
"size": fsize}
|
||||
|
||||
|
||||
# https://stackoverflow.com/a/5967539
|
||||
def sort_atoi(text):
|
||||
return int(text) if text.isdigit() else text
|
||||
|
||||
|
||||
def natural_keys(text):
|
||||
"""
|
||||
Sort keeping keys in "natural" order such that version names embedded in strings are ordered correctly such as:
|
||||
- macosx_10_6_intel
|
||||
- macosx_10_9_intel
|
||||
- macosx_10_9_x86_64
|
||||
- macosx_10_10_intel
|
||||
- macosx_10_10_x86_64
|
||||
"""
|
||||
return [sort_atoi(c) for c in re.split(r'(\d+)', text)]
|
||||
|
||||
|
||||
class PipRepo(Base):
|
||||
__tablename__ = 'piprepo'
|
||||
id = Column(Integer, primary_key=True)
|
||||
name = Column(String(length=32), unique=True, nullable=False)
|
||||
|
||||
|
||||
class PipPackage(Base):
|
||||
__tablename__ = 'pippkg'
|
||||
id = Column(Integer, primary_key=True)
|
||||
|
||||
repo_id = Column(Integer, ForeignKey("piprepo.id"), nullable=False)
|
||||
repo = relationship("PipRepo")
|
||||
|
||||
# see https://github.com/pypa/wheel/blob/master/wheel/wheelfile.py
|
||||
# {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl
|
||||
dist = Column(String(length=128), nullable=False) # 'requests'
|
||||
version = Column(String(length=64), nullable=False) # '2.14.2'
|
||||
build = Column(String(length=64), nullable=True) # '1234'
|
||||
python = Column(String(length=64), nullable=False) # 'cp37'
|
||||
api = Column(String(length=64), nullable=False) # 'cp37m'
|
||||
platform = Column(String(length=256), nullable=False) # 'manylinux1_x86_64'
|
||||
|
||||
fname = Column(String(length=256), nullable=False)
|
||||
|
||||
size = Column(Integer, nullable=False)
|
||||
sha256 = Column(String(length=64))
|
||||
|
||||
fields = Column(Text())
|
||||
|
||||
__table_args__ = (UniqueConstraint('fname', 'repo_id', name='pip_unique_repopkg'), )
|
||||
|
||||
@property
|
||||
def blobpath(self):
|
||||
return os.path.join("repos", self.repo.name, "packages", self.name[0], self.fname)
|
||||
|
||||
|
||||
def get_repo(_db, repo_name, create_ok=True):
|
||||
"""
|
||||
Fetch a repo from the database by name
|
||||
"""
|
||||
repo = _db.query(PipRepo).filter(PipRepo.name == repo_name).first()
|
||||
if not repo and create_ok:
|
||||
repo = PipRepo(name=repo_name)
|
||||
_db.add(repo)
|
||||
_db.commit()
|
||||
return repo
|
||||
|
||||
|
||||
def copysha256(fin, fout):
|
||||
"""
|
||||
Copy a file and calculate sha256 while doing so
|
||||
"""
|
||||
h = hashlib.sha256()
|
||||
|
||||
while True:
|
||||
data = fin.read(4096)
|
||||
if not data:
|
||||
break
|
||||
h.update(data)
|
||||
fout.write(data)
|
||||
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
class PypiProvider(object):
|
||||
def __init__(self, dbcon, s3client):
|
||||
def __init__(self, dbcon, s3client, bucket="aptprovider"):
|
||||
self.db = dbcon
|
||||
self.s3 = s3client
|
||||
self.bucket = bucket
|
||||
"""base path within the s3 bucket"""
|
||||
self.basepath = "data/provider/pip"
|
||||
"""queue entries are tuples containing the database id of the dist to regenerate indexes and signatures for"""
|
||||
self.queue = queue.Queue()
|
||||
|
||||
return
|
||||
|
||||
cherrypy.tree.mount(PipWeb(self), "/repo/pop", {'/': {'tools.trailing_slash.on': False,
|
||||
'tools.db.on': True}})
|
||||
|
||||
# ensure bucket exists
|
||||
#TODO bucket creation should happen in server.py
|
||||
if bucket not in [b['Name'] for b in self.s3.list_buckets()['Buckets']]:
|
||||
print("Creating bucket")
|
||||
self.s3.create_bucket(Bucket=bucket)
|
||||
|
||||
def web_addpkg(self, reponame, name, version, fobj):
|
||||
repo = get_repo(db(), reponame)
|
||||
|
||||
# write wheel to temp storage
|
||||
with TemporaryDirectory() as tdir:
|
||||
tmppkgpath = os.path.join(tdir, fobj.filename) #TODO verify filename doesnt have any nonsense like ../../passwd
|
||||
with open(tmppkgpath, "wb") as fdest:
|
||||
shasum = copysha256(fobj.file, fdest)
|
||||
|
||||
metadata = parse_wheel(tmppkgpath)
|
||||
assert(version == metadata["fields"]["version"]), "wheel metadata version doesn't match supplied version"
|
||||
assert(fobj.filename == metadata["wheelname"]), f"file name is invalid, wanted '{metadata['wheelname']}'"
|
||||
|
||||
# s3 path - repos/<reponame>/wheels/f/foo.wheel
|
||||
dpath = os.path.join(self.basepath, "repos", repo.name, "wheels",
|
||||
metadata["wheelname"][0], metadata["wheelname"])
|
||||
|
||||
files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
|
||||
if files:
|
||||
print(f"will overwrite: {files}")
|
||||
|
||||
# add to db
|
||||
pkg = PipPackage(repo=repo,
|
||||
dist=metadata["fields"]["dist"],
|
||||
version=metadata["fields"]["version"],
|
||||
build=metadata["fields"]["build"],
|
||||
python=metadata["fields"]["python"],
|
||||
api=metadata["fields"]["api"],
|
||||
platform=metadata["fields"]["platform"],
|
||||
fname=metadata["wheelname"],
|
||||
size=metadata["size"],
|
||||
sha256=shasum,
|
||||
fields=json.dumps(metadata))
|
||||
db().add(pkg)
|
||||
db().commit()
|
||||
|
||||
try:
|
||||
with open(tmppkgpath, "rb") as f:
|
||||
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
|
||||
assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}"
|
||||
except Exception:
|
||||
db().delete(pkg)
|
||||
db().commit()
|
||||
raise
|
||||
|
||||
yield json.dumps(metadata, indent=4)
|
||||
|
||||
|
||||
@cherrypy.popargs("reponame")
|
||||
class PipWeb(object):
|
||||
def __init__(self, base):
|
||||
self.base = base
|
||||
# self.dists = AptDists(base)
|
||||
# self.packages = AptFiles(base)
|
||||
|
||||
@cherrypy.expose
|
||||
def index(self, reponame=None):
|
||||
yield "viewing repo {}".format(reponame)
|
||||
|
|
Loading…
Reference in New Issue