docker-artifact/repobot/pypiprovider.py

309 lines
12 KiB
Python
Raw Normal View History

2019-05-04 18:20:17 -07:00
import cherrypy
import hashlib
import json
import os
import re
from email import message_from_string
2019-05-04 19:29:49 -07:00
from jinja2 import Environment, FileSystemLoader, select_autoescape
2019-05-04 18:20:17 -07:00
from sqlalchemy import Column, ForeignKey, UniqueConstraint
from sqlalchemy.orm import relationship
from sqlalchemy.types import String, Integer, Text
from tempfile import TemporaryDirectory
from wheel import wheelfile
from repobot.tables import Base, db
2019-05-04 19:29:49 -07:00
# Absolute path of the directory one level above the directory containing this module.
# PipWeb falls back to "<APPROOT>/templates" when no "templates" directory exists in the working directory.
APPROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
2019-05-04 18:20:17 -07:00
def parse_wheel(path):
    """
    Read a wheel file from disk and extract its naming fields and metadata.

    :param path: filesystem path of the wheel file. It is opened with wheel's WheelFile.
    :return: dict with the keys:
        - "fields": dict of "dist", "version", "build" (None when absent), "python", "api" and "platform"
        - "wheel": (header, value) pairs from the archive's WHEEL file
        - "metadata": (header, value) pairs from the archive's METADATA file
        - "description": payload (body) of the METADATA file
        - "wheelname": file name reconstructed from the metadata, ending in ".whl"
        - "size": size of the file at `path` in bytes
    :raises AssertionError: if the METADATA or WHEEL file is missing, if the wheel's tags do not share exactly
        one abi tag, or if a required metadata field is missing. These are raised explicitly (rather than with
        the `assert` statement) so that the validation is not stripped when python runs with -O.
    """
    fsize = os.path.getsize(path)

    # open up wheel file (it's actually a zip). The context manager closes the file handle once we have
    # read the two files we need.
    with wheelfile.WheelFile(path) as whl:
        # look for the files we care about in the '<wheelname>.dist-info' directory
        metadata_file = None
        metadata_wheel = None
        for entry in whl.filelist:
            dirname, basename = os.path.split(entry.filename)
            if not dirname.endswith(".dist-info"):
                continue
            if basename == "METADATA":
                metadata_file = entry
            elif basename == "WHEEL":
                metadata_wheel = entry

        if metadata_file is None:
            raise AssertionError("METADATA file not found")
        if metadata_wheel is None:
            raise AssertionError("WHEEL file not found")

        metadata_data = message_from_string(whl.read(metadata_file.filename).decode("UTF-8"))
        wheel_data = message_from_string(whl.read(metadata_wheel.filename).decode("UTF-8"))

    # get version and whatnot from the pkginfo. there will be multiple Tags with the same python and api, but
    # there can be varying platforms.
    python_versions = set()
    python_apis = set()
    python_platforms = set()

    # default to an empty list so a WHEEL file without any Tag header fails the check below with a clear
    # message instead of a TypeError from iterating None
    for tag in wheel_data.get_all("Tag", []):
        python_version, python_api, python_platform = tag.split("-")  # ['py3', 'none', 'any']
        python_versions.add(python_version)
        python_apis.add(python_api)
        python_platforms.add(python_platform)

    if len(python_apis) != 1:
        raise AssertionError("wheel metadata python api list has other than 1 unique entry")

    # generate final platforming string
    python_version = '.'.join(sorted(python_versions, key=natural_keys))
    python_api = python_apis.pop()
    python_platform = '.'.join(sorted(python_platforms, key=natural_keys))

    buildtag = wheel_data["Build"]  # None when the WHEEL file has no Build header

    name_parts = [metadata_data["Name"], metadata_data["Version"], python_version, python_api, python_platform]
    if buildtag:
        # the optional build tag sits between the version and the python tag
        name_parts.insert(2, buildtag)

    if None in name_parts:
        raise AssertionError("Required metadata field missing")

    # construct the filename the wheel is expected to have
    fname_parts = name_parts[:]
    fname_parts[0] = fname_parts[0].replace("-", "_")  # replaces dashes in dist name with underscore
    wheelname = "-".join(fname_parts) + ".whl"

    return {"fields": {"dist": name_parts[0],
                       "version": name_parts[1],
                       "build": buildtag,
                       "python": python_version,
                       "api": python_api,
                       "platform": python_platform},
            "wheel": wheel_data.items(),
            "metadata": metadata_data.items(),
            "description": metadata_data.get_payload(),
            "wheelname": wheelname,
            "size": fsize}
2019-05-04 19:29:49 -07:00
def normalize(name):
    """
    Return `name` lowercased, with every run of dashes, underscores and dots collapsed into a single dash.
    """
    collapsed = re.sub(r"[-_.]+", "-", name)
    return collapsed.lower()
2019-05-04 18:20:17 -07:00
# https://stackoverflow.com/a/5967539
def sort_atoi(text):
    """
    Convert `text` to an int when it consists only of digits; otherwise return it unchanged.
    """
    if text.isdigit():
        return int(text)
    return text
def natural_keys(text):
    """
    Build a sort key that orders strings "naturally": runs of digits are compared as numbers instead of
    character by character, so names with embedded versions sort as expected:
    - macosx_10_6_intel
    - macosx_10_9_intel
    - macosx_10_9_x86_64
    - macosx_10_10_intel
    - macosx_10_10_x86_64

    :return: list alternating between the non-digit chunks of `text` (str) and its digit chunks (int)
    """
    # the capturing group makes re.split keep the digit runs in its output
    chunks = re.split(r'(\d+)', text)
    return list(map(sort_atoi, chunks))
class PipRepo(Base):
    """
    A named pip repository. Packages point at a row of this table through PipPackage.repo_id.
    """
    __tablename__ = 'piprepo'
    id = Column(Integer, primary_key=True)
    # unique repository name; get_repo() looks repos up by this column
    name = Column(String(length=32), unique=True, nullable=False)
class PipPackage(Base):
    """
    One wheel file stored in a pip repository. A file name may appear only once per repository.
    """
    __tablename__ = 'pippkg'

    id = Column(Integer, primary_key=True)

    repo_id = Column(Integer, ForeignKey("piprepo.id"), nullable=False)
    repo = relationship("PipRepo")

    # see https://github.com/pypa/wheel/blob/master/wheel/wheelfile.py
    # {distribution}-{version}(-{build tag})?-{python tag}-{abi tag}-{platform tag}.whl
    dist = Column(String(length=128), nullable=False)       # 'requests'
    dist_norm = Column(String(length=128), nullable=False)  # 'requests' - dist passed through normalize()
    version = Column(String(length=64), nullable=False)     # '2.14.2'
    build = Column(String(length=64), nullable=True)        # '1234'
    python = Column(String(length=64), nullable=False)      # 'cp37'
    api = Column(String(length=64), nullable=False)         # 'cp37m'
    platform = Column(String(length=256), nullable=False)   # 'manylinux1_x86_64'

    fname = Column(String(length=256), nullable=False)  # wheel file name
    size = Column(Integer, nullable=False)              # file size in bytes
    sha256 = Column(String(length=64))                  # hex sha256 digest of the file
    fields = Column(Text())                             # JSON dump of the parsed wheel metadata

    __table_args__ = (UniqueConstraint('fname', 'repo_id', name='pip_unique_repopkg'), )

    @property
    def blobpath(self):
        """
        Path of this wheel relative to the provider's base path: repos/<reponame>/wheels/<f>/<fname>, where
        <f> is the lowercased first character of the file name. This is the same layout the upload and
        download code uses for S3 keys.
        """
        # This model has no `name` attribute; the file name column is `fname`.
        return os.path.join("repos", self.repo.name, "wheels", self.fname[0].lower(), self.fname)
def get_repo(_db, repo_name, create_ok=True):
    """
    Look up a PipRepo by name, optionally creating it when it does not exist.

    :param _db: database session used for the query and, when creating, the add + commit
    :param repo_name: name of the repo to fetch
    :param create_ok: when true, a missing repo is created and committed
    :return: the PipRepo, or None if it does not exist and create_ok is false
    """
    found = _db.query(PipRepo).filter(PipRepo.name == repo_name).first()
    if found or not create_ok:
        return found

    created = PipRepo(name=repo_name)
    _db.add(created)
    _db.commit()
    return created
def copysha256(fin, fout):
    """
    Copy everything readable from `fin` into `fout` in 4096-byte reads, hashing the data on the way through.

    :param fin: source file object, read until it returns no more data
    :param fout: destination file object
    :return: hex sha256 digest of the copied data
    """
    digest = hashlib.sha256()
    chunk = fin.read(4096)
    while chunk:
        digest.update(chunk)
        fout.write(chunk)
        chunk = fin.read(4096)
    return digest.hexdigest()
2019-04-29 22:23:25 -07:00
class PypiProvider(object):
    """
    Pip repository provider. Uploaded wheels are stored in an S3 bucket and recorded as PipPackage rows.
    Constructing the provider also mounts the PipWeb application at /repo/pypi.
    """

    def __init__(self, dbcon, s3client, bucket):
        """
        :param dbcon: database connection object; stored as self.db
        :param s3client: S3 client providing list_objects / put_object / get_object / delete_object
        :param bucket: name of the S3 bucket holding the wheels
        """
        self.db = dbcon
        self.s3 = s3client
        self.bucket = bucket
        # base path within the s3 bucket
        self.basepath = "data/provider/pip"

        cherrypy.tree.mount(PipWeb(self), "/repo/pypi", {'/': {'tools.trailing_slash.on': False,
                                                               'tools.db.on': True}})

    def web_addpkg(self, reponame, name, version, fobj):
        """
        Add an uploaded wheel to a repo: validate it, record it in the database and upload it to S3.

        :param reponame: name of the repo to add to; it is created if it does not exist
        :param name: not used by this method
        :param version: version the uploader claims; must match the version in the wheel's metadata
        :param fobj: upload object exposing `filename` and a readable binary `file`
        :return: the parsed wheel metadata as an indented JSON string
        :raises AssertionError: if the file name contains path components, the version or file name does not
            match the wheel's metadata, or the S3 upload does not return HTTP 200. These are raised explicitly
            so the checks are not stripped when python runs with -O.
        """
        # The file name is client supplied and ends up in a filesystem path below, so refuse anything that
        # is not a bare file name (e.g. "../../passwd").
        filename = fobj.filename
        if not filename or filename in (".", "..") or os.path.basename(filename) != filename:
            raise AssertionError("file name is invalid, must not contain path components")

        repo = get_repo(db(), reponame)

        # write wheel to temp storage
        with TemporaryDirectory() as tdir:
            tmppkgpath = os.path.join(tdir, filename)
            with open(tmppkgpath, "wb") as fdest:
                shasum = copysha256(fobj.file, fdest)

            metadata = parse_wheel(tmppkgpath)

            if version != metadata["fields"]["version"]:
                raise AssertionError("wheel metadata version doesn't match supplied version")
            if filename != metadata["wheelname"]:
                raise AssertionError(f"file name is invalid, wanted '{metadata['wheelname']}'")

            # s3 path - repos/<reponame>/wheels/f/foo.wheel
            dpath = os.path.join(self.basepath, "repos", repo.name, "wheels",
                                 metadata["wheelname"][0].lower(), metadata["wheelname"])

            files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
            if files:
                print(f"will overwrite: {files}")

            # add to db
            pkg = PipPackage(repo=repo,
                             dist=metadata["fields"]["dist"],
                             dist_norm=normalize(metadata["fields"]["dist"]),  # index me ?
                             version=metadata["fields"]["version"],
                             build=metadata["fields"]["build"],
                             python=metadata["fields"]["python"],
                             api=metadata["fields"]["api"],
                             platform=metadata["fields"]["platform"],
                             fname=metadata["wheelname"],
                             size=metadata["size"],
                             sha256=shasum,
                             fields=json.dumps(metadata))
            db().add(pkg)
            db().commit()

            # upload while the temp file still exists; if anything goes wrong, remove the row committed
            # above so the database does not reference a wheel that is not in S3
            try:
                with open(tmppkgpath, "rb") as f:
                    response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
                    if response["ResponseMetadata"]["HTTPStatusCode"] != 200:
                        raise AssertionError(f"Upload failed: {response}")
            except Exception:
                db().delete(pkg)
                db().commit()
                raise

        return json.dumps(metadata, indent=4)
2019-05-04 18:20:17 -07:00
2019-05-04 19:29:49 -07:00
@cherrypy.popargs("reponame", "distname", "filename")
class PipWeb(object):
    """
    CherryPy application serving the pip repositories: HTML listings of repos, dists and files, plus download
    (GET) and removal (DELETE) of individual wheel files. URL path segments are mapped, in order, onto
    reponame, distname and filename.
    """

    def __init__(self, base):
        """
        :param base: the PypiProvider this app serves; its s3 client, bucket and basepath are used
        """
        self.base = base

        # prefer a "templates" directory in the working directory, else the one under APPROOT
        template_dir = "templates" if os.path.exists("templates") else os.path.join(APPROOT, "templates")
        self.tpl = Environment(loader=FileSystemLoader(template_dir),
                               autoescape=select_autoescape(['html', 'xml']))
        self.tpl.filters.update(normalize=normalize)

    @cherrypy.expose
    def index(self, reponame=None, distname=None, filename=None):
        """
        Entry point for all requests: a request naming a file is a download/delete, anything else is a listing.
        """
        if filename:
            return self.handle_download(reponame, distname, filename)
        return self.handle_navigation(reponame, distname, filename)

    # handle_download returns a generator; stream it to the client
    index._cp_config = {'response.stream': True}

    def handle_navigation(self, reponame=None, distname=None, filename=None):
        """
        Render the listing page for the given level: all repos, the dists of one repo, or the files of one dist.

        :raises cherrypy.HTTPError: 404 if `reponame` is given but no such repo exists
        """
        if not reponame:
            return self.tpl.get_template("pypi/root.html") \
                .render(repos=db().query(PipRepo).order_by(PipRepo.name).all())

        repo = get_repo(db(), reponame, create_ok=False)
        if not repo:
            raise cherrypy.HTTPError(404)

        if not distname:
            return self.tpl.get_template("pypi/repo.html") \
                .render(repo=repo,
                        dists=self._get_dists(repo))

        # dist_norm holds normalized names, so normalize the requested name before comparing; otherwise a
        # request for e.g. "Foo_Bar" would never match the stored "foo-bar"
        distname = normalize(distname)
        pkgs = db().query(PipPackage) \
            .filter(PipPackage.repo == repo, PipPackage.dist_norm == distname) \
            .order_by(PipPackage.version) \
            .all()
        return self.tpl.get_template("pypi/dist.html") \
            .render(repo=repo,
                    pkgs=pkgs,
                    distname=distname)

    def _get_dists(self, repo):
        """
        Yield one PipPackage per distinct dist name in `repo`, in dist name order.
        """
        lastdist = None
        for dist in db().query(PipPackage).filter(PipPackage.repo == repo).order_by(PipPackage.dist).all():
            # rows are ordered by dist, so duplicates of a dist name are adjacent
            if lastdist and dist.dist == lastdist:
                continue
            yield dist
            lastdist = dist.dist

    def handle_download(self, reponame, distname, filename):
        """
        Serve (GET) or remove (DELETE) a single wheel file of a repo.

        :return: "OK" after a DELETE; a generator yielding the file's bytes for a GET
        :raises cherrypy.HTTPError: 404 if the repo or file is unknown, 405 for any other request method
        """
        repo = get_repo(db(), reponame, create_ok=False)
        if not repo:
            raise cherrypy.HTTPError(404)

        pkg = db().query(PipPackage).filter(PipPackage.repo == repo, PipPackage.fname == filename).first()
        if not pkg:
            raise cherrypy.HTTPError(404)

        dpath = os.path.join(self.base.basepath, "repos", repo.name, "wheels", pkg.fname[0].lower(), pkg.fname)
        method = str(cherrypy.request.method)

        if method == "DELETE":
            db().delete(pkg)
            files = self.base.s3.list_objects(Bucket=self.base.bucket, Prefix=dpath).get("Contents")
            if files:
                self.base.s3.delete_object(Bucket=self.base.bucket, Key=dpath)
            db().commit()
            return "OK"

        if method == "GET":
            response = self.base.s3.get_object(Bucket=self.base.bucket, Key=dpath)

            cherrypy.response.headers["Content-Type"] = "binary/octet-stream"
            cherrypy.response.headers["Content-Length"] = response["ContentLength"]

            body = response["Body"]

            def stream():
                # close the S3 body once the download finishes or the generator is discarded early
                try:
                    while True:
                        data = body.read(65535)
                        if not data:
                            return
                        yield data
                finally:
                    body.close()

            return stream()

        raise cherrypy.HTTPError(405)