# docker-artifact/repobot/aptprovider.py

import cherrypy
import gnupg
import hashlib
import json
import os
import queue
import sqlalchemy
import traceback
from datetime import datetime
from pydpkg import Dpkg
from sqlalchemy import Column, ForeignKey, UniqueConstraint
from sqlalchemy.dialects.mysql import LONGTEXT
from sqlalchemy.orm import relationship
from sqlalchemy.types import String, Integer, Text, BOOLEAN
from tempfile import TemporaryDirectory
from threading import Thread
from repobot.tables import Base, db
class AptRepo(Base):
__tablename__ = 'aptrepo'
id = Column(Integer, primary_key=True)
name = Column(String(length=32), unique=True, nullable=False)
gpgkey = Column(Text(), nullable=True)
gpgkeyprint = Column(Text(), nullable=True)
gpgpubkey = Column(Text(), nullable=True)
dists = relationship("AptDist")
class AptDist(Base):
__tablename__ = 'aptdist'
id = Column(Integer, primary_key=True)
repo_id = Column(Integer, ForeignKey("aptrepo.id"), nullable=False)
repo = relationship("AptRepo")
dirty = Column(BOOLEAN(), nullable=False, default=False)
name = Column(String(length=32), nullable=False)
packages_cache = Column(LONGTEXT(), nullable=True)
release_cache = Column(Text(), nullable=True)
sig_cache = Column(Text(), nullable=True)
__table_args__ = (UniqueConstraint('repo_id', 'name', name='apt_unique_repodist'), )
class AptPackage(Base):
__tablename__ = 'aptpkg'
id = Column(Integer, primary_key=True)
repo_id = Column(Integer, ForeignKey("aptrepo.id"), nullable=False)
repo = relationship("AptRepo")
dist_id = Column(Integer, ForeignKey("aptdist.id"), nullable=False)
dist = relationship("AptDist")
# index (always 'binary-amd64' for now)
name = Column(String(length=128), nullable=False) # 'python3-pip'
version = Column(String(length=128), nullable=False) # '4.20.1'
arch = Column(String(length=16), nullable=False) # 'amd64'
fname = Column(String(length=256), nullable=False)
size = Column(Integer, nullable=False)
md5 = Column(String(length=32))
sha1 = Column(String(length=40))
sha256 = Column(String(length=64))
sha512 = Column(String(length=128))
fields = Column(Text())
__table_args__ = (UniqueConstraint('name', 'version', 'repo_id', 'dist_id', name='apt_unique_repodist'), )
@property
def blobpath(self):
return os.path.join("repos", self.repo.name, "packages", self.name[0], self.fname)
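    # Example of the resulting layout (names are illustrative): a package file
    # "foo_1.0_amd64.deb" in repo "myrepo" maps to
    # "repos/myrepo/packages/f/foo_1.0_amd64.deb", relative to the provider's
    # base path in the S3 bucket.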
def get_repo(_db, repo_name, create_ok=True):
"""
Fetch a repo from the database by name
"""
repo = _db.query(AptRepo).filter(AptRepo.name == repo_name).first()
if not repo and create_ok:
repo = AptRepo(name=repo_name)
_db.add(repo)
_db.commit()
return repo
def get_dist(_db, repo, dist_name, create_ok=True):
"""
Fetch a repo's dist from the database by name
"""
dist = _db.query(AptDist).filter(AptDist.name == dist_name, AptDist.repo_id == repo.id).first()
if not dist and create_ok:
dist = AptDist(name=dist_name, repo_id=repo.id)
_db.add(dist)
_db.commit()
return dist
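# Maps hashlib constructor names to the corresponding field names apt expects
# in the Packages and Release indexes (e.g. hashlib.md5 -> "MD5Sum").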
algos = {"md5": "MD5Sum",
"sha1": "SHA1",
"sha256": "SHA256",
"sha512": "SHA512"}
def copyhash(fin, fout):
"""
Copy a file and calculate hashes while doing so
"""
hashes = {}
for algo in algos.keys():
hashes[algo] = getattr(hashlib, algo)()
while True:
data = fin.read(4096)
if not data:
break
for h in hashes.values():
h.update(data)
fout.write(data)
return {k: v.hexdigest() for k, v in hashes.items()}
def hashmany(data):
"""
Hash the input data using several algos
"""
hashes = {}
for algo in algos.keys():
hashes[algo] = getattr(hashlib, algo)()
for h in hashes.values():
h.update(data)
return {k: v.hexdigest() for k, v in hashes.items()}
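# Minimal usage sketch for the two helpers above (file names are illustrative):
#
#   with open("in.deb", "rb") as fin, open("out.deb", "wb") as fout:
#       digests = copyhash(fin, fout)    # {"md5": "...", "sha1": "...", ...}
#   digests = hashmany(b"raw bytes")     # same shape, for in-memory data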
class AptProvider(object):
def __init__(self, dbcon, s3client, bucket):
self.db = dbcon
self.s3 = s3client
self.bucket = bucket
"""base path within the s3 bucket"""
self.basepath = "data/provider/apt"
"""queue entries are tuples containing the database id of the dist to regenerate indexes and signatures for"""
self.queue = queue.Queue()
cherrypy.tree.mount(AptWeb(self), "/repo/apt", {'/': {'tools.trailing_slash.on': False,
'tools.db.on': True}})
self.updater = Thread(target=self.sign_packages, daemon=True)
self.updater.start()
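        # sign_packages() below runs on this single daemon thread, draining
        # self.queue and rebuilding dist indexes and signatures one task at a time.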
def sign_packages(self):
Session = sqlalchemy.orm.sessionmaker(autoflush=True, autocommit=False)
Session.configure(bind=self.db)
while True:
try:
work = self.queue.get(block=True, timeout=5)
except queue.Empty:
continue
session = Session()
try:
self._sign_packages(session, work)
            except Exception:
traceback.print_exc()
finally:
session.close()
def _sign_packages(self, session, work):
dist_id = work[0]
dist = session.query(AptDist).filter(AptDist.id == dist_id).first()
print("Generating metadata for repo:{} dist:{}".format(dist.repo.name, dist.name))
str_packages = ""
for package in session.query(AptPackage) \
.filter(AptPackage.repo == dist.repo,
AptPackage.dist == dist) \
.order_by(AptPackage.id).all():
fields = json.loads(package.fields)
for k, v in fields.items():
str_packages += "{}: {}\n".format(k, v)
for algo, algoname in algos.items():
str_packages += "{}: {}\n".format(algoname, getattr(package, algo))
str_packages += "Filename: packages/{}/{}\n".format(package.fname[0], package.fname)
str_packages += "Size: {}\n".format(package.size)
str_packages += "\n"
dist.packages_cache = str_packages.encode("utf-8")
release_hashes = hashmany(dist.packages_cache)
str_release = """Origin: . {dist}
Label: . {dist}
Suite: {dist}
Codename: {dist}
Date: {time}
Architectures: amd64
Components: main
Description: Generated by Repobot
""".format(dist=dist.name, time=datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC"))
for algo, algoname in algos.items():
str_release += "{}:\n {} {} {}/{}/{}\n".format(algoname,
release_hashes[algo],
len(dist.packages_cache),
"main", #TODO component
"binary-amd64", #TODO whatever this was
"Packages")
dist.release_cache = str_release.encode("utf-8")
keyemail = 'debian_signing@localhost'
with TemporaryDirectory() as tdir:
gpg = gnupg.GPG(gnupghome=tdir)
def getkey():
keys = [i for i in gpg.list_keys(secret=True) if any([keyemail in k for k in i["uids"]])]
if keys:
return keys[0]
fingerprint = None
if not dist.repo.gpgkey:
print("Generating key for", dist.repo.name)
key = gpg.gen_key(gpg.gen_key_input(name_email=keyemail,
expire_date='2029-04-28',
key_type='RSA',
key_length=4096,
key_usage='encrypt,sign,auth',
passphrase="secret"))
fingerprint = key.fingerprint
dist.repo.gpgkey = gpg.export_keys(fingerprint, secret=True, passphrase="secret")
dist.repo.gpgkeyprint = fingerprint
dist.repo.gpgpubkey = gpg.export_keys(fingerprint)
else:
import_result = gpg.import_keys(dist.repo.gpgkey)
                fingerprint = import_result.results[0]['fingerprint']  # errors here suggest a gpg import issue
                assert fingerprint == getkey()['fingerprint']
dist.sig_cache = gpg.sign(dist.release_cache, keyid=fingerprint, passphrase='secret',
detach=True, clearsign=False).data
dist.dirty = False
session.commit()
print("Metadata generation complete")
def web_addpkg(self, reponame, name, version, fobj, dist):
repo = get_repo(db(), reponame)
dist = get_dist(db(), repo, dist)
print("Dist:", dist)
# - read f (write to temp storage if needed) and generate the hashes
# - load with Dpkg to get name version and whatnot
with TemporaryDirectory() as tdir:
tmppkgpath = os.path.join(tdir, "temp.deb")
with open(tmppkgpath, "wb") as fdest:
fhashes = copyhash(fobj.file, fdest)
fsize = os.path.getsize(tmppkgpath)
p = Dpkg(tmppkgpath)
pkgname = "{}_{}_{}.deb".format(p.message['Package'], p.message['Version'], p.message['Architecture'])
#TODO keys can be duplicated in email.message.Message, does this cause any problems?
fields = {key: p.message[key] for key in p.message.keys()}
# repos/<reponame>/packages/f/foo.deb
dpath = os.path.join(self.basepath, "repos", repo.name, "packages", pkgname[0], pkgname)
files = self.s3.list_objects(Bucket=self.bucket, Prefix=dpath).get("Contents")
if files:
print(f"will overwrite: {files}")
pkg = AptPackage(repo=repo, dist=dist,
name=p.message['Package'],
version=p.message['Version'],
arch=p.message['Architecture'],
fname=pkgname,
size=fsize,
**fhashes,
fields=json.dumps(fields))
db().add(pkg)
db().commit()
try:
with open(tmppkgpath, "rb") as f:
response = self.s3.put_object(Body=f, Bucket=self.bucket, Key=dpath)
assert(response["ResponseMetadata"]["HTTPStatusCode"] == 200), f"Upload failed: {response}"
except Exception:
db().delete(pkg)
db().commit()
raise
dist.dirty = True
db().commit()
self.regen_dist(dist.id)
yield "package name: {}\n".format(pkgname)
yield "package size: {}\n".format(fsize)
yield "package message:\n-----------------\n{}\n-----------------\n".format(p.message)
yield "package hashes: {}\n".format(fhashes)
def regen_dist(self, dist_id):
self.queue.put((dist_id, ))
#TODO
# - verify dpkg name & version match params
# - copy to persistent storage
# - add db record keyed under repo name and dist (and index but only 'binary-amd64' for now)
# - mark dist dirty
@cherrypy.popargs("reponame")
class AptWeb(object):
def __init__(self, base):
self.base = base
self.dists = AptDists(base)
self.packages = AptFiles(base)
@cherrypy.expose
def index(self, reponame=None, regen=False):
if reponame:
            repo = get_repo(db(), reponame, create_ok=False)
            if not repo:
                raise cherrypy.HTTPError(404)
yield "<a href='/repo/apt/{reponame}/pubkey'>pubkey</a> " \
"<a href='/repo/apt/{reponame}?regen=1'>regen</a><hr/>".format(reponame=repo.name)
for dist in db().query(AptDist).filter(AptDist.repo == repo).order_by(AptDist.name).all():
yield "<a href='/repo/apt/{reponame}/dists/{name}'>{name}</a>: <a href='/repo/apt/{reponame}/dists/{name}/main/indexname/Packages'>Packages</a> <a href='/repo/apt/{reponame}/dists/{name}/Release'>Release</a> <a href='/repo/apt/{reponame}/dists/{name}/Release.gpg'>Release.gpg</a><br />".format(reponame=repo.name, name=dist.name)
if regen:
self.base.regen_dist(dist.id)
# yield "about apt repo '{}'".format(reponame)
else:
for repo in db().query(AptRepo).order_by(AptRepo.name).all():
yield "<a href='/repo/apt/{name}'>{name}</a><br/>".format(name=repo.name)
@cherrypy.expose
def pubkey(self, reponame=None):
cherrypy.response.headers['Content-Type'] = 'text/plain'
        repo = get_repo(db(), reponame, create_ok=False)
        if not repo:
            raise cherrypy.HTTPError(404)
        return repo.gpgpubkey
@cherrypy.expose
class AptDists(object):
_cp_config = {'request.dispatch': cherrypy.dispatch.MethodDispatcher()}
def __init__(self, base):
self.base = base
def __call__(self, *segments, reponame=None):
        repo = get_repo(db(), reponame, create_ok=False)
        if not repo:
            raise cherrypy.HTTPError(404)
if len(segments) == 4 and segments[3] == "Packages":
distname, componentname, indexname, pkgs = segments
dist = get_dist(db(), repo, distname, create_ok=False)
            if not dist:
                raise cherrypy.HTTPError(404)
cherrypy.response.headers['Content-Type'] = 'text/plain'
return dist.packages_cache
elif len(segments) == 2:
distname, target = segments
            dist = get_dist(db(), repo, distname, create_ok=False)
            if not dist:
                raise cherrypy.HTTPError(404)
cherrypy.response.headers['Content-Type'] = 'text/plain'
if target == "Release":
return dist.release_cache
elif target == "Release.gpg":
return dist.sig_cache
else:
raise cherrypy.HTTPError(404)
elif len(segments) == 1:
distname = segments[0]
            dist = get_dist(db(), repo, distname, create_ok=False)
            if not dist:
                raise cherrypy.HTTPError(404)
body = ""
for package in db().query(AptPackage).filter(AptPackage.repo == repo,
AptPackage.dist == dist).order_by(AptPackage.fname).all():
body += "<a href='/repo/apt/{reponame}/packages/{fname[0]}/{fname}'>{fname}</a><br />".format(reponame=repo.name, fname=package.fname)
return body
raise cherrypy.HTTPError(404)
@cherrypy.expose
class AptFiles(object):
_cp_config = {'request.dispatch': cherrypy.dispatch.MethodDispatcher()}
def __init__(self, base):
self.base = base
def __call__(self, *segments, reponame=None):
        if len(segments) != 2:
            raise cherrypy.HTTPError(404)
        firstletter, pkgname = segments
repo = get_repo(db(), reponame, create_ok=False)
package = db().query(AptPackage).filter(AptPackage.repo == repo, AptPackage.fname == pkgname).first()
if not package:
raise cherrypy.HTTPError(404)
dpath = os.path.join(self.base.basepath, package.blobpath)
response = self.base.s3.get_object(Bucket=self.base.bucket, Key=dpath)
cherrypy.response.headers["Content-Type"] = "application/x-debian-package"
cherrypy.response.headers["Content-Length"] = response["ContentLength"]
def stream():
while True:
data = response["Body"].read(65535)
if not data:
return
yield data
return stream()
__call__._cp_config = {'response.stream': True}
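    # With 'response.stream' enabled, CherryPy iterates the generator returned
    # by __call__ and sends each chunk as it is read from S3, rather than
    # buffering the whole .deb in memory before responding.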