initial commit - repo databasing

This commit is contained in:
dave 2022-11-17 19:46:32 -08:00
commit 40965d9775
4 changed files with 623 additions and 0 deletions

0
pydebmirror/__init__.py Normal file
View File

217
pydebmirror/cli.py Normal file
View File

@ -0,0 +1,217 @@
import os
import sqlite3
import requests
import gzip
import argparse
from contextlib import closing
from email import message_from_string
from dataclasses import dataclass
def dict_factory(c, row):
    """
    sqlite3 row factory: turn a result row into a dict keyed by column name
    """
    # each entry of c.description is a tuple whose first element is the column name
    return {column[0]: row[position] for position, column in enumerate(c.description)}
def get_db(db_path):
    """
    Open the package database stored in the directory `db_path` and make sure its schema exists

    The database file is `<db_path>/packages.db`. The directory is created when it is missing so that the
    first run against a fresh path works instead of failing inside sqlite3.connect.

    Returns the sqlite3 connection; rows are returned as dicts (see dict_factory).
    """
    # an empty db_path means "current directory"; os.makedirs("") would raise
    if db_path:
        os.makedirs(db_path, exist_ok=True)
    db_file = os.path.join(db_path, "packages.db")
    db = sqlite3.connect(db_file)
    db.row_factory = dict_factory
    queries = [
        """CREATE TABLE IF NOT EXISTS 'packages' (
'name' TEXT,
'version' TEXT,
'arch' TEXT,
'fname' TEXT,
'sha256' TEXT,
'has_file' BOOLEAN DEFAULT(0),
'metadata' TEXT, -- from ubuntu/dists/focal/main/binary-amd64/Packages.gz"
UNIQUE(name, version, arch),
UNIQUE(fname),
UNIQUE(sha256)
)""",
    ]
    with closing(db.cursor()) as c:
        for query in queries:
            c.execute(query)
    return db
def request_packages(url):
    """
    Fetch a "Packages" metadata file and return its decoded text

    The "Packages" metadata file may be plain, or with the .gz or .xz extension. This method requests each
    variant in turn (.gz, then .xz, then the plain file) until one is found.

    `url` is the extension-less URL, e.g.
    http://archive.ubuntu.com/ubuntu/dists/focal/main/binary-amd64/Packages

    Raises requests.HTTPError when a variant fails with an error status other than 404, or when the final
    (plain) request fails.
    """
    # local import: lzma is only needed here and is not imported at the top of the file
    import lzma

    for extension, decompress in ((".gz", gzip.decompress), (".xz", lzma.decompress)):
        resp = requests.get(url + extension)
        if resp.status_code == 200:
            return decompress(resp.content).decode("utf-8")
        # a 404 just means this variant is not published; anything else is a real error
        if resp.status_code != 404:
            resp.raise_for_status()

    resp = requests.get(url)
    resp.raise_for_status()
    # decode explicitly rather than letting requests guess the encoding from the response
    return resp.content.decode("utf-8")
@dataclass
class Repoline:
    """
    Repoline represents one line in an apt sources.list file
    """
    base_url: str  # repository root; parse() guarantees a trailing "/"
    arch: str
    dist: str
    components: list[str]

    @property
    def packages_urls(self):
        """
        URL to the 'Packages' metadata file for each component, keyed by component name. The URLs carry no
        compression extension, e.g. http://archive.ubuntu.com/ubuntu/dists/focal/main/binary-amd64/Packages
        """
        return {
            component: "{}dists/{}/{}/binary-{}/Packages".format(self.base_url, self.dist, component, self.arch)
            for component in self.components
        }

    def fetch_packages_meta(self, component):
        """
        Download the Packages file of `component` and return one parsed email.message.Message per package

        request_packages(url) is expected to yield the decoded text of the Packages file.
        """
        url = self.packages_urls[component]
        # stanzas are separated by blank lines; skip empty chunks instead of blindly dropping the last one
        stanzas = [p for p in request_packages(url).split("\n\n") if p.strip()]
        return [message_from_string(p) for p in stanzas]

    @staticmethod
    def parse(line) -> "Repoline":
        """
        Parse 'deb [arch=xxx] http://archive.ubuntu.com/ubuntu/ focal main restricted'

        The bracketed options block is optional and may span several whitespace-separated tokens.
        Raises Exception when the line does not start with 'deb' and ValueError when it is otherwise malformed.
        """
        fields = line.split()
        if not fields:
            raise ValueError("expected a deb line but got an empty line")

        # discard the 'deb' prefix
        if fields[0] != "deb":
            raise Exception("expected deb line to start with 'deb' but got '{}'".format(fields[0]))
        fields.pop(0)

        #TODO parse or require arch
        # discard '[arch=xxx ...]', consuming tokens up to and including the one that closes the bracket
        if fields and fields[0].startswith("["):
            while fields:
                if fields.pop(0).endswith("]"):
                    break
            else:
                raise ValueError("unterminated '[' options block in deb line")

        # assume amd64 for now
        arch = "amd64"

        if len(fields) < 2:
            raise ValueError("expected deb line to contain a url and a dist")

        # now we have the base url, the dist, and whatever is left is the component list
        base_url, dist, *components = fields
        if not base_url.endswith("/"):
            base_url = base_url + "/"

        return Repoline(base_url=base_url, arch=arch, dist=dist, components=components)
def download_file(url, local_path):
    """
    Stream the file at `url` to `local_path`

    The body is written to `<local_path>.part` and only moved into place once the download completed, so an
    interrupted or failed download never leaves a truncated file under the final name.

    Raises requests.HTTPError when the server answers with an error status.
    """
    print("downloading", url)
    partial_path = local_path + ".part"
    try:
        # the context manager releases the connection even when writing fails
        with requests.get(url, stream=True) as resp:
            resp.raise_for_status()
            with open(partial_path, "wb") as f:
                # iter_content() defaults to 1-byte chunks, which is extremely slow for package files
                for chunk in resp.iter_content(chunk_size=256 * 1024):
                    f.write(chunk)
        os.replace(partial_path, local_path)
    except BaseException:
        # don't leave the partial file behind
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
def cmd_ingest(args, parser):
    """
    Handler for the "ingest" action: import package metadata and/or .deb files from an apt source line

    Reads args.line (the sources.list line), args.database (database directory), args.meta (metadata only)
    and args.debs (downloads only). The database connection is closed when the command finishes.
    """
    if not args.line:
        print("--file not yet supported")
        return

    repo = Repoline.parse(args.line)

    with closing(get_db(args.database)) as db:
        if not args.debs:
            _ingest_metadata(db, repo)
        if not args.meta:
            _ingest_debs(db, repo, args.database)


def _ingest_metadata(db, repo):
    """
    Insert a row into the packages table for every package of `repo` that is not already recorded
    """
    new_packages = False
    with closing(db.cursor()) as c:
        for component in repo.components:
            print("fetching", component)
            for pkg in repo.fetch_packages_meta(component):
                c.execute("SELECT count(*) as count FROM packages WHERE name=? AND version=? AND arch=?;",
                          (pkg["Package"], pkg["Version"], pkg["Architecture"], ))
                if c.fetchone()['count'] > 0:
                    continue
                new_packages = True
                c.execute("INSERT INTO packages (name, version, arch, fname, sha256, metadata) VALUES (?, ?, ?, ?, ?, ?);",
                          (pkg["Package"], pkg["Version"], pkg["Architecture"], os.path.basename(pkg["Filename"]),
                           pkg["SHA256"], pkg.as_string()[0:-2], ))
    if new_packages:
        db.commit()


def _ingest_debs(db, repo, database_dir):
    """
    Download every package that has no local file yet into <database_dir>/files/<first sha256 char>/
    """
    #TODO parallelize downloads
    with closing(db.cursor()) as c:
        c.execute("SELECT count(*) as count FROM packages WHERE has_file=0;")
        print("need to download {} packages".format(c.fetchone()["count"]))

        c.execute("SELECT * FROM packages WHERE has_file=0;")
        to_download = c.fetchall()
        for row in to_download:
            meta = message_from_string(row["metadata"])
            url = repo.base_url + meta["Filename"]
            local_dir = os.path.join(database_dir, "files", row["sha256"][0])
            os.makedirs(local_dir, exist_ok=True)
            local_path = os.path.join(local_dir, os.path.basename(meta["Filename"]))
            download_file(url, local_path)
            c.execute("UPDATE packages SET has_file=1 WHERE sha256=?;", (row["sha256"], ))
            # commit after every file so progress survives an interrupted run
            db.commit()
def cmd_mirror(args, parser):
    """
    Build a repo out of the package database (not implemented yet, currently a no-op)

    The repo is meant to contain one of:
    - all the packages from the db
    - a subset of packages based on some query
    - a subset of packages matching an existing repo
    """
    # planned steps:
    #  1. filter the packages
    #  2. build the metadata files
    #  3. sign the files
    #  4. put the packages in place
    return None
def main(argv=None):
    """
    Command line entry point

    `argv` is the list of arguments to parse; when None, argparse falls back to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description="apt repo mirroring tool")
    parser.add_argument("--database", required=True, help="package database path")

    # required=True: without it, running with no action leaves args.func unset and crashes below
    sp_action = parser.add_subparsers(dest="action", required=True, help="action to take")

    p_ingest = sp_action.add_parser("ingest", help="import packages from existing repos")
    p_ingest.set_defaults(func=cmd_ingest)

    ingest_source = p_ingest.add_mutually_exclusive_group(required=True)
    ingest_source.add_argument("--line", help="import packages from a single apt sources.list source")
    ingest_source.add_argument("--file", help="import packages all sources in the given sources.list file")

    ingest_method = p_ingest.add_mutually_exclusive_group()
    ingest_method.add_argument("--meta", action="store_true", help="only import metadata")
    ingest_method.add_argument("--debs", action="store_true", help="only download packages")

    args = parser.parse_args(argv)
    args.func(args, parser)


if __name__ == '__main__':
    main()

377
pydebmirror/cli2.py Normal file
View File

@ -0,0 +1,377 @@
import os
import sqlite3
import requests
import gzip
import argparse
import tempfile
from contextlib import closing
from email import message_from_string
from dataclasses import dataclass
def dict_factory(c, row):
    """
    Row factory for sqlite3 connections that yields dicts mapping column name -> value
    """
    # the column name is the first element of every c.description entry
    return {meta[0]: row[i] for i, meta in enumerate(c.description)}
def get_db(db_file):
    """
    Connect to the sqlite database at `db_file`, create the tables if they are missing, and return the
    connection. Rows are returned as dicts (see dict_factory).
    """
    schema = (
        # packages is the pool of all deb packages
        """CREATE TABLE IF NOT EXISTS 'packages' (
'name' TEXT,
'version' TEXT,
'arch' TEXT,
'fname' TEXT,
'sha256' TEXT,
'has_file' BOOLEAN DEFAULT(0),
'metadata' TEXT, -- from ubuntu/dists/focal/main/binary-amd64/Packages.gz"
UNIQUE(name, version, arch),
UNIQUE(fname),
UNIQUE(sha256)
)""",
        # repo_package is a mapping of package -> dist,component,arch
        """CREATE TABLE IF NOT EXISTS 'repo_package' (
'dist' TEXT,
'component' TEXT,
'arch' TEXT,
'name' TEXT,
'version' TEXT,
UNIQUE(dist, component, arch, name, version)
)""",
    )

    connection = sqlite3.connect(db_file)
    connection.row_factory = dict_factory

    with closing(connection.cursor()) as cursor:
        for statement in schema:
            cursor.execute(statement)

    return connection
def parse_dist_release(data):
    """
    Return the file paths listed in the SHA256 section of a dist's Release file

    `data` is the Release file text. Every non-empty line of the SHA256 field must consist of exactly three
    whitespace-separated values (hash, size, path); only the path is kept.
    """
    release = message_from_string(data)
    # the field value starts with a blank line, which the `if entry` filter drops
    entries = (entry.split() for entry in release["SHA256"].split("\n") if entry)
    return [path for _digest, _size, path in entries]
def fetch_packages_file(url):
    """
    Fetch a Packages metadata file and return its decoded text

    `url` is the extension-less URL, e.g.
    http://archive.ubuntu.com/ubuntu/dists/focal/main/binary-amd64/Packages

    try the .gz extension first
    then the .xz extension
    then the plain file

    Raises requests.HTTPError when a variant fails with an error status other than 404, or when the final
    (plain) request fails.
    """
    # local import: lzma is only needed here and is not imported at the top of the file
    import lzma

    for extension, decompress in ((".gz", gzip.decompress), (".xz", lzma.decompress)):
        resp = requests.get(url + extension)
        if resp.status_code == 200:
            return decompress(resp.content).decode("utf-8")
        # a 404 just means this variant is not published; anything else is a real error
        if resp.status_code != 404:
            resp.raise_for_status()

    resp = requests.get(url)
    resp.raise_for_status()
    # decode explicitly so the plain path agrees with the compressed paths instead of using the
    # encoding requests guesses from the response
    return resp.content.decode("utf-8")
class Repoline:
    """
    Repoline represents one line in an apt sources.list file
    """
    def __init__(self, *, base_url, arch, dist, components):
        self.base_url = base_url  # repository root; parse() guarantees a trailing "/"
        self.arch = arch
        self.dist = dist
        self.components = components

    def get_packages(self):
        """
        Download the package metadata of every component

        Returns a dict mapping component name -> list of email.message.Message, one per package.
        Raises requests.HTTPError when the dist's Release file or a Packages file cannot be fetched.
        """
        packages = {}

        dist_path = "{}dists/{}/".format(self.base_url, self.dist)

        # The Release file's contents are not used; the request only fails early when the dist is missing.
        # Its file listing is deliberately ignored because http://archive.ubuntu.com/ lists the plain
        # "Packages" file even though only the .gz or .xz is available, so fetch_packages_file probes instead.
        requests.get("{}Release".format(dist_path)).raise_for_status()

        for component in self.components:
            # main/binary-amd64/Packages
            component_prefix = "{}/binary-{}/Packages".format(component, self.arch)
            text = fetch_packages_file("{}{}".format(dist_path, component_prefix))
            # stanzas are separated by blank lines; skip empty chunks instead of blindly dropping the last one
            packages[component] = [message_from_string(p) for p in text.split("\n\n") if p.strip()]

        return packages

    @staticmethod
    def parse(line) -> "Repoline":
        """
        Parse 'deb [arch=xxx] http://archive.ubuntu.com/ubuntu/ focal main restricted'

        The bracketed options block is optional and may span several whitespace-separated tokens.
        Raises Exception when the line does not start with 'deb' and ValueError when it is otherwise malformed.
        """
        fields = line.split()
        if not fields:
            raise ValueError("expected a deb line but got an empty line")

        # discard the 'deb' prefix
        if fields[0] != "deb":
            raise Exception("expected deb line to start with 'deb' but got '{}'".format(fields[0]))
        fields.pop(0)

        #TODO parse or require arch
        # discard '[arch=xxx ...]', consuming tokens up to and including the one that closes the bracket
        if fields and fields[0].startswith("["):
            while fields:
                if fields.pop(0).endswith("]"):
                    break
            else:
                raise ValueError("unterminated '[' options block in deb line")

        # assume amd64 for now
        arch = "amd64"

        if len(fields) < 2:
            raise ValueError("expected deb line to contain a url and a dist")

        # now we have the base url, the dist, and whatever is left is the component list
        base_url, dist, *components = fields
        if not base_url.endswith("/"):
            base_url = base_url + "/"

        return Repoline(base_url=base_url, arch=arch, dist=dist, components=components)
class Repo(object):
    """
    A local package pool: a sqlite database (<path>/packages.db) plus the downloaded files (<path>/files/)
    """
    def __init__(self, path):
        self.db_path = path
        # create the repo directory on first use; an empty path means the current directory
        if path:
            os.makedirs(path, exist_ok=True)
        self.db = get_db(os.path.join(path, "packages.db"))
        self.dists = {}

    def cursor(self):
        """
        Return a new cursor on the repo's database
        """
        return self.db.cursor()

    def get_dist(self, name):
        """
        Return the Dist object for `name`, creating and caching it on first use
        """
        if dist := self.dists.get(name):
            return dist
        dist = Dist(self, name)
        self.dists[name] = dist
        return dist

    def import_source_metadata(self, line):
        """
        Record the package metadata published by the source `line` (a Repoline) in the database
        """
        packages = line.get_packages()

        dist = self.get_dist(line.dist)

        with closing(self.db.cursor()) as c:
            for component_name in line.components:
                component = dist.get_component(component_name)
                arch = component.get_arch(line.arch)
                for package in packages[component_name]:
                    arch.add_package(c, package)

        # always commit: add_package writes the repo_package mapping even for packages that are already in
        # the pool, so committing only when a new package was added would lose those rows
        self.db.commit()

    def import_source_packages(self, line):
        """
        Download every package that has no local file yet, using `line.base_url` as the download root
        """
        #TODO parallelize downloads
        with closing(self.db.cursor()) as c:
            c.execute("SELECT * FROM packages WHERE has_file=0;")
            to_download = c.fetchall()
            for row in to_download:
                metadata = message_from_string(row["metadata"])
                print("downloading", metadata["Package"], "@", metadata["Version"])
                self.add_file(os.path.basename(metadata["Filename"]),
                              metadata["SHA256"],
                              url=line.base_url + metadata["Filename"])
                c.execute("UPDATE packages SET has_file=1 WHERE name=? AND version=? AND arch=?;",
                          (metadata["Package"], metadata["Version"], metadata["Architecture"], ))
                # commit after every file so progress survives an interrupted run
                self.db.commit()

    def add_file(self, filename, sha256, fpath=None, url=None):
        """
        Acquire a package file and store it at <repo>/files/<first sha256 char>/<filename>

        Exactly one of `fpath` / `url` must be given; only `url` is supported so far. Files that already
        exist locally are skipped. Raises Exception on invalid arguments and requests.HTTPError when the
        download fails.
        """
        if (fpath and url) or (not fpath and not url):
            raise Exception("must specify fpath or url but not both")

        if fpath:
            raise Exception("fpath not supported yet, use url")

        local_dir = os.path.join(self.db_path, "files", sha256[0])
        local_path = os.path.join(local_dir, filename)

        if os.path.exists(local_path):  # skip files we already have
            return

        os.makedirs(local_dir, exist_ok=True)

        # Stage the download inside the destination directory: the final rename then never crosses a
        # filesystem boundary (renaming out of the system temp dir fails when it is a separate mount)
        with tempfile.TemporaryDirectory(dir=local_dir) as tmp:
            ftmp = os.path.join(tmp, "ftmp")
            with requests.get(url, stream=True) as resp:
                resp.raise_for_status()
                with open(ftmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=256 * 1024):
                        f.write(chunk)
            os.replace(ftmp, local_path)
class Dist(object):
    """
    A named dist inside a Repo; keeps a cache of the Component objects created for it
    """
    def __init__(self, repo, name):
        self.components = {}
        self.name = name
        self.repo = repo

    def get_component(self, name):
        """
        Return the Component called `name`, creating and caching it when it is not known yet
        """
        found = self.components.get(name)
        if not found:
            found = self.components[name] = Component(self, name)
        return found
class Component(object):
    """
    A named component of a Dist; keeps a cache of the Arch objects created for it
    """
    def __init__(self, dist, name):
        self.arches = {}
        self.name = name
        self.dist = dist

    def get_arch(self, name):
        """
        Return the Arch called `name`, creating and caching it when it is not known yet
        """
        found = self.arches.get(name)
        if not found:
            found = self.arches[name] = Arch(self, name)
        return found
class Arch(object):
    """
    A named architecture of a Component; the level at which packages are attached to a dist
    """
    def __init__(self, component, name):
        self.name = name
        self.component = component

    def add_package(self, c, metadata):
        """
        Record a package using the cursor `c`

        The package is added to the pool (packages table) when it is not there yet, and is always mapped to
        this dist/component/arch (repo_package table). `metadata` is the package's parsed Packages stanza.

        Returns True when the package was new to the pool (its file is still needed), False otherwise.
        """
        identity = (metadata["Package"], metadata["Version"], metadata["Architecture"], )

        # insert the package into the pool unless it is already known
        c.execute("SELECT * FROM packages WHERE name=? AND version=? AND arch=?;", identity)
        existing = c.fetchone()
        if not existing:
            pool_row = identity + (os.path.basename(metadata["Filename"]),
                                   metadata["SHA256"],
                                   metadata.as_string()[0:-2], )
            c.execute("INSERT INTO packages (name, version, arch, fname, sha256, metadata) VALUES (?, ?, ?, ?, ?, ?);",
                      pool_row)

        # insert the package into the dist
        mapping_row = (self.component.dist.name,
                       self.component.name,
                       self.name,
                       metadata["Package"],
                       metadata["Version"], )
        c.execute("REPLACE INTO repo_package (dist, component, arch, name, version) VALUES (?, ?, ?, ?, ?);",
                  mapping_row)

        return not existing
def cmd_mirror(args, parser):
    """
    Build a repo out of the package database (not implemented yet, currently a no-op)

    The repo is meant to contain one of:
    - all the packages from the db
    - a subset of packages based on some query
    - a subset of packages matching an existing repo
    """
    # planned steps:
    #  1. filter the packages
    #  2. build the metadata files
    #  3. sign the files
    #  4. put the packages in place
    return None
def cmd_import(args, parser):
    """
    Handler for the "ingest" action: import metadata and/or .deb files from the source line in args.line

    --meta limits the run to the metadata phase, --debs limits it to the download phase.
    """
    if not args.line:
        print("--file not yet supported")
        return

    source = Repoline.parse(args.line)
    repo = Repo(args.database)

    # phase 1, get metadata (skipped when only the debs were asked for)
    if not args.debs:
        repo.import_source_metadata(source)

    # phase 2, get the .deb files (skipped when only the metadata was asked for)
    if not args.meta:
        repo.import_source_packages(source)
def main(argv=None):
    """
    Command line entry point

    `argv` is the list of arguments to parse; when None, argparse falls back to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(description="apt repo mirroring tool")
    parser.add_argument("--database", required=True, help="package database path")

    # required=True: without it, running with no action leaves args.func unset and crashes below
    sp_action = parser.add_subparsers(dest="action", required=True, help="action to take")

    p_ingest = sp_action.add_parser("ingest", help="import packages from existing repos")
    p_ingest.set_defaults(func=cmd_import)

    ingest_source = p_ingest.add_mutually_exclusive_group(required=True)
    ingest_source.add_argument("--line", help="import packages from a single apt sources.list source")
    ingest_source.add_argument("--file", help="import packages all sources in the given sources.list file")

    ingest_method = p_ingest.add_mutually_exclusive_group()
    ingest_method.add_argument("--meta", action="store_true", help="only import metadata")
    ingest_method.add_argument("--debs", action="store_true", help="only download packages")

    args = parser.parse_args(argv)
    args.func(args, parser)
# r = Repo("./testrepo/")
# r = Repo("./testef/")
# focal = r.get_dist("focal")
# focal_main = focal.get_component("main")
# focal_main_x64 = focal_main.get_arch("binary-amd64")
# focal_main_x64.add_package("x", "y")
# r.deploy(path="./www/")
# r.import_source('deb http://archive.ubuntu.com/ubuntu/ focal main restricted')
# r.import_source('deb http://artifact.scc.net.davepedu.com/repo/apt/extpython/ focal main')
# import pdb
# pdb.set_trace()
# pass

29
setup.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env python3
# Packaging script for pydebmirror; installs the `pydebmirror` and `pydebmirror2` console commands.
from setuptools import setup


__version__ = "0.0.0"

setup(name='pydebmirror',
      version=__version__,
      description='Debian repository management tool',
      url='',
      author='dpedu',
      author_email='dave@davepedu.com',
      packages=['pydebmirror'],
      # the code uses builtin generics (list[str]) and the walrus operator
      python_requires='>=3.9',
      # both cli modules import requests for all HTTP access
      install_requires=['requests'],
      entry_points={
          "console_scripts": [
              "pydebmirror = pydebmirror.cli:main",
              "pydebmirror2 = pydebmirror.cli2:main",
          ]
      },
      )