From 40965d9775bf0e7453cc3915bee0011705178593 Mon Sep 17 00:00:00 2001 From: dave Date: Thu, 17 Nov 2022 19:46:32 -0800 Subject: [PATCH] initial commit - repo databasing --- pydebmirror/__init__.py | 0 pydebmirror/cli.py | 217 +++++++++++++++++++++++ pydebmirror/cli2.py | 377 ++++++++++++++++++++++++++++++++++++++++ setup.py | 29 ++++ 4 files changed, 623 insertions(+) create mode 100644 pydebmirror/__init__.py create mode 100644 pydebmirror/cli.py create mode 100644 pydebmirror/cli2.py create mode 100644 setup.py diff --git a/pydebmirror/__init__.py b/pydebmirror/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pydebmirror/cli.py b/pydebmirror/cli.py new file mode 100644 index 0000000..5824b63 --- /dev/null +++ b/pydebmirror/cli.py @@ -0,0 +1,217 @@ +import os +import sqlite3 +import requests +import gzip +import argparse +from contextlib import closing +from email import message_from_string +from dataclasses import dataclass + + +def dict_factory(c, row): + d = {} + for idx, col in enumerate(c.description): + d[col[0]] = row[idx] + return d + + +def get_db(db_path): + db_file = os.path.join(db_path, "packages.db") + db = sqlite3.connect(db_file) + db.row_factory = dict_factory + + queries = [ + """CREATE TABLE IF NOT EXISTS 'packages' ( + 'name' TEXT, + 'version' TEXT, + 'arch' TEXT, + 'fname' TEXT, + 'sha256' TEXT, + 'has_file' BOOLEAN DEFAULT(0), + 'metadata' TEXT, -- from ubuntu/dists/focal/main/binary-amd64/Packages.gz" + UNIQUE(name, version, arch), + UNIQUE(fname), + UNIQUE(sha256) + )""", + ] + + with closing(db.cursor()) as c: + for query in queries: + c.execute(query) + + return db + + +def request_packages(url): + """ + the "Packages" metadata file may be plain, or with the .gz or .xz extension. 
This method requests each until the correct path is found + """ + gzip.decompress(request_packages(url).content).decode().split("\n\n")[0:-1] + return requests.get(url) + + +@dataclass +class Repoline: + """ + Repoline represents one line in an apt sources.list file + """ + base_url: str + arch: str + dist: str + components: list[str] + + @property + def packages_urls(self): + """ + URL to the 'Packages.gz' metadata file for each component + e.g. http://archive.ubuntu.com/ubuntu/dists/focal/main/binary-amd64/Packages.gz + """ + urls = {} + for component in self.components: + urls[component] = "{}dists/{}/{}/binary-{}/Packages" \ + .format(self.base_url, self.dist, component, self.arch) + return urls + + def fetch_packages_meta(self, component): + url = self.packages_urls[component] + data = request_packages(url).split("\n\n")[0:-1] + return [message_from_string(p) for p in data] + + @staticmethod + def parse(line) -> "Repoline": + """ + Parse 'deb [arch=xxx] http://archive.ubuntu.com/ubuntu/ focal main restricted' + """ + line = line.split() + + # discard the 'deb' prefix + if line[0] != "deb": + raise Exception("expected deb line to start with 'deb' but got '{}'".format(line[0])) + line.pop(0) + + #TODO parse or require arch + # discard '[arch=xxx]' + if line[0].startswith("["): + line.pop(0) + + # assume amd64 for now + arch = "amd64" + + # now we have the base url + base_url = line.pop(0) + if not base_url.endswith("/"): + base_url = base_url + "/" + + # and the dist + dist = line.pop(0) + + return Repoline(base_url=base_url, arch=arch, dist=dist, components=line) + + +def download_file(url, local_path): + print("downloading", url) + # print(local_path) + # import pdb + # pdb.set_trace() + # pass + + with open(local_path, "wb") as f: + resp = requests.get(url, stream=True) + resp.raise_for_status() + for chunk in resp.iter_content(): + f.write(chunk) + + +def cmd_ingest(args, parser): + if not args.line: + print("--file not yet supported") + return + + repo = 
Repoline.parse(args.line)
+    db = get_db(args.database)
+
+    # phase 1: record each package's metadata stanza unless already present
+    if not args.debs:
+        with closing(db.cursor()) as c:
+            new_packages = False
+            for component in repo.components:
+                print("fetching", component)
+                for pkg in repo.fetch_packages_meta(component):
+
+                    c.execute("SELECT count(*) as count FROM packages WHERE name=? AND version=? AND arch=?;",
+                              (pkg["Package"], pkg["Version"], pkg["Architecture"], ))
+
+                    if c.fetchone()['count'] > 0:
+                        continue
+
+                    new_packages = True
+                    # as_string() ends with a blank line; [0:-2] presumably trims it -- TODO confirm
+                    c.execute("INSERT INTO packages (name, version, arch, fname, sha256, metadata) VALUES (?, ?, ?, ?, ?, ?);",
+                              (pkg["Package"], pkg["Version"], pkg["Architecture"], os.path.basename(pkg["Filename"]),
+                               pkg["SHA256"], pkg.as_string()[0:-2], ))
+
+            if new_packages:
+                c.execute("COMMIT")
+
+    # phase 2: download any .deb we have metadata for but no file yet;
+    # files are sharded into subdirs by the first hex digit of their sha256
+    if not args.meta:
+        #TODO parallelize downloads
+        with closing(db.cursor()) as c:
+            c.execute("SELECT count(*) as count FROM packages WHERE has_file=0;")
+            print("need to download {} packages".format(c.fetchone()["count"]))
+            c.execute("SELECT * FROM packages WHERE has_file=0;")
+            to_download = c.fetchall()
+
+            for row in to_download:
+                meta = message_from_string(row["metadata"])
+                url = repo.base_url + meta["Filename"]
+
+                local_dir = os.path.join(args.database, "files", row["sha256"][0])
+                os.makedirs(local_dir, exist_ok=True)
+                local_path = os.path.join(local_dir, os.path.basename(meta["Filename"]))
+
+                download_file(url, local_path)
+
+                c.execute("UPDATE packages SET has_file=1 WHERE sha256=?;", (row["sha256"], ))
+            # NOTE(review): COMMIT is issued once after all downloads; an
+            # interrupted run re-downloads everything -- confirm intended
+            c.execute("COMMIT")
+
+
+def cmd_mirror(args, parser):
+    """
+    Create a repo
+    - containing all the packages from the db
+    - containing a subset of packages based on some query
+    - containing a subset of packages matching an existing repo
+    """
+
+    # filter the packages
+
+    # build the metadata files
+
+    # sign the files
+
+    # put the packages in place
+
+    pass
+
+
+def main():
+    # NOTE(review): invoking with no subcommand leaves args.func unset and
+    # args.func(...) raises AttributeError -- confirm whether intended
+    parser = argparse.ArgumentParser(description="apt repo mirroring tool")
+    parser.add_argument("--database", required=True, help="package database path")
+
+    sp_action = parser.add_subparsers(dest="action", help="action to take")
+    p_ingest = sp_action.add_parser("ingest", help="import packages from existing repos")
+    p_ingest.set_defaults(func=cmd_ingest)
+
+    ingest_source = p_ingest.add_mutually_exclusive_group(required=True)
+    ingest_source.add_argument("--line", help="import packages from a single apt sources.list source")
+    ingest_source.add_argument("--file", help="import packages all sources in the given sources.list file")
+
+    ingest_method = p_ingest.add_mutually_exclusive_group()
+    ingest_method.add_argument("--meta", action="store_true", help="only import metadata")
+    ingest_method.add_argument("--debs", action="store_true", help="only download packages")
+
+    args = parser.parse_args()
+
+    args.func(args, parser)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pydebmirror/cli2.py b/pydebmirror/cli2.py
new file mode 100644
index 0000000..03b4a64
--- /dev/null
+++ b/pydebmirror/cli2.py
@@ -0,0 +1,377 @@
+import os
+import sqlite3
+import requests
+import gzip
+import argparse
+import tempfile
+from contextlib import closing
+from email import message_from_string
+from dataclasses import dataclass
+
+
+def dict_factory(c, row):
+    # sqlite3 row factory: expose each row as a {column_name: value} dict
+    d = {}
+    for idx, col in enumerate(c.description):
+        d[col[0]] = row[idx]
+    return d
+
+
+def get_db(db_file):
+    # open db_file directly (unlike cli.py's get_db, which takes a directory)
+    db = sqlite3.connect(db_file)
+    db.row_factory = dict_factory
+
+    queries = [
+        # packages is the pool of all deb packages
+        """CREATE TABLE IF NOT EXISTS 'packages' (
+            'name' TEXT,
+            'version' TEXT,
+            'arch' TEXT,
+            'fname' TEXT,
+            'sha256' TEXT,
+            'has_file' BOOLEAN DEFAULT(0),
+            'metadata' TEXT, -- from ubuntu/dists/focal/main/binary-amd64/Packages.gz"
+            UNIQUE(name, version, arch),
+            UNIQUE(fname),
+            UNIQUE(sha256)
+        )""",
+        # repo_package is a mapping of package -> dist,component,arch
+        """CREATE TABLE IF NOT EXISTS 'repo_package' (
+            'dist' TEXT,
+            'component' TEXT,
+            'arch' TEXT,
+            'name' TEXT,
+            'version' TEXT,
+            
UNIQUE(dist, component, arch, name, version)
+        )""",
+    ]
+
+    with closing(db.cursor()) as c:
+        for query in queries:
+            c.execute(query)
+
+    return db
+
+
+def parse_dist_release(data):
+    # extract the file paths listed in a dist Release file's SHA256 section
+    body = message_from_string(data)
+    files = []
+
+    for line in body["SHA256"].split("\n"):
+        if not line:
+            continue  # 1st line is blank
+        hash_, size, path = line.split()
+        files.append(path)
+
+    return files
+
+
+def fetch_packages_file(url):
+    """
+    http://archive.ubuntu.com/ubuntu/dists/focal/main/binary-amd64/Packages
+    try the .gz extension first
+    then the plain file
+    then raise an error because we don't support .xz
+    """
+    resp = requests.get(url + ".gz")
+
+    if resp.status_code == 200:
+        return gzip.decompress(resp.content).decode()
+
+    if resp.status_code != 404:
+        resp.raise_for_status()
+
+    resp = requests.get(url)
+    resp.raise_for_status()
+
+    #TODO support the .xz Packages.xz format
+    return resp.text
+
+
+class Repoline:
+    """
+    Repoline represents one line in an apt sources.list file
+    """
+    def __init__(self, *, base_url, arch, dist, components):
+        self.base_url = base_url
+        self.arch = arch
+        self.dist = dist
+        self.components = components
+
+    def get_packages(self):
+        # returns {component_name: [Message, ...]} of parsed package stanzas
+        packages = {}
+
+        # get the Release file
+        # lol we don't actually use it
+        dist_path = "{}dists/{}/".format(self.base_url, self.dist)
+        r = requests.get("{}Release".format(dist_path))
+        r.raise_for_status()
+        # release = r.text
+
+        # parse out each component's Packages/.gz/.xz file
+        # files = parse_dist_release(release)
+
+        for component in self.components:
+            # main/binary-amd64/Packages.gz
+            component_prefix = "{}/binary-{}/Packages".format(component, self.arch)
+
+            # disabled because http://archive.ubuntu.com/ will list the plain "Packages" file even tho only the .gz or
+            # .xz is available (lol)
+            # find the packages file as it could be one of multiple extensions
+            # packages_file = None
+            # for fname in files:
+            #     if fname.startswith(component_prefix):
+            #         print("check", fname)
+            #         packages_file = fname
+            #         break
+            # if not packages_file:
+            #     raise Exception("couldn't find packages file for component: {}".format(component))
+            # packages_data = fetch_packages_file("{}{}".format(dist_path, packages_file))
+
+            # fetch the packages file; stanzas are blank-line separated and
+            # the final split element is the trailing empty string
+            packages[component] = [
+                message_from_string(p)
+                for p in fetch_packages_file("{}{}".format(dist_path, component_prefix)).split("\n\n")[0:-1]
+            ]
+
+        return packages
+
+    @staticmethod
+    def parse(line) -> "Repoline":
+        """
+        Parse 'deb [arch=xxx] http://archive.ubuntu.com/ubuntu/ focal main restricted'
+        """
+        line = line.split()
+
+        # discard the 'deb' prefix
+        if line[0] != "deb":
+            raise Exception("expected deb line to start with 'deb' but got '{}'".format(line[0]))
+        line.pop(0)
+
+        #TODO parse or require arch
+        # discard '[arch=xxx]'
+        if line[0].startswith("["):
+            line.pop(0)
+
+        # assume amd64 for now
+        arch = "amd64"
+
+        # now we have the base url
+        base_url = line.pop(0)
+        if not base_url.endswith("/"):
+            base_url = base_url + "/"
+
+        # and the dist
+        dist = line.pop(0)
+
+        return Repoline(base_url=base_url, arch=arch, dist=dist, components=line)
+
+
+class Repo(object):
+    # on-disk repo: <path>/packages.db plus <path>/files/<sha-prefix>/<fname>
+    def __init__(self, path):
+        self.db_path = path
+        self.db = get_db(os.path.join(path, "packages.db"))
+        self.dists = {}
+
+    def cursor(self):
+        return self.db.cursor()
+
+    def get_dist(self, name):
+        # cache Dist objects per name
+        if dist := self.dists.get(name):
+            return dist
+        dist = Dist(self, name)
+        self.dists[name] = dist
+        return dist
+
+    def import_source_metadata(self, line):
+        # ingest all package stanzas from one sources.list line into the db
+        packages = line.get_packages()
+        dist = self.get_dist(line.dist)
+        dirty = False
+        with closing(self.db.cursor()) as c:
+            for component_name in line.components:
+                component = dist.get_component(component_name)
+                arch = component.get_arch(line.arch)
+                for package in packages[component_name]:
+                    dirty = arch.add_package(c, package) or dirty
+
+            if dirty:
+                c.execute("COMMIT")
+
+    def import_source_packages(self, line):
+        #TODO parallelize downloads
+        with closing(self.db.cursor()) as c:
+            c.execute("SELECT * FROM packages WHERE has_file=0;")
+            to_download = c.fetchall()
+            for row in to_download:
+                metadata = message_from_string(row["metadata"])
+                print("downloading", metadata["Package"], "@", metadata["Version"])
+
+                # Message lookups are case-insensitive, so "sha256" reads
+                # the stanza's SHA256 field
+                self.add_file(os.path.basename(metadata["Filename"]),
+                              metadata["sha256"],
+                              url=line.base_url + metadata["Filename"])
+
+                c.execute("UPDATE packages SET has_file=1 WHERE name=? AND version=? AND arch=?;",
+                          (metadata["Package"], metadata["Version"], metadata["Architecture"], ))
+            c.execute("COMMIT")
+
+    def add_file(self, filename, sha256, fpath=None, url=None):
+        # acquire the file and move it into the repo's sha path
+        if (fpath and url) or (not fpath and not url):
+            raise Exception("must specify fpath or url but not both")
+
+        if fpath:
+            raise Exception("fpath not supported yet, use url")
+
+        local_dir = os.path.join(self.db_path, "files", sha256[0])
+        local_path = os.path.join(local_dir, filename)
+
+        if os.path.exists(local_path):  # skip files we already have
+            return
+
+        # download to a temp file first so a failed transfer never leaves a
+        # partial file at local_path
+        with tempfile.TemporaryDirectory() as tmp:
+            ftmp = os.path.join(tmp, "ftmp")
+            with open(ftmp, "wb") as f:
+                resp = requests.get(url, stream=True)
+                resp.raise_for_status()
+                for chunk in resp.iter_content(chunk_size=256 * 1024):
+                    f.write(chunk)
+
+            os.makedirs(local_dir, exist_ok=True)
+            # NOTE(review): os.rename fails (EXDEV) if the system temp dir is
+            # on a different filesystem than db_path -- consider shutil.move
+            os.rename(ftmp, local_path)
+
+
+class Dist(object):
+    # one distribution (e.g. "focal") within a Repo
+    def __init__(self, repo, name):
+        self.repo = repo
+        self.name = name
+        self.components = {}
+
+    def get_component(self, name):
+        # cache Component objects per name
+        if component := self.components.get(name):
+            return component
+        component = Component(self, name)
+        self.components[name] = component
+        return component
+
+
+class Component(object):
+    # one component (e.g. "main") within a Dist
+    def __init__(self, dist, name):
+        self.dist = dist
+        self.name = name
+        self.arches = {}
+
+    def get_arch(self, name):
+        # cache Arch objects per name
+        if arch := self.arches.get(name):
+            return arch
+        arch = Arch(self, name)
+        self.arches[name] = arch
+        return arch
+
+
+class Arch(object):
+    # one architecture (e.g. "amd64") within a Component
+    def __init__(self, component, name):
+        
self.component = component
+        self.name = name
+
+    def add_package(self, c, metadata):
+        # insert the package into the pool
+        # returns True when the package was newly inserted (its file still
+        # needs downloading), False when it was already known
+        c.execute("SELECT * FROM packages WHERE name=? AND version=? AND arch=?;",
+                  (metadata["Package"], metadata["Version"], metadata["Architecture"], ))
+
+        row = c.fetchone()
+
+        if not row:
+            c.execute("INSERT INTO packages (name, version, arch, fname, sha256, metadata) VALUES (?, ?, ?, ?, ?, ?);",
+                      (metadata["Package"],
+                       metadata["Version"],
+                       metadata["Architecture"],
+                       os.path.basename(metadata["Filename"]),
+                       metadata["SHA256"],
+                       metadata.as_string()[0:-2], ))
+
+        # insert the package into the dist (REPLACE makes this idempotent)
+        c.execute("REPLACE INTO repo_package (dist, component, arch, name, version) VALUES (?, ?, ?, ?, ?);",
+                  (self.component.dist.name,
+                   self.component.name,
+                   self.name,
+                   metadata["Package"],
+                   metadata["Version"], ))
+
+        if row:
+            return False
+
+        return True
+
+
+def cmd_mirror(args, parser):
+    """
+    Create a repo
+    - containing all the packages from the db
+    - containing a subset of packages based on some query
+    - containing a subset of packages matching an existing repo
+    """
+
+    # filter the packages
+
+    # build the metadata files
+
+    # sign the files
+
+    # put the packages in place
+
+    pass
+
+
+def cmd_import(args, parser):
+    if not args.line:
+        print("--file not yet supported")
+        return
+
+    line = Repoline.parse(args.line)
+
+    r = Repo(args.database)
+
+    # phase 1, get metadata
+    if not args.debs:
+        r.import_source_metadata(line)
+
+    # phase 2, get the .deb files
+    if not args.meta:
+        r.import_source_packages(line)
+
+
+def main():
+    # NOTE(review): invoking with no subcommand leaves args.func unset and
+    # args.func(...) raises AttributeError -- confirm whether intended
+    parser = argparse.ArgumentParser(description="apt repo mirroring tool")
+    parser.add_argument("--database", required=True, help="package database path")
+
+    sp_action = parser.add_subparsers(dest="action", help="action to take")
+    p_ingest = sp_action.add_parser("ingest", help="import packages from existing repos")
+    p_ingest.set_defaults(func=cmd_import)
+
+    ingest_source = p_ingest.add_mutually_exclusive_group(required=True)
+    ingest_source.add_argument("--line", help="import packages from a single apt sources.list source")
+    ingest_source.add_argument("--file", help="import packages all sources in the given sources.list file")
+
+    ingest_method = p_ingest.add_mutually_exclusive_group()
+    ingest_method.add_argument("--meta", action="store_true", help="only import metadata")
+    ingest_method.add_argument("--debs", action="store_true", help="only download packages")
+
+    args = parser.parse_args()
+
+    args.func(args, parser)
+
+    # r = Repo("./testrepo/")
+    # r = Repo("./testef/")
+
+    # focal = r.get_dist("focal")
+    # focal_main = focal.get_component("main")
+    # focal_main_x64 = focal_main.get_arch("binary-amd64")
+    # focal_main_x64.add_package("x", "y")
+    # r.deploy(path="./www/")
+
+    # r.import_source('deb http://archive.ubuntu.com/ubuntu/ focal main restricted')
+    # r.import_source('deb http://artifact.scc.net.davepedu.com/repo/apt/extpython/ focal main')
+
+    # import pdb
+    # pdb.set_trace()
+    # pass
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c14240e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+from setuptools import setup
+
+
+__version__ = "0.0.0"
+
+
+setup(name='pydebmirror',
+      version=__version__,
+      description='Debian repository management tool',
+      url='',
+      author='dpedu',
+      author_email='dave@davepedu.com',
+      packages=['pydebmirror'],
+      install_requires=[],
+      entry_points={
+          "console_scripts": [
+              "pydebmirror = pydebmirror.cli:main",
+              "pydebmirror2 = pydebmirror.cli2:main",
+          ]
+      },
+      # include_package_data=True,
+      # package_data={'photoapp': ['../templates/*.html',
+      #                           '../templates/fragments/*.html',
+      #                           '../styles/dist/*',
+      #                           '../assets/img/*']},
+      # zip_safe=False
+      )