From c56c520950a2d21575df54c91290439fbd771e24 Mon Sep 17 00:00:00 2001 From: "Nathan J. Mehl" Date: Sun, 4 Jun 2017 14:33:06 -0700 Subject: [PATCH] python3 and travis support - python3 compatibility - drop py2.6 support - use email.message rather than rfc822.message - add some initial debug logging - pylint and pep8 fixes - add object properties for file hashes - add a simple cli demo script - add travis for continuous build --- .gitignore | 3 + .travis.yml | 14 +++ README.md | 53 ++++++++- pydpkg/__init__.py | 252 ++++++++++++++++++++++++++++------------ scripts/dpkg-inspect.py | 45 +++++++ setup.py | 46 +++++--- tests/test_dpkg.py | 6 +- 7 files changed, 319 insertions(+), 100 deletions(-) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100755 scripts/dpkg-inspect.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..38aa2d3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target/ +*.egg-info +.cache diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..c046033 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,14 @@ +language: python +python: + - "2.7" + - "3.3" + - "3.4" + - "3.5" +before_install: + - "pip install -U pip" +install: + - "pip install -e .[test]" +script: + - "py.test tests/" + - "pylint pydpkg/" + - "pep8 pydpkg/" diff --git a/README.md b/README.md index c6339bc..fb5ff56 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Build Status](https://travis-ci.org/TheClimateCorporation/python-dpkg.svg?branch=master)](https://travis-ci.org/TheClimateCorporation/python-dpkg) + python-dpkg =========== @@ -14,8 +16,8 @@ This is primarily intended for use on platforms that do not normally ship [python-apt](http://apt.alioth.debian.org/python-apt-doc/) due to licensing restrictions or the lack of a native libapt.so (e.g. macOS) -Currently only tested on Python 2.6 and 2.7. 
Should run on any python2 -distribution that can install the [arpy](https://pypi.python.org/pypi/arpy/) +Currently only tested on CPython 2.7 and 3.5, but at least in theory should run +on any python distribution that can install the [arpy](https://pypi.python.org/pypi/arpy/) library. Installing @@ -26,9 +28,9 @@ the [pip](https://packaging.python.org/installing/) tool: $ pip install pydpkg Collecting pydpkg - Downloading pydpkg-1.0-py2-none-any.whl + Downloading pydpkg-1.1-py2-none-any.whl Installing collected packages: pydpkg - Successfully installed pydpkg-1.0 + Successfully installed pydpkg-1.1 Usage ===== @@ -53,6 +55,28 @@ Read and extract headers Description: testdeb a bogus debian package for testing dpkg builds +Interact directly with the package control message +-------------------------------------------------- + + >>> dp.message + + >>> dp.message.get_content_type() + 'text/plain' + +Get package file fingerprints +----------------------------- + + >>> dp.fileinfo + {'sha256': '547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f', 'sha1': 'a5d28ae2f23e726a797349d7dd5f21baf8aa02b4', 'filesize': 910, 'md5': '149e61536a9fe36374732ec95cf7945d'} + >>> dp.md5 + '149e61536a9fe36374732ec95cf7945d' + >>> dp.sha1 + 'a5d28ae2f23e726a797349d7dd5f21baf8aa02b4' + >>> dp.sha256 + '547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f' + >>> dp.filesize + 910 + Get an arbitrary control header, case-independent ------------------------------------------------- @@ -86,3 +110,24 @@ Use as a cmp function to sort a list of version strings >>> from pydpkg import Dpkg >>> sorted(['0:1.0-test1', '1:0.0-test0', '0:1.0-test2'] , cmp=Dpkg.compare_versions) ['0:1.0-test1', '0:1.0-test2', '1:0.0-test0'] + +Use the `dpkg-inspect.py` script to inspect packages +---------------------------------------------------- + + $ dpkg-inspect.py ~/testdeb*deb + Filename: /Home/n/testdeb_1:0.0.0-test_all.deb + Size: 910 + MD5: 149e61536a9fe36374732ec95cf7945d + SHA1: 
a5d28ae2f23e726a797349d7dd5f21baf8aa02b4 + SHA256: 547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f + Headers: + Package: testdeb + Version: 1:0.0.0-test + Section: base + Priority: extra + Architecture: all + Installed-Size: 0 + Maintainer: Nathan Mehl + Description: testdeb + a bogus debian package for testing dpkg builds + diff --git a/pydpkg/__init__.py b/pydpkg/__init__.py index 0d6411a..865eca3 100644 --- a/pydpkg/__init__.py +++ b/pydpkg/__init__.py @@ -1,35 +1,55 @@ +""" pydpkg: tools for inspecting dpkg archive files in python + without any dependency on libapt +""" + +from __future__ import absolute_import + # stdlib imports +import io +import logging import os import tarfile - -from StringIO import StringIO -from rfc822 import Message from gzip import GzipFile +from hashlib import md5, sha1, sha256 +from email import message_from_string as Message # pypi imports +import six from arpy import Archive REQUIRED_HEADERS = ('package', 'version', 'architecture') +logging.basicConfig() + class DpkgError(Exception): + + """Base error class for pydpkg""" pass class DpkgVersionError(Exception): + + """Corrupt or unparseable version string""" pass class DpkgMissingControlFile(DpkgError): + + """No control file found in control.tar.gz""" pass class DpkgMissingControlGzipFile(DpkgError): + + """No control.tar.gz file found in dpkg file""" pass class DpkgMissingRequiredHeaderError(DpkgError): + + """Corrupt package missing a required header""" pass @@ -37,83 +57,160 @@ class Dpkg(object): """Class allowing import and manipulation of a debian package file.""" - def __init__(self, filename=None): - self.headers = {} - if not isinstance(filename, basestring): + def __init__(self, filename=None, ignore_missing=False, logger=None): + self.filename = os.path.expanduser(filename) + self.ignore_missing = ignore_missing + if not isinstance(self.filename, six.string_types): raise DpkgError('filename argument must be a string') - if not os.path.isfile(filename): + 
if not os.path.isfile(self.filename): raise DpkgError('filename "%s" does not exist', filename) - self.control_str, self._control_headers = self._process_dpkg_file( - filename) - for k in self._control_headers.keys(): - self.headers[k] = self._control_headers[k] + self._log = logger or logging.getLogger(__name__) + self._fileinfo = None + self._control_str = None + self._headers = None + self._message = None def __repr__(self): - return self.control_str + return repr(self.control_str) + + def __str__(self): + return six.text_type(self.control_str) + + @property + def message(self): + """Return an email.Message object containing the package control + structure.""" + if not self._message: + self._message = self._process_dpkg_file(self.filename) + return self._message + + @property + def control_str(self): + """Return the control message as a string""" + if not self._control_str: + self._control_str = self.message.as_string() + return self._control_str + + @property + def headers(self): + """Return the control message headers as a dict""" + if not self._headers: + self._headers = dict(self.message.items()) + return self._headers + + @property + def fileinfo(self): + """Return a dictionary containing md5/sha1/sha256 checksums + and the size in bytes of our target file.""" + if not self._fileinfo: + h_md5 = md5() + h_sha1 = sha1() + h_sha256 = sha256() + with open(self.filename, 'rb') as dpkg_file: + for chunk in iter(lambda: dpkg_file.read(128), b''): + h_md5.update(chunk) + h_sha1.update(chunk) + h_sha256.update(chunk) + self._fileinfo = { + 'md5': h_md5.hexdigest(), + 'sha1': h_sha1.hexdigest(), + 'sha256': h_sha256.hexdigest(), + 'filesize': os.path.getsize(self.filename) + } + return self._fileinfo + + @property + def md5(self): + """Return the md5 hash of our target file""" + return self.fileinfo['md5'] + + @property + def sha1(self): + """Return the sha1 hash of our target file""" + return self.fileinfo['sha1'] + + @property + def sha256(self): + """Return the 
sha256 hash of our target file""" + return self.fileinfo['sha256'] + + @property + def filesize(self): + """Return the size of our target file""" + return self.fileinfo['filesize'] def get_header(self, header): """ case-independent query for a control message header value """ return self.headers.get(header.lower(), '') def compare_version_with(self, version_str): - return Dpkg.compare_versions( - self.get_header('version'), - version_str) + """Compare my version to an arbitrary version""" + return Dpkg.compare_versions(self.get_header('version'), version_str) - def _force_encoding(self, obj, encoding='utf-8'): - if isinstance(obj, basestring): - if not isinstance(obj, unicode): - obj = unicode(obj, encoding) + @staticmethod + def _force_encoding(obj, encoding='utf-8'): + """Enforce uniform text encoding""" + if isinstance(obj, six.string_types): + if not isinstance(obj, six.text_type): + obj = six.text_type(obj, encoding) return obj def _process_dpkg_file(self, filename): - dpkg = Archive(filename) - dpkg.read_all_headers() - - if 'control.tar.gz' not in dpkg.archived_files: + dpkg_archive = Archive(filename) + dpkg_archive.read_all_headers() + try: + control_tgz = dpkg_archive.archived_files[b'control.tar.gz'] + except KeyError: raise DpkgMissingControlGzipFile( 'Corrupt dpkg file: no control.tar.gz file in ar archive.') + self._log.debug('found controlgz: %s', control_tgz) - control_tgz = dpkg.archived_files['control.tar.gz'] - - # have to do an intermediate step because gzipfile doesn't support seek + # have to pass through BytesIO because gzipfile doesn't support seek # from end; luckily control tars are tiny - control_tar_intermediate = GzipFile(fileobj=control_tgz, mode='rb') - tar_data = control_tar_intermediate.read() - sio = StringIO(tar_data) - control_tar = tarfile.open(fileobj=sio) + with GzipFile(fileobj=control_tgz) as gzf: + self._log.debug('opened gzip file: %s', gzf) + with tarfile.open(fileobj=io.BytesIO(gzf.read())) as control_tar: + 
self._log.debug('opened tar file: %s', control_tar) + # pathname in the tar could be ./control, or just control + # (there would never be two control files...right?) + tar_members = [ + os.path.basename(x.name) for x in control_tar.getmembers()] + self._log.debug('got tar members: %s', tar_members) + if 'control' not in tar_members: + raise DpkgMissingControlFile( + 'Corrupt dpkg file: no control file in control.tar.gz') + control_idx = tar_members.index('control') + self._log.debug('got control index: %s', control_idx) + # at last! + control_file = control_tar.extractfile( + control_tar.getmembers()[control_idx]) + self._log.debug('got control file: %s', control_file) + message_body = control_file.read() + # py27 lacks email.message_from_bytes, so... + if isinstance(message_body, bytes): + message_body = message_body.decode('utf-8') + message = Message(message_body) + self._log.debug('got control message: %s', message) - # pathname in the tar could be ./control, or just control - # (there would never be two control files...right?) - tar_members = [os.path.basename(x.name) - for x in control_tar.getmembers()] - if 'control' not in tar_members: - raise DpkgMissingControlFile( - 'Corrupt dpkg file: no control file in control.tar.gz.') - control_idx = tar_members.index('control') - - # at last! 
- control_file = control_tar.extractfile( - control_tar.getmembers()[control_idx]) - - # beware: dpkg will happily let people drop random encodings into the - # control file - control_str = self._force_encoding(control_file.read()) - - # now build the dict - control_file.seek(0) - control_headers = Message(control_file) - - for header in REQUIRED_HEADERS: - if header not in control_headers: + for req in REQUIRED_HEADERS: + if req not in list(map(str.lower, message.keys())): + # a required header is absent: tolerate it only when the + # caller passed ignore_missing, otherwise raise below + if self.ignore_missing: + self._log.debug( + 'Header "%s" not found in control message', req) + continue raise DpkgMissingRequiredHeaderError( - 'Corrupt control section; header: "%s" not found' % header) + 'Corrupt control section; header: "%s" not found' % req) + self._log.debug('all required headers found') - for header in control_headers: - control_headers[header] = self._force_encoding( - control_headers[header]) + for header in message.keys(): + self._log.debug('coercing header to utf8: %s', header) + message.replace_header( + header, self._force_encoding(message[header])) + self._log.debug('all required headers coerced') - return control_str, control_headers + return message @staticmethod def get_epoch(version_str): @@ -152,6 +249,10 @@ class Dpkg(object): @staticmethod def split_full_version(version_str): + """Split a full version string into epoch, upstream version and + debian revision. 
+ :param: version_str + :returns: tuple """ epoch, full_ver = Dpkg.get_epoch(version_str) upstream_rev, debian_rev = Dpkg.get_upstream(full_ver) return epoch, upstream_rev, debian_rev @@ -160,14 +261,12 @@ class Dpkg(object): def get_alphas(revision_str): """Return a tuple of the first non-digit characters of a revision (which may be empty) and the remaining characters.""" - # get the index of the first digit for i, char in enumerate(revision_str): if char.isdigit(): if i == 0: return '', revision_str - else: - return revision_str[0:i], revision_str[i:] + return revision_str[0:i], revision_str[i:] # string is entirely alphas return revision_str, '' @@ -175,17 +274,15 @@ class Dpkg(object): def get_digits(revision_str): """Return a tuple of the first integer characters of a revision (which may be empty) and the remains.""" - + # If the string is empty, return (0,'') if not revision_str: return 0, '' - # get the index of the first non-digit for i, char in enumerate(revision_str): if not char.isdigit(): if i == 0: return 0, revision_str - else: - return int(revision_str[0:i]), revision_str[i:] + return int(revision_str[0:i]), revision_str[i:] # string is entirely digits return int(revision_str), '' @@ -199,12 +296,13 @@ class Dpkg(object): """ result = [] while revision_str: - r1, remains = Dpkg.get_alphas(revision_str) - r2, remains = Dpkg.get_digits(remains) - result.extend([r1, r2]) + rev_1, remains = Dpkg.get_alphas(revision_str) + rev_2, remains = Dpkg.get_digits(remains) + result.extend([rev_1, rev_2]) revision_str = remains return result + # pylint: disable=invalid-name,too-many-return-statements @staticmethod def dstringcmp(a, b): """debian package version string section lexical sort algorithm @@ -241,32 +339,30 @@ class Dpkg(object): # ...except for goddamn tildes if char == '~': return -1 - else: - return 1 + return 1 # if we get here, a is shorter than b but otherwise equal, hence lesser # ...except for goddamn tildes if b[len(a)] == '~': return 1 - else: - 
return -1 + return -1 @staticmethod def compare_revision_strings(rev1, rev2): + """Compare two debian revision strings as described at + https://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version + """ if rev1 == rev2: return 0 - # listify pads results so that we will always be comparing ints to ints # and strings to strings (at least until we fall off the end of a list) list1 = Dpkg.listify(rev1) list2 = Dpkg.listify(rev2) - if list1 == list2: return 0 - try: for i, item in enumerate(list1): # just in case - if type(item) != type(list2[i]): + if not isinstance(item, list2[i].__class__): raise DpkgVersionError( 'Cannot compare %s to %s, something has gone horribly ' 'awry.' % (item, list2[i])) @@ -274,7 +370,7 @@ class Dpkg(object): if item == list2[i]: continue # numeric comparison - if type(item) == int: + if isinstance(item, int): if item > list2[i]: return 1 if item < list2[i]: @@ -290,6 +386,8 @@ class Dpkg(object): @staticmethod def compare_versions(ver1, ver2): + """Function to compare two Debian package version strings, + suitable for passing to list.sort() and friends.""" if ver1 == ver2: return 0 diff --git a/scripts/dpkg-inspect.py b/scripts/dpkg-inspect.py new file mode 100755 index 0000000..21f8426 --- /dev/null +++ b/scripts/dpkg-inspect.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import glob +import logging +import os +import sys + +from pydpkg import Dpkg + +logging.basicConfig() +log = logging.getLogger('dpkg_extract') +log.setLevel(logging.INFO) + +PRETTY = """Filename: {0} +Size: {1} +MD5: {2} +SHA1: {3} +SHA256: {4} +Headers: +{5}""" + + +def indent(input_str, prefix): + return '\n'.join( + ['%s%s' % (prefix, x) for x in input_str.split('\n')] + ) + +filenames = sys.argv[1:] +if not filenames: + # slicing never raises; an empty list means no arguments were given + log.fatal('You must list at least one deb file as an argument') + sys.exit(1) + +for files in filenames: + for fn in glob.glob(files): + if not os.path.isfile(fn): + log.warning('%s is not 
a file, skipping', fn) + continue + dp = Dpkg(fn) + print(PRETTY.format( + fn, dp.filesize, dp.md5, dp.sha1, dp.sha256, + indent(str(dp), ' ') + )) diff --git a/setup.py b/setup.py index 5583956..beb9cdb 100644 --- a/setup.py +++ b/setup.py @@ -1,20 +1,32 @@ from distutils.core import setup setup( - name = 'pydpkg', - packages = ['pydpkg'], # this must be the same as the name above - version = '1.0', - description = 'A python library for parsing debian package control headers and comparing version strings', - author = 'Nathan J. Mehl', - author_email = 'n@climate.com', - url = 'https://github.com/theclimatecorporation/python-dpkg', - download_url = 'https://github.com/theclimatecorporation/python-dpkg/tarball/1.0', - keywords = ['apt', 'debian', 'dpkg', 'packaging'], - classifiers=[ - "Development Status :: 5 - Production/Stable", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: Implementation :: CPython", - "Topic :: System :: Archiving :: Packaging", - ] + name='pydpkg', + packages=['pydpkg'], # this must be the same as the name above + version='1.1', + description='A python library for parsing debian package control headers and comparing version strings', + author='Nathan J. 
Mehl', + author_email='n@climate.com', + url='https://github.com/theclimatecorporation/python-dpkg', + download_url='https://github.com/theclimatecorporation/python-dpkg/tarball/1.1', + keywords=['apt', 'debian', 'dpkg', 'packaging'], + install_requires=[ + 'arpy==1.1.1', + 'six==1.10.0' + ], + extras_require={ + 'test': ['pep8==1.7.0', 'pytest==3.1.1', 'pylint==1.7.1'] + }, + scripts=[ + 'scripts/dpkg-inspect.py' + ], + classifiers=[ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: Implementation :: CPython", + "Topic :: System :: Archiving :: Packaging", + ] ) diff --git a/tests/test_dpkg.py b/tests/test_dpkg.py index f607782..86ef492 100644 --- a/tests/test_dpkg.py +++ b/tests/test_dpkg.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import unittest +from functools import cmp_to_key from pydpkg import Dpkg, DpkgVersionError @@ -65,8 +66,9 @@ class DpkgTest(unittest.TestCase): # taken from # http://www.debian.org/doc/debian-policy/ch-controlfields.html#s-f-Version self.assertEqual( - sorted(['a', '', '~', '~~a', '~~'], cmp=Dpkg.dstringcmp), - ['~~', '~~a', '~', '', 'a']) + sorted(['a', '', '~', '~~a', '~~'], + key=cmp_to_key(Dpkg.dstringcmp)), + ['~~', '~~a', '~', '', 'a']) def test_compare_revision_strings(self): # note that these are testing a single revision string, not the full