Browse Source

python3 and travis support

- python3 compatibility
- drop py2.6 support
- use email.message rather than rfc822.message
- add some initial debug logging
- pylint and pep8 fixes
- add object properties for file hashes
- add a simple cli demo script
- add travis for continuous build
Nathan J. Mehl 3 years ago
7 changed files with 321 additions and 102 deletions
  1. +3
  2. +14
  3. +49
  4. +177
  5. +45
  6. +29
  7. +4

+ 3
- 0
.gitignore View File

@@ -0,0 +1,3 @@

+ 14
- 0
.travis.yml View File

@@ -0,0 +1,14 @@
language: python
- "2.7"
- "3.3"
- "3.4"
- "3.5"
- "pip install -U pip"
- "pip install -e .[test]"
- "py.test tests/"
- "pylint pydpkg/"
- "pep8 pydpkg/"

+ 49
- 4 View File

@@ -1,3 +1,5 @@
[![Build Status](](


@@ -14,8 +16,8 @@ This is primarily intended for use on platforms that do not normally
ship python-apt due to
licensing restrictions or the lack of a native libapt (e.g. macOS)

Currently only tested on Python 2.6 and 2.7. Should run on any python2
distribution that can install the [arpy](
Currently only tested on CPython 2.7 and 3.5, but at least in theory should run
on any Python distribution that can install the [arpy](https://pypi.org/project/arpy/) library.
@@ -26,9 +28,9 @@ the [pip]( tool:

$ pip install pydpkg
Collecting pydpkg
Downloading pydpkg-1.0-py2-none-any.whl
Downloading pydpkg-1.1-py2-none-any.whl
Installing collected packages: pydpkg
Successfully installed pydpkg-1.0
Successfully installed pydpkg-1.1

@@ -53,6 +55,28 @@ Read and extract headers
Description: testdeb
a bogus debian package for testing dpkg builds

Interact directly with the package control message

>>> dp.message
<email.message.Message instance at 0x10895c6c8>
>>> dp.message.get_content_type()
'text/plain'

Get package file fingerprints

>>> dp.fileinfo
{'sha256': '547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f', 'sha1': 'a5d28ae2f23e726a797349d7dd5f21baf8aa02b4', 'filesize': 910, 'md5': '149e61536a9fe36374732ec95cf7945d'}
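>>> dp.md5
'149e61536a9fe36374732ec95cf7945d'
>>> dp.sha1
'a5d28ae2f23e726a797349d7dd5f21baf8aa02b4'
>>> dp.sha256
'547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f'
>>> dp.filesize
910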

Get an arbitrary control header, case-independent

@@ -86,3 +110,24 @@ Use as a cmp function to sort a list of version strings
>>> from pydpkg import Dpkg
>>> sorted(['0:1.0-test1', '1:0.0-test0', '0:1.0-test2'] , cmp=Dpkg.compare_versions)
['0:1.0-test1', '0:1.0-test2', '1:0.0-test0']

Use the command-line script shipped in `scripts/` to inspect packages

$ ~/testdeb*deb
Filename: /Home/n/testdeb_1:0.0.0-test_all.deb
Size: 910
MD5: 149e61536a9fe36374732ec95cf7945d
SHA1: a5d28ae2f23e726a797349d7dd5f21baf8aa02b4
SHA256: 547500652257bac6f6bc83f0667d0d66c8abd1382c776c4de84b89d0f550ab7f
Package: testdeb
Version: 1:0.0.0-test
Section: base
Priority: extra
Architecture: all
Installed-Size: 0
Maintainer: Nathan Mehl <>
Description: testdeb
a bogus debian package for testing dpkg builds

+ 177
- 79
pydpkg/ View File

@@ -1,35 +1,55 @@

""" pydpkg: tools for inspecting dpkg archive files in python
without any dependency on libapt

from __future__ import absolute_import

# stdlib imports
import io
import logging
import os
import tarfile

from StringIO import StringIO
from rfc822 import Message
from gzip import GzipFile
from hashlib import md5, sha1, sha256
from email import message_from_string as Message

# pypi imports
import six
from arpy import Archive

REQUIRED_HEADERS = ('package', 'version', 'architecture')


class DpkgError(Exception):

"""Base error class for pydpkg"""

class DpkgVersionError(Exception):

"""Corrupt or unparseable version string"""

class DpkgMissingControlFile(DpkgError):

"""No control file found in control.tar.gz"""

class DpkgMissingControlGzipFile(DpkgError):

"""No control.tar.gz file found in dpkg file"""

class DpkgMissingRequiredHeaderError(DpkgError):

"""Corrupt package missing a required header"""

@@ -37,83 +57,160 @@ class Dpkg(object):

"""Class allowing import and manipulation of a debian package file."""

def __init__(self, filename=None):
self.headers = {}
if not isinstance(filename, basestring):
def __init__(self, filename=None, ignore_missing=False, logger=None):
self.filename = os.path.expanduser(filename)
self.ignore_missing = ignore_missing
if not isinstance(self.filename, six.string_types):
raise DpkgError('filename argument must be a string')
if not os.path.isfile(filename):
if not os.path.isfile(self.filename):
raise DpkgError('filename "%s" does not exist', filename)
self.control_str, self._control_headers = self._process_dpkg_file(
for k in self._control_headers.keys():
self.headers[k] = self._control_headers[k]
self._log = logger or logging.getLogger(__name__)
self._fileinfo = None
self._control_str = None
self._headers = None
self._message = None

def __repr__(self):
return self.control_str
return repr(self.control_str)

def __str__(self):
return six.text_type(self.control_str)

def message(self):
"""Return an email.Message object containing the package control
if not self._message:
self._message = self._process_dpkg_file(self.filename)
return self._message

def control_str(self):
"""Return the control message as a string"""
if not self._control_str:
self._control_str = self.message.as_string()
return self._control_str

def headers(self):
"""Return the control message headers as a dict"""
if not self._headers:
self._headers = dict(self.message.items())
return self._headers

def fileinfo(self):
"""Return a dictionary containing md5/sha1/sha256 checksums
and the size in bytes of our target file."""
if not self._fileinfo:
h_md5 = md5()
h_sha1 = sha1()
h_sha256 = sha256()
with open(self.filename, 'rb') as dpkg_file:
for chunk in iter(lambda:, b''):
self._fileinfo = {
'md5': h_md5.hexdigest(),
'sha1': h_sha1.hexdigest(),
'sha256': h_sha256.hexdigest(),
'filesize': os.path.getsize(self.filename)
return self._fileinfo

def md5(self):
"""Return the md5 hash of our target file"""
return self.fileinfo['md5']

def sha1(self):
"""Return the sha1 hash of our target file"""
return self.fileinfo['sha1']

def sha256(self):
"""Return the sha256 hash of our target file"""
return self.fileinfo['sha256']

def filesize(self):
"""Return the size of our target file"""
return self.fileinfo['filesize']

def get_header(self, header):
""" case-independent query for a control message header value """
return self.headers.get(header.lower(), '')

def compare_version_with(self, version_str):
return Dpkg.compare_versions(

def _force_encoding(self, obj, encoding='utf-8'):
if isinstance(obj, basestring):
if not isinstance(obj, unicode):
obj = unicode(obj, encoding)
"""Compare my version to an arbitrary version"""
return Dpkg.compare_versions(self.get_header('version'), version_str)

def _force_encoding(obj, encoding='utf-8'):
"""Enforce uniform text encoding"""
if isinstance(obj, six.string_types):
if not isinstance(obj, six.text_type):
obj = six.text_type(obj, encoding)
return obj

def _process_dpkg_file(self, filename):
dpkg = Archive(filename)

if 'control.tar.gz' not in dpkg.archived_files:
dpkg_archive = Archive(filename)
control_tgz = dpkg_archive.archived_files[b'control.tar.gz']
except KeyError:
raise DpkgMissingControlGzipFile(
'Corrupt dpkg file: no control.tar.gz file in ar archive.')
self._log.debug('found controlgz: %s', control_tgz)

control_tgz = dpkg.archived_files['control.tar.gz']

# have to do an intermediate step because gzipfile doesn't support seek
# have to pass through BytesIO because gzipfile doesn't support seek
# from end; luckily control tars are tiny
control_tar_intermediate = GzipFile(fileobj=control_tgz, mode='rb')
tar_data =
sio = StringIO(tar_data)
control_tar =

# pathname in the tar could be ./control, or just control
# (there would never be two control files...right?)
tar_members = [os.path.basename(
for x in control_tar.getmembers()]
if 'control' not in tar_members:
raise DpkgMissingControlFile(
'Corrupt dpkg file: no control file in control.tar.gz.')
control_idx = tar_members.index('control')

# at last!
control_file = control_tar.extractfile(

# beware: dpkg will happily let people drop random encodings into the
# control file
control_str = self._force_encoding(

# now build the dict
control_headers = Message(control_file)

for header in REQUIRED_HEADERS:
if header not in control_headers:
with GzipFile(fileobj=control_tgz) as gzf:
self._log.debug('opened gzip file: %s', gzf)
with as control_tar:
self._log.debug('opened tar file: %s', control_tar)
# pathname in the tar could be ./control, or just control
# (there would never be two control files...right?)
tar_members = [
os.path.basename( for x in control_tar.getmembers()]
self._log.debug('got tar members: %s', tar_members)
if 'control' not in tar_members:
raise DpkgMissingControlFile(
'Corrupt dpkg file: no control file in control.tar.gz')
control_idx = tar_members.index('control')
self._log.debug('got control index: %s', control_idx)
# at last!
control_file = control_tar.extractfile(
self._log.debug('got control file: %s', control_file)
message_body =
# py27 lacks email.message_from_bytes, so...
if isinstance(message_body, bytes):
message_body = message_body.decode('utf-8')
message = Message(message_body)
self._log.debug('got control message: %s', message)

if req not in list(map(str.lower, message.keys())):
import pdb
if self.ignore_missing:
'Header "%s" not found in control message', req)
raise DpkgMissingRequiredHeaderError(
'Corrupt control section; header: "%s" not found' % header)
'Corrupt control section; header: "%s" not found' % req)
self._log.debug('all required headers found')

for header in control_headers:
control_headers[header] = self._force_encoding(
for header in message.keys():
self._log.debug('coercing header to utf8: %s', header)
header, self._force_encoding(message[header]))
self._log.debug('all required headers coerced')

return control_str, control_headers
return message

def get_epoch(version_str):
@@ -152,6 +249,10 @@ class Dpkg(object):

def split_full_version(version_str):
"""Split a full version string into epoch, upstream version and
debian revision.
:param: version_str
:returns: tuple """
epoch, full_ver = Dpkg.get_epoch(version_str)
upstream_rev, debian_rev = Dpkg.get_upstream(full_ver)
return epoch, upstream_rev, debian_rev
@@ -160,14 +261,12 @@ class Dpkg(object):
def get_alphas(revision_str):
"""Return a tuple of the first non-digit characters of a revision (which
may be empty) and the remaining characters."""

# get the index of the first digit
for i, char in enumerate(revision_str):
if char.isdigit():
if i == 0:
return '', revision_str
return revision_str[0:i], revision_str[i:]
return revision_str[0:i], revision_str[i:]
# string is entirely alphas
return revision_str, ''

@@ -175,17 +274,15 @@ class Dpkg(object):
def get_digits(revision_str):
"""Return a tuple of the first integer characters of a revision (which
may be empty) and the remains."""
# If the string is empty, return (0,'')
if not revision_str:
return 0, ''

# get the index of the first non-digit
for i, char in enumerate(revision_str):
if not char.isdigit():
if i == 0:
return 0, revision_str
return int(revision_str[0:i]), revision_str[i:]
return int(revision_str[0:i]), revision_str[i:]
# string is entirely digits
return int(revision_str), ''

@@ -199,12 +296,13 @@ class Dpkg(object):
result = []
while revision_str:
r1, remains = Dpkg.get_alphas(revision_str)
r2, remains = Dpkg.get_digits(remains)
result.extend([r1, r2])
rev_1, remains = Dpkg.get_alphas(revision_str)
rev_2, remains = Dpkg.get_digits(remains)
result.extend([rev_1, rev_2])
revision_str = remains
return result

# pylint: disable=invalid-name,too-many-return-statements
def dstringcmp(a, b):
"""debian package version string section lexical sort algorithm
@@ -241,32 +339,30 @@ class Dpkg(object):
# ...except for goddamn tildes
if char == '~':
return -1
return 1
return 1
# if we get here, a is shorter than b but otherwise equal, hence lesser
# ...except for goddamn tildes
if b[len(a)] == '~':
return 1
return -1
return -1

def compare_revision_strings(rev1, rev2):
"""Compare two debian revision strings as described at
if rev1 == rev2:
return 0

# listify pads results so that we will always be comparing ints to ints
# and strings to strings (at least until we fall off the end of a list)
list1 = Dpkg.listify(rev1)
list2 = Dpkg.listify(rev2)

if list1 == list2:
return 0

for i, item in enumerate(list1):
# just in case
if type(item) != type(list2[i]):
if not isinstance(item, list2[i].__class__):
raise DpkgVersionError(
'Cannot compare %s to %s, something has gone horribly '
'awry.' % (item, list2[i]))
@@ -274,7 +370,7 @@ class Dpkg(object):
if item == list2[i]:
# numeric comparison
if type(item) == int:
if isinstance(item, int):
if item > list2[i]:
return 1
if item < list2[i]:
@@ -290,6 +386,8 @@ class Dpkg(object):

def compare_versions(ver1, ver2):
"""Function to compare two Debian package version strings,
suitable for passing to list.sort() and friends."""
if ver1 == ver2:
return 0

+ 45
- 0
scripts/ View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python

from __future__ import print_function

import glob
import logging
import os
import sys

from pydpkg import Dpkg

log = logging.getLogger('dpkg_extract')

PRETTY = """Filename: {0}
Size: {1}
MD5: {2}
SHA1: {3}
SHA256: {4}

def indent(input_str, prefix):
return '\n'.join(
['%s%s' % (prefix, x) for x in input_str.split('\n')]

filenames = sys.argv[1:]
except KeyError:
log.fatal('You must list at least one deb file as an argument')

for files in filenames:
for fn in glob.glob(files):
if not os.path.isfile(fn):
log.warning('%s is not a file, skipping', fn)
log.debug('checking %s', fn)
dp = Dpkg(fn)
fn, dp.filesize, dp.md5, dp.sha1, dp.sha256,
indent(str(dp), ' ')

+ 29
- 17 View File

@@ -1,20 +1,32 @@
from distutils.core import setup
name = 'pydpkg',
packages = ['pydpkg'], # this must be the same as the name above
version = '1.0',
description = 'A python library for parsing debian package control headers and comparing version strings',
author = 'Nathan J. Mehl',
author_email = '',
url = '',
download_url = '',
keywords = ['apt', 'debian', 'dpkg', 'packaging'],
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: Implementation :: CPython",
"Topic :: System :: Archiving :: Packaging",
packages=['pydpkg'], # this must be the same as the name above
description='A python library for parsing debian package control headers and comparing version strings',
author='Nathan J. Mehl',
keywords=['apt', 'debian', 'dpkg', 'packaging'],
'test': ['pep8==1.7.0', 'pytest==3.1.1', 'pylint==1.7.1']
"Development Status :: 5 - Production/Stable",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: Implementation :: CPython",
"Topic :: System :: Archiving :: Packaging",

+ 4
- 2
tests/ View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python

import unittest
from functools import cmp_to_key

from pydpkg import Dpkg, DpkgVersionError

@@ -65,8 +66,9 @@ class DpkgTest(unittest.TestCase):
# taken from
sorted(['a', '', '~', '~~a', '~~'], cmp=Dpkg.dstringcmp),
['~~', '~~a', '~', '', 'a'])
sorted(['a', '', '~', '~~a', '~~'],
['~~', '~~a', '~', '', 'a'])

def test_compare_revision_strings(self):
# note that these are testing a single revision string, not the full