Skip to main content
2 of 4
edited tags
Jamal
  • 35.2k
  • 13
  • 134
  • 238

Python 3.x Hexdump

Not long ago I posted a small hexdump generator function from a program I've been writing, and I have since applied what a reviewer suggested. The goal is to lazily hexdump bytes objects (byte strings, binary files) without committing to I/O code.

Here is the relevant code (minus docstrings/tests/script), with some description below:

import re
from itertools import islice


class HexdumpGenerator:
    """Lazily format an iterable of byte values as hexdump text rows.

    Iterating the instance yields one string per row. Each row is built
    from three columns: the row address, the per-byte codes (with a
    separator inserted in the middle) and the printable characters.

    The public attributes ``col0``, ``col1``, ``fmt`` and ``placeholder``
    are read while iterating, so they may be reassigned after construction
    to change the output format.

    Args:
        iterable: source of integer byte values (e.g. a ``bytes`` object).
        base_addr: address displayed for the first element of ``iterable``.
        start: index of the first element to dump.
        stop: index one past the last element to dump (``None`` = no limit).
        step: number of bytes shown per row.
        sep: separator string; only its first character is used
            (the default is a backspace).
    """

    def __init__(self, iterable, base_addr=0, start=0, stop=None, step=16, sep='\b'):
        self.iterable = islice(iterable, start, stop)
        self.base_addr, self.start = base_addr, start
        self.stop, self.step = stop, step
        self.col0, self.col1 = '08X', '02X'   # format specs: address / byte
        self.fmt = '{}   {}  {}'
        self.placeholder = ['  ']             # one blank cell in the byte column
        self._sep = sep[0]
        # Number of blank cells in front of the first row so that rows stay
        # aligned on multiples of ``step``.
        offset = (base_addr + start) % step
        self._mod = offset
        # Index (in the source iterable) at which the next row begins.
        self._next = start + step - offset

    def __iter__(self):
        """Yield the formatted rows, consuming the underlying iterable."""
        chunk = bytearray(islice(self.iterable, self._next - self.start))
        while chunk:
            lead = self._mod
            address = format(self.base_addr + self.start - lead, self.col0)

            # Leading blanks first, then one cell / one character per byte.
            cells = lead * self.placeholder
            cells += [format(value, self.col1) for value in chunk]
            glyphs = map(chr, chunk)
            text = lead * ' ' + ''.join(
                g if g.isprintable() else '.' for g in glyphs
            )

            # Pad a short row to full width; the pad count is kept in _mod,
            # which is therefore 0 after every complete row.
            self._mod = trail = self.step - len(cells)
            cells += trail * self.placeholder
            text += trail * ' '

            cells.insert(self.step // 2, self._sep)
            yield self.fmt.format(address, ' '.join(cells), text)

            self.start = self._next
            self._next += self.step
            chunk = bytearray(islice(self.iterable, self._next - self.start))


class CompressHexdumpGenerator(HexdumpGenerator):
    """Hexdump generator that collapses runs of identical rows.

    A row whose columns after the address match those of the previously
    emitted row is suppressed. The first suppressed row of a run is
    replaced by a single ``'*'`` line. If the dump ends inside such a run,
    the last emitted row is repeated with its address advanced to that of
    the final suppressed row.

    Attributes:
        row: the most recently emitted (non-suppressed) row.
        delimiter: string that terminates the address column in a row.
        duplicates: number of rows suppressed since ``row`` was emitted.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.row = ''
        self.delimiter = ' '
        self.duplicates = 0

    def _compress(self):
        """Return ``self.row`` re-addressed to the last suppressed row."""
        cut = self.row.index(self.delimiter)
        address = int(self.row[:cut], 16) + self.duplicates * self.step
        return format(address, self.col0) + self.row[cut:]

    def __iter__(self):
        """Yield rows, replacing each run of repeated rows by one ``'*'``."""
        for line in super().__iter__():
            # Compare everything except the leading address token.
            if self.row.split()[1:] != line.split()[1:]:
                yield line
                self.row = line
                self.duplicates = 0
                continue
            if self.duplicates == 0:
                # Only the first row of a run produces the marker.
                yield '*'
            self.duplicates += 1
        if self.duplicates:
            yield self._compress()


class FromHexdumpGenerator(CompressHexdumpGenerator):
    """Turn hexdump text rows back into the bytes they describe.

    The iterable passed to the constructor must yield the text rows of a
    dump (a lone ``'*'`` row marks a collapsed run of repeated rows).
    Iterating the instance yields one ``bytearray`` per reconstructed row.

    Attributes:
        base: numeric base of the byte column (16 for the default format).
        len: minimum run of whitespace, as a string, that marks the end of
            the byte column (width of one byte cell plus one).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.base = 16
        self.len = '3'

    def get_repr(self, _row):
        """Decode the byte column of one tokenised row.

        Args:
            _row: list of tokens as produced by splitting a row on
                whitespace *with the whitespace runs kept* (address,
                whitespace, byte, whitespace, byte, ...).

        Returns:
            A ``bytearray`` holding the decoded byte values.
        """
        # Compiled per call (self.len may be reassigned by the user) but
        # hoisted out of the token loop. Raw strings keep ``\s`` from being
        # treated as an (invalid) string escape.
        gap = re.compile(r'(\s{' + self.len + r',})')
        row = bytearray()
        # Tokens 0 and 1 are the address and the whitespace following it;
        # a full row then has ``step`` byte tokens interleaved with
        # whitespace, ending at index ``step * 2``.
        for i in _row[2:self.step * 2 + 1]:
            if i.isalnum():
                row.append(int(i, self.base))
            elif gap.match(i):
                # A wide gap means blank cells: no more bytes in this row.
                break
        return row

    def decompress_gen(self, row0, row1):
        """Yield the rows hidden behind a ``'*'`` marker.

        Args:
            row0: tokenised row (whitespace kept) that preceded the marker.
            row1: whitespace-split row that follows the marker.

        Yields:
            One ``bytearray`` (the bytes of ``row0``) for every row address
            strictly between the addresses of ``row0`` and ``row1``.
        """
        i = int(row0[0].rstrip(self.delimiter), 16) + self.step
        j = int(row1[0].rstrip(self.delimiter), 16)
        while i < j:
            row = format(i, self.col0) + self.delimiter
            row = [row.rstrip(' ')] + row0[1:]
            yield self.get_repr(row)
            i += self.step

    def __iter__(self):
        """Yield a ``bytearray`` for every row encoded in the dump.

        Raises:
            RuntimeError: if the dump ends with a ``'*'`` marker, i.e. the
                row that terminates the collapsed run is missing.
        """
        i = j = ''
        while True:
            # After a '*' the following row has already been fetched into j
            # and still has to be decoded itself.
            row = j if j else next(self.iterable, None)
            if row is None:
                break
            elif row == '*' or row == '*\n':
                j = next(self.iterable, None)
                if j is None:
                    # Previously a bare next() leaked StopIteration out of
                    # the generator, which Python reports as an opaque
                    # RuntimeError; keep the type, make the cause explicit.
                    raise RuntimeError(
                        "hexdump ends with '*': the row closing the "
                        "repeated run is missing"
                    )
                yield from self.decompress_gen(i, j.split())
            else:
                index = row.find(self._sep)
                if index != -1:
                    # Drop the separator so it is not mistaken for a token.
                    text = row[:index] + row[index + 1:]
                else:
                    # No separator present: use the row as is (slicing with
                    # -1 would glue a second copy of the row onto it).
                    text = row
                i = re.split(r'(\s+)', text)
                j = ''
                yield self.get_repr(i)

Utility functions:

from itertools import chain


def read_binary_gen(file, chunk_size=65536):
    """Lazily yield the bytes of a binary file as integers.

    The file is read in fixed-size chunks rather than by "lines": a binary
    file may contain few or no newline bytes, in which case line iteration
    would load arbitrarily large parts of it into memory at once.

    Args:
        file: path of the file to read.
        chunk_size: maximum number of bytes read from the file per call.

    Yields:
        Each byte of the file, in order, as an ``int`` in ``range(256)``.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')
    with open(file, 'rb') as f:
        # read() returns b'' at end of file, which ends the iteration.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            yield from chunk


def write(file, gen):
    """Write every string produced by ``gen`` to ``file``, one per line.

    The file is opened in text mode and truncated; a newline is appended
    to each item as it is written.
    """
    with open(file, 'w') as out:
        out.writelines(line + '\n' for line in gen)


def read_gen(file):
    """Lazily yield the lines of a text file, line endings included."""
    with open(file, 'r') as src:
        for line in src:
            yield line


def write_binary(file, gen):
    """Write every bytes-like chunk produced by ``gen`` to ``file``.

    The file is opened in binary mode and truncated; chunks are written
    back to back in the order they are produced.
    """
    with open(file, 'wb') as out:
        out.writelines(gen)

read_binary_gen() is meant to be passed to the first two generator classes and read_gen() to the third, so that a file is never read into memory all at once.

I've tested the code with different formats (03o, 03d): if specifying 03d, then the placeholder attribute must be assigned a list with a single string composed of 3 spaces. Using FromHexdumpGenerator to undo the hexdump would then require that base be assigned the integer 10, and len the number '4' (3 + 1). The col0 attribute must remain hex (as it is an address).

If fmt's first column ends with a colon (as I've seen other programs use), the delimiter must be set to that value.

I struggled with different encodings before figuring out this was a case for bytearray, so if a bytes object can be dumped, it can be undumped.

user133955