6
\$\begingroup\$

I just finished my first coding project. Prior to this I have only made small programs so I'd like some feedback on what to improve on and how I did in general. The whole code is 658 lines long.
The goal of the project is implementing the Huffman code in C (C17), whilst being fast or at least not completely slow, my first attempt needed more than 2 minutes to encode 1 GiB of data, now I am down to around 10 seconds. My code was so slow because of the way how I wrote the code for a symbol to a file. Instead of creating a simple lookup table (as I do now), I searched the binary tree each time for each symbol. Each node had all the containing symbols of their children and I checked whether my symbol was in the left or right node.
The code is on Github (and posted below): https://github.com/dryBoneMarrow/huffman.
I appreciate all tips so I can improve my coding skills.

Makefile:

CC := gcc
CFLAGS := -O3 -pedantic-errors -std=c17
CFLAGS_DEBUG := -Og -g -fsanitize=address -std=c17 -pedantic-errors -Wall
DEPS := main huffman bitHandler
BUILDDIR := build/
SRCDIR := src/
BINDIR := /usr/local/bin/
TARGET := huffman
TARGET_DEBUG := $(TARGET)
# TARGET_DEBUG := huffman_debug

OBJS := $(addprefix $(BUILDDIR), $(addsuffix .o, $(DEPS)))
OBJS_DEBUG := $(addprefix $(BUILDDIR)DEBUG_, $(addsuffix .o, $(DEPS)))
SOURCE := $(addprefix $(SRCDIR), $(addsuffix .c, $(DEPS)))

default: $(OBJS)
    $(CC) $(CFLAGS) $(OBJS) -o $(TARGET)

# Debug is noticeably slower
debug: $(OBJS_DEBUG)
    $(CC) $(CFLAGS_DEBUG) $(OBJS_DEBUG) -o $(TARGET_DEBUG)

clean:
    rm -rf $(BUILDDIR) $(TARGET_DEBUG)

# Most likely needs admin rights
install: default
    install -m 755 $(TARGET) $(BINDIR)$(TARGET)

uninstall:
    rm -f $(BINDIR)$(TARGET)

$(BUILDDIR)%.o: $(SRCDIR)%.c
    @mkdir -p $(BUILDDIR)
    $(CC) $(CFLAGS) -c $< -o $@

$(BUILDDIR)DEBUG_%.o: $(SRCDIR)%.c
    @mkdir -p $(BUILDDIR)
    $(CC) $(CFLAGS_DEBUG) -c $< -o $@


.PHONY: default debug clean install

src/main.c:

#include "huffman.h"
#include <stdio.h>
#include <string.h>

void printHelp()
{
    printf("\nEncodes and decodes data using huffman algorithm.\n\n"

           "Usage:\n"
           " huffman [encode|decode] INFILE OUTFILE\n"
           " huffman [encode|decode] INFILE\n"
           " huffman [encode|decode|help]\n\n"

           "Note:\n"
           " - may be used for stdin / stdout\n"
           " When omitting INFILE/OUTFILE, stdin/stdout is used\n\n");
}

// open input file, - for stdin
FILE* getInput(char* filename)
{
    FILE* input;
    if (!strcmp(filename, "-")) {
        // Both the encode and decode functions change the file pointer position which is not
        // possible in stdin. That's why we copy the stdin into a temporary file first
        int currChar;
        input = tmpfile();
        while ((currChar = fgetc(stdin)) != EOF) {
            fputc(currChar, input);
        }
        rewind(input);
    } else {
        input = fopen(filename, "rb");
    }
    return input;
}

// open output file, - for stdout
FILE* getOutput(char* filename)
{
    FILE* output;
    if (!strcmp(filename, "-")) {
        output = stdout;
    } else {
        output = fopen(filename, "wb");
    }
    return output;
}

int main(int argc, char* argv[])
{
    int exitCode = 0;

    // select operation
    int (*operation)(FILE*, FILE*);
    if (argc < 2) {
        fprintf(stderr, "\nNot enough arguments\n");
        printHelp();
        exitCode = 1;
        goto cleanup;
    } else if (!strcmp(argv[1], "help")) {
        printHelp();
        goto cleanup;
    } else if (!strcmp(argv[1], "encode")) {
        operation = &encode;
    } else if (!strcmp(argv[1], "decode")) {
        operation = &decode;
    } else {
        printHelp();
        exitCode = 1;
        goto cleanup;
    }

    // get and validate INFILE and OUTFILE
    FILE *input, *output;
    char* inputPath;
    char* outputPath;
    switch (argc) {
    case 2:
        inputPath = "-";
        outputPath = "-";
        break;
    case 3:
        inputPath = argv[2];
        outputPath = "-";
        break;
    default:
        inputPath = argv[2];
        outputPath = argv[3];
        break;
    }
    input = getInput(inputPath);
    if (!input) {
        fprintf(stderr, "can't open INFILE\n");
        exitCode = 1;
        goto cleanup;
    }
    output = getOutput(outputPath);
    if (!output) {
        fprintf(stderr, "can't open OUTFILE\n");
        exitCode = 1;
        goto cleanupOutput;
    }

    // run the actual program
    exitCode = operation(input, output);

    fclose(output);
cleanupOutput:
    fclose(input);
cleanup:
    return exitCode;
}

src/huffman.h:

#ifndef HUFFMAN_H
#define HUFFMAN_H

#include <stdio.h>

extern int encode(FILE* input, FILE* output);
extern int decode(FILE* input, FILE* output);

#endif

src/huffman.c:

#include "huffman.h"
#include "bitHandler.h"
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Only works when node is initialized with 0
#define isLeaf(a) (!(a)->left)

typedef struct node {
    uint64_t frequency;
    unsigned char symbol;
    struct node* left;
    struct node* right;
} node_t;

// Dictionary is made of codeWords
typedef struct codeWord {
    unsigned char codeLength;
    uint64_t code;
} code_t;

// Sort nodes, used with qsort()
static int compareNodes(const void* a, const void* b)
{
    if ((*(node_t* const*)a)->frequency > (*(node_t* const*)b)->frequency) return -1;
    return 1;
}

// Create dictionary, dictionary[symbol] returns the corresponding code
// https://de.wikipedia.org/w/index.php?title=Huffman-Kodierung&stable=0#Erzeugen_des_Codebuchs
static void createDictionary(node_t* node, code_t* dictionary, uint64_t code, int depth)
{
    if (isLeaf(node)) {
        dictionary[node->symbol].code = code;
        dictionary[node->symbol].codeLength = depth;
        return;
    }
    // Writing code from left to right
    code &= ~(1ull << (63 - depth));
    createDictionary(node->left, dictionary, code, 1 + depth);
    code |= 1ull << (63 - depth);
    createDictionary(node->right, dictionary, code, 1 + depth);
}

// Write tree to file
// https://stackoverflow.com/a/759766/15833045
static void writeTree(node_t* node, bitBuffer_t* bitBuffer)
{
    if (isLeaf(node)) {
        writeBit(1, bitBuffer);
        writeByte(node->symbol, bitBuffer);
        return;
    }
    writeBit(0, bitBuffer);
    writeTree(node->left, bitBuffer);
    writeTree(node->right, bitBuffer);
}

// print representation of tree
static void printTree(node_t* node, int depth)
{
    int i;
    for (i = 0; i < depth; ++i) {
        printf("│  ");
    }
    // Frequency is set if tree was generated but not if tree was read off a file
    if (node->frequency) {
        if (isLeaf(node)) {
            printf("(%c: %" PRIuFAST64 ")\n", node->symbol, node->frequency);
            return;
        }
        printf("(%" PRIu64 ")\n", node->frequency);
    } else {
        if (isLeaf(node)) {
            printf("(%c)\n", node->symbol);
            return;
        }
        printf("*\n");
    }
    printTree(node->left, depth + 1);
    printTree(node->right, depth + 1);
}

// Read tree from file
int readTree(bitBuffer_t* bitBuffer, node_t* node, int* nodeNum)
{
    // We don't have to care for a potential buffer overflow as the maximal theoretical depth of
    // tree is 256, hence the maximum number of nodes 511 and it is assumed that the buffer in
    // bitBuffer is >= 511

    int currNode = *nodeNum;
    ++*nodeNum;

    // bit is 1
    if (bitBuffer->buffer[bitBuffer->bufferPosition] & 1 << (7 - bitBuffer->bitPosition)) {
        // update bitBuffer
        switch (bitBuffer->bitPosition) {
        case 7:
            bitBuffer->bitPosition = 0;
            ++bitBuffer->bufferPosition;
            break;
        default:
            ++bitBuffer->bitPosition;
        }

        // Assign symbol to leaf
        node[currNode].symbol = bitBuffer->buffer[bitBuffer->bufferPosition]
                << bitBuffer->bitPosition
            | bitBuffer->buffer[bitBuffer->bufferPosition + 1] >> (8 - bitBuffer->bitPosition);
        ++bitBuffer->bufferPosition;
        return currNode;
    }

    // bit is 0
    // update bitBuffer
    switch (bitBuffer->bitPosition) {
    case 7:
        bitBuffer->bitPosition = 0;
        bitBuffer->bufferPosition++;
        break;
    default:
        bitBuffer->bitPosition++;
    }

    // Assign left and right to node
    node[currNode].left = &node[readTree(bitBuffer, node, nodeNum)];
    node[currNode].right = &node[readTree(bitBuffer, node, nodeNum)];

    return currNode;
}

// Encode file
extern int encode(FILE* input, FILE* output)
{
    int exitCode = 0;

    //// Create the frequency table
    // Buffering is used for better performance

    int leafCount = 0;
    uint64_t frequencyTable[256] = { 0 };
    unsigned char buffer[BUFSIZ];
    size_t bytesRead;

    while ((bytesRead = fread(buffer, 1, BUFSIZ, input)) > 0) {
        for (size_t i = 0; i < bytesRead; ++i) {
            if (!frequencyTable[buffer[i]]) {
                ++leafCount;
            }
            ++frequencyTable[buffer[i]];
        }
    }

    // handle empty file
    if (!leafCount) {
        fprintf(stderr, "input file is empty");
        exitCode = 1;
        goto cleanup;
    }
    // handle file with one type of symbol (there's no huffman code because the huffman tree has no
    // edges)
    if (leafCount == 1) {
        fputc(1 << 7, output);
        rewind(input);
        unsigned char symbol = fgetc(input);
        uint64_t symbolCount = frequencyTable[symbol];
        fputc(symbol, output);
        fwrite(&symbolCount, sizeof(uint64_t), 1, output);
        goto cleanupBuffer;
    }

    //// Create tree
    // trees[] stores pointers to all subtrees
    // nodes[] holds all the nodes of the trees (2 times the number of symbols minus one)

    int i;
    node_t* nodes = calloc(2 * leafCount - 1, sizeof(node_t));
    if (!nodes) {
        fprintf(stderr, "Couldn't allocate memory for nodes");
        exitCode = 1;
        goto cleanup;
    }
    node_t** trees = malloc(leafCount * sizeof(node_t**));
    if (!trees) {
        fprintf(stderr, "Couldn't allocate memory for trees");
        exitCode = 1;
        goto cleanupNode;
    }

    node_t* nodesp = nodes;
    node_t** treesp = trees;

    // Fill the nodes array with the leafs and reference them in the trees array
    for (i = 0; i < 256; ++i) {
        if (frequencyTable[i]) {
            nodesp->symbol = i;
            nodesp->frequency = frequencyTable[i];
            *treesp = nodesp;
            ++treesp;
            ++nodesp;
        }
    }
    // sort the tree array
    qsort(trees, leafCount, sizeof(node_t**), &compareNodes);
    int treeCount = leafCount;

    // Connect subtrees (leafs) to the huffman tree
    while (treeCount > 1) {
        // create node
        --treeCount;
        nodesp->frequency = trees[treeCount]->frequency + trees[treeCount - 1]->frequency;
        nodesp->left = trees[treeCount];
        nodesp->right = trees[treeCount - 1];

        // sort trees
        for (i = 0; i < treeCount - 1 && nodesp->frequency >= trees[treeCount - (i + 2)]->frequency;
             ++i) {
            trees[treeCount - (i + 1)] = trees[treeCount - (i + 2)];
        }

        // Add node to trees[]
        trees[treeCount - (1 + i)] = nodesp;
        ++nodesp;
    }

    //// Create dictionary
    code_t dictionary[256];
    createDictionary(*trees, dictionary, 0, 0);

    //// Write to outputFile
    // Create bitBuffer, used to write bitwise to output file
    bitBuffer_t bitBuffer = { malloc(BUFSIZ * sizeof(char)), BUFSIZ, 0, 0, output };
    if (!bitBuffer.buffer) {
        fprintf(stderr, "Couldn't allocate memory for buffer");
        exitCode = 1;
        goto cleanupTree;
    }

    // write tree to file
    writeTree(*trees, &bitBuffer);

    // write data to file
    rewind(input);
    while ((bytesRead = fread(buffer, sizeof(char), BUFSIZ, input)) > 0) {
        for (i = 0; i < bytesRead; ++i) {
            writeBits(dictionary[buffer[i]].code, dictionary[buffer[i]].codeLength, &bitBuffer);
        }
    }

    // The last three bits show how many bits of the last byte are significant and hence should be
    // decoded later
    if (bitBuffer.bitPosition <= 5) {
        uint64_t significantBits = (uint64_t)bitBuffer.bitPosition << 61;
        writeBits(0, 5 - bitBuffer.bitPosition, &bitBuffer);
        writeBits(significantBits, 3, &bitBuffer);
    } else {
        writeByte(0, &bitBuffer);
    }

    flush(&bitBuffer);
cleanupBuffer:
    free(bitBuffer.buffer);
cleanupTree:
    free(trees);
cleanupNode:
    free(nodes);
cleanup:

    return exitCode;
}

// decode file
extern int decode(FILE* input, FILE* output)
{
    int exitCode = 0;

    // Initialize bitBuffer to read bitwise later, buffersize has to be at least 511 (max size of
    // tree in file)
    bitBuffer_t bitBuffer = { malloc(BUFSIZ),
        (bitBuffer.buffer) ? fread(bitBuffer.buffer, sizeof(char), BUFSIZ, input) : 0, 0, 0,
        input };

    // Check for failed malloc
    if (!bitBuffer.buffer) {
        fprintf(stderr, "Couldn't allocate memory\n");
        exitCode = 1;
        goto cleanup;
    }

    // check if INFILE is empty
    if (!bitBuffer.bufferSize) {
        fprintf(stderr, "INFILE is empty\n");
        exitCode = 1;
        goto cleanupBuffer;
    }

    // Handle case when only one symbol is encoded
    if (bitBuffer.buffer[0] & (1 << 7)) {
        // File has to have 10 Bytes assuming uint64_t is 8 bytes
        if (bitBuffer.bufferSize != 10) {
            fprintf(stderr, "INFILE has invalid format\n");
            exitCode = 1;
            goto cleanupBuffer;
        }
        int currChar = bitBuffer.buffer[1];
        uint64_t i;
        uint64_t symbolFrequency = *(uint64_t*)(bitBuffer.buffer + 2);
        for (i = 0; i < symbolFrequency; i++) {
            fputc(currChar, output);
        }
        goto cleanupBuffer;
    }

    // Read tree from file
    node_t tree[511] = { 0 };
    int nodeNum = 0;
    readTree(&bitBuffer, tree, &nodeNum);

    // Decode data using the tree

    // get filesize
    size_t filePos = ftell(input);
    fseek(input, 0, SEEK_END);
    size_t bytesLeft = ftell(input);
    bytesLeft -= bitBuffer.bufferPosition;
    fseek(input, filePos, SEEK_SET);

    node_t* currNode = tree;

    // Read bit for bit and follow tree, write character if leaf is reached and begin from root of
    // tree
    while (bitBuffer.bufferSize > 0) {
        while (bitBuffer.bufferPosition < bitBuffer.bufferSize && bytesLeft > 1) {
            for (; bitBuffer.bitPosition < 8; bitBuffer.bitPosition++) {
                if (bitBuffer.buffer[bitBuffer.bufferPosition] & 1 << (7 - bitBuffer.bitPosition))
                    currNode = currNode->right;
                else
                    currNode = currNode->left;
                if (isLeaf(currNode)) {
                    fputc(currNode->symbol, output);
                    currNode = tree;
                }
            }
            ++bitBuffer.bufferPosition;
            bitBuffer.bitPosition = 0;
            --bytesLeft;
        }
        bitBuffer.bufferSize = fread(bitBuffer.buffer, sizeof(char), BUFSIZ, input);
        if (bitBuffer.bufferSize != 0) {
            bitBuffer.bufferPosition = 0;
        }
    }

    // Read bits of last byte, the number of significant bits is stored in the last three bits
    for (; bitBuffer.bitPosition < (bitBuffer.buffer[bitBuffer.bufferPosition] & 7);
         ++bitBuffer.bitPosition) {
        if (bitBuffer.buffer[bitBuffer.bufferPosition] & 1 << (7 - bitBuffer.bitPosition))
            currNode = currNode->right;
        else
            currNode = currNode->left;
        if (isLeaf(currNode)) {
            fputc(currNode->symbol, output);
            currNode = tree;
        }
    }

cleanupBuffer:
    free(bitBuffer.buffer);
cleanup:
    return exitCode;
}

src/bitHandler.h:

// bitHandler provides functions to write single bits to a file
#ifndef BITHANDLER_H
#define BITHANDLER_H

#include <stdint.h>
#include <stdio.h>

// bitBuffer_t contains all necessary infos to read / write bits from / to a file
typedef struct bitBuffer {
    unsigned char* buffer;
    size_t bufferSize;
    unsigned int bufferPosition;
    int bitPosition;
    FILE* file;
} bitBuffer_t;

extern void writeBit(int bit, bitBuffer_t* bitBuffer);
extern void writeBits(uint64_t bits, int nbits, bitBuffer_t* bitBuffer);
extern void writeByte(unsigned char byte, bitBuffer_t* bitBuffer);
extern void flush(bitBuffer_t* bitBuffer);

#endif

src/bitHandler.c:

#include "bitHandler.h"
#include <stdint.h>
#include <stdio.h>

// write single bits to file
extern void writeBit(int bit, bitBuffer_t* bitBuffer)
{
    // write bit
    if (bit) {
        bitBuffer->buffer[bitBuffer->bufferPosition] |= 1u << (7 - bitBuffer->bitPosition);
    } else {
        bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(1u << (7 - bitBuffer->bitPosition));
    }

    // update bitBuffer
    if (bitBuffer->bitPosition == 7) {
        ++bitBuffer->bufferPosition;
        if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
            fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
            bitBuffer->bufferPosition = 0;
        }
        bitBuffer->bitPosition = 0;
    } else {
        ++bitBuffer->bitPosition;
    }
}

// write n bits to buffer
// Has better performance than calling writeBit repeatedly
extern void writeBits(uint64_t bits, int nbits, bitBuffer_t* bitBuffer)
{
    //// Fill partially filled byte

    // Clear part of the byte that is going to be written
    bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(255 >> bitBuffer->bitPosition);
    // Write the bits
    bitBuffer->buffer[bitBuffer->bufferPosition] |= bits >> (56 + bitBuffer->bitPosition);
    // Return if all bits have been written
    if (nbits <= 8 - bitBuffer->bitPosition) {
        bitBuffer->bitPosition += nbits;
        return;
    }
    // Remove written bits
    bits <<= 8 - bitBuffer->bitPosition;
    ++bitBuffer->bufferPosition;
    // flush buffer if full
    if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
        fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
        bitBuffer->bufferPosition = 0;
    }
    nbits -= 8 - bitBuffer->bitPosition;
    bitBuffer->bitPosition = nbits % 8;

    //// Write as many bytes to buffer as needed
    while (nbits >= 8) {
        bitBuffer->buffer[bitBuffer->bufferPosition] = bits >> 56;
        bits <<= 8;
        ++bitBuffer->bufferPosition;
        nbits -= 8;
        // flush buffer if full
        if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
            fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
            bitBuffer->bufferPosition = 0;
        }
    }

    //// Write left over bits
    bitBuffer->buffer[bitBuffer->bufferPosition] &= 0u;
    bitBuffer->buffer[bitBuffer->bufferPosition] |= bits >> 56;
}

// write byte to file
extern void writeByte(unsigned char byte, bitBuffer_t* bitBuffer)
{

    // Complete partially filled byte
    bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(0xFFu >> bitBuffer->bitPosition);
    bitBuffer->buffer[bitBuffer->bufferPosition] |= byte >> bitBuffer->bitPosition;
    ++bitBuffer->bufferPosition;

    // flush buffer if full
    if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
        fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
        bitBuffer->bufferPosition = 0;
    }

    // write left over bits
    bitBuffer->buffer[bitBuffer->bufferPosition] = byte << (8 - bitBuffer->bitPosition);
}

// flush the remaining bits in buffer to file
extern void flush(bitBuffer_t* bitBuffer)
{
    fwrite(bitBuffer->buffer, sizeof(char),
        (bitBuffer->bitPosition == 0) ? bitBuffer->bufferPosition : bitBuffer->bufferPosition + 1,
        bitBuffer->file);
    bitBuffer->bufferPosition = 0;
    bitBuffer->bitPosition = 0;
}
\$\endgroup\$
8
  • \$\begingroup\$ Please post your code here rather than at github.com/dryBoneMarrow/huffman/blob/main/src/main.c . // I assume your code works fine on 32- and 64-bit hosts, both big- and little-endian. // Describing the "whoops!" that gave a 12x slowdown could be valuable advice for this site's readers. \$\endgroup\$
    – J_H
    Commented Jun 24, 2024 at 17:31
  • \$\begingroup\$ @J_H I changed the post accordingly, thanks for the hint. The code is tested on a amd64 platform but should work on every system that supports C18 (C99 as well, even though I aimed for C18) and uses the MSb-first bit order. \$\endgroup\$
    – TheGlibber
    Commented Jun 24, 2024 at 17:43
  • \$\begingroup\$ 1) Are you open to C23 suggestions? 2) That fseek() + ftell() hack is undefined behaviour in both C and C++. 3) You're ignoring all I/O errors. \$\endgroup\$
    – Madagascar
    Commented Jun 24, 2024 at 17:56
  • 1
    \$\begingroup\$ "C18" had me confused for a bit, apparently that's an alternative name for C17 \$\endgroup\$
    – user555045
    Commented Jun 24, 2024 at 18:26
  • 1
    \$\begingroup\$ @Harith 1) Yes, I chose the newest (finished) standard but it's not that important to me 2) I couldn't find a non posix way of finding the file size except this one, how should I do it instead? 3) So do I need to check each individual read / write operation? I'd imagine this makes my code way less efficient as there would be a lot more conditionals \$\endgroup\$
    – TheGlibber
    Commented Jun 24, 2024 at 18:37

3 Answers 3

7
\$\begingroup\$

Valgrind reports no leaks or errors. Good job on that, and welcome to programming and C. To complement @user555045's nice answer, this review would be about the intricate details of the code, the build system, and the issues brought up in the comments.


First, some assumptions the code is making that it should make explicit:

  1. Code is being compiled with a C compiler.
  2. The compiler has support for C17.
  3. CHAR_BIT is 8.
  4. uint64_t is available.

These are easy to tackle:

#ifdef __cplusplus
//  #error "Away, you three-inch fool!"
//  #error "Methink’st thou art a general offence and every man should beat thee."
//  #error "Thou damned and luxurious mountain goat."
//  #error "Thou leathern-jerkin, crystal-button, knot-pated, agatering, puke-stocking, caddis-garter, smooth-tongue, Spanish pouch!"
//  #error "Thou lump of foul deformity!"
//  #error "Fie! cometh h're and englut mine own coxcomb thee distemperate fooleth!"
//  #error "Wherefore doth thee throweth a gib's food at a dog?"
//  #error "Thy sin’s not accidental, but a trade!"
    #error "This program must be compiled with a C compiler."
#endif /* __cplusplus */

#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201710
    #error "This program requires a C17 compiler."
#endif /* !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201710 */

#include <limits.h>

#if CHAR_BIT != 8
//  #error "Abandon thy DSP! Out of my sight! Thou dost infect mine eyes." 
    #error "This program assumes 8-bit bytes."
#endif /* CHAR_BIT != 8 */

#ifndef UINT64_MAX
    #error "This program requires uint64_t type."
#endif /* UINT64_MAX */

Some notes:

  1. POSIX requires CHAR_BIT == 8.

  2. sizeof (char) is defined by the ISO C Standard to be 1. So you can replace all instances of sizeof (char) with 1. But it does not fix the number of bits. A char may have 32 bits, or 64 bits. Unisys C Compiler Programming Reference Manual specifies a char with 9 bits. BlueCore-5 chip (a Bluetooth chip from Cambridge Silicon Radio has CHAR_BIT == 16. According to GCC source code, CHAR_BIT is 16 bits for 1750a, dsp16xx architectures, 24 bits for dsp56k architecture, and 32 bits for c4x architecture.

  3. C17 was a bug-fix version of C11. I am confident that this code would compile fine with a C11 compiler, and would suggest comparing __STDC_VERSION__ against 201112L. I also do not see any threads, atomics, _Generic, or other C11 features used. So it might also run fine with a C99 compiler.

If it also assumes big-endian, or little-endian, it can make use of ISO C23's __STDC_ENDIAN_NATIVE__, __STDC_ENDIAN_LITTLE__, and __STDC_ENDIAN_BIG__ from <stdbit.h>. Code can use #if defined(__has_include) && __has_include(<stdbit.h>) to check for its existence. Or use GNU C's __BYTE_ORDER..._ macros if available.

Enable more compiler warnings:

~/huffman (main*) » make                                                                 minty@minty-virtual-machine
gcc -O3 -std=c2x -g3 -ggdb -no-pie -Wall -Wextra -Warray-bounds -Werror -Wmissing-braces -Wpedantic -Wstrict-prototypes -Wwrite-strings -Winline -s -fno-builtin -fno-common -fno-omit-frame-pointer -fsanitize=float-cast-overflow -fsanitize=address -fsanitize=undefined -fsanitize=leak -Wformat-signedness -fanalyzer -c src/main.c -o build/main.o
gcc -O3 -std=c2x -g3 -ggdb -no-pie -Wall -Wextra -Warray-bounds -Wno-unused-value -Wno-type-limits -Werror -Wconversion -Wmissing-braces -Wno-parentheses -Wpedantic -Wstrict-prototypes -Wwrite-strings -Winline -s -Wno-discarded-qualifiers -fno-builtin -fno-common -fno-omit-frame-pointer -fsanitize=float-cast-overflow -fsanitize=address -fsanitize=undefined -fsanitize=leak -Wformat-signedness -Wsuggest-attribute=pure -Wsuggest-attribute=const -Wsuggest-attribute=noreturn -Wsuggest-attribute=cold -Wsuggest-attribute=malloc -Wsuggest-attribute=format -fanalyzer -c src/huffman.c -o build/huffman.o
src/huffman.c: In function ‘createDictionary’:
src/huffman.c:37:47: error: conversion from ‘int’ to ‘unsigned char’ may change value [-Werror=conversion]
   37 |         dictionary[node->symbol].codeLength = depth;
      |                                               ^~~~~
src/huffman.c: In function ‘readTree’:
src/huffman.c:109:33: error: conversion from ‘int’ to ‘unsigned char’ may change value [-Werror=conversion]
  109 |         node[currNode].symbol = bitBuffer->buffer[bitBuffer->bufferPosition]
      |                                 ^~~~~~~~~
src/huffman.c: In function ‘encode’:
src/huffman.c:167:32: error: conversion from ‘int’ to ‘unsigned char’ may change value [-Werror=conversion]
  167 |         unsigned char symbol = fgetc(input);
      |                                ^~~~~
src/huffman.c:179:42: error: conversion to ‘size_t’ {aka ‘long unsigned int’} from ‘int’ may change the sign of the result [-Werror=sign-conversion]
  179 |     node_t* nodes = calloc(2 * leafCount - 1, sizeof(node_t));
      |                            ~~~~~~~~~~~~~~^~~
src/huffman.c:185:39: error: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result [-Werror=sign-conversion]
  185 |     node_t** trees = malloc(leafCount * sizeof(node_t**));
      |                                       ^
src/huffman.c:198:30: error: conversion from ‘int’ to ‘unsigned char’ may change value [-Werror=conversion]
  198 |             nodesp->symbol = i;
      |                              ^
src/huffman.c:206:18: error: conversion to ‘size_t’ {aka ‘long unsigned int’} from ‘int’ may change the sign of the result [-Werror=sign-conversion]
  206 |     qsort(trees, leafCount, sizeof(node_t**), &compareNodes);
      |                  ^~~~~~~~~
src/huffman.c:247:23: error: comparison of integer expressions of different signedness: ‘int’ and ‘size_t’ {aka ‘long unsigned int’} [-Werror=sign-compare]
  247 |         for (i = 0; i < bytesRead; ++i) {
      |                       ^
src/huffman.c: In function ‘decode’:
src/huffman.c:324:22: error: conversion to ‘size_t’ {aka ‘long unsigned int’} from ‘long int’ may change the sign of the result [-Werror=sign-conversion]
  324 |     size_t filePos = ftell(input);
      |                      ^~~~~
src/huffman.c:326:24: error: conversion to ‘size_t’ {aka ‘long unsigned int’} from ‘long int’ may change the sign of the result [-Werror=sign-conversion]
  326 |     size_t bytesLeft = ftell(input);
      |                        ^~~~~
src/huffman.c:328:18: error: conversion to ‘long int’ from ‘size_t’ {aka ‘long unsigned int’} may change the sign of the result [-Werror=sign-conversion]
  328 |     fseek(input, filePos, SEEK_SET);
      |                  ^~~~~~~
At top level:
src/huffman.c:62:13: error: ‘printTree’ defined but not used [-Werror=unused-function]
   62 | static void printTree(node_t* node, int depth)
      |             ^~~~~~~~~
src/huffman.c: In function ‘encode’:
src/huffman.c:266:5: error: ‘trees’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
  266 |     free(trees);
      |     ^~~~~~~~~~~
src/huffman.c:268:5: error: ‘nodes’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
  268 |     free(nodes);
      |     ^~~~~~~~~~~
cc1: all warnings being treated as errors
make: *** [Makefile:76: build/huffman.o] Error 1

Lots of type conversions, sign conversions, truncations, uninitialized variables, et cetera.

These are easily fixed.


How to make your software work in unexpected ways 101:

First thing I did was make a note of all the calls to library functions the program makes with ltrace:

strcmp("encode", "help")                                                = -3
strcmp("encode", "encode")                                              = 0
fopen("src/huffman.c", "rb")                                            = 0x558bdd1d02a0
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 8192
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 2987
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 0
calloc(173, 32)                                                         = 0x558bdd1d1490
malloc(696)                                                             = 0x558bdd1d2a40
qsort(0x558bdd1d2a40, 87, 8, 0x558bdc1615d0)                            = <void>
malloc(8192)                                                            = 0x558bdd1d2d00
rewind(0x558bdd1d02a0, 0x7ffc4ade2cc0, 108, 3)                          = 1
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 8192
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 2987
fread(0x7ffc4ade44e0, 1, 8192, 0x558bdd1d02a0)                          = 0
fwrite("\v\225\244\262\\\301xL\255t\351jz\250L\022\211It\026\322\246\220\315\313\262fe<T\212L"..., 1, 6642, 0x7f361c33f780) = 6642
free(0x558bdd1d2d00)                                                    = <void>
free(0x558bdd1d2a40)                                                    = <void>
free(0x558bdd1d1490)                                                    = <void>
fclose(0x7f361c33f780)                                                  = 0
fclose(0x558bdd1d02a0)                                                  = 0
+++ exited (status 0) +++

From this list, fopen(), fread(), calloc(), and malloc() (among others) were of concern to me.

From these four functions, you did not check the return values of fwrite() and fread(), so we are going to make them fail intentionally with trip (The utility uses LD_PRELOAD to intercept libc function calls and "trip" them by immediately returning an error code.).

First we check if your code works correctly:

$ ./huffman encode src/huffman.c > encoded.txt
$ ./huffman decode encoded.txt > decoded.txt
$ cmp src/huffman.c decoded.txt
$ echo $?
0

Wonderful. Now let's make fread() fail:

$ trip fread ./huffman encode src/huffman.c encoded.txt
input file is empty%

That does not sound right. How did I compile the program if src/huffman.c was empty? Surely your program is lying. And pray, why has it ruined my prompt?

But let's be more lenient; fread() does not always fail:

$ trip fread:0.5 ./huffman encode src/huffman.c encoded.txt
input file is empty%

$ trip fread:0.5 ./huffman encode src/huffman.c encoded.txt
input file is empty%

$ trip fread:0.5 ./huffman encode src/huffman.c encoded.txt

We only asked fread() to fail 50% of the time this time, and it seemed to have succeeded after 2 tries. Now let us decode it:

$ ./huffman decode encoded.txt decoded.txt

No output, good! Surely that indicates successful termination and now cmp would return 0?

$ cmp src/huffman.c decoded.txt
cmp: EOF on decoded.txt after byte 8192, in line 282

What!? Looks like the encoding was unsuccessful, and the program did not even emit a warning, let alone fail, and now 1000 People Can Not Play Doom At Once.

fread() and fwrite() are not the only ones ignored. The result of tmpfile() also goes unchecked:

The tmpfile() function returns a stream descriptor, or NULL if a unique filename cannot be generated or the unique file cannot be opened.

Though you are over-complicating things by reading from stdin into memory, then writing from memory to a temporary file, and then reading from the temporary file into memory when required. It is more common, and efficient, to read the file directly into memory and work with that.

It would also be better to read in chunks (of say BUFSIZ) instead of reading byte-by-byte.

Lesson learned: Do not ignore errors unless you care naught about robustness or correctness. A fast program is of no good to anyone if it only works 50% of the times.

Do not subject users to cryptic and incomprehensible error messages:

I have never found make to output helpful error messages, especially non-GNU versions. Currently, if I run bmake (derived from NetBSD make(1)), I get:

$ bmake
gcc -O3 -pedantic-errors -std=c17 -o huffman
gcc: fatal error: no input files
compilation terminated.
*** Error code 1

I get that Linux by default uses GNU make, and that there is nothing wrong about using GNU extensions, but at least offer something better than "no input files".

One solution I can suggest is having two Makefiles, a "GNUmakefile" and a "Makefile", where "Makefile" has the following contents:

# Dummy Makefile to indicate that GNU make should be used.

.POSIX:
all huffman:
    @echo "You need GNU make to build huffman." 1>&2; false

Now when I run make on a BSD machine, or Solaris, or some other system that does not use GNU make by default, I get:

$ bmake
You need GNU Make to build huffman
*** Error code 1

Stop.
bmake: stopped in ....

which is objectively better than getting "no input files".

The first name GNU make checks for is "GNUmakefile", so when it is used, "GNUmakefile" would be chosen and "Makefile" would be ignored. But bmake would not look for "GNUmakefile", but would first try to open "makefile", and then "Makefile", which would result in it failing with our desired error message.

The trip tool shared above uses this method.

You do not require CFLAGS_DEBUG:

CFLAGS_DEBUG simply duplicates CFLAGS and adds some additional debugging flags. A better option would be to keep the common flags in CFLAGS and use target-specific assignments for debug and default:

CFLAGS := -pedantic-errors -std=c17

default: CFLAGS += -O3
default: $(OBJS)

debug: CFLAGS += -Og -g -fsanitize=address -Wall
debug: $(OBJS_DEBUG)

Note that the variable-assignment can be any valid form of assignment; recursive (‘=’), simple (‘:=’ or ‘::=’), immediate (‘::=’), appending (‘+=’), or conditional (‘?=’).

I also prefer each compiler option to be on a separate line:

CFLAGS := -pedantic-errors
CFLAGS := -std=c17
...

This makes it easy to quickly edit and comment/uncomment out individual options.

Install to ~/.local/bin when make is not run as root:

The trip utility installs to ~/.local/bin when make is run normally, and globally, targeting /usr/local when run as superuser.

It also has a nice bash script that checks whether the installation was successful, and prompts and automatically adds ~/.local/bin to .bashrc when installation failed because it was not in the PATH variable (after confirmation, of course).

Moreover, it is more common to see:

SRCDIR := src
BUILDDIR := build
$(BUILDDIR)/%.o: $(SRCDIR)/%.c

instead of:

SRCDIR := src/
BUILDDIR := build/
$(BUILDDIR)%.o: $(SRCDIR)%.c

# TARGET_DEBUG := huffman_debug

This should have been removed prior to code review.


We are missing the .DELETE_ON_ERROR: target. According to GNU make's documentation:

If .DELETE_ON_ERROR is mentioned as a target anywhere in the makefile, then make will delete the target of a rule if it has changed and its recipe exits with a nonzero exit status, just as it does when it receives a signal.


uninstall should also be a .PHONY target.

Currently, if I do:

$ touch uninstall
$ make uninstall
make: 'uninstall' is up to date.

make does absolutely nothing.


The default rule is usually called all.

Simplify:

// open output file, - for stdout
FILE* getOutput(char* filename)
{
    FILE* output;
    if (!strcmp(filename, "-")) {
        output = stdout;
    } else {
        output = fopen(filename, "wb");
    }
    return output;
}

is better written as:

// open output file, - for stdout
static FILE* getOutput(const char filename[static 1])
{
    return strcmp(filename, "-") == 0 ? stdout : fopen(filename, "wb");
}

The first static makes it only visible inside "main.c". Using the array notation with static 1 says two things:

  • filename shall not be a null pointer.
  • filename shall point to at least one char.

const says that filename is not modified anywhere in the function.

If this was my code, it would also have:

[[gnu::malloc, gnu::malloc(fclose, 1)]]

See: Common Function Attributes.

Say goodbye to PRI* macros in C23:

No longer do we need to use these macros for formatting the standard integer types from stdint.h. C23 has introduced:

wN Specifies that a following b, B, d, i, o, u, x, or X conversion specifier applies to an integer argument with a specific width where N is a positive decimal integer with no leading zeros (the argument will have been promoted according to the integer promotions, but its value shall be converted to the unpromoted type); or that a following n conversion specifier applies to a pointer to an integer type argument with a width of N bits. All minimum-width integer types (7.22.1.2) and exact-width integer types (7.22.1.1) defined in the header <stdint.h> shall be supported. Other supported values of N are implementation-defined.

wfN Specifies that a following b, B, d, i, o, u, x, or X conversion specifier applies to a fastest minimum-width integer argument with a specific width where N is a positive decimal integer with no leading zeros (the argument will have been promoted according to the integer promotions, but its value shall be converted to the unpromoted type); or that a following n conversion specifier applies to a pointer to a fastest minimum-width integer type argument with a width of N bits. All fastest minimum-width integer types (7.22.1.3) defined in the header <stdint.h> shall be supported. Other supported values of N are implementation-defined.

So %w32d for int32_t, %w64d for int64_t, and so on

Label lies:

    fclose(output);
cleanupOutput:
    fclose(input);

I would have expected cleanupOutput to pass output to fclose().

Unnecessary use of goto:

I am not against using goto, but this:

int main(int argc, char* argv[])
{
    int exitCode = 0;

    ...
    if (argc < 2) {
        exitCode = 1;
        goto cleanup;
    } else if (!strcmp(argv[1], "help")) {
        goto cleanup;
    ...
    } else {
        ...
        exitCode = 1;
        goto cleanup;
    }

    // get and validate INFILE and OUTFILE
    ...
    if (!input) {
        exitCode = 1;
        goto cleanup;
    }
    ...
    if (!output) {
        ...
        exitCode = 1;
        goto cleanupOutput;
    }

    // run the actual program
    exitCode = operation(input, output);

    fclose(output);
cleanupOutput:
    fclose(input);
cleanup:
    return exitCode;
}

is clearly over-engineered and somewhat obfuscated. output and input are only opened at the end of the function. Instead of jumping from the top of the function to cleanup section, it would be more straightforward to simply return EXIT_FAILURE or EXIT_SUCCESS. Utilizing goto would be justified if the files needed to be closed at multiple exit points, but in this case, it introduces avoidable complexity.

The function is written better, clearer, and shorter as:

int main(int argc, char* argv[])
{
    if (argc < 2) {
        return EXIT_FAILURE;
    } else if (!strcmp(argv[1], "help")) {
        return EXIT_SUCCESS;
    ...
    } else {
        return EXIT_FAILURE;
    }

    // get and validate INFILE and OUTFILE
    ...
    if (!input) {
        return EXIT_FAILURE;
    }
    ...
    if (!output) {
        fclose(input);
        return EXIT_FAILURE;
    }

    // run the actual program
    const int exitCode = operation(input, output);

    fclose(output);
    fclose(input);

    return exitCode;
}

Why it is important to check the return value of fclose()/close():

From comp.lang.c:

The fclose() call can fail, and should be error-checked just as assiduously as all the other file operations. Sounds pedantic, right? Wrong. In a former life, my company's product managed to destroy a customer's data by omitting a check for failure when closing a file. The sequence went something like (paraphrased):

stream = fopen(tempfile, "w"); 
if (stream == NULL) ... 
    while (more_to_write) 
        if (fwrite(buffer, 1, buflen, stream) != buflen) ... 
fclose (stream); 

/* The new version has been written successfully.  Delete 
 * the old one and rename. 
 */ 
remove (realfile); 
rename (tempfile, realfile); 

Of course, what happened was that fclose() ran out of disk space trying to write the last couple blocks of data, so the tempfile was truncated and unusable. And since the fclose() failure wasn't detected, the program went right ahead and destroyed the best extant version of the data in favor of the damaged version. And, as Murphy would have it, the victim in this particular incident was the person in charge of the customer's department, the person with authority to buy more of our product or replace it with a competitor's product -- and, natch, a person who was already unhappy with us for other reasons.

It'd be a stretch to ascribe all of the ensuing misery to this single omission, but it may be worth pointing out that both the customer and my former company have since vanished from the corporate ecology.

CHECK THOSE FAILURE CODES!

Though the author of the above post should not have removed the original file in the first place, but that is beside the point.

The fseek() + ftell() trick to determine the size in undefined behavior in both C and C++:

See: FIO19-C. Do not use fseek() and ftell() to compute the size of a regular file.

In plain ISO C, there is only one way to determine the size of a file which is guaranteed to work: To read the entire file from the start, until you encounter end-of-file, while counting the number of bytes read.

Or use POSIX's stat()/fstat(), which are also supported by Windows. And for platforms that do not support them, one can fall back to reading the file in chunks, which although inefficient, is the only alternative.

You can find an implementation of an io_fsize() here: io. It is distributed under the MIT License, so you can do whatever you desire with it. Though, I am not opposed to seeing credit.

Skip the extern keyword for functions:

For functions, this only tells the compiler that linkage is extern. As this is the default (you use the keyword static to indicate that a function is not bound using extern linkage) you do not need to use it explicitly.

It also makes the declarations longer when one starts using proper attribute specifier sequences, const/restrict/volatile qualifiers, and array notation for pointer parameters with the static keyword.

Code breaks strict-aliasing rules and hence invokes undefined behavior:

From Understanding Strict Aliasing:

"Strict aliasing is an assumption, made by the C (or C++) compiler, that dereferencing pointers to objects of different types will never refer to the same memory location (i.e. alias each other.)"

uint64_t symbolFrequency = *(uint64_t*)(bitBuffer.buffer + 2);

bitBuffer.buffer has type unsigned char *, and you are casting it to uint64_t *, and then dereferencing it.

The strict aliasing rule makes this setup illegal: dereferencing a pointer that aliases an object that is not of a compatible type or one of the other types allowed by C 2011 6.5 paragraph 71 is undefined behavior. See What is the strict aliasing rule? for very comprehensive explanations.

Program does not cater to extra arguments:

Passing 10 or 20 or more arguments to the program does not result in an error. You should also be checking that argc < 5.

And this code:

    char* inputPath;
    char* outputPath;
    switch (argc) {
    case 2:
        inputPath = "-";
        outputPath = "-";
        break;
    case 3:
        inputPath = argv[2];
        outputPath = "-";
        break;
    default:
        inputPath = argv[2];
        outputPath = argv[3];
        break;
    }

can be simplified to:

    const char const* inputPath  = argc == 2 ? "-" : argv[2]; 
    const char const* outputPath = argc != 4 ? "-" : argv[3];

The return value of fgetc() is an int:

unsigned char symbol = fgetc(input);

The prototype for fgetc() is:

int fgetc(FILE *stream);

And its documentation says:

fgetc() reads the next character from stream and returns it as an unsigned char cast to an int, or EOF on end of file or error.

See: Why must the variable used to hold getchar's return value be declared as int?.

stdint.h is not required:

In "huffman.c", code has:

#include <inttypes.h>
#include <stdint.h>

Yet the ISO C Standard states that <inttypes.h> includes <stdint.h>, so the latter can be elided.

C23 alternative to declaring a function pointer:

int (*operation)(FILE*, FILE*);

This has some downsides:

  1. Some people just find it hard to understand.
  2. It becomes outdated as soon as the prototype of the function it is pointing to changes.
  3. The return type and the type of the parameters are duplicated.

I present:

typeof(encode) *operation;

// Or
typeof(decode) *operation;

Normally, whenever a function designator (a function's name) is used in an expression, it decays to a function pointer to that function.

typeof is one of those exceptions to the "decay" rules of C (C23 6.3.2.1 §4), so if you give it a function type as operand, the result is a function type and not a function pointer type. @Lundin

See: Are these two function pointer declarations equivalent? for more.

Though this has some downsides too:

  1. It is not backwards-compatible.
  2. It would not work with C++. (Some people like to compile their code with both a C and a C++ compiler).

Assuming the code does not care about backwards-compatibility or having the code compile cleanly under C++, it is kosher, albeit new.

Be more forgiving:

At times I specify the same file (due to autocompletion) as input and output. If I do that with this program, I lose the data I wanted to encode, and also do not get the encoded data, because the program opens the output file with "wb" mode and overwrites it. Then when it tries to read from it, it sees an empty file.

In such cases, I tend to open it with "a" mode, do all the processing, and then when it comes to writing to it, truncate its length to 0 (with ftruncate(fileno(stream), 0), which is POSIX) first before writing. This way, even I lose the original data, I get what I used the program for in the first place. And for this specific program, I can just decode the encoded data and get the original file back.

You can also use C11's "x" mode with fopen() for it to fail if the file exists.

Do not use a string-literal as an initializer for a char *:

String-literals in C have type char [] - unlike C++ where they have type const char [] - but it is undefined behavior to modify them. As such, it is better use const char * instead of char *.

.*_t is reserved for typedefs by ISO POSIX Standard:

Use .*_T instead, or skip the suffix altogether. How is node_t any better than node?

Do not go out of your way to waste memory:

Yes, memory is cheap all, but do not allocate BUFSIZ items (which I have usually found to be 8K) when only 511 items are required according to the comments.

Allocate to the referenced object:

node_t* nodes = calloc(2 * leafCount - 1, sizeof(node_t));

The above has 2 problems:

  1. The type is duplicated.
  2. Some maintainence engineer might later change the type of nodes, but forget to also change the operand of sizeof.

Allocating to the referenced object instead of the type solves both of these problems:

node_t* nodes = calloc(2 * leafCount - 1, sizeof *nodes);

Note that sizeof does not require parentheses when the operand is an expression in lieu of a type.

Performance:

OP states:

The goal of the project is implementing the Huffman code in C (C17), whilst being fast or at least not completely slow, my first attempt needed more than 2 minutes to encode 1 GiB of data, now I am down to around 10 seconds.

Reading a 1 GiB file one byte at a time is bound to be slow. Your options are:

  1. As stated before, read it in chunks, anywhere from 4K-256K.
  2. Memory-map the file with mmap() for UNIX-like systems.

I'd suggest doing (2) when possible, and keeping (1) as a fallback.


Emit more descriptive and helpful error messages:

input = getInput(inputPath);
    if (!input) {
        fprintf(stderr, "can't open INFILE\n");
        exitCode = 1;
        goto cleanup;
    }

As a user, I would have no idea "why". Consider:

  1. Prefixing the error messages with the program name (use argv[0]) so that if the output is redirected/piped to another program, the user would know which program emitted this error.
  2. Checking errno to see if it was set, and using strerror() to get the error value's description as a string.

Sample code:

    if (errno = 0, input = getInput(inputPath), input == nullptr) {
        fprintf(stderr, "%s: error: unable to open INFILE: \"%s\": %s.\n",
                argv[0], inputPath, errno ? strerror(errno) : "unknown error");
        exitCode = 1;
        goto cleanup;
    }

We have to set errno to 0 because it might have already been set by some previous call. If getInput() failed and did not set errno, we might emit a misleading and incorrect error message.

For a file that does not exist, the above outputs:

huffman.exe: error: unable to open INFILE: "caksncas": No such file or directory.

which is objectively better than seeing "can't open INFILE".


See also: Declaring variables inside loops, good practice or bad practice?.

\$\endgroup\$
10
  • \$\begingroup\$ A lot of good advice in there. I’ve found that #if defined(__has_include) && __has_include(<stdbit.h>) only short-circuits as intended in GCC, and on other compilers, can give me an error that _has_include or _has_c_attribute is not defined. \$\endgroup\$
    – Davislor
    Commented Jun 30, 2024 at 1:20
  • \$\begingroup\$ Your comment that casting a char* to uint64_t* “violates the strict aliasing rules” is not, I believe, accurate. C allows a char* to alias any object of any type. The actual problem is that the pointer might not be correctly-aligned for a char64_t (which would cause the program to crash with SIGBUS on some machines I’ve coded for). \$\endgroup\$
    – Davislor
    Commented Jun 30, 2024 at 1:24
  • \$\begingroup\$ The correct way to move eight bytes of memory from a char* to a uint64_t would be memcpy(&symbolFrequency, bitBuffer.buffer + 2, sizeof(symbolFrequency)). You could move this into a phi function if you wanted to be able to write const uint64_t symbolFrequency = getFrequencyFromBuffer(bitBuffer);. Where the bitBuffer parameter is const char bitBuffer[static 10]. \$\endgroup\$
    – Davislor
    Commented Jun 30, 2024 at 1:28
  • \$\begingroup\$ And yes, I’d get an error that _has_include is undefined because I misspelled it. You get the idea. :) \$\endgroup\$
    – Davislor
    Commented Jun 30, 2024 at 1:31
  • \$\begingroup\$ Not sure why that list of suggested warnings has so many -Wnos. \$\endgroup\$
    – Davislor
    Commented Jun 30, 2024 at 2:11
6
\$\begingroup\$

This will be a somewhat higher-level review of the algorithms and approach to Huffman coding, I'll let other people worry about the intricate details of the code and the issues brought up in the comments.

My code was so slow because of the way how I wrote the code for a symbol to a file. Instead of creating a simple lookup table (as I do now), I searched the binary tree each time for each symbol.

OK, good fix. You can use a lookup table for decoding too (see below). Decoding Huffman codes doesn't need to be a slow tree-walk, that's the decoding algorithm that is explained in college and it serves as a proof for why Huffman codes are uniquely decodeable, but typically real decompressors don't use that algorithm.

// write n bits to buffer
// Has better performance than calling writeBit repeatedly
extern void writeBits(uint64_t bits, int nbits, bitBuffer_t* bitBuffer)

I agree that you should have this (as opposed to going through an interface to write one bit and calling that nbits times). The implementation is more complex than anything I've ever used in similar situations though. Typically I buffer up bits in an integer first and then I deal with whole bytes, instead of writing byte-unaligned bit-strings into the array directly. That sounds like an extra step, but simplifies the code and it's efficient. Appending a short bit-string to an integer is very simple, simpler than partially-overwriting an array of bytes.

Up to 7 bits may be "left in the bit-buffer" after removing all the complete bytes, so a symbol of up to 57 bits can be appended to a 64-bit buffer unconditionally without doing anything complicated, and symbols don't get that large (a Huffman tree that skewed also requires an extremely skewed input distribution and therefore an extremely large input file, see http://www.compressconsult.com/huffman/#maxlength )

writeTree(*trees, &bitBuffer);

Serializing the whole tree is OK, it works. You can save some space overhead in the compressed file by using canonical codes, which can be reconstructed based on only the code lengths. That can be much smaller to store, especially if you apply some compression here too, as for example DEFLATE does.

Also, while you can reconstruct the actual canonical Huffman tree from the code lengths, you don't need the tree in tree-shape if you implement efficient table-based decoding. Your decoding table can be initialized directly from the symbol/code pairs without building a proper tree.

The simplest table-based decoding approach relies on a table of length 2L where L is the longest possible symbol length. This is one reason why it is common (though not universal) to limit the symbol length in real life uses of Huffman coding. For each possible bit-pattern that you could have in an L-bit buffer, the decoding table gives the first symbol that can be read from the buffer and the associated code length. There are more advanced tricks that reduce the table size.

\$\endgroup\$
0
2
\$\begingroup\$

Be concise; be expressive

Code is a recipe expressing a series of operations on data values to achieve a goal.

It's ironic that this code whose purpose is to compress (and decompress) data should be so verbose. For reasons known to the author, this has resulted in snippets such as:

        // sort trees
        for (i = 0; i < treeCount - 1 && nodesp->frequency >= trees[treeCount - (i + 2)]->frequency;
             ++i) {
            trees[treeCount - (i + 1)] = trees[treeCount - (i + 2)];
        }

i and treeCount are both local variables inside the function.
frequency is a member of a structure.
trees[ ] is obviously an array of objects.
And, the unfortunate line break jars the reader's concentration, made worse by unnecessary braces.

        // open up a gap into which goes combo-node
        const size_t limit = tCnt - 1;
        for (i = 0; i < limit && nodesp->freq >= trees[limit - 1 - i]->freq; ++i)
            trees[limit - i] = trees[limit - i - 1];

I'm not sure that this suggestion is correct. It appears the OP's code is "shuffling" nodes outward ("expanding") to create a gap in the already sorted sequence for inserting a "combo-node" expected by the Huffman "tree" algorithm. A better version would merely search for the location in the array, then use memmove() to shift array elements.

Personal comment: My brain doesn't take easily to dealing with working from the back end of something, using negative offsets. It's a handicap of mine, I admit. It feels like this logic could be re-written to search forward through the array elements, then, after a simple calculation, memmove() the remainder to create the desired 'gap' element for insertion. If speed is the objective, finding that special location could be done with a binary search into the array. Just one perspective. Take care to avoid writing "off by one" bugs!


Consider using bufPos instead of bufferPosition and bitPos instead of bitPosition. Less to type; less to read; equally expressive.

The shorter token names can be quickly assimilated by the reader, helping code where they appear to accentuate operations being performed instead of repetitive text that must be written onto multiple lines:

typedef struct {
    unsigned char *buf;
    size_t bufSz;
    unsigned int bufPos;
    int bitPos;
    FILE *fp;
} BB;

Gone are the bad old days when every token was visible everywhere. Realise that variables local to functions are disposable. A single comment can explain that abc will be used within a 20 line function instead of a local variable named lower_case_alpha_string that is overly verbose.


One purpose; one file

The objective is Huffman en-/de-coding. My personal opinion is that the administrative overhead of using separate code files (with requisite header files) is, in this case, unwarranted. Good as an exercise, but requires more code and more complexity in design.

Consider merging all the code into a single file.


Being clever

main() makes an effort to parse its arguments, including "help".

Consider building to a binary named encode(.exe), then using whatever target OS magic "links" that same file to decode(.exe).

In this way, argv[1] is "wrapped-into" argv[0].

\$\endgroup\$

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.