I just finished my first coding project. Prior to this I have only made small programs so I'd like some feedback on what to improve on and how I did in general. The whole code is 658 lines long.
The goal of the project is implementing the Huffman code in C (C17), whilst being fast or at least not completely slow, my first attempt needed more than 2 minutes to encode 1 GiB of data, now I am down to around 10 seconds. My code was so slow because of the way how I wrote the code for a symbol to a file. Instead of creating a simple lookup table (as I do now), I searched the binary tree each time for each symbol. Each node had all the containing symbols of their children and I checked whether my symbol was in the left or right node.
The code is on Github (and posted below): https://github.com/dryBoneMarrow/huffman.
I appreciate all tips so I can improve my coding skills.
Makefile:
CC := gcc
CFLAGS := -O3 -pedantic-errors -std=c17
CFLAGS_DEBUG := -Og -g -fsanitize=address -std=c17 -pedantic-errors -Wall
DEPS := main huffman bitHandler
BUILDDIR := build/
SRCDIR := src/
BINDIR := /usr/local/bin/
TARGET := huffman
TARGET_DEBUG := $(TARGET)
# TARGET_DEBUG := huffman_debug
OBJS := $(addprefix $(BUILDDIR), $(addsuffix .o, $(DEPS)))
OBJS_DEBUG := $(addprefix $(BUILDDIR)DEBUG_, $(addsuffix .o, $(DEPS)))
SOURCE := $(addprefix $(SRCDIR), $(addsuffix .c, $(DEPS)))
default: $(OBJS)
$(CC) $(CFLAGS) $(OBJS) -o $(TARGET)
# Debug is noticeably slower
debug: $(OBJS_DEBUG)
$(CC) $(CFLAGS_DEBUG) $(OBJS_DEBUG) -o $(TARGET_DEBUG)
clean:
rm -rf $(BUILDDIR) $(TARGET_DEBUG)
# Most likely needs admin rights
install: default
install -m 755 $(TARGET) $(BINDIR)$(TARGET)
uninstall:
rm -f $(BINDIR)$(TARGET)
$(BUILDDIR)%.o: $(SRCDIR)%.c
@mkdir -p $(BUILDDIR)
$(CC) $(CFLAGS) -c $< -o $@
$(BUILDDIR)DEBUG_%.o: $(SRCDIR)%.c
@mkdir -p $(BUILDDIR)
$(CC) $(CFLAGS_DEBUG) -c $< -o $@
.PHONY: default debug clean install
src/main.c:
#include "huffman.h"
#include <stdio.h>
#include <string.h>
void printHelp()
{
printf("\nEncodes and decodes data using huffman algorithm.\n\n"
"Usage:\n"
" huffman [encode|decode] INFILE OUTFILE\n"
" huffman [encode|decode] INFILE\n"
" huffman [encode|decode|help]\n\n"
"Note:\n"
" - may be used for stdin / stdout\n"
" When omitting INFILE/OUTFILE, stdin/stdout is used\n\n");
}
// open input file, - for stdin
FILE* getInput(char* filename)
{
FILE* input;
if (!strcmp(filename, "-")) {
// Both the encode and decode functions change the file pointer position which is not
// possible in stdin. That's why we copy the stdin into a temporary file first
int currChar;
input = tmpfile();
while ((currChar = fgetc(stdin)) != EOF) {
fputc(currChar, input);
}
rewind(input);
} else {
input = fopen(filename, "rb");
}
return input;
}
// open output file, - for stdout
FILE* getOutput(char* filename)
{
FILE* output;
if (!strcmp(filename, "-")) {
output = stdout;
} else {
output = fopen(filename, "wb");
}
return output;
}
int main(int argc, char* argv[])
{
int exitCode = 0;
// select operation
int (*operation)(FILE*, FILE*);
if (argc < 2) {
fprintf(stderr, "\nNot enough arguments\n");
printHelp();
exitCode = 1;
goto cleanup;
} else if (!strcmp(argv[1], "help")) {
printHelp();
goto cleanup;
} else if (!strcmp(argv[1], "encode")) {
operation = &encode;
} else if (!strcmp(argv[1], "decode")) {
operation = &decode;
} else {
printHelp();
exitCode = 1;
goto cleanup;
}
// get and validate INFILE and OUTFILE
FILE *input, *output;
char* inputPath;
char* outputPath;
switch (argc) {
case 2:
inputPath = "-";
outputPath = "-";
break;
case 3:
inputPath = argv[2];
outputPath = "-";
break;
default:
inputPath = argv[2];
outputPath = argv[3];
break;
}
input = getInput(inputPath);
if (!input) {
fprintf(stderr, "can't open INFILE\n");
exitCode = 1;
goto cleanup;
}
output = getOutput(outputPath);
if (!output) {
fprintf(stderr, "can't open OUTFILE\n");
exitCode = 1;
goto cleanupOutput;
}
// run the actual program
exitCode = operation(input, output);
fclose(output);
cleanupOutput:
fclose(input);
cleanup:
return exitCode;
}
src/huffman.h:
#ifndef HUFFMAN_H
#define HUFFMAN_H
#include <stdio.h>
extern int encode(FILE* input, FILE* output);
extern int decode(FILE* input, FILE* output);
#endif
src/huffman.c:
#include "huffman.h"
#include "bitHandler.h"
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Only works when node is initialized with 0
#define isLeaf(a) (!(a)->left)
typedef struct node {
uint64_t frequency;
unsigned char symbol;
struct node* left;
struct node* right;
} node_t;
// Dictionary is made of codeWords
typedef struct codeWord {
unsigned char codeLength;
uint64_t code;
} code_t;
// Sort nodes, used with qsort()
static int compareNodes(const void* a, const void* b)
{
if ((*(node_t* const*)a)->frequency > (*(node_t* const*)b)->frequency) return -1;
return 1;
}
// Create dictionary, dictionary[symbol] returns the corresponding code
// https://de.wikipedia.org/w/index.php?title=Huffman-Kodierung&stable=0#Erzeugen_des_Codebuchs
static void createDictionary(node_t* node, code_t* dictionary, uint64_t code, int depth)
{
if (isLeaf(node)) {
dictionary[node->symbol].code = code;
dictionary[node->symbol].codeLength = depth;
return;
}
// Writing code from left to right
code &= ~(1ull << (63 - depth));
createDictionary(node->left, dictionary, code, 1 + depth);
code |= 1ull << (63 - depth);
createDictionary(node->right, dictionary, code, 1 + depth);
}
// Write tree to file
// https://stackoverflow.com/a/759766/15833045
static void writeTree(node_t* node, bitBuffer_t* bitBuffer)
{
if (isLeaf(node)) {
writeBit(1, bitBuffer);
writeByte(node->symbol, bitBuffer);
return;
}
writeBit(0, bitBuffer);
writeTree(node->left, bitBuffer);
writeTree(node->right, bitBuffer);
}
// print representation of tree
static void printTree(node_t* node, int depth)
{
int i;
for (i = 0; i < depth; ++i) {
printf("│ ");
}
// Frequency is set if tree was generated but not if tree was read off a file
if (node->frequency) {
if (isLeaf(node)) {
printf("(%c: %" PRIuFAST64 ")\n", node->symbol, node->frequency);
return;
}
printf("(%" PRIu64 ")\n", node->frequency);
} else {
if (isLeaf(node)) {
printf("(%c)\n", node->symbol);
return;
}
printf("*\n");
}
printTree(node->left, depth + 1);
printTree(node->right, depth + 1);
}
// Read tree from file
int readTree(bitBuffer_t* bitBuffer, node_t* node, int* nodeNum)
{
// We don't have to care for a potential buffer overflow as the maximal theoretical depth of
// tree is 256, hence the maximum number of nodes 511 and it is assumed that the buffer in
// bitBuffer is >= 511
int currNode = *nodeNum;
++*nodeNum;
// bit is 1
if (bitBuffer->buffer[bitBuffer->bufferPosition] & 1 << (7 - bitBuffer->bitPosition)) {
// update bitBuffer
switch (bitBuffer->bitPosition) {
case 7:
bitBuffer->bitPosition = 0;
++bitBuffer->bufferPosition;
break;
default:
++bitBuffer->bitPosition;
}
// Assign symbol to leaf
node[currNode].symbol = bitBuffer->buffer[bitBuffer->bufferPosition]
<< bitBuffer->bitPosition
| bitBuffer->buffer[bitBuffer->bufferPosition + 1] >> (8 - bitBuffer->bitPosition);
++bitBuffer->bufferPosition;
return currNode;
}
// bit is 0
// update bitBuffer
switch (bitBuffer->bitPosition) {
case 7:
bitBuffer->bitPosition = 0;
bitBuffer->bufferPosition++;
break;
default:
bitBuffer->bitPosition++;
}
// Assign left and right to node
node[currNode].left = &node[readTree(bitBuffer, node, nodeNum)];
node[currNode].right = &node[readTree(bitBuffer, node, nodeNum)];
return currNode;
}
// Encode file
extern int encode(FILE* input, FILE* output)
{
int exitCode = 0;
//// Create the frequency table
// Buffering is used for better performance
int leafCount = 0;
uint64_t frequencyTable[256] = { 0 };
unsigned char buffer[BUFSIZ];
size_t bytesRead;
while ((bytesRead = fread(buffer, 1, BUFSIZ, input)) > 0) {
for (size_t i = 0; i < bytesRead; ++i) {
if (!frequencyTable[buffer[i]]) {
++leafCount;
}
++frequencyTable[buffer[i]];
}
}
// handle empty file
if (!leafCount) {
fprintf(stderr, "input file is empty");
exitCode = 1;
goto cleanup;
}
// handle file with one type of symbol (there's no huffman code because the huffman tree has no
// edges)
if (leafCount == 1) {
fputc(1 << 7, output);
rewind(input);
unsigned char symbol = fgetc(input);
uint64_t symbolCount = frequencyTable[symbol];
fputc(symbol, output);
fwrite(&symbolCount, sizeof(uint64_t), 1, output);
goto cleanupBuffer;
}
//// Create tree
// trees[] stores pointers to all subtrees
// nodes[] holds all the nodes of the trees (2 times the number of symbols minus one)
int i;
node_t* nodes = calloc(2 * leafCount - 1, sizeof(node_t));
if (!nodes) {
fprintf(stderr, "Couldn't allocate memory for nodes");
exitCode = 1;
goto cleanup;
}
node_t** trees = malloc(leafCount * sizeof(node_t**));
if (!trees) {
fprintf(stderr, "Couldn't allocate memory for trees");
exitCode = 1;
goto cleanupNode;
}
node_t* nodesp = nodes;
node_t** treesp = trees;
// Fill the nodes array with the leafs and reference them in the trees array
for (i = 0; i < 256; ++i) {
if (frequencyTable[i]) {
nodesp->symbol = i;
nodesp->frequency = frequencyTable[i];
*treesp = nodesp;
++treesp;
++nodesp;
}
}
// sort the tree array
qsort(trees, leafCount, sizeof(node_t**), &compareNodes);
int treeCount = leafCount;
// Connect subtrees (leafs) to the huffman tree
while (treeCount > 1) {
// create node
--treeCount;
nodesp->frequency = trees[treeCount]->frequency + trees[treeCount - 1]->frequency;
nodesp->left = trees[treeCount];
nodesp->right = trees[treeCount - 1];
// sort trees
for (i = 0; i < treeCount - 1 && nodesp->frequency >= trees[treeCount - (i + 2)]->frequency;
++i) {
trees[treeCount - (i + 1)] = trees[treeCount - (i + 2)];
}
// Add node to trees[]
trees[treeCount - (1 + i)] = nodesp;
++nodesp;
}
//// Create dictionary
code_t dictionary[256];
createDictionary(*trees, dictionary, 0, 0);
//// Write to outputFile
// Create bitBuffer, used to write bitwise to output file
bitBuffer_t bitBuffer = { malloc(BUFSIZ * sizeof(char)), BUFSIZ, 0, 0, output };
if (!bitBuffer.buffer) {
fprintf(stderr, "Couldn't allocate memory for buffer");
exitCode = 1;
goto cleanupTree;
}
// write tree to file
writeTree(*trees, &bitBuffer);
// write data to file
rewind(input);
while ((bytesRead = fread(buffer, sizeof(char), BUFSIZ, input)) > 0) {
for (i = 0; i < bytesRead; ++i) {
writeBits(dictionary[buffer[i]].code, dictionary[buffer[i]].codeLength, &bitBuffer);
}
}
// The last three bits show how many bits of the last byte are significant and hence should be
// decoded later
if (bitBuffer.bitPosition <= 5) {
uint64_t significantBits = (uint64_t)bitBuffer.bitPosition << 61;
writeBits(0, 5 - bitBuffer.bitPosition, &bitBuffer);
writeBits(significantBits, 3, &bitBuffer);
} else {
writeByte(0, &bitBuffer);
}
flush(&bitBuffer);
cleanupBuffer:
free(bitBuffer.buffer);
cleanupTree:
free(trees);
cleanupNode:
free(nodes);
cleanup:
return exitCode;
}
// decode file
extern int decode(FILE* input, FILE* output)
{
int exitCode = 0;
// Initialize bitBuffer to read bitwise later, buffersize has to be at least 511 (max size of
// tree in file)
bitBuffer_t bitBuffer = { malloc(BUFSIZ),
(bitBuffer.buffer) ? fread(bitBuffer.buffer, sizeof(char), BUFSIZ, input) : 0, 0, 0,
input };
// Check for failed malloc
if (!bitBuffer.buffer) {
fprintf(stderr, "Couldn't allocate memory\n");
exitCode = 1;
goto cleanup;
}
// check if INFILE is empty
if (!bitBuffer.bufferSize) {
fprintf(stderr, "INFILE is empty\n");
exitCode = 1;
goto cleanupBuffer;
}
// Handle case when only one symbol is encoded
if (bitBuffer.buffer[0] & (1 << 7)) {
// File has to have 10 Bytes assuming uint64_t is 8 bytes
if (bitBuffer.bufferSize != 10) {
fprintf(stderr, "INFILE has invalid format\n");
exitCode = 1;
goto cleanupBuffer;
}
int currChar = bitBuffer.buffer[1];
uint64_t i;
uint64_t symbolFrequency = *(uint64_t*)(bitBuffer.buffer + 2);
for (i = 0; i < symbolFrequency; i++) {
fputc(currChar, output);
}
goto cleanupBuffer;
}
// Read tree from file
node_t tree[511] = { 0 };
int nodeNum = 0;
readTree(&bitBuffer, tree, &nodeNum);
// Decode data using the tree
// get filesize
size_t filePos = ftell(input);
fseek(input, 0, SEEK_END);
size_t bytesLeft = ftell(input);
bytesLeft -= bitBuffer.bufferPosition;
fseek(input, filePos, SEEK_SET);
node_t* currNode = tree;
// Read bit for bit and follow tree, write character if leaf is reached and begin from root of
// tree
while (bitBuffer.bufferSize > 0) {
while (bitBuffer.bufferPosition < bitBuffer.bufferSize && bytesLeft > 1) {
for (; bitBuffer.bitPosition < 8; bitBuffer.bitPosition++) {
if (bitBuffer.buffer[bitBuffer.bufferPosition] & 1 << (7 - bitBuffer.bitPosition))
currNode = currNode->right;
else
currNode = currNode->left;
if (isLeaf(currNode)) {
fputc(currNode->symbol, output);
currNode = tree;
}
}
++bitBuffer.bufferPosition;
bitBuffer.bitPosition = 0;
--bytesLeft;
}
bitBuffer.bufferSize = fread(bitBuffer.buffer, sizeof(char), BUFSIZ, input);
if (bitBuffer.bufferSize != 0) {
bitBuffer.bufferPosition = 0;
}
}
// Read bits of last byte, the number of significant bits is stored in the last three bits
for (; bitBuffer.bitPosition < (bitBuffer.buffer[bitBuffer.bufferPosition] & 7);
++bitBuffer.bitPosition) {
if (bitBuffer.buffer[bitBuffer.bufferPosition] & 1 << (7 - bitBuffer.bitPosition))
currNode = currNode->right;
else
currNode = currNode->left;
if (isLeaf(currNode)) {
fputc(currNode->symbol, output);
currNode = tree;
}
}
cleanupBuffer:
free(bitBuffer.buffer);
cleanup:
return exitCode;
}
src/bitHandler.h:
// bitHandler provides functions to write single bits to a file
#ifndef BITHANDLER_H
#define BITHANDLER_H
#include <stdint.h>
#include <stdio.h>
// bitBuffer_t contains all necessary infos to read / write bits from / to a file
typedef struct bitBuffer {
unsigned char* buffer;
size_t bufferSize;
unsigned int bufferPosition;
int bitPosition;
FILE* file;
} bitBuffer_t;
extern void writeBit(int bit, bitBuffer_t* bitBuffer);
extern void writeBits(uint64_t bits, int nbits, bitBuffer_t* bitBuffer);
extern void writeByte(unsigned char byte, bitBuffer_t* bitBuffer);
extern void flush(bitBuffer_t* bitBuffer);
#endif
src/bitHandler.c:
#include "bitHandler.h"
#include <stdint.h>
#include <stdio.h>
// write single bits to file
extern void writeBit(int bit, bitBuffer_t* bitBuffer)
{
// write bit
if (bit) {
bitBuffer->buffer[bitBuffer->bufferPosition] |= 1u << (7 - bitBuffer->bitPosition);
} else {
bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(1u << (7 - bitBuffer->bitPosition));
}
// update bitBuffer
if (bitBuffer->bitPosition == 7) {
++bitBuffer->bufferPosition;
if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
bitBuffer->bufferPosition = 0;
}
bitBuffer->bitPosition = 0;
} else {
++bitBuffer->bitPosition;
}
}
// write n bits to buffer
// Has better performance than calling writeBit repeatedly
extern void writeBits(uint64_t bits, int nbits, bitBuffer_t* bitBuffer)
{
//// Fill partially filled byte
// Clear part of the byte that is going to be written
bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(255 >> bitBuffer->bitPosition);
// Write the bits
bitBuffer->buffer[bitBuffer->bufferPosition] |= bits >> (56 + bitBuffer->bitPosition);
// Return if all bits have been written
if (nbits <= 8 - bitBuffer->bitPosition) {
bitBuffer->bitPosition += nbits;
return;
}
// Remove written bits
bits <<= 8 - bitBuffer->bitPosition;
++bitBuffer->bufferPosition;
// flush buffer if full
if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
bitBuffer->bufferPosition = 0;
}
nbits -= 8 - bitBuffer->bitPosition;
bitBuffer->bitPosition = nbits % 8;
//// Write as many bytes to buffer as needed
while (nbits >= 8) {
bitBuffer->buffer[bitBuffer->bufferPosition] = bits >> 56;
bits <<= 8;
++bitBuffer->bufferPosition;
nbits -= 8;
// flush buffer if full
if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
bitBuffer->bufferPosition = 0;
}
}
//// Write left over bits
bitBuffer->buffer[bitBuffer->bufferPosition] &= 0u;
bitBuffer->buffer[bitBuffer->bufferPosition] |= bits >> 56;
}
// write byte to file
extern void writeByte(unsigned char byte, bitBuffer_t* bitBuffer)
{
// Complete partially filled byte
bitBuffer->buffer[bitBuffer->bufferPosition] &= ~(0xFFu >> bitBuffer->bitPosition);
bitBuffer->buffer[bitBuffer->bufferPosition] |= byte >> bitBuffer->bitPosition;
++bitBuffer->bufferPosition;
// flush buffer if full
if (bitBuffer->bufferPosition == bitBuffer->bufferSize) {
fwrite(bitBuffer->buffer, sizeof(char), bitBuffer->bufferSize, bitBuffer->file);
bitBuffer->bufferPosition = 0;
}
// write left over bits
bitBuffer->buffer[bitBuffer->bufferPosition] = byte << (8 - bitBuffer->bitPosition);
}
// flush the remaining bits in buffer to file
extern void flush(bitBuffer_t* bitBuffer)
{
fwrite(bitBuffer->buffer, sizeof(char),
(bitBuffer->bitPosition == 0) ? bitBuffer->bufferPosition : bitBuffer->bufferPosition + 1,
bitBuffer->file);
bitBuffer->bufferPosition = 0;
bitBuffer->bitPosition = 0;
}
fseek()
+ftell()
hack is undefined behaviour in both C and C++. 3) You're ignoring all I/O errors. \$\endgroup\$