1
\$\begingroup\$

Code: https://github.com/Loki-Astari/Puzzle/tree/master/HUF

Challenge: https://codingchallenges.fyi/challenges/challenge-huffman

huf.cpp

#include "Huffman.h"

#include <iostream>
#include <fstream>
#include <string>

using ThorsAnvil::Puzzle::HuffmanEncoder;
using ThorsAnvil::Puzzle::HuffmanDecoder;

int main(int argc, char* argv[])
{
    if (argc != 3) {
        std::cerr << "Usage: huf [+-] <filename>\n";
        return 1;
    }
    if (argv[1][0] != '+' && argv[1][0] != '-') {
        std::cerr << "Usage: huf [+-] <filename>\n";
        return 1;
    }
    std::ifstream   file(argv[2]);
    if (!file) {
        std::cerr << "File: " << argv[2] << " could not be opened\n";
        return 1;
    }

    if (argv[1][0] == '+') {
        HuffmanEncoder  encoder;
        if (encoder.buildTree(file)) {
            std::string     outName(std::string(argv[2]) + ".huf");
            std::ofstream   out(outName);
            if (!out) {
                std::cerr << "File: " << outName << " can not be opened for output\n";
                return 1;
            }
            file.clear();
            file.seekg(0);
            encoder.exportTree(out);
            encoder.encode(file, out);
            return 0;
        }
    }
    else {
        HuffmanDecoder  decoder;
        if (decoder.buildTree(file)) {
            std::string     outName(std::string(argv[2]) + ".dec");
            std::ofstream   out(outName);
            if (!out) {
                std::cerr << "File: " << outName << " can not be opened for output\n";
                return 1;
            }
            decoder.decode(file, out);
            return 0;
        }
    }
    return 1;
} 

Huffman.h

#ifndef THORSANVIL_PUZZLE_HUFFMAN_H
#define THORSANVIL_PUZZLE_HUFFMAN_H

#include <cstddef>
#include <iostream>
#include <utility>
#include <queue>
#include <vector>
#include <functional>
#include <algorithm>
#include <memory>


namespace ThorsAnvil::Puzzle
{

class Huffman
{
    protected:
    struct Node
    {
        std::size_t     cost        = 0;
        bool            dynamic     = false;
        // Leaf Nodes
        // These will have either eof set to true or a letter value.
        bool            eof         = false;
        unsigned char   letter      = '\0';
        // Leaf nodes we calculate the representation of the leaf in the file.
        // The size is the number of bits needed.
        // The value (which fits in size bits) that represents this leaf node when
        // it is serialized into a file.
        std::size_t     size        = 0;
        std::size_t     value       = 0;
        // Non-Leaf nodes (don't use eof/letter)
        // Expected to have both left and right value;
        Node*           left        = nullptr;
        Node*           right       = nullptr;

        // Default constructor used to create automatic storage duration objects.
        // This is used in the code below to create an array of Node objects.
        // Note: This class is a private class that can only be used in
        //       HuffmanEncoder and HuffmanDecoder and it thus this restriction is observed.
        Node() = default;

        // This deletes a Huffman tree.
        // Note the encoder builds the tree using Nodes from an array.
        //      The nodes in the array are not dynamically allocated and thus should not
        //      be deleted.
        ~Node();

        // Build a non leaf node
        Node(std::size_t cost, Node* left, Node* right);

        // Used to read a Huffman tree dynamically from the head of a file.
        // The tree is serialized via exportTree() (see below)
        // The user is expected to check ok and eofMark are both true after is read
        Node(std::istream& str, std::size_t pSize, std::size_t pValue, bool& ok, bool& eofMark);

        // Encode a Huffman tree to a stream.
        void exportTree(std::ostream& str);

        // Once a Hoffman tree is built from scratch.
        // Call this function to calculate the representation of each letter.
        // Also calculate the size of the output file that will be generated.
        std::size_t setName();

        private:
            // Sets the value of each lead node and how it is represented.
            // Return data about the amount of data that will written to the output.
            //  first:  Number of bytes representing the Hoffman tree when encoding.
            //  second: Number of bits to represent the data of the file.
            using DataSize = std::pair<std::size_t, std::size_t>;
            DataSize setName(std::size_t pSize, std::size_t pValue);
    };
};

class HuffmanEncoder: public Huffman
{
    private:
        // Nodes[0-255] represent the ASCII char set.
        // Nodes[256]   represents the EOF character.
        // The constructor initializes this array correctly.
        Node                    count[257];

        // The root of a Hoffman tree.
        std::unique_ptr<Node>   root;

    public:
        HuffmanEncoder();

        bool buildTree(std::istream& input);

        // Export the Hoffman tree to the file.
        void exportTree(std::ostream& out);

        // using the Hoffman tree encode the input file to the output file.
        void encode(std::istream& in, std::ostream& out);

    private:
        // Encode a sing character 'c'
        // If this fills the 'currentVal' object then write to the file.
        void add(std::ostream& out, Node* count, std::size_t const& maxSize, std::size_t& currentLen, std::size_t& currentVal, int c);
};

class HuffmanDecoder: public Huffman
{
    private:
        std::unique_ptr<Node>   root;

    public:
        // Read the Huffman tree from the input stream.
        bool buildTree(std::istream& input);

        // Decode the input stream using the Hoffman stream place the output into out
        void decode(std::istream& in, std::ostream& out);
};

}

#endif

Huffman.cpp

#include "Huffman.h"

#include <cstddef>
#include <iostream>
#include <utility>
#include <queue>
#include <vector>
#include <functional>
#include <algorithm>
#include <memory>


using namespace ThorsAnvil::Puzzle;

Huffman::Node::~Node()
{
    if (left && !left->dynamic) {
        delete left;
    }
    if (right && !right->dynamic) {
        delete right;
    }
}

// Build a non leaf node
Huffman::Node::Node(std::size_t cost, Node* left, Node* right)
    : cost(cost)
    , dynamic(true)
    , left(left)
    , right(right)
{}

// Used to read a Huffman tree dynamically from the head of a file.
// The tree is serialized via exportTree() (see below)
// The user is expected to check ok and eofMark are both true after is read
Huffman::Node::Node(std::istream& str, std::size_t pSize, std::size_t pValue, bool& ok, bool& eofMark)
    : dynamic(true)
{
    char x = '-';
    x = str.get();
    switch (x)
    {
        case 'N':
            left    = new Node(str, pSize + 1, (pValue << 1) | 0x0, ok, eofMark);
            right   = new Node(str, pSize + 1, (pValue << 1) | 0x1, ok, eofMark);
            break;
        case 'C':
            letter  = str.get();
            size    = pSize;
            value   = pValue;
            break;
        case 'Z':
            eof     = true;
            eofMark = true;
            break;
        default:
            ok      = false;
    }
}

// Encode a Huffman tree to a stream.
void Huffman::Node::exportTree(std::ostream& str)
{
    if (left == nullptr && right == nullptr) {
        if (eof) {
            str << "Z";
        }
        else {
            str << "C" << letter;
        }
    }
    else {
        str << "N";
        left->exportTree(str);
        right->exportTree(str);
    }
}

// Once a Hoffman tree is built from scratch.
// Call this function to calculate the representation of each letter.
// Also calculate the size of the output file that will be generated.
std::size_t Huffman::Node::setName()
{
    DataSize size = setName(0, 0);

    // Size of the Huffman tree in bytes.
    std::size_t treeSize    = size.first;

    // size.second is the number of bits.
    // So Calculate the number of `std::size_t` objects needs to be written to the output stream.
    std::size_t streamCount = size.second / (sizeof(std::size_t) * 8);
    std::size_t streamRem   = size.second % (sizeof(std::size_t) * 8);
    std::size_t streamSize  = streamCount * sizeof(std::size_t) + (streamRem == 0 ? 0 : sizeof(std::size_t));

    // return the number of bytes that will be sent to the stream
    return treeSize + streamSize;
}

// Sets the value of each lead node and how it is represented.
// Return data about the amount of data that will written to the output.
//  first:  Number of bytes representing the Hoffman tree when encoding.
//  second: Number of bits to represent the data of the file.
Huffman::Node::DataSize Huffman::Node::setName(std::size_t pSize, std::size_t pValue)
{
    if (left == nullptr && right == nullptr) {
        size = pSize;
        value = pValue;
        if (eof) {
            return {1, pSize * cost};
        }
        else {
            return {2, pSize * cost};
        }
    }
    else {
        DataSize l = left->setName( pSize + 1, (pValue << 1) | 0x0);
        DataSize r = right->setName(pSize + 1, (pValue << 1) | 0x1);

        return {l.first + r.first + 1, l.second + r.second};
    }
}

HuffmanEncoder::HuffmanEncoder()
{
    for (int loop = 0; loop < 256; ++loop) {
        count[loop].letter = static_cast<unsigned char>(loop);
    }
    count[256].eof = true;
    count[256].cost = 1;
}

bool HuffmanEncoder::buildTree(std::istream& input)
{
    // Count the number of each character in a file.
    std::size_t charCount = 0;
    for (unsigned char c = input.get(); input; c = input.get()) {
        ++count[c].cost;
        ++charCount;
    }
    if (charCount == 0) {
        std::cerr << "File is Empty!\n";
        return false;
    }

    // Now build the Hoffman tree.
    // Here we are dynamically allocating nodes without protection.
    // So we need to build it inside a try/catch block so if there
    // is an issue it will be correctly tidied up.
    std::priority_queue<Node*, std::vector<Node*>, std::function<bool(Node*, Node*)>>   p1{[](Node* l, Node* r){return l->cost > r->cost;}};
    try
    {
        for (int loop = 0; loop < 257; ++loop) {
            if (count[loop].cost != 0) {
                p1.emplace(&count[loop]);
            }
        }
        while (p1.size() > 1) {
            Node*   a = p1.top();p1.pop();
            Node*   b = p1.top();p1.pop();

            Node*   r = new Node{a->cost + b->cost, a, b};
            p1.emplace(r);
        }
    }
    catch(...)
    {
        // There was an exception we need to handle cleanup.
        while (p1.size()) {
            Node* n = p1.top();
            p1.pop();
            if (n && n->dynamic) {
                delete n;
            }
        }
        // Re-throw the exception
        throw;
    }

    // We now have the tree correctly rooted.
    root.reset(p1.top());

    // Calculate the representation of all the leaf nodes.
    std::size_t cost = root->setName();
    if (cost > charCount) {
        std::cerr << "Compesssion does not make it smaller\n";
        return false;
    }
    return true;
}

// Export the Hoffman tree to the file.
void HuffmanEncoder::exportTree(std::ostream& out)
{
    root->exportTree(out);
}

// using the Hoffman tree encode the input file to the output file.
void HuffmanEncoder::encode(std::istream& in, std::ostream& out)
{
    std::size_t const   maxSize     = sizeof(std::size_t) * 8;
    std::size_t         currentLen  = 0;
    std::size_t         currentVal  = 0;

    // Add each of the characters one at a time.
    for (unsigned char c = in.get(); in; c = in.get()) {
        add(out, count, maxSize, currentLen, currentVal, c);
    }
    // Add the EOF marker.
    add(out, count, maxSize, currentLen, currentVal, 256);

    // If there is data left in the currentValue
    // Then push that to the stream.
    if (currentLen != 0) {
        std::size_t shift = maxSize - currentLen;
        currentVal = currentVal << shift;
        out.write(reinterpret_cast<char*>(&currentVal), sizeof(currentVal));
    }
}

// Encode a sing character 'c'
// If this fills the 'currentVal' object then write to the file.
void HuffmanEncoder::add(std::ostream& out, Node* count, std::size_t const& maxSize, std::size_t& currentLen, std::size_t& currentVal, int c)
{
    // Representation of the char 'c'
    Node& val = count[c];

    // The value that needs to be encoded.
    // This may fit into multiple values
    std::size_t writeLen = val.size;
    std::size_t writeVal = val.value;

    while (writeLen != 0) {
        // How much can we write into the current value.
        std::size_t outLen = std::min(writeLen, (maxSize - currentLen));

        // Add as many bits as we can from encoded 'c' into this value.
        currentVal = (currentVal << outLen) | (writeVal >> (writeLen - outLen));
        currentLen += outLen;

        // Remove what we have encode
        std::size_t mask = (1UL << (writeLen - outLen)) - 1;
        writeVal = writeVal & mask;
        writeLen -= outLen;

        // If we have filled up the current value write to the output file.
        if (currentLen == maxSize) {
            out.write(reinterpret_cast<char*>(&currentVal), sizeof(currentVal));
            currentVal = 0;
            currentLen = 0;
        }
    }
}

bool HuffmanDecoder::buildTree(std::istream& input)
{
    bool                    ok      = true;
    bool                    eofMark = false;
    std::unique_ptr<Node>   result  = std::make_unique<Node>(input, 0, 0, ok, eofMark);
    if (!ok || !eofMark) {
        return false;
    }
    root = std::move(result);
    return true;
}

// Decode the input stream using the Hoffman stream place the output into out
void HuffmanDecoder::decode(std::istream& in, std::ostream& out)
{
    std::size_t const   maxSize     = sizeof(std::size_t) * 8;
    std::size_t const   maskBase    = (1UL << (maxSize - 1));

    Node*               current     = root.get();

    bool finished = false;

    while (!finished) {
        // Read the next vallue from the input stream.
        std::size_t     currentValue;
        in.read(reinterpret_cast<char*>(&currentValue), sizeof(currentValue));

        // Loop over each of the bit this allows us to follow the Hoffman
        // tree to a leaf node and then output the value of the lead node.
        for (std::size_t currentMask = maskBase; currentMask; currentMask >>= 1) {

            // Get the next bit.
            bool branch = currentValue & currentMask;
            // Update the position in the Hoffman tree.
            current     = branch ? current->right : current->left;

            // If we are at a leaf node.
            if (current->left == nullptr && current->right == nullptr) {
                if (current->eof) {
                    finished = true;
                    break;
                }
                else {
                    out << current->letter;
                }
                // Reset the current point in the Hoffman tree to the root.
                current = root.get();
            }
        }
    }
}
\$\endgroup\$

2 Answers 2

1
\$\begingroup\$

main

Your main is a bit long. I would suggest that the code within the last conditional should be split out into two separate functions to make the whole thing a bit more manageable.

    if (argv[1][0] == '+') {
        huffmanEncode(...);
    }
    else {
        huffmanDecode(...);
    }
\$\endgroup\$
1
\$\begingroup\$

In various places there is an 1UL which looks like it's meant to be equally large as a size_t, but 1UL will be 32 bit on 64-bit windows and size_t will be 64-bit of course so this doesn't always match.

Since the file is built out of size_t's that have the fist symbol at the top bit and are written from the least significant byte up (unless you want to talk about endianness, not very productive in 2026), so the way the file is structured depends on the size of size_t (the first symbol ends up at either bit 31 or bit 63). There is no direct compatibility between 64-bit and 32-bit. If the chunks were filled from the bottom up it wouldn't matter that much, two successive 32-bit chunks would also be the corresponding 64-bit chunk. Well it still matters at the end of the file, an odd number of 32-bit chunks would break a 64-bit decoder, but that's a small change to fix. Of course, you can just decide not to care about 32-bit machines and then it's not a concern.

Exactly filling up a size_t and then writing it out is one way, alternatively you could buffer up the bits until you at least (instead of aiming for an exact count) 8 or 32 and then write those out. That's more write calls, but the advantage is that you can ensure that a code can always be appended to the buffer in one go, there's no messing around with partial codes. Theoretically that also involves setting a limit on the code length, which you may also want to do for decoding purposes. I'm not necessarily saying that this alternative is better, but it exists and you can consider it.

Taking a byte at the time out of the bit buffer also means endianness is no longer a factor.

// Now build the Hoffman tree.
// Here we are dynamically allocating nodes without protection.

It's Huffman btw (also in some other places) but more importantly here's a different idea that you can use if you want: create an array of nodes up front, then use pointers to those nodes. Tree-building wouldn't need to worry about dynamic allocation or exceptions because the nodes already exist and the HuffmanEncoder owns them in bulk (or directly contains them, if that's how you do it). You also wouldn't need a recursive tree destroyer.

Instead of serializing the Huffman tree to the file, you could also do one these:

  • Store the symbol counts. As long as your tree generation is deterministic, which it should be, then the same tree will be re-generated from it.
  • Store only the length of each symbol. If you use the canonical Huffman code then that's all you need to recreate the tree - or the decoding table directly, without building a tree. This is what DEFLATE and bzip2 do, but they also add extra compression on top of this table of symbol lengths.

Speaking of decoding tables, you could use some table-based decoding algorithm instead of bit-by-bit decoding, which would be a lot more efficient. The simplest technique involves setting a length limit on your Huffman codes of k (say, 15 or 16, see DELFATE for example), and building a table of size 2k. I can go into more if you care, otherwise it would be waste to type it all out.

Apparently I had written about it in some detail in another review. To recap, you can think of a normal Huffman tree this way: each internal node has two children, indexed by a single bit, decoding 1 bit per level. You could imagine making the tree flatter and wider by having 4 children per node: the four grand-children of the original node (if there were only 3, or even only 2, then some nodes need to be duplicated), indexed by a 2-bit number. That tree could be used to decode 2 bits per level, but that's always an even number of bits, so you would need to store the code length and only advance through your bit-stream by as many bits as the code length, instead of as many bits as were used to do the lookup. If this process of making the tree wider and shallower is continued until the "tree" has a branching factor of 2k, it only has a depth of 1. Then decoding is only a single step, and the "tree" is just a flat array.

That array can be formed from the list of symbol,code pairs (independently of the Huffman tree - if you use the canonical Huffman code, the decoder does not need to reconstruct the actual tree) by, for each pair of symbol,code, writing symbol,m (where m is the code length) into every entry of the table decodeTable[(code << (k - m)) | padding] for every m-bit value of padding, or the code and the padding may need to be swapped depending on your bit order. This order puts the padding at the bottom of the index, which is what you need if your k-bit window has the first code aligned to the most significant bit of the window. If you have the first code aligned to the bottom of the k-bit window, the padding needs to be at the top.

More advanced techniques can reduce the size of the table. For example if you pick the canonical Huffman code in which long codes have zeroes as their most-significant bits, the decoder can use a usually-efficient leading-zero-count operation to pick a sub-table, each sub-table then only needs to decode the part of the Huffman code that comes after the leading zeroes.

\$\endgroup\$
4
  • \$\begingroup\$ Serializing the tree seems to provide a very compact representation. Also interested in decode efficiency. \$\endgroup\$ Commented 8 hours ago
  • \$\begingroup\$ Changed std::size_t to std::uint64_t so we know the size of the object. Fixed the 1UL Changed to std::uint64_t{1} \$\endgroup\$ Commented 7 hours ago
  • \$\begingroup\$ Endianess is a problem I will noodle on that. \$\endgroup\$ Commented 7 hours ago
  • \$\begingroup\$ @LokiAstari the serialized tree has at least 2 bytes per symbol (counting only the leaves for convenience), you can reduce that to less than a byte per symbol (plus some overhead which I'm ignoring for convenience but it doesn't ruin the comparison). DEFLATE has both run-length encoding for its table of code lengths (which already are only numbers in 0..15 so they would fit 2-per-byte without serious compression) and another layer of Huffman coding on top \$\endgroup\$ Commented 5 hours ago

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.