My first C++ project: a hash set for a specific scenario where I only need inserts and containment checks.
This is not AI slop; everything except the avalanche function is hand-written. Please ruthlessly correct everything you dislike: idioms, naming, formatting, performance, safety, structure, references vs pointers vs copies. None of this should require actually thinking about hash sets, but if you're so inclined, I'd also appreciate algorithmic performance insights, such as what hash function I could use instead (although it's already about 3x faster than std::unordered_set on large sets, so I'm happy on that front).
The hash set is supposed to work well for small types (int / double), for large structs with dozens of fields, and for container types (which I'd store as pointers and for which I'd add a comparer argument), across all set sizes.
First, the hash set code, insert_only_hash_set.tpp:
#pragma once
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <functional>
#include <type_traits>
#include <vector>
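// Finalizer applied on top of std::hash: some standard libraries (e.g. libstdc++)
// hash integers to their own value, so without a bit mixer consecutive keys
// would map to consecutive buckets.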
static size_t avalanche(size_t h) {
if constexpr (sizeof(size_t) == 8) {
// 64-bit MurmurHash3 finalizer
h ^= h >> 33;
h *= 0xff51afd7ed558ccdULL;
h ^= h >> 33;
h *= 0xc4ceb9fe1a85ec53ULL;
h ^= h >> 33;
} else {
// 32-bit MurmurHash3 finalizer
h ^= h >> 16;
h *= 0x85ebca6bU;
h ^= h >> 13;
h *= 0xc2b2ae35U;
h ^= h >> 16;
}
return h;
}
template <typename T, std::size_t K, std::size_t M>
class InsertOnlyHashSet {
/* Allocates M * capacity buckets.
Stores K entries per bucket "inline", stores the rest in a linear allocator
(which gets compacted as needed) */
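/* Worked example: InsertOnlyHashSet<double, 1, 2> with capacity 10 allocates
M * capacity = 20 buckets, each holding K = 1 double inline, plus a spill
vector of capacity = 10 doubles for bucket overflow. */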
public:
explicit InsertOnlyHashSet(const size_t capacity)
: numElements(0),
capacity(capacity),
buckets(M * capacity),
spill(capacity),
spillHead(0)
{
static_assert(M > 0, "Template parameter M must be positive");
assert(capacity > 0);
}
static size_t myHash(const T& t) {
return avalanche(std::hash<std::remove_cv_t<T>>{}(t));
}
bool contains(const T& item) const {
auto thisHash = myHash(item);
size_t bucketIndex = thisHash % nBuckets();
auto& bucket = buckets[bucketIndex];
auto bucketSize = bucket.size;
auto& bucketValues = bucket.values;
if (bucketSize == 0) {
return false;
}
const auto inlineEnd = bucketValues.begin() + std::min(bucketSize, K);
if (std::find(bucketValues.begin(), inlineEnd, item) != inlineEnd) {
return true;
}
if (bucketSize <= K) {
return false;
}
auto spillStartIndex = bucket.spillIndex;
for (size_t offset = 0; offset < bucketSize - K; offset++) {
if (item == spill[spillStartIndex + offset]) {
return true;
}
}
return false;
}
bool insert(const T& item) {
if (numElements + 1 > capacity) {
reserve(capacity == 0 ? 1 : capacity * 2);
}
auto thisHash = myHash(item);
size_t index = thisHash % nBuckets();
auto& bucket = buckets[index];
auto bucketSize = bucket.size;
auto& values = bucket.values;
if (bucketSize < K) {
for (size_t i = 0; i < bucketSize; i++) {
if (item == values[i]) {
return false;
}
}
values[bucketSize] = item;
}
else {
for (size_t i = 0; i < K; i++) {
if (item == values[i]) {
return false;
}
}
auto spillStartIndex = bucket.spillIndex;
for (size_t offset = 0; K + offset < bucketSize; offset++) {
if (item == spill[spillStartIndex + offset]) {
return false;
}
}
reserveBucketSpill(index, bucketSize + 1);
spill[bucket.spillIndex + bucketSize - K] = item;
}
bucket.size = bucketSize + 1;
numElements++;
return true;
}
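// Grow by building a temporary set with the new capacity, reinserting every
// element, and adopting the temporary's storage.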
void reserve(size_t newCapacity) {
auto newHashSet = InsertOnlyHashSet<T, K, M>(newCapacity);
for (size_t j = 0; j < nBuckets(); j++){
auto& bucket = buckets[j];
auto bucketSize = bucket.size;
auto& bucketValues = bucket.values;
if (bucketSize < K) {
for (size_t i = 0; i < bucketSize; i++) {
newHashSet.insert(std::move(bucketValues[i]));
}
}
else {
for (size_t i = 0; i < K; i++) {
newHashSet.insert(std::move(bucketValues[i]));
}
auto spillStartIndex = bucket.spillIndex;
for (size_t offset = 0; K + offset < bucketSize; offset++) {
newHashSet.insert(std::move(spill[spillStartIndex + offset]));
}
}
}
buckets = std::move(newHashSet.buckets);
spill = std::move(newHashSet.spill);
spillHead = newHashSet.spillHead;
capacity = newHashSet.capacity;
}
size_t numElements;
size_t capacity;
private:
struct BucketEntry {
size_t size;
size_t spillIndex;
std::array<T, K> values;
BucketEntry() : size(0), spillIndex(0), values{} {}
};
std::vector<BucketEntry> buckets;
std::vector<T> spill;
size_t nBuckets() const { return buckets.size(); }
size_t nTotalSpill() const { return spill.size(); }
size_t spillHead;
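/* Makes room for (newSize - K) spill entries for bucket bucketIndex.
Three strategies, in order of preference:
1. this bucket's spill already ends at spillHead and there is room: extend in place;
2. there is room at the end of the spill area: move this bucket's spill there and abandon the old slots;
3. otherwise: compact every other bucket's spill to the front, then append this one. */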
void reserveBucketSpill(size_t bucketIndex, size_t newSize) {
auto& bucket = buckets[bucketIndex];
assert(newSize > bucket.size);
assert(bucket.size >= K);
if (bucket.spillIndex + bucket.size - K == spillHead && spillHead + (newSize - bucket.size) <= nTotalSpill()) {
// spill of this bucket is already at the end, and we can just extend it
spillHead += newSize - bucket.size;
}
else if (spillHead + (newSize - K) <= nTotalSpill()){
// spill of this bucket gets appended to the end and the old spill is abandoned
if constexpr (std::is_trivially_copyable_v<T>) {
std::memcpy(&spill[spillHead], &spill[bucket.spillIndex], (bucket.size - K) * sizeof(T));
} else {
for (size_t offset = 0; K + offset < bucket.size; offset++) {
spill[spillHead + offset] = std::move(spill[bucket.spillIndex + offset]);
}
}
bucket.spillIndex = spillHead;
spillHead += newSize - K;
}
else{
// we first compact the existing spill for all buckets but this one,
// then write the newly extended spill for this bucket to the end
auto newSpill = std::vector<T>(nTotalSpill());
size_t newHead = 0;
// move all other spills
for (size_t i = 0; i < nBuckets(); i++)
{
if (i != bucketIndex) {
if (auto& otherBucket = buckets[i]; otherBucket.size > K) {
if constexpr (std::is_trivially_copyable_v<T>) {
std::memcpy(&newSpill[newHead], &spill[otherBucket.spillIndex], (otherBucket.size - K) * sizeof(T));
} else {
for (size_t offset = 0; K + offset < otherBucket.size; offset++) {
newSpill[newHead + offset] = std::move(spill[otherBucket.spillIndex + offset]);
}
}
otherBucket.spillIndex = newHead;
newHead += otherBucket.size - K;
}
}
}
// append this bucket's spill at the compacted end
if constexpr (std::is_trivially_copyable_v<T>) {
std::memcpy(&newSpill[newHead], &spill[bucket.spillIndex], (bucket.size - K) * sizeof(T));
} else {
for (size_t offset = 0; K + offset < bucket.size; offset++) {
newSpill[newHead + offset] = std::move(spill[bucket.spillIndex + offset]);
}
}
bucket.spillIndex = newHead;
spillHead = newHead + (newSize - K); // newSize instead of bucket.size because we reserve for this bucket
spill = std::move(newSpill);
}
assert(spillHead <= nTotalSpill());
// should hold by design as long as we only ever request one more slot and
// capacity >= the number of insert calls (which can exceed numElements once we add drops)
}
};
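For context, here is roughly how I intend to use it with a larger struct; the Particle type and its std::hash specialization below are made-up placeholders for illustration, not part of the project:
#include <cstddef>
#include "insert_only_hash_set.tpp"
struct Particle {
    double x, y, z;
    int id;
    bool operator==(const Particle& other) const {
        return id == other.id;
    }
};
// illustrative hash: identity on the id field, post-mixed by the set's avalanche step
template <>
struct std::hash<Particle> {
    std::size_t operator()(const Particle& p) const noexcept {
        return static_cast<std::size_t>(p.id);
    }
};
int main() {
    // K = 2 inline slots per bucket, M = 2 buckets per unit of capacity
    InsertOnlyHashSet<Particle, 2, 2> set(100);
    set.insert({0.0, 0.0, 0.0, 1});
    return set.contains({0.0, 0.0, 0.0, 1}) ? 0 : 1;
}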
Next, the benchmarks: main.cpp:
#include <chrono>
#include <iostream>
#include <random>
#include <stdexcept>
#include <unordered_set>
#include "insert_only_hash_set.tpp"
template <typename... Args>
void print(const Args&... args) {
((std::cout << args << " "), ...);
std::cout << "\n";
}
template <typename F>
double timeFunction(F func) {
const auto start = std::chrono::high_resolution_clock::now();
func();
const auto end = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double>(end - start).count();
}
template <typename SetType>
void test(size_t n, size_t init) {
std::default_random_engine generator1;
std::default_random_engine generator1Clone;
std::default_random_engine generator2(13);
std::uniform_real_distribution<> distribution(0, 1);
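// generator1Clone replays exactly the sequence inserted via generator1;
// generator2 is seeded differently, so its values are (for random doubles,
// essentially certainly) not in the set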
for (size_t repeat = 0; repeat < 10'000'000 / n + 1; repeat++) {
SetType numbers(init);
for (size_t i = 0; i < n; i++) {
numbers.insert(distribution(generator1));
}
for (size_t i = 0; i < n; i++) {
if (!numbers.contains(distribution(generator1Clone))) {
throw std::runtime_error("should be contained");
}
if (numbers.contains(distribution(generator2))) {
throw std::runtime_error("should not be contained");
}
}
}
}
int main() {
for (auto i = 0; i < 2; i++) {
for (auto n : {10, 100, 1000, 100'000, 1'000'000}) {
for (auto init : {1, n}) {
print(i, n, init);
print("InsertOnlyHashSet", timeFunction([n, init]{test<InsertOnlyHashSet<double, 1, 2>>(n, init);}));
print("std::unordered_set", timeFunction([n, init]{test<std::unordered_set<double>>(n, init);}));
}
}
}
}
Finally, the benchmark results (times in seconds; run is the outer repetition, init the initial capacity):
run        n     init   InsertOnlyHashSet   std::unordered_set
  0       10        1            1.46363             1.46558
  0       10       10            0.86682             1.19881
  0      100        1            1.26                1.42473
  0      100      100            0.928371            1.30072
  0     1000        1            1.21812             1.6374
  0     1000     1000            0.845891            1.40202
  0   100000        1            2.43145             2.4168
  0   100000   100000            1.23032             2.04783
  0  1000000        1            3.32925             9.74492
  0  1000000  1000000            2.74033             7.56649
  1       10        1            1.47043             1.43015
  1       10       10            0.890216            1.19057
  1      100        1            1.25313             1.34393
  1      100      100            0.849175            1.22462
  1     1000        1            1.19655             1.53719
  1     1000     1000            0.863851            1.40414
  1   100000        1            2.11273             2.52363
  1   100000   100000            1.38269             2.08268
  1  1000000        1            3.27826             9.46702
  1  1000000  1000000            2.80797             7.20734