Here is my wheel. strmap - C string hash map.
Main goal - create usable and simple alternative to hcreate_r, hdestroy_r, hsearch_r GNU extensions.
I am looking for general review of my code. But any other suggestion and criticism (functionality, usability, code style, performance, memory usage, ...) are welcome.
Compile test&benchmark - g++ -O2 -Wall -Wextra -pedantic -o ht ht.cc strmap.c
strmap.h
/**
@file strmap.h
@brief STRMAP - simple alternative to hcreate_r, hdestroy_r, hsearch_r GNU extensions
@date 2021
@license Public Domain
*/
#ifndef _STRMAP_H
#define _STRMAP_H
typedef struct STRMAP strmap;
#ifdef __cplusplus
extern "C" {
#endif
/**
@brief Create a string map which can contain at least `size` elements
*/
strmap *sm_create(uint32_t size);
/**
@brief Retrieves user associated data for given key
*/
bool sm_lookup(strmap *sm, const char *key, void **data);
/**
@brief Insert key (if not exists) and user data
*/
bool sm_insert(strmap *sm, const char *key, const void *data);
/**
@brief Update user data for given key
*/
bool sm_update(strmap *sm, const char *key, const void *data);
/**
@brief Update user data for given key or insert if key not exists
*/
bool sm_upsert(strmap *sm, const char *key, const void *data);
/**
@brief Remove key
Based on
M. A. Kolosovskiy, "Simple implementation of deletion from open-address hash table"
https://arxiv.org/ftp/arxiv/papers/0909/0909.2547.pdf
*/
bool sm_remove(strmap *sm, const char *key);
/**
@brief Remove all keys
*/
void sm_clear(strmap *sm);
/**
@brief Return number of keys
*/
uint32_t sm_size(strmap *sm);
/**
@brief Remove all keys and free memory allocated for the map structure
*/
void sm_free(strmap *sm);
#ifdef __cplusplus
}
#endif
#endif
strmap.c
/**
@file strmap.c
@brief STRMAP - simple alternative to hcreate_r, hdestroy_r, hsearch_r GNU extensions
@date 2021
@license Public Domain
*/
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <assert.h>
#include "strmap.h"
#define MIN_SIZE 9
/**
Daniel Lemire, A fast alternative to the modulo reduction
https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
*/
#define FAST_REDUCE32(x, r) (uint32_t)(((uint64_t)(x) * (uint64_t)(r)) >> 32)
#ifdef USE_FAST_REDUCE
#define POSITION(x, r) FAST_REDUCE32((x), (r))
#else
#define POSITION(x, r) ((x) % (r))
#endif
#define DISTANCE(from, to, r) ((to) >= (from) ? (to) - (from) : (r) - ((from) - (to)))
struct SLOT {
const char *key; // C null terminated string
const void *data; // user data
uint32_t hash; // key hash value
};
struct STRMAP {
uint32_t capacity; // number of allocated slots
uint32_t size; // number of keys in map
struct SLOT *ht;
};
static uint32_t FNV_1a(const char *key);
static void compress(strmap *sm, struct SLOT *slot);
static struct SLOT * find(strmap *sm, const char *key, uint32_t hash);
strmap * sm_create(uint32_t size)
{
struct SLOT *ht;
struct STRMAP *sm;
uint32_t capacity;
capacity = 4 * ((size >= MIN_SIZE ? size : MIN_SIZE) / 3);
if (!(ht = (struct SLOT *)calloc(capacity, sizeof (struct SLOT)))) {
return NULL;
}
if ((sm = (struct STRMAP *)calloc(1, sizeof (struct STRMAP)))) {
sm->size = 0;
sm->capacity = capacity;
sm->ht = ht;
}
else {
free(ht);
}
return sm;
}
bool sm_insert(strmap *sm, const char *key, const void *data)
{
struct SLOT *slot;
uint32_t hash;
assert(sm);
assert(key);
if (sm->size == sm->capacity - 1) {
return false;
}
hash = FNV_1a(key);
slot = find(sm, key, hash);
if (!(slot->key)) {
slot->key = key;
slot->data = data;
slot->hash = hash;
++(sm->size);
return true;
}
return false;
}
bool sm_update(strmap *sm, const char *key, const void *data) {
struct SLOT *slot;
uint32_t hash;
assert(sm);
assert(key);
hash = FNV_1a(key);
slot = find(sm, key, hash);
if (slot->key) {
slot->data = data;
return true;
}
return false;
}
bool sm_upsert(strmap *sm, const char *key, const void *data) {
struct SLOT *slot;
uint32_t hash;
assert(sm);
assert(key);
hash = FNV_1a(key);
slot = find(sm, key, hash);
if (slot->key) {
slot->data = data;
return true;
}
if (sm->size == sm->capacity - 1) {
return false;
}
slot->key = key;
slot->data = data;
slot->hash = hash;
++(sm->size);
return true;
}
bool sm_lookup(strmap *sm, const char *key, void **data)
{
struct SLOT *slot;
uint32_t hash;
assert(sm);
assert(key);
hash = FNV_1a(key);
slot = find(sm, key, hash);
if (slot->key) {
*data = (void *)slot->data;
return true;
}
return false;
}
bool sm_remove(strmap *sm, const char *key)
{
struct SLOT *slot;
uint32_t hash;
assert(sm);
assert(key);
hash = FNV_1a(key);
slot = find(sm, key, hash);
if (slot->key) {
slot->key = 0;
--(sm->size);
compress(sm, slot);
return true;
}
return false;
}
void sm_clear(strmap *sm)
{
assert(sm);
for (uint32_t i = 0; i < sm->capacity; ++i) {
(sm->ht)[i].key = 0;
}
sm->size = 0;
}
uint32_t sm_size(strmap *sm)
{
assert(sm);
return sm->size;
}
void sm_free(strmap *sm)
{
assert(sm);
free(sm->ht);
free(sm);
}
/*
private static functions
*/
/*
find slot with given key and hash in collision chain or return first empty
*/
struct SLOT * find(strmap *sm, const char *key, uint32_t hash)
{
struct SLOT *slot, *htEnd;
slot = sm->ht + POSITION(hash, sm->capacity);
htEnd = sm->ht + sm->capacity;
while (slot->key) {
if (hash == slot->hash) {
if (!strcmp(key, slot->key)) {
return slot;
}
}
if (++slot == htEnd) {
slot = sm->ht;
}
}
return slot;
}
void compress(strmap *sm, struct SLOT *slot)
{
struct SLOT *empty = slot, *htEnd;
struct SLOT *root;
htEnd = sm->ht + sm->capacity;
if (++slot == htEnd) {
slot = sm->ht;
}
while (slot->key) {
root = sm->ht + POSITION(slot->hash, sm->capacity);
if (DISTANCE(root, slot, sm->capacity) >= DISTANCE(empty, slot,
sm->capacity)) {
// swap current slot with empty
empty->key = slot->key;
empty->data = slot->data;
empty->hash = slot->hash;
slot->key = 0;
empty = slot;
}
if (++slot == htEnd) {
slot = sm->ht;
}
}
}
#define FNV_PRIME 16777619UL
#define FNV_OFFSET 2166136261UL
uint32_t FNV_1a(const char *key)
{
uint32_t h = FNV_OFFSET;
while (*key) {
h ^= (unsigned char)*key;
h *= FNV_PRIME;
++key;
}
return h;
}
#undef FNV_PRIME
#undef FNV_OFFSET
ht.cc
#include <algorithm>
#include <vector>
#include <unordered_set>
#include <iostream>
#include <chrono>
#define USE_FAST_REDUCE 1
#include "strmap.h"
typedef std::chrono::high_resolution_clock Clock;
using namespace std;
int main()
{
string str = "abcdefghijklmnopqrstuvwxyz";
string xstr = "Zbcdefghijklmnopqrstuvwxyz";
std::chrono::duration<double> elapsed;
// keys to insert
vector<string> keys;
// not existing keys
vector<string> xkeys;
// key value
int val = 1551;
// key value for update and upsert
int uval = 7117;
// pointer for lookup
int *data;
strmap *ht = sm_create(3700000);
auto t1 = Clock::now();
for (int i=0; i<3700000; i++) {
random_shuffle(str.begin(), str.end());
keys.push_back(str);
}
auto t2 = Clock::now();
elapsed = t2 - t1;
cout << "Generate keys: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
random_shuffle(xstr.begin(), xstr.end());
xkeys.push_back(xstr);
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Generate not existing keys: " << elapsed.count() << '\n';
cout << "*** strmap test ***\n";
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (!sm_insert(ht, keys[i].c_str(), &val)) {
cout << "Error: " << keys[i].c_str() << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Insert: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (!sm_lookup(ht, keys[i].c_str(), (void **)&data)) {
cout << "Error: " << keys[i] << '\n';
break;
}
if (*data != 1551) {
cout << "Error: " << keys[i] << " Data:" << *data << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup existing: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (!sm_update(ht, keys[i].c_str(), &uval)) {
cout << "Error: " << keys[i].c_str() << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Update existing: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (!sm_lookup(ht, keys[i].c_str(), (void **)&data)) {
cout << "Error: " << keys[i] << '\n';
break;
}
if (*data != 7117) {
cout << "Error: " << keys[i] << " Data:" << *data << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup updated: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (sm_lookup(ht, xkeys[i].c_str(), (void **)&data)) {
cout << "Error: " << xkeys[i] << '\n'; ;
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup not existing: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3000000; i++) {
if (!sm_remove(ht, keys[i].c_str()))
cout << keys[i];
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Remove: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3000000; i++) {
if (!sm_upsert(ht, keys[i].c_str(), (void *)&uval)) {
cout << "Error: " << keys[i].c_str() << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Upsert removed: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
if (!sm_lookup(ht, keys[i].c_str(), (void **)&data)) {
cout << "Error: " << keys[i] << '\n';
break;
}
if (*data != 7117) {
cout << "Error: " << keys[i] << " Data:" << *data << '\n';
break;
}
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup: " << elapsed.count() << '\n';
sm_free(ht);
cout << "*** STL unordered_set test ***\n";
unordered_set<string> strset;
strset.reserve(3700000);
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
strset.insert(keys[i]);
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Insert STL unordered_set: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
strset.find(keys[i]);
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup existing STL unordered_set: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3700000; i++) {
strset.find(xkeys[i]);
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Lookup not existing STL unordered_set: " << elapsed.count() << '\n';
t1 = Clock::now();
for (int i=0; i<3000000; i++) {
strset.erase(keys[i]);
}
t2 = Clock::now();
elapsed = t2 - t1;
cout << "Remove: " << elapsed.count() << '\n';
}