SPSC templatized lock free queue implementation and benchmarking

Question

I have tried implemented a fast lock free Single producer single consumer queue in C++ for practicing concurrency and low level design concepts. I also benchmarked the code on x86_64 2.3GHz Intel 4core CPU. Please review the benchmark code (inspired from moodycamel) and Queue implementation for correctness:

lfqueue.h

#include <atomic>
#include <cstddef>
#include <unistd.h>
#include <cassert>
#include <limits>

#define CACHE_LINE_SIZE 64

template<typename T>
class LFQueue{
public:
    LFQueue(std::size_t size) : write_id(0), read_id(0), capacity_(size) {
        buf = new T[capacity_];
    }

    ~LFQueue() { delete[] buf;}

    template<typename... Args>
    bool write(Args... args){
        auto curr_read = read_id.load(std::memory_order_acquire);
        auto curr_write = write_id.load(std::memory_order_relaxed);

        if(__builtin_expect(curr_write + 1 == curr_read, 0)){
            return false;
        }
        new (&buf[curr_write]) T(std::forward<Args>(args)...);
        curr_write++;
        if(__builtin_expect(curr_write == capacity_, 0)){
            curr_write = 0;
        }
        write_id.store(curr_write, std::memory_order_release);
        return true;
    }

    bool read(T &val){
        auto curr_write = write_id.load(std::memory_order_acquire);
        auto curr_read = read_id.load(std::memory_order_relaxed);

        if(__builtin_expect(curr_read == curr_write, 0))
            return false;
        
        val = std::move(buf[curr_read]);
        curr_read++;
        if(__builtin_expect(curr_read == capacity_, 0)){
            curr_read = 0;
        }
        read_id.store(curr_read, std::memory_order_release);
        return true;
    }

    T* frontPtr() {
        auto curr_read = read_id.load(std::memory_order_acquire);
        auto curr_write = write_id.load(std::memory_order_acquire);
        if(curr_read == curr_write)
            return nullptr;
        return buf + curr_read;
    }
    void popFront() {
        auto curr_read = read_id.load(std::memory_order_acquire);
        auto curr_write = write_id.load(std::memory_order_acquire);
        if(curr_read != curr_write){
            curr_read++;
            if(curr_read == capacity_)
                curr_read = 0;
            read_id.store(curr_read, std::memory_order_release);
        }
    }
    bool isEmpty() const {
        return write_id.load(std::memory_order_acquire) == read_id.load(std::memory_order_acquire);
    }
    bool isFull() const {
        return write_id.load(std::memory_order_acquire) + 1 == read_id.load(std::memory_order_acquire);
    }
    size_t size() const {
        auto curr_read = read_id.load(std::memory_order_acquire);
        auto curr_write = write_id.load(std::memory_order_acquire);
        if(curr_write >= curr_read){
            return curr_write - curr_read;
        }
        return curr_write + capacity_ - curr_read;
    }
    size_t capacity() const { return capacity_;}

private:
    alignas(CACHE_LINE_SIZE) std::atomic_size_t write_id;
    alignas(CACHE_LINE_SIZE) std::atomic_size_t read_id;
    alignas(CACHE_LINE_SIZE) const std::size_t capacity_;
    alignas(CACHE_LINE_SIZE) T* buf;

    __always_inline bool is_capacity(size_t id){
        return (id & capacity_) && !(id & (capacity_ - 1));
    }
};

benchmark_helpers.h

#include <array>
#include <cstddef>
#include <cstdint>
#include <pthread.h>
#include <random>
#include <stdexcept>
#include <x86intrin.h>

constexpr int QUEUE_SIZE = 1000000;
constexpr int LIMIT = QUEUE_SIZE;
constexpr size_t FREQ = 2300000000;
static std::array<bool, LIMIT> rnd_vals1{}, rnd_vals2{};

#define MAIN_THREAD_CPU 0
#define TEST_THREAD_CPU 1
#define PRODUCER_THREAD_CPU 2
#define CONSUMER_THREAD_CPU 3

namespace detail {
    void pin_thread_to_cpu(int cpu_id) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(cpu_id, &cpuset);

        int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
        if (rc != 0) {
            throw std::runtime_error("Error setting thread affinity");
        }
    }

    inline uint64_t rdtsc(){
        return __rdtsc();
    }

    void init_rand_array(std::array<bool, LIMIT> &rnd_vals){
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_int_distribution<int> dist(0, 1);

        for(size_t i = 0; i < LIMIT; i++){
            rnd_vals[i] = dist(gen);
        }
    }
}

benchmark_functions.h

#include "benchmark_helpers.h"
#include <iostream>
#include <thread>

template<typename QueueType, typename T>
void sanity_checks(){
    std::cout << "Sanity checks:\n";
    QueueType queue(LIMIT + 1);

    std::thread producer([&]{
        for(int i = 0; i < LIMIT; i++){
            queue.write(i);
        }
    });

    std::thread consumer([&]{
        T val;
        for(int i = 0; i < LIMIT; i++){
            while(!queue.read(val));
            assert(val == i && "Dequeued element doesn't match");
        }
    });

    producer.join();
    consumer.join();
    std::cout << "OK\n";
}

template<typename QueueType, typename T>
static void raw_add() {
    QueueType queue(QUEUE_SIZE + 1);
    size_t total_ops = 0;
    detail::pin_thread_to_cpu(TEST_THREAD_CPU); // OK for x86_64 where TSC are synchronized across all cores. Not portable.
    auto start = detail::rdtsc();

    for (int i = 0; i < LIMIT; i++) {
        if(!queue.write(i)){
            break;
        }
        total_ops++;
    }
    auto cycles = detail::rdtsc() - start;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "raw_add\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void raw_remove() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);
    QueueType queue(QUEUE_SIZE + 1);
    size_t total_ops = 0;

    for(int i = 0; i < QUEUE_SIZE; i++){
        queue.write(i);
    }
    auto start = detail::rdtsc();

    for (int i = 0; i < LIMIT; i++) {
        T val;
        if(!queue.read(val)){
            break;
        }
        total_ops++;
    }
    auto cycles = detail::rdtsc() - start;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "raw_remove\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void single_threaded() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);

    QueueType queue(QUEUE_SIZE + 1);
    size_t total_ops = 0;

    for(int i = 0; i < QUEUE_SIZE; i++){
        queue.write(i);
    }
    auto start = detail::rdtsc();

    for (int i = 0; i < LIMIT; i++) {
        if(rnd_vals1[i]){
            queue.write(i);
        }
        else{
            T val;
            queue.read(val);
        }
    }
    auto cycles = detail::rdtsc() - start;
    total_ops = LIMIT;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "single_threaded\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void mostly_add() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);
    QueueType queue(QUEUE_SIZE + 1);
    std::atomic_size_t total_reads = 0, total_writes = 0;

    auto start = detail::rdtsc();
    std::thread producer([&]{
        detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
        for(int i = 0; i < LIMIT; i++){
            queue.write(i);
            total_writes++;
        }
    });

    std::thread consumer([&]{
        detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
        T val;
        for(int i = 0; i < LIMIT/10; i++){
            if(rnd_vals1[i]){
                queue.read(val);
                total_reads++;
            }
        }
    });

    producer.join();
    consumer.join();
    auto cycles = detail::rdtsc() - start;
    auto total_ops = total_reads + total_writes;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "mostly_add\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void mostly_remove() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<int> dist(0, 3);

    QueueType queue(QUEUE_SIZE + 1);
    std::atomic_size_t total_reads = 0, total_writes = 0;

    auto start = detail::rdtsc();
    std::thread producer([&]{
        detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
        for(int i = 0; i < LIMIT/10; i++){
            if(rnd_vals1[i]){
                queue.write(i);
                total_writes++;
            }
        }
    });

    std::thread consumer([&]{
        detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
        T val;
        for(int i = 0; i < LIMIT; i++){
            queue.read(val);
        }
    });
    total_reads = LIMIT;

    producer.join();
    consumer.join();
    auto cycles = detail::rdtsc() - start;
    auto total_ops = total_reads + total_writes;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "mostly_remove\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void heavy_concurrent() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);
    QueueType queue(QUEUE_SIZE + 1);
    std::atomic_size_t total_reads = LIMIT, total_writes = LIMIT;

    auto start = detail::rdtsc();
    std::thread producer([&]{
        detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
        for(int i = 0; i < LIMIT; i++){
            queue.write(i);
        }
    });

    std::thread consumer([&]{
        detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
        T val;
        for(int i = 0; i < LIMIT; i++){
            queue.read(val);
        }
    });

    producer.join();
    consumer.join();
    auto cycles = detail::rdtsc() - start;
    auto total_ops = total_reads + total_writes;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "heavy_concurrent\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T>
static void random_concurrent() {
    detail::pin_thread_to_cpu(TEST_THREAD_CPU);
    QueueType queue(QUEUE_SIZE + 1);
    std::atomic_size_t total_reads = 0, total_writes = 0;

    auto start = detail::rdtsc();
    std::thread producer([&]{
        detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
        for(int i = 0; i < LIMIT; i++){
            if(rnd_vals1[i]){
                queue.write(i);
                total_writes++;
            }
        }
    });

    std::thread consumer([&]{
        detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
        T val;
        for(int i = 0; i < LIMIT; i++){
            if(rnd_vals2[i]){
                queue.read(val);
                total_reads++;
            }
        }
    });

    producer.join();
    consumer.join();
    auto cycles = detail::rdtsc() - start;
    auto total_ops = total_reads + total_writes;
    double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
    std::cout << "random_concurrent\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}

template<typename QueueType, typename T = int>
void run_all_benchmarks(std::string queue_name = "Queue"){
    detail::pin_thread_to_cpu(MAIN_THREAD_CPU);
    std::cout << "\nBenchmarking " << queue_name << "\n";
    sanity_checks<QueueType, T>();

    std::thread th(raw_add<QueueType, T>);
    th.join();

    th = std::thread(raw_remove<QueueType, T>);
    th.join();

    th = std::thread(single_threaded<QueueType, T>);
    th.join();

    th = std::thread(mostly_add<QueueType, T>);
    th.join();

    th = std::thread(mostly_remove<QueueType, T>);
    th.join();

    th = std::thread(heavy_concurrent<QueueType, T>);
    th.join();

    th = std::thread(random_concurrent<QueueType, T>);
    th.join();
}

And the driver code: 4. spsc_raw_benchmark.cpp

#include <folly/ProducerConsumerQueue.h>
#include "../src/lfqueue.h"
#include "benchmark_functions.h"

struct ComplexStruct{
    bool first;
    int second;
    double last;
    ComplexStruct(int val) : second(val) {}
    ComplexStruct() = default;
} __attribute__((packed));

int main(){
    detail::init_rand_array(rnd_vals1);
    detail::init_rand_array(rnd_vals2);

    // using DATA_TYPE = int;
    using DATA_TYPE = ComplexStruct;
    run_all_benchmarks<folly::ProducerConsumerQueue<DATA_TYPE>, DATA_TYPE>("Folly");
    run_all_benchmarks<LFQueue<DATA_TYPE>, DATA_TYPE>("LFQueue");
    return 0;
}

This is a sample benchmark result for non cache-aligned ComplexStruct

Benchmarking Folly
Sanity checks:
OK
raw_add 10.6289ns, total_ops: 1000000
raw_remove      2.46765ns, total_ops: 1000000
single_threaded 6.36481ns, total_ops: 1000000
mostly_add      13.5758ns, total_ops: 1050105
mostly_remove   3.49217ns, total_ops: 1050105
heavy_concurrent        3.57867ns, total_ops: 2000000
random_concurrent       28.8257ns, total_ops: 1000665

Benchmarking LFQueue
Sanity checks:
OK
raw_add 3.5452ns, total_ops: 1000000
raw_remove      2.74462ns, total_ops: 1000000
single_threaded 5.90889ns, total_ops: 1000000
mostly_add      14.2668ns, total_ops: 1050105
mostly_remove   2.31141ns, total_ops: 1050105
heavy_concurrent        3.46162ns, total_ops: 2000000
random_concurrent       38.9555ns, total_ops: 1000665

And this is 1 sample result for int:

Benchmarking Folly
Sanity checks:
OK
raw_add 4.81129ns, total_ops: 1000000
raw_remove      2.78987ns, total_ops: 1000000
single_threaded 6.13505ns, total_ops: 1000000
mostly_add      12.5706ns, total_ops: 1049680
mostly_remove   2.69507ns, total_ops: 1049680
heavy_concurrent        2.90395ns, total_ops: 2000000
random_concurrent       36.6664ns, total_ops: 1000798

Benchmarking LFQueue
Sanity checks:
OK
raw_add 2.55449ns, total_ops: 1000000
raw_remove      2.63848ns, total_ops: 1000000
single_threaded 5.68764ns, total_ops: 1000000
mostly_add      13.2418ns, total_ops: 1049680
mostly_remove   2.70407ns, total_ops: 1049680
heavy_concurrent        2.86895ns, total_ops: 2000000
random_concurrent       52.6381ns, total_ops: 1000798

If the implementations are correct, I have some questions about benchmark results:

I notice in multiple runs that random_concurrent benchmark results vary from 35-50ns and are usually slower than same run for folly, why is that even though the raw benchmarks are consistently faster?
I had tried to improve cache hit for writes by explicitly default constructing the buf array in LFQueue using buf(new T[size]()). But that doesn't improve the write performance or reduce the cache misses (I analysed using cachegrind and they still show 62500 hits on new (&buf[curr_write]) line inside write function). Why is that, even though the entire buf (4MB for int case) can fit inside the cache (L3=48MB).

Matthieu M. · Accepted Answer · 2025-02-10 18:17:24Z

Generic remarks

Use a namespace, rather than dumping all the code in the global namespace.

Use a constant, rather than a macro: this is not C.

You only posted benchmarks, no tests. For any "container" class, I recommend testing with heap allocations: they are not forgiving, and help figuring out issues in resource safety... and memory leaks.

Memory Safety: queue

Use a std::unique_ptr<T[]>, rather than a raw pointer, and you'll get the destructor for free.

Delete the copy/move constructors/assignment operators; they cannot be written safely.

Memory Safety: elements

There is a discrepancy in your usage of buf:

You default construct the elements.
But write treats the memory as raw, never destroying them before overwriting.

You need to choose between:

Default construct, then move-assign to it.
Raw memory, then construct atop it.

And cannot mix the two.

I heartily recommend the latter, you can use the following helper class as queue element:

template <typename T>
struct Raw {
    alignas(T) std::uint8_t raw[sizeof(T)];
};

Simplicity: indexes

Wrapping the indexes manually is possible, but it's actually relatively complicated correctly. At least if you want to use the full capacity -- it looks like you only ever use capacity - 1 elements.

Instead, use std::uint64_t (and atomic) for the read & write indexes, and just always increment them, then use a power-of-2 capacity and mask them on use.

You can check whether the capacity is a power-of-2 with:

auto is_power_of_2(std::uint64_t i) -> bool {
    return i > 0 && (i & (i - 1)) == 0;
}

And once you have a power-of-2, you can mask by & (capacity - 1) to achieve % capacity cheaply.

Correctness: atomics

You use the correct memory ordering semantics in read & write.

On the other hand, you tend to use too acquire too much:

frontPtr is useless, so I'll skip.
popFront is consumer-side, it can read the read index in relaxed mode.
isEmpty, isFull and size do not access the items, they can read indexes in relaxed mode.

Using acquire instead of relaxed is not wrong, and won't cost any performance on x64... but may on other architectures.

Correctness: nodiscard

While [[nodiscard]] is C++17, your compiler of choice may already have a similar attribute in C++11. Consider using them on the read and write methods so the user is reminded that they may fail, if they forget.

Performance: perfect forwarding

The write method is not using perfect forwarding correctly, there's a missing && in the arguments to avoid a copy.

Performance: cache line size

There's a difference between constructive & destructive hardware interference.

Specifically, modern Intel CPUs tend to fetch cache lines by pair, so that for your purpose (no false-sharing, ie no destructive interference) you'll want 128 bytes, not just 64 bytes.

Performance: memoization

Re-reading the read index in the consumer, and re-reading the write index in the producer leads to doubling the contention of your queue.

As a general advice with consumer/producer queues, I recommend splitting the data in 4 parts:

Common: the common part that never changes, capacity & buf here.
Shared: the common part that changes, the poorly named write_id and read_id here.
Consumer: a part only accessed by the consumer.
Producer: a part only access by the producer.

In your case, this would mean:

namespace lfqueue::detail {
    template <typename T>
    struct alignas(CACHE_LINE_SIZE) Common {
        std::uint64_t capacity;
        std::unique_ptr<Raw<T>> buf;
    };

    struct alignas(CACHE_LINE_SIZE) Shared {
        alignas(CACHE_LINE_SIZE) std::atomic_uint64_t read_index{0};
        alignas(CACHE_LINE_SIZE) std::atomic_uint64_t write_index{0};
    };

    struct alignas(CACHE_LINE_SIZE) Consumer {
        std::uint64_t cached_read_index = 0;
    };

    struct alignas(CACHE_LINE_SIZE) Producer {
        std::uint64_t cached_write_index = 0;
    };
}

namespace lfqueue {
    template <typename T>
    class LFQueue {
    private:
        detail::Common const common;
        detail::Shared shared;
        detail::Consumer consumer;
        detail::Producer producer;
    };
}

And as an example:

template <typename... Args>
bool try_write(Args&&... args) {
    auto read = shared.read_index.load(std::memory_order_acquire);
    auto write = producer.write_index;

    if (write == read) {
        return false;
    }

    auto index = write & (self.common.capacity - 1);

    auto slot = &common.buf[index].raw;
    new (slot) T(std::forward<Args>(args)...);

    producer.write_index = write + 1;
    shared.write_index.store(write + 1, std::memory_order_release);

    return true;
}

Style: initialization

Prefer initializing data members are their declaration sites, when the initializer is constant, or when they are built-ins (and therefore uninitialized by default).

Style: naming

Do not overly abbreviate: read_id is meaningless -- it's not an idea -- prefer read_index instead.

Prefer meaningful names:

buf is very generic, consider slots or elements instead.
read and write would be better named try_read and try_write, to highlight their fallible nature.
LFQueue says a lot about the implementation (LF) but nothing about how to use it. The fact that there can be a single consumer & a single producer is VERY important for correct usage, it should be reflected in the name.

And don't overdo it:

frontPtr returns a pointer. Everybody can see that, no need for Ptr in the name.

Style: documentation

There's no documentation at all:

No warning that this is a SPSC queue, and the implications of that.
No explanation as to what that bool return type means for read and write, and in which condition it flips.
No explanation of who can call frontPtr safely, when only the consumer can.

You don't need the full Doxygen crap, but a single sentence can do wonders:

/// Returns whether the write succeeded, or not. It fails on full queues.
template <typename... Args>
bool try_write(...);

And don't forget to document safety invariants:

/// Returns the pointer to the first readable element, if any.
///
/// # Safety:
///
/// - Invalidated by any subsequent call to `try_read` or `pop_front`.
/// - Invalidated by the destruction of `this`.
T* front();

Currently, the copy and move operations are deleted because we have a user-supplied destructor; once we have a unique-pointer, then the copy operations will be deleted. That said, it's good to also explicitly delete them, to make it clearer to users that they are not part of the interface. — Toby Speight
– Toby Speight, Commented Feb 10 at 9:47
@TobySpeight: Right, depending on implementation details some of the special members may be not be implicitly defined, but I still prefer making it explicit as any implicit definition would be wrong. — Matthieu M.
– Matthieu M., Commented Feb 10 at 10:11
1. why can I not use move construct object (new (&buf[curr_write]) T(std::forward<Args>(args)…)) in the location of default constructed (or moved from object) without explicitly calling the destructor? 2. Regarding cache line size point, do you mean I should separate objects by 128 bytes on intel CPUs to avoid false sharing? I see std::hardware_destructive_interference_size = 64 on x86_64 gcc 13.3 compiler. — wwite
– wwite, Commented Feb 10 at 17:46
3. In memoization try_write section, do we not want to CAS producer.cached_write_index with Shared.write_index in case write==read. I do get the point though of separating shared and common variables, is it related to this paper — wwite
– wwite, Commented Feb 10 at 17:46
@TobySpeight: You are correct, and so could deep copies reallly: the reader could make a deep copy themselves, if instead of front they had a span method giving them all guaranteed to be initialized elements (well, it'd take two spans). The problem is that it's hard to make safe, so conservatively it's easier to disable them to avoid the user shooting themselves in the foot. An alternative I've sometimes used is to split the "Consumer" & "Producer" parts out of the queue, and use a spin lock to create a Consumer or Producer. Enqueuing/dequeuing are still lockless, but it's overall safer. — Matthieu M.
– Matthieu M., Commented Feb 10 at 18:21

Toby Speight · Accepted Answer · 2025-02-10 18:11:37Z

3

This looks odd:

LFQueue(std::size_t size) : write_id(0), read_id(0), capacity_(size) {
    buf = new T[capacity_];
}

Why is buf not initialised in the initializer list, like the other members?

More importantly, this default-constructs capacity_ objects. That might just be inefficient, but when T is not default-constructible, it makes the class unusable.

We should really consider allocating uninitialised storage for the elements, and constructing them in place as required. That's what standard containers do.

Also, instead of using raw new, consider changing to std::unique_ptr<T[]> const buf to manage the storage. Then there would be no need for a user-defined constructor.

Or just delegate managing the storage to a std::deque and let this class have the Single Responsibility of managing the concurrent access. I'd be surprised if that even costs anything in performance when compiled with -Ofast.

The write() and read() functions are hard to use. Instead of blocking until the operation is possible, they just return false, meaning that the user has to manage retrying (likely leading to spinning at one end or the other instead of polite waiting).

Prefer to use standard collection names for functions such as emplace_back() and empty(), and provide the normal member type aliases such as value_type. That makes it easier for readers to comprehend, and for users to switch implementations as necessary.

I haven't looked at the benchmarking code, because there's no point measuring performance until the interface is finalised.

edited Feb 10 at 18:11

answered Feb 9 at 17:47

Toby Speight

88.7k14 gold badges104 silver badges327 bronze badges

\$\begingroup\$ I really recommend a std::unique<T[]> over a std::deque, there's no need to pay for the double indirection of a deque. \$\endgroup\$

Matthieu M.
– Matthieu M.

2025-02-10 08:32:21 +00:00
Commented Feb 10 at 8:32
\$\begingroup\$ Also, read and write have the expected interface for the domain... though I'd prefix them with try_ to represent their fallible nature. \$\endgroup\$

Matthieu M.
– Matthieu M.

2025-02-10 08:36:03 +00:00
Commented Feb 10 at 8:36
\$\begingroup\$ Are you sure that a deque has more indirection, @Matthieu? The two approaches amount to roughly the same: one pointer to follow between our class and the underlying storage. I missed a trick, though, by not recommending std::queue as an intermediate wrapper for the deque. \$\endgroup\$

Toby Speight
– Toby Speight

2025-02-10 08:37:44 +00:00
Commented Feb 10 at 8:37
\$\begingroup\$ std::deque is required to guarantee memory stability of its elements, in general it's therefore implemented as a double-ended vector of pointer to fixed-size chunks, so that on resize, the chunks do not move in memory. This leads to a double-indirection. \$\endgroup\$

Matthieu M.
– Matthieu M.

2025-02-10 09:23:52 +00:00
Commented Feb 10 at 9:23
2

\$\begingroup\$ @wwite, nearly right - we need to point to T[]. Not needing the destructor is just a side-effect, rather than the main point, which is that the smart pointer ensures the Right Thing happens during deletion and moving, and disables copying. \$\endgroup\$

Toby Speight
– Toby Speight

2025-02-10 18:09:26 +00:00
Commented Feb 10 at 18:09

| Show 3 more comments

Stack Exchange Network

SPSC templatized lock free queue implementation and benchmarking

2 Answers 2

Generic remarks

Memory Safety: queue

Memory Safety: elements

Simplicity: indexes

Correctness: atomics

Correctness: nodiscard

Performance: perfect forwarding

Performance: cache line size

Performance: memoization

Style: initialization

Style: naming

Style: documentation

You must log in to answer this question.

Hot Network Questions

SPSC templatized lock free queue implementation and benchmarking

2 Answers 2

Generic remarks

Memory Safety: queue

Memory Safety: elements

Simplicity: indexes

Correctness: atomics

Correctness: nodiscard

Performance: perfect forwarding

Performance: cache line size

Performance: memoization

Style: initialization

Style: naming

Style: documentation

You must log in to answer this question.

Related

Hot Network Questions