When I discovered that CUDA device memory was represented by plain old void* I was horrified by having to deal with C-style type safety and resource ownership (i.e. none whatsoever). This is my first attempt to encapsulate this representation with features we expect in C++:
- Robust ownership model - resource is allocated in constructor and released in destructor. Resource-owning object is not copyable but is moveable.
- Strongly typed (perhaps too strongly?) - device memory represents an array of a specified C++ object type and is not convertible to other types; device memory is distinct from host memory.
- Easy interface - no need to use CUDA memcpy functions which require the author to specify whether pointers refer to host or device memory; it's all inferred from the type.
Here's the code. The user-visible types are a block which owns memory and a view which provides access to memory owned by a block.
#include <cuda_runtime.h>
#include <memory>
#include <span>
#include <stdexcept>
#include <type_traits>
#include <vector>
namespace cuda::memory
{
namespace {
struct block_base
{
// This class is a resource manager for an
// untyped CUDA device-memory allocation.
protected:
std::unique_ptr<void, decltype(&cudaFree)> mem = {nullptr, &cudaFree};
explicit block_base(std::size_t size)
{
void *p;
auto const error = cudaMalloc(&p, size);
switch (error) {
case cudaSuccess:
mem.reset(p);
return;
case cudaErrorMemoryAllocation:
throw std::bad_alloc();
default:
// can't happen
throw std::logic_error(cudaGetErrorString(error));
}
}
};
}
template<typename T>
class view
{
    // Non-owning, typed handle to a contiguous range of CUDA device
    // memory.  Copying a view is cheap and never copies device data;
    // the referenced storage must outlive the view.
    using value_type = std::remove_cv_t<T>;
    static_assert(std::is_trivially_copyable_v<value_type>,
                  "Device memory types must be trivially-copyable");
    T *start;
    std::size_t count;
    // All instantiations are mutual friends so that view<T> and
    // view<const T> can reach each other's private members in
    // copy_from/copy_to.  (A one-way `friend class view<value_type>`
    // would not let view<const T> access view<T>'s members.)
    template<typename> friend class view;
protected:
    view(T* start, std::size_t count)
        : start{start},
          count{count}
    {}
public:
    view(const view&) = default;
    view(view&& other) noexcept
        : start{}, count{}
    {
        swap(other);
    }
    view& operator=(const view&) = default;
    view& operator=(view&& other) noexcept {
        swap(other);
        return *this;
    }
    // Implicit widening to a read-only view of the same range.
    operator view<const T>() const
    {
        return {start, count};
    }
    void swap(view& other) noexcept
    {
        std::swap(start, other.start);
        std::swap(count, other.count);
    }
    std::size_t size() const noexcept { return count; }
    std::size_t size_bytes() const noexcept { return count * sizeof (T); }
    // First n elements (default 1).  Throws std::out_of_range if n > size().
    view first(std::size_t n = 1) const
    { return subview(0, n); }
    // Last n elements (default 1).  Throws std::out_of_range if n > size().
    // (The original `subview(count - n - 1)` returned n + 1 elements and
    // underflowed for n >= count.)
    view last(std::size_t n = 1) const
    {
        if (n > count) {
            throw std::out_of_range("access outside view");
        }
        return subview(count - n, n);
    }
    // Elements from offset to the end.
    view subview(std::size_t offset) const
    { return subview(offset, count - offset); }
    // `length` elements starting at `offset`.
    // Throws std::out_of_range when the requested range is not
    // entirely inside this view.
    view subview(std::size_t offset, std::size_t length) const
    {
        // `length > count - offset` cannot overflow, unlike the
        // naive `offset + length > count`.
        if (offset > count || length > count - offset) {
            throw std::out_of_range("access outside view");
        }
        return {start + offset, length};
    }
    // Device-to-device copy; sizes must match exactly.
    void copy_from(view<const value_type> const& src)
    {
        do_memcpy(start, size(), src.start, src.size(), cudaMemcpyDeviceToDevice);
    }
    // Host-to-device copy; sizes must match exactly.
    void copy_from(std::span<const value_type> const& src)
    {
        do_memcpy(start, size(), src.data(), src.size(), cudaMemcpyHostToDevice);
    }
    // Device-to-device copy; sizes must match exactly.
    void copy_to(view<value_type> const& dest) const
    {
        do_memcpy(dest.start, dest.size(), start, size(), cudaMemcpyDeviceToDevice);
    }
    // Device-to-host copy; sizes must match exactly.
    void copy_to(std::span<value_type> const& dest) const
    {
        do_memcpy(dest.data(), dest.size(), start, size(), cudaMemcpyDeviceToHost);
    }
    // Copies the device range into a freshly-allocated host vector.
    auto to_vector() const
    {
        std::vector<value_type> v(count);
        copy_to(v);
        return v;
    }
private:
    // Shared implementation for all copy directions.
    // Throws std::invalid_argument on size mismatch and
    // std::runtime_error when cudaMemcpy fails (most likely because
    // this view outlived the block that owned the storage).
    static void do_memcpy(void *dest, std::size_t dest_count,
                          const void *src, std::size_t src_count,
                          cudaMemcpyKind kind)
    {
        if (dest_count != src_count) {
            throw std::invalid_argument("Source and destination size mismatch");
        }
        auto const error = cudaMemcpy(dest, src, dest_count * sizeof (T), kind);
        if (error != cudaSuccess) {
            throw std::runtime_error(cudaGetErrorString(error));
        }
    }
};
template<typename T>
class block : public block_base, public view<T>
{
    // Owning device allocation of `count` objects of type T.
    // block_base must be listed first so the allocation exists before
    // the view base is initialized with its pointer.
public:
    // Allocates room for `count` elements; device memory is left
    // uninitialized.
    explicit block(std::size_t count)
        : block_base{sizeof (T) * count},
          view<T>{static_cast<T*>(mem.get()), count}
    {}
    // Deep copy of another block (device-to-device).
    explicit block(block const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    // Deep copy of the range another view refers to (device-to-device).
    explicit block(view<T> const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    // Allocates and fills from host memory.
    // Note: must delegate with the ELEMENT count, r.size();
    // the original passed r.size_bytes(), over-allocating by a factor
    // of sizeof(T) and then failing the size check in copy_from.
    explicit block(std::span<const T> const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    block(block&& other) = default;
    block& operator=(block&& other) = default;
};
}
A simple demo program showing operations that can be used:
// Demo: exercises the block/view API end to end.
// NOTE(review): as posted this needs #include <vector> (and the header
// code above) in the same translation unit to compile.
int main()
{
std::vector<float> v(16);
// vector -> span<const float> conversion, then host-to-device copy.
cuda::memory::block<float> b{v}; // construct by copying host memory
v = b.to_vector(); // this copies back
// A block implicitly converts to a read-only view of its storage.
cuda::memory::view<const float> s = b;
s = s.subview(1, 14); // no copy here
v = s.to_vector(); // copy part of device allocation
// auto b2 = b; // not allowed
auto b2 = std::move(b); // no copy
// The copy constructor is explicit, so the copy must be spelled out.
b = cuda::memory::block(b2); // device-to-device copy
}
This class hasn't yet been used in a real project, so I'd like to hear whether it's likely to fall short in realistic use cases. It's pretty much my first exposure to CUDA, so consider me a beginner (though quite experienced with C++).