When I discovered that CUDA device memory was represented by plain old void* I was horrified by having to deal with C-style type safety and resource ownership (i.e. none whatsoever). This is my first attempt to encapsulate this representation with features we expect in C++:
- Robust ownership model - resource is allocated in constructor and released in destructor. Resource-owning object is not copyable but is moveable.
- Strongly typed (perhaps too strongly?) - device memory represents an array of a specified C++ object type and is not convertible to other types; device memory is distinct from host memory.
- Easy interface - no need to use CUDA memcpy functions which require the author to specify whether pointers refer to host or device memory; it's all inferred from the type.
Here's the code. The user-visible types are a block which owns memory and a view which provides access to memory owned by a block.
#include <cuda_runtime.h>
#include <memory>
#include <span>
#include <stdexcept>
#include <type_traits>
#include <vector>
namespace cuda::memory
{
namespace {
struct block_base
{
// This class is a resource manager for an
// untyped CUDA device-memory allocation.
protected:
std::unique_ptr<void, decltype(&cudaFree)> mem = {nullptr, &cudaFree};
explicit block_base(std::size_t size)
{
void *p;
auto const error = cudaMalloc(&p, size);
switch (error) {
case cudaSuccess:
mem.reset(p);
return;
case cudaErrorMemoryAllocation:
throw std::bad_alloc();
default:
// can't happen
throw std::logic_error(cudaGetErrorString(error));
}
}
};
}
template<typename T>
class view
{
    // Non-owning, typed handle to a contiguous range of CUDA device
    // memory.  Copying a view is cheap and never copies device data;
    // the referenced storage must outlive the view.
    using value_type = std::remove_cv_t<T>;
    static_assert(std::is_trivially_copyable_v<value_type>,
                  "Device memory types must be trivially-copyable");
    T *start;
    std::size_t count;
    // All instantiations are mutual friends so that view<T> and
    // view<const T> can reach each other's private members in
    // copy_from/copy_to.  (A one-way `friend class view<value_type>`
    // would not let view<const T> access view<T>'s members.)
    template<typename> friend class view;
protected:
    view(T* start, std::size_t count)
        : start{start},
          count{count}
    {}
public:
    view(const view&) = default;
    view(view&& other) noexcept
        : start{}, count{}
    {
        swap(other);
    }
    view& operator=(const view&) = default;
    view& operator=(view&& other) noexcept {
        swap(other);
        return *this;
    }
    // Implicit widening to a read-only view of the same range.
    operator view<const T>() const
    {
        return {start, count};
    }
    void swap(view& other) noexcept
    {
        std::swap(start, other.start);
        std::swap(count, other.count);
    }
    std::size_t size() const noexcept { return count; }
    std::size_t size_bytes() const noexcept { return count * sizeof (T); }
    // First n elements (default 1).  Throws std::out_of_range if n > size().
    view first(std::size_t n = 1) const
    { return subview(0, n); }
    // Last n elements (default 1).  Throws std::out_of_range if n > size().
    // (The original `subview(count - n - 1)` returned n + 1 elements and
    // underflowed for n >= count.)
    view last(std::size_t n = 1) const
    {
        if (n > count) {
            throw std::out_of_range("access outside view");
        }
        return subview(count - n, n);
    }
    // Elements from offset to the end.
    view subview(std::size_t offset) const
    { return subview(offset, count - offset); }
    // `length` elements starting at `offset`.
    // Throws std::out_of_range when the requested range is not
    // entirely inside this view.
    view subview(std::size_t offset, std::size_t length) const
    {
        // `length > count - offset` cannot overflow, unlike the
        // naive `offset + length > count`.
        if (offset > count || length > count - offset) {
            throw std::out_of_range("access outside view");
        }
        return {start + offset, length};
    }
    // Device-to-device copy; sizes must match exactly.
    void copy_from(view<const value_type> const& src)
    {
        do_memcpy(start, size(), src.start, src.size(), cudaMemcpyDeviceToDevice);
    }
    // Host-to-device copy; sizes must match exactly.
    void copy_from(std::span<const value_type> const& src)
    {
        do_memcpy(start, size(), src.data(), src.size(), cudaMemcpyHostToDevice);
    }
    // Device-to-device copy; sizes must match exactly.
    void copy_to(view<value_type> const& dest) const
    {
        do_memcpy(dest.start, dest.size(), start, size(), cudaMemcpyDeviceToDevice);
    }
    // Device-to-host copy; sizes must match exactly.
    void copy_to(std::span<value_type> const& dest) const
    {
        do_memcpy(dest.data(), dest.size(), start, size(), cudaMemcpyDeviceToHost);
    }
    // Copies the device range into a freshly-allocated host vector.
    auto to_vector() const
    {
        std::vector<value_type> v(count);
        copy_to(v);
        return v;
    }
private:
    // Shared implementation for all copy directions.
    // Throws std::invalid_argument on size mismatch and
    // std::runtime_error when cudaMemcpy fails (most likely because
    // this view outlived the block that owned the storage).
    static void do_memcpy(void *dest, std::size_t dest_count,
                          const void *src, std::size_t src_count,
                          cudaMemcpyKind kind)
    {
        if (dest_count != src_count) {
            throw std::invalid_argument("Source and destination size mismatch");
        }
        auto const error = cudaMemcpy(dest, src, dest_count * sizeof (T), kind);
        if (error != cudaSuccess) {
            throw std::runtime_error(cudaGetErrorString(error));
        }
    }
};
template<typename T>
class block : public block_base, public view<T>
{
    // Owning device allocation of `count` objects of type T.
    // block_base must be listed first so the allocation exists before
    // the view base is initialized with its pointer.
public:
    // Allocates room for `count` elements; device memory is left
    // uninitialized.
    explicit block(std::size_t count)
        : block_base{sizeof (T) * count},
          view<T>{static_cast<T*>(mem.get()), count}
    {}
    // Deep copy of another block (device-to-device).
    explicit block(block const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    // Deep copy of the range another view refers to (device-to-device).
    explicit block(view<T> const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    // Allocates and fills from host memory.
    // Note: must delegate with the ELEMENT count, r.size();
    // the original passed r.size_bytes(), over-allocating by a factor
    // of sizeof(T) and then failing the size check in copy_from.
    explicit block(std::span<const T> const& r)
        : block{r.size()}
    {
        this->copy_from(r);
    }
    block(block&& other) = default;
    block& operator=(block&& other) = default;
};
}
A simple demo program showing operations that can be used:
// Demo: exercises the block/view API end to end.
// NOTE(review): as posted this needs #include <vector> (and the header
// code above) in the same translation unit to compile.
int main()
{
std::vector<float> v(16);
// vector -> span<const float> conversion, then host-to-device copy.
cuda::memory::block<float> b{v}; // construct by copying host memory
v = b.to_vector(); // this copies back
// A block implicitly converts to a read-only view of its storage.
cuda::memory::view<const float> s = b;
s = s.subview(1, 14); // no copy here
v = s.to_vector(); // copy part of device allocation
// auto b2 = b; // not allowed
auto b2 = std::move(b); // no copy
// The copy constructor is explicit, so the copy must be spelled out.
b = cuda::memory::block(b2); // device-to-device copy
}
This class hasn't yet been used in a real project, so I'd like to hear whether it's likely to fall short in realistic use cases. It's pretty much my first exposure to CUDA, so consider me a beginner (though quite experienced with C++).