I was working on my version of "Universe Sandbox" and first thought comes to your mind is "where the hell are my planets?" so I thought loading models sucks and made this thing, It's supposed to support LOD so in my app I would have 2-3 meshes that would contain different LOD's 1'st, 2'nd, and 4'th for example. I use here CUDA-OpenGl interop along with sdl (if that's important) and c++23.
Questions
- Is my subdivide kernel efficient enough? I implemented it so there will be copies of vertices but don't see it as a big problem.
- Maybe I should make cuda_from_gl_data a child from some class called resource that would take in it's constructor the function and parameters on how to register the resource. (So I'll pass to it for example
cudaGraphicsGLRegisterImageand then parameters or something like that) - Is my idea of class icosahedron_generator even good? I heard better design choice was to separate function that manages just the tesselation process and the class that manages the resources and with them launches the tesselation with just one function call
- I think I am doing unnecessary calculations in the subdivide kernel with UV coordinates, maybe make for it another kernel that would actually make it right? And then launch final orthogonalization.
My current implementation
include/rendering/vertex.h
//
// Created by progamers on 8/3/25.
//
#ifndef SPACE_EXPLORER_VERTEX_H
#define SPACE_EXPLORER_VERTEX_H
#include <glm/glm.hpp>
namespace raw {
struct vertex {
glm::vec3 position;
glm::vec2 tex_coord;
// Normal mapping
glm::vec3 normal;
glm::vec3 tangent;
glm::vec3 bitangent;
};
} // namespace raw
#endif // SPACE_EXPLORER_VERTEX_H
include/sphere_generation/mesh_generator.h
//
// Created by progamers on 7/7/25.
//
#ifndef SPACE_EXPLORER_MESH_GENERATOR_H
#define SPACE_EXPLORER_MESH_GENERATOR_H
#include <raw_memory.h>
#include <array>
#include <glm/glm.hpp>
#include "cuda_types/buffer.h"
#include "cuda_types/cuda_from_gl_data.h"
#include "helper_macros.h"
#include "rendering/vertex.h"
namespace raw {
namespace predef {
// Oh and btw, turns out, even after 4 steps our sphere gets nearly perfect (even on 2k monitor,
// well maybe on 4k it would be nice to have 6, but 4 is pretty much enough)
static constexpr auto BASIC_RADIUS = 1.0f;
static constexpr auto BASIC_STEPS = 7U;
static constexpr auto MAX_STEPS = 8U;
// That you can't change, all things above you can
static constexpr auto BASIC_AMOUNT_OF_TRIANGLES = 20U;
} // namespace predef
// this class serves as a nice thing to warp around hard things in generating sphere from
// icosahedron
class icosahedron_generator {
private:
cuda_from_gl_data<raw::vertex> vertices_handle;
cuda_from_gl_data<UI> indices_handle;
raw::shared_ptr<cuda_stream> stream;
UI _vbo;
UI _ebo;
cuda_buffer<raw::vertex> vertices_second;
cuda_buffer<UI> indices_second;
cuda_buffer<uint32_t> amount_of_triangles;
cuda_buffer<uint32_t> amount_of_vertices;
size_t vertices_bytes = 0;
size_t indices_bytes = 0;
uint32_t num_vertices_cpu = 12;
uint32_t num_triangles_cpu = predef::BASIC_AMOUNT_OF_TRIANGLES;
bool inited = false;
// Called every time after `generate` function
void cleanup();
// Called once when the object is created (or generate function called first time)
void init(UI vbo, UI ebo);
// Called every time before `generate` function
void prepare(UI vbo, UI ebo);
public:
icosahedron_generator();
icosahedron_generator(UI vbo, UI ebo, UI steps = predef::BASIC_STEPS);
void generate(UI vbo, UI ebo, UI steps);
static std::array<raw::vertex, 12> generate_icosahedron_vertices();
static constexpr std::array<UI, 60> generate_icosahedron_indices();
static std::pair<std::array<raw::vertex, 12>, std::array<UI, 60>>
generate_icosahedron_data();
};
} // namespace raw
#endif // SPACE_EXPLORER_MESH_GENERATOR_H
src/sphere_generation/mesh_generator.cpp
//
// Created by progamers on 7/7/25.
//
#include "sphere_generation/mesh_generator.h"
#include <array>
#include <numbers>
#include "clock.h"
#include "cuda_types/buffer.h"
#include "sphere_generation/kernel_launcher.h"
#include "sphere_generation/tessellation_kernel.h"
namespace raw {
inline constexpr float GOLDEN_RATIO = std::numbers::phi_v<float>;
icosahedron_generator::icosahedron_generator()
: stream(make_shared<cuda_stream>()),
amount_of_triangles(sizeof(uint32_t), stream, true),
amount_of_vertices(sizeof(uint32_t), stream, true) {}
icosahedron_generator::icosahedron_generator(raw::UI vbo, raw::UI ebo, raw::UI steps)
: stream(make_shared<cuda_stream>()),
amount_of_triangles(sizeof(uint32_t), stream, true),
amount_of_vertices(sizeof(uint32_t), stream, true) {
_vbo = vbo;
_ebo = ebo;
generate(vbo, ebo, steps);
}
void icosahedron_generator::init(raw::UI vbo, raw::UI ebo) {
_vbo = vbo;
_ebo = ebo;
vertices_handle = cuda_from_gl_data<raw::vertex>(&vertices_bytes, vbo);
indices_handle = cuda_from_gl_data<UI>(&indices_bytes, ebo);
vertices_second = cuda_buffer<raw::vertex>(vertices_bytes, stream, true);
indices_second = cuda_buffer<UI>(indices_bytes, stream, true);
cudaMemcpy(vertices_handle.get_data(), std::data(generate_icosahedron_vertices()),
12 * sizeof(raw::vertex), cudaMemcpyHostToDevice);
cudaMemcpy(indices_handle.get_data(), (void*)std::data(generate_icosahedron_indices()),
num_triangles_cpu * 3 * sizeof(UI), cudaMemcpyHostToDevice);
inited = true;
}
void icosahedron_generator::prepare(raw::UI vbo, raw::UI ebo) {
if (!inited) {
init(vbo, ebo);
return;
}
vertices_handle.map();
indices_handle.map();
vertices_second.allocate(vertices_bytes);
indices_second.allocate(indices_bytes);
cudaMemcpy(vertices_handle.get_data(), std::data(generate_icosahedron_vertices()),
12 * sizeof(raw::vertex), cudaMemcpyHostToDevice);
cudaMemcpy(indices_handle.get_data(), (void*)std::data(generate_icosahedron_indices()),
num_triangles_cpu * 3 * sizeof(UI), cudaMemcpyHostToDevice);
}
void icosahedron_generator::generate(raw::UI vbo, raw::UI ebo, raw::UI steps) {
if (steps >= predef::MAX_STEPS) {
throw std::runtime_error(std::format(
"[Error] Amount of steps should not exceed maximum, which is {}, while was given {}",
predef::MAX_STEPS, steps));
}
if (vbo != _vbo || ebo != _ebo) {
throw std::runtime_error(std::format(
"Function for LOD on different BO's was not yet created, don't call that. VBO given was {} while stored VBO was {}, EBO given was {} while stored was {}",
vbo, _vbo, ebo, _ebo));
}
prepare(vbo, ebo);
raw::clock timer;
for (UI i = 0; i < steps; ++i) {
amount_of_triangles.zero_data(sizeof(UI) * 1);
amount_of_vertices.set_data(&num_vertices_cpu, sizeof(uint32_t), cudaMemcpyHostToDevice);
if (i % 2 == 0) {
vertices_second.set_data(vertices_handle.get_data(),
num_vertices_cpu * sizeof(raw::vertex),
cudaMemcpyDeviceToDevice);
launch_tessellation(vertices_handle.get_data(), indices_handle.get_data(),
vertices_second.get(), indices_second.get(),
amount_of_vertices.get(), amount_of_triangles.get(),
num_triangles_cpu, *stream);
} else {
vertices_second.set_data(vertices_handle.get_data(),
num_vertices_cpu * sizeof(raw::vertex), cudaMemcpyDeviceToDevice,
cudaMemcpyOrder::cudaMemcpy1to2);
launch_tessellation(vertices_second.get(), indices_second.get(),
vertices_handle.get_data(), indices_handle.get_data(),
amount_of_vertices.get(), amount_of_triangles.get(),
num_triangles_cpu, *stream);
}
num_vertices_cpu += 3 * num_triangles_cpu;
num_triangles_cpu *= 4;
}
if (steps % 2 != 0) {
vertices_second.set_data(vertices_handle.get_data(), num_vertices_cpu * sizeof(raw::vertex),
cudaMemcpyDeviceToDevice, cudaMemcpyOrder::cudaMemcpy1to2);
}
launch_orthogonalization(vertices_handle.get_data(), num_vertices_cpu, *stream);
stream->sync();
auto passed_time = timer.restart();
passed_time.to_milli();
std::cout << std::string("[Debug] Tesselation with amount of steps of ") << steps << " took "
<< passed_time << " to complete\n";
cleanup();
}
void icosahedron_generator::cleanup() {
stream->sync();
vertices_second.free();
indices_second.free();
vertices_handle.unmap();
indices_handle.unmap();
num_vertices_cpu = 12;
num_triangles_cpu = predef::BASIC_AMOUNT_OF_TRIANGLES;
}
// Icosahedron as most things in this project sounds horrifying, but, n-body (my algorithms), this
// thing, and tesselation are surprisingly easy things, just for some reason someone wanted give
// them scary names
std::array<raw::vertex, 12> icosahedron_generator::generate_icosahedron_vertices() {
std::array<raw::vertex, 12> vertices {};
int vertex_index = 0;
const float unscaled_dist = std::sqrt(1.0f + GOLDEN_RATIO * GOLDEN_RATIO);
const float scale = 1.0f / unscaled_dist;
const float a = 1.0f * scale;
const float b = GOLDEN_RATIO * scale;
const glm::vec3 worldUp = {0.0f, 1.0f, 0.0f};
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 4; ++j) {
const float sign1 = (j & 2) ? -1.0f : 1.0f;
const float sign2 = (j & 1) ? -1.0f : 1.0f;
glm::vec3 point = {1.0f, 1.0f, 1.0f};
if (i == 0) {
point = {sign1 * a, sign2 * b, 0.0f};
} else if (i == 1) {
point = {0.0f, sign1 * a, sign2 * b};
} else {
point = {sign1 * b, 0.0f, sign2 * a};
}
raw::vertex v;
v.position = point;
v.normal = glm::normalize(point); // For a sphere, the normal is the normalized position
// Calculate Tangent and Bitangent
v.tangent = glm::normalize(glm::cross(worldUp, v.normal));
v.bitangent = glm::normalize(glm::cross(v.normal, v.tangent));
// Calculate Texture Coordinates
float u = atan2f(v.normal.z, v.normal.x) / (2.0f * std::numbers::pi_v<float>) + 0.5f;
float v_coord = 0.5f - asinf(v.normal.y) / std::numbers::pi_v<float>;
v.tex_coord = {u, v_coord};
vertices[vertex_index++] = v;
}
}
return vertices;
}
constexpr std::array<UI, 60> icosahedron_generator::generate_icosahedron_indices() {
return {2, 10, 4, 2, 4, 0, 2, 0, 5, 2, 5, 11, 2, 11, 10, 0, 4, 8, 4, 10,
6, 10, 11, 3, 11, 5, 7, 5, 0, 9, 1, 8, 6, 1, 6, 3, 1, 3, 7, 1,
7, 9, 1, 9, 8, 6, 8, 4, 3, 6, 10, 7, 3, 11, 9, 7, 5, 8, 9, 0};
}
std::pair<std::array<raw::vertex, 12>, std::array<UI, 60>>
icosahedron_generator::generate_icosahedron_data() {
return std::pair {generate_icosahedron_vertices(), generate_icosahedron_indices()};
}
} // namespace raw
include/sphere_generation/kernel_launcher.h
//
// Created by progamers on 7/18/25.
//
#ifndef SPACE_EXPLORER_KERNEL_LAUNCHER_H
#define SPACE_EXPLORER_KERNEL_LAUNCHER_H
#include <glm/glm.hpp>
#include "cuda_types/stream.h"
#include "helper_macros.h"
namespace raw {
extern void launch_tessellation(const raw::vertex* in_vertices, const UI* in_indices,
raw::vertex* out_vertices, UI* out_indices,
uint32_t* p_vertex_count, uint32_t* p_triangle_count,
size_t num_input_triangles, cuda_stream& stream);
extern void launch_orthogonalization(raw::vertex* vertices, size_t num_vertices, cuda_stream& stream);
}
#endif // SPACE_EXPLORER_KERNEL_LAUNCHER_H
src/sphere_generation/launch_tessellation.cu
//
// Created by progamers on 7/18/25.
//
#include "rendering/vertex.h"
#include "sphere_generation/kernel_launcher.h"
#include "sphere_generation/tessellation_kernel.h"
namespace raw {
void launch_tessellation(const raw::vertex* in_vertices, const UI* in_indices,
raw::vertex* out_vertices, UI* out_indices, uint32_t* p_vertex_count,
uint32_t* p_triangle_count, size_t num_input_triangles,
cuda_stream& stream) {
constexpr auto threads_per_block = 256;
auto blocks = (num_input_triangles + threads_per_block - 1) / 256;
subdivide<<<blocks, threads_per_block, 0, stream.stream()>>>(
in_vertices, in_indices, out_vertices, out_indices, p_vertex_count, p_triangle_count,
num_input_triangles);
}
void launch_orthogonalization(raw::vertex* vertices, size_t num_vertices, cuda_stream& stream) {
constexpr auto threads_per_block = 256;
auto blocks = (num_vertices + threads_per_block - 1) / 256;
orthogonalize<<<blocks, threads_per_block, 0, stream.stream()>>>(vertices, num_vertices);
}
} // namespace raw
include/sphere_generation/tessellation_kernel.h
//
// Created by progamers on 7/18/25.
//
#ifndef SPACE_EXPLORER_TESSELLATION_KERNEL_H
#define SPACE_EXPLORER_TESSELLATION_KERNEL_H
#define GLM_CUDA_FORCE_DEVICE_FUNC
#include <glm/glm.hpp>
extern __global__ void subdivide(const raw::vertex* in_vertices, const unsigned int* in_indices,
raw::vertex* out_vertices, unsigned int* out_indices,
uint32_t* p_vertex_count, uint32_t* p_triangle_count,
size_t num_input_triangles);
extern __global__ void orthogonalize(raw::vertex* vertices, uint32_t vertex_count);
#endif // SPACE_EXPLORER_TESSELLATION_KERNEL_H
src/sphere_generation/tessellation_kernel.cu
//
// Created by progamers on 7/18/25.
//
#include "rendering/vertex.h"
#include "sphere_generation/tessellation_kernel.h"
#ifndef CUDART_PI_F
#define CUDART_PI_F 3.141592654f
#endif
__device__ void calc_tex_coords(glm::vec2* writing_ptr, glm::vec3& normalized_pos) {
float u = atan2f(normalized_pos.z, normalized_pos.x) / (2.0f * CUDART_PI_F) + 0.5f;
float v = 0.5f - asinf(normalized_pos.y) / CUDART_PI_F;
*writing_ptr = glm::vec2 {u, v};
}
__global__ void subdivide(const raw::vertex* in_vertices, const unsigned int* in_indices,
raw::vertex* out_vertices, unsigned int* out_indices,
uint32_t* p_vertex_count, uint32_t* p_triangle_count,
size_t num_input_triangles) {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x >= num_input_triangles) {
return;
}
size_t i0 = in_indices[x * 3 + 0];
size_t i1 = in_indices[x * 3 + 1];
size_t i2 = in_indices[x * 3 + 2];
const raw::vertex& v0 = in_vertices[i0];
const raw::vertex& v1 = in_vertices[i1];
const raw::vertex& v2 = in_vertices[i2];
glm::vec3 m01 = normalize(v0.position + v1.position);
glm::vec3 m12 = normalize(v1.position + v2.position);
glm::vec3 m20 = normalize(v2.position + v0.position);
uint32_t base_v_idx = atomicAdd(p_vertex_count, 3);
uint32_t new_i01 = base_v_idx + 0;
uint32_t new_i12 = base_v_idx + 1;
uint32_t new_i20 = base_v_idx + 2;
out_vertices[new_i01].position = normalize(m01);
calc_tex_coords(&out_vertices[new_i01].tex_coord, out_vertices[new_i01].position);
out_vertices[new_i12].position = normalize(m12);
calc_tex_coords(&out_vertices[new_i12].tex_coord, out_vertices[new_i12].position);
out_vertices[new_i20].position = normalize(m20);
calc_tex_coords(&out_vertices[new_i20].tex_coord, out_vertices[new_i20].position);
uint32_t base_t_idx = atomicAdd(p_triangle_count, 4);
unsigned int* out_tri_ptr = &out_indices[base_t_idx * 3];
out_tri_ptr[0] = i0;
out_tri_ptr[1] = new_i01;
out_tri_ptr[2] = new_i20;
out_tri_ptr[3] = i1;
out_tri_ptr[4] = new_i12;
out_tri_ptr[5] = new_i01;
out_tri_ptr[6] = i2;
out_tri_ptr[7] = new_i20;
out_tri_ptr[8] = new_i12;
out_tri_ptr[9] = new_i01;
out_tri_ptr[10] = new_i12;
out_tri_ptr[11] = new_i20;
// Calculate tangent and bitangent for new triangles
raw::vertex* new_triangle_vertices[4][3] = {
{(raw::vertex*)&v0, &out_vertices[new_i01], &out_vertices[new_i20]},
{(raw::vertex*)&v1, &out_vertices[new_i12], &out_vertices[new_i01]},
{(raw::vertex*)&v2, &out_vertices[new_i20], &out_vertices[new_i12]},
{&out_vertices[new_i01], &out_vertices[new_i12], &out_vertices[new_i20]}};
for (const auto& new_triangle_vertice : new_triangle_vertices) {
raw::vertex* v_a = new_triangle_vertice[0];
raw::vertex* v_b = new_triangle_vertice[1];
raw::vertex* v_c = new_triangle_vertice[2];
glm::vec3 edge1 = v_b->position - v_a->position;
glm::vec3 edge2 = v_c->position - v_a->position;
glm::vec2 delta_uv1 = v_b->tex_coord - v_a->tex_coord;
glm::vec2 delta_uv2 = v_c->tex_coord - v_a->tex_coord;
float f = 1.0f / (delta_uv1.x * delta_uv2.y - delta_uv2.x * delta_uv1.y);
glm::vec3 tangent, bitangent;
// This is basically what i have to write
// tangent.x = f * (delta_uv2.y * edge1.x - delta_uv1.y * edge2.x);
// tangent.y = f * (delta_uv2.y * edge1.y - delta_uv1.y * edge2.y);
// tangent.z = f * (delta_uv2.y * edge1.z - delta_uv1.y * edge2.z);
// However this look much cooler and nicer
for (int dim = 0; dim < 3; ++dim) {
tangent[dim] = f * (delta_uv2.y * edge1[dim] - delta_uv1.y * edge2[dim]);
bitangent[dim] = f * (-delta_uv2.x * edge1[dim] + delta_uv1.x * edge2[dim]);
}
// This basically updates the values in the output buffers, but since we have places where
// different threads work with same vertice, we need to use that ugly shit
atomicAdd(&v_a->tangent.x, tangent.x);
atomicAdd(&v_a->tangent.y, tangent.y);
atomicAdd(&v_a->tangent.z, tangent.z);
atomicAdd(&v_a->bitangent.x, bitangent.x);
atomicAdd(&v_a->bitangent.y, bitangent.y);
atomicAdd(&v_a->bitangent.z, bitangent.z);
atomicAdd(&v_b->tangent.x, tangent.x);
atomicAdd(&v_b->tangent.y, tangent.y);
atomicAdd(&v_b->tangent.z, tangent.z);
atomicAdd(&v_b->bitangent.x, bitangent.x);
atomicAdd(&v_b->bitangent.y, bitangent.y);
atomicAdd(&v_b->bitangent.z, bitangent.z);
atomicAdd(&v_c->tangent.x, tangent.x);
atomicAdd(&v_c->tangent.y, tangent.y);
atomicAdd(&v_c->tangent.z, tangent.z);
atomicAdd(&v_c->bitangent.x, bitangent.x);
atomicAdd(&v_c->bitangent.y, bitangent.y);
atomicAdd(&v_c->bitangent.z, bitangent.z);
}
}
__global__ void orthogonalize(raw::vertex* vertices, uint32_t vertex_count) {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > vertex_count) {
return;
}
raw::vertex& v = vertices[x];
glm::vec3 tangent = v.tangent - v.normal * dot(v.tangent, v.normal);
// Basically zero
if (length(tangent) < 1e-6) {
if (abs(v.normal.x) > abs(v.normal.z)) {
tangent = glm::vec3(-v.normal.y, v.normal.x, 0.0f);
} else {
tangent = glm::vec3(0.0f, -v.normal.z, v.normal.y);
}
}
v.tangent = normalize(tangent);
if (dot(cross(v.normal, v.tangent), v.bitangent) < 0.0f) {
v.tangent = v.tangent * -1.0f;
}
v.bitangent = normalize(cross(v.normal, v.tangent));
// Just for safe measure
v.normal = normalize(v.normal);
}
I would love to hear any suggestions on how to improve my code.