8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
A second gpu could be given half of the work, to make it 2000x speedup (5 microseconds). Scaling isn't good beyond this, due to already-low latency.
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
A second gpu could be given half of the work, to make it 2000x speedup (5 microseconds). Scaling isn't good beyond this, due to already-low latency.
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Brute-force search for the path (up to 8 distinct stops out of 10) with the
// highest tastiness/distance ratio, starting from the origin (0, 0).
// Every prefix of a candidate path is scored too: the running ratio is
// compared after each stop, so the winner may effectively use < 8 stops.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (the block reduction uses one shared slot per warp, max 32 warps).
// Output: scores[blockIdx.x] / permutations[blockIdx.x] hold each block's
// best ratio and the encoded index tuple that produced it.
__global__ void k_brute(float* scores, int* permutations) {
    const int thread = threadIdx.x + blockIdx.x * blockDim.x;
    const int numThreads = blockDim.x * gridDim.x;
    __shared__ float s_reduction[32];   // per-warp best score
    __shared__ int s_reduction2[32];    // per-warp best permutation
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    float score = 0.0f;   // best ratio seen by this thread
    int prm = 0;          // encoded permutation that produced `score`
    // Grid-stride partition of the candidate space across all threads.
    const int steps = (P_10_8 + numThreads - 1) / numThreads;
    for (int step = 0; step < steps; step++) {
        const int permutation = step * numThreads + thread;
        if (permutation < P_10_8) {
            // Decode 8 base-10 digits; each digit selects one of the 10 stops.
            indices[0] = (permutation / 1) % 10;
            indices[1] = (permutation / 10) % 10;
            indices[2] = (permutation / 100) % 10;
            indices[3] = (permutation / 1000) % 10;
            indices[4] = (permutation / 10000) % 10;
            indices[5] = (permutation / 100000) % 10;
            indices[6] = (permutation / 1000000) % 10;
            indices[7] = (permutation / 10000000) % 10;
            // Reject tuples that repeat a stop. Equality is symmetric, so the
            // triangular j < k scan with early exit gives the same verdict as
            // the full 8 x 8 comparison in half the checks or fewer.
            bool skip = false;
            for (int j = 0; j < 8 && !skip; j++) {
                for (int k = j + 1; k < 8; k++) {
                    if (indices[j] == indices[k]) {
                        skip = true;
                        break;
                    }
                }
            }
            if (skip)
                continue;
            // Walk the path from (0, 0), accumulating Euclidean distance and
            // tastiness; the best ratio is updated after every stop so that
            // all prefixes of this tuple are considered.
            float distance = 0.0f;
            float tastiness = 0.0f;
            float lastX = 0;
            float lastY = 0;
            for (int stop = 0; stop < 8; stop++) {
                const float dx = path[indices[stop] * 2] - lastX;
                const float dy = path[indices[stop] * 2 + 1] - lastY;
                const float t = taste[indices[stop]];
                distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
                tastiness += t;
                lastX = path[indices[stop] * 2];
                lastY = path[indices[stop] * 2 + 1];
                if (distance > 0.0f) {
                    if (tastiness / distance > score) {
                        prm = permutation;
                        score = tastiness / distance;
                    }
                }
            }
        }
    }
    // Warp-level max-reduction of the (score, prm) pair; all 32 lanes of
    // every warp reach this point, so the full mask is correct.
    const int warpLane = threadIdx.x & 31;
    const int warpIndex = threadIdx.x >> 5;
    for (int i = 16; i >= 1; i >>= 1) {
        const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
        const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
        if (warpLane < i) {
            prm = (score < gather ? gather2 : prm);
            score = (score < gather ? gather : score);
        }
    }
    // Lane 0 of each warp publishes its warp's winner to shared memory.
    if (warpLane == 0) {
        s_reduction[warpIndex] = score;
        s_reduction2[warpIndex] = prm;
    }
    __syncthreads();   // all per-warp results visible before warp 0 reduces
    // Warp 0 reduces the per-warp winners; lanes beyond the actual warp
    // count contribute the neutral pair (0.0f, 0).
    if (warpIndex == 0) {
        if (warpLane < (blockDim.x >> 5)) {
            score = s_reduction[warpLane];
            prm = s_reduction2[warpLane];
        }
        else {
            score = 0.0f;
            prm = 0;
        }
        for (int i = 16; i >= 1; i >>= 1) {
            const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
            const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
            if (warpLane < i) {
                prm = (score < gather ? gather2 : prm);
                score = (score < gather ? gather : score);
            }
        }
        if (warpLane == 0) {
            scores[blockIdx.x] = score;
            permutations[blockIdx.x] = prm;
        }
    }
}
// Benchmarks k_brute, reduces the per-block winners on the host, then
// replays the winning permutation on the CPU to print the path and confirm
// that the same score is reproduced.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    int* permutations;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaMallocAsync(&permutations, sizeof(int) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores, permutations);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    int results2[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(results2, permutations, sizeof(int) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    int finalPermutation = 0;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
            finalPermutation = results2[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    // Decode the winning permutation exactly as the kernel does.
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    indices[0] = (finalPermutation / 1) % 10;
    indices[1] = (finalPermutation / 10) % 10;
    indices[2] = (finalPermutation / 100) % 10;
    indices[3] = (finalPermutation / 1000) % 10;
    indices[4] = (finalPermutation / 10000) % 10;
    indices[5] = (finalPermutation / 100000) % 10;
    indices[6] = (finalPermutation / 1000000) % 10;
    indices[7] = (finalPermutation / 10000000) % 10;
    // CPU replay of the winning path (host copies of the device tables).
    float distance = 0.0f;
    float tastiness = 0.0f;
    float lastX = 0;
    float lastY = 0;
    float path[] = {
        2, 8,
        15, 3,
        6, 6,
        20, 15,
        11, 2,
        14, 1,
        7, 3,
        1, 12,
        3, 6,
        14, 13
    };
    float taste[] = {
        7,
        17,
        8,
        30,
        12,
        15,
        5,
        12,
        3,
        18
    };
    for (int i = 0; i < 8; i++) {
        std::cout << "path: " << (indices[i] + 1) << std::endl;
        const float dx = path[indices[i] * 2] - lastX;
        const float dy = path[indices[i] * 2 + 1] - lastY;
        const float t = taste[indices[i]];
        distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
        tastiness += t;
        lastX = path[indices[i] * 2];
        lastY = path[indices[i] * 2 + 1];
        // Stop as soon as a prefix reproduces the kernel's best ratio;
        // the winning path may use fewer than 8 stops.
        if (distance > 0.0f) {
            if (fabsf(tastiness / distance - finalScore) < 0.001f) {
                std::cout << tastiness / distance << " score reached! " << std::endl;
                break;
            }
        }
    }
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaFreeAsync(permutations, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.5945473
path: 7
path: 5
path: 6
path: 2
path: 10
path: 4
2.89452 score reached!
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Score-only variant of the brute-force search: finds the best
// tastiness/distance ratio over paths of up to 8 distinct stops (the ratio
// is compared after every stop, so prefixes are scored too), without
// tracking which permutation produced it.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (one shared slot per warp, max 32 warps). Writes one score per block.
__global__ void k_brute(float* scores) {
const int thread = threadIdx.x + blockIdx.x * blockDim.x;
const int numThreads = blockDim.x * gridDim.x;
// Per-warp best score, filled by lane 0 of each warp.
__shared__ float s_reduction[32];
int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Best ratio seen by this thread so far.
float score = 0.0f;
// Grid-stride partition of the candidate space across all threads.
const int steps = (P_10_8 + numThreads - 1) / numThreads;
for (int i = 0; i < steps; i++) {
const int permutation = i * numThreads + thread;
if (permutation < P_10_8) {
// Decode 8 base-10 digits; each digit selects one of the 10 stops.
indices[0] = (permutation / 1) % 10;
indices[1] = (permutation / 10) % 10;
indices[2] = (permutation / 100) % 10;
indices[3] = (permutation / 1000) % 10;
indices[4] = (permutation / 10000) % 10;
indices[5] = (permutation / 100000) % 10;
indices[6] = (permutation / 1000000) % 10;
indices[7] = (permutation / 10000000) % 10;
// Reject tuples that repeat a stop (full 8 x 8 comparison).
bool skip = false;
for (int j = 0; j < 8; j++) {
for (int k = 0; k < 8; k++) {
if (j != k && indices[j] == indices[k]) {
skip = true;
}
}
}
if (skip)
continue;
// Walk the path from (0, 0), accumulating Euclidean distance and
// tastiness; the running ratio is checked after every stop.
float distance = 0.0f;
float tastiness = 0.0f;
float lastX = 0;
float lastY = 0;
// NOTE: this inner `i` shadows the outer step counter (works, but
// easy to misread).
for (int i = 0; i < 8; i++) {
const float dx = path[indices[i] * 2] - lastX;
const float dy = path[indices[i] * 2 + 1] - lastY;
const float t = taste[indices[i]];
distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
tastiness += t;
lastX = path[indices[i] * 2];
lastY = path[indices[i] * 2 + 1];
if (distance > 0.0f) {
if (tastiness / distance > score) {
score = tastiness / distance;
}
}
}
}
}
// Warp-level max-reduction; all lanes reach this point, so the full
// 0xFFFFFFFF mask is correct.
const int warpLane = threadIdx.x & 31;
const int warpIndex = threadIdx.x >> 5;
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
if (warpLane < i) {
score = (score < gather ? gather : score);
}
}
// Lane 0 of each warp publishes its warp's winner to shared memory.
if (warpLane == 0) {
s_reduction[warpIndex] = score;
}
// Make all per-warp results visible before warp 0 reduces them.
__syncthreads();
// Warp 0 reduces the per-warp winners; lanes beyond the actual warp
// count contribute a neutral 0.0f.
if (warpIndex == 0) {
if (warpLane < (blockDim.x >> 5)) {
score = s_reduction[warpLane];
}
else {
score = 0.0f;
}
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
if (warpLane < i) {
score = (score < gather ? gather : score);
}
}
if (warpLane == 0) {
scores[blockIdx.x] = score;
}
}
}
// Benchmarks the score-only k_brute variant and prints the best
// tastiness/distance ratio found across all blocks plus the mean kernel time.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.594
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Brute-force search for the path (up to 8 distinct stops out of 10) with
// the highest tastiness/distance ratio, starting from the origin (0, 0).
// The ratio is compared after every stop, so prefixes are scored too.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (one shared slot per warp, max 32 warps). Writes each block's best score
// and the encoded index tuple that produced it.
__global__ void k_brute(float* scores, int* permutations) {
const int thread = threadIdx.x + blockIdx.x * blockDim.x;
const int numThreads = blockDim.x * gridDim.x;
// Per-warp best (score, permutation), filled by lane 0 of each warp.
__shared__ float s_reduction[32];
__shared__ int s_reduction2[32];
int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Best ratio seen by this thread, and the permutation that produced it.
float score = 0.0f;
int prm = 0;
// Grid-stride partition of the candidate space across all threads.
const int steps = (P_10_8 + numThreads - 1) / numThreads;
for (int i = 0; i < steps; i++) {
const int permutation = i * numThreads + thread;
if (permutation < P_10_8) {
// Decode 8 base-10 digits; each digit selects one of the 10 stops.
indices[0] = (permutation / 1) % 10;
indices[1] = (permutation / 10) % 10;
indices[2] = (permutation / 100) % 10;
indices[3] = (permutation / 1000) % 10;
indices[4] = (permutation / 10000) % 10;
indices[5] = (permutation / 100000) % 10;
indices[6] = (permutation / 1000000) % 10;
indices[7] = (permutation / 10000000) % 10;
// Reject tuples that repeat a stop (full 8 x 8 comparison).
bool skip = false;
for (int j = 0; j < 8; j++) {
for (int k = 0; k < 8; k++) {
if (j != k && indices[j] == indices[k]) {
skip = true;
}
}
}
if (skip)
continue;
// Walk the path from (0, 0), accumulating Euclidean distance and
// tastiness; the running ratio is checked after every stop.
float distance = 0.0f;
float tastiness = 0.0f;
float lastX = 0;
float lastY = 0;
// NOTE: this inner `i` shadows the outer step counter (works, but
// easy to misread).
for (int i = 0; i < 8; i++) {
const float dx = path[indices[i] * 2] - lastX;
const float dy = path[indices[i] * 2 + 1] - lastY;
const float t = taste[indices[i]];
distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
tastiness += t;
lastX = path[indices[i] * 2];
lastY = path[indices[i] * 2 + 1];
if (distance > 0.0f) {
if (tastiness / distance > score) {
prm = permutation;
score = tastiness / distance;
}
}
}
}
}
// Warp-level max-reduction of the (score, prm) pair; all lanes reach
// this point, so the full 0xFFFFFFFF mask is correct.
const int warpLane = threadIdx.x & 31;
const int warpIndex = threadIdx.x >> 5;
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
if (warpLane < i) {
prm = (score < gather ? gather2 : prm);
score = (score < gather ? gather : score);
}
}
// Lane 0 of each warp publishes its warp's winner to shared memory.
if (warpLane == 0) {
s_reduction[warpIndex] = score;
s_reduction2[warpIndex] = prm;
}
// Make all per-warp results visible before warp 0 reduces them.
__syncthreads();
// Warp 0 reduces the per-warp winners; lanes beyond the actual warp
// count contribute the neutral pair (0.0f, 0).
if (warpIndex == 0) {
if (warpLane < (blockDim.x >> 5)) {
score = s_reduction[warpLane];
prm = s_reduction2[warpLane];
}
else {
score = 0.0f;
prm = 0;
}
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
if (warpLane < i) {
prm = (score < gather ? gather2 : prm);
score = (score < gather ? gather : score);
}
}
if (warpLane == 0) {
scores[blockIdx.x] = score;
permutations[blockIdx.x] = prm;
}
}
}
// Benchmarks k_brute, reduces the per-block winners on the host, then
// replays the winning permutation on the CPU to print the path and confirm
// that the same score is reproduced.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    int* permutations;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaMallocAsync(&permutations, sizeof(int) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores, permutations);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    int results2[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(results2, permutations, sizeof(int) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    int finalPermutation = 0;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
            finalPermutation = results2[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    // Decode the winning permutation exactly as the kernel does.
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    indices[0] = (finalPermutation / 1) % 10;
    indices[1] = (finalPermutation / 10) % 10;
    indices[2] = (finalPermutation / 100) % 10;
    indices[3] = (finalPermutation / 1000) % 10;
    indices[4] = (finalPermutation / 10000) % 10;
    indices[5] = (finalPermutation / 100000) % 10;
    indices[6] = (finalPermutation / 1000000) % 10;
    indices[7] = (finalPermutation / 10000000) % 10;
    // CPU replay of the winning path (host copies of the device tables).
    float distance = 0.0f;
    float tastiness = 0.0f;
    float lastX = 0;
    float lastY = 0;
    float path[] = {
        2, 8,
        15, 3,
        6, 6,
        20, 15,
        11, 2,
        14, 1,
        7, 3,
        1, 12,
        3, 6,
        14, 13
    };
    float taste[] = {
        7,
        17,
        8,
        30,
        12,
        15,
        5,
        12,
        3,
        18
    };
    for (int i = 0; i < 8; i++) {
        std::cout << "path: " << (indices[i] + 1) << std::endl;
        const float dx = path[indices[i] * 2] - lastX;
        const float dy = path[indices[i] * 2 + 1] - lastY;
        const float t = taste[indices[i]];
        distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
        tastiness += t;
        lastX = path[indices[i] * 2];
        lastY = path[indices[i] * 2 + 1];
        // Stop as soon as a prefix reproduces the kernel's best ratio;
        // the winning path may use fewer than 8 stops.
        if (distance > 0.0f) {
            if (fabsf(tastiness / distance - finalScore) < 0.001f) {
                std::cout << tastiness / distance << " score reached! " << std::endl;
                break;
            }
        }
    }
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaFreeAsync(permutations, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.5473
path: 7
path: 5
path: 6
path: 2
path: 10
path: 4
2.89452 score reached!
Some parts can be optimized:
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
Some parts can be optimized:
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)