8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
A second gpu could be given half of the work, to make it 2000x speedup (5 microseconds). Scaling isn't good beyond this, due to already-low latency.
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
A second gpu could be given half of the work, to make it 2000x speedup (5 microseconds). Scaling isn't good beyond this, due to already-low latency.
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Brute-force search for the path (up to 8 distinct stops out of 10) with the
// highest tastiness/distance ratio, starting from the origin (0, 0).
// Every prefix of a candidate path is scored too: the running ratio is
// compared after each stop, so the winner may effectively use < 8 stops.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (the block reduction uses one shared slot per warp, max 32 warps).
// Output: scores[blockIdx.x] / permutations[blockIdx.x] hold each block's
// best ratio and the encoded index tuple that produced it.
__global__ void k_brute(float* scores, int* permutations) {
    const int thread = threadIdx.x + blockIdx.x * blockDim.x;
    const int numThreads = blockDim.x * gridDim.x;
    __shared__ float s_reduction[32];   // per-warp best score
    __shared__ int s_reduction2[32];    // per-warp best permutation
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    float score = 0.0f;   // best ratio seen by this thread
    int prm = 0;          // encoded permutation that produced `score`
    // Grid-stride partition of the candidate space across all threads.
    const int steps = (P_10_8 + numThreads - 1) / numThreads;
    for (int step = 0; step < steps; step++) {
        const int permutation = step * numThreads + thread;
        if (permutation < P_10_8) {
            // Decode 8 base-10 digits; each digit selects one of the 10 stops.
            indices[0] = (permutation / 1) % 10;
            indices[1] = (permutation / 10) % 10;
            indices[2] = (permutation / 100) % 10;
            indices[3] = (permutation / 1000) % 10;
            indices[4] = (permutation / 10000) % 10;
            indices[5] = (permutation / 100000) % 10;
            indices[6] = (permutation / 1000000) % 10;
            indices[7] = (permutation / 10000000) % 10;
            // Reject tuples that repeat a stop. Equality is symmetric, so the
            // triangular j < k scan with early exit gives the same verdict as
            // the full 8 x 8 comparison in half the checks or fewer.
            bool skip = false;
            for (int j = 0; j < 8 && !skip; j++) {
                for (int k = j + 1; k < 8; k++) {
                    if (indices[j] == indices[k]) {
                        skip = true;
                        break;
                    }
                }
            }
            if (skip)
                continue;
            // Walk the path from (0, 0), accumulating Euclidean distance and
            // tastiness; the best ratio is updated after every stop so that
            // all prefixes of this tuple are considered.
            float distance = 0.0f;
            float tastiness = 0.0f;
            float lastX = 0;
            float lastY = 0;
            for (int stop = 0; stop < 8; stop++) {
                const float dx = path[indices[stop] * 2] - lastX;
                const float dy = path[indices[stop] * 2 + 1] - lastY;
                const float t = taste[indices[stop]];
                distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
                tastiness += t;
                lastX = path[indices[stop] * 2];
                lastY = path[indices[stop] * 2 + 1];
                if (distance > 0.0f) {
                    if (tastiness / distance > score) {
                        prm = permutation;
                        score = tastiness / distance;
                    }
                }
            }
        }
    }
    // Warp-level max-reduction of the (score, prm) pair; all 32 lanes of
    // every warp reach this point, so the full mask is correct.
    const int warpLane = threadIdx.x & 31;
    const int warpIndex = threadIdx.x >> 5;
    for (int i = 16; i >= 1; i >>= 1) {
        const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
        const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
        if (warpLane < i) {
            prm = (score < gather ? gather2 : prm);
            score = (score < gather ? gather : score);
        }
    }
    // Lane 0 of each warp publishes its warp's winner to shared memory.
    if (warpLane == 0) {
        s_reduction[warpIndex] = score;
        s_reduction2[warpIndex] = prm;
    }
    __syncthreads();   // all per-warp results visible before warp 0 reduces
    // Warp 0 reduces the per-warp winners; lanes beyond the actual warp
    // count contribute the neutral pair (0.0f, 0).
    if (warpIndex == 0) {
        if (warpLane < (blockDim.x >> 5)) {
            score = s_reduction[warpLane];
            prm = s_reduction2[warpLane];
        }
        else {
            score = 0.0f;
            prm = 0;
        }
        for (int i = 16; i >= 1; i >>= 1) {
            const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
            const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
            if (warpLane < i) {
                prm = (score < gather ? gather2 : prm);
                score = (score < gather ? gather : score);
            }
        }
        if (warpLane == 0) {
            scores[blockIdx.x] = score;
            permutations[blockIdx.x] = prm;
        }
    }
}
// Benchmarks k_brute, reduces the per-block winners on the host, then
// replays the winning permutation on the CPU to print the path and confirm
// that the same score is reproduced.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    int* permutations;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaMallocAsync(&permutations, sizeof(int) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores, permutations);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    int results2[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(results2, permutations, sizeof(int) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    int finalPermutation = 0;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
            finalPermutation = results2[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    // Decode the winning permutation exactly as the kernel does.
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    indices[0] = (finalPermutation / 1) % 10;
    indices[1] = (finalPermutation / 10) % 10;
    indices[2] = (finalPermutation / 100) % 10;
    indices[3] = (finalPermutation / 1000) % 10;
    indices[4] = (finalPermutation / 10000) % 10;
    indices[5] = (finalPermutation / 100000) % 10;
    indices[6] = (finalPermutation / 1000000) % 10;
    indices[7] = (finalPermutation / 10000000) % 10;
    // CPU replay of the winning path (host copies of the device tables).
    float distance = 0.0f;
    float tastiness = 0.0f;
    float lastX = 0;
    float lastY = 0;
    float path[] = {
        2, 8,
        15, 3,
        6, 6,
        20, 15,
        11, 2,
        14, 1,
        7, 3,
        1, 12,
        3, 6,
        14, 13
    };
    float taste[] = {
        7,
        17,
        8,
        30,
        12,
        15,
        5,
        12,
        3,
        18
    };
    for (int i = 0; i < 8; i++) {
        std::cout << "path: " << (indices[i] + 1) << std::endl;
        const float dx = path[indices[i] * 2] - lastX;
        const float dy = path[indices[i] * 2 + 1] - lastY;
        const float t = taste[indices[i]];
        distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
        tastiness += t;
        lastX = path[indices[i] * 2];
        lastY = path[indices[i] * 2 + 1];
        // Stop as soon as a prefix reproduces the kernel's best ratio;
        // the winning path may use fewer than 8 stops.
        if (distance > 0.0f) {
            if (fabsf(tastiness / distance - finalScore) < 0.001f) {
                std::cout << tastiness / distance << " score reached! " << std::endl;
                break;
            }
        }
    }
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaFreeAsync(permutations, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.5945473
path: 7
path: 5
path: 6
path: 2
path: 10
path: 4
2.89452 score reached!
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Score-only variant of the brute-force search: finds the best
// tastiness/distance ratio over paths of up to 8 distinct stops (the ratio
// is compared after every stop, so prefixes are scored too), without
// tracking which permutation produced it.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (one shared slot per warp, max 32 warps). Writes one score per block.
__global__ void k_brute(float* scores) {
const int thread = threadIdx.x + blockIdx.x * blockDim.x;
const int numThreads = blockDim.x * gridDim.x;
// Per-warp best score, filled by lane 0 of each warp.
__shared__ float s_reduction[32];
int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Best ratio seen by this thread so far.
float score = 0.0f;
// Grid-stride partition of the candidate space across all threads.
const int steps = (P_10_8 + numThreads - 1) / numThreads;
for (int i = 0; i < steps; i++) {
const int permutation = i * numThreads + thread;
if (permutation < P_10_8) {
// Decode 8 base-10 digits; each digit selects one of the 10 stops.
indices[0] = (permutation / 1) % 10;
indices[1] = (permutation / 10) % 10;
indices[2] = (permutation / 100) % 10;
indices[3] = (permutation / 1000) % 10;
indices[4] = (permutation / 10000) % 10;
indices[5] = (permutation / 100000) % 10;
indices[6] = (permutation / 1000000) % 10;
indices[7] = (permutation / 10000000) % 10;
// Reject tuples that repeat a stop (full 8 x 8 comparison).
bool skip = false;
for (int j = 0; j < 8; j++) {
for (int k = 0; k < 8; k++) {
if (j != k && indices[j] == indices[k]) {
skip = true;
}
}
}
if (skip)
continue;
// Walk the path from (0, 0), accumulating Euclidean distance and
// tastiness; the running ratio is checked after every stop.
float distance = 0.0f;
float tastiness = 0.0f;
float lastX = 0;
float lastY = 0;
// NOTE: this inner `i` shadows the outer step counter (works, but
// easy to misread).
for (int i = 0; i < 8; i++) {
const float dx = path[indices[i] * 2] - lastX;
const float dy = path[indices[i] * 2 + 1] - lastY;
const float t = taste[indices[i]];
distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
tastiness += t;
lastX = path[indices[i] * 2];
lastY = path[indices[i] * 2 + 1];
if (distance > 0.0f) {
if (tastiness / distance > score) {
score = tastiness / distance;
}
}
}
}
}
// Warp-level max-reduction; all lanes reach this point, so the full
// 0xFFFFFFFF mask is correct.
const int warpLane = threadIdx.x & 31;
const int warpIndex = threadIdx.x >> 5;
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
if (warpLane < i) {
score = (score < gather ? gather : score);
}
}
// Lane 0 of each warp publishes its warp's winner to shared memory.
if (warpLane == 0) {
s_reduction[warpIndex] = score;
}
// Make all per-warp results visible before warp 0 reduces them.
__syncthreads();
// Warp 0 reduces the per-warp winners; lanes beyond the actual warp
// count contribute a neutral 0.0f.
if (warpIndex == 0) {
if (warpLane < (blockDim.x >> 5)) {
score = s_reduction[warpLane];
}
else {
score = 0.0f;
}
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
if (warpLane < i) {
score = (score < gather ? gather : score);
}
}
if (warpLane == 0) {
scores[blockIdx.x] = score;
}
}
}
// Benchmarks the score-only k_brute variant and prints the best
// tastiness/distance ratio found across all blocks plus the mean kernel time.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.594
#define __CUDACC__
#include<iostream>
#include <cuda_runtime.h>
// Error-handling assert code from stackoverflow talonmies.
// do/while(0) makes the macro expand to a single statement, so
// `if (cond) gpuErrchk(x); else ...` stays valid (the original brace-block
// form would leave a stray `;` and break unbraced if/else bodies).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Prints a readable CUDA error with file/line context; terminates the
// process with the error code unless abort == false.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// (x, y) coordinates of the 10 candidate stops, flattened as x0,y0,x1,y1,...
// Read-only device lookup table, indexed as path[i * 2] / path[i * 2 + 1].
__device__ float path[] = {
2, 8,
15, 3,
6, 6,
20, 15,
11, 2,
14, 1,
7, 3,
1, 12,
3, 6,
14, 13
};
// Tastiness of stop i (same index space as `path`); summed along a path
// by the kernel. Must stay in sync with `path` entry-for-entry.
__device__ float taste[] = {
7,
17,
8,
30,
12,
15,
5,
12,
3,
18
};
// Size of the enumerated index space: the kernel decodes exactly 8 base-10
// digits per candidate, so 10^8 covers every digit tuple once. (The previous
// value, 10^9, re-scanned every tuple 10 times because the 9th digit was
// never decoded — identical results, 10x the work.)
constexpr int P_10_8 = 100000000;
constexpr int BLOCKS = 46 * 2;   // grid size: 2 blocks per SM on a 46-SM GPU
constexpr int THREADS = 768;     // must be a multiple of 32 (warp reduction)
// Brute-force search for the path (up to 8 distinct stops out of 10) with
// the highest tastiness/distance ratio, starting from the origin (0, 0).
// The ratio is compared after every stop, so prefixes are scored too.
// Launch: 1-D grid; blockDim.x must be a multiple of 32 and <= 1024
// (one shared slot per warp, max 32 warps). Writes each block's best score
// and the encoded index tuple that produced it.
__global__ void k_brute(float* scores, int* permutations) {
const int thread = threadIdx.x + blockIdx.x * blockDim.x;
const int numThreads = blockDim.x * gridDim.x;
// Per-warp best (score, permutation), filled by lane 0 of each warp.
__shared__ float s_reduction[32];
__shared__ int s_reduction2[32];
int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Best ratio seen by this thread, and the permutation that produced it.
float score = 0.0f;
int prm = 0;
// Grid-stride partition of the candidate space across all threads.
const int steps = (P_10_8 + numThreads - 1) / numThreads;
for (int i = 0; i < steps; i++) {
const int permutation = i * numThreads + thread;
if (permutation < P_10_8) {
// Decode 8 base-10 digits; each digit selects one of the 10 stops.
indices[0] = (permutation / 1) % 10;
indices[1] = (permutation / 10) % 10;
indices[2] = (permutation / 100) % 10;
indices[3] = (permutation / 1000) % 10;
indices[4] = (permutation / 10000) % 10;
indices[5] = (permutation / 100000) % 10;
indices[6] = (permutation / 1000000) % 10;
indices[7] = (permutation / 10000000) % 10;
// Reject tuples that repeat a stop (full 8 x 8 comparison).
bool skip = false;
for (int j = 0; j < 8; j++) {
for (int k = 0; k < 8; k++) {
if (j != k && indices[j] == indices[k]) {
skip = true;
}
}
}
if (skip)
continue;
// Walk the path from (0, 0), accumulating Euclidean distance and
// tastiness; the running ratio is checked after every stop.
float distance = 0.0f;
float tastiness = 0.0f;
float lastX = 0;
float lastY = 0;
// NOTE: this inner `i` shadows the outer step counter (works, but
// easy to misread).
for (int i = 0; i < 8; i++) {
const float dx = path[indices[i] * 2] - lastX;
const float dy = path[indices[i] * 2 + 1] - lastY;
const float t = taste[indices[i]];
distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
tastiness += t;
lastX = path[indices[i] * 2];
lastY = path[indices[i] * 2 + 1];
if (distance > 0.0f) {
if (tastiness / distance > score) {
prm = permutation;
score = tastiness / distance;
}
}
}
}
}
// Warp-level max-reduction of the (score, prm) pair; all lanes reach
// this point, so the full 0xFFFFFFFF mask is correct.
const int warpLane = threadIdx.x & 31;
const int warpIndex = threadIdx.x >> 5;
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
if (warpLane < i) {
prm = (score < gather ? gather2 : prm);
score = (score < gather ? gather : score);
}
}
// Lane 0 of each warp publishes its warp's winner to shared memory.
if (warpLane == 0) {
s_reduction[warpIndex] = score;
s_reduction2[warpIndex] = prm;
}
// Make all per-warp results visible before warp 0 reduces them.
__syncthreads();
// Warp 0 reduces the per-warp winners; lanes beyond the actual warp
// count contribute the neutral pair (0.0f, 0).
if (warpIndex == 0) {
if (warpLane < (blockDim.x >> 5)) {
score = s_reduction[warpLane];
prm = s_reduction2[warpLane];
}
else {
score = 0.0f;
prm = 0;
}
for (int i = 16; i >= 1; i >>= 1) {
const float gather = __shfl_down_sync(0xFFFFFFFF, score, i);
const int gather2 = __shfl_down_sync(0xFFFFFFFF, prm, i);
if (warpLane < i) {
prm = (score < gather ? gather2 : prm);
score = (score < gather ? gather : score);
}
}
if (warpLane == 0) {
scores[blockIdx.x] = score;
permutations[blockIdx.x] = prm;
}
}
}
// Benchmarks k_brute, reduces the per-block winners on the host, then
// replays the winning permutation on the CPU to print the path and confirm
// that the same score is reproduced.
int main() {
    cudaStream_t stream;
    cudaEvent_t event1;
    cudaEvent_t event2;
    gpuErrchk(cudaStreamCreate(&stream));
    gpuErrchk(cudaEventCreate(&event1));
    gpuErrchk(cudaEventCreate(&event2));
    // One result slot per block; the final max happens on the host.
    float* scores;
    int* permutations;
    gpuErrchk(cudaMallocAsync(&scores, sizeof(float) * BLOCKS, stream));
    gpuErrchk(cudaMallocAsync(&permutations, sizeof(int) * BLOCKS, stream));
    gpuErrchk(cudaEventRecord(event1, stream));
    constexpr int BENCHMARK_ITERATIONS = 10;
    for (int i = 0; i < BENCHMARK_ITERATIONS; i++) {
        k_brute << <BLOCKS, THREADS, 0, stream >> > (scores, permutations);
    }
    gpuErrchk(cudaGetLastError());   // surface launch-configuration errors
    gpuErrchk(cudaEventRecord(event2, stream));
    gpuErrchk(cudaStreamSynchronize(stream));
    float results[BLOCKS];
    int results2[BLOCKS];
    gpuErrchk(cudaMemcpy(results, scores, sizeof(float) * BLOCKS, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(results2, permutations, sizeof(int) * BLOCKS, cudaMemcpyDeviceToHost));
    // Host-side reduction over the per-block winners.
    float finalScore = 0.0f;
    int finalPermutation = 0;
    for (int i = 0; i < BLOCKS; i++) {
        if (finalScore < results[i]) {
            finalScore = results[i];
            finalPermutation = results2[i];
        }
    }
    std::cout << "tastiness / distance = " << finalScore << std::endl;
    std::cout << "distance / tastiness = " << (1.0f / finalScore) << std::endl;
    float milliseconds = 0;
    gpuErrchk(cudaEventElapsedTime(&milliseconds, event1, event2));
    std::cout << "milliseconds = " << (milliseconds / BENCHMARK_ITERATIONS) << std::endl;
    // Decode the winning permutation exactly as the kernel does.
    int indices[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    indices[0] = (finalPermutation / 1) % 10;
    indices[1] = (finalPermutation / 10) % 10;
    indices[2] = (finalPermutation / 100) % 10;
    indices[3] = (finalPermutation / 1000) % 10;
    indices[4] = (finalPermutation / 10000) % 10;
    indices[5] = (finalPermutation / 100000) % 10;
    indices[6] = (finalPermutation / 1000000) % 10;
    indices[7] = (finalPermutation / 10000000) % 10;
    // CPU replay of the winning path (host copies of the device tables).
    float distance = 0.0f;
    float tastiness = 0.0f;
    float lastX = 0;
    float lastY = 0;
    float path[] = {
        2, 8,
        15, 3,
        6, 6,
        20, 15,
        11, 2,
        14, 1,
        7, 3,
        1, 12,
        3, 6,
        14, 13
    };
    float taste[] = {
        7,
        17,
        8,
        30,
        12,
        15,
        5,
        12,
        3,
        18
    };
    for (int i = 0; i < 8; i++) {
        std::cout << "path: " << (indices[i] + 1) << std::endl;
        const float dx = path[indices[i] * 2] - lastX;
        const float dy = path[indices[i] * 2 + 1] - lastY;
        const float t = taste[indices[i]];
        distance += sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
        tastiness += t;
        lastX = path[indices[i] * 2];
        lastY = path[indices[i] * 2 + 1];
        // Stop as soon as a prefix reproduces the kernel's best ratio;
        // the winning path may use fewer than 8 stops.
        if (distance > 0.0f) {
            if (fabsf(tastiness / distance - finalScore) < 0.001f) {
                std::cout << tastiness / distance << " score reached! " << std::endl;
                break;
            }
        }
    }
    gpuErrchk(cudaFreeAsync(scores, stream));
    gpuErrchk(cudaFreeAsync(permutations, stream));
    gpuErrchk(cudaEventDestroy(event1));
    gpuErrchk(cudaEventDestroy(event2));
    gpuErrchk(cudaStreamDestroy(stream));
    return 0;
}
tastiness / distance = 2.89452
distance / tastiness = 0.345481
milliseconds = 16.5473
path: 7
path: 5
path: 6
path: 2
path: 10
path: 4
2.89452 score reached!
Some parts can be optimized:
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)
Some parts can be optimized:
8 x 8 skip loop: this can be replaced by an array that keeps track of used indices to reduce number of checks --> can speedup 10x
modulo arithmetic + math: a lot of iterations are wasted currently. Using the exact number of permutations required plus a one-to-one index mapping would also make it fast --> can speed it up 100x
total speedup would be 1000x (10-15 microseconds)