I want to multiply two complex vectors element-wise and accumulate ("integrate") the products into a vector called acc_y. acc_y is updated on every iteration and averaged out at the end.
I have modified a vector-addition example to do this.
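To spell out the per-element operation, each iteration performs a complex multiply-accumulate:

    acc_y[i].re += x1[i].re * x2[i].re - x1[i].im * x2[i].im
    acc_y[i].im += x1[i].re * x2[i].im + x1[i].im * x2[i].re

and after all iter iterations each element of acc_y is divided by iter.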
Kernel for the multiply and accumulation:
__global__ void cvctmac(int M, float *yre, float *yim,
                        float *x1re, float *x1im,
                        float *x2re, float *x2im,
                        double *acc_yre, double *acc_yim) {
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    // Grid-stride loop: complex multiply, accumulated ("integrated") into acc_y.
    // Note: yre/yim are not used inside this kernel.
    for (int i = index; i < M; i += stride) {
        acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
        acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];
    }
}
__global__ void cavg(int M, double iter, double *xre, double *xim) {
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    // Grid-stride approach: divide every element by the iteration count
    for (int i = index; i < M; i += stride) {
        xre[i] /= iter;
        xim[i] /= iter;
    }
}
Because the code performs complex arithmetic, each thread is doing four operations per element (two for the multiplication and two for the integration). What would be the way to optimize the cvctmac kernel?
Will shared memory help here?
I have also tried cuComplex.h, but I get the same performance.
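For reference, my cuComplex.h attempt was roughly the following (a sketch, not my exact code; it assumes interleaved arrays x1, x2, acc_y instead of the split re/im arrays above):

#include <cuComplex.h>

__global__ void cvctmac_cu(int M, const cuFloatComplex *x1,
                           const cuFloatComplex *x2, cuDoubleComplex *acc_y) {
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < M; i += stride) {
        // Complex product in single precision, accumulated in double
        cuFloatComplex p = cuCmulf(x1[i], x2[i]);
        acc_y[i] = cuCadd(acc_y[i], cuComplexFloatToDouble(p));
    }
}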
Host code:
for (j = 0; j < iter; j++) {
    // Generate this iteration's data on the host
    for (i = j * M, c = 0; i < M * (j + 1); i++, c++) {
        x1re[c] = (float)i;
        x1im[c] = 0.0f;
        x2re[c] = 1.0f;
        x2im[c] = 0.0f;
    }
    // Copy host to device
    cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);
    // Multiply-accumulate into dacc_yre/dacc_yim
    cvctmac<<<numBlock, numThread>>>(
        M,
        dyre, dyim,
        dx1re, dx1im,
        dx2re, dx2im,
        dacc_yre, dacc_yim
    );
}
// Average the accumulated result over the iterations
cavg<<<numBlock, numThread>>>(
    M, (double)iter,
    dacc_yre, dacc_yim
);
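For completeness, numBlock and numThread are not shown above; roughly what I use is the usual sizing (256 threads per block is just my current choice; with the grid-stride loops, any grid that covers M works):

int numThread = 256;                              // threads per block (arbitrary choice)
int numBlock  = (M + numThread - 1) / numThread;  // enough blocks to cover all M elements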
Please suggest ways to optimize the code. I'm targeting CUDA with compute capability 6.1.