1
\$\begingroup\$

I don't know whether I'm combining OpenCL and SFML correctly, but I know a little about both, so I decided to build something with them. I had already tried implementing a pure C++ and SFML version of a Mandelbrot set generator, but it ran for 50 minutes and generated a 1000*1000 image that was not sufficiently detailed. I wrote this code today by following an OpenCL tutorial on simple vector addition and working out how to adapt it to my problem. That is why my code uses only vectors (one image row per kernel launch) rather than matrices. I'm open to a solution that uses a huge matrix in OpenCL (I'm aware that would be much more efficient than my solution). I would appreciate feedback from developers with experience in OpenCL and the GPGPU approach!

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <CL\opencl.h>
#include <SFML\Graphics.hpp>

#define DATA_SIZE 8192

int main(int argc, char* argv[])
{
    sf::Image img;
    img.create(DATA_SIZE, DATA_SIZE);

    cl_int err;

    size_t global; // globális probléma tér
    size_t local; // lokális probléma tér

    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);
    // Get first available platform
    if (err != CL_SUCCESS) {
        std::cerr << "Error, failed to find platform.\n";
        std::cin.get();
        return EXIT_FAILURE;
    }

    cl_device_id device_id;
    err = clGetDeviceIDs(platform,
        CL_DEVICE_TYPE_GPU,
        1,
        &device_id,
        NULL);
    if (err != CL_SUCCESS) {
        std::cerr << "Error, failed to create device group.\n";
        std::cin.get();
        return EXIT_FAILURE;
    }

    cl_context context;
    context = clCreateContext(0,
        1,
        &device_id,
        NULL,
        NULL,
        &err);
    if (!context) {
        std::cerr << "Error, failed to create a compute context.\n";
        std::cin.get();
        return EXIT_FAILURE;
    }

    cl_command_queue commands;
    commands = clCreateCommandQueue(context,
        device_id,
        0,
        &err);
    if (!commands) {
        std::cerr << "Error, failed to create command queue.\n";
        std::cin.get();
        return EXIT_FAILURE;
    }

    const char* KernelSource = "__kernel void sqr(__global float* input,\n"
        "const int row,\n"
        "__global float* output){\n"
        "int i = get_global_id(0);\n"
        "float c_re = input[i];\n"
        "float c_im = 1.5 - row*3.0/8192.;\n"
        "int count = 0;\n"
        "float x = 0., y = 0.;\n"
        "while(x*x + y*y < 2. && count < 255 ){\n"
        "float x_new = x*x - y*y + c_re;\n"
        "y = 2*x*y + c_im;\n"
        "x = x_new;\n"
        "count++;\n"
        "}\n"
        "output[i] = count;\n"
        "}\n";

    cl_program program;
    program = clCreateProgramWithSource(context,
        1,
        &KernelSource,
        NULL,
        &err);

    err = clBuildProgram(program,
        0,
        NULL,
        NULL,
        NULL,
        NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[2048];
        std::cerr << "Failed to build executable.\n";
        clGetProgramBuildInfo(program, device_id,
            CL_PROGRAM_BUILD_LOG,
            sizeof(buffer), buffer, &len);
        std::cerr << buffer << std::endl;
        std::cin.get();
        exit(1);
    }

    cl_kernel kernel;
    kernel = clCreateKernel(program, "sqr", &err);
    if (!kernel || err != CL_SUCCESS) {
        std::cerr << "Error, failed to create compute kernel.\n";
        std::cin.get();
        exit(1);
    }

    float* data = new float[DATA_SIZE];
    float* results = new float[DATA_SIZE];
    int row;
    cl_mem input;
    cl_mem output;

    for (int s = 0; s < DATA_SIZE; s++) {

        row = s;
        unsigned int count = DATA_SIZE;
        for (int i = 0; i < count; i++) {
            data[i] = -1.5 + 3.*i / (float)count;
        }

        input = clCreateBuffer(context,
            CL_MEM_READ_ONLY, sizeof(float)*count,
            NULL,
            NULL);

        output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            sizeof(float)*count,
            NULL,
            NULL);

        if (!input || !output) {
            std::cerr << "Error, failed to allocate device memory.\n";
            std::cin.get();
            exit(1);
        }

        err = clEnqueueWriteBuffer(commands,
            input,
            CL_TRUE,
            0,
            sizeof(float)*count,
            data,
            0,
            NULL,
            NULL);

        if (err != CL_SUCCESS) {
            std::cerr << "Error, failed to write to source array.\n";
            std::cin.get();
            exit(1);
        }

        err = 0;
        err = clSetKernelArg(kernel,
            0,
            sizeof(cl_mem),
            &input);
        err |= clSetKernelArg(kernel,
            1,
            sizeof(int),
            &row);
        err |= clSetKernelArg(kernel,
            2,
            sizeof(cl_mem),
            &output);
        if (err != CL_SUCCESS) {
            std::cerr << "Error, failed to set kernel args.\n";
            std::cin.get();
            exit(1);
        }

        err = clGetKernelWorkGroupInfo(kernel,
            device_id,
            CL_KERNEL_WORK_GROUP_SIZE,
            sizeof(local),
            &local,
            NULL);

        if (err != CL_SUCCESS) {
            std::cerr << "Error, failed to retrieve kernel workgroup info.\n";
            std::cin.get();
            exit(1);
        }

        global = count;
        err = clEnqueueNDRangeKernel(commands,
            kernel,
            1,
            NULL,
            &global,
            &local,
            0,
            NULL,
            NULL);

        if (err) {
            std::cerr << "Error: failed to execute kernel.\n";
            std::cin.get();
            exit(1);
        }

        clFinish(commands);
        err = clEnqueueReadBuffer(commands,
            output,
            CL_TRUE,
            0,
            sizeof(float)*count,
            results,
            0,
            NULL,
            NULL);

        if (err != CL_SUCCESS) {
            std::cerr << "Failed to read output array.\n";
            std::cin.get();
            exit(1);
        }

        // Set the pixels in the img after the calculation
        for (int i = 0; i < count; i++) {
            img.setPixel(i, s, sf::Color::Color(0, (int)results[i], 0));
        }
    }
    // Cleanup.
    delete[] data;
    delete[] results;

    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);


    // Save the image to a bitmap
    img.saveToFile("mandelbrot.bmp");
    return 0;
}

To be 100% honest, I do not understand everything in the code. However, I'm familiar with the basics. I'm looking forward to useful ideas.

\$\endgroup\$
8
  • \$\begingroup\$ Comments like // DATA SIZE SHOULD BE 2^n-th !!!!! DON'T KNOW WHY! YET!!! :D ;) aren't a good start for code review. I'd at least beef about these smileys ;-) \$\endgroup\$ Commented Mar 19, 2017 at 21:45
  • \$\begingroup\$ Well. That's a comment for myself. I forgot to delete it (as I was doing so with several others) but at least this way you can explain me why. ;) -Edited- \$\endgroup\$ Commented Mar 19, 2017 at 21:51
  • 1
    \$\begingroup\$ "To be 100% honest. I do not understand everything in the code." All in all, it sounds you're asking about explanations of someone else's code, that's off-topic here. You should know what your code does, that's kinda precondition for code review. \$\endgroup\$ Commented Mar 19, 2017 at 21:52
  • \$\begingroup\$ I understand most of it, but obviously there are parts that I do not grasp yet (that's why I couldn't figure out how to do it with matrices). I hope I can get helpful comments as well. \$\endgroup\$ Commented Mar 19, 2017 at 21:59
  • 4
    \$\begingroup\$ @Qbeer666 , I think he is right. Panta (sorry for misspelling) wants to improve the question. Right now it sounds very weird for me. I think stressing on that the code works and you've written it would be great, especially at the last paragraph. You can even wipe it. \$\endgroup\$ Commented Mar 19, 2017 at 22:17

1 Answer 1

2
\$\begingroup\$

I'm open to a solution that uses a huge matrix in OpenCL (I'm aware that would be much more efficient than my solution).

The kernel code doesn't share any data between work-items (although neighbouring work-items produce similar results; see the 2D kernel part). Each work-item works only on its own data in this program. So having a larger image just decreases the ratio of kernel launch overhead to (computation + buffer copy) time, so the perceived throughput increases.

But,

Since each compute unit executes its work-items in SIMD fashion, neighbouring work-items should produce the same or similar colors. Decreasing the local group size as much as possible should therefore use those SIMD units better, because a difference in color means divergence in the pipeline, which is bad for performance.

Think of drawing a filled circle, interior pixels need more work, outer part less work. Smaller tiles mean more efficient work distribution around the surface line.

2D Kernel

A scanline is not enough. Neighbouring pixels along the Y axis can also have the same or similar values, so you should use a 2D NDRange kernel and have the work-items Z-ordered, or at least arranged in square tiles.

If each compute unit has 64 cores, try tiles of 8x8 instead of 2x16 or 16x2 because of pixel result divergence.

Even with a 1D kernel, you can achieve the same performance:

  • Get group id, get group x and group y values from that using modulus and division.
  • Map a local group to a tile using modulus and division again so each local thread works on neighbours in a tile instead of a scanline.

// 64 threads per group(square mapped), 256x256 image

 thread_group_x = get_group_id(0)%32  ---> 32 tiles along X axis
 thread_group_y = get_group_id(0)/32  ---> 32 tiles along Y axis
 thread_x = get_local_id(0)%8 ----> pixel-x inside tile
 thread_y = get_local_id(0)/8 ----> pixel-y inside tile
 ---calculate---
 ---calculate end---
 store(result, index=(thread_x+thread_group_x*8 + 256*(thread_y+thread_group_y*8)));

Pixel indices by 1D emulated to 2D kernel for 8k x 8k example with 64 local threads:

 unsigned ix = (get_group_id (0)%1024)*8+get_local_id(0)%8;
 unsigned iy = (get_group_id (0)/1024)*8+get_local_id(0)/8;

After the data locality problem is solved, you can optimize the buffer copies to see the actual compute performance instead of the PCIe bottleneck.

\$\endgroup\$
4
  • \$\begingroup\$ Thank you very much. I'm a beginner. I'm open to every suggestion and happy to finally get useful help. Thanks again. \$\endgroup\$ Commented Mar 20, 2017 at 23:13
  • \$\begingroup\$ I just tried a 32-bit precision version with an 8192x8192 image target; it took 1400 milliseconds on average. I'll put the kernel here, then optimize it later. \$\endgroup\$ Commented Mar 20, 2017 at 23:17
  • \$\begingroup\$ @Qbeer666 just try decreasing local size to a minimum maintainable number (such as number of cores per compute unit) so they will have less branching in a SIMD and have better performance. Then use 2D ndrangekernel and use Z-ordering to have even better data per compute unit \$\endgroup\$ Commented Mar 21, 2017 at 0:04
  • \$\begingroup\$ Floating-point throughput is about 45 GFLOPS for the Intel HD 400. I suspect it's already bottlenecked by the cores and can't get more speed with a 2D kernel. \$\endgroup\$ Commented Mar 21, 2017 at 1:06

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.