einpoklum
I've so far been spared the need to waste any cycles on the GPU, but it seems like this might change. After getting some information I was missing about the semantics of CUDA's clock64() function, I've written the following bit of code for use in kernels (or other device-side functions):

// Shouldn't this be typedef'ed in the CUDA headers somewhere?
using clock_value_t = long long;

namespace detail {

__device__ void sleep(
    clock_value_t            num_cycles, 
    volatile clock_value_t*  buffer_to_avoid_optimization)
{
    clock_value_t start = clock64();
    clock_value_t now;
    while (true) {
        now = clock64();
        // (Note the assumption of no wrap-around)
        clock_value_t cycles_elapsed = now - start;
        if (cycles_elapsed >= num_cycles) { break; }
    }
    // The memory write here should (hopefully) prevent the compiler
    // from optimizing the entire loop away
    *buffer_to_avoid_optimization = now - start;
}

} // namespace detail

/**
 * Have the executing warp busy-sleep until at least a certain
 * number of SM clock cycles have passed.
 *
 * @note The exact number of cycles busy-slept depends on how soon the
 * warp is scheduled to execute again after its last iteration of the
 * elapsed-cycles check.
 *
 * @param num_cycles The minimum number of cycles to busy-sleep
 */
__device__ void sleep(clock_value_t num_cycles)
{
    static volatile clock_value_t buffer;
    detail::sleep(num_cycles, &buffer);
}
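Since the device-side code above can't be exercised without a GPU, here is a host-side C++ sketch of the same busy-wait structure, with std::chrono::steady_clock standing in for clock64(). The names busy_sleep and ticks_now are illustrative, not CUDA APIs; it uses the same volatile-sink pattern to give the loop an observable side effect:

```cpp
#include <cassert>
#include <chrono>
#include <cstdint>

// Stand-in for clock_value_t / clock64(): steady_clock ticks since epoch.
using tick_value_t = std::int64_t;

inline tick_value_t ticks_now()
{
    return std::chrono::steady_clock::now().time_since_epoch().count();
}

// Host-side analogue of detail::sleep: spin until at least num_ticks
// have elapsed, then write the measured elapsed count through a
// volatile pointer so the loop has an observable side effect.
void busy_sleep(tick_value_t num_ticks, volatile tick_value_t* sink)
{
    tick_value_t start = ticks_now();
    tick_value_t now;
    do {
        now = ticks_now();
        // (Same no-wrap-around assumption as the device version)
    } while (now - start < num_ticks);
    *sink = now - start;
}
```

Note that on the host the clock calls themselves are opaque to the optimizer, so the loop would survive even without the sink; on the device, the volatile write is what the original code relies on.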

Beyond general observations (which are welcome), I have a few specific questions:

  • Is this approach reasonable, or should I be doing something very different?
  • Is there a better way of preventing the while(true) loop from being optimized away?
  • Do I even need the volatile modifier on buffer_to_avoid_optimization?
  • Could I use a reference instead of a pointer?
  • Should I give it a different name?
