If, as Robert Crovella suggests, the clock64() calls don't get optimized away, then this should be enough:
// Integer type for cycle counts: holds clock64() readings and the
// differences between them.
using clock_value_t = long long;
// Busy-waits until at least `sleep_cycles` ticks of clock64() have passed
// since this function was entered.
//
// The counter is always sampled at least once after the starting sample, so
// a zero or negative `sleep_cycles` returns after a single check.
//
// Note: this only delays if the repeated clock64() reads are actually
// performed, i.e. the compiler does not optimize them away.
__device__ void sleep(clock_value_t sleep_cycles)
{
    const clock_value_t t0 = clock64();
    for (;;) {
        // Ticks elapsed since entry, re-sampled on every spin.
        const clock_value_t waited = clock64() - t0;
        if (waited >= sleep_cycles) {
            break;
        }
    }
}