If, as Robert Crovella suggests, the clock64() calls don't get optimized away, then this should be enough:
// Integer type used for clock-cycle counts in sleep() below: both the requested
// sleep length and the values derived from clock64() are held in this type.
using clock_value_t = long long;
/**
 * @brief Busy-waits in the calling device thread for a number of clock cycles.
 *
 * Samples clock64() once on entry and then polls it until the difference
 * from that first sample is no longer smaller than `sleep_cycles`.
 *
 * @param sleep_cycles Minimum number of cycles (as counted by clock64()) to
 *        spin for. Because the loop is a do-while, clock64() is polled at
 *        least once even when this value is zero or negative, after which
 *        the function returns.
 *
 * @note This is a spin loop; it only delays execution if the compiler keeps
 *       the repeated clock64() calls rather than optimizing them away.
 */
__device__ void sleep(clock_value_t sleep_cycles)
{
    // Reference point against which elapsed cycles are measured.
    const clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do {
        cycles_elapsed = clock64() - start;
    } while (cycles_elapsed < sleep_cycles);
}