diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp index 440f573..5f66f4b 100644 --- a/samples/16_floatatomics/main.cpp +++ b/samples/16_floatatomics/main.cpp @@ -8,13 +8,15 @@ #include +#include #include #include +#include #include "util.hpp" static const char kernelString[] = R"CLC( -inline float atomic_add_f(volatile global float* addr, float val) +float atomic_add_f(volatile global float* addr, float val) { #if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE) //#pragma message("using cl_ext_float_atomics") @@ -28,18 +30,20 @@ inline float atomic_add_f(volatile global float* addr, float val) return __builtin_amdgcn_global_atomic_fadd_f32(addr, val); #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7 //#pragma message("using emulated float atomics") - float ret = atomic_xchg(addr, 0.0f); - float old = ret + val; - while((old = atomic_xchg(addr, old)) != 0.0f) { - old = atomic_xchg(addr, 0.0f) + old; - } - return ret; + float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f); + // Note: this emulated version cannot reliably return the previous value! + // This makes it unsuitable for general-purpose use, but it is sufficient + // for some cases, such as reductions. + // A more reliable version would use a compare-exchange loop, though it + // would be much slower. + return 0.0f; #endif } -kernel void FloatAtomicTest(global float* dst) +kernel void FloatAtomicTest(global float* dst, global float* results) { - atomic_add_f(dst, 1.0f); + int index = get_global_id(0); + results[index] = atomic_add_f(dst, 1.0f); } )CLC"; @@ -73,9 +77,10 @@ int main( int deviceIndex = 0; size_t iterations = 16; - size_t gwx = 1024 * 1024; + size_t gwx = 64 * 1024; bool emulate = false; + bool check = false; { popl::OptionParser op("Supported Options"); @@ -84,6 +89,7 @@ int main( op.add>("i", "iterations", "Iterations", iterations, &iterations); op.add>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx); op.add("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate); + op.add("c", "check", "Check Intermediate Results", &check); bool printUsage = false; try { @@ -113,7 +119,15 @@ int main( printf("Running on device: %s\n", devices[deviceIndex].getInfo().c_str() ); - if (checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) { + // On some implementations, the feature test macros for float atomics are + // only defined when compiling for OpenCL C 3.0 or newer. + std::string buildOptions = "-cl-std=CL3.0"; + if (emulate) { + printf("Forcing emulation.\n"); + buildOptions += " -DEMULATE"; + } else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) { + printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n"); + } else { printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n"); cl_device_fp_atomic_capabilities_ext spcaps = @@ -130,41 +144,29 @@ int main( devices[deviceIndex].getInfo(); printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n"); PrintFloatAtomicCapabilities(hpcaps); - - if (spcaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT == 0) { - printf("Device does not support fp32 atomic add.\n"); - } - } else { - printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n"); } cl::Context context{devices[deviceIndex]}; cl::CommandQueue commandQueue{context, devices[deviceIndex]}; cl::Program program{ context, kernelString }; - - // On some implementations, the feature test macros for float atomics are - // only defined when compiling for OpenCL C 3.0 or newer. - std::string buildOptions = "-cl-std=CL3.0"; - if (emulate) { - printf("Forcing emulation.\n"); - buildOptions += " -DEMULATE"; - } - program.build(buildOptions); cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" }; - cl::Buffer deviceMemDst = cl::Buffer{ + cl::Buffer dst = cl::Buffer{ context, CL_MEM_READ_WRITE, sizeof(cl_float) }; + cl::Buffer intermediates = cl::Buffer{ + context, + CL_MEM_READ_WRITE, + gwx * sizeof(cl_float) }; // execution { - kernel.setArg(0, deviceMemDst); + kernel.setArg(0, dst); + kernel.setArg(1, intermediates); - // Ensure the queue is empty and no processing is happening - // on the device before starting the timer. commandQueue.finish(); auto start = std::chrono::system_clock::now(); @@ -172,7 +174,7 @@ int main( { cl_float zero = 0.0f; commandQueue.enqueueFillBuffer( - deviceMemDst, + dst, zero, 0, sizeof(zero)); @@ -182,7 +184,6 @@ int main( cl::NDRange{gwx}); } - // Ensure all processing is complete before stopping the timer. commandQueue.finish(); auto end = std::chrono::system_clock::now(); @@ -190,11 +191,11 @@ int main( printf("Finished in %f seconds\n", elapsed_seconds.count()); } - // validation + // basic validation { cl_float result = 0.0f; commandQueue.enqueueReadBuffer( - deviceMemDst, + dst, CL_TRUE, 0, sizeof(result), @@ -202,7 +203,45 @@ int main( if (result != (float)gwx) { printf("Error: expected %f, got %f!\n", (float)gwx, result); } else { - printf("Success.\n"); + printf("Basic Validation: Success.\n"); + } + } + + // intermediate results validation + if (check) { + if (emulate) { + printf("Skipping The emulated float atomic add does not support intermediate results.\n"); + } else { + std::vector test(gwx); + commandQueue.enqueueReadBuffer( + intermediates, + CL_TRUE, + 0, + gwx * sizeof(cl_float), + test.data()); + + std::sort(test.begin(), test.end()); + + size_t mismatches = 0; + for (size_t i = 0; i < gwx; i++) { + if (i == 0 && !(test[i] == 0.0f)) { + if (mismatches < 16) { + printf("Error at index %zu: expected %f, got %f!\n", i, 0.0f, test[i]); + } + mismatches++; + } else if (i > 0 && !(test[i] > test[i-1])) { + if (mismatches < 16) { + printf("Error at index %zu: expected %f > %f!\n", i, test[i], test[i-1]); + } + mismatches++; + } + } + + if (mismatches) { + printf("Intermediate Results Validation: Found %zu mismatches / %zu values!!!\n", mismatches, gwx); + } else { + printf("Intermediate Results Validation: Success.\n"); + } } }