From 74a7907cf0d61db7b6f363a653591720c45e4157 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 25 Jul 2024 12:17:36 -0400 Subject: [PATCH] [SYCL][ESIMD] Don't cast data to int for atomic_update (#14755) FP/double atomic support is emulated on PVC, and the types need to be accurate for the emulation to work correctly. We already do a similar thing for slm_atomic_update. Signed-off-by: Sarnie, Nick --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 79 ++++++++++++------- .../ESIMD/regression/fp_atomic_update.cpp | 69 ++++++++++++++++ .../esimd/memory_properties_atomic_update.cpp | 14 ++-- 3 files changed, 128 insertions(+), 34 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 51b606b185777..69afaf9667975 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -8841,14 +8841,20 @@ atomic_update_impl(T *p, simd offsets, simd src0, constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; constexpr int IOp = lsc_to_internal_atomic_op(); - simd Msg_data = lsc_format_input(src0); simd addrs = reinterpret_cast(p); addrs += convert(offsets); - simd Tmp = - __esimd_lsc_xatomic_stateless_1( - pred.data(), addrs.data(), Msg_data.data()); - return lsc_format_ret(Tmp); + if constexpr (std::is_same_v || std::is_same_v) { + return __esimd_lsc_xatomic_stateless_1( + pred.data(), addrs.data(), src0.data()); + } else { + simd Msg_data = lsc_format_input(src0); + simd Tmp = + __esimd_lsc_xatomic_stateless_1( + pred.data(), addrs.data(), Msg_data.data()); + return lsc_format_ret(Tmp); + } } /// USM pointer atomic. @@ -8885,15 +8891,22 @@ atomic_update_impl(T *p, simd offsets, simd src0, constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; constexpr int IOp = lsc_to_internal_atomic_op(); - simd Msg_data0 = lsc_format_input(src0); - simd Msg_data1 = lsc_format_input(src1); simd addrs = reinterpret_cast(p); addrs += convert(offsets); - simd Tmp = - __esimd_lsc_xatomic_stateless_2( - pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data()); - return lsc_format_ret(Tmp); + if constexpr (std::is_same_v || std::is_same_v) { + return __esimd_lsc_xatomic_stateless_2( + pred.data(), addrs.data(), src0.data(), src1.data()); + } else { + simd Msg_data0 = lsc_format_input(src0); + simd Msg_data1 = lsc_format_input(src1); + + simd Tmp = + __esimd_lsc_xatomic_stateless_2( + pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data()); + return lsc_format_ret(Tmp); + } } /// Accessor-based atomic. @@ -8992,13 +9005,19 @@ __ESIMD_API constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; constexpr int IOp = lsc_to_internal_atomic_op(); - simd Src0Msg = lsc_format_input(src0); auto si = get_surface_index(acc); - simd Tmp = - __esimd_lsc_xatomic_bti_1( - pred.data(), byte_offset.data(), Src0Msg.data(), si); - return lsc_format_ret(Tmp); + if constexpr (std::is_same_v || std::is_same_v) { + return __esimd_lsc_xatomic_bti_1( + pred.data(), byte_offset.data(), src0.data(), si); + } else { + simd Src0Msg = lsc_format_input(src0); + simd Tmp = + __esimd_lsc_xatomic_bti_1( + pred.data(), byte_offset.data(), Src0Msg.data(), si); + return lsc_format_ret(Tmp); + } #endif } @@ -9047,15 +9066,21 @@ __ESIMD_API constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; constexpr int IOp = lsc_to_internal_atomic_op(); - simd Msg_data0 = lsc_format_input(src0); - simd Msg_data1 = lsc_format_input(src1); auto si = get_surface_index(acc); - simd Tmp = - __esimd_lsc_xatomic_bti_2( - pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(), - si); - return lsc_format_ret(Tmp); + if constexpr (std::is_same_v || std::is_same_v) { + return __esimd_lsc_xatomic_bti_2( + pred.data(), byte_offset.data(), src0.data(), src1.data(), si); + } else { + simd Msg_data0 = lsc_format_input(src0); + simd Msg_data1 = lsc_format_input(src1); + simd Tmp = + __esimd_lsc_xatomic_bti_2( + pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(), + si); + return lsc_format_ret(Tmp); + } #endif } } // namespace detail diff --git a/sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp b/sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp new file mode 100644 index 0000000000000..9ded1bdbebb7e --- /dev/null +++ b/sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp @@ -0,0 +1,69 @@ +//==----------- fp_atomic_update.cpp - DPC++ ESIMD on-device test --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: arch-intel_gpu_pvc +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; +using namespace sycl::ext::intel::experimental::esimd; + +int main() { + sycl::queue queue{gpu_selector_v}; + esimd_test::printTestLabel(queue); + constexpr size_t N = 8; + double *data_d = sycl::malloc_device(N, queue); + double *out_d = sycl::malloc_shared(N, queue); + int errCount = 0; + + try { + + std::vector data( + N, sycl::bit_cast(uint64_t(0x400000018FFFFFFF))); + + queue.memcpy(data_d, data.data(), N * sizeof(double)).wait(); + queue.fill(out_d, sycl::bit_cast(uint64_t(0x0000000000000001)), N) + .wait(); + + queue.parallel_for(sycl::nd_range<1>(1, 1), + [=](sycl::nd_item<1> item) SYCL_ESIMD_KERNEL { + // Atomically update the maximum value + simd tmp; + tmp.copy_from(data_d); + atomic_update( + out_d, simd(0, sizeof(double)), tmp); + }); + + queue.wait_and_throw(); + + std::vector out_data(N); + queue.memcpy(out_data.data(), out_d, N * sizeof(double)).wait(); + for (int iter = 0; iter < out_data.size(); iter++) { + double relError = (out_data[iter] - data[iter]) / data[iter]; + if (relError != 0 && ++errCount < 10) + std::cout << "ERROR at index " + std::to_string(iter) << ": " + << std::to_string(relError) + " != 0\n"; + } + } catch (sycl::exception &e) { + free(data_d, queue); + free(out_d, queue); + std::cerr << "SYCL exception caught: " << e.what() << std::endl; + return 1; + } + free(data_d, queue); + free(out_d, queue); + std::cout << (errCount == 0 ? "Passed\n" : "Failed\n"); + return errCount != 0; +} diff --git a/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp b/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp index d960dabc0b515..97549dd4887ac 100644 --- a/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp +++ b/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp @@ -596,11 +596,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto compare = swap * 2; auto pred = simd_mask(1); // Do not pass the properties. - // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef) + // CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef) auto atomic_res0 = atomic_update( ptrf, offsets, swap, compare, pred); // Now with cache hints. - // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef) + // CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef) auto atomic_res1 = atomic_update( ptrf, offsets, swap, compare, pred, props_a); } @@ -824,13 +824,13 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto compare = swap * 2; auto pred = simd_mask(1); // Do not pass the properties. - // CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef) - // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef) + // CHECK-STATEFUL: call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef) + // CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef) auto atomic_res0 = atomic_update( acc, offsets, swap, compare, pred); // Now with cache hints. - // CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef) - // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef) + // CHECK-STATEFUL: call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef) + // CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef) auto atomic_res1 = atomic_update( acc, offsets, swap, compare, pred, props_a); } @@ -1414,4 +1414,4 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, *local_acc, offsets, swap, compare); } } -} \ No newline at end of file +}