[SYCL][ESIMD] Don't cast data to int for atomic_update (#14755)

FP/double atomic support is emulated on PVC, and the types need to be accurate for the emulation to work correctly. We already do a similar thing for slm_atomic_update. Signed-off-by: Sarnie, Nick <[email protected]>
intel · Jul 25, 2024 · 74a7907 · 74a7907
1 parent ecec9d1
commit 74a7907
Show file tree

Hide file tree

Showing 3 changed files with 128 additions and 34 deletions.
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -8841,14 +8841,20 @@ atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
   simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
   addrs += convert<uintptr_t>(offsets);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
-                                      ImmOffset, EDS, VS, Transposed, N>(
-          pred.data(), addrs.data(), Msg_data.data());
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
+    return __esimd_lsc_xatomic_stateless_1<T, IOp, L1H, L2H, AddressScale,
+                                           ImmOffset, EDS, VS, Transposed, N>(
+        pred.data(), addrs.data(), src0.data());
+  } else {
+    simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
+                                        ImmOffset, EDS, VS, Transposed, N>(
+            pred.data(), addrs.data(), Msg_data.data());
+    return lsc_format_ret<T>(Tmp);
+  }
 }
 
 /// USM pointer atomic.
@@ -8885,15 +8891,22 @@ atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
-  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
   simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
   addrs += convert<uintptr_t>(offsets);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
-                                      ImmOffset, EDS, VS, Transposed, N>(
-          pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
+    return __esimd_lsc_xatomic_stateless_2<T, IOp, L1H, L2H, AddressScale,
+                                           ImmOffset, EDS, VS, Transposed, N>(
+        pred.data(), addrs.data(), src0.data(), src1.data());
+  } else {
+    simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
+
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
+                                        ImmOffset, EDS, VS, Transposed, N>(
+            pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
+    return lsc_format_ret<T>(Tmp);
+  }
 }
 
 /// Accessor-based atomic.
@@ -8992,13 +9005,19 @@ __ESIMD_API
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
   auto si = get_surface_index(acc);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
-                                EDS, VS, Transposed, N>(
-          pred.data(), byte_offset.data(), Src0Msg.data(), si);
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
+    return __esimd_lsc_xatomic_bti_1<T, IOp, L1H, L2H, AddressScale, ImmOffset,
+                                     EDS, VS, Transposed, N>(
+        pred.data(), byte_offset.data(), src0.data(), si);
+  } else {
+    simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
+                                  EDS, VS, Transposed, N>(
+            pred.data(), byte_offset.data(), Src0Msg.data(), si);
+    return lsc_format_ret<T>(Tmp);
+  }
 #endif
 }
 
@@ -9047,15 +9066,21 @@ __ESIMD_API
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
-  simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
   auto si = get_surface_index(acc);
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
-                                EDS, VS, Transposed, N>(
-          pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
-          si);
-  return lsc_format_ret<T>(Tmp);
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
+    return __esimd_lsc_xatomic_bti_2<T, IOp, L1H, L2H, AddressScale, ImmOffset,
+                                     EDS, VS, Transposed, N>(
+        pred.data(), byte_offset.data(), src0.data(), src1.data(), si);
+  } else {
+    simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
+    simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
+    simd<MsgT, N> Tmp =
+        __esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
+                                  EDS, VS, Transposed, N>(
+            pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
+            si);
+    return lsc_format_ret<T>(Tmp);
+  }
 #endif
 }
 } // namespace detail

diff --git a/sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp b/sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp
@@ -0,0 +1,69 @@
+//==----------- fp_atomic_update.cpp  - DPC++ ESIMD on-device test --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: arch-intel_gpu_pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <bit>
+#include <bitset>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/intel/esimd.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+
+int main() {
+  sycl::queue queue{gpu_selector_v};
+  esimd_test::printTestLabel(queue);
+  constexpr size_t N = 8;
+  double *data_d = sycl::malloc_device<double>(N, queue);
+  double *out_d = sycl::malloc_shared<double>(N, queue);
+  int errCount = 0;
+
+  try {
+
+    std::vector<double> data(
+        N, sycl::bit_cast<double>(uint64_t(0x400000018FFFFFFF)));
+
+    queue.memcpy(data_d, data.data(), N * sizeof(double)).wait();
+    queue.fill(out_d, sycl::bit_cast<double>(uint64_t(0x0000000000000001)), N)
+        .wait();
+
+    queue.parallel_for(sycl::nd_range<1>(1, 1),
+                       [=](sycl::nd_item<1> item) SYCL_ESIMD_KERNEL {
+                         // Atomically update the maximum value
+                         simd<double, 8> tmp;
+                         tmp.copy_from(data_d);
+                         atomic_update<atomic_op::fmax>(
+                             out_d, simd<uint32_t, N>(0, sizeof(double)), tmp);
+                       });
+
+    queue.wait_and_throw();
+
+    std::vector<double> out_data(N);
+    queue.memcpy(out_data.data(), out_d, N * sizeof(double)).wait();
+    for (int iter = 0; iter < out_data.size(); iter++) {
+      double relError = (out_data[iter] - data[iter]) / data[iter];
+      if (relError != 0 && ++errCount < 10)
+        std::cout << "ERROR at index " + std::to_string(iter) << ": "
+                  << std::to_string(relError) + " != 0\n";
+    }
+  } catch (sycl::exception &e) {
+    free(data_d, queue);
+    free(out_d, queue);
+    std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+    return 1;
+  }
+  free(data_d, queue);
+  free(out_d, queue);
+  std::cout << (errCount == 0 ? "Passed\n" : "Failed\n");
+  return errCount != 0;
+}
diff --git a/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp b/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp
@@ -596,11 +596,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       auto compare = swap * 2;
       auto pred = simd_mask<VL>(1);
       // Do not pass the properties.
-      // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
+      // CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
       auto atomic_res0 = atomic_update<atomic_op::fcmpxchg, float, VL>(
           ptrf, offsets, swap, compare, pred);
       // Now with cache hints.
-      // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
+      // CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
       auto atomic_res1 = atomic_update<atomic_op::fcmpxchg, float, VL>(
           ptrf, offsets, swap, compare, pred, props_a);
     }
@@ -824,13 +824,13 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       auto compare = swap * 2;
       auto pred = simd_mask<VL>(1);
       // Do not pass the properties.
-      // CHECK-STATEFUL:  call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef)
-      // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
+      // CHECK-STATEFUL:  call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef)
+      // CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
       auto atomic_res0 = atomic_update<atomic_op::fcmpxchg, float, VL>(
           acc, offsets, swap, compare, pred);
       // Now with cache hints.
-      // CHECK-STATEFUL:  call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef)
-      // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
+      // CHECK-STATEFUL:  call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef)
+      // CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
       auto atomic_res1 = atomic_update<atomic_op::fcmpxchg, float, VL>(
           acc, offsets, swap, compare, pred, props_a);
     }
@@ -1414,4 +1414,4 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
           *local_acc, offsets, swap, compare);
     }
   }
-}
+}