Skip to content

Commit

Permalink
[SYCL][ESIMD] Don't cast data to int for atomic_update (#14755)
Browse files Browse the repository at this point in the history
FP/double atomic support is emulated on PVC, and the types need to be
accurate for the emulation to work correctly. We already do a similar
thing for slm_atomic_update.

Signed-off-by: Sarnie, Nick <[email protected]>
  • Loading branch information
sarnex authored Jul 25, 2024
1 parent ecec9d1 commit 74a7907
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 34 deletions.
79 changes: 52 additions & 27 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8841,14 +8841,20 @@ atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
using MsgT = typename lsc_expand_type<T>::type;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
addrs += convert<uintptr_t>(offsets);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), Msg_data.data());
return lsc_format_ret<T>(Tmp);
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_stateless_1<T, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), src0.data());
} else {
simd<MsgT, N> Msg_data = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_stateless_1<MsgT, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), Msg_data.data());
return lsc_format_ret<T>(Tmp);
}
}

/// USM pointer atomic.
Expand Down Expand Up @@ -8885,15 +8891,22 @@ atomic_update_impl(T *p, simd<Toffset, N> offsets, simd<T, N> src0,
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
using MsgT = typename lsc_expand_type<T>::type;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
addrs += convert<uintptr_t>(offsets);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
return lsc_format_ret<T>(Tmp);
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_stateless_2<T, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), src0.data(), src1.data());
} else {
simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);

simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_stateless_2<MsgT, IOp, L1H, L2H, AddressScale,
ImmOffset, EDS, VS, Transposed, N>(
pred.data(), addrs.data(), Msg_data0.data(), Msg_data1.data());
return lsc_format_ret<T>(Tmp);
}
}

/// Accessor-based atomic.
Expand Down Expand Up @@ -8992,13 +9005,19 @@ __ESIMD_API
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
using MsgT = typename lsc_expand_type<T>::type;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
auto si = get_surface_index(acc);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), Src0Msg.data(), si);
return lsc_format_ret<T>(Tmp);
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_bti_1<T, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), src0.data(), si);
} else {
simd<MsgT, N> Src0Msg = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_bti_1<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), Src0Msg.data(), si);
return lsc_format_ret<T>(Tmp);
}
#endif
}

Expand Down Expand Up @@ -9047,15 +9066,21 @@ __ESIMD_API
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
using MsgT = typename lsc_expand_type<T>::type;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
auto si = get_surface_index(acc);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
si);
return lsc_format_ret<T>(Tmp);
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_bti_2<T, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), src0.data(), src1.data(), si);
} else {
simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
simd<MsgT, N> Tmp =
__esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
EDS, VS, Transposed, N>(
pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
si);
return lsc_format_ret<T>(Tmp);
}
#endif
}
} // namespace detail
Expand Down
69 changes: 69 additions & 0 deletions sycl/test-e2e/ESIMD/regression/fp_atomic_update.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//==----------- fp_atomic_update.cpp - DPC++ ESIMD on-device test --------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: arch-intel_gpu_pvc
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "../esimd_test_utils.hpp"

#include <bit>
#include <bitset>
#include <sycl/detail/core.hpp>
#include <sycl/ext/intel/esimd.hpp>

using namespace sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

int main() {
sycl::queue queue{gpu_selector_v};
esimd_test::printTestLabel(queue);
constexpr size_t N = 8;
double *data_d = sycl::malloc_device<double>(N, queue);
double *out_d = sycl::malloc_shared<double>(N, queue);
int errCount = 0;

try {

std::vector<double> data(
N, sycl::bit_cast<double>(uint64_t(0x400000018FFFFFFF)));

queue.memcpy(data_d, data.data(), N * sizeof(double)).wait();
queue.fill(out_d, sycl::bit_cast<double>(uint64_t(0x0000000000000001)), N)
.wait();

queue.parallel_for(sycl::nd_range<1>(1, 1),
[=](sycl::nd_item<1> item) SYCL_ESIMD_KERNEL {
// Atomically update the maximum value
simd<double, 8> tmp;
tmp.copy_from(data_d);
atomic_update<atomic_op::fmax>(
out_d, simd<uint32_t, N>(0, sizeof(double)), tmp);
});

queue.wait_and_throw();

std::vector<double> out_data(N);
queue.memcpy(out_data.data(), out_d, N * sizeof(double)).wait();
for (int iter = 0; iter < out_data.size(); iter++) {
double relError = (out_data[iter] - data[iter]) / data[iter];
if (relError != 0 && ++errCount < 10)
std::cout << "ERROR at index " + std::to_string(iter) << ": "
<< std::to_string(relError) + " != 0\n";
}
} catch (sycl::exception &e) {
free(data_d, queue);
free(out_d, queue);
std::cerr << "SYCL exception caught: " << e.what() << std::endl;
return 1;
}
free(data_d, queue);
free(out_d, queue);
std::cout << (errCount == 0 ? "Passed\n" : "Failed\n");
return errCount != 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -596,11 +596,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto compare = swap * 2;
auto pred = simd_mask<VL>(1);
// Do not pass the properties.
// CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
// CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
auto atomic_res0 = atomic_update<atomic_op::fcmpxchg, float, VL>(
ptrf, offsets, swap, compare, pred);
// Now with cache hints.
// CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
// CHECK: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
auto atomic_res1 = atomic_update<atomic_op::fcmpxchg, float, VL>(
ptrf, offsets, swap, compare, pred, props_a);
}
Expand Down Expand Up @@ -824,13 +824,13 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto compare = swap * 2;
auto pred = simd_mask<VL>(1);
// Do not pass the properties.
// CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef)
// CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
// CHECK-STATEFUL: call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef)
// CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
auto atomic_res0 = atomic_update<atomic_op::fcmpxchg, float, VL>(
acc, offsets, swap, compare, pred);
// Now with cache hints.
// CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}} <8 x i32> {{[^)]+}}, i32 {{[^)]+}}, <8 x i32> undef)
// CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> {{[^)]+}}, <8 x i32> {{[^)]+}}, i32 0, <8 x i32> undef)
// CHECK-STATEFUL: call <8 x float> @llvm.genx.lsc.xatomic.bti.v8f32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x float> {{[^)]+}} <8 x float> {{[^)]+}}, i32 {{[^)]+}}, <8 x float> undef)
// CHECK-STATELESS: call <8 x float> @llvm.genx.lsc.xatomic.stateless.v8f32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 23, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x float> {{[^)]+}}, <8 x float> {{[^)]+}}, i32 0, <8 x float> undef)
auto atomic_res1 = atomic_update<atomic_op::fcmpxchg, float, VL>(
acc, offsets, swap, compare, pred, props_a);
}
Expand Down Expand Up @@ -1414,4 +1414,4 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
*local_acc, offsets, swap, compare);
}
}
}
}

0 comments on commit 74a7907

Please sign in to comment.