diff --git a/.gitignore b/.gitignore index 10b3b40f79..269a0763e6 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ /install_*/ /install-*/ /Debug/ +*.swp +*.orig diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fb6bc7055c..b263b8aa25 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2024.04.0' + ref: 'v2024.06.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -91,20 +91,20 @@ stages: trigger-rajaperf: stage: multi-project rules: - - if: '$CI_COMMIT_BRANCH == "${MP_BRANCH}" || $MULTI_PROJECT == "ON"' #run only if ... + - if: $CI_COMMIT_BRANCH == $MP_BRANCH || $MULTI_PROJECT == "ON" #run only if ... variables: UPDATE_RAJA: ${MP_BRANCH} trigger: project: radiuss/rajaperf branch: develop - strategy: depend include: + # Sets ID tokens for every job using `default:` - project: 'lc-templates/id_tokens' file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2024.04.0' + ref: 'v2024.06.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 62d7908945..b04cf0de1d 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -21,7 +21,7 @@ variables: # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@develop " # Poodle # Arguments for top level allocation @@ -31,7 +31,7 @@ variables: # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle - PROJECT_POODLE_DEPS: "" + PROJECT_POODLE_DEPS: "^blt@develop " # Corona # Arguments for top level allocation @@ -70,3 +70,15 @@ variables: artifacts: reports: junit: junit.xml + +.reproducer_vars: + script: + - | + echo -e " + # Required variables \n + export MODULE_LIST=\"${MODULE_LIST}\" \n + export SPEC=\"${SPEC//\"/\\\"}\" \n + # Allow to set job script for debugging (only this differs from CI) \n + export DEBUG_MODE=true \n + # Using the CI build cache is optional and requires a token. Set it like so: \n + # export REGISTRY_TOKEN=\"\" \n" diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index 9213d6e932..abbafe5bb9 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. 
.corona_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -33,3 +31,9 @@ rocmcc_5_7_0_hip_desul_atomics: SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@=5.7.0 ^hip@5.7.0 ^blt@develop" extends: .job_on_corona +clang_19_0_0_sycl_gcc_10_3_1_rocmcc_5_7_1_hip: + variables: + SPEC: " ~shared +sycl ~openmp +tests %clang@=19.0.0 cxxflags==\"-w -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx906\" ^blt@develop" + MODULE_LIST: "rocm/5.7.1" + extends: .job_on_corona + diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index fbd3d93db0..dc21689ce3 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. .lassen_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -68,3 +66,13 @@ gcc_8_3_1_cuda_10_1_243_desul_atomics: variables: SPEC: " ~shared +openmp +tests +cuda +desul %gcc@=8.3.1 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ^blt@develop" extends: .job_on_lassen + +# Warning: Allowed to fail temporarily +# Deactivated due to issues with OpenMP Target and various tests and compilers. +clang_16_0_6_ibm_omptarget: + variables: + SPEC: " ~shared +openmp +omptarget +tests %clang@=16.0.6.ibm.gcc.8.3.1 ^blt@develop" + ON_LASSEN: "OFF" + extends: .job_on_lassen + allow_failure: true + diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index cc1f956cb9..54870e37aa 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -5,38 +5,42 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -# Override reproducer section to define projet specific variables. +# Override reproducer section to define project specific variables. .poodle_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for this # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} -# So that the comparison with the original job is easier. +# when possible so that the comparison with the original job is easier. 
+# Identical to shared job, but use OpenMP tasks and no vectorization clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Identical to shared job, but use OpenMP tasks and no vectorization gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Identical to shared job, but use OpenMP tasks and no vectorization +# Deactivated (too long on poodle) intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ^blt@develop" + ON_POODLE: "OFF" + SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Allowed to fail intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS} ^blt@develop" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index a924ddd47c..2242494b9c 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -8,35 +8,37 @@ # Override reproducer section to define project specific variables. .ruby_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for this # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} -# So that the comparison with the original job is easier. +# when possible so that the comparison with the original job is easier. +# Identical to shared job, but use OpenMP tasks and no vectorization clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Identical to shared job, but use OpenMP tasks and no vectorization gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Identical to shared job, but use OpenMP tasks and no vectorization intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Allowed to fail intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS} ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}" allow_failure: true extends: .job_on_ruby diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index ef89808932..50b60bc13d 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. .tioga_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -28,12 +26,12 @@ # ${PROJECT__DEPS} in the extra jobs. 
There is no reason not to fully # describe the spec here. -rocmcc_5_7_1_hip_desul_atomics: +rocmcc_6_1_1_hip_desul_atomics: variables: - SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop" extends: .job_on_tioga -rocmcc_5_7_1_hip_openmp: +rocmcc_6_1_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop" extends: .job_on_tioga diff --git a/.uberenv_config.json b/.uberenv_config.json index 2261a80aea..d89b97cb29 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "develop-2024-02-18", +"spack_branch": "develop-2024-07-07", "spack_activate" : {}, "spack_configs_path": "scripts/radiuss-spack-configs", "spack_packages_path": "scripts/radiuss-spack-configs/packages", diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e5ecec0b7..1659021970 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,8 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2024) -set(RAJA_VERSION_MINOR 02) -set(RAJA_VERSION_PATCHLEVEL 2) +set(RAJA_VERSION_MINOR 07) +set(RAJA_VERSION_PATCHLEVEL 0) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") @@ -44,11 +44,7 @@ set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PA include(cmake/SetupRajaOptions.cmake) -if (ENABLE_HIP) - cmake_minimum_required(VERSION 3.23) -else() - cmake_minimum_required(VERSION 3.20) -endif() +cmake_minimum_required(VERSION 3.23) # Detect C++ standard and add appropriate flag _before_ loading BLT set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c2df2a03ea..e86890d13d 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,57 @@ Notable changes include: * Bug fixes/improvements: +Version 2024.07.0 -- Release date 2024-07-24 +============================================ + +This release contains new features, improvements, and bugfixes. + +Notable changes include: + + * New features / API changes: + * Added support for a "multi-reduction" operation which allows users to + perform a run time-defined number of reduction operations in a kernel. + Please see the RAJA User Guide for details and examples. + * Added first couple of sections for a "RAJA Cookbook" in the RAJA User + Guide. The goal is to provide users with more detailed guidance about + using RAJA features, choosing execution policies, etc. Additional + content will be provided in future releases. + * Added atomicLoad and atomicStore routines for correctness in some + use cases. + * Added OpenMP 5.1 implementations for atomicMin and atomicMax. + * Add SYCL reduction support in RAJA::launch + + * Build changes/improvements: + * Update camp submodule to v2024.07.0 release. This will be a version + constraint for this release in RAJA Spack package. + * Minimum required CMake version bumped to 3.23. 
+ + * Bug fixes/improvements: + * Fix CMake issue for case when RAJA is used as a submodule dependency. + * Various fixes and improvements to builtin atomic support. + * Fixes and improvements to other atomic operations: + * Modified HIP and CUDA generic atomic compare and swap algorithms + to use atomic loads instead of relying on volatile. + * Re-implemented atomic loads in terms of builtin atomics for CUDA + and HIP so that the generic compare and swap functions can use it. + * Removes volatile qualifier in atomic function signatures. + * Use cuda::atomic_ref in newer versions of CUDA to back + atomicLoad/atomicStore. + * Use atomicAdd as a fallback for atomicSub in CUDA. + * Removed checks where __CUDA_ARCH__ is less than 350 since RAJA + requires that as the minimum supported architecture (CMake check). + * Fixed issues with naming RAJA forall::kernels when using CUDA. + * Fixes in SYCL back-end for RAJA::launch. + * Fixed some issues in examples. + * Bugfixes and cleanup in parts of the SYCL back-end needed to + support a bunch of new SYCL kernels that will appear in + RAJA Performance Suite release. + * Fix type naming issue that was exposed with a new version of the + Intel oneAPI compiler. + * Fix issue in User Guide documentation for configuring a project + using RAJA CMake configuration. + + Version 2024.02.2 -- Release date 2024-05-08 ============================================ diff --git a/docs/conf.py b/docs/conf.py index 3212170b30..5f76d77b76 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,9 +86,9 @@ # built documents. # # The short X.Y version. -version = u'2024.02' +version = u'2024.07' # The full version, including alpha/beta/rc tags. -release = u'2024.02.2' +release = u'2024.07.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/dev_guide/ci_tasks.rst b/docs/sphinx/dev_guide/ci_tasks.rst index 70c65e8903..5c63ecc9ad 100644 --- a/docs/sphinx/dev_guide/ci_tasks.rst +++ b/docs/sphinx/dev_guide/ci_tasks.rst @@ -157,6 +157,114 @@ annotate the job for this. For example: describe the change in the ``RAJA/.gitlab/jobs/.yml`` file where the job is overridden. + +Building the Intel clang + SYCL HIP compiler for use in CI +---------------------------------------------------------- + +The SYCL CI tests on corona rely on a custom Intel Clang SYCL compiler that we +build ourselves. This compiler lives in the ``/usr/workspace/raja-dev/`` folder so +that it can be accessed by the gitlab CI system. Since the intel compiler does +not do releases in the typical sense (they simply update their repo *every night*), +it may become necessary to periodically build a new version of the compiler to +ensure that we are using the most up-to-date version available. The steps for +building, installing, and running are shown here. + +Building the Compiler +^^^^^^^^^^^^^^^^^^^^^ + +.. important:: Because intel updates their compiler repo daily, there is a nonzero possibility that the head of the sycl branch will fail to build. + In the event that it does not build, try checking out a different commit. On the intel/llvm GitHub page, one can see which of their + commits builds by checking the status badge next to each commit. Look for a commit that passes. + + +#. Load the version of GCC that you want to use. In this case, we are using LC's gcc/10.3.1-magic installation:: + + module load gcc/10.3.1-magic + +#. Load the version of rocm that you want to use. 
In this case, we are using 5.7.1:: + + module load rocm/5.7.1 + +#. Clone the "sycl" branch of intel's llvm compiler fork:: + + git clone https://github.com/intel/llvm -b sycl + +#. cd into that folder:: + + cd llvm + + In the event that the head of the sycl branch does not build, run ``git checkout `` to checkout a version that does build. + +#. Build the compiler. + + Note that in this example, we are using rocm5.7.1, but one can change the version they wish to use simply by changing the paths in the configure step + + a. Configure + + .. code-block:: bash + + srun -n1 /usr/bin/python3 buildbot/configure.py --hip -o buildrocm5.7.1 \ + --cmake-gen "Unix Makefiles" \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_LIB_DIR=/opt/rocm-5.7.1/lib \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_LIB_DIR=/opt/rocm-5.7.1/lib \ + --cmake-opt=-DUR_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ + --cmake-opt=-DUR_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DUR_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ + --cmake-opt=-DUR_HIP_LIB_DIR=/opt/rocm-5.7.1/lib + + b. Build + + .. code-block:: bash + + srun -n1 /usr/bin/python3 buildbot/compile.py -o buildrocm5.7.1 + +#. Test the compiler + + Follow the steps in the `Using the compiler`_ section to test this installation + +#. Install + + a. The build step will install the compiler to the folder ``buildrocm/install``. Simply copy this folder to the ``/usr/workspace/raja-dev/`` directory using the naming scheme ``clang_sycl__hip_gcc_rocm`` + + #. Set the permissions of the folder, and everything in it to 750:: + + chmod 750 /usr/workspace/raja-dev// -R + + #. Change the group of the folder and everything in it to raja-dev:: + + chgrp raja-dev /usr/workspace/raja-dev// -R + + +Using the compiler +^^^^^^^^^^^^^^^^^^ + +#. Load the version of rocm that you used when building the compiler:: + + module load rocm/5.7.1 + +#. Navigate to the root of your local checkout space of the RAJA repo:: + + cd /path/to/raja + +#. Run the test config script:: + + ./scripts/lc-builds/corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1 + + Note that at the time of writing, the newest compiler we had built was at ``clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1`` + +#. cd into the auto generated build directory:: + + cd {build directory} + +#. Run the tests:: + + make -j + + ============== Azure CI Tasks ============== diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst index 91494f3674..349fdd5b3f 100644 --- a/docs/sphinx/user_guide/cook_book.rst +++ b/docs/sphinx/user_guide/cook_book.rst @@ -20,4 +20,5 @@ to provide users with complete beyond usage examples beyond what can be found in :maxdepth: 2 cook_book/reduction + cook_book/multi-reduction diff --git a/docs/sphinx/user_guide/cook_book/multi-reduction.rst b/docs/sphinx/user_guide/cook_book/multi-reduction.rst new file mode 100644 index 0000000000..2ad4d60aa2 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book/multi-reduction.rst @@ -0,0 +1,160 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_cook-book-multi-reductions-label: + +============================ +Cooking with MultiReductions +============================ + +Please see the following section for overview discussion about RAJA multi-reductions: + + * :ref:`feat-multi-reductions-label`. + + +--------------------------------- +MultiReductions with RAJA::forall +--------------------------------- + +Here is the setup for a simple multi-reduction example:: + + const int N = 1000; + const int num_bins = 10; + + int vec[N]; + int bins[N]; + + for (int i = 0; i < N; ++i) { + + vec[i] = 1; + bins[i] = i % num_bins; + + } + +Here is a simple sum multi-reduction performed in a C-style for-loop:: + + int vsum[num_bins] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // Run a kernel using the multi-reduction objects + for (int i = 0; i < N; ++i) { + + vsum[bins[i]] += vec[i]; + + } + +The results of these operations will yield the following values: + + * ``vsum[0] == 100`` + * ``vsum[1] == 100`` + * ``vsum[2] == 100`` + * ``vsum[3] == 100`` + * ``vsum[4] == 100`` + * ``vsum[5] == 100`` + * ``vsum[6] == 100`` + * ``vsum[7] == 100`` + * ``vsum[8] == 100`` + * ``vsum[9] == 100`` + +RAJA uses policy types to specify how things are implemented. + +The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. +For example ``RAJA::seq_exec`` runs a C-style for-loop sequentially on a CPU. The +``RAJA::cuda_exec_with_reduce<256>`` runs the operation as a CUDA GPU kernel with +256 threads per block and other CUDA kernel launch parameters, like the +number of blocks, optimized for performance with multi_reducers:: + + using exec_policy = RAJA::seq_exec; + // using exec_policy = RAJA::omp_parallel_for_exec; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; + +The multi-reduction policy specifies how the multi-reduction is done and must be compatible with the +execution policy. For example, ``RAJA::seq_multi_reduce`` does a sequential multi-reduction +and can only be used with sequential execution policies. The +``RAJA::cuda_multi_reduce_atomic`` policy uses atomics and can only be used with +cuda execution policies. Similarly for other RAJA execution back-ends, such as +HIP and OpenMP. 
Here are example RAJA multi-reduction policies whose names are +indicative of which execution policies they work with:: + + using multi_reduce_policy = RAJA::seq_multi_reduce; + // using multi_reduce_policy = RAJA::omp_multi_reduce; + // using multi_reduce_policy = RAJA::cuda_multi_reduce_atomic; + // using multi_reduce_policy = RAJA::hip_multi_reduce_atomic; + +Here a simple sum multi-reduction is performed using RAJA:: + + RAJA::MultiReduceSum<multi_reduce_policy, int> vsum(num_bins, 0); + + RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + +The results of these operations will yield the following values: + + * ``vsum[0].get() == 100`` + * ``vsum[1].get() == 100`` + * ``vsum[2].get() == 100`` + * ``vsum[3].get() == 100`` + * ``vsum[4].get() == 100`` + * ``vsum[5].get() == 100`` + * ``vsum[6].get() == 100`` + * ``vsum[7].get() == 100`` + * ``vsum[8].get() == 100`` + * ``vsum[9].get() == 100`` + +Another option for the execution policy when using the CUDA or HIP backends is to use +the base policies, which have a boolean parameter to choose between the general +use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy:: + + // static constexpr bool with_reduce = ...; + // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>; + // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>; + + +--------------------------- +Rarely Used MultiReductions +--------------------------- + +Multi-reductions consume resources even if they are not used in a +loop kernel. If a multi-reducer is conditionally used to set an error flag, for example, even +if the multi-reduction is not used at runtime in the loop kernel, then the setup +and finalization for the multi-reduction is still done and any resources are +still allocated and deallocated. To minimize these overheads, some backends have +special policies that minimize the amount of work the multi-reducer does in the +case that it is not used at runtime even if it is compiled into a loop kernel. +Here are example RAJA multi-reduction policies that have minimal overhead:: + + using rarely_used_multi_reduce_policy = RAJA::seq_multi_reduce; + // using rarely_used_multi_reduce_policy = RAJA::omp_multi_reduce; + // using rarely_used_multi_reduce_policy = RAJA::cuda_multi_reduce_atomic_low_performance_low_overhead; + // using rarely_used_multi_reduce_policy = RAJA::hip_multi_reduce_atomic_low_performance_low_overhead; + +Here is a simple rarely used bitwise-or multi-reduction performed using RAJA:: + + RAJA::MultiReduceBitOr<rarely_used_multi_reduce_policy, int> vor(num_bins, 0); + + RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + if (vec[i] < 0) { + vor[0] |= 1; + } + + }); + +The results of these operations will yield the following value if the condition +is never met: + + * ``vor[0].get() == 0`` + +or yield the following value if the condition is ever met: + + * ``vor[0].get() == 1`` diff --git a/docs/sphinx/user_guide/feature/multi-reduction.rst b/docs/sphinx/user_guide/feature/multi-reduction.rst new file mode 100644 index 0000000000..c41cc37225 --- /dev/null +++ b/docs/sphinx/user_guide/feature/multi-reduction.rst @@ -0,0 +1,227 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_feat-multi-reductions-label: + +========================= +MultiReduction Operations +========================= + +RAJA provides multi-reduction types that allow users to perform a runtime number +of reduction operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, +and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may +use as many multi-reduction objects in a loop kernel as they need. If a small +fixed number of reductions is required in a loop kernel then standard RAJA reduction objects can be +used. Available RAJA multi-reduction types are described in this section. + +.. note:: All RAJA multi-reduction types are located in the namespace ``RAJA``. + +Also + +.. note:: * Each RAJA multi-reduction type is templated on a **multi-reduction policy** + and a **reduction value type** for the multi-reduction variable. The + **multi-reduction policy type must be compatible with the execution + policy used by the kernel in which it is used.** For example, in + a CUDA kernel, a CUDA multi-reduction policy must be used. + * Each RAJA multi-reduction type accepts an **initial reduction value or + values** at construction (see below). + * Each RAJA multi-reduction type has a 'get' method to access reduced + values after kernel execution completes. + +Please see the following sections for a description of reducers: + + * :ref:`feat-reductions-label`. + +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-multi-reductions-label`. + + +-------------------- +MultiReduction Types +-------------------- + +RAJA supports three common multi-reduction types: + +* ``MultiReduceSum< multi_reduce_policy, data_type >`` - Sum of values. + +* ``MultiReduceMin< multi_reduce_policy, data_type >`` - Min value. + +* ``MultiReduceMax< multi_reduce_policy, data_type >`` - Max value. + +and two less common bitwise multi-reduction types: + +* ``MultiReduceBitAnd< multi_reduce_policy, data_type >`` - Bitwise 'and' of values (i.e., ``a & b``). + +* ``MultiReduceBitOr< multi_reduce_policy, data_type >`` - Bitwise 'or' of values (i.e., ``a | b``). + +.. note:: ``RAJA::MultiReduceBitAnd`` and ``RAJA::MultiReduceBitOr`` reduction types are designed to work on integral data types because **in C++, at the language level, there is no such thing as a bitwise operator on floating-point numbers.** + +----------------------- +MultiReduction Examples +----------------------- + +Next, we provide a few examples to illustrate basic usage of RAJA multi-reduction +types. 
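First, here is a minimal sketch of the min and max multi-reduction types, assuming an OpenMP build and the same kind of ``vec`` and ``bins`` arrays used in the sum example that follows (the names ``vmin``, ``vmax``, ``min_out``, and ``max_out`` are illustrative only)::

   const int N = 1000;
   const int num_bins = 10;

   // vec[N] holds the values and bins[N] holds a bin index in [0, num_bins)
   // for each element; see the sum example below for one way to fill them.

   RAJA::MultiReduceMin< RAJA::omp_multi_reduce, int > vmin(num_bins, std::numeric_limits<int>::max());
   RAJA::MultiReduceMax< RAJA::omp_multi_reduce, int > vmax(num_bins, std::numeric_limits<int>::min());

   RAJA::forall< RAJA::omp_parallel_for_exec >( RAJA::RangeSegment(0, N),
     [=](RAJA::Index_type i) {

     // combine each value into the min and max reducers for its bin
     vmin[bins[i]].min(vec[i]);
     vmax[bins[i]].max(vec[i]);

   });

   // After the kernel runs, read each per-bin result
   int min_out[num_bins];
   int max_out[num_bins];
   for (int bin = 0; bin < num_bins; ++bin) {
     min_out[bin] = vmin[bin].get();
     max_out[bin] = vmax[bin].get();
   }

The ``&=`` and ``|=`` operators play the same role for ``MultiReduceBitAnd`` and ``MultiReduceBitOr``.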
+ +Here is a simple RAJA multi-reduction example that shows how to use a sum +multi-reduction type:: + + const int N = 1000; + const int B = 10; + + // + // Initialize an array of length N with all ones, and another array to + // integers between 0 and B-1 + // + int vec[N]; + int bins[N]; + for (int i = 0; i < N; ++i) { + vec[i] = 1; + bins[i] = i % B; + } + + // Create a sum multi-reduction object with a size of B, and initial + // values of zero + RAJA::MultiReduceSum< RAJA::omp_multi_reduce, int > vsum(B, 0); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + + // After kernel is run, extract the reduced values + int my_vsums[B]; + for (int bin = 0; bin < B; ++bin) { + my_vsums[bin] = vsum[bin].get(); + } + +The results of these operations will yield the following values: + + * my_vsums[0] == 100 + * my_vsums[1] == 100 + * my_vsums[2] == 100 + * my_vsums[3] == 100 + * my_vsums[4] == 100 + * my_vsums[5] == 100 + * my_vsums[6] == 100 + * my_vsums[7] == 100 + * my_vsums[8] == 100 + * my_vsums[9] == 100 + + +Here is the same example but using values stored in a container:: + + const int N = 1000; + const int B = 10; + + // + // Initialize an array of length N with all ones, and another array to + // integers between 0 and B-1 + // + int vec[N]; + int bins[N]; + for (int i = 0; i < N; ++i) { + vec[i] = 1; + bins[i] = i % B; + } + + // Create a vector with a size of B, and initial values of zero + std::vector my_vsums(B, 0); + + // Create a multi-reducer initalized with size and values from my_vsums + RAJA::MultiReduceSum< RAJA::omp_multi_reduce, int > vsum(my_vsums); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + + // After kernel is run, extract the reduced values back into my_vsums + vsum.get_all(my_vsums); + +The results of these operations will yield the following values: + + * my_vsums[0] == 100 + * my_vsums[1] == 100 + * my_vsums[2] == 100 + * my_vsums[3] == 100 + * my_vsums[4] == 100 + * my_vsums[5] == 100 + * my_vsums[6] == 100 + * my_vsums[7] == 100 + * my_vsums[8] == 100 + * my_vsums[9] == 100 + + + + + +Here is an example of a bitwise-or multi-reduction:: + + const int N = 128; + const int B = 8; + + // + // Initialize an array of length N to integers between 0 and B-1 + // + int bins[N]; + for (int i = 0; i < N; ++i) { + bins[i] = i % B; + } + + // Create a bitwise-or multi-reduction object with initial value of '0' + RAJA::MultiReduceBitOr< RAJA::omp_multi_reduce, int > vor(B, 0); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vor[bins[i]] |= i; + + }); + + // After kernel is run, extract the reduced values + int my_vors[B]; + for (int bin = 0; bin < B; ++bin) { + my_vors[bin] = vor[bin].get(); + } + +The results of these operations will yield the following values: + + * my_vors[0] == 120 == 0b1111000 + * my_vors[1] == 121 == 0b1111001 + * my_vors[2] == 122 == 0b1111010 + * my_vors[3] == 123 == 0b1111011 + * my_vors[4] == 124 == 0b1111100 + * my_vors[5] == 125 == 0b1111101 + * my_vors[6] == 126 == 0b1111110 + * my_vors[7] == 127 == 0b1111111 + +The results of the multi-reduction start at 120 and increase to 127. In binary +representation (i.e., bits), :math:`120 = 0b1111000` and :math:`127 = 0b1111111`. 
+The bins were picked in such a way that all the integers in a bin had the same +remainder modulo 8 so their last 3 binary digits were all the same while their +upper binary digits varied. Because bitwise-or keeps all the set bits, the upper +bits are all set because at least one integer in that bin set them. The last +3 bits were the same in all the integers so the last 3 bits are the same as the +remainder modulo 8 of the bin number. + +----------------------- +MultiReduction Policies +----------------------- + +For more information about available RAJA multi-reduction policies and guidance +on which to use with RAJA execution policies, please see +:ref:`multi-reducepolicy-label`. diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index facde1da5d..e38856a919 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -850,6 +850,73 @@ sycl_reduce any SYCL Reduction in a S guaranteed to generate correct results. So they should not be used for kernels containing reductions. +.. _multi-reducepolicy-label: + +------------------------- +MultiReduction Policies +------------------------- + +Each RAJA multi-reduction object must be defined with a 'multi-reduction policy' +type. Multi-reduction policy types are distinct from loop execution policy types. +It is important to note the following constraints about RAJA multi-reduction usage: + +.. note:: To guarantee correctness, a **multi-reduction policy must be compatible + with the loop execution policy** used. For example, a CUDA + multi-reduction policy must be used when the execution policy is a + CUDA policy, an OpenMP multi-reduction policy must be used when the + execution policy is an OpenMP policy, and so on. + +The following table summarizes RAJA multi-reduction policy types: + +============================================================= ============= ========================================== +MultiReduction Policy Loop Policies Brief description + to Use With +============================================================= ============= ========================================== +seq_multi_reduce seq_exec, Non-parallel (sequential) multi-reduction. +omp_multi_reduce any OpenMP OpenMP parallel multi-reduction. + policy +omp_multi_reduce_ordered any OpenMP OpenMP parallel multi-reduction with result + policy guaranteed to be reproducible. +cuda/hip_multi_reduce_atomic any CUDA/HIP Parallel multi-reduction in a CUDA/HIP kernel. + policy Multi-reduction may use atomic operations + leading to run to run variability in the + results. + (device synchronization will occur when + reduction value is finalized) +cuda/hip_multi_reduce_atomic_low_performance_low_overhead any CUDA/HIP Same as above, but multi-reduction uses + policy a low overhead algorithm with a minimal + set of resources. This minimally effects + the performance of loops containing the + multi-reducer though it may cause the + multi-reducer itself to perform poorly if + it is used. +cuda/hip_multi_reduce_atomic_block_then_atomic_grid_host_init any CUDA/HIP The multi-reduction uses atomics into shared + policy memory and global memory. Atomics into + shared memory are used each time a value + is combined into the multi-reducer and at + the end of the life of the block the shared + values are combined into global memory with + atomics. 
If there is not enough shared memory + available this will fall back to using atomics into + global memory only, which may have a + performance penalty. + The memory for global atomics is + initialized on the host. +cuda/hip_multi_reduce_atomic_global_host_init any CUDA/HIP The multi-reduction uses atomics into global + policy global memory only. Atomics into + global memory are used each time a value + is combined into the multi-reducer. + The memory for global atomics is + initialized on the host. +cuda/hip_multi_reduce_atomic_global_no_replication_host_init any CUDA/HIP Same as above, but uses minimal memory + by not replicating global atomics. + +============================================================= ============= ========================================== + +.. note:: RAJA multi-reductions used with SIMD execution policies are not + guaranteed to generate correct results. So they should not be used + for kernels containing multi-reductions. + .. _atomicpolicy-label: ------------------------- diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 5f2f09afad..6d4c8695d9 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -17,8 +17,9 @@ reduction operations like some other C++ loop programming abstraction models. Instead, RAJA provides reduction types that allow users to perform reduction operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may -use as many reduction objects in a loop kernel as they need. Available RAJA -reduction types are described in this section. +use as many reduction objects in a loop kernel as they need. If a runtime number +of reductions is required in a loop kernel, then multi-reductions can be used. +Available RAJA reduction types are described in this section. .. note:: All RAJA reduction types are located in the namespace ``RAJA``. @@ -39,6 +40,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following sections for a description of multi-reducers: + + * :ref:`feat-multi-reductions-label`. + Please see the following cook book sections for guidance on policy usage: * :ref:`cook-book-reductions-label`. diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst index 860af3eddd..d0ca13a3ab 100644 --- a/docs/sphinx/user_guide/feature/resource.rst +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -95,7 +95,7 @@ Memory Operations ------------------- The example discussed in this section illustrates most of the memory -operations that can be performed with +operations that can be performed with RAJA resource objects. A common use case for a resource is to manage arrays in the appropriate memory space to use in a kernel. Consider the following code example:: diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 4d9e4bf711..afeb50ce9d 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -25,6 +25,7 @@ materials that provide detailed examples of usage. 
feature/iteration_spaces feature/view feature/reduction + feature/multi-reduction feature/atomic feature/scan feature/sort diff --git a/docs/sphinx/user_guide/using_raja.rst b/docs/sphinx/user_guide/using_raja.rst index 6dc8086a9c..e05cec4dfb 100644 --- a/docs/sphinx/user_guide/using_raja.rst +++ b/docs/sphinx/user_guide/using_raja.rst @@ -34,7 +34,7 @@ project:: Then, pass the path of RAJA to CMake when you configure your code:: - cmake -DRAJA_DIR=/share/raja/cmake + cmake -DRAJA_DIR=/lib/cmake/raja/ The ``RAJA-config.cmake`` file provides a ``RAJA`` target, that can be used natively by CMake to add a dependency on RAJA. For example:: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7fd580972b..4dfd2fbc10 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,6 +19,10 @@ raja_add_executable( NAME forall-param-reductions SOURCES forall-param-reductions.cpp) +raja_add_executable( + NAME forall_multi-reductions + SOURCES forall_multi-reductions.cpp) + raja_add_executable( NAME launch-param-reductions SOURCES launch-param-reductions.cpp) diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index 9fc973e311..feb5247224 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -11,7 +11,6 @@ #include #include "RAJA/RAJA.hpp" -#include "memoryManager.hpp" /* * Matrix Transpose Example @@ -96,7 +95,7 @@ using outer0 = RAJA::LoopPolicy< #endif #if defined(RAJA_ENABLE_SYCL) , - RAJA::sycl_group_0_direct + RAJA::sycl_group_2_direct #endif >; @@ -135,7 +134,7 @@ using inner0 = RAJA::LoopPolicy< #endif #if defined(RAJA_ENABLE_SYCL) , - RAJA::sycl_local_0_direct + RAJA::sycl_local_2_direct #endif >; @@ -154,20 +153,9 @@ using inner1 = RAJA::LoopPolicy; -template -void switch_ptrs(T *A, T *d_A) -{ - T *tmp_ptr; - tmp_ptr = d_A; - d_A = A; - A = tmp_ptr; -} - int main(int argc, char *argv[]) { - std::cout << "\n\nRAJA matrix transpose example...\n"; - if(argc != 2) { RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); } @@ -185,17 +173,26 @@ int main(int argc, char *argv[]) RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); - + int *A = host_res.allocate(N_r * N_c); + int *At = host_res.allocate(N_r * N_c); // // In the following implementations of matrix transpose, we // use RAJA 'View' objects to access the matrix data. 
A RAJA view @@ -300,20 +296,24 @@ int main(int argc, char *argv[]) std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; -#if defined(RAJA_ENABLE_HIP) + //Reset memory + std::memset(At, 0, N_r * N_c * sizeof(int)); - //Hip requires device side pointers +#if defined(RAJA_GPU_ACTIVE) + //Allocate device side pointers int *d_A = nullptr, *d_At = nullptr; if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { - d_A = memoryManager::allocate_gpu(N_r * N_c); - d_At = memoryManager::allocate_gpu(N_r * N_c); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + d_A = device_res.allocate(N_r * N_c); + d_At = device_res.allocate(N_r * N_c); + + device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c); + device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c); //switch host/device pointers so we can reuse the views - switch_ptrs(d_A, A); - switch_ptrs(d_At, At); + Aview.set_data(d_A); + Atview.set_data(d_At); } #endif @@ -323,13 +323,11 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_kernel_start RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc), - RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), + (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), + RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), "Matrix tranpose with dynamic shared memory kernel", [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ @@ -378,24 +376,37 @@ int main(int argc, char *argv[]) ctx.releaseSharedMemory(); }); }); - }); // _dynamic_mattranspose_kernel_end - -#if defined(RAJA_ENABLE_HIP) +#if defined(RAJA_GPU_ACTIVE) if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { - switch_ptrs(d_At, At); - switch_ptrs(d_A, A); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c); + device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c); + + Aview.set_data(A); + Atview.set_data(At); } #endif checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// + //Release data + host_res.deallocate(A); + host_res.deallocate(At); + +#if defined(RAJA_GPU_ACTIVE) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + device_res.deallocate(d_A); + device_res.deallocate(d_At); + } +#endif + + return 0; } diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index 8e16113c95..fb82582704 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -9,8 +9,6 @@ #include #include -#include "memoryManager.hpp" - #include "RAJA/RAJA.hpp" /* @@ -39,6 +37,10 @@ constexpr int CUDA_BLOCK_SIZE = 256; constexpr int HIP_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_SYCL) +constexpr int SYCL_BLOCK_SIZE = 256; +#endif + int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { @@ -53,13 +55,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Allocate array data and initialize data to alternating sequence of 1, -1. 
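// The allocation just below uses a RAJA host resource (host_res); the matching
// host_res.deallocate(a) call appears in the clean-up section at the end of main.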
// - int* a = memoryManager::allocate(N); + RAJA::resources::Host host_res; + int* a = host_res.allocate(N); for (int i = 0; i < N; ++i) { if ( i % 2 == 0 ) { a[i] = 1; } else { - a[i] = -1; + a[i] = -1; } } @@ -103,19 +106,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _reductions_raja_seq_start using EXEC_POL1 = RAJA::seq_exec; - + int seq_sum = 0; int seq_min = std::numeric_limits::max(); int seq_max = std::numeric_limits::min(); VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(host_res, arange, RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), RAJA::expt::Reduce(&seq_minloc), RAJA::expt::Reduce(&seq_maxloc), + RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { _seq_sum += a[i]; @@ -126,8 +130,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); //_seq_minloc.min(a[i], i); //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX // above. } ); @@ -135,12 +139,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end - + //----------------------------------------------------------------------------// @@ -157,12 +161,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(host_res, arange, RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), RAJA::expt::Reduce(&omp_minloc), RAJA::expt::Reduce(&omp_maxloc), + RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { _omp_sum += a[i]; @@ -179,9 +184,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; - std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " << omp_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " << omp_maxloc.getLoc() << std::endl; #endif @@ -191,6 +196,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_TARGET_OPENMP) std::cout << "\n Running RAJA OpenMP Target reductions...\n"; + RAJA::resources::Omp omp_res; + // 
_reductions_raja_omppolicy_start using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; // _reductions_raja_omppolicy_end @@ -201,12 +208,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(omp_res, arange, RAJA::expt::Reduce(&omp_t_sum), RAJA::expt::Reduce(&omp_t_min), RAJA::expt::Reduce(&omp_t_max), RAJA::expt::Reduce(&omp_t_minloc), RAJA::expt::Reduce(&omp_t_maxloc), + RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { _omp_t_sum += a[i]; @@ -223,9 +231,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; - std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " << omp_t_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " << omp_t_maxloc.getLoc() << std::endl; #endif @@ -236,6 +244,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; + RAJA::resources::Cuda cuda_res; + + int* d_a = cuda_res.allocate(N); + cuda_res.memcpy(d_a, a, sizeof(int) * N); + // _reductions_raja_cudapolicy_start using EXEC_POL3 = RAJA::cuda_exec; // _reductions_raja_cudapolicy_end @@ -246,20 +259,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(cuda_res, arange, RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), RAJA::expt::Reduce(&cuda_max), RAJA::expt::Reduce(&cuda_minloc), RAJA::expt::Reduce(&cuda_maxloc), + RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - _cuda_sum += a[i]; + _cuda_sum += d_a[i]; - _cuda_min = RAJA_MIN(a[i], _cuda_min); - _cuda_max = RAJA_MAX(a[i], _cuda_max); + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); } @@ -268,11 +282,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; - std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " << cuda_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " << cuda_maxloc.getLoc() << std::endl; - + cuda_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -280,8 
+294,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; - int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + RAJA::resources::Hip hip_res; + + int* d_a = hip_res.allocate(N); + hip_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start using EXEC_POL3 = RAJA::hip_exec; @@ -293,12 +309,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(arange, RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), RAJA::expt::Reduce(&hip_max), RAJA::expt::Reduce(&hip_minloc), RAJA::expt::Reduce(&hip_maxloc), + RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { _hip_sum += d_a[i]; @@ -315,12 +332,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; - std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " << hip_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " << hip_maxloc.getLoc() << std::endl; - memoryManager::deallocate_gpu(d_a); + hip_res.deallocate(d_a); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL reductions...\n"; + + RAJA::resources::Sycl sycl_res; + + int* d_a = sycl_res.allocate(N); + sycl_res.memcpy(d_a, a, sizeof(int) * N); + + // _reductions_raja_syclpolicy_start + using EXEC_POL3 = RAJA::sycl_exec; + // _reductions_raja_syclpolicy_end + + int sycl_sum = 0; + int sycl_min = std::numeric_limits::max(); + int sycl_max = std::numeric_limits::min(); + VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); + VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(sycl_res, arange, + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), + [=] RAJA_DEVICE (int i, int &_sycl_sum, int &_sycl_min, int &_sycl_max, VALLOC_INT &_sycl_minloc, VALLOC_INT &_sycl_maxloc) { + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + } + ); + + std::cout << "\tsum = " << sycl_sum << std::endl; + std::cout << "\tmin = " << sycl_min << std::endl; + std::cout << "\tmax = " << sycl_max << std::endl; + std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " + << sycl_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " + << sycl_maxloc.getLoc() << std::endl; + + sycl_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -328,9 +396,9 @@ int main(int 
RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Clean up. // - memoryManager::deallocate(a); + host_res.deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp new file mode 100644 index 0000000000..0010dd2848 --- /dev/null +++ b/examples/forall_multi-reductions.cpp @@ -0,0 +1,166 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +/* + * MultiReduction Example using RAJA forall + * + * This example illustrates use of the RAJA multi-reduction types: min, max, + * sum, and, and or. + * + * RAJA features shown: + * - `forall' loop iteration template method + * - Index range segment + * - Execution policies + * - MultiReduction types + * + */ + +template < typename t_exec_policy, typename t_multi_reduce_policy > +struct Backend +{ + using exec_policy = t_exec_policy; + using multi_reduce_policy = t_multi_reduce_policy; + + std::string name; +}; + +auto example_policies = camp::make_tuple( + + Backend{"Sequential"} + +#if defined(RAJA_ENABLE_OPENMP) + , Backend{"OpenMP"} +#endif + +#if defined(RAJA_ENABLE_CUDA) + , Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} +#endif + +#if defined(RAJA_ENABLE_HIP) + , Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} +#endif + + ); + +template < typename exec_policy, typename multi_reduce_policy > +void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + + RAJA::forall(arange, + [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { + + int bin = bins[i]; + + multi_reduce_sum[bin] += a[i]; + multi_reduce_min[bin].min(a[i]); + multi_reduce_max[bin].max(a[i]); + multi_reduce_and[bin] &= a[i]; + multi_reduce_or [bin] |= a[i]; + + }); + + for (int bin = 0; bin < num_bins; ++bin) { + std::cout << "\tsum[" << bin << "] = " << multi_reduce_sum.get(bin) << '\n'; + std::cout << "\tmin[" << bin << "] = " << multi_reduce_min.get(bin) << '\n'; + std::cout << "\tmax[" << bin << "] = " << multi_reduce_max.get(bin) << '\n'; + std::cout << "\tand[" << bin << "] = " << multi_reduce_and.get(bin) << '\n'; + std::cout << "\tor [" << bin << "] = " << multi_reduce_or .get(bin) << '\n'; + std::cout << '\n'; + } +} + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) +{ + + // _multi_reductions_array_init_start +// +// Define array length +// + const int N = 1000000; + const int num_bins = 10; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. 
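// (More precisely for this example: host_bins[i] cycles through 0..num_bins-1 and
// host_a[i] takes integer values from -num_bins to num_bins-1, as described in the
// note further below.)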
+// + camp::resources::Host host_res; + int* host_bins = host_res.template allocate(N); + int* host_a = host_res.template allocate(N); + + for (int i = 0; i < N; ++i) { + host_bins[i] = i % num_bins; + host_a[i] = (i % (2*num_bins)) - num_bins; + } + + // _multi_reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// for bin in [0, num_bins) +// - the sum will be (bin - num_bins/2) * N / num_bins +// - the min will be bin - num_bins +// - the max will be bin +// - the and will be min & max +// - the or will be min | max +// + +// +// Define index range for iterating over a elements in all examples +// + // _multi_reductions_range_start + RAJA::RangeSegment arange(0, N); + // _multi_reductions_range_end + +//----------------------------------------------------------------------------// + + RAJA::for_each_tuple(example_policies, [&](auto const& backend) { + + std::cout << "Running " << backend.name << " policies" << '\n'; + + using exec_policy = typename std::decay_t::exec_policy; + using multi_reduce_policy = typename std::decay_t::multi_reduce_policy; + + auto res = RAJA::resources::get_default_resource(); + + int* bins = res.template allocate(N); + int* a = res.template allocate(N); + + res.memcpy(bins, host_bins, N*sizeof(int)); + res.memcpy(a , host_a , N*sizeof(int)); + + example_code(arange, num_bins, bins, a); + + res.deallocate(bins); + res.deallocate(a ); + + std::cout << std::endl; + }); + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + host_res.deallocate(host_bins); + host_res.deallocate(host_a ); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 7dec3595a6..b57bedfd6b 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -9,8 +9,6 @@ #include #include -#include "memoryManager.hpp" - #include "RAJA/RAJA.hpp" /* @@ -39,6 +37,11 @@ constexpr int CUDA_BLOCK_SIZE = 256; constexpr int HIP_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_SYCL) +//LC testing hardware has a limit of 151 +constexpr int SYCL_BLOCK_SIZE = 128; +#endif + int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { @@ -50,10 +53,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // constexpr int N = 1000000; +// +// Use a resource to allocate memory +// + RAJA::resources::Host host_res; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda device_res; +#endif +#if defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip device_res; +#endif +#if defined(RAJA_ENABLE_SYCL) + RAJA::resources::Sycl device_res; +#endif + + // // Allocate array data and initialize data to alternating sequence of 1, -1. 
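The recurring change in these example programs is to let RAJA resource objects own allocation, copying, and deallocation instead of the memoryManager helper. Below is a minimal hedged sketch of that pattern, with the allocate<int> template argument spelled out; it is illustrative only, and the CUDA branch compiles only when RAJA_ENABLE_CUDA is defined.

    #include "RAJA/RAJA.hpp"

    void resource_memory_sketch()
    {
      constexpr int N = 1000000;

      RAJA::resources::Host host_res;
      int* a = host_res.allocate<int>(N);            // replaces memoryManager::allocate
      for (int i = 0; i < N; ++i) { a[i] = (i % 2 == 0) ? 1 : -1; }

    #if defined(RAJA_ENABLE_CUDA)
      RAJA::resources::Cuda device_res;
      int* d_a = device_res.allocate<int>(N);        // replaces memoryManager::allocate_gpu
      device_res.memcpy(d_a, a, sizeof(int) * N);    // replaces the explicit cudaMemcpy/hipMemcpy calls
      // ... launch device kernels that read d_a ...
      device_res.deallocate(d_a);
    #endif

      host_res.deallocate(a);                        // replaces memoryManager::deallocate
    }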
// - int* a = memoryManager::allocate(N); + int* a = host_res.allocate(N); for (int i = 0; i < N; ++i) { if ( i % 2 == 0 ) { @@ -111,9 +129,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - //RAJA::forall(arange, RAJA::launch - (RAJA::LaunchParams(), "SeqReductionKernel", + (host_res, RAJA::LaunchParams(), "SeqReductionKernel", RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), @@ -171,7 +188,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(), "OmpReductionKernel", + (host_res, RAJA::LaunchParams(), "OmpReductionKernel", RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), @@ -214,6 +231,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); + // _reductions_raja_cudapolicy_start using LAUNCH_POL3 = RAJA::LaunchPolicy>; using LOOP_POL3 = RAJA::LoopPolicy; @@ -228,7 +248,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), "CUDAReductionKernel", RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), @@ -242,13 +262,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::loop(ctx, arange, [&] (int i) { - _cuda_sum += a[i]; + _cuda_sum += d_a[i]; - _cuda_min = RAJA_MIN(a[i], _cuda_min); - _cuda_max = RAJA_MAX(a[i], _cuda_max); + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); @@ -267,6 +287,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " << cuda_maxloc.getLoc() << std::endl; + device_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -274,8 +295,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; - int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start using LAUNCH_POL3 = RAJA::LaunchPolicy>; @@ -291,7 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), "HipReductionKernel", RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), @@ -329,7 +350,70 @@ int main(int 
RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " << hip_maxloc.getLoc() << std::endl; - memoryManager::deallocate_gpu(d_a); + device_res.deallocate(d_a); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL reductions...\n"; + + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); + + // _reductions_raja_syclpolicy_start + using LAUNCH_POL4 = RAJA::LaunchPolicy>; + using LOOP_POL4 = RAJA::LoopPolicy; + // _reductions_raja_syclpolicy_end + + const int NUMBER_OF_TEAMS = (N-1)/SYCL_BLOCK_SIZE + 1; + + int sycl_sum = 0; + int sycl_min = std::numeric_limits::max(); + int sycl_max = std::numeric_limits::min(); + VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); + VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); + + RAJA::launch + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), + "SyclReductionKernel", + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, + int &_sycl_sum, int &_sycl_min, + int &_sycl_max, VALLOC_INT &_sycl_minloc, + VALLOC_INT &_sycl_maxloc) { + + RAJA::loop(ctx, arange, [&] (int i) { + + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + + } + ); + + } + ); + + std::cout << "\tsum = " << sycl_sum << std::endl; + std::cout << "\tmin = " << sycl_min << std::endl; + std::cout << "\tmax = " << sycl_max << std::endl; + std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " + << sycl_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " + << sycl_maxloc.getLoc() << std::endl; + + device_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -337,7 +421,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Clean up. 
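For reference, the RAJA::launch reduction pattern used in the CUDA, HIP, and SYCL hunks above, written host-only with the launch and loop policies spelled out. This is a hedged sketch rather than patch content; it assumes the sequential policy names RAJA::seq_launch_t and RAJA::seq_exec and the RAJA::operators reduction operators.

    #include <iostream>
    #include <limits>
    #include <vector>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N = 1000;
      std::vector<int> a(N, 1);
      const int* a_ptr = a.data();

      using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
      using loop_policy   = RAJA::LoopPolicy<RAJA::seq_exec>;

      RAJA::resources::Host host_res;

      int sum = 0;
      int mn  = std::numeric_limits<int>::max();

      RAJA::launch<launch_policy>(
        host_res, RAJA::LaunchParams(), "SketchReductionKernel",
        RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
        RAJA::expt::Reduce<RAJA::operators::minimum>(&mn),
        [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, int& _sum, int& _mn) {

          RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i) {
            _sum += a_ptr[i];
            _mn   = RAJA_MIN(a_ptr[i], _mn);
          });

        });

      std::cout << "sum = " << sum << ", min = " << mn << '\n';
      return 0;
    }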
// - memoryManager::deallocate(a); + host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 01d45bb2ae..0b35017fac 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -110,9 +110,12 @@ int main(int argc, char *argv[]) #if defined(RAJA_ENABLE_HIP) RAJA::resources::Hip device_res; #endif +#if defined(RAJA_ENABLE_SYCL) + RAJA::resources::Sycl device_res; +#endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index 9524c32cde..e52923d81f 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -153,7 +153,7 @@ int main(int argc, char *argv[]) #endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL) RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index c37ac997a4..59cca4bf22 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -165,6 +165,7 @@ // Reduction objects // #include "RAJA/pattern/reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" // diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp index af9a6af911..d5905f7928 100644 --- a/include/RAJA/pattern/atomic.hpp +++ b/include/RAJA/pattern/atomic.hpp @@ -80,6 +80,32 @@ namespace RAJA */ +/*! + * @brief Atomic load + * @param acc Pointer to location of value + * @return Value at acc + */ +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) +{ + return RAJA::atomicLoad(Policy{}, acc); +} + + +/*! + * @brief Atomic store + * @param acc Pointer to location of value + * @param value Value to store at *acc + */ +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) +{ + RAJA::atomicStore(Policy{}, acc, value); +} + + /*! 
* @brief Atomic add * @param acc Pointer to location of result value @@ -88,7 +114,7 @@ namespace RAJA */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) { return RAJA::atomicAdd(Policy{}, acc, value); } @@ -102,7 +128,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) { return RAJA::atomicSub(Policy{}, acc, value); } @@ -116,7 +142,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) { return RAJA::atomicMin(Policy{}, acc, value); } @@ -130,7 +156,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) { return RAJA::atomicMax(Policy{}, acc, value); } @@ -143,7 +169,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) { return RAJA::atomicInc(Policy{}, acc); } @@ -159,7 +185,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) { return RAJA::atomicInc(Policy{}, acc, compare); } @@ -172,7 +198,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) { return RAJA::atomicDec(Policy{}, acc); } @@ -188,7 +214,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) { return RAJA::atomicDec(Policy{}, acc, compare); } @@ -203,7 +229,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) { static_assert(std::is_integral::value, "atomicAnd can only be used on integral types"); @@ -220,7 +246,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) { static_assert(std::is_integral::value, "atomicOr can only be used on integral types"); @@ -237,7 +263,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) { static_assert(std::is_integral::value, "atomicXor can only be used on integral types"); @@ -253,7 +279,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T volatile *acc, T value) 
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) { return RAJA::atomicExchange(Policy{}, acc, value); } @@ -269,7 +295,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T volatile *acc, T compare, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value) { return RAJA::atomicCAS(Policy{}, acc, compare, value); } @@ -292,31 +318,34 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE constexpr explicit AtomicRef(value_type *value_ptr) - : m_value_ptr(value_ptr){}; + : m_value_ptr(value_ptr) {} RAJA_INLINE RAJA_HOST_DEVICE - constexpr AtomicRef(AtomicRef const&c) - : m_value_ptr(c.m_value_ptr){}; + constexpr AtomicRef(AtomicRef const &c) + : m_value_ptr(c.m_value_ptr) {} AtomicRef& operator=(AtomicRef const&) = delete; RAJA_INLINE RAJA_HOST_DEVICE - value_type volatile * getPointer() const { return m_value_ptr; } + value_type * getPointer() const + { + return m_value_ptr; + } RAJA_INLINE RAJA_HOST_DEVICE void store(value_type rhs) const { - *m_value_ptr = rhs; + RAJA::atomicStore(m_value_ptr, rhs); } RAJA_INLINE RAJA_HOST_DEVICE value_type operator=(value_type rhs) const { - *m_value_ptr = rhs; + RAJA::atomicStore(m_value_ptr, rhs); return rhs; } @@ -324,14 +353,14 @@ class AtomicRef RAJA_HOST_DEVICE value_type load() const { - return *m_value_ptr; + return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE operator value_type() const { - return *m_value_ptr; + return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE @@ -498,7 +527,7 @@ class AtomicRef } private: - value_type volatile *m_value_ptr; + value_type *m_value_ptr; }; diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp new file mode 100644 index 0000000000..884b9aa989 --- /dev/null +++ b/include/RAJA/pattern/detail/multi_reduce.hpp @@ -0,0 +1,420 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Base types used in common for RAJA reducer objects. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
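The atomic interface changes above drop the volatile qualifier from the free functions and add atomicLoad/atomicStore, which AtomicRef now uses for its load and store paths. Below is a minimal hedged sketch of the resulting host-side usage; the builtin_atomic policy is chosen only for illustration, and the AtomicRef template argument order (value type, then policy) is assumed.

    #include <iostream>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      int counter = 0;

      // Free-function interface: plain (non-volatile) pointers.
      RAJA::atomicStore<RAJA::builtin_atomic>(&counter, 5);
      RAJA::atomicAdd<RAJA::builtin_atomic>(&counter, 2);
      int seen = RAJA::atomicLoad<RAJA::builtin_atomic>(&counter);   // 7

      // AtomicRef wrapper: store()/load() now forward to atomicStore()/atomicLoad().
      RAJA::AtomicRef<int, RAJA::builtin_atomic> ref(&counter);
      ref.store(10);
      int val = ref.load();                                          // 10

      std::cout << seen << ' ' << val << '\n';
      return 0;
    }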
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP +#define RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP + +#include "RAJA/pattern/detail/forall.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/Operators.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/RepeatView.hpp" + + +#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ + template \ + struct MultiReduce##OP_NAME, T> \ + : reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>> \ + { \ + using policy = POL; \ + using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>>; \ + using Base::Base; \ + using typename Base::value_type; \ + using typename Base::reference; \ + \ + RAJA_SUPPRESS_HD_WARN \ + RAJA_HOST_DEVICE \ + reference operator[](size_t bin) const \ + { \ + return reference(*this, bin); \ + } \ + }; + +#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA) + +namespace RAJA +{ + +namespace reduce +{ + +namespace detail +{ + +template +struct BaseMultiReduce +{ + using MultiReduceData = t_MultiReduceData; + using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp; + using value_type = typename t_MultiReduceData::value_type; + + BaseMultiReduce() : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} {} + + explicit BaseMultiReduce(size_t num_bins, + value_type init_val = MultiReduceOp::identity(), + value_type identity = MultiReduceOp::identity()) + : BaseMultiReduce{RepeatView(init_val, num_bins), identity} + { } + + template < typename Container, + concepts::enable_if_t, + concepts::negate>, + concepts::negate>>* = nullptr > + explicit BaseMultiReduce(Container const& container, + value_type identity = MultiReduceOp::identity()) + : data{container, identity} + { } + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduce(BaseMultiReduce const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduce(BaseMultiReduce &&) = default; + BaseMultiReduce &operator=(BaseMultiReduce const&) = delete; + BaseMultiReduce &operator=(BaseMultiReduce &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduce() = default; + + void reset() + { + reset(RepeatView(MultiReduceOp::identity(), size())); + } + + void reset(size_t num_bins, + value_type init_val = MultiReduceOp::identity(), + value_type identity = MultiReduceOp::identity()) + { + reset(RepeatView(init_val, num_bins), identity); + } + + template < typename Container, + concepts::enable_if_t>* = nullptr > + void reset(Container const& container, + value_type identity = MultiReduceOp::identity()) + { + for (size_t bin = 0; bin < data.num_bins(); ++bin) { + RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset + } + data.reset(container, identity); + } + + RAJA_SUPPRESS_HD_WARN + RAJA_HOST_DEVICE + size_t size() const { return data.num_bins(); } + + RAJA_SUPPRESS_HD_WARN + RAJA_HOST_DEVICE + BaseMultiReduce const& combine(size_t bin, value_type const &other) const + { + data.combine(bin, other); + return *this; + } + + //! Get the calculated reduced value for a bin + value_type get(size_t bin) const { return data.get(bin); } + + //! 
Get the calculated reduced value for each bin and store it in container + template < typename Container, + concepts::enable_if_t>* = nullptr > + void get_all(Container& container) const + { + RAJA_EXTRACT_BED_IT(container); + if (size_t(distance_it) != data.num_bins()) { + RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer"); + } + size_t bin = 0; + for (auto& val : container) { + val = data.get(bin); + ++bin; + } + } + +private: + MultiReduceData mutable data; +}; + + +/*! + ****************************************************************************** + * + * \brief Min reducer class template. + * + ****************************************************************************** + */ +template +class BaseMultiReduceMin : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin(BaseMultiReduceMin const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin(BaseMultiReduceMin &&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceMin() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceMin const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& min(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceMin const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Max reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceMax : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMax(BaseMultiReduceMax const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMax(BaseMultiReduceMax &&) = default; + BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete; + BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceMax() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceMax const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& max(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceMax const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Sum reducer class template. 
+ * + ************************************************************************** + */ +template +class BaseMultiReduceSum : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceSum(BaseMultiReduceSum const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceSum(BaseMultiReduceSum &&) = default; + BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete; + BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceSum() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceSum const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator+=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceSum const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Bitwise OR reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceBitOr : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default; + BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete; + BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceBitOr() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceBitOr const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator|=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceBitOr const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Bitwise AND reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceBitAnd : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default; + BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete; + BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceBitAnd() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceBitAnd const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! 
reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator&=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceBitAnd const& m_base; + size_t m_bin; + }; +}; + +} // namespace detail + +} // namespace reduce + +} // namespace RAJA + +#endif /* RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP */ diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 1c11ad92bc..539c451673 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -145,4 +145,4 @@ struct StatementExecutor< } // end namespace RAJA -#endif /* RAJA_pattern_nested_HPP */ +#endif /* RAJA_pattern_kernel_For_HPP */ diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index 53b595564e..43f72e0545 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -169,13 +169,13 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE bool operator!=(const IterableTiler &rhs) const + RAJA_INLINE bool operator!=(const iterator &rhs) const { return block_id != rhs.block_id; } RAJA_HOST_DEVICE - RAJA_INLINE bool operator<(const IterableTiler &rhs) const + RAJA_INLINE bool operator<(const iterator &rhs) const { return block_id < rhs.block_id; } diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp index 8cef228874..9667a55538 100644 --- a/include/RAJA/pattern/kernel/internal/LoopData.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp @@ -214,7 +214,7 @@ struct GenericWrapper : GenericWrapperBase { /*! - * Convenience object used to create thread-private a LoopData object. + * Convenience object used to create a thread-private LoopData object. */ template struct NestedPrivatizer { diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp index e47fe59e37..7f77df4214 100644 --- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp @@ -3,8 +3,7 @@ * * \file * - * \brief Header file for loop kernel internals: LoopData structure and - * related helper functions. + * \brief Header file for loop kernel internals and related helper functions. * ****************************************************************************** */ @@ -93,4 +92,4 @@ using setSegmentTypeFromData = } // end namespace RAJA -#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ +#endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp index 7b34949570..c750b95986 100644 --- a/include/RAJA/pattern/kernel/internal/Template.hpp +++ b/include/RAJA/pattern/kernel/internal/Template.hpp @@ -3,8 +3,7 @@ * * \file * - * \brief Header file for loop kernel internals: LoopData structure and - * related helper functions. + * \brief Header file for loop kernel internals and helper functions. 
* ****************************************************************************** */ @@ -83,4 +82,4 @@ using tuple_of_n = typename detail::TupleOfNHelper>:: } // end namespace RAJA -#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ +#endif /* RAJA_pattern_kernel_internal_Template_HPP */ diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 213c435236..b78ec0de92 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -374,23 +374,21 @@ void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&.. } - - // Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) template RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} else { return RAJA::resources::Resource(host_res); } } -#else +#endif + template RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} return RAJA::resources::Resource(host_res); } -#endif //Launch API which takes team resource struct and supports new reducers template diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp new file mode 100644 index 0000000000..3fbe36877c --- /dev/null +++ b/include/RAJA/pattern/multi_reduce.hpp @@ -0,0 +1,194 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file providing RAJA reduction declarations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_multi_reduce_HPP +#define RAJA_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/Operators.hpp" +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +// +// Forward declarations for multi reduction templates. +// Actual classes appear in forall_*.hxx header files. +// +// IMPORTANT: multi reduction policy parameter must be consistent with loop +// execution policy type. +// +// Also, multiple multi reductions using different reduction operations may be +// combined in a single RAJA forall() construct. +// + +/*! + ****************************************************************************** + * + * \brief Min multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr min_vals = ...; + + MultiReduceMin my_mins(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_mins[bins[i]].min(data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + min_vals[bin] = my_mins[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceMin; + +/*! 
+ ****************************************************************************** + * + * \brief Max multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr max_vals = ...; + + MultiReduceMax my_maxs(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_maxs[bins[i]].max(data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + max_vals[bin] = my_maxs[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceMax; + +/*! + ****************************************************************************** + * + * \brief Sum multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr sum_vals = ...; + + MultiReduceSum my_sums(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_sums[bins[i]] += (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + sum_vals[bin] = my_sums[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceSum; + +/*! + ****************************************************************************** + * + * \brief Bitwise OR multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr bit_vals = ...; + + MultiReduceBitOr my_bits(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_bits[bins[i]] |= (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + bit_vals[bin] = my_bits[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceBitOr; + + +/*! + ****************************************************************************** + * + * \brief Bitwise AND multi reducer class template. 
+ * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr bit_vals = ...; + + MultiReduceBitAnd my_bits(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_bits[bins[i]] &= (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + bit_vals[bin] = my_bits[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceBitAnd; + +} //namespace RAJA + + +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 7e685e2ce0..fb854c8706 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -2,12 +2,17 @@ #define FORALL_PARAM_HPP #include "RAJA/policy/sequential/params/reduce.hpp" +#include "RAJA/policy/sequential/params/kernel_name.hpp" #include "RAJA/policy/openmp/params/reduce.hpp" +#include "RAJA/policy/openmp/params/kernel_name.hpp" #include "RAJA/policy/openmp_target/params/reduce.hpp" +#include "RAJA/policy/openmp_target/params/kernel_name.hpp" #include "RAJA/policy/cuda/params/reduce.hpp" #include "RAJA/policy/cuda/params/kernel_name.hpp" #include "RAJA/policy/hip/params/reduce.hpp" +#include "RAJA/policy/hip/params/kernel_name.hpp" #include "RAJA/policy/sycl/params/reduce.hpp" +#include "RAJA/policy/sycl/params/kernel_name.hpp" #include "RAJA/util/CombiningAdapter.hpp" diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp index f867265b8a..898c92a621 100644 --- a/include/RAJA/policy/PolicyBase.hpp +++ b/include/RAJA/policy/PolicyBase.hpp @@ -42,6 +42,7 @@ enum class Pattern { forall, region, reduce, + multi_reduce, taskgraph, synchronize, workgroup, @@ -110,6 +111,25 @@ struct platform_is : camp::num>::value == P_> { }; +template +struct policy_has_trait_impl + : camp::num { +}; +/// +template +struct policy_has_trait_impl< + PolicyBaseT, Trait> + : camp::num...>::value> { +}; +/// +template +using policy_has_trait = policy_has_trait_impl, Trait>; + + template struct wrapper { using inner = Inner; @@ -121,6 +141,9 @@ namespace reduce struct ordered { }; +struct unordered { +}; + } // namespace reduce @@ -201,6 +224,15 @@ struct is_device_exec_policy DefineTypeTraitFromConcept(is_execution_policy, RAJA::concepts::ExecutionPolicy); + +template +struct is_reduce_policy : RAJA::pattern_is { +}; + +template +struct is_multi_reduce_policy : RAJA::pattern_is { +}; + } // end namespace type_traits } // end namespace RAJA diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp index a64212b665..e0ca557b32 100644 --- a/include/RAJA/policy/atomic_auto.hpp +++ b/include/RAJA/policy/atomic_auto.hpp @@ -63,81 +63,91 @@ namespace RAJA struct auto_atomic { }; +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T *acc) +{ + return atomicLoad(RAJA_AUTO_ATOMIC, acc); +} template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T *acc, T value) { - return atomicAdd(RAJA_AUTO_ATOMIC, acc, value); + atomicStore(RAJA_AUTO_ATOMIC, acc, value); } +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T *acc, T value) +{ + return atomicAdd(RAJA_AUTO_ATOMIC, acc, value); +} template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T *acc, T value) { return 
atomicSub(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T *acc, T value) { return atomicMin(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T *acc, T value) { return atomicMax(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc) { return atomicInc(RAJA_AUTO_ATOMIC, acc); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, - T volatile *acc, + T *acc, T compare) { return atomicInc(RAJA_AUTO_ATOMIC, acc, compare); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc) { return atomicDec(RAJA_AUTO_ATOMIC, acc); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, - T volatile *acc, + T *acc, T compare) { return atomicDec(RAJA_AUTO_ATOMIC, acc, compare); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T *acc, T value) { return atomicAnd(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T *acc, T value) { return atomicOr(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T *acc, T value) { return atomicXor(RAJA_AUTO_ATOMIC, acc, value); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, - T volatile *acc, + T *acc, T value) { return atomicExchange(RAJA_AUTO_ATOMIC, acc, value); @@ -145,7 +155,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(auto_atomic, T volatile *acc, T compare, T value) +atomicCAS(auto_atomic, T *acc, T compare, T value) { return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value); } diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp index fa3f4841a1..34755fa49d 100644 --- a/include/RAJA/policy/atomic_builtin.hpp +++ b/include/RAJA/policy/atomic_builtin.hpp @@ -20,9 +20,16 @@ #include "RAJA/config.hpp" +#include + +#if defined(RAJA_COMPILER_MSVC) || (defined(_WIN32) && defined(__INTEL_COMPILER)) +#include +#endif + #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" + #if defined(RAJA_ENABLE_HIP) #define RAJA_DEVICE_HIP RAJA_HOST_DEVICE #else @@ -37,199 +44,667 @@ namespace RAJA struct builtin_atomic { }; -namespace detail -{ + +namespace detail { + #if defined(RAJA_COMPILER_MSVC) || (defined(_WIN32) && defined(__INTEL_COMPILER)) -RAJA_DEVICE_HIP -RAJA_INLINE unsigned builtin_atomic_CAS(unsigned volatile *acc, - unsigned compare, - unsigned value) + +/*! + * Type trait for determining if the operator should be implemented + * using an intrinsic + */ +template +struct builtin_useIntrinsic { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; + + +/*! 
+ * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support + */ +template +struct builtin_useReinterpret { + static constexpr bool value = + !builtin_useIntrinsic::value && + (sizeof(T) == 1 || + sizeof(T) == 2 || + sizeof(T) == 4 || + sizeof(T) == 8); + + using type = + std::conditional_t>>; +}; + + +/*! + * Type trait for determining if the operator should be implemented + * using a compare and swap loop + */ +template +struct builtin_useCAS { + static constexpr bool value = + !builtin_useIntrinsic::value && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); +}; + + +/*! + * Atomics implemented using intrinsics + */ + + +/*! + * Atomic or using intrinsics + */ +RAJA_INLINE char builtin_atomicOr(char *acc, char value) { + return _InterlockedOr8(acc, value); +} - long long_value = RAJA::util::reinterp_A_as_B(value); - long long_compare = RAJA::util::reinterp_A_as_B(compare); +RAJA_INLINE short builtin_atomicOr(short *acc, short value) +{ + return _InterlockedOr16(acc, value); +} - long old = _InterlockedCompareExchange((long *)acc, long_value, long_compare); +RAJA_INLINE long builtin_atomicOr(long *acc, long value) +{ + return _InterlockedOr(acc, value); +} - return RAJA::util::reinterp_A_as_B(old); +RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value) +{ + return _InterlockedOr64(acc, value); } -RAJA_DEVICE_HIP -RAJA_INLINE unsigned long long builtin_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) + +/*! + * Atomic load using atomic or + */ +template ::value, bool> = true> +RAJA_INLINE T builtin_atomicLoad(T *acc) { + return builtin_atomicOr(acc, static_cast(0)); +} - long long long_value = - RAJA::util::reinterp_A_as_B(value); - long long long_compare = - RAJA::util::reinterp_A_as_B(compare); - long long old = _InterlockedCompareExchange64((long long volatile *)acc, - long_value, - long_compare); +/*! + * Atomic exchange using intrinsics + */ +RAJA_INLINE char builtin_atomicExchange(char *acc, char value) +{ + return _InterlockedExchange8(acc, value); +} - return RAJA::util::reinterp_A_as_B(old); +RAJA_INLINE short builtin_atomicExchange(short *acc, short value) +{ + return _InterlockedExchange16(acc, value); } -#else // RAJA_COMPILER_MSVC +RAJA_INLINE long builtin_atomicExchange(long *acc, long value) +{ + return _InterlockedExchange(acc, value); +} -RAJA_DEVICE_HIP -RAJA_INLINE unsigned builtin_atomic_CAS(unsigned volatile *acc, - unsigned compare, - unsigned value) +RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value) { - __atomic_compare_exchange_n( - acc, &compare, value, false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED); - return compare; + return _InterlockedExchange64(acc, value); } -RAJA_DEVICE_HIP -RAJA_INLINE unsigned long long builtin_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) + +/*! + * Atomic store using atomic exchange + */ +template ::value, bool> = true> +RAJA_INLINE void builtin_atomicStore(T *acc, T value) { - __atomic_compare_exchange_n( - acc, &compare, value, false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED); - return compare; + builtin_atomicExchange(acc, value); } -#endif // RAJA_COMPILER_MSVC +/*! 
+ * Atomic compare and swap using intrinsics + */ +RAJA_INLINE char builtin_atomicCAS(char *acc, char compare, char value) +{ + return _InterlockedCompareExchange8(acc, value, compare); +} -template -RAJA_DEVICE_HIP RAJA_INLINE - typename std::enable_if::type - builtin_atomic_CAS(T volatile *acc, T compare, T value) +RAJA_INLINE short builtin_atomicCAS(short *acc, short compare, short value) { - return RAJA::util::reinterp_A_as_B( - builtin_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return _InterlockedCompareExchange16(acc, value, compare); } -template -RAJA_DEVICE_HIP RAJA_INLINE - typename std::enable_if::type - builtin_atomic_CAS(T volatile *acc, T compare, T value) +RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value) +{ + return _InterlockedCompareExchange(acc, value, compare); +} + +RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value) +{ + return _InterlockedCompareExchange64(acc, value, compare); +} + + +/*! + * Atomic addition using intrinsics + */ +RAJA_INLINE char builtin_atomicAdd(char *acc, char value) +{ + return _InterlockedExchangeAdd8(acc, value); +} + +RAJA_INLINE short builtin_atomicAdd(short *acc, short value) +{ + return _InterlockedExchangeAdd16(acc, value); +} + +RAJA_INLINE long builtin_atomicAdd(long *acc, long value) +{ + return _InterlockedExchangeAdd(acc, value); +} + +RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value) { - return RAJA::util::reinterp_A_as_B(builtin_atomic_CAS( - (unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return _InterlockedExchangeAdd64(acc, value); } -template -struct BuiltinAtomicCAS; -template -struct BuiltinAtomicCAS { - static_assert(!(BYTES == 4 || BYTES == 8), - "builtin atomic cas assumes 4 or 8 byte targets"); +/*! + * Atomic subtraction using intrinsics + */ +RAJA_INLINE char builtin_atomicSub(char *acc, char value) +{ + return _InterlockedExchangeAdd8(acc, -value); +} + +RAJA_INLINE short builtin_atomicSub(short *acc, short value) +{ + return _InterlockedExchangeAdd16(acc, -value); +} + +RAJA_INLINE long builtin_atomicSub(long *acc, long value) +{ + return _InterlockedExchangeAdd(acc, -value); +} + +RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value) +{ + return _InterlockedExchangeAdd64(acc, -value); +} + + +/*! + * Atomic and using intrinsics + */ +RAJA_INLINE char builtin_atomicAnd(char *acc, char value) +{ + return _InterlockedAnd8(acc, value); +} + +RAJA_INLINE short builtin_atomicAnd(short *acc, short value) +{ + return _InterlockedAnd16(acc, value); +} + +RAJA_INLINE long builtin_atomicAnd(long *acc, long value) +{ + return _InterlockedAnd(acc, value); +} + +RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value) +{ + return _InterlockedAnd64(acc, value); +} + + +/*! + * Atomic xor using intrinsics + */ +RAJA_INLINE char builtin_atomicXor(char *acc, char value) +{ + return _InterlockedXor8(acc, value); +} + +RAJA_INLINE short builtin_atomicXor(short *acc, short value) +{ + return _InterlockedXor16(acc, value); +} + +RAJA_INLINE long builtin_atomicXor(long *acc, long value) +{ + return _InterlockedXor(acc, value); +} + +RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value) +{ + return _InterlockedXor64(acc, value); +} + + +#else // RAJA_COMPILER_MSVC + + +/*! 
+ * Type trait for determining if the operator should be implemented + * using an intrinsic + */ +template +struct builtin_useIntrinsic { + static constexpr bool value = + (std::is_integral::value || std::is_enum::value) && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; -template <> -struct BuiltinAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing builtin unsigned 32-bit CAS operator. - * Returns the OLD value that was replaced by the result of this operation. - */ - template - RAJA_DEVICE_HIP RAJA_INLINE T operator()(T volatile *acc, - OPER const &oper, - ShortCircuit const &sc) const - { -#ifdef RAJA_COMPILER_MSVC -#pragma warning( disable : 4244 ) // Force msvc to not emit conversion warning +/*! + * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support + */ +template +struct builtin_useReinterpret { + static constexpr bool value = + !std::is_integral::value && + !std::is_enum::value && + ((sizeof(T) == 1 +#if !defined(UINT8_MAX) + && sizeof(unsigned char) == 1 #endif - unsigned oldval, newval, readback; - - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - - while ((readback = builtin_atomic_CAS((unsigned *)acc, oldval, newval)) != - oldval) { - if (sc(readback)) break; - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } -#ifdef RAJA_COMPILER_MSVC -#pragma warning( default : 4244 ) // Reenable warning + ) || + (sizeof(T) == 2 +#if !defined(UINT16_MAX) + && sizeof(unsigned short) == 2 #endif -}; - -template <> -struct BuiltinAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing builtin unsigned 64-bit CAS operator. - * Returns the OLD value that was replaced by the result of this operation. - */ - template - RAJA_DEVICE_HIP RAJA_INLINE T operator()(T volatile *acc, - OPER const &oper, - ShortCircuit const &sc) const - { -#ifdef RAJA_COMPILER_MSVC -#pragma warning( disable : 4244 ) // Force msvc to not emit conversion warning + ) || + (sizeof(T) == 4 +#if !defined(UINT32_MAX) + && sizeof(unsigned int) == 4 #endif - unsigned long long oldval, newval, readback; - - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - - while ((readback = builtin_atomic_CAS((unsigned long long *)acc, - oldval, - newval)) != oldval) { - if (sc(readback)) break; - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } + ) || + (sizeof(T) == 8 +#if !defined(UINT64_MAX) + && sizeof(unsigned long long) == 8 +#endif + )); -#ifdef RAJA_COMPILER_MSVC -#pragma warning( default : 4244 ) // Reenable warning + using type = + std::conditional_t>>; +#else + unsigned long long>>>; +#endif +}; + +/*! + * Type trait for determining if the operator should be implemented + * using a compare and swap loop + */ +template +struct builtin_useCAS { + static constexpr bool value = + !std::is_integral::value && !std::is_enum::value && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; /*! 
- * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the builtin unsigned 32-bit and 64-bit CAS operators. + * Atomics implemented using intrinsics + */ + + +/*! + * Atomic load using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) +{ + return __atomic_load_n(acc, __ATOMIC_RELAXED); +} + + +/*! + * Atomic store using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) +{ + __atomic_store_n(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic exchange using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) +{ + return __atomic_exchange_n(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic compare and swap using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) +{ + __atomic_compare_exchange_n( + acc, &compare, value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return compare; +} + + +/*! + * Atomic addition using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) +{ + return __atomic_fetch_add(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic subtraction using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) +{ + return __atomic_fetch_sub(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic and using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +{ + return __atomic_fetch_and(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic or using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +{ + return __atomic_fetch_or(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic xor using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) +{ + return __atomic_fetch_xor(acc, value, __ATOMIC_RELAXED); +} + + +#endif // RAJA_COMPILER_MSVC + + +/*! + * Atomics implemented using reinterpret cast + */ + + +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using builtin_useReinterpret_t = typename builtin_useReinterpret::type; + + +/*! + * Atomic load using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicLoad(reinterpret_cast(acc))); +} + + +/*! + * Atomic store using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) +{ + using R = builtin_useReinterpret_t; + + builtin_atomicStore(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value)); +} + + +/*! + * Atomic exchange using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); +} + + +/*! 
+ * Atomic compare and swap using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); +} + + +/*! + * Implementation of compare and swap loop + */ + + +/*! + * Equality comparison for compare and swap loop using types supported by + * intrinsics. + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b) +{ + return a == b; +} + + +/*! + * Equality comparison for compare and swap loop using reinterpret cast. + * Converts to the underlying integral type to avoid cases where the values + * will never compare equal (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b) +{ + using R = builtin_useReinterpret_t; + + return builtin_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); +} + + +/*! + * Generic impementation of any atomic 8, 16, 32, or 64 bit operator + * that can be implemented using a builtin compare and swap primitive. * Returns the OLD value that was replaced by the result of this operation. */ -template -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper(T volatile *acc, - OPER &&oper) +template +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, + Oper &&oper) +{ + T old = builtin_atomicLoad(acc); + T expected; + + do { + expected = old; + old = builtin_atomicCAS(acc, expected, oper(expected)); + } while (!builtin_atomicCAS_equal(old, expected)); + + return old; +} + + +/*! + * Generic impementation of any atomic 8, 16, 32, or 64 bit operator + * that can be implemented using a builtin compare and swap primitive. + * Uses short-circuiting for improved efficiency. Returns the OLD value + * that was replaced by the result of this operation. + */ +template +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, + Oper &&oper, + ShortCircuit &&sc) +{ + T old = builtin_atomicLoad(acc); + + if (sc(old)) { + return old; + } + + T expected; + + do { + expected = old; + old = builtin_atomicCAS(acc, expected, oper(expected)); + } while (!builtin_atomicCAS_equal(old, expected) && !sc(old)); + + return old; +} + + +/*! + * Atomics implemented using compare and swap loop + */ + + +/*! + * Atomic addition using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) { - BuiltinAtomicCAS cas; - return cas(acc, std::forward(oper), [](T const &) { return false; }); + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old + value; + }); } -template -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper_sc(T volatile *acc, - OPER &&oper, - ShortCircuit const &sc) + +/*! + * Atomic subtraction using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) { - BuiltinAtomicCAS cas; - return cas(acc, std::forward(oper), sc); + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old - value; + }); +} + + +/*! + * Atomic and using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old & value; + }); +} + + +/*! 
+ * Atomic or using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old | value; + }); +} + + +/*! + * Atomic xor using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } @@ -237,125 +712,115 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper_sc(T volatile *acc, template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a + value; }); + return detail::builtin_atomicLoad(acc); } +template +RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T *acc, T value) +{ + detail::builtin_atomicStore(acc, value); +} template -RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a - value; }); + return detail::builtin_atomicAdd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T *acc, T value) { - if (*acc < value) { - return *acc; - } - return detail::builtin_atomic_CAS_oper_sc(acc, - [=](T a) { - return a < value ? a : value; - }, - [=](T current) { - return current < value; - }); + return detail::builtin_atomicSub(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value) { - if (*acc > value) { - return *acc; - } - return detail::builtin_atomic_CAS_oper_sc(acc, - [=](T a) { - return a > value ? a : value; - }, - [=](T current) { - return current > value; - }); + return detail::builtin_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); +} + +template +RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value) +{ + return detail::builtin_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? value : old; + }, + [value] (T current) { + return value <= current; + }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T volatile *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a + 1; }); + return detail::builtin_atomicAdd(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T volatile *acc, T val) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T old) { - return ((old >= val) ? 0 : (old + 1)); + return detail::builtin_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? 
static_cast(0) : old + static_cast(1); }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T volatile *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a - 1; }); + return detail::builtin_atomicSub(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T volatile *acc, T val) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T old) { - return (((old == 0) | (old > val)) ? val : (old - 1)); + return detail::builtin_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a & value; }); + return detail::builtin_atomicAnd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T volatile *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a | value; }); + return detail::builtin_atomicOr(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a ^ value; }); + return detail::builtin_atomicXor(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T) { return value; }); + return detail::builtin_atomicExchange(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T -atomicCAS(builtin_atomic, T volatile *acc, T compare, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value) { - return detail::builtin_atomic_CAS(acc, compare, value); + return detail::builtin_atomicCAS(acc, compare, value); } } // namespace RAJA -// make sure this define doesn't bleed out of this header -#undef RAJA_AUTO_ATOMIC #endif diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp index c561122349..e9d5bc454f 100644 --- a/include/RAJA/policy/cuda.hpp +++ b/include/RAJA/policy/cuda.hpp @@ -34,6 +34,7 @@ #include "RAJA/policy/cuda/forall.hpp" #include "RAJA/policy/cuda/policy.hpp" #include "RAJA/policy/cuda/reduce.hpp" +#include "RAJA/policy/cuda/multi_reduce.hpp" #include "RAJA/policy/cuda/scan.hpp" #include "RAJA/policy/cuda/sort.hpp" #include "RAJA/policy/cuda/kernel.hpp" diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 43d927acab..88a89d5362 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -49,6 +49,26 @@ namespace RAJA namespace cuda { +//! Get the properties of the current device +RAJA_INLINE +cudaDeviceProp get_device_prop() +{ + int device; + cudaErrchk(cudaGetDevice(&device)); + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, device)); + return prop; +} + +//! Get a reference to a static cached copy of the current device properties. +// This caches a copy on first use to speedup later calls. 
+RAJA_INLINE +cudaDeviceProp& device_prop() +{ + static thread_local cudaDeviceProp prop = get_device_prop(); + return prop; +} + //! Allocator for pinned memory for use in basic_mempool struct PinnedAllocator { @@ -146,36 +166,22 @@ namespace detail //! struct containing data necessary to coordinate kernel launches with reducers struct cudaInfo { + const void* func = nullptr; cuda_dim_t gridDim{0, 0, 0}; cuda_dim_t blockDim{0, 0, 0}; + size_t* dynamic_smem = nullptr; ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; bool setup_reducers = false; +}; +struct cudaStatusInfo : cudaInfo { #if defined(RAJA_ENABLE_OPENMP) - cudaInfo* thread_states = nullptr; omp::mutex lock; #endif }; -//! class that changes a value on construction then resets it at destruction -template -class SetterResetter -{ -public: - SetterResetter(T& val, T new_val) : m_val(val), m_old_val(val) - { - m_val = new_val; - } - SetterResetter(const SetterResetter&) = delete; - ~SetterResetter() { m_val = m_old_val; } - -private: - T& m_val; - T m_old_val; -}; - -extern cudaInfo g_status; +extern cudaStatusInfo g_status; -extern cudaInfo tl_status; +extern cudaStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif @@ -275,54 +281,94 @@ bool setupReducers() { return detail::tl_status.setup_reducers; } RAJA_INLINE cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; } +//! get grid size of current launch +RAJA_INLINE +cuda_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x * + detail::tl_status.gridDim.y * + detail::tl_status.gridDim.z; } + //! get blockDim of current launch RAJA_INLINE cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; } +//! get block size of current launch +RAJA_INLINE +cuda_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x * + detail::tl_status.blockDim.y * + detail::tl_status.blockDim.z; } + +//! get dynamic shared memory usage for current launch +RAJA_INLINE +size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; } + +//! get maximum dynamic shared memory for current launch +RAJA_INLINE +size_t maxDynamicShmem() +{ + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, detail::tl_status.func)); + return func_attr.maxDynamicSharedSizeBytes; +} + +constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits::max(); + +//! Allocate dynamic shared memory for current launch +// +// The first argument is a functional object that takes the maximum number of +// objects that can fit into the dynamic shared memory available and returns +// the number of objects to allocate. +// The second argument is the required alignment. +// +// Returns an offset into dynamic shared memory aligned to align on success, +// or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory +// takes the failure return path. +template < typename T, typename GetNFromMax > +RAJA_INLINE +size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T)) +{ + const size_t unaligned_shmem = *detail::tl_status.dynamic_smem; + const size_t align_offset = ((unaligned_shmem % align) != size_t(0)) + ? 
align - (unaligned_shmem % align) + : size_t(0); + const size_t aligned_shmem = unaligned_shmem + align_offset; + + const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem; + const size_t n_bytes = sizeof(T) * + std::forward(get_n_from_max)(max_shmem_bytes / sizeof(T)); + + if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) { + *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes; + return aligned_shmem; + } else { + return dynamic_smem_allocation_failure; + } +} + //! get resource for current launch RAJA_INLINE ::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; } //! create copy of loop_body that is setup for device execution +// +// Note: This is done to setup the Reducer and MultiReducer objects through +// their copy constructors. Both look at tl_status to setup per kernel launch +// resources. template RAJA_INLINE typename std::remove_reference::type make_launch_body( + const void* func, cuda_dim_t gridDim, cuda_dim_t blockDim, - size_t RAJA_UNUSED_ARG(dynamic_smem), + size_t& dynamic_smem, ::RAJA::resources::Cuda res, LOOP_BODY&& loop_body) { - detail::SetterResetter setup_reducers_srer( - detail::tl_status.setup_reducers, true); - detail::SetterResetter<::RAJA::resources::Cuda> res_srer( - detail::tl_status.res, res); - - detail::tl_status.gridDim = gridDim; - detail::tl_status.blockDim = blockDim; + ::RAJA::detail::ScopedAssignment info_sa(detail::tl_status, + detail::cudaInfo{func, gridDim, blockDim, &dynamic_smem, res, true}); using return_type = typename std::remove_reference::type; return return_type(std::forward(loop_body)); } -//! Get the properties of the current device -RAJA_INLINE -cudaDeviceProp get_device_prop() -{ - int device; - cudaErrchk(cudaGetDevice(&device)); - cudaDeviceProp prop; - cudaErrchk(cudaGetDeviceProperties(&prop, device)); - return prop; -} - -//! 
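// Illustrative sketch, not part of the patch: the round-up-to-alignment
// arithmetic that allocateDynamicShmem performs above, as a standalone helper
// with worked values. align_up is a hypothetical name.
#include <cstddef>

constexpr std::size_t align_up(std::size_t offset, std::size_t align)
{
  // pad to the next multiple of align unless offset is already one
  return (offset % align != std::size_t(0))
             ? offset + (align - offset % align)
             : offset;
}

static_assert(align_up(0, 8) == 0, "already aligned");
static_assert(align_up(12, 8) == 16, "12 bytes in use -> next 8-byte boundary");
static_assert(align_up(16, 8) == 16, "exact multiples are unchanged");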
Get a copy of the device properties, this copy is cached on first use to speedup later calls -RAJA_INLINE -cudaDeviceProp& device_prop() -{ - static thread_local cudaDeviceProp prop = get_device_prop(); - return prop; -} - static constexpr int cuda_occupancy_uninitialized_int = -1; static constexpr size_t cuda_occupancy_uninitialized_size_t = diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index c4aabd012f..41fe17c84a 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -292,7 +292,7 @@ struct WorkRunner< // // TODO: Privatize the loop_body, using make_launch_body to setup reductions // - // LOOP_BODY body = RAJA::cuda::make_launch_body( + // LOOP_BODY body = RAJA::cuda::make_launch_body(func, // gridSize, blockSize, shmem, stream, std::forward(loop_body)); storage.template emplace( diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp index 573618fc25..aedfe91a03 100644 --- a/include/RAJA/policy/cuda/atomic.hpp +++ b/include/RAJA/policy/cuda/atomic.hpp @@ -25,17 +25,31 @@ #include #include +#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6 +#define RAJA_ENABLE_CUDA_ATOMIC_REF +#endif + +#if defined(RAJA_ENABLE_CUDA_ATOMIC_REF) +#include +#endif + +#include "camp/list.hpp" + #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" #if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif +#include "RAJA/util/EnableIf.hpp" #include "RAJA/util/Operators.hpp" #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" +// TODO: When we can use if constexpr in C++17, this file can be cleaned up + + namespace RAJA { @@ -43,596 +57,602 @@ namespace RAJA namespace detail { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 350) // baseline CUDA_ARCH sm_35 check -#warning CUDA_ARCH is set too low in nvcc. Should set nvcc -arch=sm_35 or greater. COMPILING WITH DEFAULT atomicCAS! -#endif -// All CUDA atomic functions are checked for individual arch versions. -// Most >= 200 checks can be deemed as >= 110 (except CAS 64-bit, Add 32-bit float, and Add 64-bit ULL), but using 200 for shared memory support. -// If using < 350, certain atomics will be implemented with atomicCAS. +/*! + * Type trait for determining if atomic operators should be implemented + * using builtin functions. This type trait can be used for a lot of atomic + * operators. More specific type traits are added when needed, such as + * cuda_useBuiltinExchange below. + */ +template +struct cuda_useBuiltinCommon { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; + -#if __CUDA_ARCH__ >= 200 /*! - * Generic impementation of atomic 32-bit or 64-bit compare and swap primitive. - * Implementation uses the existing CUDA supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the value that was stored before this operation. + * Type trait for determining if atomic operators should be implemented + * by reinterpreting inputs to types that the builtin functions support. + * This type trait can be used for a lot of atomic operators. More specific + * type traits are added when needed, such as cuda_useReinterpretExchange + * below. 
*/ -RAJA_INLINE __device__ unsigned cuda_atomic_CAS( - unsigned volatile *acc, - unsigned compare, - unsigned value) -{ - return ::atomicCAS((unsigned *)acc, compare, value); -} -/// -RAJA_INLINE __device__ unsigned long long cuda_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) -{ - return ::atomicCAS((unsigned long long *)acc, compare, value); -} -/// template -RAJA_INLINE __device__ -typename std::enable_if::type -cuda_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - cuda_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} -/// +struct cuda_useReinterpretCommon { + static constexpr bool value = + !cuda_useBuiltinCommon::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; +}; + + +/*! + * Alias for determining the integral type of the same size as the given type + */ template -RAJA_INLINE __device__ -typename std::enable_if::type -cuda_atomic_CAS(T volatile *acc, T compare, T value) +using cuda_useReinterpretCommon_t = typename cuda_useReinterpretCommon::type; + + +/*! + * Performs an atomic bitwise or using a builtin function. Stores the new value + * in the given address and returns the old value. + * + * This overload using builtin functions is used to implement atomic loads + * under some build configurations. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value) { - return RAJA::util::reinterp_A_as_B( - cuda_atomic_CAS((unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return ::atomicOr(acc, value); } -template -struct CudaAtomicCAS { -}; +/*! + * Atomic exchange + */ -template <> -struct CudaAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing CUDA supplied unsigned 32-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 32-bit T - unsigned oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ((readback = cuda_atomic_CAS((unsigned volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if the exchange operator should be implemented + * using a builtin + */ +template +struct cuda_useBuiltinExchange { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; }; -template <> -struct CudaAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing CUDA supplied unsigned 64-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. 
- */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 64-bit T - unsigned long long oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ( - (readback = cuda_atomic_CAS((unsigned long long volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if the exchange operator should be implemented + * by reinterpreting inputs to types that the builtin exchange supports + */ +template +struct cuda_useReinterpretExchange { + static constexpr bool value = + !cuda_useBuiltinExchange::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; }; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange::type; /*! - * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the existing CUDA supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the OLD value that was replaced by the result of this operation. + * Performs an atomic exchange using a builtin function. Stores the new value + * in the given address and returns the old value. */ -template -RAJA_INLINE __device__ T cuda_atomic_CAS_oper(T volatile *acc, OPER &&oper) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value) { - CudaAtomicCAS cas; - return cas(acc, std::forward(oper)); + return ::atomicExch(acc, value); } -#endif // end CAS >= 200 -#if __CUDA_ARCH__ >= 200 /*! - * Catch-all policy passes off to CUDA's builtin atomics. - * - * This catch-all will only work for types supported by the compiler. - * Specialization below can adapt for some unsupported types. - * - * These are atomic in cuda device code and non-atomic otherwise + * Performs an atomic exchange using a reinterpret cast. Stores the new value + * in the given address and returns the old value. */ -template -RAJA_INLINE __device__ T cuda_atomicAdd(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a + value; - }); -} + using R = cuda_useReinterpretExchange_t; -// 32-bit signed atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicAdd(int volatile *acc, - int value) -{ - return ::atomicAdd((int *)acc, value); + return RAJA::util::reinterp_A_as_B( + cuda_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); } -// 32-bit unsigned atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicAdd(unsigned volatile *acc, - unsigned value) -{ - return ::atomicAdd((unsigned *)acc, value); -} +/*! 
+ * Atomic load and store + */ +#if defined(RAJA_ENABLE_CUDA_ATOMIC_REF) -// 64-bit unsigned atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicAdd( - unsigned long long volatile *acc, - unsigned long long value) +template +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return ::atomicAdd((unsigned long long *)acc, value); + return cuda::atomic_ref(*acc).load( + cuda::memory_order_relaxed{}); } -// 32-bit float atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ float cuda_atomicAdd(float volatile *acc, - float value) +template +RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value) { - return ::atomicAdd((float *)acc, value); + cuda::atomic_ref(*acc).store( + value, cuda::memory_order_relaxed{}); } -#endif +#else -// 64-bit double atomicAdd support added for sm_60 -#if __CUDA_ARCH__ >= 600 -template <> -RAJA_INLINE __device__ double cuda_atomicAdd(double volatile *acc, - double value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return ::atomicAdd((double *)acc, value); + return cuda_atomicOr(acc, static_cast(0)); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a - value; - }); -} + using R = cuda_useReinterpretCommon_t; -// 32-bit signed atomicSub support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicSub(int volatile *acc, - int value) -{ - return ::atomicSub((int *)acc, value); + return RAJA::util::reinterp_A_as_B( + cuda_atomicLoad(reinterpret_cast(acc))); } - -// 32-bit unsigned atomicSub support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicSub(unsigned volatile *acc, - unsigned value) +template +RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value) { - return ::atomicSub((unsigned *)acc, value); + cuda_atomicExchange(acc, value); } + #endif -#if __CUDA_ARCH__ >= 200 + +/*! + * Atomic compare and swap + */ + +/*! + * Type trait for determining if the compare and swap operator should be + * implemented using a builtin + */ template -RAJA_INLINE __device__ T cuda_atomicMin(T volatile *acc, T value) -{ - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return value < a ? value : a; - }); -} +struct cuda_useBuiltinCAS { + static constexpr bool value = +#if __CUDA_ARCH__ >= 700 + std::is_same::value || +#endif + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; -// 32-bit signed atomicMin support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicMin(int volatile *acc, - int value) -{ - return ::atomicMin((int *)acc, value); -} +/*! + * Type trait for determining if the compare and swap operator should be + * implemented by reinterpreting inputs to types that the builtin compare + * and swap supports + */ +template +struct cuda_useReinterpretCAS { + static constexpr bool value = + !cuda_useBuiltinCAS::value && + ( +#if __CUDA_ARCH__ >= 700 + sizeof(T) == sizeof(unsigned short) || +#endif + sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long) + ); + + using type = +#if __CUDA_ARCH__ >= 700 + std::conditional_t +#if __CUDA_ARCH__ >= 700 + > +#endif + ; +}; +/*! 
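// Illustrative sketch, not part of the patch: the fallback load/store idiom
// used above when cuda::atomic_ref is not available. A relaxed load is an
// atomicOr with an identity operand, and a relaxed store is an atomicExch
// whose result is discarded. Shown for unsigned int only; the function names
// are hypothetical.
__device__ unsigned int relaxed_load(unsigned int* acc)
{
  // or-ing with 0 is a read-modify-write that leaves *acc unchanged and
  // returns the value that was read atomically
  return atomicOr(acc, 0u);
}

__device__ void relaxed_store(unsigned int* acc, unsigned int value)
{
  // the returned old value is not needed for a plain store
  (void)atomicExch(acc, value);
}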
+ * Alias for determining the integral type of the same size as the given type + */ +template +using cuda_useReinterpretCAS_t = typename cuda_useReinterpretCAS::type; -// 32-bit unsigned atomicMin support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicMin(unsigned volatile *acc, - unsigned value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value) { - return ::atomicMin((unsigned *)acc, value); + return ::atomicCAS(acc, compare, value); } -#endif -// 64-bit unsigned atomicMin support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicMin( - unsigned long long volatile *acc, - unsigned long long value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value) { - return ::atomicMin((unsigned long long *)acc, value); + using R = cuda_useReinterpretCAS_t; + + return RAJA::util::reinterp_A_as_B( + cuda_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicMax(T volatile *acc, T value) +/*! + * Equality comparison for compare and swap loop. Converts to the underlying + * integral type to avoid cases where the values will never compare equal + * (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return value > a ? value : a; - }); + return a == b; } -// 32-bit signed atomicMax support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicMax(int volatile *acc, - int value) +template ::value, bool> = true> +RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b) { - return ::atomicMax((int *)acc, value); + using R = cuda_useReinterpretCommon_t; + + return cuda_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); } -// 32-bit unsigned atomicMax support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicMax(unsigned volatile *acc, - unsigned value) +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator. + * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc, + Oper&& oper) { - return ::atomicMax((unsigned *)acc, value); -} -#endif + T old = cuda_atomicLoad(acc); + T expected; -// 64-bit unsigned atomicMax support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicMax( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicMax((unsigned long long *)acc, value); -} -#endif + do { + expected = old; + old = cuda_atomicCAS(acc, expected, oper(expected)); + } while (!cuda_atomicCAS_equal(old, expected)); -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicInc(T volatile *acc, T val) -{ - // See: - // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return cuda_atomic_CAS_oper(acc, [=] __device__(T old) { - return ((old >= val) ? 0 : (old + 1)); - }); + return old; } -// 32-bit unsigned atomicInc support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicInc(unsigned volatile *acc, - unsigned value) +/*! 
+ * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting. + * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc, + Oper&& oper, + ShortCircuit&& sc) { - return ::atomicInc((unsigned *)acc, value); -} + T old = cuda_atomicLoad(acc); -template -RAJA_INLINE __device__ T cuda_atomicInc(T volatile *acc) -{ - return cuda_atomic_CAS_oper(acc, - [=] __device__(T a) { return a + 1; }); -} + if (sc(old)) { + return old; + } -// 32-bit signed atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ int cuda_atomicInc(int volatile *acc) -{ - return ::atomicAdd((int *)acc, (int)1); -} + T expected; -// 32-bit unsigned atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ unsigned cuda_atomicInc(unsigned volatile *acc) -{ - return ::atomicAdd((unsigned *)acc, (unsigned)1); -} + do { + expected = old; + old = cuda_atomicCAS(acc, expected, oper(expected)); + } while (!cuda_atomicCAS_equal(old, expected) && !sc(old)); -// 64-bit unsigned atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicInc( - unsigned long long volatile *acc) -{ - return ::atomicAdd((unsigned long long *)acc, (unsigned long long)1); + return old; } -// 32-bit float atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ float cuda_atomicInc(float volatile *acc) -{ - return ::atomicAdd((float *)acc, (float)1); -} -#endif -// 64-bit double atomicAdd support added for sm_60, used as backend for atomicInc +/*! + * Atomic addition + */ +using cuda_atomicAdd_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long int, + float #if __CUDA_ARCH__ >= 600 -template <> -RAJA_INLINE __device__ double cuda_atomicInc(double volatile *acc) -{ - return ::atomicAdd((double *)acc, (double)1); -} + , + double #endif +>; - -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicDec(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value) { - // See: - // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return cuda_atomic_CAS_oper(acc, [=] __device__(T old) { - return (((old == 0) | (old > val)) ? val : (old - 1)); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old + value; }); } -// 32-bit unsigned atomicDec support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicDec(unsigned volatile *acc, - unsigned value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value) { - return ::atomicDec((unsigned *)acc, value); + return ::atomicAdd(acc, value); } -template -RAJA_INLINE __device__ T cuda_atomicDec(T volatile *acc) -{ - return cuda_atomic_CAS_oper(acc, - [=] __device__(T a) { return a - 1; }); -} -// 32-bit signed atomicSub support by CUDA, used as backend for atomicDec -template <> -RAJA_INLINE __device__ int cuda_atomicDec(int volatile *acc) -{ - return ::atomicSub((int *)acc, (int)1); -} +/*! 
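// Illustrative sketch, not part of the patch: the short-circuiting form of the
// CAS loop above, applied to a minimum. When the stored value is already <=
// the candidate there is nothing to write, so no compare-and-swap is issued.
// relaxed_min is a hypothetical name; unsigned long long is used so the plain
// CUDA ::atomicCAS and ::atomicOr overloads apply.
__device__ unsigned long long relaxed_min(unsigned long long* acc,
                                          unsigned long long value)
{
  unsigned long long old = atomicOr(acc, 0ull);  // relaxed load, as above

  while (!(old <= value)) {                      // short-circuit test
    unsigned long long assumed = old;
    old = atomicCAS(acc, assumed, value);        // value < old here, so min == value
    if (old == assumed) { break; }               // our CAS took effect
  }
  return old;  // last observed old value, as in the loops above
}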
+ * Atomic subtract + */ +using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types; -// 32-bit unsigned atomicSub support by CUDA, used as backend for atomicDec -template <> -RAJA_INLINE __device__ unsigned cuda_atomicDec(unsigned volatile *acc) -{ - return ::atomicSub((unsigned *)acc, (unsigned)1); -} -#endif +using cuda_atomicSub_via_Sub_builtin_types = ::camp::list< + int, + unsigned int +>; +using cuda_atomicSub_via_Add_builtin_types = ::camp::list< + unsigned long long int, + float +#if __CUDA_ARCH__ >= 600 + , + double +#endif +>; -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicAnd(T volatile *acc, T value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a & value; + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old - value; }); } -// 32-bit signed atomicAnd support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicAnd(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return ::atomicAnd((int *)acc, value); + return ::atomicSub(acc, value); } - -// 32-bit unsigned atomicAnd support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicAnd(unsigned volatile *acc, - unsigned value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return ::atomicAnd((unsigned *)acc, value); + return ::atomicAdd(acc, -value); } -#endif -// 64-bit unsigned atomicAnd support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicAnd( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicAnd((unsigned long long *)acc, value); -} + +/*! + * Atomic min/max + */ +using cuda_atomicMinMax_builtin_types = ::camp::list< + int, + unsigned int +#if __CUDA_ARCH__ >= 500 + , + long long int, + unsigned long long int #endif +>; -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicOr(T volatile *acc, T value) + +/*! + * Atomic min + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a | value; - }); + return cuda_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); } -// 32-bit signed atomicOr support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicOr(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value) { - return ::atomicOr((int *)acc, value); + return ::atomicMin(acc, value); } -// 32-bit unsigned atomicOr support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicOr(unsigned volatile *acc, - unsigned value) +/*! + * Atomic max + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value) { - return ::atomicOr((unsigned *)acc, value); + return cuda_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? 
value : old; + }, + [value] (T current) { + return value <= current; + }); } -#endif -// 64-bit unsigned atomicOr support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicOr( - unsigned long long volatile *acc, - unsigned long long value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value) { - return ::atomicOr((unsigned long long *)acc, value); + return ::atomicMax(acc, value); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicXor(T volatile *acc, T value) + +/*! + * Atomic increment/decrement with reset + */ +using cuda_atomicIncDecReset_builtin_types = ::camp::list< + unsigned int +>; + + +/*! + * Atomic increment with reset + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a ^ value; + // See: + // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc + return cuda_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? static_cast(0) : old + static_cast(1); }); } -// 32-bit signed atomicXor support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicXor(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value) { - return ::atomicXor((int *)acc, value); + return ::atomicInc(acc, value); } -// 32-bit unsigned atomicXor support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicXor(unsigned volatile *acc, - unsigned value) +/*! + * Atomic increment (implemented in terms of atomic addition) + */ +template +RAJA_INLINE __device__ T cuda_atomicInc(T *acc) { - return ::atomicXor((unsigned *)acc, value); + return cuda_atomicAdd(acc, static_cast(1)); } -#endif -// 64-bit unsigned atomicXor support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicXor( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicXor((unsigned long long *)acc, value); -} -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicExchange(T volatile *acc, T value) +/*! + * Atomic decrement with reset + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T) { - return value; + // See: + // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } -template <> -RAJA_INLINE __device__ int cuda_atomicExchange( - int volatile *acc, int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value) { - return ::atomicExch((int *)acc, value); + return ::atomicDec(acc, value); } -template <> -RAJA_INLINE __device__ unsigned cuda_atomicExchange( - unsigned volatile *acc, unsigned value) + +/*! + * Atomic decrement (implemented in terms of atomic subtraction) + */ +template +RAJA_INLINE __device__ T cuda_atomicDec(T *acc) { - return ::atomicExch((unsigned *)acc, value); + return cuda_atomicSub(acc, static_cast(1)); } -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicExchange( - unsigned long long volatile *acc, - unsigned long long value) + +/*! + * Atomic bitwise functions (and, or, xor) + */ +using cuda_atomicBit_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long int +>; + + +/*! 
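// Illustrative sketch, not part of the patch: the wrap-around semantics of
// atomicInc/atomicDec with a reset value, emulated on the host so they can be
// spelled out. With reset == 3, an incremented counter cycles 0, 1, 2, 3, 0.
// The helper names are hypothetical; the bodies match the lambdas above.
constexpr unsigned int inc_with_reset(unsigned int old, unsigned int reset)
{
  return (reset <= old) ? 0u : old + 1u;
}

constexpr unsigned int dec_with_reset(unsigned int old, unsigned int reset)
{
  return (old == 0u || reset < old) ? reset : old - 1u;
}

static_assert(inc_with_reset(2u, 3u) == 3u, "still below the reset value");
static_assert(inc_with_reset(3u, 3u) == 0u, "reaching the reset value wraps to 0");
static_assert(dec_with_reset(0u, 3u) == 3u, "0 wraps back to the reset value");
static_assert(dec_with_reset(5u, 3u) == 3u, "values above the reset are clamped");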
+ * Atomic and + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value) { - return ::atomicExch((unsigned long long *)acc, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old & value; + }); } -template <> -RAJA_INLINE __device__ float cuda_atomicExchange( - float volatile *acc, float value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value) { - return ::atomicExch((float *)acc, value); + return ::atomicAnd(acc, value); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicCAS(T volatile *acc, T compare, T value) +/*! + * Atomic or + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value) { - return cuda_atomic_CAS(acc, compare, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old | value; + }); } -template <> -RAJA_INLINE __device__ int cuda_atomicCAS( - int volatile *acc, int compare, int value) -{ - return ::atomicCAS((int *)acc, compare, value); -} +/*! + * Atomic or via builtin functions was implemented much earlier since atomicLoad + * may depend on it. + */ + -template <> -RAJA_INLINE __device__ unsigned cuda_atomicCAS( - unsigned volatile *acc, unsigned compare, unsigned value) +/*! + * Atomic xor + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value) { - return ::atomicCAS((unsigned *)acc, compare, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicCAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value) { - return ::atomicCAS((unsigned long long *)acc, compare, value); + return ::atomicXor(acc, value); } -#endif + } // namespace detail @@ -648,7 +668,31 @@ RAJA_INLINE __device__ unsigned long long cuda_atomicCAS( RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAdd(cuda_atomic_explicit, T volatile *acc, T value) +atomicLoad(cuda_atomic_explicit, T *acc) +{ +#ifdef __CUDA_ARCH__ + return detail::cuda_atomicLoad(acc); +#else + return RAJA::atomicLoad(host_policy{}, acc); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void +atomicStore(cuda_atomic_explicit, T *acc, T value) +{ +#ifdef __CUDA_ARCH__ + detail::cuda_atomicStore(acc, value); +#else + RAJA::atomicStore(host_policy{}, acc, value); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicAdd(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicAdd(acc, value); @@ -660,7 +704,7 @@ atomicAdd(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicSub(cuda_atomic_explicit, T volatile *acc, T value) +atomicSub(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicSub(acc, value); @@ -672,7 +716,7 @@ atomicSub(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMin(cuda_atomic_explicit, T volatile *acc, T value) +atomicMin(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicMin(acc, value); @@ -684,7 +728,7 @@ atomicMin(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMax(cuda_atomic_explicit, T volatile *acc, T value) +atomicMax(cuda_atomic_explicit, 
T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicMax(acc, value); @@ -696,21 +740,21 @@ atomicMax(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(cuda_atomic_explicit, T volatile *acc, T val) +atomicInc(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return detail::cuda_atomicInc(acc, val); + return detail::cuda_atomicInc(acc, value); #else - return RAJA::atomicInc(host_policy{}, acc, val); + return RAJA::atomicInc(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(cuda_atomic_explicit, T volatile *acc) +atomicInc(cuda_atomic_explicit, T *acc) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicInc(acc); @@ -722,21 +766,21 @@ atomicInc(cuda_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(cuda_atomic_explicit, T volatile *acc, T val) +atomicDec(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return detail::cuda_atomicDec(acc, val); + return detail::cuda_atomicDec(acc, value); #else - return RAJA::atomicDec(host_policy{}, acc, val); + return RAJA::atomicDec(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(cuda_atomic_explicit, T volatile *acc) +atomicDec(cuda_atomic_explicit, T *acc) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicDec(acc); @@ -748,7 +792,7 @@ atomicDec(cuda_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAnd(cuda_atomic_explicit, T volatile *acc, T value) +atomicAnd(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicAnd(acc, value); @@ -760,7 +804,7 @@ atomicAnd(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicOr(cuda_atomic_explicit, T volatile *acc, T value) +atomicOr(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicOr(acc, value); @@ -772,7 +816,7 @@ atomicOr(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicXor(cuda_atomic_explicit, T volatile *acc, T value) +atomicXor(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicXor(acc, value); @@ -784,7 +828,7 @@ atomicXor(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicExchange(cuda_atomic_explicit, T volatile *acc, T value) +atomicExchange(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicExchange(acc, value); @@ -796,7 +840,7 @@ atomicExchange(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(cuda_atomic_explicit, T volatile *acc, T compare, T value) +atomicCAS(cuda_atomic_explicit, T *acc, T compare, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicCAS(acc, compare, value); diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 333f0f90e8..493136400c 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -548,7 +548,8 @@ forall_impl(resources::Cuda cuda_res, if (len > 0) { auto func = reinterpret_cast( - 
&impl::forall_cuda_kernel); + &impl::forall_cuda_kernel); // // Setup shared memory buffers @@ -567,7 +568,7 @@ forall_impl(resources::Cuda cuda_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::cuda::make_launch_body( + LOOP_BODY body = RAJA::cuda::make_launch_body(func, dims.blocks, dims.threads, shmem, cuda_res, std::forward(loop_body)); // @@ -617,7 +618,8 @@ forall_impl(resources::Cuda cuda_res, if (len > 0) { auto func = reinterpret_cast( - impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY, IndexType, camp::decay >); + &impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY, + IndexType, camp::decay >); // // Setup shared memory buffers @@ -643,7 +645,7 @@ forall_impl(resources::Cuda cuda_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::cuda::make_launch_body( + LOOP_BODY body = RAJA::cuda::make_launch_body(func, dims.blocks, dims.threads, shmem, cuda_res, std::forward(loop_body)); // diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index b0d2ea7cf1..b2daa3a23e 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -334,10 +334,10 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) T temp = val; - if (numThreads % policy::cuda::WARP_SIZE == 0) { + if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -345,7 +345,7 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) } else { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -370,7 +370,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) { T temp = val; - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = __shfl_xor_sync(0xffffffff, temp, i); Combiner{}(temp, rhs); } @@ -388,15 +388,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; + int warpId = threadId % policy::cuda::device_constants.WARP_SIZE; + int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE; T temp = val; - if (numThreads % policy::cuda::WARP_SIZE == 0) { + if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -404,7 +404,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -415,18 +415,18 @@ RAJA_DEVICE RAJA_INLINE 
T block_reduce(T val, T identity) } // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { + if (numThreads > policy::cuda::device_constants.WARP_SIZE) { - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + static_assert(policy::cuda::device_constants.MAX_WARPS <= policy::cuda::device_constants.WARP_SIZE, + "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values"); // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -438,13 +438,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { + if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index c070d618ea..7465f515b0 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -237,7 +237,7 @@ struct CudaKernelLauncherGetter using type = camp::decay)>; static constexpr type get() noexcept { - return internal::CudaKernelLauncherFixed; + return &internal::CudaKernelLauncherFixed; } }; @@ -251,7 +251,7 @@ struct CudaKernelLauncherGetter<0, 0, Data, executor_t> using type = camp::decay)>; static constexpr type get() noexcept { - return internal::CudaKernelLauncher; + return &internal::CudaKernelLauncher; } }; @@ -281,10 +281,15 @@ struct CudaLaunchHelper; + inline static const void* get_func() + { + return reinterpret_cast(kernelGetter_t::get()); + } + inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -363,7 +368,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -400,17 +405,6 @@ struct CudaLaunchHelper; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static @@ -312,7 +312,7 @@ struct CudaStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::cuda::WARP_SIZE; + const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE; // request one thread per element in the segment set_cuda_dim(dims.dims.threads, len); @@ -352,7 +352,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, 
"BitMask is too large for CUDA warp size"); static @@ -391,7 +391,7 @@ struct CudaStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::cuda::WARP_SIZE; + const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE; // request one thread per element in the segment set_cuda_dim(dims.dims.threads, len); diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index 8486abaa2c..87556ed8b1 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -273,7 +273,7 @@ struct CudaStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static inline RAJA_DEVICE @@ -332,7 +332,7 @@ struct CudaStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static inline RAJA_DEVICE diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 602221e58a..0db1dc4e0d 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -75,7 +75,8 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = reinterpret_cast( + &launch_global_fcn); resources::Cuda cuda_res = res.get(); @@ -99,17 +100,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); } RAJA_FT_END; @@ -128,7 +131,8 @@ struct LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -151,9 +155,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } @@ -234,7 +240,8 @@ struct LaunchExecute; - auto func = launch_global_fcn_fixed; + auto func = reinterpret_cast( + &launch_global_fcn_fixed); resources::Cuda cuda_res = res.get(); @@ -258,17 +265,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); } RAJA_FT_END; @@ -288,7 +297,8 @@ struct 
LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -312,9 +322,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp new file mode 100644 index 0000000000..f9f60f730e --- /dev/null +++ b/include/RAJA/policy/cuda/multi_reduce.hpp @@ -0,0 +1,764 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_multi_reduce_HPP +#define RAJA_cuda_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include +#include +#include +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/mutex.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/OffsetOperators.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#include "RAJA/policy/cuda/intrinsics.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + #include "RAJA/policy/desul/atomic.hpp" +#else + #include "RAJA/policy/cuda/atomic.hpp" +#endif + +#include "RAJA/policy/cuda/policy.hpp" +#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction algorithms. +// +////////////////////////////////////////////////////////////////////// +// + +//! combine value into global memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins), + T identity, + int bin, + T value, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + if (value == identity) { return; } + + int tally_index = GetTallyIndex::template index(); // globalWarpId by default + int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::cuda::atomic{}(tally_mem[tally_offset], value); +} + + +//! 
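// Illustrative sketch, not part of the patch: the replicated-tally idea used
// by block_multi_reduce_combine_global_atomic above. Each bin gets several
// independent slots; a thread picks a replica from its id with a power-of-two
// modulo, so concurrent atomics spread across slots instead of contending on
// one. The replica-major layout and all names here are assumptions, not
// RAJA's actual GetTallyOffset/GetTallyIndex.
__device__ int pick_replica(int index, int replication)
{
  return index & (replication - 1);  // valid when replication is a power of two
}

__device__ int tally_offset(int bin, int num_bins, int replica)
{
  return replica * num_bins + bin;   // one plausible replica-major layout
}

__device__ void combine_into_tally(float* tally_mem, int bin, int num_bins,
                                   int replication, float value)
{
  int global_thread = threadIdx.x + blockIdx.x * blockDim.x;
  int replica = pick_replica(global_thread, replication);
  atomicAdd(&tally_mem[tally_offset(bin, num_bins, replica)], value);
}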
initialize shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins, + T identity, + T* shared_mem, + int shared_replication) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + for (int shmem_offset = threadId; + shmem_offset < shared_replication * num_bins; + shmem_offset += numThreads) { + shared_mem[shmem_offset] = identity; + } + __syncthreads(); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins, + T identity, + int bin, + T value, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication) +{ + if (value == identity) { return; } + + int shared_index = GetSharedIndex::template index(); // threadId by default + int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication); + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + + RAJA::reduce::cuda::atomic{}(shared_mem[shmem_offset], value); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins, + T identity, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + __syncthreads(); + for (int bin = threadId; bin < num_bins; bin += numThreads) { + + T value = identity; + for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) { + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + Combiner{}(value, shared_mem[shmem_offset]); + } + + if (value != identity) { + int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::cuda::atomic{}(tally_mem[tally_offset], value); + } + + } +} + +} // namespace impl + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction classes. +// +////////////////////////////////////////////////////////////////////// +// + +//! MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_TallyData +{ + //! 
setup permanent settings, allocate and initialize tally memory + template < typename Container > + MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity) + : m_tally_mem(nullptr) + , m_identity(identity) + , m_num_bins(container.size()) + , m_tally_bins(get_tally_bins(m_num_bins)) + , m_tally_replication(get_tally_replication()) + { + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } + + MultiReduceGridAtomicHostInit_TallyData() = delete; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + ~MultiReduceGridAtomicHostInit_TallyData() = default; + + + //! reset permanent settings, reallocate and reset tally memory + template < typename Container > + void reset_permanent(Container const& container, T const& identity) + { + int new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + teardown_permanent(); + m_num_bins = new_num_bins; + m_tally_bins = get_tally_bins(m_num_bins); + m_tally_replication = get_tally_replication(); + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } else { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value; + ++bin; + } + } + for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) { + for (int bin = 0; bin < m_num_bins; ++bin) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity; + } + } + } + m_identity = identity; + } + + //! teardown permanent settings, free tally memory + void teardown_permanent() + { + destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication); + } + + + //! 
get value for bin, assumes synchronization occurred elsewhere + T get(int bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) { + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + reducer.combine(m_tally_mem[tally_offset]); + } + return reducer.get_and_clear(); + } + + + int num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + +private: + static constexpr size_t s_tally_alignment = std::max(size_t(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE), + size_t(RAJA::DATA_ALIGN)); + static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T)); + + using tally_mempool_type = device_pinned_mempool_type; + using tally_tuning = typename tuning::GlobalAtomicReplicationTuning; + using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer; + using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator; + using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch; + + + static int get_tally_bins(int num_bins) + { + return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size; + } + + static int get_tally_replication() + { + int min_tally_replication = 1; +#if defined(RAJA_ENABLE_OPENMP) + min_tally_replication = omp_get_max_threads(); +#endif + + struct { + int func_min_global_replication; + } func_data{min_tally_replication}; + + return TallyAtomicReplicationConcretizer{}.template + get_global_replication(func_data); + } + + template < typename Container > + static T* create_tally(Container const& container, T const& identity, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + T* tally_mem = tally_mempool_type::getInstance().template malloc( + tally_replication*tally_bins, s_tally_alignment); + + if (tally_replication > 0) { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(value); + ++bin; + } + } + for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) { + for (int bin = 0; bin < num_bins; ++bin) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(identity); + } + } + } + return tally_mem; + } + + static void destroy_tally(T*& tally_mem, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return; + } + + for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) { + for (int bin = num_bins; bin > 0; --bin) { + int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication); + tally_mem[tally_offset].~T(); + } + } + tally_mempool_type::getInstance().free(tally_mem); + tally_mem = nullptr; + } + +protected: + using GetTallyIndex = typename tally_tuning::ReplicationIndexer; + using GetTallyOffset = typename GetTallyOffset_rebind::template rebind; + + T* m_tally_mem; + T m_identity; + int m_num_bins; + int m_tally_bins; + int m_tally_replication; // power of 2, at least the max number of omp threads +}; + + +//! 
MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! defer to tally data for some functions + using TallyData::TallyData; + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! setup per launch, do nothing + void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) + { } + + //! teardown per launch, do nothing + void teardown_launch() + { } + + + //! setup on device, do nothing + RAJA_DEVICE + void setup_device() + { } + + //! finalize on device, do nothing + RAJA_DEVICE + void finalize_device() + { } + + + //! combine value on device, combine a value into the tally atomically + RAJA_DEVICE + void combine_device(int bin, T value) + { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + + //! combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; +}; + + +//! MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceBlockThenGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! setup permanent settings, defer to tally data + template < typename Container > + MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity) + : TallyData(container, identity) + , m_shared_offset(s_shared_offset_unknown) + , m_shared_replication(0) + { } + + MultiReduceBlockThenGridAtomicHostInit_Data() = delete; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + ~MultiReduceBlockThenGridAtomicHostInit_Data() = default; + + + //! defer to tally data for some functions + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! 
setup per launch, setup shared memory parameters + void setup_launch(size_t block_size) + { + if (m_num_bins == size_t(0)) { + m_shared_offset = s_shared_offset_invalid; + return; + } + + size_t shared_replication = 0; + const size_t shared_offset = allocateDynamicShmem( + [&](size_t max_shmem_size) { + + struct { + size_t func_threads_per_block; + size_t func_max_shared_replication_per_block; + } func_data{block_size, max_shmem_size / m_num_bins}; + + shared_replication = SharedAtomicReplicationConcretizer{}.template + get_shared_replication(func_data); + return m_num_bins * shared_replication; + }); + + if (shared_offset != dynamic_smem_allocation_failure) { + m_shared_replication = static_cast(shared_replication); + m_shared_offset = static_cast(shared_offset); + } else { + m_shared_offset = s_shared_offset_invalid; + } + } + + //! teardown per launch, unset shared memory parameters + void teardown_launch() + { + m_shared_replication = 0; + m_shared_offset = s_shared_offset_unknown; + } + + + //! setup on device, initialize shared memory + RAJA_DEVICE + void setup_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_init_shmem( + m_num_bins, m_identity, + shared_mem, m_shared_replication); + } + } + + //! finalize on device, combine values in shared memory into the tally + RAJA_DEVICE + void finalize_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::grid_multi_reduce_shmem_to_global_atomic( + m_num_bins, m_identity, + shared_mem, GetSharedOffset{}, m_shared_replication, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + + //! combine value on device, combine a value into shared memory + RAJA_DEVICE + void combine_device(int bin, T value) + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_combine_shmem_atomic( + m_num_bins, m_identity, + bin, value, + shared_mem, GetSharedOffset{}, m_shared_replication); + } else { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + //! 
combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using shared_tuning = typename tuning::SharedAtomicReplicationTuning; + using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer; + using GetSharedIndex = typename shared_tuning::ReplicationIndexer; + using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator; + using GetSharedOffset = typename GetSharedOffset_rebind::template rebind; + + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + + static constexpr int s_shared_offset_unknown = std::numeric_limits::max(); + static constexpr int s_shared_offset_invalid = std::numeric_limits::max() - 1; + + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; + + int m_shared_offset; // in bytes + int m_shared_replication; // power of 2 + + + RAJA_DEVICE + T* get_shared_mem() const + { + if (m_shared_offset == s_shared_offset_invalid) { + return nullptr; + } + extern __shared__ char shared_mem[]; + return reinterpret_cast(&shared_mem[m_shared_offset]); + } +}; + + +/*! + ************************************************************************** + * + * \brief Cuda multi-reduce data class template. + * + * This class manages synchronization, data lifetimes, and interaction with + * the runtime kernel launch info passing facilities. + * + * This class manages the lifetime of underlying reduce_data_type using + * calls to setup and teardown methods. This includes storage durations: + * - permanent, the lifetime of the parent object + * - launch, setup before a launch using the launch parameters and + * teardown after the launch + * - device, setup all device threads in a kernel before any block work and + * teardown all device threads after all block work is finished + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataCuda +{ + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = + std::conditional_t<(atomic_available), + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic), + cuda::MultiReduceBlockThenGridAtomicHostInit_Data, + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic), + cuda::MultiReduceGridAtomicHostInit_Data, + void>>, + void>; + + + using SyncList = std::vector; + +public: + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataCuda() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataCuda(Container const& container, T identity) + : m_parent(this) + , m_sync_list(new SyncList) + , m_data(container, identity) + , m_own_launch_data(false) + { + } + + //! copy and on host attempt to setup for device + // init val_ptr to avoid uninitialized read caused by host copy of + // reducer in host device lambda not being used on device. 
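// ----------------------------------------------------------------------------
// A minimal usage sketch for the multi-reduce machinery documented above. The
// front-end RAJA::MultiReduce* objects are declared at the bottom of this file
// via RAJA_DECLARE_ALL_MULTI_REDUCERS; the policy spelling and the
// operator[] / get() calls below follow RAJA's documented multi-reduce
// interface but are assumptions here, shown only to illustrate the lifetime:
// construct on the host (permanent), copy into the kernel (launch), combine
// per bin on the device, read results back with get().
#include "RAJA/RAJA.hpp"

void example_histogram(const int* bin_ids, const int n)  // hypothetical helper
{
  constexpr int num_bins = 8;
  RAJA::MultiReduceSum<RAJA::cuda_multi_reduce_atomic, int> hist(num_bins, 0);

  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) {
        hist[bin_ids[i]] += 1;   // combine_device() through this data class
      });

  for (int b = 0; b < num_bins; ++b) {
    int count = hist.get(b);     // synchronizes streams, then folds the tally
    (void)count;
  }
}
// ----------------------------------------------------------------------------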
+ RAJA_HOST_DEVICE + MultiReduceDataCuda(MultiReduceDataCuda const& other) +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + : m_parent(other.m_parent) +#else + : m_parent(&other) +#endif + , m_sync_list(other.m_sync_list) + , m_data(other.m_data) + , m_own_launch_data(false) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent) { + if (setupReducers()) { + // the copy made in make_launch_body does this setup + add_resource_to_synchronization_list(currentResource()); + m_data.setup_launch(currentBlockSize()); + m_own_launch_data = true; + m_parent = nullptr; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device enters this branch + m_data.setup_device(); + } +#endif + } + + MultiReduceDataCuda(MultiReduceDataCuda &&) = delete; + MultiReduceDataCuda& operator=(MultiReduceDataCuda const&) = delete; + MultiReduceDataCuda& operator=(MultiReduceDataCuda &&) = delete; + + //! cleanup resources owned by this copy + // on device store in pinned buffer on host + RAJA_HOST_DEVICE + ~MultiReduceDataCuda() + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent == this) { + // the original object, owns permanent storage + synchronize_resources_and_clear_list(); + delete m_sync_list; + m_sync_list = nullptr; + m_data.teardown_permanent(); + } else if (m_parent) { + // do nothing + } else { + if (m_own_launch_data) { + // the copy made in make_launch_body, owns launch data + m_data.teardown_launch(); + m_own_launch_data = false; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device, does finalization on the device + m_data.finalize_device(); + } +#endif + } + + + template < typename Container > + void reset(Container const& container, T identity) + { + synchronize_resources_and_clear_list(); + m_data.reset_permanent(container, identity); + } + + + //! apply reduction (const version) -- still combines internal values + RAJA_HOST_DEVICE + void combine(int bin, T const& value) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + m_data.combine_host(bin, value); +#else + m_data.combine_device(bin, value); +#endif + } + + + //! 
map result value back to host if not done already; return aggregate value + T get(int bin) + { + synchronize_resources_and_clear_list(); + return m_data.get(bin); + } + + + size_t num_bins() const { return m_data.num_bins(); } + + T identity() const { return m_data.identity(); } + + +private: + MultiReduceDataCuda const *m_parent; + SyncList* m_sync_list; + reduce_data_type m_data; + bool m_own_launch_data; + + void add_resource_to_synchronization_list(resources::Cuda res) + { + for (resources::Cuda& list_res : *m_sync_list) { + if (list_res.get_stream() == res.get_stream()) { + return; + } + } + m_sync_list->emplace_back(res); + } + + void synchronize_resources_and_clear_list() + { + for (resources::Cuda& list_res : *m_sync_list) { + ::RAJA::cuda::synchronize(list_res); + } + m_sync_list->clear(); + } +}; + +} // end namespace cuda + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy, cuda::MultiReduceDataCuda) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp index d845bccfc2..4edf645ed3 100644 --- a/include/RAJA/policy/cuda/params/kernel_name.hpp +++ b/include/RAJA/policy/cuda/params/kernel_name.hpp @@ -1,11 +1,10 @@ #ifndef CUDA_KERNELNAME_HPP #define CUDA_KERNELNAME_HPP -//#include "../util/policy.hpp" - #if defined(RAJA_CUDA_ACTIVE) #include +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" #include "RAJA/pattern/params/kernel_name.hpp" namespace RAJA { @@ -15,10 +14,12 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_cuda_policy > - init(KernelName& kn, const RAJA::cuda::detail::cudaInfo & cs) + init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &) { #if defined(RAJA_ENABLE_NV_TOOLS_EXT) nvtxRangePush(kn.name); +#else + RAJA_UNUSED_VAR(kn); #endif } @@ -31,7 +32,7 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(KernelName&) + resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &) { #if defined(RAJA_ENABLE_NV_TOOLS_EXT) nvtxRangePop(); diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 84cd8a301c..cd71a37480 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -31,7 +31,9 @@ #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/util/Operators.hpp" +#include "RAJA/util/OffsetOperators.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -79,6 +81,13 @@ struct IndexGlobal; template struct IndexFlatten; +template +struct IndexDivide; + +template +struct IndexModulo; + + /*! * Use the max occupancy of a kernel on the current device when launch * parameters are not fully determined. @@ -160,6 +169,84 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer }; +/*! + * Get an amount of replication that is preferred_replication. + */ +template < size_t preferred_replication > +struct ConstantPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data)) + { + return IdxT(preferred_replication); + } +}; + +/*! + * Get an amount of replication that is preferred_replication_before_cutoff if + * data.func_threads_per_block is less than t_cutoff or + * preferred_replication_after_cutoff otherwise. 
+ */ +template < size_t t_cutoff, size_t preferred_replication_before_cutoff, + size_t preferred_replication_after_cutoff > +struct ThreadsPerBlockCutoffPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& data) + { + IdxT cutoff = t_cutoff; + IdxT func_threads_per_block = data.func_threads_per_block; + + if (func_threads_per_block < cutoff) { + return IdxT(preferred_replication_before_cutoff); + } else { + return IdxT(preferred_replication_after_cutoff); + } + } +}; + +/*! + * Get an amount of shared atomic replication that is a power of 2 that is at + * most the amount given by data.func_max_shared_replication_per_block or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct SharedAtomicReplicationMaxPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_shared_replication(Data const& data) + { + IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return prev_pow2(std::min(preferred_replication, + func_max_shared_replication_per_block)); + } +}; + +/*! + * Get an amount of global atomic replication that is a power of 2 that is at + * least the amount given by data.func_min_global_replication or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct GlobalAtomicReplicationMinPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_global_replication(Data const& data) + { + IdxT func_min_global_replication = data.func_min_global_replication; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return next_pow2(std::max(preferred_replication, func_min_global_replication)); + } +}; + + enum struct reduce_algorithm : int { combine_last_block, @@ -181,6 +268,36 @@ struct ReduceTuning static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool consistent = + (algorithm == reduce_algorithm::combine_last_block); +}; + + +enum struct multi_reduce_algorithm : int +{ + init_host_combine_block_atomic_then_grid_atomic, + init_host_combine_global_atomic +}; + +template < typename t_AtomicReplicationConcretizer, + typename t_ReplicationIndexer, + typename t_OffsetCalculator > +struct AtomicReplicationTuning +{ + using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer; + using ReplicationIndexer = t_ReplicationIndexer; + using OffsetCalculator = t_OffsetCalculator; +}; + +template < multi_reduce_algorithm t_algorithm, + typename t_SharedAtomicReplicationTuning, + typename t_GlobalAtomicReplicationTuning > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning; + using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning; + static constexpr bool consistent = false; }; } // namespace cuda @@ -190,9 +307,38 @@ namespace policy namespace cuda { +struct DeviceConstants +{ + RAJA::Index_type WARP_SIZE; + RAJA::Index_type MAX_BLOCK_SIZE; + RAJA::Index_type MAX_WARPS; + RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics + + constexpr 
DeviceConstants(RAJA::Index_type warp_size, + RAJA::Index_type max_block_size, + RAJA::Index_type atomic_cache_line_bytes) noexcept + : WARP_SIZE(warp_size) + , MAX_BLOCK_SIZE(max_block_size) + , MAX_WARPS(max_block_size / warp_size) + , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes) + { } +}; + +// +// Operations in the included files are parametrized using the following +// values for CUDA warp size and max block size. +// +constexpr DeviceConstants device_constants(32, 1024, 32); // V100 +static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS, + "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS"); +static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0, + "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not " + "a multiple of device_constants.WARP_SIZE"); + constexpr const size_t MIN_BLOCKS_PER_SM = 1; constexpr const size_t MAX_BLOCKS_PER_SM = 32; + template struct cuda_indexer {}; @@ -268,7 +414,22 @@ struct cuda_reduce_policy make_policy_pattern_launch_platform_t::value, - RAJA::Platform::cuda> { + RAJA::Platform::cuda, + std::conditional_t> { +}; + +template < typename tuning > +struct cuda_multi_reduce_policy + : public RAJA:: + make_policy_pattern_launch_platform_t::value, + RAJA::Platform::cuda, + std::conditional_t> { }; /*! @@ -285,74 +446,6 @@ struct cuda_atomic_explicit{}; using cuda_atomic = cuda_atomic_explicit; -template < RAJA::cuda::reduce_algorithm algorithm, - RAJA::cuda::block_communication_mode comm_mode, - size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified > -using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< - algorithm, comm_mode, replication, atomic_stride> >; - -// Policies for RAJA::Reduce* objects with specific behaviors. -// - *atomic* policies may use atomics to combine partial results and falls back -// on a non-atomic policy when atomics can't be used with the given type. The -// use of atomics leads to order of operation differences which change the -// results of floating point sum reductions run to run. The memory used with -// atomics is initialized on the device which can be expensive on some HW. -// On some HW this is faster overall than the non-atomic policies. -// - *atomic_host* policies are similar to the atomic policies above. However -// the memory used with atomics is initialized on the host which is -// significantly cheaper on some HW. On some HW this is faster overall than -// the non-atomic and atomic policies. -// - *device_fence policies use normal memory accesses with device scope fences -// in the implementation. This works on all HW. -// - *block_fence policies use special (atomic) memory accesses that only cache -// in a cache shared by the whole device to avoid having to use -// device scope fences. This improves performance on some HW but -// is more difficult to code correctly. 
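// ----------------------------------------------------------------------------
// A minimal migration sketch, assuming "RAJA/policy/cuda/policy.hpp" is
// included: the free WARP_SIZE / MAX_BLOCK_SIZE / MAX_WARPS constants removed
// further down are now members of the constexpr device_constants object
// defined above, so call sites only change spelling and the values still fold
// at compile time. The helper name is hypothetical.
__device__ inline int example_warp_number(int threadId)
{
  return static_cast<int>(threadId /
                          RAJA::policy::cuda::device_constants.WARP_SIZE);
}

static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS ==
                  RAJA::policy::cuda::device_constants.MAX_BLOCK_SIZE /
                  RAJA::policy::cuda::device_constants.WARP_SIZE,
              "MAX_WARPS is derived from the other two values in the constructor");
// ----------------------------------------------------------------------------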
-using cuda_reduce_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::combine_last_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::combine_last_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; - -// Policy for RAJA::Reduce* objects that gives the same answer every time when -// used in the same way -using cuda_reduce = cuda_reduce_device_fence; - -// Policy for RAJA::Reduce* objects that may use atomics and may not give the -// same answer every time when used in the same way -using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; - -// Policy for RAJA::Reduce* objects that lets you select the default atomic or -// non-atomic policy with a bool -template < bool with_atomic > -using cuda_reduce_base = std::conditional_t; - - // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 struct cuda_block_reduce{}; @@ -400,21 +493,6 @@ template struct cuda_thread_masked_loop {}; - -// -// Operations in the included files are parametrized using the following -// values for CUDA warp size and max block size. 
-// -constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 32; -constexpr const RAJA::Index_type WARP_SIZE = 32; -constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; -constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; -static_assert(WARP_SIZE >= MAX_WARPS, - "RAJA Assumption Broken: WARP_SIZE < MAX_WARPS"); -static_assert(MAX_BLOCK_SIZE % WARP_SIZE == 0, - "RAJA Assumption Broken: MAX_BLOCK_SIZE not " - "a multiple of WARP_SIZE"); - struct cuda_synchronize : make_policy_pattern_launch_t { @@ -992,6 +1070,38 @@ struct IndexFlatten }; +template +struct IndexDivide +{ + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() / static_cast(divisor); + } + + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return RAJA_DIVIDE_CEILING_INT(indexer::template size(), static_cast(divisor)); + } +}; + +template +struct IndexModulo +{ + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() % static_cast(divisor); + } + + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return static_cast(divisor); + } +}; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > @@ -1037,6 +1147,13 @@ using thread_y = IndexGlobal; template using thread_z = IndexGlobal; +template +using thread_xyz = IndexFlatten, + thread_y, + thread_z>; + template using block_x = IndexGlobal; template @@ -1044,6 +1161,13 @@ using block_y = IndexGlobal; template using block_z = IndexGlobal; +template +using block_xyz = IndexFlatten, + block_y, + block_z>; + template using global_x = IndexGlobal; template @@ -1051,6 +1175,42 @@ using global_y = IndexGlobal; template using global_z = IndexGlobal; + +template +using global_xyz = IndexFlatten, + global_y, + global_z>; + + +template +using warp_xyz = IndexDivide>; + +template +using warp_global_xyz = IndexFlatten, + block_xyz>; + } // namespace cuda // contretizers used in forall, scan, and sort policies @@ -1248,16 +1408,147 @@ using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; + // policies usable with reducers -using policy::cuda::cuda_reduce_device_fence; -using policy::cuda::cuda_reduce_block_fence; -using policy::cuda::cuda_reduce_atomic_device_init_device_fence; -using policy::cuda::cuda_reduce_atomic_device_init_block_fence; -using policy::cuda::cuda_reduce_atomic_host_init_device_fence; -using policy::cuda::cuda_reduce_atomic_host_init_block_fence; -using policy::cuda::cuda_reduce_base; -using policy::cuda::cuda_reduce; -using policy::cuda::cuda_reduce_atomic; +template < cuda::reduce_algorithm algorithm, + cuda::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy< + cuda::ReduceTuning>; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - non-atomic policies store partial results and combine them in the same +// order every time, leading to consistent results for a loop run to run. +// - *atomic* policies may use atomics to combine partial results. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. 
Falls back +// on a non-atomic implementation if atomics can't be used with the given +// type. The memory used with atomics is initialized on the device using +// atomics which adds overhead. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host. This is faster +// overall than other policies on HW with direct host access to device memory +// such as the IBM power 9 + Nvidia V100 Sierra/Lassen systems. +// - *device_fence* policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence* policies use special (atomic) memory accesses that use +// a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using cuda_reduce_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::combine_last_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::combine_last_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_device_combine_atomic_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_device_combine_atomic_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_host_combine_atomic_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_host_combine_atomic_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using cuda_reduce = cuda_reduce_device_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; + + +// policies usable with multi_reducers +template < cuda::multi_reduce_algorithm algorithm, + typename SharedAtomicReplicationConcretizer, + typename SharedAtomicReplicationIndexer, + typename GlobalAtomicReplicationConcretizer, + typename GlobalAtomicReplicationIndexer > +using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy< + cuda::MultiReduceTuning< + algorithm, + cuda::AtomicReplicationTuning>, + cuda::AtomicReplicationTuning>>>; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. 
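// ----------------------------------------------------------------------------
// A hedged usage sketch for the reduce policy aliases re-added above (they
// move here unchanged from the policy namespace): cuda_reduce gives the same
// answer run to run, while cuda_reduce_atomic may reorder floating-point
// combines. Assumes "RAJA/RAJA.hpp" is included and a CUDA build; the helper
// name is hypothetical.
double example_sum(const double* x, const int n)
{
  RAJA::ReduceSum<RAJA::cuda_reduce, double>        repeatable_sum(0.0); // consistent run to run
  RAJA::ReduceSum<RAJA::cuda_reduce_atomic, double> atomic_sum(0.0);     // may reorder FP adds

  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) {
        repeatable_sum += x[i];
        atomic_sum     += x[i];
      });

  // Both results are correct; only the floating-point rounding may differ.
  return repeatable_sum.get();
}
// ----------------------------------------------------------------------------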
+// - *no_replication* policies use the minimum amount of resources. The +// lack of resources means they may perform poorly. These policies are +// intended for use cases where low overhead is more important than high +// performance such as error flags that are rarely set. +// - *host_init* policies initialize memory used with atomics on the host. +// This is faster overall than other policies on HW with direct host access +// to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen +// systems. +using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + cuda::SharedAtomicReplicationMaxPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<16>>, + cuda::thread_xyz<>, + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// special policy to test that multi-reducers work if there is not enough shmem +using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + cuda::SharedAtomicReplicationMaxPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<0>>, + cuda::thread_xyz<>, + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// +using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// +using cuda_multi_reduce_atomic_global_no_replication_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<1>>, + cuda::block_xyz<>>; + +// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_multi_reduce_atomic = cuda_multi_reduce_atomic_block_then_atomic_grid_host_init; +// Similar to above but optimized for low overhead in cases where it is rarely used +using cuda_multi_reduce_atomic_low_performance_low_overhead = + cuda_multi_reduce_atomic_global_no_replication_host_init; + // policies usable with kernel using policy::cuda::cuda_block_reduce; @@ -1266,11 +1557,11 @@ using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, - cuda::thread_x>; + cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop, kernel_sync_requirement::none, - cuda::thread_x>; + cuda::thread_x>; using policy::cuda::cuda_warp_masked_direct; using policy::cuda::cuda_warp_masked_loop; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 516b02383c..8d55698af8 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -206,15 +206,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) const int numThreads = ThreadIterationGetter::size(); const int threadId = 
ThreadIterationGetter::index(); - const int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; - const int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; + const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE; + const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE; T temp = val; - if (numThreads % RAJA::policy::cuda::WARP_SIZE == 0) { + if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < RAJA::policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -222,7 +222,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < RAJA::policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -232,18 +232,18 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } } - static_assert(RAJA::policy::cuda::MAX_WARPS <= RAJA::policy::cuda::WARP_SIZE, + static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <= RAJA::policy::cuda::device_constants.WARP_SIZE, "Max Warps must be less than or equal to Warp Size for this algorithm to work"); // reduce per warp values - if (numThreads > RAJA::policy::cuda::WARP_SIZE) { + if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE) { // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -255,13 +255,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * RAJA::policy::cuda::WARP_SIZE < numThreads) { + if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < RAJA::policy::cuda::MAX_WARPS; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS; i *= 2) { T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -887,8 +887,8 @@ class Reduce : 1; static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) ? tuning::atomic_stride - : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : ((policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..71bf429079 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -30,8 +30,32 @@ RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +atomicLoad(AtomicPolicy, T *acc) +{ + return desul::atomic_load(acc, + raja_default_desul_order{}, + raja_default_desul_scope{}); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE void +atomicStore(AtomicPolicy, T *acc, T value) +{ + desul::atomic_store(acc, + value, + raja_default_desul_order{}, + raja_default_desul_scope{}); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T +atomicAdd(AtomicPolicy, T *acc, T value) +{ + return desul::atomic_fetch_add(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -41,8 +65,9 @@ RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +atomicSub(AtomicPolicy, T *acc, T value) +{ + return desul::atomic_fetch_sub(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -51,9 +76,9 @@ atomicSub(AtomicPolicy, T volatile *acc, T value) { RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + return desul::atomic_fetch_min(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -62,9 +87,9 @@ RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + return desul::atomic_fetch_max(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -73,9 +98,9 @@ RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_INLINE T atomicInc(AtomicPolicy, T *acc) { - return desul::atomic_fetch_inc(const_cast(acc), + return desul::atomic_fetch_inc(acc, raja_default_desul_order{}, raja_default_desul_scope{}); } @@ -83,22 +108,22 @@ RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val) { // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(acc, + val, + raja_default_desul_order{}, + raja_default_desul_scope{}); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_INLINE T atomicDec(AtomicPolicy, T *acc) { - return desul::atomic_fetch_dec(const_cast(acc), + return desul::atomic_fetch_dec(acc, raja_default_desul_order{}, raja_default_desul_scope{}); } @@ -106,22 
+131,22 @@ RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val) { // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(acc, + val, + raja_default_desul_order{}, + raja_default_desul_scope{}); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + return desul::atomic_fetch_and(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -130,9 +155,9 @@ RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + return desul::atomic_fetch_or(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -141,9 +166,9 @@ RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + return desul::atomic_fetch_xor(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -152,9 +177,9 @@ RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value) { - return desul::atomic_exchange(const_cast(acc), + return desul::atomic_exchange(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -163,9 +188,9 @@ RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value) { - return desul::atomic_compare_exchange(const_cast(acc), + return desul::atomic_compare_exchange(acc, compare, value, raja_default_desul_order{}, diff --git a/include/RAJA/policy/hip.hpp b/include/RAJA/policy/hip.hpp index a1578fd9df..ab7e922c0f 100644 --- a/include/RAJA/policy/hip.hpp +++ b/include/RAJA/policy/hip.hpp @@ -33,6 +33,7 @@ #include "RAJA/policy/hip/forall.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/reduce.hpp" +#include "RAJA/policy/hip/multi_reduce.hpp" #include "RAJA/policy/hip/scan.hpp" #include "RAJA/policy/hip/sort.hpp" #include "RAJA/policy/hip/kernel.hpp" diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 84c6d1fa38..f1f69eab5e 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -50,6 +50,26 @@ namespace RAJA namespace hip { +//! Get the properties of the current device +RAJA_INLINE +hipDeviceProp_t get_device_prop() +{ + int device; + hipErrchk(hipGetDevice(&device)); + hipDeviceProp_t prop; + hipErrchk(hipGetDeviceProperties(&prop, device)); + return prop; +} + +//! 
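// ----------------------------------------------------------------------------
// A minimal usage sketch for the pointer-based desul wrappers in the hunk
// above: the public entry points now take plain T* (the volatile qualifier
// and const_cast are gone) and a policy tag as the first argument, matching
// the signatures shown. The helper name is hypothetical; assumes a CUDA build
// with RAJA atomics included.
template <typename T>
__device__ inline T example_fetch_add(T* counter, T value)
{
  // Forwards to desul::atomic_fetch_add when desul atomics are enabled.
  return RAJA::atomicAdd(RAJA::cuda_atomic{}, counter, value);
}
// ----------------------------------------------------------------------------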
Get a reference to a static cached copy of the current device properties. +// This caches a copy on first use to speedup later calls. +RAJA_INLINE +hipDeviceProp_t& device_prop() +{ + static thread_local hipDeviceProp_t prop = get_device_prop(); + return prop; +} + //! Allocator for pinned memory for use in basic_mempool struct PinnedAllocator { @@ -143,36 +163,22 @@ namespace detail //! struct containing data necessary to coordinate kernel launches with reducers struct hipInfo { + const void* func = nullptr; hip_dim_t gridDim{0, 0, 0}; hip_dim_t blockDim{0, 0, 0}; + size_t* dynamic_smem = nullptr; ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0,0)}; bool setup_reducers = false; +}; +struct hipStatusInfo : hipInfo { #if defined(RAJA_ENABLE_OPENMP) - hipInfo* thread_states = nullptr; omp::mutex lock; #endif }; -//! class that changes a value on construction then resets it at destruction -template -class SetterResetter -{ -public: - SetterResetter(T& val, T new_val) : m_val(val), m_old_val(val) - { - m_val = new_val; - } - SetterResetter(const SetterResetter&) = delete; - ~SetterResetter() { m_val = m_old_val; } - -private: - T& m_val; - T m_old_val; -}; - -extern hipInfo g_status; +extern hipStatusInfo g_status; -extern hipInfo tl_status; +extern hipStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif @@ -272,54 +278,94 @@ bool setupReducers() { return detail::tl_status.setup_reducers; } RAJA_INLINE hip_dim_t currentGridDim() { return detail::tl_status.gridDim; } +//! get grid size of current launch +RAJA_INLINE +hip_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x * + detail::tl_status.gridDim.y * + detail::tl_status.gridDim.z; } + //! get blockDim of current launch RAJA_INLINE hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; } +//! get block size of current launch +RAJA_INLINE +hip_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x * + detail::tl_status.blockDim.y * + detail::tl_status.blockDim.z; } + +//! get dynamic shared memory usage for current launch +RAJA_INLINE +size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; } + +//! get maximum dynamic shared memory for current launch +RAJA_INLINE +size_t maxDynamicShmem() +{ + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, detail::tl_status.func)); + return func_attr.maxDynamicSharedSizeBytes; +} + +constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits::max(); + +//! Allocate dynamic shared memory for current launch +// +// The first argument is a functional object that takes the maximum number of +// objects that can fit into the dynamic shared memory available and returns +// the number of objects to allocate. +// The second argument is the required alignment. +// +// Returns an offset into dynamic shared memory aligned to align on success, +// or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory +// takes the failure return path. +template < typename T, typename GetNFromMax > +RAJA_INLINE +size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T)) +{ + const size_t unaligned_shmem = *detail::tl_status.dynamic_smem; + const size_t align_offset = ((unaligned_shmem % align) != size_t(0)) + ? 
align - (unaligned_shmem % align) + : size_t(0); + const size_t aligned_shmem = unaligned_shmem + align_offset; + + const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem; + const size_t n_bytes = sizeof(T) * + std::forward(get_n_from_max)(max_shmem_bytes / sizeof(T)); + + if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) { + *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes; + return aligned_shmem; + } else { + return dynamic_smem_allocation_failure; + } +} + //! get resource for current launch RAJA_INLINE ::RAJA::resources::Hip currentResource() { return detail::tl_status.res; } //! create copy of loop_body that is setup for device execution +// +// Note: This is done to setup the Reducer and MultiReducer objects through +// their copy constructors. Both look at tl_status to setup per kernel launch +// resources. template RAJA_INLINE typename std::remove_reference::type make_launch_body( + const void* func, hip_dim_t gridDim, hip_dim_t blockDim, - size_t RAJA_UNUSED_ARG(dynamic_smem), + size_t& dynamic_smem, ::RAJA::resources::Hip res, LOOP_BODY&& loop_body) { - detail::SetterResetter setup_reducers_srer( - detail::tl_status.setup_reducers, true); - detail::SetterResetter<::RAJA::resources::Hip> res_srer( - detail::tl_status.res, res); - - detail::tl_status.gridDim = gridDim; - detail::tl_status.blockDim = blockDim; + ::RAJA::detail::ScopedAssignment info_sa(detail::tl_status, + detail::hipInfo{func, gridDim, blockDim, &dynamic_smem, res, true}); using return_type = typename std::remove_reference::type; return return_type(std::forward(loop_body)); } -//! Get the properties of the current device -RAJA_INLINE -hipDeviceProp_t get_device_prop() -{ - int device; - hipErrchk(hipGetDevice(&device)); - hipDeviceProp_t prop; - hipErrchk(hipGetDeviceProperties(&prop, device)); - return prop; -} - -//! 
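// ----------------------------------------------------------------------------
// A worked example of the offset round-up performed by allocateDynamicShmem
// above, with illustrative numbers: if 20 bytes of dynamic shared memory have
// already been handed out and the request needs 16-byte alignment, the next
// allocation starts at offset 32. The namespace is hypothetical.
namespace example_shmem_offset {
constexpr size_t unaligned = 20;  // bytes already handed out
constexpr size_t align     = 16;  // required alignment of the new request
constexpr size_t offset    =
    (unaligned % align) != size_t(0) ? align - (unaligned % align) : size_t(0);
static_assert(unaligned + offset == 32, "aligned start of the new allocation");
}
// ----------------------------------------------------------------------------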
Get a copy of the device properties, this copy is cached on first use to speedup later calls -RAJA_INLINE -hipDeviceProp_t& device_prop() -{ - static thread_local hipDeviceProp_t prop = get_device_prop(); - return prop; -} - static constexpr int hip_occupancy_uninitialized_int = -1; static constexpr size_t hip_occupancy_uninitialized_size_t = diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 389b24e35a..26d45d7bd9 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -291,7 +291,7 @@ struct WorkRunner< // // TODO: Privatize the loop_body, using make_launch_body to setup reductions // - // LOOP_BODY body = RAJA::hip::make_launch_body( + // LOOP_BODY body = RAJA::hip::make_launch_body(func, // gridSize, blockSize, shmem, stream, std::forward(loop_body)); storage.template emplace( diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp index e16f31bb5b..b4f0d7faa7 100644 --- a/include/RAJA/policy/hip/atomic.hpp +++ b/include/RAJA/policy/hip/atomic.hpp @@ -22,447 +22,691 @@ #if defined(RAJA_ENABLE_HIP) +#include #include #include #include "hip/hip_runtime.h" +#include "camp/list.hpp" + #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" #if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif -#include "RAJA/util/camp_aliases.hpp" -#include "RAJA/util/concepts.hpp" +#include "RAJA/util/EnableIf.hpp" #include "RAJA/util/Operators.hpp" #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" +// TODO: When we can use if constexpr in C++17, this file can be cleaned up namespace RAJA { + namespace detail { +using hip_atomicCommon_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long +>; + /*! - * Generic impementation of atomic 32-bit or 64-bit compare and swap primitive. - * Implementation uses the existing HIP supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the value that was stored before this operation. + * Type trait for determining if atomic operators should be implemented + * using builtin functions. This type trait can be used for a lot of atomic + * operators. More specific type traits are added when needed, such as + * hip_useBuiltinExchange below. 
*/ -RAJA_INLINE __device__ unsigned hip_atomic_CAS( - unsigned volatile *acc, - unsigned compare, - unsigned value) -{ - return ::atomicCAS((unsigned *)acc, compare, value); -} -/// -RAJA_INLINE __device__ unsigned long long hip_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) -{ - return ::atomicCAS((unsigned long long *)acc, compare, value); -} -/// template -RAJA_INLINE __device__ -typename std::enable_if::type -hip_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - hip_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} -/// -template -RAJA_INLINE __device__ -typename std::enable_if::type -hip_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - hip_atomic_CAS((unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} - -template -struct HipAtomicCAS { +struct hip_useBuiltinCommon { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value; }; -template <> -struct HipAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing HIP supplied unsigned 32-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 32-bit T - unsigned oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ((readback = hip_atomic_CAS((unsigned volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if atomic operators should be implemented + * by reinterpreting inputs to types that the builtin functions support. + * This type trait can be used for a lot of atomic operators. More specific + * type traits are added when needed, such as hip_useReinterpretExchange + * below. + */ +template +struct hip_useReinterpretCommon { + static constexpr bool value = + !hip_useBuiltinCommon::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; }; -template <> -struct HipAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing HIP supplied unsigned 64-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 64-bit T - unsigned long long oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ( - (readback = hip_atomic_CAS((unsigned long long volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } -}; + +/*! 
+ * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretCommon_t = typename hip_useReinterpretCommon::type; /*! - * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the existing HIP supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the OLD value that was replaced by the result of this operation. + * Performs an atomic bitwise or using a builtin function. Stores the new value + * in the given address and returns the old value. + * + * This overload using builtin functions is used to implement atomic loads + * under some build configurations. */ -template -RAJA_INLINE __device__ T hip_atomic_CAS_oper(T volatile *acc, OPER &&oper) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value) { - HipAtomicCAS cas; - return cas(acc, std::forward(oper)); + return ::atomicOr(acc, value); } -template < typename T, typename TypeList > -struct is_any_of; +/*! + * Type trait for determining if the exchange operator should be implemented + * using a builtin + */ +template +struct hip_useBuiltinExchange { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; -template < typename T, typename... Types > -struct is_any_of> - : concepts::any_of...> -{}; +/*! + * Type trait for determining if the exchange operator should be implemented + * by reinterpreting inputs to types that the builtin exchange supports + */ +template +struct hip_useReinterpretExchange { + static constexpr bool value = + !hip_useBuiltinExchange::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; +}; -template < typename T, typename TypeList > -using enable_if_is_any_of = std::enable_if_t::value, T>; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange::type; -template < typename T, typename TypeList > -using enable_if_is_none_of = std::enable_if_t>::value, T>; +/*! + * Performs an atomic exchange using a builtin function. Stores the new value + * in the given address and returns the old value. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value) +{ + return ::atomicExch(acc, value); +} +/*! + * Performs an atomic exchange using a reinterpret cast. Stores the new value + * in the given address and returns the old value. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value) +{ + using R = hip_useReinterpretExchange_t; -using hip_atomicCommon_builtin_types = list< - int - ,unsigned int - ,unsigned long long - >; + return RAJA::util::reinterp_A_as_B( + hip_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); +} -using hip_atomicAdd_builtin_types = list< - int - ,unsigned int - ,unsigned long long - ,float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double -#endif - >; +#if defined(__has_builtin) && \ + (__has_builtin(__hip_atomic_load) || __has_builtin(__hip_atomic_store)) /*! - * List of types where HIP builtin atomics are used to implement atomicSub. 
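// The traits above route every atomic either to a native builtin or to the same
// operation on a same-sized unsigned integer. The standalone, host-only analogue
// below (hypothetical names, not RAJA's) only demonstrates which path a few types
// take; the real hip_useBuiltinExchange / hip_useReinterpretExchange select the
// same way for these examples.
#include <type_traits>

template <typename T>
struct use_builtin_exchange
    : std::integral_constant<bool, std::is_same<T, int>::value ||
                                   std::is_same<T, unsigned int>::value ||
                                   std::is_same<T, unsigned long long>::value ||
                                   std::is_same<T, float>::value> {};

template <typename T>
struct use_reinterpret_exchange
    : std::integral_constant<bool, !use_builtin_exchange<T>::value &&
                                   (sizeof(T) == sizeof(unsigned int) ||
                                    sizeof(T) == sizeof(unsigned long long))> {
  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
                                  unsigned int, unsigned long long>;
};

// float has a native ::atomicExch, so it takes the builtin overload ...
static_assert(use_builtin_exchange<float>::value, "float uses the builtin path");
// ... while double is exchanged through a same-sized unsigned long long.
static_assert(use_reinterpret_exchange<double>::value, "double is reinterpreted");
static_assert(std::is_same<use_reinterpret_exchange<double>::type,
                           unsigned long long>::value, "double maps to ull");
// A 16-byte type matches neither trait, so no exchange overload exists for it.
struct TwoDoubles { double x, y; };
static_assert(!use_builtin_exchange<TwoDoubles>::value &&
              !use_reinterpret_exchange<TwoDoubles>::value, "no overload");

int main() { return 0; }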
+ * Type trait for determining if the operator should be implemented + * using an intrinsic */ -using hip_atomicSub_types = list< - int - ,unsigned int - ,float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double -#endif - >; +template +struct hip_useBuiltinLoad { + static constexpr bool value = + (std::is_integral::value || std::is_enum::value) && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); +}; + +template +using hip_useBuiltinStore = hip_useBuiltinLoad; -using hip_atomicSub_builtin_types = list< - int - ,unsigned int - >; /*! - * List of types where HIP builtin atomicAdd is used to implement atomicSub. - * - * Avoid multiple definition errors by including the previous list type here - * to ensure these lists have different types. + * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support */ -using hip_atomicSub_via_Add_builtin_types = list< - float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double +template +struct hip_useReinterpretLoad { + static constexpr bool value = + !std::is_integral::value && + !std::is_enum::value && + ((sizeof(T) == 1 +#if !defined(UINT8_MAX) + && sizeof(unsigned char) == 1 #endif - >; - -using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types; + ) || + (sizeof(T) == 2 +#if !defined(UINT16_MAX) + && sizeof(unsigned short) == 2 +#endif + ) || + (sizeof(T) == 4 +#if !defined(UINT32_MAX) + && sizeof(unsigned int) == 4 +#endif + ) || + (sizeof(T) == 8 +#if !defined(UINT64_MAX) + && sizeof(unsigned long long) == 8 +#endif + )); -using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types; + using type = + std::conditional_t>>; +#else + unsigned long long>>>; +#endif +}; -using hip_atomicIncReset_builtin_types = list< - unsigned int - >; +template +using hip_useReinterpretStore = hip_useReinterpretLoad; -using hip_atomicInc_builtin_types = list< >; +#else -using hip_atomicDecReset_builtin_types = list< - unsigned int - >; +template +using hip_useBuiltinLoad = hip_useBuiltinCommon; -using hip_atomicDec_builtin_types = list< >; +template +using hip_useBuiltinStore = hip_useBuiltinExchange; -using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretLoad = hip_useReinterpretCommon; -using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types; +template +using hip_useReinterpretStore = hip_useReinterpretExchange; -using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types; +#endif -using hip_atomicExch_builtin_types = list< - int - ,unsigned int - ,unsigned long long - ,float - >; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretLoad_t = typename hip_useReinterpretLoad::type; -using hip_atomicCAS_builtin_types = hip_atomicCommon_builtin_types; +template +using hip_useReinterpretStore_t = typename hip_useReinterpretStore::type; -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAdd(T volatile *acc, T value) +/*! 
+ * Atomic load + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicLoad(T *acc) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a + value; - }); +#if defined(__has_builtin) && __has_builtin(__hip_atomic_load) + return __hip_atomic_load(acc, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return hip_atomicOr(acc, static_cast(0)); +#endif } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAdd(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicLoad(T *acc) { - return ::atomicAdd((T *)acc, value); + using R = hip_useReinterpretLoad_t; + + return RAJA::util::reinterp_A_as_B( + hip_atomicLoad(reinterpret_cast(acc))); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +/*! + * Atomic store + */ +template ::value, bool> = true> +RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a - value; - }); +#if defined(__has_builtin) && __has_builtin(__hip_atomic_store) + __hip_atomic_store(acc, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + hip_atomicExchange(acc, value); +#endif +} + +template ::value, bool> = true> +RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value) +{ + using R = hip_useReinterpretStore_t; + + hip_atomicStore(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value)); } + /*! - * HIP atomicSub builtin implementation. + * Hip atomicCAS using builtin function + * + * Returns the old value in memory before this operation. */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value) { - return ::atomicSub((T *)acc, value); + return ::atomicCAS(acc, compare, value); } /*! - * HIP atomicSub via atomicAdd builtin implementation. + * Hip atomicCAS using reinterpret cast + * + * Returns the old value in memory before this operation. */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value) { - return ::atomicAdd((T *)acc, -value); + using R = hip_useReinterpretCommon_t; + + return RAJA::util::reinterp_A_as_B( + hip_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMin(T volatile *acc, T value) +/*! + * Equality comparison for compare and swap loop. Converts to the underlying + * integral type to avoid cases where the values will never compare equal + * (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return value < a ? value : a; - }); + return a == b; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMin(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b) { - return ::atomicMin((T *)acc, value); + using R = hip_useReinterpretCommon_t; + + return hip_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMax(T volatile *acc, T value) +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator. 
+ * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc, + Oper&& oper) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return value > a ? value : a; - }); + T old = hip_atomicLoad(acc); + T expected; + + do { + expected = old; + old = hip_atomicCAS(acc, expected, oper(expected)); + } while (!hip_atomicCAS_equal(old, expected)); + + return old; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMax(T volatile *acc, T value) + +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting. + * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc, + Oper&& oper, + ShortCircuit&& sc) { - return ::atomicMax((T *)acc, value); + T old = hip_atomicLoad(acc); + + if (sc(old)) { + return old; + } + + T expected; + + do { + expected = old; + old = hip_atomicCAS(acc, expected, oper(expected)); + } while (!hip_atomicCAS_equal(old, expected) && !sc(old)); + + return old; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc, T val) +/*! + * Atomic addition + */ + +/*! + * List of types where HIP builtin atomics are used to implement atomicAdd. + */ +using hip_atomicAdd_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T old) { - return ((old >= val) ? (T)0 : (old + (T)1)); + return hip_atomicCAS_loop(acc, [value] (T old) { + return old + value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value) { - return ::atomicInc((T *)acc, val); + return ::atomicAdd(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc) +/*! + * Atomic subtraction + */ + +/*! + * List of types where HIP builtin atomics are used to implement atomicSub. + */ +using hip_atomicSub_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +/*! + * List of types where HIP builtin atomicSub is used to implement atomicSub. + * + * Avoid multiple definition errors by including the previous list type here + * to ensure these lists have different types. + */ +using hip_atomicSub_via_Sub_builtin_types = ::camp::list< + int, + unsigned int +>; + +/*! + * List of types where HIP builtin atomicAdd is used to implement atomicSub. + * + * Avoid multiple definition errors by including the previous list type here + * to ensure these lists have different types. + */ +using hip_atomicSub_via_Add_builtin_types = ::camp::list< + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +/*! + * HIP atomicSub compare and swap loop implementation. + */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) +{ + return hip_atomicCAS_loop(acc, [value] (T old) { + return old - value; + }); +} + +/*! + * HIP atomicSub builtin implementation. 
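// hip_atomicCAS_loop also makes it easy to express operators that have no
// hardware builtin at all. As a sketch only (RAJA does not provide an atomic
// multiply; the name hip_atomicMul is hypothetical), a new operator would follow
// the same shape as the CAS fallback of hip_atomicAdd above:
template <typename T>
RAJA_INLINE __device__ T hip_atomicMul(T *acc, T value)
{
  return hip_atomicCAS_loop(acc, [value] (T old) {
    return old * value;
  });
}
// Types with no same-sized unsigned counterpart find no usable hip_atomicCAS
// overload, so misuse is caught at compile time.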
+ */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) { - return hip_atomicAdd(acc, (T)1); + return ::atomicSub(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc) +/*! + * HIP atomicSub via atomicAdd builtin implementation. + */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) { - return ::atomicInc((T *)acc); + return ::atomicAdd(acc, -value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc, T val) +/*! + * Atomic minimum + */ +using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value) { - // See: - // http://docs.nvidia.com/hip/hip-c-programming-guide/index.html#atomicdec - return hip_atomic_CAS_oper(acc, [=] __device__(T old) { - return (((old == (T)0) | (old > val)) ? val : (old - (T)1)); - }); + return hip_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value) { - return ::atomicDec((T *)acc, val); + return ::atomicMin(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc) +/*! + * Atomic maximum + */ +using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value) { - return hip_atomicSub(acc, (T)1); + return hip_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? value : old; + }, + [value] (T current) { + return value <= current; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value) { - return ::atomicDec((T *)acc); + return ::atomicMax(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAnd(T volatile *acc, T val) +/*! + * Atomic increment with reset + */ +template +RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a & val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? static_cast(0) : old + static_cast(1); }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAnd(T volatile *acc, T val) + +/*! + * Atomic increment (implemented in terms of atomic addition) + */ +template +RAJA_INLINE __device__ T hip_atomicInc(T *acc) { - return ::atomicAnd((T *)acc, val); + return hip_atomicAdd(acc, static_cast(1)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicOr(T volatile *acc, T val) +/*! + * Atomic decrement with reset + */ +template +RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a | val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicOr(T volatile *acc, T val) + +/*! + * Atomic decrement (implemented in terms of atomic subtraction) + */ +template +RAJA_INLINE __device__ T hip_atomicDec(T *acc) { - return ::atomicOr((T *)acc, val); + return hip_atomicSub(acc, static_cast(1)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicXor(T volatile *acc, T val) +/*! 
+ * Atomic and + */ +using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a ^ val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old & value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicXor(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value) { - return ::atomicXor((T *)acc, val); + return ::atomicAnd(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicExchange(T volatile *acc, T val) +/*! + * Atomic or + */ +using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T) { - return val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old | value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicExchange(T volatile *acc, T val) -{ - return ::atomicExch((T *)acc, val); -} +/*! + * Atomic or via builtin functions was implemented much earlier since atomicLoad + * may depend on it. + */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicCAS(T volatile *acc, T compare, T val) +/*! + * Atomic xor + */ +using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value) { - return hip_atomic_CAS(acc, compare, val); + return hip_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicCAS( T volatile *acc, T compare, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value) { - return ::atomicCAS((T *)acc, compare, val); + return ::atomicXor(acc, value); } + } // namespace detail @@ -474,10 +718,35 @@ RAJA_INLINE __device__ T hip_atomicCAS( T volatile *acc, T compare, T val) * * These are atomic in hip device code and non-atomic otherwise */ + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicLoad(hip_atomic_explicit, T *acc) +{ +#if defined(__HIP_DEVICE_COMPILE__) + return detail::hip_atomicLoad(acc); +#else + return RAJA::atomicLoad(host_policy{}, acc); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void +atomicStore(hip_atomic_explicit, T *acc, T value) +{ +#if defined(__HIP_DEVICE_COMPILE__) + detail::hip_atomicStore(acc, value); +#else + RAJA::atomicStore(host_policy{}, acc, value); +#endif +} + RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAdd(hip_atomic_explicit, T volatile *acc, T value) +atomicAdd(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicAdd(acc, value); @@ -489,7 +758,7 @@ atomicAdd(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicSub(hip_atomic_explicit, T volatile *acc, T value) +atomicSub(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicSub(acc, value); @@ -501,7 +770,7 @@ atomicSub(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMin(hip_atomic_explicit, T volatile *acc, T value) +atomicMin(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicMin(acc, value); @@ -513,7 +782,7 @@ 
atomicMin(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMax(hip_atomic_explicit, T volatile *acc, T value) +atomicMax(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicMax(acc, value); @@ -525,19 +794,19 @@ atomicMax(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(hip_atomic_explicit, T volatile *acc, T val) +atomicInc(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) - return detail::hip_atomicInc(acc, val); + return detail::hip_atomicInc(acc, value); #else - return RAJA::atomicInc(host_policy{}, acc, val); + return RAJA::atomicInc(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(hip_atomic_explicit, T volatile *acc) +atomicInc(hip_atomic_explicit, T *acc) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicInc(acc); @@ -549,19 +818,19 @@ atomicInc(hip_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(hip_atomic_explicit, T volatile *acc, T val) +atomicDec(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) - return detail::hip_atomicDec(acc, val); + return detail::hip_atomicDec(acc, value); #else - return RAJA::atomicDec(host_policy{}, acc, val); + return RAJA::atomicDec(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(hip_atomic_explicit, T volatile *acc) +atomicDec(hip_atomic_explicit, T *acc) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicDec(acc); @@ -573,7 +842,7 @@ atomicDec(hip_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAnd(hip_atomic_explicit, T volatile *acc, T value) +atomicAnd(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicAnd(acc, value); @@ -585,7 +854,7 @@ atomicAnd(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicOr(hip_atomic_explicit, T volatile *acc, T value) +atomicOr(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicOr(acc, value); @@ -597,7 +866,7 @@ atomicOr(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicXor(hip_atomic_explicit, T volatile *acc, T value) +atomicXor(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicXor(acc, value); @@ -609,7 +878,7 @@ atomicXor(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicExchange(hip_atomic_explicit, T volatile *acc, T value) +atomicExchange(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicExchange(acc, value); @@ -621,7 +890,7 @@ atomicExchange(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(hip_atomic_explicit, T volatile *acc, T compare, T value) +atomicCAS(hip_atomic_explicit, T *acc, T compare, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicCAS(acc, compare, value); diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6fa21f9217..a8c4cf53b9 100644 --- a/include/RAJA/policy/hip/forall.hpp 
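// Caller-facing view of the wrappers above: the hip_atomic_explicit overloads now
// take plain T* instead of T volatile*, but user code is unchanged. A minimal
// sketch, assuming RAJA's public forall/atomic interface (RAJA::hip_atomic,
// RAJA::hip_exec, RAJA::RangeSegment) and that d_bins/d_hist point to
// device-accessible memory; the function name and block size are illustrative only.
#include "RAJA/RAJA.hpp"

void histogram(const int* d_bins, double* d_hist, int n)
{
  RAJA::forall<RAJA::hip_exec<256>>(RAJA::RangeSegment(0, n),
    [=] RAJA_DEVICE (int i) {
      // Dispatches to detail::hip_atomicAdd when compiled for the device and to
      // the host policy otherwise, per the __HIP_DEVICE_COMPILE__ switch above.
      RAJA::atomicAdd<RAJA::hip_atomic>(&d_hist[d_bins[i]], 1.0);
    });
}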
+++ b/include/RAJA/policy/hip/forall.hpp @@ -560,7 +560,7 @@ forall_impl(resources::Hip hip_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::hip::make_launch_body( + LOOP_BODY body = RAJA::hip::make_launch_body(func, dims.blocks, dims.threads, shmem, hip_res, std::forward(loop_body)); // @@ -610,7 +610,8 @@ forall_impl(resources::Hip hip_res, if (len > 0) { auto func = reinterpret_cast( - &impl::forallp_hip_kernel>); + &impl::forallp_hip_kernel>); // // Setup shared memory buffers @@ -636,7 +637,7 @@ forall_impl(resources::Hip hip_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::hip::make_launch_body( + LOOP_BODY body = RAJA::hip::make_launch_body(func, dims.blocks, dims.threads, shmem, hip_res, std::forward(loop_body)); // diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index 354e5d7278..c72a0b5c4f 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -233,10 +233,10 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) T temp = val; - if (numThreads % policy::hip::WARP_SIZE == 0) { + if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -244,7 +244,7 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) } else { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -269,7 +269,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) { T temp = val; - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -287,15 +287,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; + int warpId = threadId % policy::hip::device_constants.WARP_SIZE; + int warpNum = threadId / policy::hip::device_constants.WARP_SIZE; T temp = val; - if (numThreads % policy::hip::WARP_SIZE == 0) { + if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -303,7 +303,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -314,14 +314,14 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { + if (numThreads > policy::hip::device_constants.WARP_SIZE) { - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - 
"Max Warps must be less than or equal to Warp Size for this algorithm to work"); + static_assert(policy::hip::device_constants.MAX_WARPS <= policy::hip::device_constants.WARP_SIZE, + "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values"); - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -333,13 +333,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { + if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 848ea42edf..39e7104c16 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -283,7 +283,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static @@ -312,7 +312,7 @@ struct HipStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::hip::WARP_SIZE; + const diff_t len = RAJA::policy::hip::device_constants.WARP_SIZE; // request one thread per element in the segment set_hip_dim(dims.dims.threads, len); @@ -352,7 +352,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static @@ -391,7 +391,7 @@ struct HipStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::hip::WARP_SIZE; + const diff_t len = RAJA::policy::hip::device_constants.WARP_SIZE; // request one thread per element in the segment set_hip_dim(dims.dims.threads, len); diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 014b4db3ac..ba6642f248 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -273,7 +273,7 @@ struct HipStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static inline RAJA_DEVICE @@ -332,7 +332,7 @@ struct HipStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static inline RAJA_DEVICE diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 68156600b2..1ed7740008 100644 --- 
a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -216,7 +216,7 @@ struct HipKernelLauncherGetter using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return &internal::HipKernelLauncherFixed; } }; @@ -230,7 +230,7 @@ struct HipKernelLauncherGetter<0, Data, executor_t> using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncher; + return &internal::HipKernelLauncher; } }; @@ -260,10 +260,15 @@ struct HipLaunchHelper,Stmt using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; + inline static const void* get_func() + { + return reinterpret_cast(kernelGetter_t::get()); + } + inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -342,7 +347,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -379,17 +384,6 @@ struct HipLaunchHelper,Stmt } } - - static void launch(Data &&data, - internal::LaunchDims launch_dims, - size_t shmem, - RAJA::resources::Hip res) - { - auto func = kernelGetter_t::get(); - - void *args[] = {(void*)&data}; - RAJA::hip::launch((const void*)func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, async); - } }; /*! @@ -571,17 +565,23 @@ struct StatementExecutor< } { + auto func = launch_t::get_func(); + // // Privatize the LoopData, using make_launch_body to setup reductions // - auto hip_data = RAJA::hip::make_launch_body( + // Note that there is a circular dependency between the previous setup + // of the launch_dims and potential changes to shmem here that is + // currently an unresolved issue. 
+ // + auto hip_data = RAJA::hip::make_launch_body(func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data); - // - // Launch the kernels + // Launch the kernel // - launch_t::launch(std::move(hip_data), launch_dims, shmem, res); + void *args[] = {(void*)&hip_data}; + RAJA::hip::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async); } } } diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 76f592d20b..6823647b48 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -75,7 +75,8 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = reinterpret_cast( + &launch_global_fcn); resources::Hip hip_res = res.get(); @@ -99,17 +100,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); } RAJA_FT_END; @@ -129,7 +132,8 @@ struct LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -152,9 +156,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } @@ -235,7 +241,8 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = reinterpret_cast( + &launch_global_fcn_fixed); resources::Hip hip_res = res.get(); @@ -259,17 +266,18 @@ struct LaunchExecute> { RAJA_FT_BEGIN; { + size_t shared_mem_size = params.shared_mem_size; // // Privatize the loop_body, using make_launch_body to setup reductions // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, params.shared_mem_size, hip_res, std::forward(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); } RAJA_FT_END; @@ -288,7 +296,8 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = reinterpret_cast(launch_new_reduce_global_fcn_fixed >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); @@ -311,9 +320,11 @@ struct LaunchExecute> { RAJA_FT_BEGIN; + size_t shared_mem_size = launch_params.shared_mem_size; RAJA::hip::detail::hipInfo launch_info; launch_info.gridDim = gridSize; launch_info.blockDim = blockSize; + launch_info.dynamic_smem = &shared_mem_size; launch_info.res = hip_res; { @@ -323,14 +334,14 @@ struct LaunchExecute> { // // Privatize the 
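// Taken together, the launch-site changes in this patch settle on one sequence:
// resolve the kernel function pointer first, pass it and a mutable shared-memory
// size into make_launch_body so reducers and multi-reducers can inspect the kernel
// (maxDynamicShmem) and grow the dynamic shared memory request, then launch with
// the possibly-updated size. Condensed from the hunks above (error handling and
// template arguments elided):
//
//   const void* func = /* launch_t::get_func(), or &launch_global_fcn<BODY>, ... */;
//   size_t shared_mem_size = params.shared_mem_size;
//
//   BODY body = RAJA::hip::make_launch_body(
//       func, gridSize, blockSize, shared_mem_size, hip_res,
//       std::forward<BODY_IN>(body_in));
//
//   void* args[] = {(void*)&body};
//   RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
//                     hip_res, async, kernel_name);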
loop_body, using make_launch_body to setup reductions // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, launch_params.shared_mem_size, hip_res, std::forward(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp new file mode 100644 index 0000000000..0d9d3899d8 --- /dev/null +++ b/include/RAJA/policy/hip/multi_reduce.hpp @@ -0,0 +1,764 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_multi_reduce_HPP +#define RAJA_hip_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include +#include +#include +#include + +#include "hip/hip_runtime.h" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/mutex.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/OffsetOperators.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + #include "RAJA/policy/desul/atomic.hpp" +#else + #include "RAJA/policy/hip/atomic.hpp" +#endif + +#include "RAJA/policy/hip/policy.hpp" +#include "RAJA/policy/hip/raja_hiperrchk.hpp" + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction algorithms. +// +////////////////////////////////////////////////////////////////////// +// + +//! combine value into global memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins), + T identity, + int bin, + T value, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + if (value == identity) { return; } + + int tally_index = GetTallyIndex::template index(); // globalWarpId by default + int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::hip::atomic{}(tally_mem[tally_offset], value); +} + + +//! 
initialize shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins, + T identity, + T* shared_mem, + int shared_replication) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + for (int shmem_offset = threadId; + shmem_offset < shared_replication * num_bins; + shmem_offset += numThreads) { + shared_mem[shmem_offset] = identity; + } + __syncthreads(); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins, + T identity, + int bin, + T value, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication) +{ + if (value == identity) { return; } + + int shared_index = GetSharedIndex::template index(); // threadId by default + int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication); + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + + RAJA::reduce::hip::atomic{}(shared_mem[shmem_offset], value); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins, + T identity, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + __syncthreads(); + for (int bin = threadId; bin < num_bins; bin += numThreads) { + + T value = identity; + for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) { + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + Combiner{}(value, shared_mem[shmem_offset]); + } + + if (value != identity) { + int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::hip::atomic{}(tally_mem[tally_offset], value); + } + + } +} + +} // namespace impl + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction classes. +// +////////////////////////////////////////////////////////////////////// +// + +//! MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_TallyData +{ + //! 
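// The routines above implement a two-level combine: each value is first folded
// atomically into one of several replicated copies of its bin in shared memory,
// and each block then folds those copies once into a replicated global tally.
// The host-side analogue below (plain C++, not RAJA code) shows the data flow;
// the real offset layout comes from the tuning's OffsetCalculator, while a simple
// bin-major layout is assumed here for illustration.
#include <iostream>
#include <vector>

int main()
{
  const int num_bins = 3, shared_replication = 4;
  std::vector<double> shared(num_bins * shared_replication, 0.0);  // init to identity
  std::vector<double> tally(num_bins, 0.0);

  // "combine": thread t adds value v to bin b using replica (t % replication),
  // which spreads atomic traffic across the replicas.
  auto combine = [&](int t, int b, double v) {
    shared[b * shared_replication + (t % shared_replication)] += v;  // atomic on device
  };
  for (int t = 0; t < 64; ++t) { combine(t, t % num_bins, 1.0); }

  // "finalize": fold each bin's replicas, then push the result to the global tally.
  for (int b = 0; b < num_bins; ++b) {
    for (int r = 0; r < shared_replication; ++r) {
      tally[b] += shared[b * shared_replication + r];                // atomic on device
    }
  }

  for (int b = 0; b < num_bins; ++b) {
    std::cout << "bin " << b << ": " << tally[b] << "\n";            // prints 22, 21, 21
  }
}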
setup permanent settings, allocate and initialize tally memory + template < typename Container > + MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity) + : m_tally_mem(nullptr) + , m_identity(identity) + , m_num_bins(container.size()) + , m_tally_bins(get_tally_bins(m_num_bins)) + , m_tally_replication(get_tally_replication()) + { + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } + + MultiReduceGridAtomicHostInit_TallyData() = delete; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + ~MultiReduceGridAtomicHostInit_TallyData() = default; + + + //! reset permanent settings, reallocate and reset tally memory + template < typename Container > + void reset_permanent(Container const& container, T const& identity) + { + int new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + teardown_permanent(); + m_num_bins = new_num_bins; + m_tally_bins = get_tally_bins(m_num_bins); + m_tally_replication = get_tally_replication(); + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } else { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value; + ++bin; + } + } + for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) { + for (int bin = 0; bin < m_num_bins; ++bin) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity; + } + } + } + m_identity = identity; + } + + //! teardown permanent settings, free tally memory + void teardown_permanent() + { + destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication); + } + + + //! 
get value for bin, assumes synchronization occurred elsewhere + T get(int bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) { + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + reducer.combine(m_tally_mem[tally_offset]); + } + return reducer.get_and_clear(); + } + + + int num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + +private: + static constexpr size_t s_tally_alignment = std::max(size_t(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE), + size_t(RAJA::DATA_ALIGN)); + static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T)); + + using tally_mempool_type = device_pinned_mempool_type; + using tally_tuning = typename tuning::GlobalAtomicReplicationTuning; + using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer; + using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator; + using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch; + + + static int get_tally_bins(int num_bins) + { + return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size; + } + + static int get_tally_replication() + { + int min_tally_replication = 1; +#if defined(RAJA_ENABLE_OPENMP) + min_tally_replication = omp_get_max_threads(); +#endif + + struct { + int func_min_global_replication; + } func_data{min_tally_replication}; + + return TallyAtomicReplicationConcretizer{}.template + get_global_replication(func_data); + } + + template < typename Container > + static T* create_tally(Container const& container, T const& identity, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + T* tally_mem = tally_mempool_type::getInstance().template malloc( + tally_replication*tally_bins, s_tally_alignment); + + if (tally_replication > 0) { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(value); + ++bin; + } + } + for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) { + for (int bin = 0; bin < num_bins; ++bin) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(identity); + } + } + } + return tally_mem; + } + + static void destroy_tally(T*& tally_mem, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return; + } + + for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) { + for (int bin = num_bins; bin > 0; --bin) { + int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication); + tally_mem[tally_offset].~T(); + } + } + tally_mempool_type::getInstance().free(tally_mem); + tally_mem = nullptr; + } + +protected: + using GetTallyIndex = typename tally_tuning::ReplicationIndexer; + using GetTallyOffset = typename GetTallyOffset_rebind::template rebind; + + T* m_tally_mem; + T m_identity; + int m_num_bins; + int m_tally_bins; + int m_tally_replication; // power of 2, at least the max number of omp threads +}; + + +//! 
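// Worked example of the tally sizing used above. The concrete alignment is
// platform-dependent (s_tally_alignment is the larger of the atomic destructive
// interference size and RAJA::DATA_ALIGN); 128 bytes and 4 tally replicas are
// assumptions made only for this illustration.
#include <cstddef>
#include <iostream>

constexpr std::size_t divide_ceiling(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

int main()
{
  constexpr std::size_t tally_alignment = 128;                                        // assumed
  constexpr std::size_t bunch_size = divide_ceiling(tally_alignment, sizeof(double)); // 16
  constexpr std::size_t num_bins = 10;
  constexpr std::size_t tally_bins = divide_ceiling(num_bins, bunch_size) * bunch_size; // 16
  constexpr std::size_t tally_replication = 4;  // e.g. next power of 2 >= omp thread count

  // Rounding the bin count up to whole "bunches" keeps each replica's bins in
  // fully aligned blocks, so different replicas do not false-share.
  std::cout << tally_replication * tally_bins << " doubles in the tally\n";           // 64
}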
MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! defer to tally data for some functions + using TallyData::TallyData; + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! setup per launch, do nothing + void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) + { } + + //! teardown per launch, do nothing + void teardown_launch() + { } + + + //! setup on device, do nothing + RAJA_DEVICE + void setup_device() + { } + + //! finalize on device, do nothing + RAJA_DEVICE + void finalize_device() + { } + + + //! combine value on device, combine a value into the tally atomically + RAJA_DEVICE + void combine_device(int bin, T value) + { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + + //! combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; +}; + + +//! MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceBlockThenGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! setup permanent settings, defer to tally data + template < typename Container > + MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity) + : TallyData(container, identity) + , m_shared_offset(s_shared_offset_unknown) + , m_shared_replication(0) + { } + + MultiReduceBlockThenGridAtomicHostInit_Data() = delete; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + ~MultiReduceBlockThenGridAtomicHostInit_Data() = default; + + + //! defer to tally data for some functions + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! 
setup per launch, setup shared memory parameters + void setup_launch(size_t block_size) + { + if (m_num_bins == size_t(0)) { + m_shared_offset = s_shared_offset_invalid; + return; + } + + size_t shared_replication = 0; + const size_t shared_offset = allocateDynamicShmem( + [&](size_t max_shmem_size) { + + struct { + size_t func_threads_per_block; + size_t func_max_shared_replication_per_block; + } func_data{block_size, max_shmem_size / m_num_bins}; + + shared_replication = SharedAtomicReplicationConcretizer{}.template + get_shared_replication(func_data); + return m_num_bins * shared_replication; + }); + + if (shared_offset != dynamic_smem_allocation_failure) { + m_shared_replication = static_cast(shared_replication); + m_shared_offset = static_cast(shared_offset); + } else { + m_shared_offset = s_shared_offset_invalid; + } + } + + //! teardown per launch, unset shared memory parameters + void teardown_launch() + { + m_shared_replication = 0; + m_shared_offset = s_shared_offset_unknown; + } + + + //! setup on device, initialize shared memory + RAJA_DEVICE + void setup_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_init_shmem( + m_num_bins, m_identity, + shared_mem, m_shared_replication); + } + } + + //! finalize on device, combine values in shared memory into the tally + RAJA_DEVICE + void finalize_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::grid_multi_reduce_shmem_to_global_atomic( + m_num_bins, m_identity, + shared_mem, GetSharedOffset{}, m_shared_replication, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + + //! combine value on device, combine a value into shared memory + RAJA_DEVICE + void combine_device(int bin, T value) + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_combine_shmem_atomic( + m_num_bins, m_identity, + bin, value, + shared_mem, GetSharedOffset{}, m_shared_replication); + } else { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + //! 
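setup_launch above sizes the per-block shared-memory replication by asking how many copies of the bins fit in the dynamic shared memory budget, then clamping to a power of two (falling back to global atomics when nothing fits). A standalone sketch of that sizing logic follows; prev_pow2_sketch and pick_shared_replication are hypothetical helpers, and the numbers in main are illustrative only.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Largest power of two <= n; a stand-in for the prev_pow2 used by the
// shared-replication concretizers.
static std::size_t prev_pow2_sketch(std::size_t n) {
  std::size_t p = 1;
  while (p * 2 <= n) { p *= 2; }
  return p;
}

// Replication per block is capped by how many copies of the bins fit in the
// available dynamic shared memory; 0 means "fall back to global atomics".
static std::size_t pick_shared_replication(std::size_t num_bins,
                                           std::size_t bytes_per_bin,
                                           std::size_t max_shmem_bytes,
                                           std::size_t preferred_replication) {
  std::size_t max_rep = max_shmem_bytes / (num_bins * bytes_per_bin);
  std::size_t rep = std::min(preferred_replication, max_rep);
  return rep ? prev_pow2_sketch(rep) : 0;
}

int main() {
  // e.g. 10 bins of double, a 64 KiB shared memory budget, prefer 4 replicas
  std::printf("%zu\n",
              pick_shared_replication(10, sizeof(double), 64 * 1024, 4));  // 4
  return 0;
}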
combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using shared_tuning = typename tuning::SharedAtomicReplicationTuning; + using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer; + using GetSharedIndex = typename shared_tuning::ReplicationIndexer; + using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator; + using GetSharedOffset = typename GetSharedOffset_rebind::template rebind; + + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + + static constexpr int s_shared_offset_unknown = std::numeric_limits::max(); + static constexpr int s_shared_offset_invalid = std::numeric_limits::max() - 1; + + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; + + int m_shared_offset; // in bytes + int m_shared_replication; // power of 2 + + + RAJA_DEVICE + T* get_shared_mem() const + { + if (m_shared_offset == s_shared_offset_invalid) { + return nullptr; + } + extern __shared__ char shared_mem[]; + return reinterpret_cast(&shared_mem[m_shared_offset]); + } +}; + + +/*! + ************************************************************************** + * + * \brief Hip multi-reduce data class template. + * + * This class manages synchronization, data lifetimes, and interaction with + * the runtime kernel launch info passing facilities. + * + * This class manages the lifetime of underlying reduce_data_type using + * calls to setup and teardown methods. This includes storage durations: + * - permanent, the lifetime of the parent object + * - launch, setup before a launch using the launch parameters and + * teardown after the launch + * - device, setup all device threads in a kernel before any block work and + * teardown all device threads after all block work is finished + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataHip +{ + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = + std::conditional_t<(atomic_available), + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic), + hip::MultiReduceBlockThenGridAtomicHostInit_Data, + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic), + hip::MultiReduceGridAtomicHostInit_Data, + void>>, + void>; + + + using SyncList = std::vector; + +public: + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataHip() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataHip(Container const& container, T identity) + : m_parent(this) + , m_sync_list(new SyncList) + , m_data(container, identity) + , m_own_launch_data(false) + { + } + + //! copy and on host attempt to setup for device + // init val_ptr to avoid uninitialized read caused by host copy of + // reducer in host device lambda not being used on device. 
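The class comment above distinguishes permanent, launch, and device storage durations. The tiny sketch below only illustrates the order in which those hooks are invoked around a launch; DemoData is a hypothetical type with the same hook names, not the RAJA reduce_data_type.

#include <cstddef>
#include <cstdio>

struct DemoData {
  void setup_launch(std::size_t block_size) { std::printf("setup_launch(%zu)\n", block_size); }
  void teardown_launch() { std::printf("teardown_launch()\n"); }
  // Device-duration hooks (setup_device/finalize_device) would run inside the
  // kernel, per thread, before and after any block-level work.
};

int main() {
  DemoData d;           // permanent duration: lifetime of the parent object
  d.setup_launch(256);  // launch duration: before the kernel launch
  // ... kernel runs; device threads would call combine_device(bin, value) ...
  d.teardown_launch();  // launch duration: after the launch completes
  return 0;
}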
+ RAJA_HOST_DEVICE + MultiReduceDataHip(MultiReduceDataHip const& other) +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + : m_parent(other.m_parent) +#else + : m_parent(&other) +#endif + , m_sync_list(other.m_sync_list) + , m_data(other.m_data) + , m_own_launch_data(false) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent) { + if (setupReducers()) { + // the copy made in make_launch_body does this setup + add_resource_to_synchronization_list(currentResource()); + m_data.setup_launch(currentBlockSize()); + m_own_launch_data = true; + m_parent = nullptr; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device enters this branch + m_data.setup_device(); + } +#endif + } + + MultiReduceDataHip(MultiReduceDataHip &&) = delete; + MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete; + MultiReduceDataHip& operator=(MultiReduceDataHip &&) = delete; + + //! cleanup resources owned by this copy + // on device store in pinned buffer on host + RAJA_HOST_DEVICE + ~MultiReduceDataHip() + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent == this) { + // the original object, owns permanent storage + synchronize_resources_and_clear_list(); + delete m_sync_list; + m_sync_list = nullptr; + m_data.teardown_permanent(); + } else if (m_parent) { + // do nothing + } else { + if (m_own_launch_data) { + // the copy made in make_launch_body, owns launch data + m_data.teardown_launch(); + m_own_launch_data = false; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device, does finalization on the device + m_data.finalize_device(); + } +#endif + } + + + template < typename Container > + void reset(Container const& container, T identity) + { + synchronize_resources_and_clear_list(); + m_data.reset_permanent(container, identity); + } + + + //! apply reduction (const version) -- still combines internal values + RAJA_HOST_DEVICE + void combine(int bin, T const& value) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + m_data.combine_host(bin, value); +#else + m_data.combine_device(bin, value); +#endif + } + + + //! 
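The copy constructor and destructor above use a parent pointer to decide who owns what: the original object owns permanent storage, a designated host copy owns launch data, and later copies own nothing. Here is a simplified host-only sketch of that ownership protocol; Tracked is a hypothetical type and the rule it uses ("first copy of the original takes launch ownership") is a simplification of the actual setupReducers()/make_launch_body logic.

#include <cstdio>

struct Tracked {
  const Tracked* parent;
  bool owns_launch_data = false;

  Tracked() : parent(this) {}                        // original: owns permanent data
  Tracked(const Tracked& other) : parent(other.parent) {
    if (parent == &other) {
      owns_launch_data = true;                       // first copy: sets up launch data
    }
  }
  ~Tracked() {
    if (parent == this) {
      std::puts("original: teardown permanent storage");
    } else if (owns_launch_data) {
      std::puts("launch copy: teardown launch data");
    }
  }
};

int main() {
  Tracked original;
  Tracked launch_copy(original);     // owns launch data
  Tracked worker_copy(launch_copy);  // owns nothing
  (void)worker_copy;
  return 0;
}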
map result value back to host if not done already; return aggregate value + T get(int bin) + { + synchronize_resources_and_clear_list(); + return m_data.get(bin); + } + + + size_t num_bins() const { return m_data.num_bins(); } + + T identity() const { return m_data.identity(); } + + +private: + MultiReduceDataHip const *m_parent; + SyncList* m_sync_list; + reduce_data_type m_data; + bool m_own_launch_data; + + void add_resource_to_synchronization_list(resources::Hip res) + { + for (resources::Hip& list_res : *m_sync_list) { + if (list_res.get_stream() == res.get_stream()) { + return; + } + } + m_sync_list->emplace_back(res); + } + + void synchronize_resources_and_clear_list() + { + for (resources::Hip& list_res : *m_sync_list) { + ::RAJA::hip::synchronize(list_res); + } + m_sync_list->clear(); + } +}; + +} // end namespace hip + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy, hip::MultiReduceDataHip) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp new file mode 100644 index 0000000000..30269f8406 --- /dev/null +++ b/include/RAJA/policy/hip/params/kernel_name.hpp @@ -0,0 +1,52 @@ +#ifndef HIP_KERNELNAME_HPP +#define HIP_KERNELNAME_HPP + +#if defined(RAJA_HIP_ACTIVE) + +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/pattern/params/kernel_name.hpp" + +#if defined(RAJA_ENABLE_ROCTX) +#include "hip/hip_runtime_api.h" +#include "roctx.h" +#endif + +namespace RAJA { +namespace expt { +namespace detail { + + // Init + template + camp::concepts::enable_if< type_traits::is_hip_policy > + init(KernelName& kn, const RAJA::hip::detail::hipInfo &) + { +#if defined(RAJA_ENABLE_ROCTX) + roctxRangePush(kn.name); +#else + RAJA_UNUSED_VAR(kn); +#endif + } + + // Combine + template + RAJA_HOST_DEVICE + camp::concepts::enable_if< type_traits::is_hip_policy > + combine(KernelName&) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_hip_policy > + resolve(KernelName&, const RAJA::hip::detail::hipInfo &) + { +#if defined(RAJA_ENABLE_ROCTX) + roctxRangePop(); +#endif + } + +} // namespace detail +} // namespace expt +} // namespace RAJA + +#endif + +#endif // NEW_REDUCE_HIP_REDUCE_HPP diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c359a68de0..a9f9027675 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -31,7 +31,9 @@ #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/util/Operators.hpp" +#include "RAJA/util/OffsetOperators.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -74,6 +76,13 @@ struct IndexGlobal; template struct IndexFlatten; +template +struct IndexDivide; + +template +struct IndexModulo; + + /*! * Use the max occupancy of a kernel on the current device when launch * parameters are not fully determined. @@ -155,6 +164,84 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer }; +/*! + * Get an amount of replication that is preferred_replication. + */ +template < size_t preferred_replication > +struct ConstantPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data)) + { + return IdxT(preferred_replication); + } +}; + +/*! 
+ * Get an amount of replication that is preferred_replication_before_cutoff if + * data.func_threads_per_block is less than t_cutoff or + * preferred_replication_after_cutoff otherwise. + */ +template < size_t t_cutoff, size_t preferred_replication_before_cutoff, + size_t preferred_replication_after_cutoff > +struct ThreadsPerBlockCutoffPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& data) + { + IdxT cutoff = t_cutoff; + IdxT func_threads_per_block = data.func_threads_per_block; + + if (func_threads_per_block < cutoff) { + return IdxT(preferred_replication_before_cutoff); + } else { + return IdxT(preferred_replication_after_cutoff); + } + } +}; + +/*! + * Get an amount of shared atomic replication that is a power of 2 that is at + * most the amount given by data.func_max_shared_replication_per_block or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct SharedAtomicReplicationMaxPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_shared_replication(Data const& data) + { + IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return prev_pow2(std::min(preferred_replication, + func_max_shared_replication_per_block)); + } +}; + +/*! + * Get an amount of global atomic replication that is a power of 2 that is at + * least the amount given by data.func_min_global_replication or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct GlobalAtomicReplicationMinPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_global_replication(Data const& data) + { + IdxT func_min_global_replication = data.func_min_global_replication; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return next_pow2(std::max(preferred_replication, func_min_global_replication)); + } +}; + + enum struct reduce_algorithm : int { combine_last_block, @@ -176,6 +263,36 @@ struct ReduceTuning static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool consistent = + (algorithm == reduce_algorithm::combine_last_block); +}; + + +enum struct multi_reduce_algorithm : int +{ + init_host_combine_block_atomic_then_grid_atomic, + init_host_combine_global_atomic +}; + +template < typename t_AtomicReplicationConcretizer, + typename t_ReplicationIndexer, + typename t_OffsetCalculator > +struct AtomicReplicationTuning +{ + using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer; + using ReplicationIndexer = t_ReplicationIndexer; + using OffsetCalculator = t_OffsetCalculator; +}; + +template < multi_reduce_algorithm t_algorithm, + typename t_SharedAtomicReplicationTuning, + typename t_GlobalAtomicReplicationTuning > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning; + using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning; + static constexpr bool consistent = false; }; } // namespace hip @@ -185,6 +302,40 @@ namespace policy namespace hip { +struct DeviceConstants +{ + RAJA::Index_type WARP_SIZE; + RAJA::Index_type 
MAX_BLOCK_SIZE; + RAJA::Index_type MAX_WARPS; + RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics + + constexpr DeviceConstants(RAJA::Index_type warp_size, + RAJA::Index_type max_block_size, + RAJA::Index_type atomic_cache_line_bytes) noexcept + : WARP_SIZE(warp_size) + , MAX_BLOCK_SIZE(max_block_size) + , MAX_WARPS(max_block_size / warp_size) + , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes) + { } +}; + +// +// Operations in the included files are parametrized using the following +// values for HIP warp size and max block size. +// +#if defined(__HIP_PLATFORM_AMD__) +constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A +// constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X +#elif defined(__HIP_PLATFORM_NVIDIA__) +constexpr DeviceConstants device_constants(32, 1024, 32); // V100 +#endif +static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS, + "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS"); +static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0, + "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not " + "a multiple of device_constants.WARP_SIZE"); + + template struct hip_indexer {}; @@ -260,7 +411,22 @@ struct hip_reduce_policy make_policy_pattern_launch_platform_t::value, - RAJA::Platform::hip> { + RAJA::Platform::hip, + std::conditional_t> { +}; + +template < typename tuning > +struct hip_multi_reduce_policy + : public RAJA:: + make_policy_pattern_launch_platform_t::value, + RAJA::Platform::hip, + std::conditional_t> { }; /*! @@ -277,74 +443,6 @@ struct hip_atomic_explicit{}; using hip_atomic = hip_atomic_explicit; -template < RAJA::hip::reduce_algorithm algorithm, - RAJA::hip::block_communication_mode comm_mode, - size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified > -using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< - algorithm, comm_mode, replication, atomic_stride> >; - -// Policies for RAJA::Reduce* objects with specific behaviors. -// - *atomic* policies may use atomics to combine partial results and falls back -// on a non-atomic policy when atomics can't be used with the given type. The -// use of atomics leads to order of operation differences which change the -// results of floating point sum reductions run to run. The memory used with -// atomics is initialized on the device which can be expensive on some HW. -// On some HW this is faster overall than the non-atomic policies. -// - *atomic_host* policies are similar to the atomic policies above. However -// the memory used with atomics is initialized on the host which is -// significantly cheaper on some HW. On some HW this is faster overall than -// the non-atomic and atomic policies. -// - *device_fence policies use normal memory accesses with device scope fences -// in the implementation. This works on all HW. -// - *block_fence policies use special (atomic) memory accesses that only cache -// in a cache shared by the whole device to avoid having to use -// device scope fences. This improves performance on some HW but -// is more difficult to code correctly. 
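ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE above is a byte count; reduce.hpp later converts it into an element stride so that replicated atomic slots land on different "atomic cache lines". A small worked sketch of that arithmetic follows; divide_ceiling and atomic_stride_elems are hypothetical helpers mirroring the RAJA_DIVIDE_CEILING_INT expression, and 64 bytes is just an example value.

#include <cstddef>
#include <cstdio>

constexpr std::size_t divide_ceiling(std::size_t n, std::size_t d) {
  return (n + d - 1) / d;
}

// Stride, in elements of T, needed to keep adjacent atomic slots on different
// interference-size-aligned lines.
template <typename T>
constexpr std::size_t atomic_stride_elems(std::size_t interference_bytes) {
  return interference_bytes > sizeof(T)
             ? divide_ceiling(interference_bytes, sizeof(T))
             : 1;
}

int main() {
  // With a 64-byte interference size: doubles get a stride of 8 elements,
  // 4-byte ints a stride of 16.
  std::printf("%zu %zu\n",
              atomic_stride_elems<double>(64),
              atomic_stride_elems<int>(64));
  return 0;
}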
-using hip_reduce_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::combine_last_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::combine_last_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; - -// Policy for RAJA::Reduce* objects that gives the same answer every time when -// used in the same way -using hip_reduce = hip_reduce_block_fence; - -// Policy for RAJA::Reduce* objects that may use atomics and may not give the -// same answer every time when used in the same way -using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; - -// Policy for RAJA::Reduce* objects that lets you select the default atomic or -// non-atomic policy with a bool -template < bool with_atomic > -using hip_reduce_base = std::conditional_t; - - // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 struct hip_block_reduce{}; @@ -392,25 +490,6 @@ template struct hip_thread_masked_loop {}; - -// -// Operations in the included files are parametrized using the following -// values for HIP warp size and max block size. 
-// -constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 64; // 128 on gfx90a -#if defined(__HIP_PLATFORM_AMD__) -constexpr const RAJA::Index_type WARP_SIZE = 64; -#elif defined(__HIP_PLATFORM_NVIDIA__) -constexpr const RAJA::Index_type WARP_SIZE = 32; -#endif -constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; -constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; -static_assert(WARP_SIZE >= MAX_WARPS, - "RAJA Assumption Broken: WARP_SIZE < MAX_WARPS"); -static_assert(MAX_BLOCK_SIZE % WARP_SIZE == 0, - "RAJA Assumption Broken: MAX_BLOCK_SIZE not " - "a multiple of WARP_SIZE"); - struct hip_synchronize : make_policy_pattern_launch_t { @@ -988,6 +1067,38 @@ struct IndexFlatten }; +template +struct IndexDivide +{ + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() / static_cast(divisor); + } + + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return RAJA_DIVIDE_CEILING_INT(indexer::template size(), static_cast(divisor)); + } +}; + +template +struct IndexModulo +{ + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() % static_cast(divisor); + } + + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return static_cast(divisor); + } +}; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > @@ -1033,6 +1144,13 @@ using thread_y = IndexGlobal; template using thread_z = IndexGlobal; +template +using thread_xyz = IndexFlatten, + thread_y, + thread_z>; + template using block_x = IndexGlobal; template @@ -1040,6 +1158,13 @@ using block_y = IndexGlobal; template using block_z = IndexGlobal; +template +using block_xyz = IndexFlatten, + block_y, + block_z>; + template using global_x = IndexGlobal; template @@ -1047,6 +1172,42 @@ using global_y = IndexGlobal; template using global_z = IndexGlobal; + +template +using global_xyz = IndexFlatten, + global_y, + global_z>; + + +template +using warp_xyz = IndexDivide>; + +template +using warp_global_xyz = IndexFlatten, + block_xyz>; + } // namespace hip // contretizers used in forall, scan, and sort policies @@ -1156,16 +1317,146 @@ using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; + // policies usable with reducers -using policy::hip::hip_reduce_device_fence; -using policy::hip::hip_reduce_block_fence; -using policy::hip::hip_reduce_atomic_device_init_device_fence; -using policy::hip::hip_reduce_atomic_device_init_block_fence; -using policy::hip::hip_reduce_atomic_host_init_device_fence; -using policy::hip::hip_reduce_atomic_host_init_block_fence; -using policy::hip::hip_reduce_base; -using policy::hip::hip_reduce; -using policy::hip::hip_reduce_atomic; +template < hip::reduce_algorithm algorithm, + hip::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = policy::hip::hip_reduce_policy< + hip::ReduceTuning>; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - non-atomic policies store partial results and combine them in the same +// order every time, leading to consistent results for a loop run to run. +// - *atomic* policies may use atomics to combine partial results. 
The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. Falls back +// on a non-atomic implementation if atomics can't be used with the given +// type. The memory used with atomics is initialized on the device using +// atomics which adds overhead. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host. This is faster +// overall than other policies on HW with direct host access to device memory +// such as the AMD MI300A El Capitan/Tuolumne systems. +// - *device_fence* policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence* policies use special (atomic) memory accesses that use +// a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using hip_reduce_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::combine_last_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::combine_last_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_device_combine_atomic_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_device_combine_atomic_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_host_combine_atomic_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_host_combine_atomic_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using hip_reduce = hip_reduce_block_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; + + +// policies usable with multi_reducers +template < hip::multi_reduce_algorithm algorithm, + typename SharedAtomicReplicationConcretizer, + typename SharedAtomicReplicationIndexer, + typename GlobalAtomicReplicationConcretizer, + typename GlobalAtomicReplicationIndexer > +using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy< + hip::MultiReduceTuning< + algorithm, + hip::AtomicReplicationTuning>, + hip::AtomicReplicationTuning>>>; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results. 
The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. +// - *no_replication* policies use the minimum amount of resources. The +// lack of resources means they may perform poorly. These policies are +// intended for use cases where low overhead is more important than high +// performance such as error flags that are rarely set. +// - *host_init* policies initialize memory used with atomics on the host. +// This is faster overall than other policies on HW with direct host access +// to device memory such as the AMD MI300A El Capitan/Tuolumne systems. +using hip_multi_reduce_atomic_block_then_atomic_grid_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + hip::SharedAtomicReplicationMaxPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<4>>, + hip::thread_xyz<>, + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// special policy to test that multi-reducers work if there is not enough shmem +using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + hip::SharedAtomicReplicationMaxPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<0>>, + hip::thread_xyz<>, + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// +using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// +using hip_multi_reduce_atomic_global_no_replication_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<1>>, + hip::block_xyz<>>; + +// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_multi_reduce_atomic = hip_multi_reduce_atomic_block_then_atomic_grid_host_init; +// Similar to above but optimized for low overhead in cases where it is rarely used +using hip_multi_reduce_atomic_low_performance_low_overhead = + hip_multi_reduce_atomic_global_no_replication_host_init; + // policies usable with kernel using policy::hip::hip_block_reduce; @@ -1174,11 +1465,11 @@ using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, - hip::thread_x>; + hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< iteration_mapping::StridedLoop, kernel_sync_requirement::none, - hip::thread_x>; + hip::thread_x>; using policy::hip::hip_warp_masked_direct; using policy::hip::hip_warp_masked_loop; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 2dbaf9f7e5..e8e67029ef 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -200,15 +200,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) const int 
numThreads = ThreadIterationGetter::size(); const int threadId = ThreadIterationGetter::index(); - const int warpId = threadId % RAJA::policy::hip::WARP_SIZE; - const int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; + const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE; + const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE; T temp = val; - if (numThreads % RAJA::policy::hip::WARP_SIZE == 0) { + if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < RAJA::policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -216,7 +216,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < RAJA::policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -226,18 +226,18 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } } - static_assert(RAJA::policy::hip::MAX_WARPS <= RAJA::policy::hip::WARP_SIZE, + static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <= RAJA::policy::hip::device_constants.WARP_SIZE, "Max Warps must be less than or equal to Warp Size for this algorithm to work"); // reduce per warp values - if (numThreads > RAJA::policy::hip::WARP_SIZE) { + if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE) { // Need to separate declaration and initialization for clang-hip - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -249,13 +249,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * RAJA::policy::hip::WARP_SIZE < numThreads) { + if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < RAJA::policy::hip::MAX_WARPS; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2) { T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -882,8 +882,8 @@ class Reduce : 32; static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) ? tuning::atomic_stride - : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : ((policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp index ae0f70a37f..fc29dabcbf 100644 --- a/include/RAJA/policy/openmp.hpp +++ b/include/RAJA/policy/openmp.hpp @@ -37,6 +37,7 @@ #include "RAJA/policy/openmp/kernel.hpp" #include "RAJA/policy/openmp/policy.hpp" #include "RAJA/policy/openmp/reduce.hpp" +#include "RAJA/policy/openmp/multi_reduce.hpp" #include "RAJA/policy/openmp/region.hpp" #include "RAJA/policy/openmp/scan.hpp" #include "RAJA/policy/openmp/sort.hpp" diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp index 4eea77722e..2dc047dd95 100644 --- a/include/RAJA/policy/openmp/atomic.hpp +++ b/include/RAJA/policy/openmp/atomic.hpp @@ -36,163 +36,217 @@ namespace RAJA RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAdd(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicLoad(omp_atomic, T *acc) { T ret; #pragma omp atomic capture { ret = *acc; // capture old for return value - *acc += value; + *acc += (T)0; } return ret; } - RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicSub(omp_atomic, T volatile *acc, T value) +RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value) { T ret; #pragma omp atomic capture { - ret = *acc; // capture old for return value + ret = *acc; + *acc = value; + } + RAJA_UNUSED_VAR(ret); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value) +{ + T old; +#pragma omp atomic capture + { + old = *acc; // capture old for return value + *acc += value; + } + return old; +} + + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value) +{ + T old; +#pragma omp atomic capture + { + old = *acc; // capture old for return value *acc -= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics +#if _OPENMP >= 202011 + T old; + #pragma omp atomic capture compare + { + old = *acc; + if ( value < *acc ) + { + *acc = value; + } + } + return old; +#else + // OpenMP doesn't define atomic ternary operators so use builtin atomics return atomicMin(builtin_atomic{}, acc, value); +#endif } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics +#if _OPENMP >= 202011 + T old; + #pragma omp atomic capture compare + { + old = *acc; + if ( value > *acc ) + { + *acc = value; + } + } + return old; +#else + // OpenMP doesn't define atomic ternary operators so use builtin atomics return atomicMax(builtin_atomic{}, acc, value); +#endif } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(omp_atomic, T volatile *acc) +RAJA_INLINE T atomicInc(omp_atomic, T *acc) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value - *acc += 1; + old = *acc; // capture old for return value + *acc += T(1); } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(omp_atomic, T volatile 
*acc, T val) +RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics - return RAJA::atomicInc(builtin_atomic{}, acc, val); + // OpenMP doesn't define needed operations, so use builtin atomics + return RAJA::atomicInc(builtin_atomic{}, acc, value); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(omp_atomic, T volatile *acc) +RAJA_INLINE T atomicDec(omp_atomic, T *acc) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value - *acc -= 1; + old = *acc; // capture old for return value + *acc -= T(1); } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(omp_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics - return RAJA::atomicDec(builtin_atomic{}, acc, val); + // OpenMP doesn't define needed operations, so use builtin atomics + return RAJA::atomicDec(builtin_atomic{}, acc, value); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc &= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc |= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc ^= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc = value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(omp_atomic, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics + // OpenMP doesn't define atomic ternary operators so use builtin atomics return RAJA::atomicCAS(builtin_atomic{}, acc, compare, value); } diff --git a/include/RAJA/policy/openmp/multi_reduce.hpp b/include/RAJA/policy/openmp/multi_reduce.hpp new file mode 100644 index 0000000000..22b09a7722 --- /dev/null +++ b/include/RAJA/policy/openmp/multi_reduce.hpp @@ -0,0 +1,360 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for + * OpenMP execution. + * + * These methods should work on any platform that supports OpenMP. 
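The OpenMP atomics above all follow the same "capture the old value, then update" pattern via #pragma omp atomic capture. The standalone example below shows that pattern in isolation; fetch_add is a hypothetical free function, not a RAJA API, and the code compiles with or without -fopenmp (serially in the latter case).

#include <cstdio>
#if defined(_OPENMP)
#include <omp.h>
#endif

// Atomically add `value` to *acc and return the previous value.
template <typename T>
T fetch_add(T* acc, T value) {
  T old;
#if defined(_OPENMP)
#pragma omp atomic capture
  { old = *acc; *acc += value; }
#else
  old = *acc; *acc += value;
#endif
  return old;
}

int main() {
  long counter = 0;
#if defined(_OPENMP)
#pragma omp parallel for
#endif
  for (int i = 0; i < 1000; ++i) {
    fetch_add(&counter, 1L);
  }
  std::printf("%ld\n", counter);  // 1000
  return 0;
}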
+ * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_omp_multi_reduce_HPP +#define RAJA_omp_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_OPENMP) + +#include +#include + +#include + +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/RepeatView.hpp" + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/openmp/policy.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataOMP; + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template using combine on destruction. + * + * In this class memory is owned by each copy of the object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataOMP> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataOMP() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataOMP(Container const& container, T identity) + : m_parent(nullptr) + , m_num_bins(container.size()) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, m_num_bins); + } + + MultiReduceDataOMP(MultiReduceDataOMP const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_identity(other.m_identity) + , m_data(nullptr) + { + m_data = create_data(RepeatView(other.m_identity, other.m_num_bins), other.m_num_bins); + } + + MultiReduceDataOMP(MultiReduceDataOMP &&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete; + + ~MultiReduceDataOMP() + { + if (m_data) { + if (m_parent && (m_num_bins != size_t(0))) { +#pragma omp critical(ompMultiReduceCritical) + { + for (size_t bin = 0; bin < m_num_bins; ++bin) { + MultiReduceOp{}(m_parent->m_data[bin], m_data[bin]); + } + } + } + destroy_data(m_data, m_num_bins); + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins); + m_num_bins = new_num_bins; + m_data = create_data(container, m_num_bins); + } else { + size_t bin = 0; + for (auto const& value : container) { + m_data[bin] = value; + ++bin; + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); } + + T get(size_t bin) const { return m_data[bin]; } + +private: + MultiReduceDataOMP const *m_parent; + size_t m_num_bins; + T m_identity; + T* m_data; + + template < typename Container > + static T* create_data(Container const& container, size_t num_bins) + { + if (num_bins == size_t(0)) { + return nullptr; + } + auto data = RAJA::allocate_aligned_type( RAJA::DATA_ALIGN, num_bins * sizeof(T) ); + size_t bin = 0; + for (auto const& value : container) { + new(&data[bin]) T(value); + ++bin; + } + return data; + } + + static void destroy_data(T*& data, size_t num_bins) + { + if (num_bins == size_t(0)) { + return; + } + for (size_t bin = num_bins; bin > 0; --bin) { + data[bin-1].~T(); + } + RAJA::free_aligned(data); + data = nullptr; + } +}; + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template using combine on get. + * + * In this class memory is owned by each copy of the object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataOMP> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataOMP() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataOMP(Container const& container, T identity) + : m_parent(nullptr) + , m_max_threads(omp_get_max_threads()) + , m_num_bins(container.size()) + , m_padded_threads(pad_threads(m_max_threads)) + , m_padded_bins(pad_bins(m_num_bins)) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } + + MultiReduceDataOMP(MultiReduceDataOMP const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_padded_threads(other.m_padded_threads) + , m_padded_bins(other.m_padded_bins) + , m_identity(other.m_identity) + , m_data(other.m_data) + { } + + MultiReduceDataOMP(MultiReduceDataOMP &&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete; + + ~MultiReduceDataOMP() + { + if (m_data) { + if (!m_parent) { + destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + m_num_bins = new_num_bins; + m_padded_bins = pad_bins(m_num_bins); + m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } else { + if (m_max_threads > 0) { + { + size_t thread_idx = 0; + size_t bin = 0; + for (auto const& value : container) { + m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = value; + ++bin; + } + } + for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx) { + for (size_t bin = 0; bin < m_num_bins; ++bin) { + m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = identity; + } + } + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) + { + size_t thread_idx = omp_get_thread_num(); + MultiReduceOp{}(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)], val); + } + + T get(size_t bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx) { + reducer.combine(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]); + } + return reducer.get_and_clear(); + } + +private: + MultiReduceDataOMP const *m_parent; + size_t m_max_threads; + size_t m_num_bins; + size_t m_padded_threads; + size_t m_padded_bins; + T m_identity; + T* m_data; + + static constexpr size_t pad_bins(size_t num_bins) + { + size_t num_cache_lines = RAJA_DIVIDE_CEILING_INT(num_bins*sizeof(T), RAJA::DATA_ALIGN); + return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN, sizeof(T)); + } + + static constexpr size_t pad_threads(size_t max_threads) + { + return max_threads; + } + + static constexpr size_t index_data(size_t bin, size_t thread_idx, + size_t padded_bins, size_t RAJA_UNUSED_ARG(padded_threads)) + { + return bin + thread_idx * padded_bins; + } + + template < typename Container > + static T* create_data(Container const& container, T identity, + size_t num_bins, size_t max_threads, + size_t padded_bins, size_t padded_threads) + { + if (num_bins == size_t(0)) { + return nullptr; + } + auto data = RAJA::allocate_aligned_type( RAJA::DATA_ALIGN, padded_threads*padded_bins*sizeof(T) ); + if (max_threads > 0) { + { + size_t thread_idx = 0; + size_t bin = 0; + for (auto const& value : container) { + new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(value); + ++bin; + } + } + for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx) { + for (size_t bin = 0; bin < num_bins; ++bin) { + new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(identity); + } + } + } + return data; + } + + static void destroy_data(T*& data, + 
size_t num_bins, size_t max_threads, + size_t padded_bins, size_t padded_threads) + { + if (num_bins == size_t(0)) { + return; + } + for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx) { + for (size_t bin = num_bins; bin > 0; --bin) { + data[index_data(bin-1, thread_idx-1, padded_bins, padded_threads)].~T(); + } + } + RAJA::free_aligned(data); + data = nullptr; + } +}; + +} // namespace detail + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy, detail::MultiReduceDataOMP) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_OPENMP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/openmp/params/kernel_name.hpp b/include/RAJA/policy/openmp/params/kernel_name.hpp new file mode 100644 index 0000000000..65a5f7a329 --- /dev/null +++ b/include/RAJA/policy/openmp/params/kernel_name.hpp @@ -0,0 +1,40 @@ +#ifndef OPENMP_KERNELNAME_HPP +#define OPENMP_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_OPENMP) + + // Init + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + combine(KernelName&, T& /*place holder argument*/) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp index af5bdf2df7..aff2567474 100644 --- a/include/RAJA/policy/openmp/policy.hpp +++ b/include/RAJA/policy/openmp/policy.hpp @@ -42,6 +42,25 @@ typedef enum omp_sched_t { namespace RAJA { +namespace omp +{ + +enum struct multi_reduce_algorithm : int +{ + combine_on_destruction, + combine_on_get +}; + +template < multi_reduce_algorithm t_algorithm > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + static constexpr bool consistent = + (algorithm == multi_reduce_algorithm::combine_on_get); +}; + +} // namspace omp + namespace policy { namespace omp @@ -283,6 +302,18 @@ struct omp_reduce_ordered : make_policy_pattern_t { }; +/// +template < typename tuning > +struct omp_multi_reduce_policy + : make_policy_pattern_launch_platform_t> { +}; + /// struct omp_synchronize : make_policy_pattern_launch_t +using omp_multi_reduce_tuning = omp_multi_reduce_policy< + RAJA::omp::MultiReduceTuning >; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - combine_on_destruction policies combine new values into a single value for +// each object then each object combines its values into the parent object's +// values on destruction in a critical region. +using omp_multi_reduce_combine_on_destruction = omp_multi_reduce_tuning< + RAJA::omp::multi_reduce_algorithm::combine_on_destruction>; +// - combine_on_get policies combine new values into a single value for +// each thread then when get is called those values are combined. 
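The combine_on_destruction strategy described in the policy comments below boils down to: each thread accumulates into a private copy of the bins, and that copy folds itself into the shared result inside a critical section when it is destroyed. A standalone sketch of that idea follows; BinSums is a hypothetical type, not the RAJA MultiReduceDataOMP class.

#include <cstddef>
#include <cstdio>
#include <vector>
#if defined(_OPENMP)
#include <omp.h>
#endif

struct BinSums {
  std::vector<double>* shared;   // parent storage to fold into on destruction
  std::vector<double> local;     // per-thread bins

  explicit BinSums(std::vector<double>& parent)
    : shared(&parent), local(parent.size(), 0.0) {}

  void combine(std::size_t bin, double v) { local[bin] += v; }

  ~BinSums() {
#if defined(_OPENMP)
#pragma omp critical(BinSumsCombine)
#endif
    for (std::size_t b = 0; b < local.size(); ++b) { (*shared)[b] += local[b]; }
  }
};

int main() {
  std::vector<double> totals(3, 0.0);
#if defined(_OPENMP)
#pragma omp parallel
#endif
  {
    BinSums mine(totals);                              // per-thread copy
    for (int i = 0; i < 100; ++i) { mine.combine(i % 3, 1.0); }
  }                                                    // destructor folds into totals
  std::printf("%g %g %g\n", totals[0], totals[1], totals[2]);
  return 0;
}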
+using omp_multi_reduce_combine_on_get = omp_multi_reduce_tuning< + RAJA::omp::multi_reduce_algorithm::combine_on_get>; + +// Policy for RAJA::MultiReduce* objects that gives the +// same answer every time when used in the same way +using omp_multi_reduce_ordered = omp_multi_reduce_combine_on_get; + +// Policy for RAJA::MultiReduce* objects that may not give the +// same answer every time when used in the same way +using omp_multi_reduce_unordered = omp_multi_reduce_combine_on_destruction; + +using omp_multi_reduce = omp_multi_reduce_unordered; + } // namespace omp } // namespace policy @@ -389,6 +446,10 @@ using policy::omp::omp_launch_t; using policy::omp::omp_reduce; /// using policy::omp::omp_reduce_ordered; +/// +using policy::omp::omp_multi_reduce; +/// +using policy::omp::omp_multi_reduce_ordered; /// /// Type aliases for omp reductions diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp index 6b90282e6d..af88127636 100644 --- a/include/RAJA/policy/openmp_target.hpp +++ b/include/RAJA/policy/openmp_target.hpp @@ -30,6 +30,7 @@ #include "RAJA/policy/openmp_target/kernel.hpp" #include "RAJA/policy/openmp_target/forall.hpp" #include "RAJA/policy/openmp_target/reduce.hpp" +//#include "RAJA/policy/openmp_target/multi_reduce.hpp" #include "RAJA/policy/openmp_target/WorkGroup.hpp" diff --git a/include/RAJA/policy/openmp_target/params/kernel_name.hpp b/include/RAJA/policy/openmp_target/params/kernel_name.hpp new file mode 100644 index 0000000000..5e9edb4b6c --- /dev/null +++ b/include/RAJA/policy/openmp_target/params/kernel_name.hpp @@ -0,0 +1,40 @@ +#ifndef OPENMP_TARGET_KERNELNAME_HPP +#define OPENMP_TARGET_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + + // Init + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + combine(KernelName&, T& /*place holder argument*/) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp index e9c1f8e570..0963b31a01 100644 --- a/include/RAJA/policy/sequential.hpp +++ b/include/RAJA/policy/sequential.hpp @@ -28,6 +28,7 @@ #include "RAJA/policy/sequential/kernel.hpp" #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/policy/sequential/reduce.hpp" +#include "RAJA/policy/sequential/multi_reduce.hpp" #include "RAJA/policy/sequential/scan.hpp" #include "RAJA/policy/sequential/sort.hpp" #include "RAJA/policy/sequential/launch.hpp" diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp index 58777cd9ef..046e52e1c1 100644 --- a/include/RAJA/policy/sequential/atomic.hpp +++ b/include/RAJA/policy/sequential/atomic.hpp @@ -28,7 +28,23 @@ namespace RAJA RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAdd(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicLoad(seq_atomic, T *acc) +{ + return *acc; +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value) +{ + *acc = value; +} + +RAJA_SUPPRESS_HD_WARN 
+template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value) { T ret = *acc; *acc += value; @@ -39,7 +55,7 @@ RAJA_INLINE T atomicAdd(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicSub(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value) { T ret = *acc; *acc -= value; @@ -50,7 +66,7 @@ RAJA_INLINE T atomicSub(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value) { T ret = *acc; *acc = ret < value ? ret : value; @@ -60,10 +76,10 @@ RAJA_INLINE T atomicMin(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value) { T ret = *acc; - *acc = ret > value ? ret : value; + *acc = value < ret ? ret : value; return ret; } @@ -71,47 +87,47 @@ RAJA_INLINE T atomicMax(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(seq_atomic, T volatile *acc) +RAJA_INLINE T atomicInc(seq_atomic, T *acc) { T ret = *acc; - (*acc) += 1; + (*acc) += T(1); return ret; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(seq_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val) { T old = *acc; - (*acc) = ((old >= val) ? 0 : (old + 1)); + *acc = val <= old ? T(0) : old + T(1); return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(seq_atomic, T volatile *acc) +RAJA_INLINE T atomicDec(seq_atomic, T *acc) { T ret = *acc; - (*acc) -= 1; + (*acc) -= T(1); return ret; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(seq_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val) { T old = *acc; - (*acc) = (((old == 0) | (old > val)) ? val : (old - 1)); + *acc = old == T(0) || val < old ? val : old - T(1); return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value) { T ret = *acc; *acc &= value; @@ -121,7 +137,7 @@ RAJA_INLINE T atomicAnd(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value) { T ret = *acc; *acc |= value; @@ -131,7 +147,7 @@ RAJA_INLINE T atomicOr(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value) { T ret = *acc; *acc ^= value; @@ -141,7 +157,7 @@ RAJA_INLINE T atomicXor(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value) { T ret = *acc; *acc = value; @@ -151,7 +167,7 @@ RAJA_INLINE T atomicExchange(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(seq_atomic, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value) { T ret = *acc; *acc = ret == compare ? 
value : ret; diff --git a/include/RAJA/policy/sequential/multi_reduce.hpp b/include/RAJA/policy/sequential/multi_reduce.hpp new file mode 100644 index 0000000000..be3a3860f8 --- /dev/null +++ b/include/RAJA/policy/sequential/multi_reduce.hpp @@ -0,0 +1,171 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for + * sequential execution. + * + * These methods should work on any platform. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_sequential_multi_reduce_HPP +#define RAJA_sequential_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/sequential/policy.hpp" + +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + ************************************************************************** + * + * \brief Seq multi-reduce data class template. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataSeq; + +/*! + ************************************************************************** + * + * \brief Seq multi-reduce data class template using left_fold reductions. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataSeq> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataSeq() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataSeq(Container const& container, T identity) + : m_parent(nullptr) + , m_num_bins(container.size()) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, m_num_bins); + } + + MultiReduceDataSeq(MultiReduceDataSeq const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_identity(other.m_identity) + , m_data(other.m_data) + { } + + MultiReduceDataSeq(MultiReduceDataSeq &&) = delete; + MultiReduceDataSeq& operator=(MultiReduceDataSeq const&) = delete; + MultiReduceDataSeq& operator=(MultiReduceDataSeq &&) = delete; + + ~MultiReduceDataSeq() + { + if (m_data) { + if (!m_parent) { + destroy_data(m_data, m_num_bins); + } + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins); + m_num_bins = new_num_bins; + m_data = create_data(container, m_num_bins); + } else { + size_t bin = 0; + for (auto const& value : container) { + m_data[bin] = value; + ++bin; + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); } + + T get(size_t bin) const { return m_data[bin]; } + +private: + MultiReduceDataSeq const *m_parent; + size_t m_num_bins; + T m_identity; + T* m_data; + + template < typename Container > + static T* create_data(Container const& container, size_t num_bins) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + auto data = static_cast(malloc(num_bins*sizeof(T))); + size_t bin = 0; + for (auto const& value : container) { + new(&data[bin]) T(value); + ++bin; + } + return data; + } + + static void destroy_data(T*& data, size_t num_bins) + { + if (num_bins == size_t(0)) { + return; + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + data[bin].~T(); + } + free(data); + data = nullptr; + } +}; + +} // namespace detail + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy, detail::MultiReduceDataSeq) + +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/sequential/params/kernel_name.hpp b/include/RAJA/policy/sequential/params/kernel_name.hpp new file mode 100644 index 0000000000..00e6a1dc52 --- /dev/null +++ b/include/RAJA/policy/sequential/params/kernel_name.hpp @@ -0,0 +1,37 @@ +#ifndef SEQ_KERNELNAME_HPP +#define SEQ_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + + // Init + template + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + RAJA_HOST_DEVICE + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + combine(KernelName&, T) {} + + // Resolve + template + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp index f59bee50e4..287af42502 100644 --- a/include/RAJA/policy/sequential/policy.hpp +++ b/include/RAJA/policy/sequential/policy.hpp @@ -22,6 +22,24 @@ namespace RAJA { +namespace sequential +{ + +enum struct multi_reduce_algorithm : int +{ + left_fold +}; + +template < multi_reduce_algorithm t_multi_algorithm > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_multi_algorithm; + static constexpr bool consistent = + (algorithm == multi_reduce_algorithm::left_fold); +}; + +} // 
namspace sequential + namespace policy { namespace sequential @@ -79,11 +97,23 @@ struct seq_work : make_policy_pattern_launch_platform_t { }; +/// +template < typename tuning > +struct seq_multi_reduce_policy + : make_policy_pattern_launch_platform_t> { +}; + /// /////////////////////////////////////////////////////////////////////// /// @@ -94,12 +124,27 @@ struct seq_reduce : make_policy_pattern_launch_platform_t +using seq_multi_reduce_tuning = seq_multi_reduce_policy< + RAJA::sequential::MultiReduceTuning >; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - left_fold policies combine new values into a single value. +using seq_multi_reduce_left_fold = seq_multi_reduce_tuning< + RAJA::sequential::multi_reduce_algorithm::left_fold>; + +// Policy for RAJA::MultiReduce* objects that gives the +// same answer every time when used in the same way +using seq_multi_reduce = seq_multi_reduce_left_fold; + } // namespace sequential } // namespace policy using policy::sequential::seq_atomic; using policy::sequential::seq_exec; using policy::sequential::seq_reduce; +using policy::sequential::seq_multi_reduce; using policy::sequential::seq_region; using policy::sequential::seq_segit; using policy::sequential::seq_work; diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp index dc4112d8a7..491e39910c 100644 --- a/include/RAJA/policy/sycl.hpp +++ b/include/RAJA/policy/sycl.hpp @@ -24,11 +24,12 @@ #if defined(RAJA_SYCL_ACTIVE) -#include +#include "RAJA/util/sycl_compat.hpp" #include "RAJA/policy/sycl/forall.hpp" #include "RAJA/policy/sycl/policy.hpp" #include "RAJA/policy/sycl/reduce.hpp" +//#include "RAJA/policy/sycl/multi_reduce.hpp" //#include "RAJA/policy/sycl/scan.hpp" //#include "RAJA/policy/sycl/sort.hpp" #include "RAJA/policy/sycl/kernel.hpp" diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp index c158bd2801..27d3209ae3 100644 --- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp +++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp @@ -23,7 +23,7 @@ #if defined(RAJA_ENABLE_SYCL) -#include +#include "RAJA/util/sycl_compat.hpp" #include #include diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index 4a33ab3bd4..901cc694f0 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -26,10 +26,11 @@ #if defined(RAJA_ENABLE_SYCL) -#include #include #include +#include "RAJA/util/sycl_compat.hpp" + #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/params/forall.hpp" @@ -121,12 +122,7 @@ forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); q->submit([&](::sycl::handler& h) { @@ -168,6 +164,7 @@ resources::EventProxy forall_impl(resources::Sycl &sycl_res, // Only launch kernel if we have something to iterate over if (len > 0 && BlockSize > 0) { + // Note: We could fix an incorrect workgroup size. // It would change what was specified. // For now, leave the device compiler to error with invalid WG size. 
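Note on the queue handling changes in this SYCL forall file: the hunks above and below drop the fallback to a globally cached queue, so each kernel now runs on the queue owned by the camp resource passed to forall. A minimal caller-side sketch, assuming a SYCL-enabled build; the function name, the USM pointers, and the work-group size of 256 are illustrative assumptions, not part of this patch:

#include "RAJA/RAJA.hpp"

// Sketch only: x and y are assumed to be USM allocations reachable from the
// device behind sycl_res; sycl_exec<256, true> uses an arbitrary work-group
// size and runs asynchronously on the resource's queue.
void axpy_on_resource(RAJA::resources::Sycl sycl_res,
                      double* y, const double* x, double a, int N)
{
  RAJA::forall<RAJA::sycl_exec<256, /*Async=*/true>>(
      sycl_res,
      RAJA::TypedRangeSegment<int>(0, N),
      [=] RAJA_DEVICE (int i) { y[i] += a * x[i]; });

  // With the global-queue fallback removed, synchronization also goes
  // through the same resource.
  sycl_res.wait();
}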
@@ -178,14 +175,11 @@ resources::EventProxy forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); + LOOP_BODY* lbody; Iterator* beg; + RAJA_FT_BEGIN; // // Setup shared memory buffers @@ -250,18 +244,14 @@ forall_impl(resources::Sycl &sycl_res, // Only launch kernel if we have something to iterate over if (len > 0 && BlockSize > 0) { + // // Compute the number of blocks // sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); auto combiner = []( ForallParam x, ForallParam y ) { RAJA::expt::ParamMultiplexer::combine( x, y ); @@ -332,12 +322,7 @@ forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); auto combiner = []( ForallParam x, ForallParam y ) { RAJA::expt::ParamMultiplexer::combine( x, y ); @@ -414,29 +399,6 @@ forall_impl(resources::Sycl &sycl_res, * ****************************************************************************** */ -template -RAJA_INLINE void forall_impl(ExecPolicy>, - const TypedIndexSet& iset, - LoopBody&& loop_body) -{ - int num_seg = iset.getNumSegments(); - for (int isi = 0; isi < num_seg; ++isi) { - iset.segmentCall(isi, - detail::CallForall(), - sycl_exec(), - loop_body); - } // iterate over segments of index set - - if (!Async) { - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - q->wait(); - }; -} - - template forall_impl(resources::Sycl & loop_body); } // iterate over segments of index set - if (!Async) { - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - q->wait(); + if ( !Async ) { + ::sycl::queue* q = r.get_queue(); + q->wait(); } return resources::EventProxy(r); diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp index 0b7fa5f253..88c789c062 100644 --- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp +++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp @@ -48,19 +48,22 @@ namespace RAJA * work group and work items per group. */ template -struct sycl_launch {}; +struct sycl_launch : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::sycl, + RAJA::Pattern::forall, + detail::get_launch::value, + RAJA::Platform::sycl>{ +}; namespace statement { - -/*! RAJA::kernel statement that launches a SYCL kernel. - * - * +/* + * ! RAJA::kernel statement that launches a SYCL kernel. */ template struct SyclKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement { }; /* @@ -87,11 +90,7 @@ namespace internal { /*! 
- * SYCL global function for launching SyclKernel policies - * This is annotated to guarantee that device code generated - * can be launched by a kernel with BlockSize number of threads. - * - * This launcher is used by the SyclKernel policies. + * SYCL global function for launching SyclKernel policies. */ template void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item) @@ -142,7 +141,7 @@ struct SyclLaunchHelper,StmtList,Data,Types> qu->submit([&](cl::sycl::handler& h) { - h.parallel_for(launch_dims.fit_nd_range(), + h.parallel_for(launch_dims.fit_nd_range(qu), [=] (cl::sycl::nd_item<3> item) { SyclKernelLauncher(*m_data, item); @@ -178,7 +177,7 @@ struct SyclLaunchHelper,StmtList,Data,Types> qu->submit([&](cl::sycl::handler& h) { - h.parallel_for(launch_dims.fit_nd_range(), + h.parallel_for(launch_dims.fit_nd_range(qu), [=] (cl::sycl::nd_item<3> item) { SyclKernelLauncher(data, item); @@ -211,20 +210,15 @@ struct StatementExecutor< using launch_t = SyclLaunchHelper::value, LaunchConfig, stmt_list_t, data_t, Types>; + camp::resources::Sycl res = data.get_resource(); + ::sycl::queue* q = res.get_queue();; + // // Compute the requested kernel dimensions // LaunchDims launch_dims = executor_t::calculateDimensions(data); int shmem = 0; - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - camp::resources::Resource res = camp::resources::Sycl(); - q = res.get().get_queue(); - } // // Launch the kernels diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp index 3fe37efe0b..56e3a9aa1e 100644 --- a/include/RAJA/policy/sycl/kernel/internal.hpp +++ b/include/RAJA/policy/sycl/kernel/internal.hpp @@ -86,7 +86,7 @@ struct LaunchDims { return result; } - cl::sycl::nd_range<3> fit_nd_range() { + cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) { sycl_dim_3_t launch_global; @@ -95,14 +95,6 @@ struct LaunchDims { launch_local.y = std::max(launch_local.y, local.y); launch_local.z = std::max(launch_local.z, local.z); - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - camp::resources::Resource sycl_res = camp::resources::Sycl(); - q = sycl_res.get().get_queue(); - } - cl::sycl::device dev = q->get_device(); auto max_work_group_size = dev.get_info< ::cl::sycl::info::device::max_work_group_size>(); diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 9176444cd4..ad9fecc222 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -41,16 +41,8 @@ struct LaunchExecute> { BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers)) { - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - /*Get the concrete resource */ - resources::Sycl sycl_res = res.get(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); // // Compute the number of blocks and threads @@ -91,6 +83,8 @@ struct LaunchExecute> { }); + if (!async) { q->wait(); } + RAJA_FT_END; } @@ -105,10 +99,76 @@ struct LaunchExecute> { RAJA::expt::type_traits::is_ForallParamPack, concepts::negate>> 
exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name, - BODY_IN &&body_in, ReduceParams &&launch_reducers) + BODY_IN &&body_in, ReduceParams launch_reducers) { - RAJA_ABORT_OR_THROW("SYCL trivially copyable lambda backend currently not supported in RAJA launch"); + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); + + using EXEC_POL = RAJA::sycl_launch_t; + RAJA::expt::ParamMultiplexer::init(launch_reducers); + + // + // Compute the number of blocks and threads + // + const ::sycl::range<3> blockSize(launch_params.threads.value[2], + launch_params.threads.value[1], + launch_params.threads.value[0]); + + const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2], + launch_params.threads.value[1] * launch_params.teams.value[1], + launch_params.threads.value[0] * launch_params.teams.value[0]); + + // Only launch kernel if we have something to iterate over + constexpr size_t zero = 0; + if ( launch_params.threads.value[0] > zero && launch_params.threads.value[1] > zero && launch_params.threads.value[2] > zero && + launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) { + + + auto combiner = []( ReduceParams x, ReduceParams y ) { + RAJA::expt::ParamMultiplexer::combine( x, y ); + return x; + }; + + RAJA_FT_BEGIN; + + ReduceParams* res = ::sycl::malloc_shared(1,*q); + RAJA::expt::ParamMultiplexer::init(*res); + auto reduction = ::sycl::reduction(res, launch_reducers, combiner); + + q->submit([&](cl::sycl::handler& h) { + + auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + reduction, + [=] (cl::sycl::nd_item<3> itm, auto & red) { + + LaunchContext ctx; + ctx.itm = &itm; + + //Point to shared memory + ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + ReduceParams fp; + RAJA::expt::ParamMultiplexer::init(fp); + + RAJA::expt::invoke_body(fp, body_in, ctx); + + red.combine(fp); + + }); + + }).wait(); // Need to wait for completion to free memory + + RAJA::expt::ParamMultiplexer::combine( launch_reducers, *res ); + ::sycl::free(res, *q); + + RAJA_FT_END; + } + + RAJA::expt::ParamMultiplexer::resolve(launch_reducers); return resources::EventProxy(res); } @@ -123,16 +183,8 @@ struct LaunchExecute> { BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers)) { - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - /*Get the concrete resource */ - resources::Sycl sycl_res = res.get(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); // // Compute the number of blocks and threads @@ -180,7 +232,9 @@ struct LaunchExecute> { }); - }); + }).wait(); // Need to wait for completion to free memory + + cl::sycl::free(lbody, *q); RAJA_FT_END; @@ -197,15 +251,90 @@ struct LaunchExecute> { RAJA::expt::type_traits::is_ForallParamPack, concepts::negate>> exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name, - BODY_IN &&body_in, ReduceParams &&launch_reducers) + BODY_IN &&body_in, ReduceParams launch_reducers) { - RAJA_ABORT_OR_THROW("SYCL non-trivially copyable lambda backend currently not supported in RAJA launch"); + /*Get the queue from concrete resource 
*/ + ::sycl::queue* q = res.get().get_queue(); + + using EXEC_POL = RAJA::sycl_launch_t; + RAJA::expt::ParamMultiplexer::init(launch_reducers); + + // + // Compute the number of blocks and threads + // + const ::sycl::range<3> blockSize(launch_params.threads.value[2], + launch_params.threads.value[1], + launch_params.threads.value[0]); + + const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2], + launch_params.threads.value[1] * launch_params.teams.value[1], + launch_params.threads.value[0] * launch_params.teams.value[0]); + + // Only launch kernel if we have something to iterate over + constexpr size_t zero = 0; + if ( launch_params.threads.value[0] > zero && launch_params.threads.value[1] > zero && launch_params.threads.value[2] > zero && + launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) { + + + auto combiner = []( ReduceParams x, ReduceParams y ) { + RAJA::expt::ParamMultiplexer::combine( x, y ); + return x; + }; + + RAJA_FT_BEGIN; + + // + // Kernel body is nontrivially copyable, create space on device and copy to + // Workaround until "is_device_copyable" is supported + // + using LOOP_BODY = camp::decay; + LOOP_BODY* lbody; + lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q); + q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); + + ReduceParams* res = ::sycl::malloc_shared(1,*q); + RAJA::expt::ParamMultiplexer::init(*res); + auto reduction = ::sycl::reduction(res, launch_reducers, combiner); + + q->submit([&](cl::sycl::handler& h) { + + auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + reduction, + [=] (cl::sycl::nd_item<3> itm, auto & red) { + + LaunchContext ctx; + ctx.itm = &itm; + + //Point to shared memory + ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + ReduceParams fp; + RAJA::expt::ParamMultiplexer::init(fp); + + RAJA::expt::invoke_body(fp, *lbody, ctx); + + red.combine(fp); + + }); + + }).wait(); // Need to wait for completion to free memory + + RAJA::expt::ParamMultiplexer::combine( launch_reducers, *res ); + ::sycl::free(res, *q); + cl::sycl::free(lbody, *q); + + RAJA_FT_END; + } + + RAJA::expt::ParamMultiplexer::resolve(launch_reducers); return resources::EventProxy(res); } - }; /* diff --git a/include/RAJA/policy/sycl/params/kernel_name.hpp b/include/RAJA/policy/sycl/params/kernel_name.hpp new file mode 100644 index 0000000000..1f33be19bb --- /dev/null +++ b/include/RAJA/policy/sycl/params/kernel_name.hpp @@ -0,0 +1,41 @@ +#ifndef SYCL_KERNELNAME_HPP +#define SYCL_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_SYCL) + + // Init + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + SYCL_EXTERNAL + combine(KernelName&, T) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SYCL_REDUCE_HPP diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp index a2ab44e3f7..0f92fe27e1 100644 --- a/include/RAJA/policy/sycl/policy.hpp +++ 
b/include/RAJA/policy/sycl/policy.hpp @@ -22,7 +22,7 @@ #if defined(RAJA_SYCL_ACTIVE) -#include +#include "RAJA/util/sycl_compat.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -96,7 +96,7 @@ template struct sycl_atomic_explicit{}; // -// Default cuda atomic policy uses cuda atomics on the device and non-atomics +// Default sycl atomic policy uses sycl atomics on the device and non-atomics // on the host // using sycl_atomic = sycl_atomic_explicit; diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp index 72cdbaeb6f..58cb83d295 100644 --- a/include/RAJA/policy/sycl/reduce.hpp +++ b/include/RAJA/policy/sycl/reduce.hpp @@ -73,7 +73,7 @@ struct maxloc // Ideally, MaxNumTeams = ThreadsPerTeam in omp_target_parallel_for_exec. static int MaxNumTeams = 1; -//! Information necessary for OpenMP offload to be considered +//! Information necessary for SYCL offload to be considered struct Offload_Info { int hostID{1}; @@ -88,7 +88,7 @@ struct Offload_Info } }; -//! Reduction data for OpenMP Offload -- stores value, host pointer, and device +//! Reduction data for SYCL Offload -- stores value, host pointer, and device //! pointer template struct Reduce_Data @@ -195,7 +195,7 @@ struct Reduce_Data } // end namespace sycl -//! OpenMP Target Reduction entity -- generalize on # of teams, reduction, and +//! SYCL Target Reduction entity -- generalize on # of teams, reduction, and //! type template struct TargetReduce @@ -285,7 +285,7 @@ struct TargetReduce T finalVal; }; -//! OpenMP Target Reduction Location entity -- generalize on # of teams, +//! SYCL Target Reduction Location entity -- generalize on # of teams, //! reduction, and type template struct TargetReduceLoc diff --git a/include/RAJA/util/EnableIf.hpp b/include/RAJA/util/EnableIf.hpp new file mode 100644 index 0000000000..257e852bf9 --- /dev/null +++ b/include/RAJA/util/EnableIf.hpp @@ -0,0 +1,57 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file for enable_if helpers. + * + * These type functions are used heavily by the atomic operators. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2024, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_EnableIf_HPP +#define RAJA_util_EnableIf_HPP + +#include "RAJA/config.hpp" + +#include + +#include "camp/list.hpp" +#include "camp/type_traits.hpp" + +#include "RAJA/util/concepts.hpp" + + +namespace RAJA +{ +namespace util +{ + + +template +struct is_any_of; + +template +struct is_any_of> + : ::RAJA::concepts::any_of<::camp::is_same...> +{}; + +template +using enable_if_is_any_of = std::enable_if_t::value, T>; + +template +using enable_if_is_none_of = std::enable_if_t<::RAJA::concepts::negate>::value, T>; + + +} // namespace util +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/util/OffsetOperators.hpp b/include/RAJA/util/OffsetOperators.hpp new file mode 100644 index 0000000000..150aaeee34 --- /dev/null +++ b/include/RAJA/util/OffsetOperators.hpp @@ -0,0 +1,88 @@ +/*! 
+ ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining Simple Offset Calculators + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_OFFSETOPERATORS_HPP +#define RAJA_OFFSETOPERATORS_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +template +struct GetOffsetLeft +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetLeft; + + template < size_t > + using rebunch = GetOffsetLeft; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& num_i, + Arg2 const& j, Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept + { + return i + j * num_i; + } +}; + +template +struct GetOffsetRight +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetRight; + + template < size_t > + using rebunch = GetOffsetRight; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i), + Arg2 const& j, Arg2 const& num_j) const noexcept + { + return i * num_j + j; + } +}; + +template +struct GetOffsetLeftBunched +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetLeftBunched; + + template < size_t new_bunch_num_i > + using rebunch = GetOffsetLeftBunched; + + static constexpr Arg1 bunch_num_i{t_bunch_num_i}; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i), + Arg2 const& j, Arg2 const& num_j) const noexcept + { + // assert(num_i >= bunch_num_i) + Arg1 i_inner = i % bunch_num_i; + Arg1 i_outer = i / bunch_num_i; + return i_inner + j * bunch_num_i + i_outer * num_j * bunch_num_i; + } +}; + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/RepeatView.hpp b/include/RAJA/util/RepeatView.hpp new file mode 100644 index 0000000000..618913f794 --- /dev/null +++ b/include/RAJA/util/RepeatView.hpp @@ -0,0 +1,141 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file for RAJA RepeatView constructs. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_REPEATVIEW_HPP +#define RAJA_REPEATVIEW_HPP + +#include +#include +#include + +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +/*! + * @brief A view of a single object repeated a certain number of times. + * + * Creates a view or container object given an object and length. + * Allows use of container interface functions if you want to repeat a + * single object. 
+ * + * For example: + * + * // Create a repeat view object for the int 2 repeated int_len times + * RepeatView int_repeated(2, int_len); + * + * // Use with RAJA for_each + * RAJA::for_each(int_repeated, [&](int val) { + * sum += val; + * }); + * + * Based on the std::ranges::repeat_view template. + * Differs in that it does not support: + * compile time extents + * unbounded extents + * + */ +template < typename T > +struct RepeatView +{ + struct iterator + { + using difference_type = std::ptrdiff_t; + using value_type = T; + using reference = value_type const&; + + iterator() = default; + + constexpr iterator(const T* base, size_t index) + : m_value(base), m_index(index) + { } + + constexpr reference operator*() const noexcept { return *m_value; } + constexpr reference operator[](difference_type index) const noexcept { return *(*this + index); } + + constexpr iterator& operator++() { ++m_index; return *this; } + constexpr iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; } + + constexpr iterator& operator--() { --m_index; return *this; } + constexpr iterator operator--(int) { auto tmp = *this; --(*this); return tmp; } + + constexpr iterator& operator+=(difference_type rhs) { m_index += rhs; return *this; } + constexpr iterator& operator-=(difference_type rhs) { m_index -= rhs; return *this; } + + friend constexpr iterator operator+(iterator lhs, difference_type rhs) + { lhs += rhs; return lhs; } + friend constexpr iterator operator+(difference_type lhs, iterator rhs) + { rhs += lhs; return rhs; } + + friend constexpr iterator operator-(iterator lhs, difference_type rhs) + { lhs -= rhs; return lhs; } + friend constexpr difference_type operator-(iterator const& lhs, iterator const& rhs) + { return static_cast(lhs.m_index) - static_cast(rhs.m_index); } + + friend constexpr bool operator==(iterator const& lhs, iterator const& rhs) + { return lhs.m_index == rhs.m_index; } + friend constexpr bool operator!=(iterator const& lhs, iterator const& rhs) + { return !(lhs == rhs); } + + friend constexpr bool operator<(iterator const& lhs, iterator const& rhs) + { return lhs.m_index < rhs.m_index; } + friend constexpr bool operator<=(iterator const& lhs, iterator const& rhs) + { return !(rhs < lhs); } + friend constexpr bool operator>(iterator const& lhs, iterator const& rhs) + { return rhs < lhs; } + friend constexpr bool operator>=(iterator const& lhs, iterator const& rhs) + { return !(lhs < rhs); } + + private: + const T* m_value = nullptr; + size_t m_index = 0; + }; + + RepeatView() = delete; + + constexpr RepeatView(T const& value, size_t bound) + : m_bound(bound), m_value(value) + { } + + constexpr RepeatView(T&& value, size_t bound) + : m_bound(bound), m_value(std::move(value)) + { } + + constexpr T const& front() const { return m_value; } + constexpr T const& back() const { return m_value; } + constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const { return m_value; } + + constexpr iterator begin() const { return iterator(&m_value, 0); } + constexpr iterator cbegin() const { return iterator(&m_value, 0); } + + constexpr iterator end() const { return iterator(&m_value, m_bound); } + constexpr iterator cend() const { return iterator(&m_value, m_bound); } + + constexpr explicit operator bool() const { return m_bound != 0; } + constexpr bool empty() const { return m_bound == 0; } + + constexpr size_t size() const { return m_bound; } + +private: + size_t m_bound = 0; + T m_value; +}; + +} // end namespace RAJA + +#endif /* RAJA_REPEATVIEW_HPP */ diff --git 
a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp index 1486207712..5cdc019259 100644 --- a/include/RAJA/util/TypeConvert.hpp +++ b/include/RAJA/util/TypeConvert.hpp @@ -26,6 +26,8 @@ #include "RAJA/util/macros.hpp" +#include + namespace RAJA { @@ -37,17 +39,13 @@ namespace util * Reinterpret any datatype as another datatype of the same size */ template -RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &val) +RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &a) { - static_assert(sizeof(A) == sizeof(B), "A and B must be same size"); - return reinterpret_cast(val); -} + static_assert(sizeof(A) == sizeof(B), "A and B must be the same size"); -template -RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A volatile const &val) -{ - static_assert(sizeof(A) == sizeof(B), "A and B must be same size"); - return reinterpret_cast(val); + B b; + memcpy(&b, &a, sizeof(A)); + return b; } diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp index b279ec29ff..25783b2a0a 100644 --- a/include/RAJA/util/for_each.hpp +++ b/include/RAJA/util/for_each.hpp @@ -37,6 +37,7 @@ namespace detail { // runtime loop applying func to each element in the range in order +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) @@ -49,6 +50,7 @@ UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) } // compile time expansion applying func to a each type in the list in order +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, UnaryFunc func) @@ -60,6 +62,20 @@ UnaryFunc for_each_type(camp::list const&, UnaryFunc func) return func; } +// compile time expansion applying func to a each type in the tuple in order +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq) +{ + using camp::get; + // braced init lists are evaluated in order + int seq_unused_array[] = {0, (func(get(std::forward(t))), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + } // namespace detail @@ -68,6 +84,7 @@ UnaryFunc for_each_type(camp::list const&, UnaryFunc func) using a sequential for loop in O(N) operations and O(1) extra memory see https://en.cppreference.com/w/cpp/algorithm/for_each */ +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE concepts::enable_if_t> @@ -83,6 +100,7 @@ concepts::enable_if_t> \brief Apply func to each type in the given list in order using a compile-time expansion in O(N) operations and O(1) extra memory */ +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) @@ -90,6 +108,19 @@ UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) return detail::for_each_type(c, std::move(func)); } +/*! 
+ \brief Apply func to each object in the given tuple or tuple like type in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func) +{ + return detail::for_each_tuple(std::forward(t), std::move(func), + camp::make_idx_seq_t>::value>{}); +} + } // namespace RAJA #endif diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index 55e90010d8..9ddb5bebb7 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -153,7 +153,8 @@ RAJA_HOST_DEVICE inline void RAJA_ABORT_OR_THROW(const char *str) { #if defined(__SYCL_DEVICE_ONLY__) - abort(); + //segfault here ran into linking problems + *((volatile char *)0) = 0; // write to address 0 #else printf ( "%s\n", str ); #if defined(RAJA_ENABLE_TARGET_OPENMP) && (_OPENMP >= 201511) diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp index 36c7cca1a0..66b0c9058c 100644 --- a/include/RAJA/util/math.hpp +++ b/include/RAJA/util/math.hpp @@ -70,6 +70,37 @@ constexpr T next_pow2(T n) noexcept return n; } +/*! + \brief "round down" to the largest power of 2 that is less than or equal to n + + For an integer n, + if n is negative, return 0 + else + if n is a power of 2, return n + else return the largest power of 2 that is less than n +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T prev_pow2(T n) noexcept +{ + if ( n < 0 ) return 0; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + return n - (n >> 1); +} + +/*! + \brief compute lhs mod rhs where lhs is non-negative and rhs is a power of 2 +*/ +template < typename L, typename R, + std::enable_if_t::value && std::is_integral::value>* = nullptr > +constexpr auto power_of_2_mod(L lhs, R rhs) noexcept +{ + return lhs & (rhs-R(1)); +} + } // namespace RAJA #endif diff --git a/include/RAJA/util/sycl_compat.hpp b/include/RAJA/util/sycl_compat.hpp new file mode 100644 index 0000000000..7754caa273 --- /dev/null +++ b/include/RAJA/util/sycl_compat.hpp @@ -0,0 +1,29 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file for handling different SYCL header include paths + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_sycl_compat_HPP +#define RAJA_util_sycl_compat_HPP + +#if (__INTEL_CLANG_COMPILER && __INTEL_CLANG_COMPILER < 20230000) +// older version, use legacy header locations +#include +#else +// SYCL 2020 standard header +#include +#endif + +#endif // RAJA_util_sycl_compat_HPP diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 7e331ef00e..310217bde5 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -956,6 +956,43 @@ struct AsIntegerArray } }; + +/*! + * \brief Assign a new value to an object and restore the object's previous + * value at the end of the current scope. 
+ */ +template +struct ScopedAssignment +{ + ScopedAssignment(T& val, T const& new_val) + : m_ref_to_val(val) + , m_prev_val(std::move(val)) + { + m_ref_to_val = new_val; + } + + ScopedAssignment(T& val, T&& new_val) + : m_ref_to_val(val) + , m_prev_val(std::move(val)) + { + m_ref_to_val = std::move(new_val); + } + + ScopedAssignment(ScopedAssignment const&) = delete; + ScopedAssignment(ScopedAssignment &&) = delete; + ScopedAssignment& operator=(ScopedAssignment const&) = delete; + ScopedAssignment& operator=(ScopedAssignment &&) = delete; + + ~ScopedAssignment() + { + m_ref_to_val = std::move(m_prev_val); + } + +private: + T& m_ref_to_val; + T m_prev_val; +}; + } // namespace detail } // namespace RAJA diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ec7c8a6408..15d27e64a0 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -26,6 +26,34 @@ spec=${SPEC:-""} module_list=${MODULE_LIST:-""} job_unique_id=${CI_JOB_ID:-""} use_dev_shm=${USE_DEV_SHM:-true} +spack_debug=${SPACK_DEBUG:-false} +debug_mode=${DEBUG_MODE:-false} + +# REGISTRY_TOKEN allows you to provide your own personal access token to the CI +# registry. Be sure to set the token with at least read access to the registry. +registry_token=${REGISTRY_TOKEN:-""} +ci_registry_user=${CI_REGISTRY_USER:-"${USER}"} +ci_registry_image=${CI_REGISTRY_IMAGE:-"czregistry.llnl.gov:5050/radiuss/raja"} +ci_registry_token=${CI_JOB_TOKEN:-"${registry_token}"} + +timed_message () +{ + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ $(date --rfc-3339=seconds) ~ ${1}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +} + +if [[ ${debug_mode} == true ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Debug mode:" + echo "~~~~~ - Spack debug mode." + echo "~~~~~ - Deactivated shared memory." + echo "~~~~~ - Do not push to buildcache." + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + use_dev_shm=false + spack_debug=true +fi if [[ -n ${module_list} ]] then @@ -49,27 +77,33 @@ then fi prefix="${prefix}-${job_unique_id}" - mkdir -p ${prefix} else # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree. prefix="$(pwd)/../spack-and-build-root" - mkdir -p ${prefix} +fi + +echo "Creating directory ${prefix}" +echo "project_dir: ${project_dir}" + +mkdir -p ${prefix} + +spack_cmd="${prefix}/spack/bin/spack" +spack_env_path="${prefix}/spack_env" +uberenv_cmd="./scripts/uberenv/uberenv.py" +if [[ ${spack_debug} == true ]] +then + spack_cmd="${spack_cmd} --debug --stacktrace" + uberenv_cmd="${uberenv_cmd} --spack-debug" fi # Dependencies -date -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ Build and test started" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]] then - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Building dependencies" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "Building dependencies" if [[ -z ${spec} ]] then - echo "SPEC is undefined, aborting..." + echo "[Error]: SPEC is undefined, aborting..." 
exit 1 fi @@ -83,15 +117,29 @@ then export SPACK_USER_CACHE_PATH="${spack_user_cache}" mkdir -p ${spack_user_cache} - ./scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + # generate cmake cache file with uberenv and radiuss spack package + timed_message "Spack setup and environment" + ${uberenv_cmd} --setup-and-env-only --spec="${spec}" ${prefix_opt} + + if [[ -n ${ci_registry_token} ]] + then + timed_message "GitLab registry as Spack Buildcache" + ${spack_cmd} -D ${spack_env_path} mirror add --unsigned --oci-username ${ci_registry_user} --oci-password ${ci_registry_token} gitlab_ci oci://${ci_registry_image} + fi + + timed_message "Spack build of dependencies" + ${uberenv_cmd} --skip-setup-and-env --spec="${spec}" ${prefix_opt} + if [[ -n ${ci_registry_token} && ${debug_mode} == false ]] + then + timed_message "Push dependencies to buildcache" + ${spack_cmd} -D ${spack_env_path} buildcache push --only dependencies gitlab_ci + fi + + timed_message "Dependencies built" fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Dependencies built" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -date -# Host config file +# Find cmake cache file (hostconfig) if [[ -z ${hostconfig} ]] then # If no host config file was provided, we assume it was generated. @@ -100,24 +148,24 @@ then if [[ ${#hostconfigs[@]} == 1 ]] then hostconfig_path=${hostconfigs[0]} - echo "Found host config file: ${hostconfig_path}" elif [[ ${#hostconfigs[@]} == 0 ]] then - echo "No result for: ${project_dir}/*.cmake" - echo "Spack generated host-config not found." + echo "[Error]: No result for: ${project_dir}/*.cmake" + echo "[Error]: Spack generated host-config not found." exit 1 else - echo "More than one result for: ${project_dir}/*.cmake" - echo "${hostconfigs[@]}" - echo "Please specify one with HOST_CONFIG variable" + echo "[Error]: More than one result for: ${project_dir}/*.cmake" + echo "[Error]: ${hostconfigs[@]}" + echo "[Error]: Please specify one with HOST_CONFIG variable" exit 1 fi else # Using provided host-config file. - hostconfig_path="${project_dir}/host-configs/${hostconfig}" + hostconfig_path="${project_dir}/${hostconfig}" fi hostconfig=$(basename ${hostconfig_path}) +echo "[Information]: Found hostconfig ${hostconfig_path}" # Build Directory # When using /dev/shm, we use prefix for both spack builds and source build, unless BUILD_ROOT was defined @@ -131,17 +179,15 @@ cmake_exe=`grep 'CMake executable' ${hostconfig_path} | cut -d ':' -f 2 | xargs` # Build if [[ "${option}" != "--deps-only" && "${option}" != "--test-only" ]] then - date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Prefix: ${prefix}" echo "~~~~~ Host-config: ${hostconfig_path}" echo "~~~~~ Build Dir: ${build_dir}" echo "~~~~~ Project Dir: ${project_dir}" echo "~~~~~ Install Dir: ${install_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Building RAJA" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "Cleaning working directory" # Map CPU core allocations declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["poodle"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) @@ -153,8 +199,8 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} - date - if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] + timed_message "Building RAJA" + if [[ "${truehostname}" == "tioga" ]] then module unload rocm fi @@ -164,28 +210,20 @@ then ${project_dir} if ! 
$cmake_exe --build . -j ${core_counts[$truehostname]} then - echo "[Error]: compilation failed, building with verbose output..." - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Running make VERBOSE=1" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "[Error]: Compilation failed, building with verbose output..." + timed_message "Re-building with --verbose" $cmake_exe --build . --verbose -j 1 else + timed_message "Installing" $cmake_exe --install . fi - date - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ RAJA built" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "RAJA built and installed" fi # Test if [[ "${option}" != "--build-only" ]] && grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - date - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Testing RAJA" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" if [[ ! -d ${build_dir} ]] then @@ -194,9 +232,8 @@ then cd ${build_dir} - date - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - date + timed_message "Testing RAJA" + ctest --output-on-failure --no-compress-output -T test -VV 2>&1 | tee tests_output.txt no_test_str="No tests were found!!!" if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] @@ -204,48 +241,40 @@ then echo "[Error]: No tests were found" && exit 1 fi - echo "Copying Testing xml reports for export" + timed_message "Preparing tests xml reports for export" tree Testing xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml mv junit.xml ${project_dir}/junit.xml if grep -q "Errors while running CTest" ./tests_output.txt then - echo "[Error]: failure(s) while running CTest" && exit 1 + echo "[Error]: Failure(s) while running CTest" && exit 1 fi if grep -q -i "ENABLE_HIP.*ON" ${hostconfig_path} then - echo "[Warning]: not testing install with HIP" + echo "[Warning]: Not testing install with HIP" else if [[ ! -d ${install_dir} ]] then - echo "[Error]: install directory not found : ${install_dir}" && exit 1 + echo "[Error]: Install directory not found : ${install_dir}" && exit 1 fi cd ${install_dir}/examples/RAJA/using-with-cmake mkdir build && cd build if ! $cmake_exe -C ../host-config.cmake ..; then - echo "[Error]: running $cmake_exe for using-with-cmake test" && exit 1 + echo "[Error]: Running $cmake_exe for using-with-cmake test" && exit 1 fi if ! 
make; then - echo "[Error]: running make for using-with-cmake test" && exit 1 + echo "[Error]: Running make for using-with-cmake test" && exit 1 fi fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ RAJA tests complete" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - date + timed_message "RAJA tests completed" fi -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ CLEAN UP" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +timed_message "Cleaning up" make clean -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ Build and test completed" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -date +timed_message "Build and test completed" diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 4d55ef1b3a..ae8ded8431 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -44,6 +44,7 @@ cmake \ -C ../host-configs/lc-builds/blueos/nvcc_clang_X.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DRAJA_ENABLE_NV_TOOLS_EXT=ON \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 6e1bb2af75..f7342e474c 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -63,6 +63,7 @@ cmake \ -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \ + -DRAJA_ENABLE_ROCTX=ON \ -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index a8d22367e0..54c09b5dcf 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 +Subproject commit 54c09b5dcf45decaac2b1e6d1048671cde17f7e5 diff --git a/scripts/uberenv b/scripts/uberenv index cf91883ef0..205672b8b2 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 +Subproject commit 205672b8b2520d7dc69acefe8738960cd5db0937 diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp index d077e8af8f..85ead614d9 100644 --- a/src/MemUtils_CUDA.cpp +++ b/src/MemUtils_CUDA.cpp @@ -42,10 +42,10 @@ namespace detail // //! State of the host code globally -cudaInfo g_status; +cudaStatusInfo g_status; //! State of the host code in this thread -cudaInfo tl_status; +cudaStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif diff --git a/src/MemUtils_HIP.cpp b/src/MemUtils_HIP.cpp index bf44264132..97bd82775e 100644 --- a/src/MemUtils_HIP.cpp +++ b/src/MemUtils_HIP.cpp @@ -42,10 +42,10 @@ namespace detail // //! State of the host code globally -hipInfo g_status; +hipStatusInfo g_status; //! 
State of the host code in this thread -hipInfo tl_status; +hipStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4aa1294d07..8f8e65be8f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,4 +17,4 @@ add_subdirectory(old-tests) add_subdirectory(install) -configure_file(${CMAKE_SOURCE_DIR}/test/CTestCustom.cmake ${CMAKE_BINARY_DIR}) +configure_file(${PROJECT_SOURCE_DIR}/test/CTestCustom.cmake ${CMAKE_BINARY_DIR}) diff --git a/test/functional/forall/CMakeLists.txt b/test/functional/forall/CMakeLists.txt index eb9cc5ad19..435f0bbfcb 100644 --- a/test/functional/forall/CMakeLists.txt +++ b/test/functional/forall/CMakeLists.txt @@ -37,6 +37,8 @@ add_subdirectory(reduce-basic) add_subdirectory(reduce-multiple-segment) add_subdirectory(reduce-multiple-indexset) +add_subdirectory(multi-reduce-basic) + add_subdirectory(resource-indexset) add_subdirectory(resource-segment) diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..4c7973b0a3 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -11,7 +11,6 @@ # Note: FORALL_ATOMIC_BACKENDS is defined in ../CMakeLists.txt # foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) - # Signed Tests configure_file( test-forall-atomic-basic.cpp.in test-forall-atomic-basic-${ATOMIC_BACKEND}.cpp ) raja_add_test( NAME test-forall-atomic-basic-${ATOMIC_BACKEND} @@ -19,12 +18,4 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) - # Unsigned Tests - configure_file( test-forall-atomic-basic-unsigned.cpp.in - test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.cpp ) - raja_add_test( NAME test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.cpp ) - - target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp deleted file mode 100644 index e318c3847f..0000000000 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp +++ /dev/null @@ -1,147 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// Header file containing basic functional tests for atomic operations with forall. 
-/// - -#ifndef __TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ -#define __TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ - -#include - -// segment multiplexer -template< typename IdxType, typename SegType > -struct RSMultiplexer {}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedRangeSegment > -{ - RAJA::TypedRangeSegment - makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) ) - { - return RAJA::TypedRangeSegment( 0, N ); - } -}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedRangeStrideSegment > -{ - RAJA::TypedRangeStrideSegment - makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) ) - { - return RAJA::TypedRangeStrideSegment( 0, N, 1 ); - } -}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedListSegment > -{ - RAJA::TypedListSegment - makeseg( IdxType N, camp::resources::Resource work_res ) - { - std::vector temp(N); - std::iota( std::begin(temp), std::end(temp), 0 ); - return RAJA::TypedListSegment( &temp[0], static_cast(temp.size()), work_res ); - } -}; -// end segment multiplexer - -template -void ForallAtomicBasicUnsignedTestImpl( IdxType seglimit ) -{ - // initialize an array - const int len = 2; - - camp::resources::Resource work_res{WORKINGRES()}; - - SegmentType seg = - RSMultiplexer().makeseg(seglimit, work_res); - - T * work_array; - T * test_array; - T * check_array; - - allocateForallTestData( len, - work_res, - &work_array, - &check_array, - &test_array ); - - work_res.memcpy( work_array, test_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - test_array[0] = (T)0; - test_array[1] = (T)0; - - work_res.memcpy( work_array, test_array, sizeof(T) * len ); - - RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IdxType RAJA_UNUSED_ARG(i)) { - RAJA::atomicInc(work_array + 0, (T)16); - RAJA::atomicDec(work_array + 1, (T)16); - }); - - work_res.memcpy( check_array, work_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - EXPECT_EQ((T)4, check_array[0]); - EXPECT_EQ((T)13, check_array[1]); - - deallocateForallTestData( work_res, - work_array, - check_array, - test_array ); -} - -TYPED_TEST_SUITE_P(ForallAtomicBasicUnsignedTest); -template -class ForallAtomicBasicUnsignedTest : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicBasicUnsignedTest, AtomicBasicUnsignedForall) -{ - using AExec = typename camp::at>::type; - using APol = typename camp::at>::type; - using ResType = typename camp::at>::type; - using IdxType = typename camp::at>::type; - using DType = typename camp::at>::type; - - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicUnsignedTest, - AtomicBasicUnsignedForall); - -#endif //__TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp index ab2f0a89e7..a9e2c5a9f8 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp @@ -61,7 +61,7 @@ template (0); + test_array[1] = static_cast(seglimit); + 
test_array[2] = static_cast(seglimit); + test_array[3] = static_cast(0); + test_array[4] = static_cast(0); + test_array[5] = static_cast(seglimit + 1); + test_array[6] = static_cast(seglimit); + test_array[7] = static_cast(0); + test_array[8] = static_cast(0); + test_array[9] = static_cast(0); + test_array[10] = static_cast(0); + test_array[11] = static_cast(0); + + work_res.memcpy(work_array, test_array, sizeof(T) * len); RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IdxType i) { - RAJA::atomicAdd(work_array + 0, (T)1); - RAJA::atomicSub(work_array + 1, (T)1); - RAJA::atomicMin(work_array + 2, (T)i); - RAJA::atomicMax(work_array + 3, (T)i); + RAJA::atomicAdd(work_array + 0, static_cast(1)); + RAJA::atomicSub(work_array + 1, static_cast(1)); + RAJA::atomicMin(work_array + 2, static_cast(i)); + RAJA::atomicMax(work_array + 3, static_cast(i)); RAJA::atomicInc(work_array + 4); RAJA::atomicDec(work_array + 5); - RAJA::atomicExchange(work_array + 6, (T)i); - RAJA::atomicCAS(work_array + 7, (T)i, (T)(i+1)); + RAJA::atomicExchange(work_array + 6, static_cast(i)); + RAJA::atomicCAS(work_array + 7, static_cast(i), static_cast(i+1)); + RAJA::atomicLoad(work_array + 8); + RAJA::atomicStore(work_array + 9, static_cast(1)); + RAJA::atomicInc(work_array + 10, static_cast(16)); + RAJA::atomicDec(work_array + 11, static_cast(16)); }); work_res.memcpy( check_array, work_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - EXPECT_EQ((T)seglimit, check_array[0]); - EXPECT_EQ((T)0, check_array[1]); - EXPECT_EQ((T)0, check_array[2]); - EXPECT_EQ((T)seglimit - 1, check_array[3]); - EXPECT_EQ((T)seglimit, check_array[4]); - EXPECT_EQ((T)1, check_array[5]); - EXPECT_LE((T)0, check_array[6]); - EXPECT_GT((T)seglimit, check_array[6]); - EXPECT_LT((T)0, check_array[7]); - EXPECT_GE((T)seglimit, check_array[7]); - - deallocateForallTestData( work_res, - work_array, - check_array, - test_array ); + work_res.wait(); + + EXPECT_EQ(static_cast(seglimit), check_array[0]); + EXPECT_EQ(static_cast(0), check_array[1]); + EXPECT_EQ(static_cast(0), check_array[2]); + EXPECT_EQ(static_cast(seglimit - 1), check_array[3]); + EXPECT_EQ(static_cast(seglimit), check_array[4]); + EXPECT_EQ(static_cast(1), check_array[5]); + EXPECT_LE(static_cast(0), check_array[6]); + EXPECT_GT(static_cast(seglimit), check_array[6]); + EXPECT_LT(static_cast(0), check_array[7]); + EXPECT_GE(static_cast(seglimit), check_array[7]); + EXPECT_EQ(static_cast(0), check_array[8]); + EXPECT_EQ(static_cast(1), check_array[9]); + EXPECT_EQ(static_cast(4), check_array[10]); + EXPECT_EQ(static_cast(13), check_array[11]); + + deallocateForallTestData(work_res, + work_array, + check_array, + test_array); } TYPED_TEST_SUITE_P(ForallAtomicBasicTest); @@ -154,13 +149,13 @@ TYPED_TEST_P(ForallAtomicBasicTest, AtomicBasicForall) ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); } REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicTest, diff --git a/test/functional/forall/multi-reduce-basic/CMakeLists.txt b/test/functional/forall/multi-reduce-basic/CMakeLists.txt new file mode 100644 index 0000000000..31ec872c0f --- /dev/null +++ b/test/functional/forall/multi-reduce-basic/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, 
Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM FORALL_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM FORALL_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: FORALL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${FORALL_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-forall-basic-multi-reduce.cpp.in + test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. +# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-forall-basic-multi-reduce.cpp.in +# test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in b/test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in similarity index 51% rename from test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in rename to test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in index 5c4ef05e5d..cd03109a9c 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in +++ b/test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in @@ -12,21 +12,29 @@ #include "RAJA_test-camp.hpp" #include "RAJA_test-index-types.hpp" -#include "RAJA_test-atomic-types.hpp" -#include "RAJA_test-atomicpol.hpp" - -#include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + // // Header for tests in ./tests directory // // Note: CMake adds ./tests as an include dir for these tests. // -#include "test-forall-atomic-basic-unsigned.hpp" +#include "test-forall-basic-MultiReduce.hpp" // -// These tests exercise only one index type. 
We parameterize here to +// Data types for core reduction basic tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. We parameterize here to // make it easier to expand types in the future if needed. // using TestIdxTypeList = camp::list< RAJA::Index_type >; @@ -34,17 +42,17 @@ using TestIdxTypeList = camp::list< RAJA::Index_type >; // // Cartesian product of types used in parameterized tests // -using @ATOMIC_BACKEND@ForallAtomicBasicUnsignedTypes = - Test< camp::cartesian_product<@ATOMIC_BACKEND@ForallAtomicExecPols, - @ATOMIC_BACKEND@AtomicPols, - @ATOMIC_BACKEND@ResourceList, - TestIdxTypeList, - AtomicDataUnsignedTypeList > >::Types; +using @BACKEND@ForallMultiReduceBasicTypes = + Test< camp::cartesian_product>::Types; // // Instantiate parameterized test // - -INSTANTIATE_TYPED_TEST_SUITE_P(@ATOMIC_BACKEND@, - ForallAtomicBasicUnsignedTest, - @ATOMIC_BACKEND@ForallAtomicBasicUnsignedTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + ForallMultiReduceBasicTest, + @BACKEND@ForallMultiReduceBasicTypes); diff --git a/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp new file mode 100644 index 0000000000..7c187464e8 --- /dev/null +++ b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp @@ -0,0 +1,299 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_FORALL_BASIC_REDUCESUM_HPP__ +#define __TEST_FORALL_BASIC_REDUCESUM_HPP__ + +#include +#include +#include +#include +#include +#include + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +ForallMultiReduceBasicTestImpl(const SEG_TYPE&, + const Container&, + const std::vector&, + camp::resources::Resource, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg, + const Container& multi_init, + const std::vector& seg_idx, + camp::resources::Resource working_res, + RandomGenerator& rngen) +{ + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + const IDX_TYPE idx_range = seg_idx[seg_idx.size() - 1] + 1; + const IDX_TYPE idx_len = static_cast( seg_idx.size() ); + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE i = 0; i < idx_len; ++i) { + IDX_TYPE idx = seg_idx[i]; + test_range[idx] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[idx+1] = data_len; + } + + 
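// At this point test_range holds CSR-style offsets: for each index idx visited by
// the segment, the half-open slice [test_range[idx], test_range[idx+1]) of the data
// and bin arrays belongs to that iterate (a random amount of work, between 0 and
// num_bins entries per iterate). Indices the segment never visits keep the
// ~IDX_TYPE(0) sentinel and are not read by the kernels below.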
allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + 
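// Note on the consistency block above: the std::conditional_t only keeps the
// integral instantiations of this function compilable; the re-randomization with
// real-valued data is reached solely for floating-point DATA_TYPEs, where the
// combine order can change the result and run-to-run consistency is worth checking.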
deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest); +template +class ForallMultiReduceBasicTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallMultiReduceBasicTest, MultiReduceBasicForall) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + camp::resources::Resource working_res{WORKING_RES::get_default()}; + + std::vector seg_idx; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + RAJA::TypedRangeSegment r1( 0, 28 ); + RAJA::getIndices(seg_idx, r1); + ForallMultiReduceBasicTestImpl( + r1, container, seg_idx, working_res, rngen); + + seg_idx.clear(); + RAJA::TypedRangeSegment r3( 3, 2060 ); + RAJA::getIndices(seg_idx, r3); + ForallMultiReduceBasicTestImpl( + r3, container, seg_idx, working_res, rngen); + + // Range-stride segment test + seg_idx.clear(); + RAJA::TypedRangeStrideSegment r5( 3, 1029, 3 ); + RAJA::getIndices(seg_idx, r5); + ForallMultiReduceBasicTestImpl( + r5, container, seg_idx, working_res, rngen); + + // List segment test + seg_idx.clear(); + IDX_TYPE last = 10567; + std::uniform_int_distribution dist(0, last-1); + for (IDX_TYPE i = 0; i < last; ++i) { + IDX_TYPE randval = dist(rngen); + if ( i < randval ) { + seg_idx.push_back(i); + } + } + RAJA::TypedListSegment l1( &seg_idx[0], seg_idx.size(), + working_res ); + ForallMultiReduceBasicTestImpl( + l1, container, seg_idx, working_res, rngen); + } +} + +REGISTER_TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest, + MultiReduceBasicForall); + +#endif // __TEST_FORALL_BASIC_REDUCESUM_HPP__ diff --git a/test/functional/forall/reduce-basic/CMakeLists.txt b/test/functional/forall/reduce-basic/CMakeLists.txt index 2537ea1ff9..42f03f04c8 100644 --- a/test/functional/forall/reduce-basic/CMakeLists.txt +++ b/test/functional/forall/reduce-basic/CMakeLists.txt @@ -22,16 +22,6 @@ if(RAJA_ENABLE_TARGET_OPENMP) endif() endif() -# -# If building SYCL tests, remove the back-end from -# from the list of tests to generate here for the -# expt-reduce tests. 
-# -if(RAJA_ENABLE_SYCL) - list(REMOVE_ITEM REDUCETYPES ReduceMaxLoc) - list(REMOVE_ITEM REDUCETYPES ReduceMinLoc) -endif() - # # Generate core reduction tests for each enabled RAJA back-end diff --git a/test/functional/kernel/CMakeLists.txt b/test/functional/kernel/CMakeLists.txt index cd577a45b4..76771724c9 100644 --- a/test/functional/kernel/CMakeLists.txt +++ b/test/functional/kernel/CMakeLists.txt @@ -37,6 +37,8 @@ add_subdirectory(conditional-fission-fusion-loop) add_subdirectory(hyperplane) +add_subdirectory(multi-reduce-nested) + add_subdirectory(nested-loop) add_subdirectory(nested-loop-reducesum) diff --git a/test/functional/kernel/hyperplane/CMakeLists.txt b/test/functional/kernel/hyperplane/CMakeLists.txt index 2e74129160..c01c9c2231 100644 --- a/test/functional/kernel/hyperplane/CMakeLists.txt +++ b/test/functional/kernel/hyperplane/CMakeLists.txt @@ -13,7 +13,7 @@ set(TESTTYPES 2D 3D) foreach( BACKEND ${KERNEL_BACKENDS} ) foreach( TEST_TYPE ${TESTTYPES} ) # Removing Sycl backend, implementation of Hyperplane does not exist - if( NOT ((BACKEND STREQUAL "Sycl")) ) + if( NOT ((BACKEND STREQUAL "Sycl")) AND NOT ((BACKEND STREQUAL "OpenMPTarget")) ) configure_file( test-kernel-hyperplane-${TEST_TYPE}.cpp.in test-kernel-hyperplane-${TEST_TYPE}-${BACKEND}.cpp ) raja_add_test( NAME test-kernel-hyperplane-${TEST_TYPE}-${BACKEND} diff --git a/test/functional/kernel/multi-reduce-nested/CMakeLists.txt b/test/functional/kernel/multi-reduce-nested/CMakeLists.txt new file mode 100644 index 0000000000..9efda7d133 --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM KERNEL_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM KERNEL_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${KERNEL_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-kernel-nested-multi-reduce.cpp.in + test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. 
+# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-kernel-nested-multi-reduce.cpp.in +# test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in b/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in new file mode 100644 index 0000000000..6816bb6ad7 --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in @@ -0,0 +1,123 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-kernel-nested-loop-types.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-kernel-nested-MultiReduce.hpp" + +// +// Data types for core reduction nested tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. We parameterize here to +// make it easier to expand types in the future if needed. 
+// +using TestIdxTypeList = camp::list< RAJA::Index_type >; + + +using SequentialKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData + >; + +#if defined(RAJA_ENABLE_OPENMP) + +using OpenMPKernelNestedLoopExecPols = camp::list< + + // Collapse Exec Pols + NestedLoopData, + + // Depth 3 Exec Pols + NestedLoopData + >; + +#endif // RAJA_ENABLE_OPENMP + + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +using OpenMPTargetKernelNestedLoopExecPols = camp::list< + + // Collapse Exec Pols + NestedLoopData, + + // Depth 3 Exec Pols + NestedLoopData, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_TARGET_OPENMP + + +#if defined(RAJA_ENABLE_CUDA) +using CudaKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, RAJA::cuda_global_size_y_direct<16>, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_CUDA + +#if defined(RAJA_ENABLE_HIP) + +using HipKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, RAJA::hip_global_size_y_direct<8>, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_HIP + +#if defined(RAJA_ENABLE_SYCL) + +using SyclKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, + NestedLoopData + >; + +#endif // RAJA_ENABLE_SYCL + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@KernelMultiReduceNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + KernelMultiReduceNestedTest, + @BACKEND@KernelMultiReduceNestedTypes); diff --git a/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp new file mode 100644 index 0000000000..30c102684b --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp @@ -0,0 +1,361 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ +#define __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ + +#include +#include +#include +#include +#include +#include + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE&, + const Container&, + WORKING_RES, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments, + const Container& multi_init, + WORKING_RES working_res, + RandomGenerator& rngen) +{ + using RAJA::get; + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE dimi = begin_si[distance_si-1] + 1; + IDX_TYPE dimj = begin_sj[distance_sj-1] + 1; + IDX_TYPE dimk = begin_sk[distance_sk-1] + 1; + + const IDX_TYPE idx_range = dimi * dimj * dimk; + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE k : sk) { + for (IDX_TYPE j : sj) { + for (IDX_TYPE i : si) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + test_range[ii] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[ii+1] = data_len; + } + } + } + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for 
(IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(KernelMultiReduceNestedTest); +template +class KernelMultiReduceNestedTest : public ::testing::Test +{ +}; + +// +// +// Defining the Kernel Loop structure for MultiReduce Nested Loop Tests. 
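// For orientation, the depth-3 specialization defined below produces a kernel
// policy of this general shape (sketched here with sequential execution at every
// level; the actual per-level execution policies come from the NestedLoopData
// entries in the test .cpp.in sources):
//
//   using SeqDepth3Policy = RAJA::KernelPolicy<
//     RAJA::statement::For<0, RAJA::seq_exec,        // k
//       RAJA::statement::For<1, RAJA::seq_exec,      // j
//         RAJA::statement::For<2, RAJA::seq_exec,    // i
//           RAJA::statement::Lambda<0>
//         >
//       >
//     >
//   >;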
+// +// +template +struct MultiReduceNestedLoopExec; + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::For<0, typename camp::at>::type, + RAJA::statement::For<1, typename camp::at>::type, + RAJA::statement::For<2, typename camp::at>::type, + RAJA::statement::Lambda<0> + > + > + > + >; +}; + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::Collapse< typename camp::at>::type, + RAJA::ArgList<0,1,2>, + RAJA::statement::Lambda<0> + > + >; +}; + +#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL) + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::DEVICE_KERNEL< + RAJA::statement::For<0, typename camp::at>::type, + RAJA::statement::For<1, typename camp::at>::type, + RAJA::statement::For<2, typename camp::at>::type, + RAJA::statement::Lambda<0> + > + > + > + > // end DEVICE_KERNEL + >; +}; + +#endif // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP or RAJA_ENABLE_SYCL + +TYPED_TEST_P(KernelMultiReduceNestedTest, MultiReduceNestedKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POL_DATA = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + using LOOP_TYPE = typename EXEC_POL_DATA::LoopType; + using LOOP_POLS = typename EXEC_POL_DATA::type; + using EXEC_POLICY = typename MultiReduceNestedLoopExec::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + WORKING_RES working_res{WORKING_RES::get_default()}; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment( 0, 2 ), + RAJA::TypedRangeSegment( 0, 7 ), + RAJA::TypedRangeSegment( 0, 3 )); + KernelMultiReduceNestedTestImpl( + s1, container, working_res, rngen); + + auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment( 2, 35 ), + RAJA::TypedRangeSegment( 0, 19 ), + RAJA::TypedRangeSegment( 3, 13 )); + KernelMultiReduceNestedTestImpl( + s2, container, working_res, rngen); + + // Range-stride segment tests + auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment( 0, 6, 2 ), + RAJA::TypedRangeStrideSegment( 1, 38, 3 ), + RAJA::TypedRangeStrideSegment( 5, 17, 1 )); + KernelMultiReduceNestedTestImpl( + s3, container, working_res, rngen); + + } +} + +REGISTER_TYPED_TEST_SUITE_P(KernelMultiReduceNestedTest, + MultiReduceNestedKernel); + +#endif // __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt b/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt index 482a78297d..adc04fb1bc 100644 --- a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt +++ b/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt @@ -17,16 +17,19 @@ set(TILESIZES 8 32) # Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${KERNEL_BACKENDS} ) - foreach( TESTTYPE ${TESTTYPES} ) - foreach( TILESIZE ${TILESIZES} ) - configure_file( 
test-kernel-single-loop-tile-count.cpp.in - test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - raja_add_test( NAME test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - target_include_directories(test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + # using omp target crashes the compiler with this one + if( NOT ((BACKEND STREQUAL "OpenMPTarget")) ) + foreach( TESTTYPE ${TESTTYPES} ) + foreach( TILESIZE ${TILESIZES} ) + configure_file( test-kernel-single-loop-tile-count.cpp.in + test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) + raja_add_test( NAME test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) + target_include_directories(test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() endforeach() - endforeach() + endif() endforeach() unset( TILESIZES ) diff --git a/test/functional/kernel/tile-variants/CMakeLists.txt b/test/functional/kernel/tile-variants/CMakeLists.txt index 02d5aa2fd2..ac5ba913e9 100644 --- a/test/functional/kernel/tile-variants/CMakeLists.txt +++ b/test/functional/kernel/tile-variants/CMakeLists.txt @@ -12,13 +12,16 @@ set(TILETYPES Fixed2D Fixed2DSum Fixed2DMinMax) foreach( TILE_BACKEND ${KERNEL_BACKENDS} ) foreach( TILE_TYPE ${TILETYPES} ) - configure_file( test-kernel-tilefixed.cpp.in - test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) - raja_add_test( NAME test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) + # OpenMPTarget crashes the xl compiler when building this test... + if( NOT((TILE_BACKEND STREQUAL "OpenMPTarget")) ) + configure_file( test-kernel-tilefixed.cpp.in + test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) + raja_add_test( NAME test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) - target_include_directories(test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + target_include_directories(test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endif() endforeach() endforeach() diff --git a/test/functional/launch/CMakeLists.txt b/test/functional/launch/CMakeLists.txt index 3e83383833..a8fcdfd8ce 100644 --- a/test/functional/launch/CMakeLists.txt +++ b/test/functional/launch/CMakeLists.txt @@ -26,6 +26,8 @@ endif() add_subdirectory(run-time-switch) #Adapted from forall test +add_subdirectory(multi-reduce-nested) + add_subdirectory(reduce-basic) add_subdirectory(reduce-params) diff --git a/test/functional/launch/multi-reduce-nested/CMakeLists.txt b/test/functional/launch/multi-reduce-nested/CMakeLists.txt new file mode 100644 index 0000000000..f5a916344d --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM LAUNCH_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM LAUNCH_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${LAUNCH_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-launch-nested-multi-reduce.cpp.in + test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. +# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-launch-nested-multi-reduce.cpp.in +# test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in b/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in new file mode 100644 index 0000000000..df097a896f --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in @@ -0,0 +1,58 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-launch-direct-teams-threads-3D-execpol.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-launch-nested-MultiReduce.hpp" + +// +// Data types for core reduction nested tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. 
We parameterize here to +// make it easier to expand types in the future if needed. +// +using TestIdxTypeList = camp::list< RAJA::Index_type >; + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@LaunchMultiReduceNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + LaunchMultiReduceNestedTest, + @BACKEND@LaunchMultiReduceNestedTypes); diff --git a/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp new file mode 100644 index 0000000000..867b826df3 --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp @@ -0,0 +1,375 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ +#define __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ + +#include +#include +#include +#include +#include +#include + + +// +// +// Defining the Launch Loop structure for MultiReduce Nested Loop Tests. +// +// +template +void Launch(const SEGMENTS_TYPE& segments, + Lambda&& lambda) +{ + using RAJA::get; + + using LAUNCH_POLICY = typename camp::at>::type; + + using TEAM_Z_POLICY = typename camp::at>::type; + using TEAM_Y_POLICY = typename camp::at>::type; + using TEAM_X_POLICY = typename camp::at>::type; + + using THREAD_Z_POLICY = typename camp::at>::type; + using THREAD_Y_POLICY = typename camp::at>::type; + using THREAD_X_POLICY = typename camp::at>::type; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE threads_i = 16; + IDX_TYPE threads_j = 4; + IDX_TYPE threads_k = 4; + + IDX_TYPE blocks_i = RAJA_DIVIDE_CEILING_INT(distance_si, threads_i); + IDX_TYPE blocks_j = RAJA_DIVIDE_CEILING_INT(distance_sj, threads_j); + IDX_TYPE blocks_k = RAJA_DIVIDE_CEILING_INT(distance_sk, threads_k); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(blocks_i, blocks_j, blocks_k), + RAJA::Threads(threads_i, threads_j,threads_k)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_k), [&](IDX_TYPE bk) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_j), [&](IDX_TYPE bj) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_i), [&](IDX_TYPE bi) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_k), [&](IDX_TYPE tk) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_j), [&](IDX_TYPE tj) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_i), [&](IDX_TYPE ti) { + + IDX_TYPE i = ti + threads_i * bi; + IDX_TYPE j = tj + threads_j * bj; + IDX_TYPE k = tk + threads_k * bk; + + if (i < distance_si && j < distance_sj && k < distance_sk) { + lambda(begin_sk[k], begin_sj[j], begin_si[i]); + } + }); + }); + }); + + }); + }); + }); + + }); +} + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE&, + const Container&, + 
WORKING_RES, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments, + const Container& multi_init, + WORKING_RES working_res, + RandomGenerator& rngen) +{ + using RAJA::get; + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE dimi = begin_si[distance_si-1] + 1; + IDX_TYPE dimj = begin_sj[distance_sj-1] + 1; + IDX_TYPE dimk = begin_sk[distance_sk-1] + 1; + + const IDX_TYPE idx_range = dimi * dimj * dimk; + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE k : sk) { + for (IDX_TYPE j : sj) { + for (IDX_TYPE i : si) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + test_range[ii] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[ii+1] = data_len; + } + } + } + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, 
ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(LaunchMultiReduceNestedTest); +template +class LaunchMultiReduceNestedTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(LaunchMultiReduceNestedTest, MultiReduceNestedLaunch) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POL_DATA = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + WORKING_RES working_res{WORKING_RES::get_default()}; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment( 0, 2 ), + RAJA::TypedRangeSegment( 0, 7 ), + RAJA::TypedRangeSegment( 0, 3 )); + LaunchMultiReduceNestedTestImpl( + s1, container, working_res, rngen); + + auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment( 2, 35 ), + RAJA::TypedRangeSegment( 0, 19 ), + RAJA::TypedRangeSegment( 3, 13 )); + 
LaunchMultiReduceNestedTestImpl( + s2, container, working_res, rngen); + + // Range-stride segment tests + auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment( 0, 6, 2 ), + RAJA::TypedRangeStrideSegment( 1, 38, 3 ), + RAJA::TypedRangeStrideSegment( 5, 17, 1 )); + LaunchMultiReduceNestedTestImpl( + s3, container, working_res, rngen); + + } +} + +REGISTER_TYPED_TEST_SUITE_P(LaunchMultiReduceNestedTest, + MultiReduceNestedLaunch); + +#endif // __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ diff --git a/test/functional/launch/reduce-params/CMakeLists.txt b/test/functional/launch/reduce-params/CMakeLists.txt index 630f78eb9b..42135b265c 100644 --- a/test/functional/launch/reduce-params/CMakeLists.txt +++ b/test/functional/launch/reduce-params/CMakeLists.txt @@ -20,7 +20,6 @@ set(DATATYPES CoreReductionDataTypeList) # Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${LAUNCH_BACKENDS} ) - if( NOT (BACKEND STREQUAL "Sycl")) foreach( REDUCETYPE ${REDUCETYPES} ) configure_file( test-launch-basic-param-expt-reduce.cpp.in test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.cpp) @@ -30,7 +29,6 @@ foreach( BACKEND ${LAUNCH_BACKENDS} ) target_include_directories(test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - endif() endforeach() unset( DATATYPES ) @@ -52,7 +50,6 @@ set(DATATYPES BitwiseReductionDataTypeList) # Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${LAUNCH_BACKENDS} ) - if( NOT (BACKEND STREQUAL "Sycl")) foreach( REDUCETYPE ${REDUCETYPES} ) configure_file( test-launch-basic-param-expt-reduce.cpp.in test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.cpp ) @@ -62,7 +59,6 @@ foreach( BACKEND ${LAUNCH_BACKENDS} ) target_include_directories(test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - endif() endforeach() unset( DATATYPES ) diff --git a/test/include/RAJA_test-atomic-types.hpp b/test/include/RAJA_test-atomic-types.hpp index 9cf4c21355..90a1be4024 100644 --- a/test/include/RAJA_test-atomic-types.hpp +++ b/test/include/RAJA_test-atomic-types.hpp @@ -6,7 +6,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // -// Types and type lists for loop indexing used throughout RAJA tests. +// Type list for testing RAJA atomics. // // Note that in the type lists, a subset of types is used by default. // For more comprehensive type testing define the macro RAJA_TEST_EXHAUSTIVE. @@ -25,18 +25,11 @@ using AtomicDataTypeList = camp::list< RAJA::Index_type, int, #if defined(RAJA_TEST_EXHAUSTIVE) - unsigned, + unsigned int, long long, unsigned long long, float, #endif double >; -using AtomicDataUnsignedTypeList = - camp::list< unsigned, -#if defined(RAJA_TEST_EXHAUSTIVE) - unsigned long, -#endif - unsigned long long>; - #endif // __RAJA_test_atomic_types_HPP__ diff --git a/test/include/RAJA_test-multi-reduce-abstractor.hpp b/test/include/RAJA_test-multi-reduce-abstractor.hpp new file mode 100644 index 0000000000..2c5412893c --- /dev/null +++ b/test/include/RAJA_test-multi-reduce-abstractor.hpp @@ -0,0 +1,170 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Reduction policies used for reduction tests +// + +#ifndef __RAJA_test_multi_reduce_abstractor_HPP__ +#define __RAJA_test_multi_reduce_abstractor_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + +// +// Get the identity value for the operation used by the given multi reducer +// +template < typename MultiReducer > +inline auto get_op_identity(MultiReducer const& RAJA_UNUSED_ARG(multi_reduce)) +{ + return MultiReducer::MultiReduceOp::identity(); +} + + +struct SumAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static bool consistent(Reducer const&) + { + return RAJA::policy_has_trait::value || + !std::is_floating_point::value; + } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceSum; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs + rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) += rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct MinAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceMin; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs > rhs) ? rhs : lhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs).min(rhs); } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct MaxAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceMax; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs < rhs) ? 
rhs : lhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs).max(rhs); } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct BitAndAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_integral::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceBitAnd; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs & rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) &= rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct BitOrAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_integral::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceBitOr; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs | rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) |= rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + + +// Sequential reduction policy types +using ReduceSumAbstractors = camp::list< SumAbstractor >; +using ReduceMinAbstractors = camp::list< MinAbstractor >; +using ReduceMaxAbstractors = camp::list< MaxAbstractor >; +using ReduceBitAndAbstractors = camp::list< BitAndAbstractor >; +using ReduceBitOrAbstractors = camp::list< BitOrAbstractor >; + +#endif // __RAJA_test_multi_reduce_abstractor_HPP__ diff --git a/test/include/RAJA_test-multi-reducepol.hpp b/test/include/RAJA_test-multi-reducepol.hpp new file mode 100644 index 0000000000..e024ef70aa --- /dev/null +++ b/test/include/RAJA_test-multi-reducepol.hpp @@ -0,0 +1,43 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
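As a rough illustration of how the abstractors above let test code stay operation-agnostic, here is a host-only sketch; the helper name reference_reduce and the vals/init arguments are hypothetical and not part of the tests, which only rely on the combine/reduce/identity hooks shown in the header.

#include <vector>
#include "RAJA_test-multi-reduce-abstractor.hpp"

// Accumulate a reference result for one bin using only the abstractor's
// generic combine hook, so the same helper covers sum, min, max, and the
// bitwise operations.
template < typename Abstractor, typename T >
T reference_reduce(std::vector<T> const& vals, T init)
{
  T ref = init;
  for (T v : vals) {
    ref = Abstractor::combine(ref, v);   // e.g. lhs + rhs for SumAbstractor
  }
  return ref;
}

// Usage sketch: reference_reduce<SumAbstractor>(vals, 0.0);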
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Reduction policies used for reduction tests +// + +#ifndef __RAJA_test_multi_reducepol_HPP__ +#define __RAJA_test_multi_reducepol_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + +// Sequential reduction policy types +using SequentialMultiReducePols = camp::list< RAJA::seq_multi_reduce >; + +#if defined(RAJA_ENABLE_OPENMP) +using OpenMPMultiReducePols = + camp::list< RAJA::omp_multi_reduce, + RAJA::omp_multi_reduce_ordered >; +#endif + +#if defined(RAJA_ENABLE_CUDA) +using CudaMultiReducePols = + camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::cuda_multi_reduce_atomic_global_host_init, + RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#if defined(RAJA_ENABLE_HIP) +using HipMultiReducePols = + camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::hip_multi_reduce_atomic_global_host_init, + RAJA::hip_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#endif // __RAJA_test_multi_reducepol_HPP__ diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index cb82636e2e..20b3015c5d 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(index) add_subdirectory(internal) add_subdirectory(util) add_subdirectory(reducer) +add_subdirectory(multi_reducer) add_subdirectory(resource) add_subdirectory(atomic) add_subdirectory(view-layout) diff --git a/test/unit/multi_reducer/CMakeLists.txt b/test/unit/multi_reducer/CMakeLists.txt new file mode 100644 index 0000000000..6453fa66cb --- /dev/null +++ b/test/unit/multi_reducer/CMakeLists.txt @@ -0,0 +1,60 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
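For readers unfamiliar with the multi-reduce API these policy lists feed, a minimal sketch of the usage pattern the tests exercise, using the sequential policy; the histogram_example function and the bins/values arrays are made up for illustration, while the MultiReduceSum operations (operator[], get, reset) are the ones exercised by the unit tests below.

#include "RAJA/RAJA.hpp"

void histogram_example(int const* bins, double const* values, int N, int num_bins)
{
  // One reducer object manages num_bins independent sum reductions.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> sums(num_bins, 0.0);

  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
      [=](RAJA::Index_type i) {
    sums[bins[i]] += values[i];          // accumulate into the bin for index i
  });

  for (int b = 0; b < num_bins; ++b) {
    double total = sums.get(b);          // read back each bin's result
    (void)total;
  }
  sums.reset();                          // reuse the reducer for another pass
}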
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# macro that generates test file and build target for each backend +# this must be a macro or the linker variable set by FindHIP won't be set in +# the right scope and linking will fail with a weird error from +# hipcc_cmake_linker_helper because it expects the path to hipcc as the first +# argument +# +macro( buildunitmultireducetest TESTNAME BACKENDS ) + foreach( BACKEND ${BACKENDS} ) + + configure_file( test-multi-reducer-${TESTNAME}.cpp.in + test-multi-reducer-${TESTNAME}-${BACKEND}.cpp ) + + raja_add_test( NAME test-multi-reducer-${TESTNAME}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-multi-reducer-${TESTNAME}-${BACKEND}.cpp ) + + target_include_directories( test-multi-reducer-${TESTNAME}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests ) + + endforeach() +endmacro() + + +set(BACKENDS Sequential) + +# only need to test WorkStorage once +set(WorkStorage_BACKENDS Sequential) + +if(RAJA_ENABLE_OPENMP) + list(APPEND BACKENDS OpenMP) +endif() + +if(RAJA_ENABLE_TARGET_OPENMP) + list(APPEND BACKENDS OpenMPTarget) +endif() + +if(RAJA_ENABLE_CUDA) + list(APPEND BACKENDS Cuda) +endif() + +if(RAJA_ENABLE_HIP) + list(APPEND BACKENDS Hip) +endif() + + + +buildunitmultireducetest(constructors "${BACKENDS}") + +buildunitmultireducetest(reset "${BACKENDS}") + + + +unset(BACKENDS) diff --git a/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in b/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in new file mode 100644 index 0000000000..f7bf87e092 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for RAJA multi-reducer constructors and initialization. +/// + +#include "test-multi-reducer-constructors.hpp" + +using @BACKEND@MultiReducerConstructorTypes = + Test< camp::cartesian_product< @BACKEND@MultiReducerPolicyList, + DataTypeList > >::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@BasicTest, + MultiReducerBasicConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@InitTest, + MultiReducerSingleInitConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@InitTest, + MultiReducerContainerInitConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + + diff --git a/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in b/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in new file mode 100644 index 0000000000..ea033161e1 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for RAJA multi-reducer reset. 
+/// + +#include "test-multi-reducer-reset.hpp" + +using @BACKEND@MultiReducerResetTypes = + Test< camp::cartesian_product< @BACKEND@MultiReducerPolicyList, + DataTypeList, + @BACKEND@UnitTestPolicyList > >::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerBasicResetUnitTest, + @BACKEND@MultiReducerResetTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerSingleResetUnitTest, + @BACKEND@MultiReducerResetTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerContainerResetUnitTest, + @BACKEND@MultiReducerResetTypes); + diff --git a/test/unit/multi_reducer/test-multi-reducer.hpp b/test/unit/multi_reducer/test-multi-reducer.hpp new file mode 100644 index 0000000000..a1f94e0895 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer.hpp @@ -0,0 +1,47 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_MULTI_REDUCER_UTILS_HPP__ +#define __TEST_MULTI_REDUCER_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +#include "RAJA_unit-test-forone.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + +// +// Data types +// +using DataTypeList = camp::list< int, + float, + double >; + +using SequentialMultiReducerPolicyList = camp::list< RAJA::seq_multi_reduce >; + +#if defined(RAJA_ENABLE_OPENMP) +using OpenMPMultiReducerPolicyList = camp::list< RAJA::omp_multi_reduce, + RAJA::omp_multi_reduce_ordered >; +#endif + +#if defined(RAJA_ENABLE_CUDA) +using CudaMultiReducerPolicyList = + camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::cuda_multi_reduce_atomic_global_host_init, + RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#if defined(RAJA_ENABLE_HIP) +using HipMultiReducerPolicyList = + camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::hip_multi_reduce_atomic_global_host_init, + RAJA::hip_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#endif // __TEST_MULTI_REDUCER_UTILS_HPP__ diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp new file mode 100644 index 0000000000..1104ae1e28 --- /dev/null +++ b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp @@ -0,0 +1,282 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing tests for RAJA multi reducer constructors and initialization. 
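For illustration, with BACKEND set to Sequential the configured constructor source above amounts to roughly the following; this is only a sketch of what configure_file substitutes for @BACKEND@, not an additional file in the patch.

#include "test-multi-reducer-constructors.hpp"

using SequentialMultiReducerConstructorTypes =
    Test< camp::cartesian_product< SequentialMultiReducerPolicyList,
                                   DataTypeList > >::Types;

INSTANTIATE_TYPED_TEST_SUITE_P(SequentialBasicTest,
                               MultiReducerBasicConstructorUnitTest,
                               SequentialMultiReducerConstructorTypes);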
+/// + +#ifndef __TEST_MULTI_REDUCER_CONSTRUCTOR__ +#define __TEST_MULTI_REDUCER_CONSTRUCTOR__ + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "../test-multi-reducer.hpp" + +#include +#include +#include + +template +class MultiReducerBasicConstructorUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerSingleInitConstructorUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerContainerInitConstructorUnitTest : public ::testing::Test +{ +}; + +TYPED_TEST_SUITE_P(MultiReducerBasicConstructorUnitTest); +TYPED_TEST_SUITE_P(MultiReducerSingleInitConstructorUnitTest); +TYPED_TEST_SUITE_P(MultiReducerContainerInitConstructorUnitTest); + + +template +void testBasicMultiReducerConstructorRegular(size_t num_bins) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum)); + ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min)); + ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max)); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum)); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min)); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max)); + } +} + +template +void testBasicMultiReducerConstructorBitwise(size_t num_bins) +{ + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); + + ASSERT_EQ(multi_reduce_or.size(), num_bins); + ASSERT_EQ(multi_reduce_and.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or)); + ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and)); + + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or)); + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and)); + } +} + +template ::value>* = nullptr> +void testBasicMultiReducerConstructor(size_t num_bins) +{ + testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins); + testBasicMultiReducerConstructorBitwise< MultiReducePolicy, NumericType >(num_bins); +} +/// +template ::value>* = nullptr> +void testBasicMultiReducerConstructor(size_t num_bins) +{ + testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins); +} + +TYPED_TEST_P(MultiReducerBasicConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(0); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(1); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(2); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(10); +} + + +template +void testMultiReducerSingleInitConstructorRegular(size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(num_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(num_bins, initVal); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + 
ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), initVal); + ASSERT_EQ(multi_reduce_min.get(bin), initVal); + ASSERT_EQ(multi_reduce_max.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), initVal); + } +} + +template +void testMultiReducerSingleInitConstructorBitwise(size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceBitOr multi_reduce_or(num_bins, initVal); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins, initVal); + + ASSERT_EQ(multi_reduce_or.size(), num_bins); + ASSERT_EQ(multi_reduce_and.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_or.get(bin), initVal); + ASSERT_EQ(multi_reduce_and.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), initVal); + } +} + +template ::value>* = nullptr > +void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal); + testMultiReducerSingleInitConstructorBitwise< MultiReducePolicy, NumericType >(num_bins, initVal); +} +/// +template ::value>* = nullptr > +void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal); +} + +TYPED_TEST_P(MultiReducerSingleInitConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(0, NumericType(2)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(1, NumericType(4)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(2, NumericType(0)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(10, NumericType(9)); +} + + +template +void testMultiReducerContainerInitConstructorRegular(Container const& container) +{ + RAJA::MultiReduceSum multi_reduce_sum(container); + RAJA::MultiReduceMin multi_reduce_min(container); + RAJA::MultiReduceMax multi_reduce_max(container); + + ASSERT_EQ(multi_reduce_sum.size(), container.size()); + ASSERT_EQ(multi_reduce_min.size(), container.size()); + ASSERT_EQ(multi_reduce_max.size(), container.size()); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_sum.get(bin), val); + ASSERT_EQ(multi_reduce_min.get(bin), val); + ASSERT_EQ(multi_reduce_max.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), val); + ++bin; + } +} + +template +void testMultiReducerContainerInitConstructorBitwise(Container const& container) +{ + RAJA::MultiReduceBitAnd multi_reduce_and(container); + RAJA::MultiReduceBitOr multi_reduce_or(container); + + ASSERT_EQ(multi_reduce_and.size(), container.size()); + ASSERT_EQ(multi_reduce_or.size(), container.size()); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_and.get(bin), val); + ASSERT_EQ(multi_reduce_or.get(bin), val); + + 
ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), val); + ++bin; + } +} + +template ::value>* = nullptr> +void testMultiReducerContainerInitConstructor(Container const& container) +{ + testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container); + testMultiReducerContainerInitConstructorBitwise< MultiReducePolicy, NumericType >(container); +} +/// +template ::value>* = nullptr> +void testMultiReducerContainerInitConstructor(Container const& container) +{ + testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container); +} + +TYPED_TEST_P(MultiReducerContainerInitConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + std::vector c0(0); + std::vector c1(1, 3); + std::set c2; + c2.emplace(5); + c2.emplace(8); + std::list c10; + for (size_t bin = 0; bin < size_t(10); ++bin) { + c10.emplace_front(NumericType(bin)); + } + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c0); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c1); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c2); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c10); +} + + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicConstructorUnitTest, + MultiReducerConstructor); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleInitConstructorUnitTest, + MultiReducerConstructor); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerContainerInitConstructorUnitTest, + MultiReducerConstructor); + +#endif //__TEST_MULTI_REDUCER_CONSTRUCTOR__ diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp new file mode 100644 index 0000000000..0eb1eb6eb6 --- /dev/null +++ b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp @@ -0,0 +1,431 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing tests for RAJA multi reducer reset. 
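A small sketch of the constructor and reset overloads these unit tests cover, shown with the sequential policy; the reset_example function and init_vals values are arbitrary placeholders, while the overload behaviors restate what the assertions above and below verify.

#include <vector>
#include "RAJA/RAJA.hpp"

void reset_example()
{
  std::vector<int> init_vals{5, 8, 13};

  // Container constructor: one bin per element, each initialized to that element.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, int> sums(init_vals);

  // reset(num_bins, value): resize to num_bins and set every bin to value.
  sums.reset(4, 0);

  // reset(container): resize to container.size() and copy its values in.
  sums.reset(init_vals);

  // reset(): keep the current number of bins, restore the operation identity.
  sums.reset();
}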
+/// + +#ifndef __TEST_MULTI_REDUCER_RESET__ +#define __TEST_MULTI_REDUCER_RESET__ + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "../test-multi-reducer.hpp" + +#include +#include +#include + +template +class MultiReducerBasicResetUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerSingleResetUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerContainerResetUnitTest : public ::testing::Test +{ +}; + +TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest); +TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest); +TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest); + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerBasicResetRegular(bool use_reducer, size_t num_bins) +{ + NumericType initVal = NumericType(5); + + RAJA::MultiReduceSum multi_reduce_sum(num_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(num_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(num_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < num_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(); + multi_reduce_min.reset(); + multi_reduce_max.reset(); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum)); + ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min)); + ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max)); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum)); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min)); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max)); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerBasicResetBitwise(bool use_reducer, size_t num_bins) +{ + NumericType initVal = NumericType(5); + + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(num_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < num_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(); + multi_reduce_or.reset(); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and)); + ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or)); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and)); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or)); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerBasicReset(size_t num_bins) +{ + testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + // avoid using the reducer as forone does not handle reducers correctly + 
// forone does not make_lambda_body or privatize the body + // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); + // testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerBasicReset(size_t num_bins) +{ + testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); +} + +TYPED_TEST_P(MultiReducerBasicResetUnitTest, MultiReducerReset) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(0); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(1); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(2); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(10); +} + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleResetRegular(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceSum multi_reduce_sum(init_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(init_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(num_bins, initVal); + multi_reduce_min.reset(num_bins, initVal); + multi_reduce_max.reset(num_bins, initVal); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), initVal); + ASSERT_EQ(multi_reduce_min.get(bin), initVal); + ASSERT_EQ(multi_reduce_max.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), initVal); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleResetBitwise(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceBitAnd multi_reduce_and(init_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(num_bins, initVal); + multi_reduce_or.reset(num_bins, initVal); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_and.get(bin), initVal); + ASSERT_EQ(multi_reduce_or.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), 
initVal); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), initVal); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); + // testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleReset(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, num_bins, initVal); + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, num_bins, initVal); + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(num_bins, num_bins, initVal); +} + +TYPED_TEST_P(MultiReducerSingleResetUnitTest, MultiReducerReset) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(0, NumericType(3)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(1, NumericType(5)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(2, NumericType(0)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(10, NumericType(8)); +} + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerResetRegular(bool use_reducer, size_t init_bins, Container const& container) +{ + const size_t num_bins = container.size(); + NumericType initVal = NumericType(5); + + RAJA::MultiReduceSum multi_reduce_sum(init_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(init_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(container); + multi_reduce_min.reset(container); + multi_reduce_max.reset(container); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), 
num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_sum.get(bin), val); + ASSERT_EQ(multi_reduce_min.get(bin), val); + ASSERT_EQ(multi_reduce_max.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), val); + ++bin; + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerResetBitwise(bool use_reducer, size_t init_bins, Container const& container) +{ + const size_t num_bins = container.size(); + NumericType initVal = NumericType(5); + + RAJA::MultiReduceBitAnd multi_reduce_and(init_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(container); + multi_reduce_or.reset(container); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_and.get(bin), val); + ASSERT_EQ(multi_reduce_or.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), val); + ++bin; + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container, + std::enable_if_t::value>* = nullptr > +void testMultiReducerContainerResetSize(size_t init_bins, Container const& container) +{ + testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); + // testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container, + std::enable_if_t::value>* = nullptr > +void testMultiReducerContainerResetSize(size_t init_bins, Container const& container) +{ + testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerReset(Container const& container) +{ + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, container); + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, container); + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(container.size(), container); +} + +TYPED_TEST_P(MultiReducerContainerResetUnitTest, MultiReducerReset) +{ + 
using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + std::vector c0(0); + std::vector c1(1, 3); + std::set c2; + c2.emplace(5); + c2.emplace(8); + std::list c10; + for (size_t bin = 0; bin < size_t(10); ++bin) { + c10.emplace_front(NumericType(bin)); + } + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c0); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c1); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c2); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c10); +} + + + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest, + MultiReducerReset); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest, + MultiReducerReset); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest, + MultiReducerReset); + +#endif //__TEST_MULTI_REDUCER_RESET__ diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index 869b897714..175d2c07bb 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -25,4 +25,8 @@ raja_add_test( NAME test-fraction SOURCES test-fraction.cpp) +raja_add_test( + NAME test-math + SOURCES test-math.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-math.cpp b/test/unit/util/test-math.cpp new file mode 100644 index 0000000000..39572ad3a0 --- /dev/null +++ b/test/unit/util/test-math.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for Fraction +/// + +#include +#include "RAJA_gtest.hpp" +#include + +template < typename T > +void test_log2() +{ + ASSERT_EQ(RAJA::log2(T(257)), T(8)); + ASSERT_EQ(RAJA::log2(T(256)), T(8)); + ASSERT_EQ(RAJA::log2(T(255)), T(7)); + ASSERT_EQ(RAJA::log2(T(4)), T(2)); + ASSERT_EQ(RAJA::log2(T(3)), T(1)); + ASSERT_EQ(RAJA::log2(T(2)), T(1)); + ASSERT_EQ(RAJA::log2(T(1)), T(0)); + ASSERT_EQ(RAJA::log2(T(0)), T(0)); + if (std::is_signed::value) { + ASSERT_EQ(RAJA::log2(T(-1)), T(0)); + ASSERT_EQ(RAJA::log2(T(-100)), T(0)); + } +} + +TEST(math, log2) +{ + test_log2(); + test_log2(); +} + + +template < typename T > +void test_next_pow2() +{ + ASSERT_EQ(RAJA::next_pow2(T(257)), T(512)); + ASSERT_EQ(RAJA::next_pow2(T(256)), T(256)); + ASSERT_EQ(RAJA::next_pow2(T(255)), T(256)); + ASSERT_EQ(RAJA::next_pow2(T(4)), T(4)); + ASSERT_EQ(RAJA::next_pow2(T(3)), T(4)); + ASSERT_EQ(RAJA::next_pow2(T(2)), T(2)); + ASSERT_EQ(RAJA::next_pow2(T(1)), T(1)); + ASSERT_EQ(RAJA::next_pow2(T(0)), T(0)); + if (std::is_signed::value) { + ASSERT_EQ(RAJA::next_pow2(T(-1)), T(0)); + ASSERT_EQ(RAJA::next_pow2(T(-100)), T(0)); + } +} + +TEST(math, next_pow2) +{ + test_next_pow2(); + test_next_pow2(); +} + + +template < typename T > +void test_prev_pow2() +{ + ASSERT_EQ(RAJA::prev_pow2(T(257)), T(256)); + ASSERT_EQ(RAJA::prev_pow2(T(256)), T(256)); + ASSERT_EQ(RAJA::prev_pow2(T(255)), T(128)); + ASSERT_EQ(RAJA::prev_pow2(T(4)), T(4)); + ASSERT_EQ(RAJA::prev_pow2(T(3)), T(2)); + ASSERT_EQ(RAJA::prev_pow2(T(2)), T(2)); + ASSERT_EQ(RAJA::prev_pow2(T(1)), T(1)); + ASSERT_EQ(RAJA::prev_pow2(T(0)), T(0)); + if (std::is_signed::value) { + 
ASSERT_EQ(RAJA::prev_pow2(T(-1)), T(0)); + ASSERT_EQ(RAJA::prev_pow2(T(-100)), T(0)); + } +} + +TEST(math, prev_pow2) +{ + test_prev_pow2(); + test_prev_pow2(); +} + + +template < typename T > +void test_power_of_2_mod() +{ + ASSERT_EQ(RAJA::power_of_2_mod(T(257), T(256)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(256), T(256)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(255), T(256)), T(255)); + ASSERT_EQ(RAJA::power_of_2_mod(T(128), T(256)), T(128)); + ASSERT_EQ(RAJA::power_of_2_mod(T(256), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(95), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(94), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(93), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(92), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(7), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(6), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(5), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(4), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(3), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(2), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(0), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(3), T(2)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(2), T(2)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(2)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(0), T(2)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(1)), T(0)); +} + +TEST(math, power_of_2_mod) +{ + test_power_of_2_mod(); + test_power_of_2_mod(); +} diff --git a/test/unit/workgroup/CMakeLists.txt b/test/unit/workgroup/CMakeLists.txt index dce610d954..0815ffda5e 100644 --- a/test/unit/workgroup/CMakeLists.txt +++ b/test/unit/workgroup/CMakeLists.txt @@ -62,6 +62,11 @@ if(RAJA_TEST_EXHAUSTIVE OR NOT RAJA_COMPILER MATCHES "RAJA_COMPILER_Intel") set(Constructor_SUBTESTS Single) buildunitworkgrouptest(Constructor "${Constructor_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") + if(RAJA_ENABLE_TARGET_OPENMP) + # WorkGroup dispatcher for OpenMPTarget not implemented yet + list(REMOVE_ITEM BACKENDS OpenMPTarget) + endif() + set(Enqueue_SUBTESTS Single Multiple) buildunitworkgrouptest(Enqueue "${Enqueue_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") @@ -70,10 +75,12 @@ if(RAJA_TEST_EXHAUSTIVE OR NOT RAJA_COMPILER MATCHES "RAJA_COMPILER_Intel") endif() set(Dispatcher_SUBTESTS Single) + if(RAJA_ENABLE_TARGET_OPENMP) # WorkGroup dispatcher for OpenMPTarget not implemented yet list(REMOVE_ITEM BACKENDS OpenMPTarget) endif() + buildunitworkgrouptest(Dispatcher "${Dispatcher_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") set(WorkStorage_SUBTESTS Constructor Iterator InsertCall Multiple) diff --git a/tpl/camp b/tpl/camp index 79c320fa09..d580fd8feb 160000 --- a/tpl/camp +++ b/tpl/camp @@ -1 +1 @@ -Subproject commit 79c320fa09db987923b56884afdc9f82f4b70fc4 +Subproject commit d580fd8feb10ddb7a63a784b4afcd857ac686e39
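As a closing illustration of the behavior the new math tests pin down, a host-only sketch of the intended relationships; the patch does not show which header provides these utilities, so including RAJA/RAJA.hpp here is an assumption made for brevity.

#include <cassert>
#include "RAJA/RAJA.hpp"  // assumed entry point for RAJA's math utilities

int main()
{
  // log2 is the floor of the base-2 logarithm; non-positive inputs map to 0.
  assert(RAJA::log2(255) == 7);

  // next_pow2 rounds up to a power of 2, prev_pow2 rounds down.
  assert(RAJA::next_pow2(255) == 256);
  assert(RAJA::prev_pow2(255) == 128);

  // power_of_2_mod(x, p) matches x % p when p is a power of 2 (i.e. x & (p - 1)).
  assert(RAJA::power_of_2_mod(95, 4) == 95 % 4);
  return 0;
}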