diff --git a/.gitignore b/.gitignore index 10b3b40f79..269a0763e6 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ /install_*/ /install-*/ /Debug/ +*.swp +*.orig diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fb6bc7055c..b263b8aa25 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2024.04.0' + ref: 'v2024.06.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -91,20 +91,20 @@ stages: trigger-rajaperf: stage: multi-project rules: - - if: '$CI_COMMIT_BRANCH == "${MP_BRANCH}" || $MULTI_PROJECT == "ON"' #run only if ... + - if: $CI_COMMIT_BRANCH == $MP_BRANCH || $MULTI_PROJECT == "ON" #run only if ... variables: UPDATE_RAJA: ${MP_BRANCH} trigger: project: radiuss/rajaperf branch: develop - strategy: depend include: + # Sets ID tokens for every job using `default:` - project: 'lc-templates/id_tokens' file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2024.04.0' + ref: 'v2024.06.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 62d7908945..b04cf0de1d 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -21,7 +21,7 @@ variables: # Project specific variants for ruby PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby - PROJECT_RUBY_DEPS: "" + PROJECT_RUBY_DEPS: "^blt@develop " # Poodle # Arguments for top level allocation @@ -31,7 +31,7 @@ variables: # Project specific variants for poodle PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle - PROJECT_POODLE_DEPS: "" + PROJECT_POODLE_DEPS: "^blt@develop " # Corona # Arguments for top level allocation @@ -70,3 +70,15 @@ variables: artifacts: reports: junit: junit.xml + +.reproducer_vars: + script: + - | + echo -e " + # Required variables \n + export MODULE_LIST=\"${MODULE_LIST}\" \n + export SPEC=\"${SPEC//\"/\\\"}\" \n + # Allow to set job script for debugging (only this differs from CI) \n + export DEBUG_MODE=true \n + # Using the CI build cache is optional and requires a token. Set it like so: \n + # export REGISTRY_TOKEN=\"\" \n" diff --git a/.gitlab/jobs/corona.yml b/.gitlab/jobs/corona.yml index 9213d6e932..abbafe5bb9 100644 --- a/.gitlab/jobs/corona.yml +++ b/.gitlab/jobs/corona.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. 
.corona_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -33,3 +31,9 @@ rocmcc_5_7_0_hip_desul_atomics: SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@=5.7.0 ^hip@5.7.0 ^blt@develop" extends: .job_on_corona +clang_19_0_0_sycl_gcc_10_3_1_rocmcc_5_7_1_hip: + variables: + SPEC: " ~shared +sycl ~openmp +tests %clang@=19.0.0 cxxflags==\"-w -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx906\" ^blt@develop" + MODULE_LIST: "rocm/5.7.1" + extends: .job_on_corona + diff --git a/.gitlab/jobs/lassen.yml b/.gitlab/jobs/lassen.yml index fbd3d93db0..dc21689ce3 100644 --- a/.gitlab/jobs/lassen.yml +++ b/.gitlab/jobs/lassen.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. .lassen_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -68,3 +66,13 @@ gcc_8_3_1_cuda_10_1_243_desul_atomics: variables: SPEC: " ~shared +openmp +tests +cuda +desul %gcc@=8.3.1 cuda_arch=70 ^cuda@10.1.243+allow-unsupported-compilers ^blt@develop" extends: .job_on_lassen + +# Warning: Allowed to fail temporarily +# Deactivated due to issues with OpenMP Target and various tests and compilers. +clang_16_0_6_ibm_omptarget: + variables: + SPEC: " ~shared +openmp +omptarget +tests %clang@=16.0.6.ibm.gcc.8.3.1 ^blt@develop" + ON_LASSEN: "OFF" + extends: .job_on_lassen + allow_failure: true + diff --git a/.gitlab/jobs/poodle.yml b/.gitlab/jobs/poodle.yml index cc1f956cb9..54870e37aa 100644 --- a/.gitlab/jobs/poodle.yml +++ b/.gitlab/jobs/poodle.yml @@ -5,38 +5,42 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################## -# Override reproducer section to define projet specific variables. +# Override reproducer section to define project specific variables. .poodle_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for this # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} -# So that the comparison with the original job is easier. +# when possible so that the comparison with the original job is easier. 
+# Identical to shared job, but use OpenMP tasks and no vectorization clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Identical to shared job, but use OpenMP tasks and no vectorization gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Identical to shared job, but use OpenMP tasks and no vectorization +# Deactivated (too long on poodle) intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ^blt@develop" + ON_POODLE: "OFF" + SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ${PROJECT_POODLE_DEPS}" extends: .job_on_poodle +# Allowed to fail intel_2022_1_0: variables: - SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS} ^blt@develop" + SPEC: "${PROJECT_POODLE_VARIANTS} %intel@=2022.1.0 ${PROJECT_POODLE_DEPS}" allow_failure: true extends: .job_on_poodle diff --git a/.gitlab/jobs/ruby.yml b/.gitlab/jobs/ruby.yml index a924ddd47c..2242494b9c 100644 --- a/.gitlab/jobs/ruby.yml +++ b/.gitlab/jobs/ruby.yml @@ -8,35 +8,37 @@ # Override reproducer section to define project specific variables. .ruby_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs ######################## # We duplicate the shared jobs description and add necessary changes for this # project. We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} -# So that the comparison with the original job is easier. +# when possible so that the comparison with the original job is easier. +# Identical to shared job, but use OpenMP tasks and no vectorization clang_14_0_6: variables: - SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %clang@=14.0.6 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Identical to shared job, but use OpenMP tasks and no vectorization gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %gcc@=10.3.1 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Identical to shared job, but use OpenMP tasks and no vectorization intel_19_1_2_gcc_10_3_1: variables: - SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ^blt@develop" + SPEC: " ~shared +openmp +omptask +tests %intel@=19.1.2.gcc.10.3.1 ${PROJECT_RUBY_DEPS}" extends: .job_on_ruby +# Allowed to fail intel_2022_1_0: variables: - SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS} ^blt@develop" + SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}" allow_failure: true extends: .job_on_ruby diff --git a/.gitlab/jobs/tioga.yml b/.gitlab/jobs/tioga.yml index ef89808932..50b60bc13d 100644 --- a/.gitlab/jobs/tioga.yml +++ b/.gitlab/jobs/tioga.yml @@ -8,9 +8,7 @@ # Override reproducer section to define project specific variables. .tioga_reproducer_vars: script: - - | - echo -e "export MODULE_LIST=\"${MODULE_LIST}\"" - echo -e "export SPEC=\"${SPEC//\"/\\\"}\"" + - !reference [.reproducer_vars, script] ######################## # Overridden shared jobs @@ -28,12 +26,12 @@ # ${PROJECT__DEPS} in the extra jobs. 
There is no reason not to fully # describe the spec here. -rocmcc_5_7_1_hip_desul_atomics: +rocmcc_6_1_1_hip_desul_atomics: variables: - SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm ~openmp +desul +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop" extends: .job_on_tioga -rocmcc_5_7_1_hip_openmp: +rocmcc_6_1_1_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=5.7.1 ^hip@5.7.1 ^blt@develop" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1 ^blt@develop" extends: .job_on_tioga diff --git a/.uberenv_config.json b/.uberenv_config.json index 2261a80aea..d89b97cb29 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -4,7 +4,7 @@ "package_final_phase" : "initconfig", "package_source_dir" : "../..", "spack_url": "https://github.com/spack/spack.git", -"spack_branch": "develop-2024-02-18", +"spack_branch": "develop-2024-07-07", "spack_activate" : {}, "spack_configs_path": "scripts/radiuss-spack-configs", "spack_packages_path": "scripts/radiuss-spack-configs/packages", diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e5ecec0b7..1659021970 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,8 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2024) -set(RAJA_VERSION_MINOR 02) -set(RAJA_VERSION_PATCHLEVEL 2) +set(RAJA_VERSION_MINOR 07) +set(RAJA_VERSION_PATCHLEVEL 0) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") @@ -44,11 +44,7 @@ set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PA include(cmake/SetupRajaOptions.cmake) -if (ENABLE_HIP) - cmake_minimum_required(VERSION 3.23) -else() - cmake_minimum_required(VERSION 3.20) -endif() +cmake_minimum_required(VERSION 3.23) # Detect C++ standard and add appropriate flag _before_ loading BLT set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c2df2a03ea..e86890d13d 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,57 @@ Notable changes include: * Bug fixes/improvements: +Version 2024.07.0 -- Release date 2024-07-24 +============================================ + +This release contains new features, improvements, and bugfixes. + +Notable changes include: + + * New features / API changes: + * Added support for a "multi-reduction" operation which allows users to + perform a run time-defined number of reduction operations in a kernel. + Please see the RAJA User Guide for details and examples. + * Added first couple of sections for a "RAJA Cookbook" in the RAJA User + Guide. The goal is to provide users with more detailed guidance about + using RAJA features, choosing execution policies, etc. Additional + content will be provided in future releases. + * Added atomicLoad and atomicStore routines for correctness in some + use cases. + * Added OpenMP 5.1 implementations for atomicMin and atomicMax. + * Add SYCL reduction support in RAJA::launch + + * Build changes/improvements: + * Update camp submodule to v2024.07.0 release. This will be a version + constraint for this release in RAJA Spack package. + * Minimum required CMake version bumped to 3.23. 
+ + * Bug fixes/improvements: + * Fix CMake issue for case when RAJA is used as a submodule dependency. + * Various fixes and improvements to builtin atomic support. + * Fixes and improvements to other atomic operations: + * Modified HIP and CUDA generic atomic compare and swap algorithms + to use atomic loads instead of relying on volatile. + * Re-implemented atomic loads in terms of builtin atomics for CUDA + and HIP so that the generic compare and swap functions can use it. + * Removes volatile qualifier in atomic function signatures. + * Use cuda::atomic_ref in newer versions of CUDA to back + atomicLoad/atomicStore. + * Use atomicAdd as a fallback for atomicSub in CUDA. + * Removed checks where __CUDA_ARCH__ is less than 350 since RAJA + requires that as the minimum supported architecture (CMake check). + * Fixed issues with naming RAJA forall::kernels when using CUDA. + * Fixes in SYCL back-end for RAJA::launch. + * Fixed some issues in examples. + * Bugfixes and cleanup in parts of the SYCL back-end needed to + support a bunch of new SYCL kernels that will appear in + RAJA Performance Suite release. + * Fix type naming issue that was exposed with a new version of the + Intel oneAPI compiler. + * Fix issue in User Guide documentation for configuring a project + using RAJA CMake configuration. + + Version 2024.02.2 -- Release date 2024-05-08 ============================================ diff --git a/docs/conf.py b/docs/conf.py index 3212170b30..5f76d77b76 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,9 +86,9 @@ # built documents. # # The short X.Y version. -version = u'2024.02' +version = u'2024.07' # The full version, including alpha/beta/rc tags. -release = u'2024.02.2' +release = u'2024.07.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/dev_guide/ci_tasks.rst b/docs/sphinx/dev_guide/ci_tasks.rst index 70c65e8903..5c63ecc9ad 100644 --- a/docs/sphinx/dev_guide/ci_tasks.rst +++ b/docs/sphinx/dev_guide/ci_tasks.rst @@ -157,6 +157,114 @@ annotate the job for this. For example: describe the change in the ``RAJA/.gitlab/jobs/.yml`` file where the job is overridden. + +Building the Intel clang + SYCL HIP compiler for use in CI +---------------------------------------------------------- + +The SYCL CI tests on corona rely on a custom Intel Clang SYCL compiler that we +build ourselves. This compiler lives in the ``/usr/workspace/raja-dev/`` folder so +that it can be accessed by the gitlab CI system. Since the intel compiler does +not do releases in the typical sense (they simply update their repo *every night*), +it may become necessary to periodically build a new version of the compiler to +ensure that we are using the most up-to-date version available. The steps for +building, installing, and running are shown here. + +Building the Compiler +^^^^^^^^^^^^^^^^^^^^^ + +.. important:: Because intel updates their compiler repo daily, there is a nonzero possibility that the head of the sycl branch will fail to build. + In the event that it does not build, try checking out a different commit. On the intel/llvm GitHub page, one can see which of their + commits builds by checking the status badge next to each commit. Look for a commit that passes. + + +#. Load the version of GCC that you want to use. In this case, we are using LC's gcc/10.3.1-magic installation:: + + module load gcc/10.3.1-magic + +#. Load the version of rocm that you want to use. 
In this case, we are using 5.7.1:: + + module load rocm/5.7.1 + +#. Clone the "sycl" branch of intel's llvm compiler fork:: + + git clone https://github.com/intel/llvm -b sycl + +#. cd into that folder:: + + cd llvm + + In the event that the head of the sycl branch does not build, run ``git checkout `` to checkout a version that does build. + +#. Build the compiler. + + Note that in this example, we are using rocm5.7.1, but one can change the version they wish to use simply by changing the paths in the configure step + + a. Configure + + .. code-block:: bash + + srun -n1 /usr/bin/python3 buildbot/configure.py --hip -o buildrocm5.7.1 \ + --cmake-gen "Unix Makefiles" \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_ROCM_LIB_DIR=/opt/rocm-5.7.1/lib \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ + --cmake-opt=-DSYCL_BUILD_PI_HIP_LIB_DIR=/opt/rocm-5.7.1/lib \ + --cmake-opt=-DUR_HIP_ROCM_DIR=/opt/rocm-5.7.1 \ + --cmake-opt=-DUR_HIP_INCLUDE_DIR=/opt/rocm-5.7.1/include \ + --cmake-opt=-DUR_HIP_HSA_INCLUDE_DIR=/opt/rocm-5.7.1/hsa/include/hsa \ + --cmake-opt=-DUR_HIP_LIB_DIR=/opt/rocm-5.7.1/lib + + b. Build + + .. code-block:: bash + + srun -n1 /usr/bin/python3 buildbot/compile.py -o buildrocm5.7.1 + +#. Test the compiler + + Follow the steps in the `Using the compiler`_ section to test this installation + +#. Install + + a. The build step will install the compiler to the folder ``buildrocm/install``. Simply copy this folder to the ``/usr/workspace/raja-dev/`` directory using the naming scheme ``clang_sycl__hip_gcc_rocm`` + + #. Set the permissions of the folder, and everything in it to 750:: + + chmod 750 /usr/workspace/raja-dev// -R + + #. Change the group of the folder and everything in it to raja-dev:: + + chgrp raja-dev /usr/workspace/raja-dev// -R + + +Using the compiler +^^^^^^^^^^^^^^^^^^ + +#. Load the version of rocm that you used when building the compiler:: + + module load rocm/5.7.1 + +#. Navigate to the root of your local checkout space of the RAJA repo:: + + cd /path/to/raja + +#. Run the test config script:: + + ./scripts/lc-builds/corona_sycl.sh /usr/workspace/raja-dev/clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1 + + Note that at the time of writing, the newest compiler we had built was at ``clang_sycl_2f03ef85fee5_hip_gcc10.3.1_rocm5.7.1`` + +#. cd into the auto generated build directory:: + + cd {build directory} + +#. Run the tests:: + + make -j + + ============== Azure CI Tasks ============== diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst index 91494f3674..349fdd5b3f 100644 --- a/docs/sphinx/user_guide/cook_book.rst +++ b/docs/sphinx/user_guide/cook_book.rst @@ -20,4 +20,5 @@ to provide users with complete beyond usage examples beyond what can be found in :maxdepth: 2 cook_book/reduction + cook_book/multi-reduction diff --git a/docs/sphinx/user_guide/cook_book/multi-reduction.rst b/docs/sphinx/user_guide/cook_book/multi-reduction.rst new file mode 100644 index 0000000000..2ad4d60aa2 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book/multi-reduction.rst @@ -0,0 +1,160 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_cook-book-multi-reductions-label: + +============================ +Cooking with MultiReductions +============================ + +Please see the following section for overview discussion about RAJA multi-reductions: + + * :ref:`feat-multi-reductions-label`. + + +--------------------------------- +MultiReductions with RAJA::forall +--------------------------------- + +Here is the setup for a simple multi-reduction example:: + + const int N = 1000; + const int num_bins = 10; + + int vec[N]; + int bins[N]; + + for (int i = 0; i < N; ++i) { + + vec[i] = 1; + bins[i] = i % num_bins; + + } + +Here is a simple sum multi-reduction performed in a C-style for-loop:: + + int vsum[num_bins] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // Run a kernel using the multi-reduction objects + for (int i = 0; i < N; ++i) { + + vsum[bins[i]] += vec[i]; + + } + +The results of these operations will yield the following values: + + * ``vsum[0] == 100`` + * ``vsum[1] == 100`` + * ``vsum[2] == 100`` + * ``vsum[3] == 100`` + * ``vsum[4] == 100`` + * ``vsum[5] == 100`` + * ``vsum[6] == 100`` + * ``vsum[7] == 100`` + * ``vsum[8] == 100`` + * ``vsum[9] == 100`` + +RAJA uses policy types to specify how things are implemented. + +The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. +For example ``RAJA::seq_exec`` runs a C-style for-loop sequentially on a CPU. The +``RAJA::cuda_exec_with_reduce<256>`` runs the operation as a CUDA GPU kernel with +256 threads per block and other CUDA kernel launch parameters, like the +number of blocks, optimized for performance with multi_reducers:: + + using exec_policy = RAJA::seq_exec; + // using exec_policy = RAJA::omp_parallel_for_exec; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; + +The multi-reduction policy specifies how the multi-reduction is done and must be compatible with the +execution policy. For example, ``RAJA::seq_multi_reduce`` does a sequential multi-reduction +and can only be used with sequential execution policies. The +``RAJA::cuda_multi_reduce_atomic`` policy uses atomics and can only be used with +cuda execution policies. Similarly for other RAJA execution back-ends, such as +HIP and OpenMP. 
Here are example RAJA multi-reduction policies whose names are +indicative of which execution policies they work with:: + + using multi_reduce_policy = RAJA::seq_multi_reduce; + // using multi_reduce_policy = RAJA::omp_multi_reduce; + // using multi_reduce_policy = RAJA::cuda_multi_reduce_atomic; + // using multi_reduce_policy = RAJA::hip_multi_reduce_atomic; + +Here a simple sum multi-reduction is performed using RAJA:: + + RAJA::MultiReduceSum<multi_reduce_policy, int> vsum(num_bins, 0); + + RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + +The results of these operations will yield the following values: + + * ``vsum[0].get() == 100`` + * ``vsum[1].get() == 100`` + * ``vsum[2].get() == 100`` + * ``vsum[3].get() == 100`` + * ``vsum[4].get() == 100`` + * ``vsum[5].get() == 100`` + * ``vsum[6].get() == 100`` + * ``vsum[7].get() == 100`` + * ``vsum[8].get() == 100`` + * ``vsum[9].get() == 100`` + +Another option for the execution policy when using the CUDA or HIP backends is to use +the base policies, which have a boolean parameter to choose between the general +use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy:: + + // static constexpr bool with_reduce = ...; + // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>; + // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>; + + +--------------------------- +Rarely Used MultiReductions +--------------------------- + +Multi-reductions consume resources even if they are not used in a +loop kernel. If a multi-reducer is conditionally used to set an error flag, for example, even +if the multi-reduction is not used at runtime in the loop kernel, then the setup +and finalization for the multi-reduction is still done and any resources are +still allocated and deallocated. To minimize these overheads, some backends have +special policies that minimize the amount of work the multi-reducer does in the +case that it is not used at runtime even if it is compiled into a loop kernel. +Here are example RAJA multi-reduction policies that have minimal overhead:: + + using rarely_used_multi_reduce_policy = RAJA::seq_multi_reduce; + // using rarely_used_multi_reduce_policy = RAJA::omp_multi_reduce; + // using rarely_used_multi_reduce_policy = RAJA::cuda_multi_reduce_atomic_low_performance_low_overhead; + // using rarely_used_multi_reduce_policy = RAJA::hip_multi_reduce_atomic_low_performance_low_overhead; + +Here is a simple rarely used bitwise-or multi-reduction performed using RAJA:: + + RAJA::MultiReduceBitOr<rarely_used_multi_reduce_policy, int> vor(num_bins, 0); + + RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + if (vec[i] < 0) { + vor[0] |= 1; + } + + }); + +The results of these operations will yield the following value if the condition +is never met: + + * ``vor[0].get() == 0`` + +or yield the following value if the condition is ever met: + + * ``vor[0].get() == 1`` diff --git a/docs/sphinx/user_guide/feature/multi-reduction.rst b/docs/sphinx/user_guide/feature/multi-reduction.rst new file mode 100644 index 0000000000..c41cc37225 --- /dev/null +++ b/docs/sphinx/user_guide/feature/multi-reduction.rst @@ -0,0 +1,227 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_feat-multi-reductions-label: + +========================= +MultiReduction Operations +========================= + +RAJA provides multi-reduction types that allow users to perform a runtime number +of reduction operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, +and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may +use as many multi-reduction objects in a loop kernel as they need. If a small +fixed number of reductions is required in a loop kernel then standard RAJA reduction objects can be +used. Available RAJA multi-reduction types are described in this section. + +.. note:: All RAJA multi-reduction types are located in the namespace ``RAJA``. + +Also + +.. note:: * Each RAJA multi-reduction type is templated on a **multi-reduction policy** + and a **reduction value type** for the multi-reduction variable. The + **multi-reduction policy type must be compatible with the execution + policy used by the kernel in which it is used.** For example, in + a CUDA kernel, a CUDA multi-reduction policy must be used. + * Each RAJA multi-reduction type accepts an **initial reduction value or + values** at construction (see below). + * Each RAJA multi-reduction type has a 'get' method to access reduced + values after kernel execution completes. + +Please see the following sections for a description of reducers: + + * :ref:`feat-reductions-label`. + +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-multi-reductions-label`. + + +-------------------- +MultiReduction Types +-------------------- + +RAJA supports three common multi-reduction types: + +* ``MultiReduceSum< multi_reduce_policy, data_type >`` - Sum of values. + +* ``MultiReduceMin< multi_reduce_policy, data_type >`` - Min value. + +* ``MultiReduceMax< multi_reduce_policy, data_type >`` - Max value. + +and two less common bitwise multi-reduction types: + +* ``MultiReduceBitAnd< multi_reduce_policy, data_type >`` - Bitwise 'and' of values (i.e., ``a & b``). + +* ``MultiReduceBitOr< multi_reduce_policy, data_type >`` - Bitwise 'or' of values (i.e., ``a | b``). + +.. note:: ``RAJA::MultiReduceBitAnd`` and ``RAJA::MultiReduceBitOr`` reduction types are designed to work on integral data types because **in C++, at the language level, there is no such thing as a bitwise operator on floating-point numbers.** + +----------------------- +MultiReduction Examples +----------------------- + +Next, we provide a few examples to illustrate basic usage of RAJA multi-reduction +types. 
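First, here is a minimal sketch of the min and max multi-reduction types, assuming an OpenMP build and the same kind of ``vec`` and ``bins`` arrays used in the sum example that follows (the names ``vmin``, ``vmax``, ``min_out``, and ``max_out`` are illustrative only)::

   const int N = 1000;
   const int num_bins = 10;

   // vec[N] holds the values and bins[N] holds a bin index in [0, num_bins)
   // for each element; see the sum example below for one way to fill them.

   RAJA::MultiReduceMin< RAJA::omp_multi_reduce, int > vmin(num_bins, std::numeric_limits<int>::max());
   RAJA::MultiReduceMax< RAJA::omp_multi_reduce, int > vmax(num_bins, std::numeric_limits<int>::min());

   RAJA::forall< RAJA::omp_parallel_for_exec >( RAJA::RangeSegment(0, N),
     [=](RAJA::Index_type i) {

     // combine each value into the min and max reducers for its bin
     vmin[bins[i]].min(vec[i]);
     vmax[bins[i]].max(vec[i]);

   });

   // After the kernel runs, read each per-bin result
   int min_out[num_bins];
   int max_out[num_bins];
   for (int bin = 0; bin < num_bins; ++bin) {
     min_out[bin] = vmin[bin].get();
     max_out[bin] = vmax[bin].get();
   }

The ``&=`` and ``|=`` operators play the same role for ``MultiReduceBitAnd`` and ``MultiReduceBitOr``.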
+ +Here is a simple RAJA multi-reduction example that shows how to use a sum +multi-reduction type:: + + const int N = 1000; + const int B = 10; + + // + // Initialize an array of length N with all ones, and another array to + // integers between 0 and B-1 + // + int vec[N]; + int bins[N]; + for (int i = 0; i < N; ++i) { + vec[i] = 1; + bins[i] = i % B; + } + + // Create a sum multi-reduction object with a size of B, and initial + // values of zero + RAJA::MultiReduceSum< RAJA::omp_multi_reduce, int > vsum(B, 0); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + + // After kernel is run, extract the reduced values + int my_vsums[B]; + for (int bin = 0; bin < B; ++bin) { + my_vsums[bin] = vsum[bin].get(); + } + +The results of these operations will yield the following values: + + * my_vsums[0] == 100 + * my_vsums[1] == 100 + * my_vsums[2] == 100 + * my_vsums[3] == 100 + * my_vsums[4] == 100 + * my_vsums[5] == 100 + * my_vsums[6] == 100 + * my_vsums[7] == 100 + * my_vsums[8] == 100 + * my_vsums[9] == 100 + + +Here is the same example but using values stored in a container:: + + const int N = 1000; + const int B = 10; + + // + // Initialize an array of length N with all ones, and another array to + // integers between 0 and B-1 + // + int vec[N]; + int bins[N]; + for (int i = 0; i < N; ++i) { + vec[i] = 1; + bins[i] = i % B; + } + + // Create a vector with a size of B, and initial values of zero + std::vector my_vsums(B, 0); + + // Create a multi-reducer initalized with size and values from my_vsums + RAJA::MultiReduceSum< RAJA::omp_multi_reduce, int > vsum(my_vsums); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum[bins[i]] += vec[i]; + + }); + + // After kernel is run, extract the reduced values back into my_vsums + vsum.get_all(my_vsums); + +The results of these operations will yield the following values: + + * my_vsums[0] == 100 + * my_vsums[1] == 100 + * my_vsums[2] == 100 + * my_vsums[3] == 100 + * my_vsums[4] == 100 + * my_vsums[5] == 100 + * my_vsums[6] == 100 + * my_vsums[7] == 100 + * my_vsums[8] == 100 + * my_vsums[9] == 100 + + + + + +Here is an example of a bitwise-or multi-reduction:: + + const int N = 128; + const int B = 8; + + // + // Initialize an array of length N to integers between 0 and B-1 + // + int bins[N]; + for (int i = 0; i < N; ++i) { + bins[i] = i % B; + } + + // Create a bitwise-or multi-reduction object with initial value of '0' + RAJA::MultiReduceBitOr< RAJA::omp_multi_reduce, int > vor(B, 0); + + // Run a kernel using the multi-reduction object + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vor[bins[i]] |= i; + + }); + + // After kernel is run, extract the reduced values + int my_vors[B]; + for (int bin = 0; bin < B; ++bin) { + my_vors[bin] = vor[bin].get(); + } + +The results of these operations will yield the following values: + + * my_vors[0] == 120 == 0b1111000 + * my_vors[1] == 121 == 0b1111001 + * my_vors[2] == 122 == 0b1111010 + * my_vors[3] == 123 == 0b1111011 + * my_vors[4] == 124 == 0b1111100 + * my_vors[5] == 125 == 0b1111101 + * my_vors[6] == 126 == 0b1111110 + * my_vors[7] == 127 == 0b1111111 + +The results of the multi-reduction start at 120 and increase to 127. In binary +representation (i.e., bits), :math:`120 = 0b1111000` and :math:`127 = 0b1111111`. 
+The bins were picked in such a way that all the integers in a bin had the same +remainder modulo 8 so their last 3 binary digits were all the same while their +upper binary digits varied. Because bitwise-or keeps all the set bits, the upper +bits are all set because at least one integer in that bin set them. The last +3 bits were the same in all the integers so the last 3 bits are the same as the +remainder modulo 8 of the bin number. + +----------------------- +MultiReduction Policies +----------------------- + +For more information about available RAJA multi-reduction policies and guidance +on which to use with RAJA execution policies, please see +:ref:`multi-reducepolicy-label`. diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index facde1da5d..e38856a919 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -850,6 +850,73 @@ sycl_reduce any SYCL Reduction in a S guaranteed to generate correct results. So they should not be used for kernels containing reductions. +.. _multi-reducepolicy-label: + +------------------------- +MultiReduction Policies +------------------------- + +Each RAJA multi-reduction object must be defined with a 'multi-reduction policy' +type. Multi-reduction policy types are distinct from loop execution policy types. +It is important to note the following constraints about RAJA multi-reduction usage: + +.. note:: To guarantee correctness, a **multi-reduction policy must be compatible + with the loop execution policy** used. For example, a CUDA + multi-reduction policy must be used when the execution policy is a + CUDA policy, an OpenMP multi-reduction policy must be used when the + execution policy is an OpenMP policy, and so on. + +The following table summarizes RAJA multi-reduction policy types: + +============================================================= ============= ========================================== +MultiReduction Policy Loop Policies Brief description + to Use With +============================================================= ============= ========================================== +seq_multi_reduce seq_exec, Non-parallel (sequential) multi-reduction. +omp_multi_reduce any OpenMP OpenMP parallel multi-reduction. + policy +omp_multi_reduce_ordered any OpenMP OpenMP parallel multi-reduction with result + policy guaranteed to be reproducible. +cuda/hip_multi_reduce_atomic any CUDA/HIP Parallel multi-reduction in a CUDA/HIP kernel. + policy Multi-reduction may use atomic operations + leading to run to run variability in the + results. + (device synchronization will occur when + reduction value is finalized) +cuda/hip_multi_reduce_atomic_low_performance_low_overhead any CUDA/HIP Same as above, but multi-reduction uses + policy a low overhead algorithm with a minimal + set of resources. This minimally effects + the performance of loops containing the + multi-reducer though it may cause the + multi-reducer itself to perform poorly if + it is used. +cuda/hip_multi_reduce_atomic_block_then_atomic_grid_host_init any CUDA/HIP The multi-reduction uses atomics into shared + policy memory and global memory. Atomics into + shared memory are used each time a value + is combined into the multi-reducer and at + the end of the life of the block the shared + values are combined into global memory with + atomics. 
If there is not enough shared memory + available this will fall back to using atomics into + global memory only, which may have a + performance penalty. + The memory for global atomics is + initialized on the host. +cuda/hip_multi_reduce_atomic_global_host_init any CUDA/HIP The multi-reduction uses atomics into global + policy global memory only. Atomics into + global memory are used each time a value + is combined into the multi-reducer. + The memory for global atomics is + initialized on the host. +cuda/hip_multi_reduce_atomic_global_no_replication_host_init any CUDA/HIP Same as above, but uses minimal memory + by not replicating global atomics. + +============================================================= ============= ========================================== + +.. note:: RAJA multi-reductions used with SIMD execution policies are not + guaranteed to generate correct results. So they should not be used + for kernels containing multi-reductions. + .. _atomicpolicy-label: ------------------------- diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 5f2f09afad..6d4c8695d9 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -17,8 +17,9 @@ reduction operations like some other C++ loop programming abstraction models. Instead, RAJA provides reduction types that allow users to perform reduction operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may -use as many reduction objects in a loop kernel as they need. Available RAJA -reduction types are described in this section. +use as many reduction objects in a loop kernel as they need. If a runtime number +of reductions is required in a loop kernel, then multi-reductions can be used. +Available RAJA reduction types are described in this section. .. note:: All RAJA reduction types are located in the namespace ``RAJA``. @@ -39,6 +40,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following sections for a description of multi-reducers: + + * :ref:`feat-multi-reductions-label`. + Please see the following cook book sections for guidance on policy usage: * :ref:`cook-book-reductions-label`. diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst index 860af3eddd..d0ca13a3ab 100644 --- a/docs/sphinx/user_guide/feature/resource.rst +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -95,7 +95,7 @@ Memory Operations ------------------- The example discussed in this section illustrates most of the memory -operations that can be performed with +operations that can be performed with RAJA resource objects. A common use case for a resource is to manage arrays in the appropriate memory space to use in a kernel. Consider the following code example:: diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 4d9e4bf711..afeb50ce9d 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -25,6 +25,7 @@ materials that provide detailed examples of usage. 
feature/iteration_spaces feature/view feature/reduction + feature/multi-reduction feature/atomic feature/scan feature/sort diff --git a/docs/sphinx/user_guide/using_raja.rst b/docs/sphinx/user_guide/using_raja.rst index 6dc8086a9c..e05cec4dfb 100644 --- a/docs/sphinx/user_guide/using_raja.rst +++ b/docs/sphinx/user_guide/using_raja.rst @@ -34,7 +34,7 @@ project:: Then, pass the path of RAJA to CMake when you configure your code:: - cmake -DRAJA_DIR=/share/raja/cmake + cmake -DRAJA_DIR=/lib/cmake/raja/ The ``RAJA-config.cmake`` file provides a ``RAJA`` target, that can be used natively by CMake to add a dependency on RAJA. For example:: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 7fd580972b..4dfd2fbc10 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,6 +19,10 @@ raja_add_executable( NAME forall-param-reductions SOURCES forall-param-reductions.cpp) +raja_add_executable( + NAME forall_multi-reductions + SOURCES forall_multi-reductions.cpp) + raja_add_executable( NAME launch-param-reductions SOURCES launch-param-reductions.cpp) diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index 9fc973e311..feb5247224 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -11,7 +11,6 @@ #include #include "RAJA/RAJA.hpp" -#include "memoryManager.hpp" /* * Matrix Transpose Example @@ -96,7 +95,7 @@ using outer0 = RAJA::LoopPolicy< #endif #if defined(RAJA_ENABLE_SYCL) , - RAJA::sycl_group_0_direct + RAJA::sycl_group_2_direct #endif >; @@ -135,7 +134,7 @@ using inner0 = RAJA::LoopPolicy< #endif #if defined(RAJA_ENABLE_SYCL) , - RAJA::sycl_local_0_direct + RAJA::sycl_local_2_direct #endif >; @@ -154,20 +153,9 @@ using inner1 = RAJA::LoopPolicy; -template -void switch_ptrs(T *A, T *d_A) -{ - T *tmp_ptr; - tmp_ptr = d_A; - d_A = A; - A = tmp_ptr; -} - int main(int argc, char *argv[]) { - std::cout << "\n\nRAJA matrix transpose example...\n"; - if(argc != 2) { RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); } @@ -185,17 +173,26 @@ int main(int argc, char *argv[]) RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; std::cout<<"Running RAJA::launch matrix transpose example on the host"<(N_r * N_c); - int *At = memoryManager::allocate(N_r * N_c); - + int *A = host_res.allocate(N_r * N_c); + int *At = host_res.allocate(N_r * N_c); // // In the following implementations of matrix transpose, we // use RAJA 'View' objects to access the matrix data. 
A RAJA view @@ -300,20 +296,24 @@ int main(int argc, char *argv[]) std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; -#if defined(RAJA_ENABLE_HIP) + //Reset memory + std::memset(At, 0, N_r * N_c * sizeof(int)); - //Hip requires device side pointers +#if defined(RAJA_GPU_ACTIVE) + //Allocate device side pointers int *d_A = nullptr, *d_At = nullptr; if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { - d_A = memoryManager::allocate_gpu(N_r * N_c); - d_At = memoryManager::allocate_gpu(N_r * N_c); - hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + d_A = device_res.allocate(N_r * N_c); + d_At = device_res.allocate(N_r * N_c); + + device_res.memcpy(d_A, A, sizeof(int) * N_r * N_c); + device_res.memcpy(d_At, At, sizeof(int) * N_r * N_c); //switch host/device pointers so we can reuse the views - switch_ptrs(d_A, A); - switch_ptrs(d_At, At); + Aview.set_data(d_A); + Atview.set_data(d_At); } #endif @@ -323,13 +323,11 @@ int main(int argc, char *argv[]) // _dynamic_mattranspose_kernel_start RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc), - RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), + (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), + RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), "Matrix tranpose with dynamic shared memory kernel", [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ @@ -378,24 +376,37 @@ int main(int argc, char *argv[]) ctx.releaseSharedMemory(); }); }); - }); // _dynamic_mattranspose_kernel_end - -#if defined(RAJA_ENABLE_HIP) +#if defined(RAJA_GPU_ACTIVE) if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { - switch_ptrs(d_At, At); - switch_ptrs(d_A, A); - hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + device_res.memcpy(A, d_A, sizeof(int) * N_r * N_c); + device_res.memcpy(At, d_At, sizeof(int) * N_r * N_c); + + Aview.set_data(A); + Atview.set_data(At); } #endif checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); //----------------------------------------------------------------------------// + //Release data + host_res.deallocate(A); + host_res.deallocate(At); + +#if defined(RAJA_GPU_ACTIVE) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + device_res.deallocate(d_A); + device_res.deallocate(d_At); + } +#endif + + return 0; } diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp index 8e16113c95..fb82582704 100644 --- a/examples/forall-param-reductions.cpp +++ b/examples/forall-param-reductions.cpp @@ -9,8 +9,6 @@ #include #include -#include "memoryManager.hpp" - #include "RAJA/RAJA.hpp" /* @@ -39,6 +37,10 @@ constexpr int CUDA_BLOCK_SIZE = 256; constexpr int HIP_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_SYCL) +constexpr int SYCL_BLOCK_SIZE = 256; +#endif + int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { @@ -53,13 +55,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Allocate array data and initialize data to alternating sequence of 1, -1. 
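// The allocation just below uses a RAJA host resource (host_res); the matching
// host_res.deallocate(a) call appears in the clean-up section at the end of main.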
// - int* a = memoryManager::allocate(N); + RAJA::resources::Host host_res; + int* a = host_res.allocate(N); for (int i = 0; i < N; ++i) { if ( i % 2 == 0 ) { a[i] = 1; } else { - a[i] = -1; + a[i] = -1; } } @@ -103,19 +106,20 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _reductions_raja_seq_start using EXEC_POL1 = RAJA::seq_exec; - + int seq_sum = 0; int seq_min = std::numeric_limits::max(); int seq_max = std::numeric_limits::min(); VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(host_res, arange, RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), RAJA::expt::Reduce(&seq_minloc), RAJA::expt::Reduce(&seq_maxloc), + RAJA::expt::KernelName("RAJA Reduce Seq Kernel"), [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { _seq_sum += a[i]; @@ -126,8 +130,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); //_seq_minloc.min(a[i], i); //_seq_maxloc.max(a[i], i); - // Note : RAJA::expt::ValLoc objects provide min() and max() methods - // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX // above. } ); @@ -135,12 +139,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << seq_sum << std::endl; std::cout << "\tmin = " << seq_min << std::endl; std::cout << "\tmax = " << seq_max << std::endl; - std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " << seq_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " << seq_maxloc.getLoc() << std::endl; // _reductions_raja_seq_end - + //----------------------------------------------------------------------------// @@ -157,12 +161,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(host_res, arange, RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), RAJA::expt::Reduce(&omp_minloc), RAJA::expt::Reduce(&omp_maxloc), + RAJA::expt::KernelName("RAJA Reduce OpenMP Kernel"), [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { _omp_sum += a[i]; @@ -179,9 +184,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << omp_sum << std::endl; std::cout << "\tmin = " << omp_min << std::endl; std::cout << "\tmax = " << omp_max << std::endl; - std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " << omp_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " << omp_maxloc.getLoc() << std::endl; #endif @@ -191,6 +196,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_TARGET_OPENMP) std::cout << "\n Running RAJA OpenMP Target reductions...\n"; + RAJA::resources::Omp omp_res; + // 
_reductions_raja_omppolicy_start using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; // _reductions_raja_omppolicy_end @@ -201,12 +208,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(omp_res, arange, RAJA::expt::Reduce(&omp_t_sum), RAJA::expt::Reduce(&omp_t_min), RAJA::expt::Reduce(&omp_t_max), RAJA::expt::Reduce(&omp_t_minloc), RAJA::expt::Reduce(&omp_t_maxloc), + RAJA::expt::KernelName("RAJA Reduce Target OpenMP Kernel"), [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { _omp_t_sum += a[i]; @@ -223,9 +231,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << omp_t_sum << std::endl; std::cout << "\tmin = " << omp_t_min << std::endl; std::cout << "\tmax = " << omp_t_max << std::endl; - std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " << omp_t_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " << omp_t_maxloc.getLoc() << std::endl; #endif @@ -236,6 +244,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; + RAJA::resources::Cuda cuda_res; + + int* d_a = cuda_res.allocate(N); + cuda_res.memcpy(d_a, a, sizeof(int) * N); + // _reductions_raja_cudapolicy_start using EXEC_POL3 = RAJA::cuda_exec; // _reductions_raja_cudapolicy_end @@ -246,20 +259,21 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(cuda_res, arange, RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), RAJA::expt::Reduce(&cuda_max), RAJA::expt::Reduce(&cuda_minloc), RAJA::expt::Reduce(&cuda_maxloc), + RAJA::expt::KernelName("RAJA Reduce CUDA Kernel"), [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { - _cuda_sum += a[i]; + _cuda_sum += d_a[i]; - _cuda_min = RAJA_MIN(a[i], _cuda_min); - _cuda_max = RAJA_MAX(a[i], _cuda_max); + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); } @@ -268,11 +282,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << cuda_sum << std::endl; std::cout << "\tmin = " << cuda_min << std::endl; std::cout << "\tmax = " << cuda_max << std::endl; - std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " << cuda_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " << cuda_maxloc.getLoc() << std::endl; - + cuda_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -280,8 
+294,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; - int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + RAJA::resources::Hip hip_res; + + int* d_a = hip_res.allocate(N); + hip_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start using EXEC_POL3 = RAJA::hip_exec; @@ -293,12 +309,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); - RAJA::forall(arange, + RAJA::forall(arange, RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), RAJA::expt::Reduce(&hip_max), RAJA::expt::Reduce(&hip_minloc), RAJA::expt::Reduce(&hip_maxloc), + RAJA::expt::KernelName("RAJA Reduce HIP Kernel"), [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { _hip_sum += d_a[i]; @@ -315,12 +332,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tsum = " << hip_sum << std::endl; std::cout << "\tmin = " << hip_min << std::endl; std::cout << "\tmax = " << hip_max << std::endl; - std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " + std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " << hip_minloc.getLoc() << std::endl; - std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " + std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " << hip_maxloc.getLoc() << std::endl; - memoryManager::deallocate_gpu(d_a); + hip_res.deallocate(d_a); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL reductions...\n"; + + RAJA::resources::Sycl sycl_res; + + int* d_a = sycl_res.allocate(N); + sycl_res.memcpy(d_a, a, sizeof(int) * N); + + // _reductions_raja_syclpolicy_start + using EXEC_POL3 = RAJA::sycl_exec; + // _reductions_raja_syclpolicy_end + + int sycl_sum = 0; + int sycl_min = std::numeric_limits::max(); + int sycl_max = std::numeric_limits::min(); + VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); + VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(sycl_res, arange, + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + RAJA::expt::KernelName("RAJA Reduce SYCL Kernel"), + [=] RAJA_DEVICE (int i, int &_sycl_sum, int &_sycl_min, int &_sycl_max, VALLOC_INT &_sycl_minloc, VALLOC_INT &_sycl_maxloc) { + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + } + ); + + std::cout << "\tsum = " << sycl_sum << std::endl; + std::cout << "\tmin = " << sycl_min << std::endl; + std::cout << "\tmax = " << sycl_max << std::endl; + std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " + << sycl_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " + << sycl_maxloc.getLoc() << std::endl; + + sycl_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -328,9 +396,9 @@ int main(int 
RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Clean up. // - memoryManager::deallocate(a); + host_res.deallocate(a); std::cout << "\n DONE!...\n"; - + return 0; } diff --git a/examples/forall_multi-reductions.cpp b/examples/forall_multi-reductions.cpp new file mode 100644 index 0000000000..0010dd2848 --- /dev/null +++ b/examples/forall_multi-reductions.cpp @@ -0,0 +1,166 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +/* + * MultiReduction Example using RAJA forall + * + * This example illustrates use of the RAJA multi-reduction types: min, max, + * sum, and, and or. + * + * RAJA features shown: + * - `forall' loop iteration template method + * - Index range segment + * - Execution policies + * - MultiReduction types + * + */ + +template < typename t_exec_policy, typename t_multi_reduce_policy > +struct Backend +{ + using exec_policy = t_exec_policy; + using multi_reduce_policy = t_multi_reduce_policy; + + std::string name; +}; + +auto example_policies = camp::make_tuple( + + Backend{"Sequential"} + +#if defined(RAJA_ENABLE_OPENMP) + , Backend{"OpenMP"} +#endif + +#if defined(RAJA_ENABLE_CUDA) + , Backend, RAJA::cuda_multi_reduce_atomic>{"Cuda"} +#endif + +#if defined(RAJA_ENABLE_HIP) + , Backend, RAJA::hip_multi_reduce_atomic>{"Hip"} +#endif + + ); + +template < typename exec_policy, typename multi_reduce_policy > +void example_code(RAJA::RangeSegment arange, int num_bins, int* bins, int* a) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + + RAJA::forall(arange, + [=] RAJA_HOST_DEVICE(RAJA::Index_type i) { + + int bin = bins[i]; + + multi_reduce_sum[bin] += a[i]; + multi_reduce_min[bin].min(a[i]); + multi_reduce_max[bin].max(a[i]); + multi_reduce_and[bin] &= a[i]; + multi_reduce_or [bin] |= a[i]; + + }); + + for (int bin = 0; bin < num_bins; ++bin) { + std::cout << "\tsum[" << bin << "] = " << multi_reduce_sum.get(bin) << '\n'; + std::cout << "\tmin[" << bin << "] = " << multi_reduce_min.get(bin) << '\n'; + std::cout << "\tmax[" << bin << "] = " << multi_reduce_max.get(bin) << '\n'; + std::cout << "\tand[" << bin << "] = " << multi_reduce_and.get(bin) << '\n'; + std::cout << "\tor [" << bin << "] = " << multi_reduce_or .get(bin) << '\n'; + std::cout << '\n'; + } +} + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) +{ + + // _multi_reductions_array_init_start +// +// Define array length +// + const int N = 1000000; + const int num_bins = 10; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. 
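// (More precisely for this example: host_bins[i] cycles through 0..num_bins-1 and
// host_a[i] takes integer values from -num_bins to num_bins-1, as described in the
// note further below.)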
+// + camp::resources::Host host_res; + int* host_bins = host_res.template allocate(N); + int* host_a = host_res.template allocate(N); + + for (int i = 0; i < N; ++i) { + host_bins[i] = i % num_bins; + host_a[i] = (i % (2*num_bins)) - num_bins; + } + + // _multi_reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// for bin in [0, num_bins) +// - the sum will be (bin - num_bins/2) * N / num_bins +// - the min will be bin - num_bins +// - the max will be bin +// - the and will be min & max +// - the or will be min | max +// + +// +// Define index range for iterating over a elements in all examples +// + // _multi_reductions_range_start + RAJA::RangeSegment arange(0, N); + // _multi_reductions_range_end + +//----------------------------------------------------------------------------// + + RAJA::for_each_tuple(example_policies, [&](auto const& backend) { + + std::cout << "Running " << backend.name << " policies" << '\n'; + + using exec_policy = typename std::decay_t::exec_policy; + using multi_reduce_policy = typename std::decay_t::multi_reduce_policy; + + auto res = RAJA::resources::get_default_resource(); + + int* bins = res.template allocate(N); + int* a = res.template allocate(N); + + res.memcpy(bins, host_bins, N*sizeof(int)); + res.memcpy(a , host_a , N*sizeof(int)); + + example_code(arange, num_bins, bins, a); + + res.deallocate(bins); + res.deallocate(a ); + + std::cout << std::endl; + }); + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + host_res.deallocate(host_bins); + host_res.deallocate(host_a ); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 7dec3595a6..b57bedfd6b 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -9,8 +9,6 @@ #include #include -#include "memoryManager.hpp" - #include "RAJA/RAJA.hpp" /* @@ -39,6 +37,11 @@ constexpr int CUDA_BLOCK_SIZE = 256; constexpr int HIP_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_SYCL) +//LC testing hardware has a limit of 151 +constexpr int SYCL_BLOCK_SIZE = 128; +#endif + int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { @@ -50,10 +53,25 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // constexpr int N = 1000000; +// +// Use a resource to allocate memory +// + RAJA::resources::Host host_res; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda device_res; +#endif +#if defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip device_res; +#endif +#if defined(RAJA_ENABLE_SYCL) + RAJA::resources::Sycl device_res; +#endif + + // // Allocate array data and initialize data to alternating sequence of 1, -1. 
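The recurring change in these example programs is to let RAJA resource objects own allocation, copying, and deallocation instead of the memoryManager helper. Below is a minimal hedged sketch of that pattern, with the allocate<int> template argument spelled out; it is illustrative only, and the CUDA branch compiles only when RAJA_ENABLE_CUDA is defined.

    #include "RAJA/RAJA.hpp"

    void resource_memory_sketch()
    {
      constexpr int N = 1000000;

      RAJA::resources::Host host_res;
      int* a = host_res.allocate<int>(N);            // replaces memoryManager::allocate
      for (int i = 0; i < N; ++i) { a[i] = (i % 2 == 0) ? 1 : -1; }

    #if defined(RAJA_ENABLE_CUDA)
      RAJA::resources::Cuda device_res;
      int* d_a = device_res.allocate<int>(N);        // replaces memoryManager::allocate_gpu
      device_res.memcpy(d_a, a, sizeof(int) * N);    // replaces the explicit cudaMemcpy/hipMemcpy calls
      // ... launch device kernels that read d_a ...
      device_res.deallocate(d_a);
    #endif

      host_res.deallocate(a);                        // replaces memoryManager::deallocate
    }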
// - int* a = memoryManager::allocate(N); + int* a = host_res.allocate(N); for (int i = 0; i < N; ++i) { if ( i % 2 == 0 ) { @@ -111,9 +129,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); - //RAJA::forall(arange, RAJA::launch - (RAJA::LaunchParams(), "SeqReductionKernel", + (host_res, RAJA::LaunchParams(), "SeqReductionKernel", RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), @@ -171,7 +188,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(), "OmpReductionKernel", + (host_res, RAJA::LaunchParams(), "OmpReductionKernel", RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), @@ -214,6 +231,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) std::cout << "\n Running RAJA CUDA reductions...\n"; + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); + // _reductions_raja_cudapolicy_start using LAUNCH_POL3 = RAJA::LaunchPolicy>; using LOOP_POL3 = RAJA::LoopPolicy; @@ -228,7 +248,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), "CUDAReductionKernel", RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), @@ -242,13 +262,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::loop(ctx, arange, [&] (int i) { - _cuda_sum += a[i]; + _cuda_sum += d_a[i]; - _cuda_min = RAJA_MIN(a[i], _cuda_min); - _cuda_max = RAJA_MAX(a[i], _cuda_max); + _cuda_min = RAJA_MIN(d_a[i], _cuda_min); + _cuda_max = RAJA_MAX(d_a[i], _cuda_max); - _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); - _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + _cuda_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _cuda_maxloc); //_cuda_minloc.min(a[i], i); //_cuda_maxloc.max(a[i], i); @@ -267,6 +287,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " << cuda_maxloc.getLoc() << std::endl; + device_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -274,8 +295,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) std::cout << "\n Running RAJA HIP reductions...\n"; - int* d_a = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); // _reductions_raja_hippolicy_start using LAUNCH_POL3 = RAJA::LaunchPolicy>; @@ -291,7 +312,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(HIP_BLOCK_SIZE)), "HipReductionKernel", RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), @@ -329,7 +350,70 @@ int main(int 
RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " << hip_maxloc.getLoc() << std::endl; - memoryManager::deallocate_gpu(d_a); + device_res.deallocate(d_a); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL reductions...\n"; + + int* d_a = device_res.allocate(N); + device_res.memcpy(d_a, a, sizeof(int) * N); + + // _reductions_raja_syclpolicy_start + using LAUNCH_POL4 = RAJA::LaunchPolicy>; + using LOOP_POL4 = RAJA::LoopPolicy; + // _reductions_raja_syclpolicy_end + + const int NUMBER_OF_TEAMS = (N-1)/SYCL_BLOCK_SIZE + 1; + + int sycl_sum = 0; + int sycl_min = std::numeric_limits::max(); + int sycl_max = std::numeric_limits::min(); + VALLOC_INT sycl_minloc(std::numeric_limits::max(), -1); + VALLOC_INT sycl_maxloc(std::numeric_limits::min(), -1); + + RAJA::launch + (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), + "SyclReductionKernel", + RAJA::expt::Reduce(&sycl_sum), + RAJA::expt::Reduce(&sycl_min), + RAJA::expt::Reduce(&sycl_max), + RAJA::expt::Reduce(&sycl_minloc), + RAJA::expt::Reduce(&sycl_maxloc), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, + int &_sycl_sum, int &_sycl_min, + int &_sycl_max, VALLOC_INT &_sycl_minloc, + VALLOC_INT &_sycl_maxloc) { + + RAJA::loop(ctx, arange, [&] (int i) { + + _sycl_sum += d_a[i]; + + _sycl_min = RAJA_MIN(d_a[i], _sycl_min); + _sycl_max = RAJA_MAX(d_a[i], _sycl_max); + + _sycl_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _sycl_minloc); + _sycl_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _sycl_maxloc); + //_sycl_minloc.min(d_a[i], i); + //_sycl_maxloc.max(d_a[i], i); + + } + ); + + } + ); + + std::cout << "\tsum = " << sycl_sum << std::endl; + std::cout << "\tmin = " << sycl_min << std::endl; + std::cout << "\tmax = " << sycl_max << std::endl; + std::cout << "\tmin, loc = " << sycl_minloc.getVal() << " , " + << sycl_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << sycl_maxloc.getVal() << " , " + << sycl_maxloc.getLoc() << std::endl; + + device_res.deallocate(d_a); #endif //----------------------------------------------------------------------------// @@ -337,7 +421,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Clean up. 
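For reference, the RAJA::launch reduction pattern used in the CUDA, HIP, and SYCL hunks above, written host-only with the launch and loop policies spelled out. This is a hedged sketch rather than patch content; it assumes the sequential policy names RAJA::seq_launch_t and RAJA::seq_exec and the RAJA::operators reduction operators.

    #include <iostream>
    #include <limits>
    #include <vector>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N = 1000;
      std::vector<int> a(N, 1);
      const int* a_ptr = a.data();

      using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
      using loop_policy   = RAJA::LoopPolicy<RAJA::seq_exec>;

      RAJA::resources::Host host_res;

      int sum = 0;
      int mn  = std::numeric_limits<int>::max();

      RAJA::launch<launch_policy>(
        host_res, RAJA::LaunchParams(), "SketchReductionKernel",
        RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
        RAJA::expt::Reduce<RAJA::operators::minimum>(&mn),
        [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, int& _sum, int& _mn) {

          RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i) {
            _sum += a_ptr[i];
            _mn   = RAJA_MIN(a_ptr[i], _mn);
          });

        });

      std::cout << "sum = " << sum << ", min = " << mn << '\n';
      return 0;
    }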
// - memoryManager::deallocate(a); + host_res.deallocate(a); std::cout << "\n DONE!...\n"; diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp index 01d45bb2ae..0b35017fac 100644 --- a/examples/resource-dynamic-forall.cpp +++ b/examples/resource-dynamic-forall.cpp @@ -110,9 +110,12 @@ int main(int argc, char *argv[]) #if defined(RAJA_ENABLE_HIP) RAJA::resources::Hip device_res; #endif +#if defined(RAJA_ENABLE_SYCL) + RAJA::resources::Sycl device_res; +#endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); diff --git a/examples/resource-runtime-launch.cpp b/examples/resource-runtime-launch.cpp index 9524c32cde..e52923d81f 100644 --- a/examples/resource-runtime-launch.cpp +++ b/examples/resource-runtime-launch.cpp @@ -153,7 +153,7 @@ int main(int argc, char *argv[]) #endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_GPU_ACTIVE) && !defined(RAJA_ENABLE_SYCL) RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index c37ac997a4..59cca4bf22 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -165,6 +165,7 @@ // Reduction objects // #include "RAJA/pattern/reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" // diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp index af9a6af911..d5905f7928 100644 --- a/include/RAJA/pattern/atomic.hpp +++ b/include/RAJA/pattern/atomic.hpp @@ -80,6 +80,32 @@ namespace RAJA */ +/*! + * @brief Atomic load + * @param acc Pointer to location of value + * @return Value at acc + */ +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc) +{ + return RAJA::atomicLoad(Policy{}, acc); +} + + +/*! + * @brief Atomic store + * @param acc Pointer to location of value + * @param value Value to store at *acc + */ +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value) +{ + RAJA::atomicStore(Policy{}, acc, value); +} + + /*! 
* @brief Atomic add * @param acc Pointer to location of result value @@ -88,7 +114,7 @@ namespace RAJA */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value) { return RAJA::atomicAdd(Policy{}, acc, value); } @@ -102,7 +128,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value) { return RAJA::atomicSub(Policy{}, acc, value); } @@ -116,7 +142,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value) { return RAJA::atomicMin(Policy{}, acc, value); } @@ -130,7 +156,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value) { return RAJA::atomicMax(Policy{}, acc, value); } @@ -143,7 +169,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc) { return RAJA::atomicInc(Policy{}, acc); } @@ -159,7 +185,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare) { return RAJA::atomicInc(Policy{}, acc, compare); } @@ -172,7 +198,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T volatile *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc) { return RAJA::atomicDec(Policy{}, acc); } @@ -188,7 +214,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc, T compare) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare) { return RAJA::atomicDec(Policy{}, acc, compare); } @@ -203,7 +229,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T volatile *acc, T compare) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value) { static_assert(std::is_integral::value, "atomicAnd can only be used on integral types"); @@ -220,7 +246,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value) { static_assert(std::is_integral::value, "atomicOr can only be used on integral types"); @@ -237,7 +263,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value) { static_assert(std::is_integral::value, "atomicXor can only be used on integral types"); @@ -253,7 +279,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T volatile *acc, T value) */ RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T volatile *acc, T value) 
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value) { return RAJA::atomicExchange(Policy{}, acc, value); } @@ -269,7 +295,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template -RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T volatile *acc, T compare, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value) { return RAJA::atomicCAS(Policy{}, acc, compare, value); } @@ -292,31 +318,34 @@ class AtomicRef RAJA_INLINE RAJA_HOST_DEVICE constexpr explicit AtomicRef(value_type *value_ptr) - : m_value_ptr(value_ptr){}; + : m_value_ptr(value_ptr) {} RAJA_INLINE RAJA_HOST_DEVICE - constexpr AtomicRef(AtomicRef const&c) - : m_value_ptr(c.m_value_ptr){}; + constexpr AtomicRef(AtomicRef const &c) + : m_value_ptr(c.m_value_ptr) {} AtomicRef& operator=(AtomicRef const&) = delete; RAJA_INLINE RAJA_HOST_DEVICE - value_type volatile * getPointer() const { return m_value_ptr; } + value_type * getPointer() const + { + return m_value_ptr; + } RAJA_INLINE RAJA_HOST_DEVICE void store(value_type rhs) const { - *m_value_ptr = rhs; + RAJA::atomicStore(m_value_ptr, rhs); } RAJA_INLINE RAJA_HOST_DEVICE value_type operator=(value_type rhs) const { - *m_value_ptr = rhs; + RAJA::atomicStore(m_value_ptr, rhs); return rhs; } @@ -324,14 +353,14 @@ class AtomicRef RAJA_HOST_DEVICE value_type load() const { - return *m_value_ptr; + return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE RAJA_HOST_DEVICE operator value_type() const { - return *m_value_ptr; + return RAJA::atomicLoad(m_value_ptr); } RAJA_INLINE @@ -498,7 +527,7 @@ class AtomicRef } private: - value_type volatile *m_value_ptr; + value_type *m_value_ptr; }; diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp new file mode 100644 index 0000000000..884b9aa989 --- /dev/null +++ b/include/RAJA/pattern/detail/multi_reduce.hpp @@ -0,0 +1,420 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Base types used in common for RAJA reducer objects. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
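The atomic interface changes above drop the volatile qualifier from the free functions and add atomicLoad/atomicStore, which AtomicRef now uses for its load and store paths. Below is a minimal hedged sketch of the resulting host-side usage; the builtin_atomic policy is chosen only for illustration, and the AtomicRef template argument order (value type, then policy) is assumed.

    #include <iostream>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      int counter = 0;

      // Free-function interface: plain (non-volatile) pointers.
      RAJA::atomicStore<RAJA::builtin_atomic>(&counter, 5);
      RAJA::atomicAdd<RAJA::builtin_atomic>(&counter, 2);
      int seen = RAJA::atomicLoad<RAJA::builtin_atomic>(&counter);   // 7

      // AtomicRef wrapper: store()/load() now forward to atomicStore()/atomicLoad().
      RAJA::AtomicRef<int, RAJA::builtin_atomic> ref(&counter);
      ref.store(10);
      int val = ref.load();                                          // 10

      std::cout << seen << ' ' << val << '\n';
      return 0;
    }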
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP +#define RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP + +#include "RAJA/pattern/detail/forall.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/Operators.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/RepeatView.hpp" + + +#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA) \ + template \ + struct MultiReduce##OP_NAME, T> \ + : reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>> \ + { \ + using policy = POL; \ + using Base = reduce::detail::BaseMultiReduce##OP_NAME< \ + DATA, tuning>>; \ + using Base::Base; \ + using typename Base::value_type; \ + using typename Base::reference; \ + \ + RAJA_SUPPRESS_HD_WARN \ + RAJA_HOST_DEVICE \ + reference operator[](size_t bin) const \ + { \ + return reference(*this, bin); \ + } \ + }; + +#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA) \ + RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA) + +namespace RAJA +{ + +namespace reduce +{ + +namespace detail +{ + +template +struct BaseMultiReduce +{ + using MultiReduceData = t_MultiReduceData; + using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp; + using value_type = typename t_MultiReduceData::value_type; + + BaseMultiReduce() : BaseMultiReduce{RepeatView(MultiReduceOp::identity(), 0)} {} + + explicit BaseMultiReduce(size_t num_bins, + value_type init_val = MultiReduceOp::identity(), + value_type identity = MultiReduceOp::identity()) + : BaseMultiReduce{RepeatView(init_val, num_bins), identity} + { } + + template < typename Container, + concepts::enable_if_t, + concepts::negate>, + concepts::negate>>* = nullptr > + explicit BaseMultiReduce(Container const& container, + value_type identity = MultiReduceOp::identity()) + : data{container, identity} + { } + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduce(BaseMultiReduce const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduce(BaseMultiReduce &&) = default; + BaseMultiReduce &operator=(BaseMultiReduce const&) = delete; + BaseMultiReduce &operator=(BaseMultiReduce &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduce() = default; + + void reset() + { + reset(RepeatView(MultiReduceOp::identity(), size())); + } + + void reset(size_t num_bins, + value_type init_val = MultiReduceOp::identity(), + value_type identity = MultiReduceOp::identity()) + { + reset(RepeatView(init_val, num_bins), identity); + } + + template < typename Container, + concepts::enable_if_t>* = nullptr > + void reset(Container const& container, + value_type identity = MultiReduceOp::identity()) + { + for (size_t bin = 0; bin < data.num_bins(); ++bin) { + RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset + } + data.reset(container, identity); + } + + RAJA_SUPPRESS_HD_WARN + RAJA_HOST_DEVICE + size_t size() const { return data.num_bins(); } + + RAJA_SUPPRESS_HD_WARN + RAJA_HOST_DEVICE + BaseMultiReduce const& combine(size_t bin, value_type const &other) const + { + data.combine(bin, other); + return *this; + } + + //! Get the calculated reduced value for a bin + value_type get(size_t bin) const { return data.get(bin); } + + //! 
Get the calculated reduced value for each bin and store it in container + template < typename Container, + concepts::enable_if_t>* = nullptr > + void get_all(Container& container) const + { + RAJA_EXTRACT_BED_IT(container); + if (size_t(distance_it) != data.num_bins()) { + RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer"); + } + size_t bin = 0; + for (auto& val : container) { + val = data.get(bin); + ++bin; + } + } + +private: + MultiReduceData mutable data; +}; + + +/*! + ****************************************************************************** + * + * \brief Min reducer class template. + * + ****************************************************************************** + */ +template +class BaseMultiReduceMin : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin(BaseMultiReduceMin const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin(BaseMultiReduceMin &&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceMin() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceMin const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& min(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceMin const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Max reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceMax : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMax(BaseMultiReduceMax const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceMax(BaseMultiReduceMax &&) = default; + BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete; + BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceMax() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceMax const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& max(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceMax const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Sum reducer class template. 
+ * + ************************************************************************** + */ +template +class BaseMultiReduceSum : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceSum(BaseMultiReduceSum const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceSum(BaseMultiReduceSum &&) = default; + BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete; + BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceSum() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceSum const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator+=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceSum const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Bitwise OR reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceBitOr : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default; + BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete; + BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceBitOr() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceBitOr const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator|=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceBitOr const& m_base; + size_t m_bin; + }; +}; + +/*! + ************************************************************************** + * + * \brief Bitwise AND reducer class template. + * + ************************************************************************** + */ +template +class BaseMultiReduceBitAnd : public BaseMultiReduce +{ +public: + using Base = BaseMultiReduce; + using typename Base::value_type; + + using Base::Base; + + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default; + RAJA_SUPPRESS_HD_WARN + BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default; + BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete; + BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete; + RAJA_SUPPRESS_HD_WARN + ~BaseMultiReduceBitAnd() = default; + + struct reference + { + RAJA_HOST_DEVICE + reference(BaseMultiReduceBitAnd const& base, size_t bin) + : m_base(base), m_bin(bin) + { } + + //! 
reducer function; updates the current instance's state + RAJA_HOST_DEVICE + reference const& operator&=(value_type rhs) const + { + m_base.combine(m_bin, rhs); + return *this; + } + + value_type get() const + { + return m_base.get(m_bin); + } + + private: + BaseMultiReduceBitAnd const& m_base; + size_t m_bin; + }; +}; + +} // namespace detail + +} // namespace reduce + +} // namespace RAJA + +#endif /* RAJA_PATTERN_DETAIL_MULTI_REDUCE_HPP */ diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 1c11ad92bc..539c451673 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -145,4 +145,4 @@ struct StatementExecutor< } // end namespace RAJA -#endif /* RAJA_pattern_nested_HPP */ +#endif /* RAJA_pattern_kernel_For_HPP */ diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index 53b595564e..43f72e0545 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -169,13 +169,13 @@ struct IterableTiler { } RAJA_HOST_DEVICE - RAJA_INLINE bool operator!=(const IterableTiler &rhs) const + RAJA_INLINE bool operator!=(const iterator &rhs) const { return block_id != rhs.block_id; } RAJA_HOST_DEVICE - RAJA_INLINE bool operator<(const IterableTiler &rhs) const + RAJA_INLINE bool operator<(const iterator &rhs) const { return block_id < rhs.block_id; } diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp index 8cef228874..9667a55538 100644 --- a/include/RAJA/pattern/kernel/internal/LoopData.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp @@ -214,7 +214,7 @@ struct GenericWrapper : GenericWrapperBase { /*! - * Convenience object used to create thread-private a LoopData object. + * Convenience object used to create a thread-private LoopData object. */ template struct NestedPrivatizer { diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp index e47fe59e37..7f77df4214 100644 --- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp +++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp @@ -3,8 +3,7 @@ * * \file * - * \brief Header file for loop kernel internals: LoopData structure and - * related helper functions. + * \brief Header file for loop kernel internals and related helper functions. * ****************************************************************************** */ @@ -93,4 +92,4 @@ using setSegmentTypeFromData = } // end namespace RAJA -#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ +#endif /* RAJA_pattern_kernel_internal_LoopTypes_HPP */ diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp index 7b34949570..c750b95986 100644 --- a/include/RAJA/pattern/kernel/internal/Template.hpp +++ b/include/RAJA/pattern/kernel/internal/Template.hpp @@ -3,8 +3,7 @@ * * \file * - * \brief Header file for loop kernel internals: LoopData structure and - * related helper functions. + * \brief Header file for loop kernel internals and helper functions. 
* ****************************************************************************** */ @@ -83,4 +82,4 @@ using tuple_of_n = typename detail::TupleOfNHelper>:: } // end namespace RAJA -#endif /* RAJA_pattern_kernel_internal_LoopData_HPP */ +#endif /* RAJA_pattern_kernel_internal_Template_HPP */ diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 213c435236..b78ec0de92 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -374,23 +374,21 @@ void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&.. } - - // Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) template RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} else { return RAJA::resources::Resource(host_res); } } -#else +#endif + template RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} return RAJA::resources::Resource(host_res); } -#endif //Launch API which takes team resource struct and supports new reducers template diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp new file mode 100644 index 0000000000..3fbe36877c --- /dev/null +++ b/include/RAJA/pattern/multi_reduce.hpp @@ -0,0 +1,194 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file providing RAJA reduction declarations. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_multi_reduce_HPP +#define RAJA_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/Operators.hpp" +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +// +// Forward declarations for multi reduction templates. +// Actual classes appear in forall_*.hxx header files. +// +// IMPORTANT: multi reduction policy parameter must be consistent with loop +// execution policy type. +// +// Also, multiple multi reductions using different reduction operations may be +// combined in a single RAJA forall() construct. +// + +/*! + ****************************************************************************** + * + * \brief Min multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr min_vals = ...; + + MultiReduceMin my_mins(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_mins[bins[i]].min(data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + min_vals[bin] = my_mins[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceMin; + +/*! 
+ ****************************************************************************** + * + * \brief Max multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr max_vals = ...; + + MultiReduceMax my_maxs(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_maxs[bins[i]].max(data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + max_vals[bin] = my_maxs[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceMax; + +/*! + ****************************************************************************** + * + * \brief Sum multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr sum_vals = ...; + + MultiReduceSum my_sums(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_sums[bins[i]] += (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + sum_vals[bin] = my_sums[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceSum; + +/*! + ****************************************************************************** + * + * \brief Bitwise OR multi reducer class template. + * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr bit_vals = ...; + + MultiReduceBitOr my_bits(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_bits[bins[i]] |= (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + bit_vals[bin] = my_bits[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceBitOr; + + +/*! + ****************************************************************************** + * + * \brief Bitwise AND multi reducer class template. 
+ * + * Usage example: + * + * \verbatim + + Real_ptr data = ...; + Index_ptr bins = ...; + Real_ptr bit_vals = ...; + + MultiReduceBitAnd my_bits(num_bins, init_val); + + forall( ..., [=] (Index_type i) { + my_bits[bins[i]] &= (data[i]); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + bit_vals[bin] = my_bits[bin].get(); + } + + * \endverbatim + * + ****************************************************************************** + */ +template +struct MultiReduceBitAnd; + +} //namespace RAJA + + +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 7e685e2ce0..fb854c8706 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -2,12 +2,17 @@ #define FORALL_PARAM_HPP #include "RAJA/policy/sequential/params/reduce.hpp" +#include "RAJA/policy/sequential/params/kernel_name.hpp" #include "RAJA/policy/openmp/params/reduce.hpp" +#include "RAJA/policy/openmp/params/kernel_name.hpp" #include "RAJA/policy/openmp_target/params/reduce.hpp" +#include "RAJA/policy/openmp_target/params/kernel_name.hpp" #include "RAJA/policy/cuda/params/reduce.hpp" #include "RAJA/policy/cuda/params/kernel_name.hpp" #include "RAJA/policy/hip/params/reduce.hpp" +#include "RAJA/policy/hip/params/kernel_name.hpp" #include "RAJA/policy/sycl/params/reduce.hpp" +#include "RAJA/policy/sycl/params/kernel_name.hpp" #include "RAJA/util/CombiningAdapter.hpp" diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp index f867265b8a..898c92a621 100644 --- a/include/RAJA/policy/PolicyBase.hpp +++ b/include/RAJA/policy/PolicyBase.hpp @@ -42,6 +42,7 @@ enum class Pattern { forall, region, reduce, + multi_reduce, taskgraph, synchronize, workgroup, @@ -110,6 +111,25 @@ struct platform_is : camp::num>::value == P_> { }; +template +struct policy_has_trait_impl + : camp::num { +}; +/// +template +struct policy_has_trait_impl< + PolicyBaseT, Trait> + : camp::num...>::value> { +}; +/// +template +using policy_has_trait = policy_has_trait_impl, Trait>; + + template struct wrapper { using inner = Inner; @@ -121,6 +141,9 @@ namespace reduce struct ordered { }; +struct unordered { +}; + } // namespace reduce @@ -201,6 +224,15 @@ struct is_device_exec_policy DefineTypeTraitFromConcept(is_execution_policy, RAJA::concepts::ExecutionPolicy); + +template +struct is_reduce_policy : RAJA::pattern_is { +}; + +template +struct is_multi_reduce_policy : RAJA::pattern_is { +}; + } // end namespace type_traits } // end namespace RAJA diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp index a64212b665..e0ca557b32 100644 --- a/include/RAJA/policy/atomic_auto.hpp +++ b/include/RAJA/policy/atomic_auto.hpp @@ -63,81 +63,91 @@ namespace RAJA struct auto_atomic { }; +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T *acc) +{ + return atomicLoad(RAJA_AUTO_ATOMIC, acc); +} template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T *acc, T value) { - return atomicAdd(RAJA_AUTO_ATOMIC, acc, value); + atomicStore(RAJA_AUTO_ATOMIC, acc, value); } +template +RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T *acc, T value) +{ + return atomicAdd(RAJA_AUTO_ATOMIC, acc, value); +} template -RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T *acc, T value) { return 
atomicSub(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T *acc, T value) { return atomicMin(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T *acc, T value) { return atomicMax(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc) { return atomicInc(RAJA_AUTO_ATOMIC, acc); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, - T volatile *acc, + T *acc, T compare) { return atomicInc(RAJA_AUTO_ATOMIC, acc, compare); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T volatile *acc) +RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc) { return atomicDec(RAJA_AUTO_ATOMIC, acc); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, - T volatile *acc, + T *acc, T compare) { return atomicDec(RAJA_AUTO_ATOMIC, acc, compare); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T *acc, T value) { return atomicAnd(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T *acc, T value) { return atomicOr(RAJA_AUTO_ATOMIC, acc, value); } template -RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T volatile *acc, T value) +RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T *acc, T value) { return atomicXor(RAJA_AUTO_ATOMIC, acc, value); } template RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, - T volatile *acc, + T *acc, T value) { return atomicExchange(RAJA_AUTO_ATOMIC, acc, value); @@ -145,7 +155,7 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(auto_atomic, T volatile *acc, T compare, T value) +atomicCAS(auto_atomic, T *acc, T compare, T value) { return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value); } diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp index fa3f4841a1..34755fa49d 100644 --- a/include/RAJA/policy/atomic_builtin.hpp +++ b/include/RAJA/policy/atomic_builtin.hpp @@ -20,9 +20,16 @@ #include "RAJA/config.hpp" +#include + +#if defined(RAJA_COMPILER_MSVC) || (defined(_WIN32) && defined(__INTEL_COMPILER)) +#include +#endif + #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" + #if defined(RAJA_ENABLE_HIP) #define RAJA_DEVICE_HIP RAJA_HOST_DEVICE #else @@ -37,199 +44,667 @@ namespace RAJA struct builtin_atomic { }; -namespace detail -{ + +namespace detail { + #if defined(RAJA_COMPILER_MSVC) || (defined(_WIN32) && defined(__INTEL_COMPILER)) -RAJA_DEVICE_HIP -RAJA_INLINE unsigned builtin_atomic_CAS(unsigned volatile *acc, - unsigned compare, - unsigned value) + +/*! + * Type trait for determining if the operator should be implemented + * using an intrinsic + */ +template +struct builtin_useIntrinsic { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; + + +/*! 
+ * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support + */ +template +struct builtin_useReinterpret { + static constexpr bool value = + !builtin_useIntrinsic::value && + (sizeof(T) == 1 || + sizeof(T) == 2 || + sizeof(T) == 4 || + sizeof(T) == 8); + + using type = + std::conditional_t>>; +}; + + +/*! + * Type trait for determining if the operator should be implemented + * using a compare and swap loop + */ +template +struct builtin_useCAS { + static constexpr bool value = + !builtin_useIntrinsic::value && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); +}; + + +/*! + * Atomics implemented using intrinsics + */ + + +/*! + * Atomic or using intrinsics + */ +RAJA_INLINE char builtin_atomicOr(char *acc, char value) { + return _InterlockedOr8(acc, value); +} - long long_value = RAJA::util::reinterp_A_as_B(value); - long long_compare = RAJA::util::reinterp_A_as_B(compare); +RAJA_INLINE short builtin_atomicOr(short *acc, short value) +{ + return _InterlockedOr16(acc, value); +} - long old = _InterlockedCompareExchange((long *)acc, long_value, long_compare); +RAJA_INLINE long builtin_atomicOr(long *acc, long value) +{ + return _InterlockedOr(acc, value); +} - return RAJA::util::reinterp_A_as_B(old); +RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value) +{ + return _InterlockedOr64(acc, value); } -RAJA_DEVICE_HIP -RAJA_INLINE unsigned long long builtin_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) + +/*! + * Atomic load using atomic or + */ +template ::value, bool> = true> +RAJA_INLINE T builtin_atomicLoad(T *acc) { + return builtin_atomicOr(acc, static_cast(0)); +} - long long long_value = - RAJA::util::reinterp_A_as_B(value); - long long long_compare = - RAJA::util::reinterp_A_as_B(compare); - long long old = _InterlockedCompareExchange64((long long volatile *)acc, - long_value, - long_compare); +/*! + * Atomic exchange using intrinsics + */ +RAJA_INLINE char builtin_atomicExchange(char *acc, char value) +{ + return _InterlockedExchange8(acc, value); +} - return RAJA::util::reinterp_A_as_B(old); +RAJA_INLINE short builtin_atomicExchange(short *acc, short value) +{ + return _InterlockedExchange16(acc, value); } -#else // RAJA_COMPILER_MSVC +RAJA_INLINE long builtin_atomicExchange(long *acc, long value) +{ + return _InterlockedExchange(acc, value); +} -RAJA_DEVICE_HIP -RAJA_INLINE unsigned builtin_atomic_CAS(unsigned volatile *acc, - unsigned compare, - unsigned value) +RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value) { - __atomic_compare_exchange_n( - acc, &compare, value, false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED); - return compare; + return _InterlockedExchange64(acc, value); } -RAJA_DEVICE_HIP -RAJA_INLINE unsigned long long builtin_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) + +/*! + * Atomic store using atomic exchange + */ +template ::value, bool> = true> +RAJA_INLINE void builtin_atomicStore(T *acc, T value) { - __atomic_compare_exchange_n( - acc, &compare, value, false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED); - return compare; + builtin_atomicExchange(acc, value); } -#endif // RAJA_COMPILER_MSVC +/*! 
+ * Atomic compare and swap using intrinsics + */ +RAJA_INLINE char builtin_atomicCAS(char *acc, char compare, char value) +{ + return _InterlockedCompareExchange8(acc, value, compare); +} -template -RAJA_DEVICE_HIP RAJA_INLINE - typename std::enable_if::type - builtin_atomic_CAS(T volatile *acc, T compare, T value) +RAJA_INLINE short builtin_atomicCAS(short *acc, short compare, short value) { - return RAJA::util::reinterp_A_as_B( - builtin_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return _InterlockedCompareExchange16(acc, value, compare); } -template -RAJA_DEVICE_HIP RAJA_INLINE - typename std::enable_if::type - builtin_atomic_CAS(T volatile *acc, T compare, T value) +RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value) +{ + return _InterlockedCompareExchange(acc, value, compare); +} + +RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value) +{ + return _InterlockedCompareExchange64(acc, value, compare); +} + + +/*! + * Atomic addition using intrinsics + */ +RAJA_INLINE char builtin_atomicAdd(char *acc, char value) +{ + return _InterlockedExchangeAdd8(acc, value); +} + +RAJA_INLINE short builtin_atomicAdd(short *acc, short value) +{ + return _InterlockedExchangeAdd16(acc, value); +} + +RAJA_INLINE long builtin_atomicAdd(long *acc, long value) +{ + return _InterlockedExchangeAdd(acc, value); +} + +RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value) { - return RAJA::util::reinterp_A_as_B(builtin_atomic_CAS( - (unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return _InterlockedExchangeAdd64(acc, value); } -template -struct BuiltinAtomicCAS; -template -struct BuiltinAtomicCAS { - static_assert(!(BYTES == 4 || BYTES == 8), - "builtin atomic cas assumes 4 or 8 byte targets"); +/*! + * Atomic subtraction using intrinsics + */ +RAJA_INLINE char builtin_atomicSub(char *acc, char value) +{ + return _InterlockedExchangeAdd8(acc, -value); +} + +RAJA_INLINE short builtin_atomicSub(short *acc, short value) +{ + return _InterlockedExchangeAdd16(acc, -value); +} + +RAJA_INLINE long builtin_atomicSub(long *acc, long value) +{ + return _InterlockedExchangeAdd(acc, -value); +} + +RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value) +{ + return _InterlockedExchangeAdd64(acc, -value); +} + + +/*! + * Atomic and using intrinsics + */ +RAJA_INLINE char builtin_atomicAnd(char *acc, char value) +{ + return _InterlockedAnd8(acc, value); +} + +RAJA_INLINE short builtin_atomicAnd(short *acc, short value) +{ + return _InterlockedAnd16(acc, value); +} + +RAJA_INLINE long builtin_atomicAnd(long *acc, long value) +{ + return _InterlockedAnd(acc, value); +} + +RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value) +{ + return _InterlockedAnd64(acc, value); +} + + +/*! + * Atomic xor using intrinsics + */ +RAJA_INLINE char builtin_atomicXor(char *acc, char value) +{ + return _InterlockedXor8(acc, value); +} + +RAJA_INLINE short builtin_atomicXor(short *acc, short value) +{ + return _InterlockedXor16(acc, value); +} + +RAJA_INLINE long builtin_atomicXor(long *acc, long value) +{ + return _InterlockedXor(acc, value); +} + +RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value) +{ + return _InterlockedXor64(acc, value); +} + + +#else // RAJA_COMPILER_MSVC + + +/*! 
+ * Type trait for determining if the operator should be implemented + * using an intrinsic + */ +template +struct builtin_useIntrinsic { + static constexpr bool value = + (std::is_integral::value || std::is_enum::value) && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; -template <> -struct BuiltinAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing builtin unsigned 32-bit CAS operator. - * Returns the OLD value that was replaced by the result of this operation. - */ - template - RAJA_DEVICE_HIP RAJA_INLINE T operator()(T volatile *acc, - OPER const &oper, - ShortCircuit const &sc) const - { -#ifdef RAJA_COMPILER_MSVC -#pragma warning( disable : 4244 ) // Force msvc to not emit conversion warning +/*! + * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support + */ +template +struct builtin_useReinterpret { + static constexpr bool value = + !std::is_integral::value && + !std::is_enum::value && + ((sizeof(T) == 1 +#if !defined(UINT8_MAX) + && sizeof(unsigned char) == 1 #endif - unsigned oldval, newval, readback; - - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - - while ((readback = builtin_atomic_CAS((unsigned *)acc, oldval, newval)) != - oldval) { - if (sc(readback)) break; - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } -#ifdef RAJA_COMPILER_MSVC -#pragma warning( default : 4244 ) // Reenable warning + ) || + (sizeof(T) == 2 +#if !defined(UINT16_MAX) + && sizeof(unsigned short) == 2 #endif -}; - -template <> -struct BuiltinAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing builtin unsigned 64-bit CAS operator. - * Returns the OLD value that was replaced by the result of this operation. - */ - template - RAJA_DEVICE_HIP RAJA_INLINE T operator()(T volatile *acc, - OPER const &oper, - ShortCircuit const &sc) const - { -#ifdef RAJA_COMPILER_MSVC -#pragma warning( disable : 4244 ) // Force msvc to not emit conversion warning + ) || + (sizeof(T) == 4 +#if !defined(UINT32_MAX) + && sizeof(unsigned int) == 4 #endif - unsigned long long oldval, newval, readback; - - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - - while ((readback = builtin_atomic_CAS((unsigned long long *)acc, - oldval, - newval)) != oldval) { - if (sc(readback)) break; - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } + ) || + (sizeof(T) == 8 +#if !defined(UINT64_MAX) + && sizeof(unsigned long long) == 8 +#endif + )); -#ifdef RAJA_COMPILER_MSVC -#pragma warning( default : 4244 ) // Reenable warning + using type = + std::conditional_t>>; +#else + unsigned long long>>>; +#endif +}; + +/*! + * Type trait for determining if the operator should be implemented + * using a compare and swap loop + */ +template +struct builtin_useCAS { + static constexpr bool value = + !std::is_integral::value && !std::is_enum::value && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); }; /*! 
- * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the builtin unsigned 32-bit and 64-bit CAS operators. + * Atomics implemented using intrinsics + */ + + +/*! + * Atomic load using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) +{ + return __atomic_load_n(acc, __ATOMIC_RELAXED); +} + + +/*! + * Atomic store using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) +{ + __atomic_store_n(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic exchange using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) +{ + return __atomic_exchange_n(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic compare and swap using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) +{ + __atomic_compare_exchange_n( + acc, &compare, value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return compare; +} + + +/*! + * Atomic addition using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) +{ + return __atomic_fetch_add(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic subtraction using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) +{ + return __atomic_fetch_sub(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic and using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +{ + return __atomic_fetch_and(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic or using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +{ + return __atomic_fetch_or(acc, value, __ATOMIC_RELAXED); +} + + +/*! + * Atomic xor using intrinsic + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) +{ + return __atomic_fetch_xor(acc, value, __ATOMIC_RELAXED); +} + + +#endif // RAJA_COMPILER_MSVC + + +/*! + * Atomics implemented using reinterpret cast + */ + + +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using builtin_useReinterpret_t = typename builtin_useReinterpret::type; + + +/*! + * Atomic load using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicLoad(reinterpret_cast(acc))); +} + + +/*! + * Atomic store using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value) +{ + using R = builtin_useReinterpret_t; + + builtin_atomicStore(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value)); +} + + +/*! + * Atomic exchange using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); +} + + +/*! 
+ * Atomic compare and swap using reinterpret cast + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value) +{ + using R = builtin_useReinterpret_t; + + return RAJA::util::reinterp_A_as_B( + builtin_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); +} + + +/*! + * Implementation of compare and swap loop + */ + + +/*! + * Equality comparison for compare and swap loop using types supported by + * intrinsics. + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b) +{ + return a == b; +} + + +/*! + * Equality comparison for compare and swap loop using reinterpret cast. + * Converts to the underlying integral type to avoid cases where the values + * will never compare equal (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b) +{ + using R = builtin_useReinterpret_t; + + return builtin_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); +} + + +/*! + * Generic impementation of any atomic 8, 16, 32, or 64 bit operator + * that can be implemented using a builtin compare and swap primitive. * Returns the OLD value that was replaced by the result of this operation. */ -template -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper(T volatile *acc, - OPER &&oper) +template +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, + Oper &&oper) +{ + T old = builtin_atomicLoad(acc); + T expected; + + do { + expected = old; + old = builtin_atomicCAS(acc, expected, oper(expected)); + } while (!builtin_atomicCAS_equal(old, expected)); + + return old; +} + + +/*! + * Generic impementation of any atomic 8, 16, 32, or 64 bit operator + * that can be implemented using a builtin compare and swap primitive. + * Uses short-circuiting for improved efficiency. Returns the OLD value + * that was replaced by the result of this operation. + */ +template +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc, + Oper &&oper, + ShortCircuit &&sc) +{ + T old = builtin_atomicLoad(acc); + + if (sc(old)) { + return old; + } + + T expected; + + do { + expected = old; + old = builtin_atomicCAS(acc, expected, oper(expected)); + } while (!builtin_atomicCAS_equal(old, expected) && !sc(old)); + + return old; +} + + +/*! + * Atomics implemented using compare and swap loop + */ + + +/*! + * Atomic addition using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value) { - BuiltinAtomicCAS cas; - return cas(acc, std::forward(oper), [](T const &) { return false; }); + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old + value; + }); } -template -RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper_sc(T volatile *acc, - OPER &&oper, - ShortCircuit const &sc) + +/*! + * Atomic subtraction using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value) { - BuiltinAtomicCAS cas; - return cas(acc, std::forward(oper), sc); + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old - value; + }); +} + + +/*! + * Atomic and using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old & value; + }); +} + + +/*! 
+ * Atomic or using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old | value; + }); +} + + +/*! + * Atomic xor using compare and swap loop + */ +template ::value, bool> = true> +RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value) +{ + return builtin_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } @@ -237,125 +712,115 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomic_CAS_oper_sc(T volatile *acc, template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a + value; }); + return detail::builtin_atomicLoad(acc); } +template +RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T *acc, T value) +{ + detail::builtin_atomicStore(acc, value); +} template -RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a - value; }); + return detail::builtin_atomicAdd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T *acc, T value) { - if (*acc < value) { - return *acc; - } - return detail::builtin_atomic_CAS_oper_sc(acc, - [=](T a) { - return a < value ? a : value; - }, - [=](T current) { - return current < value; - }); + return detail::builtin_atomicSub(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value) { - if (*acc > value) { - return *acc; - } - return detail::builtin_atomic_CAS_oper_sc(acc, - [=](T a) { - return a > value ? a : value; - }, - [=](T current) { - return current > value; - }); + return detail::builtin_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); +} + +template +RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value) +{ + return detail::builtin_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? value : old; + }, + [value] (T current) { + return value <= current; + }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T volatile *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a + 1; }); + return detail::builtin_atomicAdd(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T volatile *acc, T val) +RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T old) { - return ((old >= val) ? 0 : (old + 1)); + return detail::builtin_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? 
static_cast(0) : old + static_cast(1); }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T volatile *acc) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a - 1; }); + return detail::builtin_atomicSub(acc, static_cast(1)); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T volatile *acc, T val) +RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T old) { - return (((old == 0) | (old > val)) ? val : (old - 1)); + return detail::builtin_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a & value; }); + return detail::builtin_atomicAnd(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T volatile *acc, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a | value; }); + return detail::builtin_atomicOr(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T a) { return a ^ value; }); + return detail::builtin_atomicXor(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, - T volatile *acc, - T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value) { - return detail::builtin_atomic_CAS_oper(acc, [=](T) { return value; }); + return detail::builtin_atomicExchange(acc, value); } template -RAJA_DEVICE_HIP RAJA_INLINE T -atomicCAS(builtin_atomic, T volatile *acc, T compare, T value) +RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value) { - return detail::builtin_atomic_CAS(acc, compare, value); + return detail::builtin_atomicCAS(acc, compare, value); } } // namespace RAJA -// make sure this define doesn't bleed out of this header -#undef RAJA_AUTO_ATOMIC #endif diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp index c561122349..e9d5bc454f 100644 --- a/include/RAJA/policy/cuda.hpp +++ b/include/RAJA/policy/cuda.hpp @@ -34,6 +34,7 @@ #include "RAJA/policy/cuda/forall.hpp" #include "RAJA/policy/cuda/policy.hpp" #include "RAJA/policy/cuda/reduce.hpp" +#include "RAJA/policy/cuda/multi_reduce.hpp" #include "RAJA/policy/cuda/scan.hpp" #include "RAJA/policy/cuda/sort.hpp" #include "RAJA/policy/cuda/kernel.hpp" diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 43d927acab..88a89d5362 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -49,6 +49,26 @@ namespace RAJA namespace cuda { +//! Get the properties of the current device +RAJA_INLINE +cudaDeviceProp get_device_prop() +{ + int device; + cudaErrchk(cudaGetDevice(&device)); + cudaDeviceProp prop; + cudaErrchk(cudaGetDeviceProperties(&prop, device)); + return prop; +} + +//! Get a reference to a static cached copy of the current device properties. +// This caches a copy on first use to speedup later calls. 
+RAJA_INLINE +cudaDeviceProp& device_prop() +{ + static thread_local cudaDeviceProp prop = get_device_prop(); + return prop; +} + //! Allocator for pinned memory for use in basic_mempool struct PinnedAllocator { @@ -146,36 +166,22 @@ namespace detail //! struct containing data necessary to coordinate kernel launches with reducers struct cudaInfo { + const void* func = nullptr; cuda_dim_t gridDim{0, 0, 0}; cuda_dim_t blockDim{0, 0, 0}; + size_t* dynamic_smem = nullptr; ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)}; bool setup_reducers = false; +}; +struct cudaStatusInfo : cudaInfo { #if defined(RAJA_ENABLE_OPENMP) - cudaInfo* thread_states = nullptr; omp::mutex lock; #endif }; -//! class that changes a value on construction then resets it at destruction -template -class SetterResetter -{ -public: - SetterResetter(T& val, T new_val) : m_val(val), m_old_val(val) - { - m_val = new_val; - } - SetterResetter(const SetterResetter&) = delete; - ~SetterResetter() { m_val = m_old_val; } - -private: - T& m_val; - T m_old_val; -}; - -extern cudaInfo g_status; +extern cudaStatusInfo g_status; -extern cudaInfo tl_status; +extern cudaStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif @@ -275,54 +281,94 @@ bool setupReducers() { return detail::tl_status.setup_reducers; } RAJA_INLINE cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; } +//! get grid size of current launch +RAJA_INLINE +cuda_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x * + detail::tl_status.gridDim.y * + detail::tl_status.gridDim.z; } + //! get blockDim of current launch RAJA_INLINE cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; } +//! get block size of current launch +RAJA_INLINE +cuda_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x * + detail::tl_status.blockDim.y * + detail::tl_status.blockDim.z; } + +//! get dynamic shared memory usage for current launch +RAJA_INLINE +size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; } + +//! get maximum dynamic shared memory for current launch +RAJA_INLINE +size_t maxDynamicShmem() +{ + cudaFuncAttributes func_attr; + cudaErrchk(cudaFuncGetAttributes(&func_attr, detail::tl_status.func)); + return func_attr.maxDynamicSharedSizeBytes; +} + +constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits::max(); + +//! Allocate dynamic shared memory for current launch +// +// The first argument is a functional object that takes the maximum number of +// objects that can fit into the dynamic shared memory available and returns +// the number of objects to allocate. +// The second argument is the required alignment. +// +// Returns an offset into dynamic shared memory aligned to align on success, +// or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory +// takes the failure return path. +template < typename T, typename GetNFromMax > +RAJA_INLINE +size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T)) +{ + const size_t unaligned_shmem = *detail::tl_status.dynamic_smem; + const size_t align_offset = ((unaligned_shmem % align) != size_t(0)) + ? 
align - (unaligned_shmem % align) + : size_t(0); + const size_t aligned_shmem = unaligned_shmem + align_offset; + + const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem; + const size_t n_bytes = sizeof(T) * + std::forward(get_n_from_max)(max_shmem_bytes / sizeof(T)); + + if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) { + *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes; + return aligned_shmem; + } else { + return dynamic_smem_allocation_failure; + } +} + //! get resource for current launch RAJA_INLINE ::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; } //! create copy of loop_body that is setup for device execution +// +// Note: This is done to setup the Reducer and MultiReducer objects through +// their copy constructors. Both look at tl_status to setup per kernel launch +// resources. template RAJA_INLINE typename std::remove_reference::type make_launch_body( + const void* func, cuda_dim_t gridDim, cuda_dim_t blockDim, - size_t RAJA_UNUSED_ARG(dynamic_smem), + size_t& dynamic_smem, ::RAJA::resources::Cuda res, LOOP_BODY&& loop_body) { - detail::SetterResetter setup_reducers_srer( - detail::tl_status.setup_reducers, true); - detail::SetterResetter<::RAJA::resources::Cuda> res_srer( - detail::tl_status.res, res); - - detail::tl_status.gridDim = gridDim; - detail::tl_status.blockDim = blockDim; + ::RAJA::detail::ScopedAssignment info_sa(detail::tl_status, + detail::cudaInfo{func, gridDim, blockDim, &dynamic_smem, res, true}); using return_type = typename std::remove_reference::type; return return_type(std::forward(loop_body)); } -//! Get the properties of the current device -RAJA_INLINE -cudaDeviceProp get_device_prop() -{ - int device; - cudaErrchk(cudaGetDevice(&device)); - cudaDeviceProp prop; - cudaErrchk(cudaGetDeviceProperties(&prop, device)); - return prop; -} - -//! 
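// Illustrative sketch, not part of the patch: the round-up-to-alignment
// arithmetic that allocateDynamicShmem performs above, as a standalone helper
// with worked values. align_up is a hypothetical name.
#include <cstddef>

constexpr std::size_t align_up(std::size_t offset, std::size_t align)
{
  // pad to the next multiple of align unless offset is already one
  return (offset % align != std::size_t(0))
             ? offset + (align - offset % align)
             : offset;
}

static_assert(align_up(0, 8) == 0, "already aligned");
static_assert(align_up(12, 8) == 16, "12 bytes in use -> next 8-byte boundary");
static_assert(align_up(16, 8) == 16, "exact multiples are unchanged");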
Get a copy of the device properties, this copy is cached on first use to speedup later calls -RAJA_INLINE -cudaDeviceProp& device_prop() -{ - static thread_local cudaDeviceProp prop = get_device_prop(); - return prop; -} - static constexpr int cuda_occupancy_uninitialized_int = -1; static constexpr size_t cuda_occupancy_uninitialized_size_t = diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp index c4aabd012f..41fe17c84a 100644 --- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp @@ -292,7 +292,7 @@ struct WorkRunner< // // TODO: Privatize the loop_body, using make_launch_body to setup reductions // - // LOOP_BODY body = RAJA::cuda::make_launch_body( + // LOOP_BODY body = RAJA::cuda::make_launch_body(func, // gridSize, blockSize, shmem, stream, std::forward(loop_body)); storage.template emplace( diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp index 573618fc25..aedfe91a03 100644 --- a/include/RAJA/policy/cuda/atomic.hpp +++ b/include/RAJA/policy/cuda/atomic.hpp @@ -25,17 +25,31 @@ #include #include +#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6 +#define RAJA_ENABLE_CUDA_ATOMIC_REF +#endif + +#if defined(RAJA_ENABLE_CUDA_ATOMIC_REF) +#include +#endif + +#include "camp/list.hpp" + #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" #if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif +#include "RAJA/util/EnableIf.hpp" #include "RAJA/util/Operators.hpp" #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" +// TODO: When we can use if constexpr in C++17, this file can be cleaned up + + namespace RAJA { @@ -43,596 +57,602 @@ namespace RAJA namespace detail { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 350) // baseline CUDA_ARCH sm_35 check -#warning CUDA_ARCH is set too low in nvcc. Should set nvcc -arch=sm_35 or greater. COMPILING WITH DEFAULT atomicCAS! -#endif -// All CUDA atomic functions are checked for individual arch versions. -// Most >= 200 checks can be deemed as >= 110 (except CAS 64-bit, Add 32-bit float, and Add 64-bit ULL), but using 200 for shared memory support. -// If using < 350, certain atomics will be implemented with atomicCAS. +/*! + * Type trait for determining if atomic operators should be implemented + * using builtin functions. This type trait can be used for a lot of atomic + * operators. More specific type traits are added when needed, such as + * cuda_useBuiltinExchange below. + */ +template +struct cuda_useBuiltinCommon { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; + -#if __CUDA_ARCH__ >= 200 /*! - * Generic impementation of atomic 32-bit or 64-bit compare and swap primitive. - * Implementation uses the existing CUDA supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the value that was stored before this operation. + * Type trait for determining if atomic operators should be implemented + * by reinterpreting inputs to types that the builtin functions support. + * This type trait can be used for a lot of atomic operators. More specific + * type traits are added when needed, such as cuda_useReinterpretExchange + * below. 
*/ -RAJA_INLINE __device__ unsigned cuda_atomic_CAS( - unsigned volatile *acc, - unsigned compare, - unsigned value) -{ - return ::atomicCAS((unsigned *)acc, compare, value); -} -/// -RAJA_INLINE __device__ unsigned long long cuda_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) -{ - return ::atomicCAS((unsigned long long *)acc, compare, value); -} -/// template -RAJA_INLINE __device__ -typename std::enable_if::type -cuda_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - cuda_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} -/// +struct cuda_useReinterpretCommon { + static constexpr bool value = + !cuda_useBuiltinCommon::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; +}; + + +/*! + * Alias for determining the integral type of the same size as the given type + */ template -RAJA_INLINE __device__ -typename std::enable_if::type -cuda_atomic_CAS(T volatile *acc, T compare, T value) +using cuda_useReinterpretCommon_t = typename cuda_useReinterpretCommon::type; + + +/*! + * Performs an atomic bitwise or using a builtin function. Stores the new value + * in the given address and returns the old value. + * + * This overload using builtin functions is used to implement atomic loads + * under some build configurations. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value) { - return RAJA::util::reinterp_A_as_B( - cuda_atomic_CAS((unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); + return ::atomicOr(acc, value); } -template -struct CudaAtomicCAS { -}; +/*! + * Atomic exchange + */ -template <> -struct CudaAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing CUDA supplied unsigned 32-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 32-bit T - unsigned oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ((readback = cuda_atomic_CAS((unsigned volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if the exchange operator should be implemented + * using a builtin + */ +template +struct cuda_useBuiltinExchange { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; }; -template <> -struct CudaAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing CUDA supplied unsigned 64-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. 
- */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 64-bit T - unsigned long long oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ( - (readback = cuda_atomic_CAS((unsigned long long volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if the exchange operator should be implemented + * by reinterpreting inputs to types that the builtin exchange supports + */ +template +struct cuda_useReinterpretExchange { + static constexpr bool value = + !cuda_useBuiltinExchange::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; }; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange::type; /*! - * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the existing CUDA supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the OLD value that was replaced by the result of this operation. + * Performs an atomic exchange using a builtin function. Stores the new value + * in the given address and returns the old value. */ -template -RAJA_INLINE __device__ T cuda_atomic_CAS_oper(T volatile *acc, OPER &&oper) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value) { - CudaAtomicCAS cas; - return cas(acc, std::forward(oper)); + return ::atomicExch(acc, value); } -#endif // end CAS >= 200 -#if __CUDA_ARCH__ >= 200 /*! - * Catch-all policy passes off to CUDA's builtin atomics. - * - * This catch-all will only work for types supported by the compiler. - * Specialization below can adapt for some unsupported types. - * - * These are atomic in cuda device code and non-atomic otherwise + * Performs an atomic exchange using a reinterpret cast. Stores the new value + * in the given address and returns the old value. */ -template -RAJA_INLINE __device__ T cuda_atomicAdd(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a + value; - }); -} + using R = cuda_useReinterpretExchange_t; -// 32-bit signed atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicAdd(int volatile *acc, - int value) -{ - return ::atomicAdd((int *)acc, value); + return RAJA::util::reinterp_A_as_B( + cuda_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); } -// 32-bit unsigned atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicAdd(unsigned volatile *acc, - unsigned value) -{ - return ::atomicAdd((unsigned *)acc, value); -} +/*! 
+ * Atomic load and store + */ +#if defined(RAJA_ENABLE_CUDA_ATOMIC_REF) -// 64-bit unsigned atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicAdd( - unsigned long long volatile *acc, - unsigned long long value) +template +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return ::atomicAdd((unsigned long long *)acc, value); + return cuda::atomic_ref(*acc).load( + cuda::memory_order_relaxed{}); } -// 32-bit float atomicAdd support by CUDA -template <> -RAJA_INLINE __device__ float cuda_atomicAdd(float volatile *acc, - float value) +template +RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value) { - return ::atomicAdd((float *)acc, value); + cuda::atomic_ref(*acc).store( + value, cuda::memory_order_relaxed{}); } -#endif +#else -// 64-bit double atomicAdd support added for sm_60 -#if __CUDA_ARCH__ >= 600 -template <> -RAJA_INLINE __device__ double cuda_atomicAdd(double volatile *acc, - double value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return ::atomicAdd((double *)acc, value); + return cuda_atomicOr(acc, static_cast(0)); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicLoad(T *acc) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a - value; - }); -} + using R = cuda_useReinterpretCommon_t; -// 32-bit signed atomicSub support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicSub(int volatile *acc, - int value) -{ - return ::atomicSub((int *)acc, value); + return RAJA::util::reinterp_A_as_B( + cuda_atomicLoad(reinterpret_cast(acc))); } - -// 32-bit unsigned atomicSub support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicSub(unsigned volatile *acc, - unsigned value) +template +RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value) { - return ::atomicSub((unsigned *)acc, value); + cuda_atomicExchange(acc, value); } + #endif -#if __CUDA_ARCH__ >= 200 + +/*! + * Atomic compare and swap + */ + +/*! + * Type trait for determining if the compare and swap operator should be + * implemented using a builtin + */ template -RAJA_INLINE __device__ T cuda_atomicMin(T volatile *acc, T value) -{ - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return value < a ? value : a; - }); -} +struct cuda_useBuiltinCAS { + static constexpr bool value = +#if __CUDA_ARCH__ >= 700 + std::is_same::value || +#endif + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; -// 32-bit signed atomicMin support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicMin(int volatile *acc, - int value) -{ - return ::atomicMin((int *)acc, value); -} +/*! + * Type trait for determining if the compare and swap operator should be + * implemented by reinterpreting inputs to types that the builtin compare + * and swap supports + */ +template +struct cuda_useReinterpretCAS { + static constexpr bool value = + !cuda_useBuiltinCAS::value && + ( +#if __CUDA_ARCH__ >= 700 + sizeof(T) == sizeof(unsigned short) || +#endif + sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long) + ); + + using type = +#if __CUDA_ARCH__ >= 700 + std::conditional_t +#if __CUDA_ARCH__ >= 700 + > +#endif + ; +}; +/*! 
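// Illustrative sketch, not part of the patch: the fallback load/store idiom
// used above when cuda::atomic_ref is not available. A relaxed load is an
// atomicOr with an identity operand, and a relaxed store is an atomicExch
// whose result is discarded. Shown for unsigned int only; the function names
// are hypothetical.
__device__ unsigned int relaxed_load(unsigned int* acc)
{
  // or-ing with 0 is a read-modify-write that leaves *acc unchanged and
  // returns the value that was read atomically
  return atomicOr(acc, 0u);
}

__device__ void relaxed_store(unsigned int* acc, unsigned int value)
{
  // the returned old value is not needed for a plain store
  (void)atomicExch(acc, value);
}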
+ * Alias for determining the integral type of the same size as the given type + */ +template +using cuda_useReinterpretCAS_t = typename cuda_useReinterpretCAS::type; -// 32-bit unsigned atomicMin support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicMin(unsigned volatile *acc, - unsigned value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value) { - return ::atomicMin((unsigned *)acc, value); + return ::atomicCAS(acc, compare, value); } -#endif -// 64-bit unsigned atomicMin support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicMin( - unsigned long long volatile *acc, - unsigned long long value) +template ::value, bool> = true> +RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value) { - return ::atomicMin((unsigned long long *)acc, value); + using R = cuda_useReinterpretCAS_t; + + return RAJA::util::reinterp_A_as_B( + cuda_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicMax(T volatile *acc, T value) +/*! + * Equality comparison for compare and swap loop. Converts to the underlying + * integral type to avoid cases where the values will never compare equal + * (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return value > a ? value : a; - }); + return a == b; } -// 32-bit signed atomicMax support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicMax(int volatile *acc, - int value) +template ::value, bool> = true> +RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b) { - return ::atomicMax((int *)acc, value); + using R = cuda_useReinterpretCommon_t; + + return cuda_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); } -// 32-bit unsigned atomicMax support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicMax(unsigned volatile *acc, - unsigned value) +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator. + * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc, + Oper&& oper) { - return ::atomicMax((unsigned *)acc, value); -} -#endif + T old = cuda_atomicLoad(acc); + T expected; -// 64-bit unsigned atomicMax support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicMax( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicMax((unsigned long long *)acc, value); -} -#endif + do { + expected = old; + old = cuda_atomicCAS(acc, expected, oper(expected)); + } while (!cuda_atomicCAS_equal(old, expected)); -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicInc(T volatile *acc, T val) -{ - // See: - // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return cuda_atomic_CAS_oper(acc, [=] __device__(T old) { - return ((old >= val) ? 0 : (old + 1)); - }); + return old; } -// 32-bit unsigned atomicInc support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicInc(unsigned volatile *acc, - unsigned value) +/*! 
+ * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting. + * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc, + Oper&& oper, + ShortCircuit&& sc) { - return ::atomicInc((unsigned *)acc, value); -} + T old = cuda_atomicLoad(acc); -template -RAJA_INLINE __device__ T cuda_atomicInc(T volatile *acc) -{ - return cuda_atomic_CAS_oper(acc, - [=] __device__(T a) { return a + 1; }); -} + if (sc(old)) { + return old; + } -// 32-bit signed atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ int cuda_atomicInc(int volatile *acc) -{ - return ::atomicAdd((int *)acc, (int)1); -} + T expected; -// 32-bit unsigned atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ unsigned cuda_atomicInc(unsigned volatile *acc) -{ - return ::atomicAdd((unsigned *)acc, (unsigned)1); -} + do { + expected = old; + old = cuda_atomicCAS(acc, expected, oper(expected)); + } while (!cuda_atomicCAS_equal(old, expected) && !sc(old)); -// 64-bit unsigned atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicInc( - unsigned long long volatile *acc) -{ - return ::atomicAdd((unsigned long long *)acc, (unsigned long long)1); + return old; } -// 32-bit float atomicAdd support by CUDA, used as backend for atomicInc -template <> -RAJA_INLINE __device__ float cuda_atomicInc(float volatile *acc) -{ - return ::atomicAdd((float *)acc, (float)1); -} -#endif -// 64-bit double atomicAdd support added for sm_60, used as backend for atomicInc +/*! + * Atomic addition + */ +using cuda_atomicAdd_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long int, + float #if __CUDA_ARCH__ >= 600 -template <> -RAJA_INLINE __device__ double cuda_atomicInc(double volatile *acc) -{ - return ::atomicAdd((double *)acc, (double)1); -} + , + double #endif +>; - -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicDec(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value) { - // See: - // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return cuda_atomic_CAS_oper(acc, [=] __device__(T old) { - return (((old == 0) | (old > val)) ? val : (old - 1)); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old + value; }); } -// 32-bit unsigned atomicDec support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicDec(unsigned volatile *acc, - unsigned value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value) { - return ::atomicDec((unsigned *)acc, value); + return ::atomicAdd(acc, value); } -template -RAJA_INLINE __device__ T cuda_atomicDec(T volatile *acc) -{ - return cuda_atomic_CAS_oper(acc, - [=] __device__(T a) { return a - 1; }); -} -// 32-bit signed atomicSub support by CUDA, used as backend for atomicDec -template <> -RAJA_INLINE __device__ int cuda_atomicDec(int volatile *acc) -{ - return ::atomicSub((int *)acc, (int)1); -} +/*! 
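// Illustrative sketch, not part of the patch: the short-circuiting form of the
// CAS loop above, applied to a minimum. When the stored value is already <=
// the candidate there is nothing to write, so no compare-and-swap is issued.
// relaxed_min is a hypothetical name; unsigned long long is used so the plain
// CUDA ::atomicCAS and ::atomicOr overloads apply.
__device__ unsigned long long relaxed_min(unsigned long long* acc,
                                          unsigned long long value)
{
  unsigned long long old = atomicOr(acc, 0ull);  // relaxed load, as above

  while (!(old <= value)) {                      // short-circuit test
    unsigned long long assumed = old;
    old = atomicCAS(acc, assumed, value);        // value < old here, so min == value
    if (old == assumed) { break; }               // our CAS took effect
  }
  return old;  // last observed old value, as in the loops above
}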
+ * Atomic subtract + */ +using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types; -// 32-bit unsigned atomicSub support by CUDA, used as backend for atomicDec -template <> -RAJA_INLINE __device__ unsigned cuda_atomicDec(unsigned volatile *acc) -{ - return ::atomicSub((unsigned *)acc, (unsigned)1); -} -#endif +using cuda_atomicSub_via_Sub_builtin_types = ::camp::list< + int, + unsigned int +>; +using cuda_atomicSub_via_Add_builtin_types = ::camp::list< + unsigned long long int, + float +#if __CUDA_ARCH__ >= 600 + , + double +#endif +>; -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicAnd(T volatile *acc, T value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a & value; + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old - value; }); } -// 32-bit signed atomicAnd support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicAnd(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return ::atomicAnd((int *)acc, value); + return ::atomicSub(acc, value); } - -// 32-bit unsigned atomicAnd support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicAnd(unsigned volatile *acc, - unsigned value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value) { - return ::atomicAnd((unsigned *)acc, value); + return ::atomicAdd(acc, -value); } -#endif -// 64-bit unsigned atomicAnd support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicAnd( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicAnd((unsigned long long *)acc, value); -} + +/*! + * Atomic min/max + */ +using cuda_atomicMinMax_builtin_types = ::camp::list< + int, + unsigned int +#if __CUDA_ARCH__ >= 500 + , + long long int, + unsigned long long int #endif +>; -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicOr(T volatile *acc, T value) + +/*! + * Atomic min + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a | value; - }); + return cuda_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); } -// 32-bit signed atomicOr support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicOr(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value) { - return ::atomicOr((int *)acc, value); + return ::atomicMin(acc, value); } -// 32-bit unsigned atomicOr support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicOr(unsigned volatile *acc, - unsigned value) +/*! + * Atomic max + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value) { - return ::atomicOr((unsigned *)acc, value); + return cuda_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? 
value : old; + }, + [value] (T current) { + return value <= current; + }); } -#endif -// 64-bit unsigned atomicOr support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicOr( - unsigned long long volatile *acc, - unsigned long long value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value) { - return ::atomicOr((unsigned long long *)acc, value); + return ::atomicMax(acc, value); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicXor(T volatile *acc, T value) + +/*! + * Atomic increment/decrement with reset + */ +using cuda_atomicIncDecReset_builtin_types = ::camp::list< + unsigned int +>; + + +/*! + * Atomic increment with reset + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T a) { - return a ^ value; + // See: + // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc + return cuda_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? static_cast(0) : old + static_cast(1); }); } -// 32-bit signed atomicXor support by CUDA -template <> -RAJA_INLINE __device__ int cuda_atomicXor(int volatile *acc, - int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value) { - return ::atomicXor((int *)acc, value); + return ::atomicInc(acc, value); } -// 32-bit unsigned atomicXor support by CUDA -template <> -RAJA_INLINE __device__ unsigned cuda_atomicXor(unsigned volatile *acc, - unsigned value) +/*! + * Atomic increment (implemented in terms of atomic addition) + */ +template +RAJA_INLINE __device__ T cuda_atomicInc(T *acc) { - return ::atomicXor((unsigned *)acc, value); + return cuda_atomicAdd(acc, static_cast(1)); } -#endif -// 64-bit unsigned atomicXor support by CUDA sm_35 and later -#if __CUDA_ARCH__ >= 350 -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicXor( - unsigned long long volatile *acc, - unsigned long long value) -{ - return ::atomicXor((unsigned long long *)acc, value); -} -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicExchange(T volatile *acc, T value) +/*! + * Atomic decrement with reset + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value) { - return cuda_atomic_CAS_oper(acc, [=] __device__(T) { - return value; + // See: + // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } -template <> -RAJA_INLINE __device__ int cuda_atomicExchange( - int volatile *acc, int value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value) { - return ::atomicExch((int *)acc, value); + return ::atomicDec(acc, value); } -template <> -RAJA_INLINE __device__ unsigned cuda_atomicExchange( - unsigned volatile *acc, unsigned value) + +/*! + * Atomic decrement (implemented in terms of atomic subtraction) + */ +template +RAJA_INLINE __device__ T cuda_atomicDec(T *acc) { - return ::atomicExch((unsigned *)acc, value); + return cuda_atomicSub(acc, static_cast(1)); } -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicExchange( - unsigned long long volatile *acc, - unsigned long long value) + +/*! + * Atomic bitwise functions (and, or, xor) + */ +using cuda_atomicBit_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long int +>; + + +/*! 
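// Illustrative sketch, not part of the patch: the wrap-around semantics of
// atomicInc/atomicDec with a reset value, emulated on the host so they can be
// spelled out. With reset == 3, an incremented counter cycles 0, 1, 2, 3, 0.
// The helper names are hypothetical; the bodies match the lambdas above.
constexpr unsigned int inc_with_reset(unsigned int old, unsigned int reset)
{
  return (reset <= old) ? 0u : old + 1u;
}

constexpr unsigned int dec_with_reset(unsigned int old, unsigned int reset)
{
  return (old == 0u || reset < old) ? reset : old - 1u;
}

static_assert(inc_with_reset(2u, 3u) == 3u, "still below the reset value");
static_assert(inc_with_reset(3u, 3u) == 0u, "reaching the reset value wraps to 0");
static_assert(dec_with_reset(0u, 3u) == 3u, "0 wraps back to the reset value");
static_assert(dec_with_reset(5u, 3u) == 3u, "values above the reset are clamped");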
+ * Atomic and + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value) { - return ::atomicExch((unsigned long long *)acc, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old & value; + }); } -template <> -RAJA_INLINE __device__ float cuda_atomicExchange( - float volatile *acc, float value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value) { - return ::atomicExch((float *)acc, value); + return ::atomicAnd(acc, value); } -#endif -#if __CUDA_ARCH__ >= 200 -template -RAJA_INLINE __device__ T cuda_atomicCAS(T volatile *acc, T compare, T value) +/*! + * Atomic or + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value) { - return cuda_atomic_CAS(acc, compare, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old | value; + }); } -template <> -RAJA_INLINE __device__ int cuda_atomicCAS( - int volatile *acc, int compare, int value) -{ - return ::atomicCAS((int *)acc, compare, value); -} +/*! + * Atomic or via builtin functions was implemented much earlier since atomicLoad + * may depend on it. + */ + -template <> -RAJA_INLINE __device__ unsigned cuda_atomicCAS( - unsigned volatile *acc, unsigned compare, unsigned value) +/*! + * Atomic xor + */ +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value) { - return ::atomicCAS((unsigned *)acc, compare, value); + return cuda_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } -template <> -RAJA_INLINE __device__ unsigned long long cuda_atomicCAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) +template * = nullptr> +RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value) { - return ::atomicCAS((unsigned long long *)acc, compare, value); + return ::atomicXor(acc, value); } -#endif + } // namespace detail @@ -648,7 +668,31 @@ RAJA_INLINE __device__ unsigned long long cuda_atomicCAS( RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAdd(cuda_atomic_explicit, T volatile *acc, T value) +atomicLoad(cuda_atomic_explicit, T *acc) +{ +#ifdef __CUDA_ARCH__ + return detail::cuda_atomicLoad(acc); +#else + return RAJA::atomicLoad(host_policy{}, acc); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void +atomicStore(cuda_atomic_explicit, T *acc, T value) +{ +#ifdef __CUDA_ARCH__ + detail::cuda_atomicStore(acc, value); +#else + RAJA::atomicStore(host_policy{}, acc, value); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicAdd(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicAdd(acc, value); @@ -660,7 +704,7 @@ atomicAdd(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicSub(cuda_atomic_explicit, T volatile *acc, T value) +atomicSub(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicSub(acc, value); @@ -672,7 +716,7 @@ atomicSub(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMin(cuda_atomic_explicit, T volatile *acc, T value) +atomicMin(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicMin(acc, value); @@ -684,7 +728,7 @@ atomicMin(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMax(cuda_atomic_explicit, T volatile *acc, T value) +atomicMax(cuda_atomic_explicit, 
T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicMax(acc, value); @@ -696,21 +740,21 @@ atomicMax(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(cuda_atomic_explicit, T volatile *acc, T val) +atomicInc(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return detail::cuda_atomicInc(acc, val); + return detail::cuda_atomicInc(acc, value); #else - return RAJA::atomicInc(host_policy{}, acc, val); + return RAJA::atomicInc(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(cuda_atomic_explicit, T volatile *acc) +atomicInc(cuda_atomic_explicit, T *acc) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicInc(acc); @@ -722,21 +766,21 @@ atomicInc(cuda_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(cuda_atomic_explicit, T volatile *acc, T val) +atomicDec(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return detail::cuda_atomicDec(acc, val); + return detail::cuda_atomicDec(acc, value); #else - return RAJA::atomicDec(host_policy{}, acc, val); + return RAJA::atomicDec(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(cuda_atomic_explicit, T volatile *acc) +atomicDec(cuda_atomic_explicit, T *acc) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicDec(acc); @@ -748,7 +792,7 @@ atomicDec(cuda_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAnd(cuda_atomic_explicit, T volatile *acc, T value) +atomicAnd(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicAnd(acc, value); @@ -760,7 +804,7 @@ atomicAnd(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicOr(cuda_atomic_explicit, T volatile *acc, T value) +atomicOr(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicOr(acc, value); @@ -772,7 +816,7 @@ atomicOr(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicXor(cuda_atomic_explicit, T volatile *acc, T value) +atomicXor(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicXor(acc, value); @@ -784,7 +828,7 @@ atomicXor(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicExchange(cuda_atomic_explicit, T volatile *acc, T value) +atomicExchange(cuda_atomic_explicit, T *acc, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicExchange(acc, value); @@ -796,7 +840,7 @@ atomicExchange(cuda_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(cuda_atomic_explicit, T volatile *acc, T compare, T value) +atomicCAS(cuda_atomic_explicit, T *acc, T compare, T value) { #ifdef __CUDA_ARCH__ return detail::cuda_atomicCAS(acc, compare, value); diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 333f0f90e8..493136400c 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -548,7 +548,8 @@ forall_impl(resources::Cuda cuda_res, if (len > 0) { auto func = reinterpret_cast( - 
&impl::forall_cuda_kernel); + &impl::forall_cuda_kernel); // // Setup shared memory buffers @@ -567,7 +568,7 @@ forall_impl(resources::Cuda cuda_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::cuda::make_launch_body( + LOOP_BODY body = RAJA::cuda::make_launch_body(func, dims.blocks, dims.threads, shmem, cuda_res, std::forward(loop_body)); // @@ -617,7 +618,8 @@ forall_impl(resources::Cuda cuda_res, if (len > 0) { auto func = reinterpret_cast( - impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY, IndexType, camp::decay >); + &impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY, + IndexType, camp::decay >); // // Setup shared memory buffers @@ -643,7 +645,7 @@ forall_impl(resources::Cuda cuda_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::cuda::make_launch_body( + LOOP_BODY body = RAJA::cuda::make_launch_body(func, dims.blocks, dims.threads, shmem, cuda_res, std::forward(loop_body)); // diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index b0d2ea7cf1..b2daa3a23e 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -334,10 +334,10 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) T temp = val; - if (numThreads % policy::cuda::WARP_SIZE == 0) { + if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -345,7 +345,7 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) } else { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -370,7 +370,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) { T temp = val; - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = __shfl_xor_sync(0xffffffff, temp, i); Combiner{}(temp, rhs); } @@ -388,15 +388,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; + int warpId = threadId % policy::cuda::device_constants.WARP_SIZE; + int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE; T temp = val; - if (numThreads % policy::cuda::WARP_SIZE == 0) { + if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -404,7 +404,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -415,18 +415,18 @@ RAJA_DEVICE RAJA_INLINE 
T block_reduce(T val, T identity) } // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { + if (numThreads > policy::cuda::device_constants.WARP_SIZE) { - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + static_assert(policy::cuda::device_constants.MAX_WARPS <= policy::cuda::device_constants.WARP_SIZE, + "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values"); // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -438,13 +438,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { + if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index c070d618ea..7465f515b0 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -237,7 +237,7 @@ struct CudaKernelLauncherGetter using type = camp::decay)>; static constexpr type get() noexcept { - return internal::CudaKernelLauncherFixed; + return &internal::CudaKernelLauncherFixed; } }; @@ -251,7 +251,7 @@ struct CudaKernelLauncherGetter<0, 0, Data, executor_t> using type = camp::decay)>; static constexpr type get() noexcept { - return internal::CudaKernelLauncher; + return &internal::CudaKernelLauncher; } }; @@ -281,10 +281,15 @@ struct CudaLaunchHelper; + inline static const void* get_func() + { + return reinterpret_cast(kernelGetter_t::get()); + } + inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -363,7 +368,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -400,17 +405,6 @@ struct CudaLaunchHelper; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static @@ -312,7 +312,7 @@ struct CudaStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::cuda::WARP_SIZE; + const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE; // request one thread per element in the segment set_cuda_dim(dims.dims.threads, len); @@ -352,7 +352,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, 
"BitMask is too large for CUDA warp size"); static @@ -391,7 +391,7 @@ struct CudaStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::cuda::WARP_SIZE; + const diff_t len = RAJA::policy::cuda::device_constants.WARP_SIZE; // request one thread per element in the segment set_cuda_dim(dims.dims.threads, len); diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index 8486abaa2c..87556ed8b1 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -273,7 +273,7 @@ struct CudaStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static inline RAJA_DEVICE @@ -332,7 +332,7 @@ struct CudaStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE, "BitMask is too large for CUDA warp size"); static inline RAJA_DEVICE diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 602221e58a..0db1dc4e0d 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -75,7 +75,8 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = reinterpret_cast( + &launch_global_fcn); resources::Cuda cuda_res = res.get(); @@ -99,17 +100,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); } RAJA_FT_END; @@ -128,7 +131,8 @@ struct LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -151,9 +155,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } @@ -234,7 +240,8 @@ struct LaunchExecute; - auto func = launch_global_fcn_fixed; + auto func = reinterpret_cast( + &launch_global_fcn_fixed); resources::Cuda cuda_res = res.get(); @@ -258,17 +265,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); } RAJA_FT_END; @@ -288,7 +297,8 @@ struct 
LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Cuda cuda_res = res.get(); @@ -312,9 +322,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::cuda::make_launch_body(func, + gridSize, blockSize, shared_mem_size, cuda_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, cuda_res, async, kernel_name); + RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp new file mode 100644 index 0000000000..f9f60f730e --- /dev/null +++ b/include/RAJA/policy/cuda/multi_reduce.hpp @@ -0,0 +1,764 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_multi_reduce_HPP +#define RAJA_cuda_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include +#include +#include +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/mutex.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/OffsetOperators.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#include "RAJA/policy/cuda/intrinsics.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + #include "RAJA/policy/desul/atomic.hpp" +#else + #include "RAJA/policy/cuda/atomic.hpp" +#endif + +#include "RAJA/policy/cuda/policy.hpp" +#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction algorithms. +// +////////////////////////////////////////////////////////////////////// +// + +//! combine value into global memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins), + T identity, + int bin, + T value, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + if (value == identity) { return; } + + int tally_index = GetTallyIndex::template index(); // globalWarpId by default + int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::cuda::atomic{}(tally_mem[tally_offset], value); +} + + +//! 
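// Illustrative sketch, not part of the patch: the replicated-tally idea used
// by block_multi_reduce_combine_global_atomic above. Each bin gets several
// independent slots; a thread picks a replica from its id with a power-of-two
// modulo, so concurrent atomics spread across slots instead of contending on
// one. The replica-major layout and all names here are assumptions, not
// RAJA's actual GetTallyOffset/GetTallyIndex.
__device__ int pick_replica(int index, int replication)
{
  return index & (replication - 1);  // valid when replication is a power of two
}

__device__ int tally_offset(int bin, int num_bins, int replica)
{
  return replica * num_bins + bin;   // one plausible replica-major layout
}

__device__ void combine_into_tally(float* tally_mem, int bin, int num_bins,
                                   int replication, float value)
{
  int global_thread = threadIdx.x + blockIdx.x * blockDim.x;
  int replica = pick_replica(global_thread, replication);
  atomicAdd(&tally_mem[tally_offset(bin, num_bins, replica)], value);
}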
initialize shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins, + T identity, + T* shared_mem, + int shared_replication) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + for (int shmem_offset = threadId; + shmem_offset < shared_replication * num_bins; + shmem_offset += numThreads) { + shared_mem[shmem_offset] = identity; + } + __syncthreads(); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins, + T identity, + int bin, + T value, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication) +{ + if (value == identity) { return; } + + int shared_index = GetSharedIndex::template index(); // threadId by default + int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication); + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + + RAJA::reduce::cuda::atomic{}(shared_mem[shmem_offset], value); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins, + T identity, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + __syncthreads(); + for (int bin = threadId; bin < num_bins; bin += numThreads) { + + T value = identity; + for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) { + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + Combiner{}(value, shared_mem[shmem_offset]); + } + + if (value != identity) { + int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::cuda::atomic{}(tally_mem[tally_offset], value); + } + + } +} + +} // namespace impl + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction classes. +// +////////////////////////////////////////////////////////////////////// +// + +//! MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_TallyData +{ + //! 
setup permanent settings, allocate and initialize tally memory + template < typename Container > + MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity) + : m_tally_mem(nullptr) + , m_identity(identity) + , m_num_bins(container.size()) + , m_tally_bins(get_tally_bins(m_num_bins)) + , m_tally_replication(get_tally_replication()) + { + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } + + MultiReduceGridAtomicHostInit_TallyData() = delete; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + ~MultiReduceGridAtomicHostInit_TallyData() = default; + + + //! reset permanent settings, reallocate and reset tally memory + template < typename Container > + void reset_permanent(Container const& container, T const& identity) + { + int new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + teardown_permanent(); + m_num_bins = new_num_bins; + m_tally_bins = get_tally_bins(m_num_bins); + m_tally_replication = get_tally_replication(); + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } else { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value; + ++bin; + } + } + for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) { + for (int bin = 0; bin < m_num_bins; ++bin) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity; + } + } + } + m_identity = identity; + } + + //! teardown permanent settings, free tally memory + void teardown_permanent() + { + destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication); + } + + + //! 
get value for bin, assumes synchronization occurred elsewhere + T get(int bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) { + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + reducer.combine(m_tally_mem[tally_offset]); + } + return reducer.get_and_clear(); + } + + + int num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + +private: + static constexpr size_t s_tally_alignment = std::max(size_t(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE), + size_t(RAJA::DATA_ALIGN)); + static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T)); + + using tally_mempool_type = device_pinned_mempool_type; + using tally_tuning = typename tuning::GlobalAtomicReplicationTuning; + using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer; + using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator; + using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch; + + + static int get_tally_bins(int num_bins) + { + return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size; + } + + static int get_tally_replication() + { + int min_tally_replication = 1; +#if defined(RAJA_ENABLE_OPENMP) + min_tally_replication = omp_get_max_threads(); +#endif + + struct { + int func_min_global_replication; + } func_data{min_tally_replication}; + + return TallyAtomicReplicationConcretizer{}.template + get_global_replication(func_data); + } + + template < typename Container > + static T* create_tally(Container const& container, T const& identity, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + T* tally_mem = tally_mempool_type::getInstance().template malloc( + tally_replication*tally_bins, s_tally_alignment); + + if (tally_replication > 0) { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(value); + ++bin; + } + } + for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) { + for (int bin = 0; bin < num_bins; ++bin) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(identity); + } + } + } + return tally_mem; + } + + static void destroy_tally(T*& tally_mem, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return; + } + + for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) { + for (int bin = num_bins; bin > 0; --bin) { + int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication); + tally_mem[tally_offset].~T(); + } + } + tally_mempool_type::getInstance().free(tally_mem); + tally_mem = nullptr; + } + +protected: + using GetTallyIndex = typename tally_tuning::ReplicationIndexer; + using GetTallyOffset = typename GetTallyOffset_rebind::template rebind; + + T* m_tally_mem; + T m_identity; + int m_num_bins; + int m_tally_bins; + int m_tally_replication; // power of 2, at least the max number of omp threads +}; + + +//! 
MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! defer to tally data for some functions + using TallyData::TallyData; + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! setup per launch, do nothing + void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) + { } + + //! teardown per launch, do nothing + void teardown_launch() + { } + + + //! setup on device, do nothing + RAJA_DEVICE + void setup_device() + { } + + //! finalize on device, do nothing + RAJA_DEVICE + void finalize_device() + { } + + + //! combine value on device, combine a value into the tally atomically + RAJA_DEVICE + void combine_device(int bin, T value) + { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + + //! combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; +}; + + +//! MultiReduction data for Cuda Offload -- stores value, host pointer +template +struct MultiReduceBlockThenGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! setup permanent settings, defer to tally data + template < typename Container > + MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity) + : TallyData(container, identity) + , m_shared_offset(s_shared_offset_unknown) + , m_shared_replication(0) + { } + + MultiReduceBlockThenGridAtomicHostInit_Data() = delete; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + ~MultiReduceBlockThenGridAtomicHostInit_Data() = default; + + + //! defer to tally data for some functions + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! 
setup per launch, setup shared memory parameters + void setup_launch(size_t block_size) + { + if (m_num_bins == size_t(0)) { + m_shared_offset = s_shared_offset_invalid; + return; + } + + size_t shared_replication = 0; + const size_t shared_offset = allocateDynamicShmem( + [&](size_t max_shmem_size) { + + struct { + size_t func_threads_per_block; + size_t func_max_shared_replication_per_block; + } func_data{block_size, max_shmem_size / m_num_bins}; + + shared_replication = SharedAtomicReplicationConcretizer{}.template + get_shared_replication(func_data); + return m_num_bins * shared_replication; + }); + + if (shared_offset != dynamic_smem_allocation_failure) { + m_shared_replication = static_cast(shared_replication); + m_shared_offset = static_cast(shared_offset); + } else { + m_shared_offset = s_shared_offset_invalid; + } + } + + //! teardown per launch, unset shared memory parameters + void teardown_launch() + { + m_shared_replication = 0; + m_shared_offset = s_shared_offset_unknown; + } + + + //! setup on device, initialize shared memory + RAJA_DEVICE + void setup_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_init_shmem( + m_num_bins, m_identity, + shared_mem, m_shared_replication); + } + } + + //! finalize on device, combine values in shared memory into the tally + RAJA_DEVICE + void finalize_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::grid_multi_reduce_shmem_to_global_atomic( + m_num_bins, m_identity, + shared_mem, GetSharedOffset{}, m_shared_replication, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + + //! combine value on device, combine a value into shared memory + RAJA_DEVICE + void combine_device(int bin, T value) + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_combine_shmem_atomic( + m_num_bins, m_identity, + bin, value, + shared_mem, GetSharedOffset{}, m_shared_replication); + } else { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + //! 
combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using shared_tuning = typename tuning::SharedAtomicReplicationTuning; + using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer; + using GetSharedIndex = typename shared_tuning::ReplicationIndexer; + using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator; + using GetSharedOffset = typename GetSharedOffset_rebind::template rebind; + + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + + static constexpr int s_shared_offset_unknown = std::numeric_limits::max(); + static constexpr int s_shared_offset_invalid = std::numeric_limits::max() - 1; + + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; + + int m_shared_offset; // in bytes + int m_shared_replication; // power of 2 + + + RAJA_DEVICE + T* get_shared_mem() const + { + if (m_shared_offset == s_shared_offset_invalid) { + return nullptr; + } + extern __shared__ char shared_mem[]; + return reinterpret_cast(&shared_mem[m_shared_offset]); + } +}; + + +/*! + ************************************************************************** + * + * \brief Cuda multi-reduce data class template. + * + * This class manages synchronization, data lifetimes, and interaction with + * the runtime kernel launch info passing facilities. + * + * This class manages the lifetime of underlying reduce_data_type using + * calls to setup and teardown methods. This includes storage durations: + * - permanent, the lifetime of the parent object + * - launch, setup before a launch using the launch parameters and + * teardown after the launch + * - device, setup all device threads in a kernel before any block work and + * teardown all device threads after all block work is finished + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataCuda +{ + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = + std::conditional_t<(atomic_available), + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic), + cuda::MultiReduceBlockThenGridAtomicHostInit_Data, + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic), + cuda::MultiReduceGridAtomicHostInit_Data, + void>>, + void>; + + + using SyncList = std::vector; + +public: + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataCuda() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataCuda(Container const& container, T identity) + : m_parent(this) + , m_sync_list(new SyncList) + , m_data(container, identity) + , m_own_launch_data(false) + { + } + + //! copy and on host attempt to setup for device + // init val_ptr to avoid uninitialized read caused by host copy of + // reducer in host device lambda not being used on device. 
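// ----------------------------------------------------------------------------
// A minimal usage sketch for the multi-reduce machinery documented above. The
// front-end RAJA::MultiReduce* objects are declared at the bottom of this file
// via RAJA_DECLARE_ALL_MULTI_REDUCERS; the policy spelling and the
// operator[] / get() calls below follow RAJA's documented multi-reduce
// interface but are assumptions here, shown only to illustrate the lifetime:
// construct on the host (permanent), copy into the kernel (launch), combine
// per bin on the device, read results back with get().
#include "RAJA/RAJA.hpp"

void example_histogram(const int* bin_ids, const int n)  // hypothetical helper
{
  constexpr int num_bins = 8;
  RAJA::MultiReduceSum<RAJA::cuda_multi_reduce_atomic, int> hist(num_bins, 0);

  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) {
        hist[bin_ids[i]] += 1;   // combine_device() through this data class
      });

  for (int b = 0; b < num_bins; ++b) {
    int count = hist.get(b);     // synchronizes streams, then folds the tally
    (void)count;
  }
}
// ----------------------------------------------------------------------------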
+ RAJA_HOST_DEVICE + MultiReduceDataCuda(MultiReduceDataCuda const& other) +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + : m_parent(other.m_parent) +#else + : m_parent(&other) +#endif + , m_sync_list(other.m_sync_list) + , m_data(other.m_data) + , m_own_launch_data(false) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent) { + if (setupReducers()) { + // the copy made in make_launch_body does this setup + add_resource_to_synchronization_list(currentResource()); + m_data.setup_launch(currentBlockSize()); + m_own_launch_data = true; + m_parent = nullptr; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device enters this branch + m_data.setup_device(); + } +#endif + } + + MultiReduceDataCuda(MultiReduceDataCuda &&) = delete; + MultiReduceDataCuda& operator=(MultiReduceDataCuda const&) = delete; + MultiReduceDataCuda& operator=(MultiReduceDataCuda &&) = delete; + + //! cleanup resources owned by this copy + // on device store in pinned buffer on host + RAJA_HOST_DEVICE + ~MultiReduceDataCuda() + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent == this) { + // the original object, owns permanent storage + synchronize_resources_and_clear_list(); + delete m_sync_list; + m_sync_list = nullptr; + m_data.teardown_permanent(); + } else if (m_parent) { + // do nothing + } else { + if (m_own_launch_data) { + // the copy made in make_launch_body, owns launch data + m_data.teardown_launch(); + m_own_launch_data = false; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device, does finalization on the device + m_data.finalize_device(); + } +#endif + } + + + template < typename Container > + void reset(Container const& container, T identity) + { + synchronize_resources_and_clear_list(); + m_data.reset_permanent(container, identity); + } + + + //! apply reduction (const version) -- still combines internal values + RAJA_HOST_DEVICE + void combine(int bin, T const& value) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + m_data.combine_host(bin, value); +#else + m_data.combine_device(bin, value); +#endif + } + + + //! 
map result value back to host if not done already; return aggregate value + T get(int bin) + { + synchronize_resources_and_clear_list(); + return m_data.get(bin); + } + + + size_t num_bins() const { return m_data.num_bins(); } + + T identity() const { return m_data.identity(); } + + +private: + MultiReduceDataCuda const *m_parent; + SyncList* m_sync_list; + reduce_data_type m_data; + bool m_own_launch_data; + + void add_resource_to_synchronization_list(resources::Cuda res) + { + for (resources::Cuda& list_res : *m_sync_list) { + if (list_res.get_stream() == res.get_stream()) { + return; + } + } + m_sync_list->emplace_back(res); + } + + void synchronize_resources_and_clear_list() + { + for (resources::Cuda& list_res : *m_sync_list) { + ::RAJA::cuda::synchronize(list_res); + } + m_sync_list->clear(); + } +}; + +} // end namespace cuda + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy, cuda::MultiReduceDataCuda) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp index d845bccfc2..4edf645ed3 100644 --- a/include/RAJA/policy/cuda/params/kernel_name.hpp +++ b/include/RAJA/policy/cuda/params/kernel_name.hpp @@ -1,11 +1,10 @@ #ifndef CUDA_KERNELNAME_HPP #define CUDA_KERNELNAME_HPP -//#include "../util/policy.hpp" - #if defined(RAJA_CUDA_ACTIVE) #include +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" #include "RAJA/pattern/params/kernel_name.hpp" namespace RAJA { @@ -15,10 +14,12 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_cuda_policy > - init(KernelName& kn, const RAJA::cuda::detail::cudaInfo & cs) + init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &) { #if defined(RAJA_ENABLE_NV_TOOLS_EXT) nvtxRangePush(kn.name); +#else + RAJA_UNUSED_VAR(kn); #endif } @@ -31,7 +32,7 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(KernelName&) + resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &) { #if defined(RAJA_ENABLE_NV_TOOLS_EXT) nvtxRangePop(); diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 84cd8a301c..cd71a37480 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -31,7 +31,9 @@ #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/util/Operators.hpp" +#include "RAJA/util/OffsetOperators.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -79,6 +81,13 @@ struct IndexGlobal; template struct IndexFlatten; +template +struct IndexDivide; + +template +struct IndexModulo; + + /*! * Use the max occupancy of a kernel on the current device when launch * parameters are not fully determined. @@ -160,6 +169,84 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer }; +/*! + * Get an amount of replication that is preferred_replication. + */ +template < size_t preferred_replication > +struct ConstantPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data)) + { + return IdxT(preferred_replication); + } +}; + +/*! + * Get an amount of replication that is preferred_replication_before_cutoff if + * data.func_threads_per_block is less than t_cutoff or + * preferred_replication_after_cutoff otherwise. 
+ */ +template < size_t t_cutoff, size_t preferred_replication_before_cutoff, + size_t preferred_replication_after_cutoff > +struct ThreadsPerBlockCutoffPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& data) + { + IdxT cutoff = t_cutoff; + IdxT func_threads_per_block = data.func_threads_per_block; + + if (func_threads_per_block < cutoff) { + return IdxT(preferred_replication_before_cutoff); + } else { + return IdxT(preferred_replication_after_cutoff); + } + } +}; + +/*! + * Get an amount of shared atomic replication that is a power of 2 that is at + * most the amount given by data.func_max_shared_replication_per_block or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct SharedAtomicReplicationMaxPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_shared_replication(Data const& data) + { + IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return prev_pow2(std::min(preferred_replication, + func_max_shared_replication_per_block)); + } +}; + +/*! + * Get an amount of global atomic replication that is a power of 2 that is at + * least the amount given by data.func_min_global_replication or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct GlobalAtomicReplicationMinPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_global_replication(Data const& data) + { + IdxT func_min_global_replication = data.func_min_global_replication; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return next_pow2(std::max(preferred_replication, func_min_global_replication)); + } +}; + + enum struct reduce_algorithm : int { combine_last_block, @@ -181,6 +268,36 @@ struct ReduceTuning static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool consistent = + (algorithm == reduce_algorithm::combine_last_block); +}; + + +enum struct multi_reduce_algorithm : int +{ + init_host_combine_block_atomic_then_grid_atomic, + init_host_combine_global_atomic +}; + +template < typename t_AtomicReplicationConcretizer, + typename t_ReplicationIndexer, + typename t_OffsetCalculator > +struct AtomicReplicationTuning +{ + using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer; + using ReplicationIndexer = t_ReplicationIndexer; + using OffsetCalculator = t_OffsetCalculator; +}; + +template < multi_reduce_algorithm t_algorithm, + typename t_SharedAtomicReplicationTuning, + typename t_GlobalAtomicReplicationTuning > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning; + using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning; + static constexpr bool consistent = false; }; } // namespace cuda @@ -190,9 +307,38 @@ namespace policy namespace cuda { +struct DeviceConstants +{ + RAJA::Index_type WARP_SIZE; + RAJA::Index_type MAX_BLOCK_SIZE; + RAJA::Index_type MAX_WARPS; + RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics + + constexpr 
DeviceConstants(RAJA::Index_type warp_size, + RAJA::Index_type max_block_size, + RAJA::Index_type atomic_cache_line_bytes) noexcept + : WARP_SIZE(warp_size) + , MAX_BLOCK_SIZE(max_block_size) + , MAX_WARPS(max_block_size / warp_size) + , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes) + { } +}; + +// +// Operations in the included files are parametrized using the following +// values for CUDA warp size and max block size. +// +constexpr DeviceConstants device_constants(32, 1024, 32); // V100 +static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS, + "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS"); +static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0, + "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not " + "a multiple of device_constants.WARP_SIZE"); + constexpr const size_t MIN_BLOCKS_PER_SM = 1; constexpr const size_t MAX_BLOCKS_PER_SM = 32; + template struct cuda_indexer {}; @@ -268,7 +414,22 @@ struct cuda_reduce_policy make_policy_pattern_launch_platform_t::value, - RAJA::Platform::cuda> { + RAJA::Platform::cuda, + std::conditional_t> { +}; + +template < typename tuning > +struct cuda_multi_reduce_policy + : public RAJA:: + make_policy_pattern_launch_platform_t::value, + RAJA::Platform::cuda, + std::conditional_t> { }; /*! @@ -285,74 +446,6 @@ struct cuda_atomic_explicit{}; using cuda_atomic = cuda_atomic_explicit; -template < RAJA::cuda::reduce_algorithm algorithm, - RAJA::cuda::block_communication_mode comm_mode, - size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified > -using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< - algorithm, comm_mode, replication, atomic_stride> >; - -// Policies for RAJA::Reduce* objects with specific behaviors. -// - *atomic* policies may use atomics to combine partial results and falls back -// on a non-atomic policy when atomics can't be used with the given type. The -// use of atomics leads to order of operation differences which change the -// results of floating point sum reductions run to run. The memory used with -// atomics is initialized on the device which can be expensive on some HW. -// On some HW this is faster overall than the non-atomic policies. -// - *atomic_host* policies are similar to the atomic policies above. However -// the memory used with atomics is initialized on the host which is -// significantly cheaper on some HW. On some HW this is faster overall than -// the non-atomic and atomic policies. -// - *device_fence policies use normal memory accesses with device scope fences -// in the implementation. This works on all HW. -// - *block_fence policies use special (atomic) memory accesses that only cache -// in a cache shared by the whole device to avoid having to use -// device scope fences. This improves performance on some HW but -// is more difficult to code correctly. 
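// ----------------------------------------------------------------------------
// A minimal migration sketch, assuming "RAJA/policy/cuda/policy.hpp" is
// included: the free WARP_SIZE / MAX_BLOCK_SIZE / MAX_WARPS constants removed
// further down are now members of the constexpr device_constants object
// defined above, so call sites only change spelling and the values still fold
// at compile time. The helper name is hypothetical.
__device__ inline int example_warp_number(int threadId)
{
  return static_cast<int>(threadId /
                          RAJA::policy::cuda::device_constants.WARP_SIZE);
}

static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS ==
                  RAJA::policy::cuda::device_constants.MAX_BLOCK_SIZE /
                  RAJA::policy::cuda::device_constants.WARP_SIZE,
              "MAX_WARPS is derived from the other two values in the constructor");
// ----------------------------------------------------------------------------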
-using cuda_reduce_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::combine_last_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::combine_last_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, - RAJA::cuda::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, - RAJA::cuda::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; - -// Policy for RAJA::Reduce* objects that gives the same answer every time when -// used in the same way -using cuda_reduce = cuda_reduce_device_fence; - -// Policy for RAJA::Reduce* objects that may use atomics and may not give the -// same answer every time when used in the same way -using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; - -// Policy for RAJA::Reduce* objects that lets you select the default atomic or -// non-atomic policy with a bool -template < bool with_atomic > -using cuda_reduce_base = std::conditional_t; - - // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 struct cuda_block_reduce{}; @@ -400,21 +493,6 @@ template struct cuda_thread_masked_loop {}; - -// -// Operations in the included files are parametrized using the following -// values for CUDA warp size and max block size. 
-// -constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 32; -constexpr const RAJA::Index_type WARP_SIZE = 32; -constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; -constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; -static_assert(WARP_SIZE >= MAX_WARPS, - "RAJA Assumption Broken: WARP_SIZE < MAX_WARPS"); -static_assert(MAX_BLOCK_SIZE % WARP_SIZE == 0, - "RAJA Assumption Broken: MAX_BLOCK_SIZE not " - "a multiple of WARP_SIZE"); - struct cuda_synchronize : make_policy_pattern_launch_t { @@ -992,6 +1070,38 @@ struct IndexFlatten }; +template +struct IndexDivide +{ + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() / static_cast(divisor); + } + + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return RAJA_DIVIDE_CEILING_INT(indexer::template size(), static_cast(divisor)); + } +}; + +template +struct IndexModulo +{ + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() % static_cast(divisor); + } + + template < typename IdxT = cuda_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return static_cast(divisor); + } +}; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > @@ -1037,6 +1147,13 @@ using thread_y = IndexGlobal; template using thread_z = IndexGlobal; +template +using thread_xyz = IndexFlatten, + thread_y, + thread_z>; + template using block_x = IndexGlobal; template @@ -1044,6 +1161,13 @@ using block_y = IndexGlobal; template using block_z = IndexGlobal; +template +using block_xyz = IndexFlatten, + block_y, + block_z>; + template using global_x = IndexGlobal; template @@ -1051,6 +1175,42 @@ using global_y = IndexGlobal; template using global_z = IndexGlobal; + +template +using global_xyz = IndexFlatten, + global_y, + global_z>; + + +template +using warp_xyz = IndexDivide>; + +template +using warp_global_xyz = IndexFlatten, + block_xyz>; + } // namespace cuda // contretizers used in forall, scan, and sort policies @@ -1248,16 +1408,147 @@ using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average; using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; + // policies usable with reducers -using policy::cuda::cuda_reduce_device_fence; -using policy::cuda::cuda_reduce_block_fence; -using policy::cuda::cuda_reduce_atomic_device_init_device_fence; -using policy::cuda::cuda_reduce_atomic_device_init_block_fence; -using policy::cuda::cuda_reduce_atomic_host_init_device_fence; -using policy::cuda::cuda_reduce_atomic_host_init_block_fence; -using policy::cuda::cuda_reduce_base; -using policy::cuda::cuda_reduce; -using policy::cuda::cuda_reduce_atomic; +template < cuda::reduce_algorithm algorithm, + cuda::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy< + cuda::ReduceTuning>; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - non-atomic policies store partial results and combine them in the same +// order every time, leading to consistent results for a loop run to run. +// - *atomic* policies may use atomics to combine partial results. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. 
Falls back +// on a non-atomic implementation if atomics can't be used with the given +// type. The memory used with atomics is initialized on the device using +// atomics which adds overhead. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host. This is faster +// overall than other policies on HW with direct host access to device memory +// such as the IBM power 9 + Nvidia V100 Sierra/Lassen systems. +// - *device_fence* policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence* policies use special (atomic) memory accesses that use +// a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using cuda_reduce_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::combine_last_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::combine_last_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_device_combine_atomic_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_device_combine_atomic_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_host_combine_atomic_block, + cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + cuda::reduce_algorithm::init_host_combine_atomic_block, + cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using cuda_reduce = cuda_reduce_device_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; + + +// policies usable with multi_reducers +template < cuda::multi_reduce_algorithm algorithm, + typename SharedAtomicReplicationConcretizer, + typename SharedAtomicReplicationIndexer, + typename GlobalAtomicReplicationConcretizer, + typename GlobalAtomicReplicationIndexer > +using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy< + cuda::MultiReduceTuning< + algorithm, + cuda::AtomicReplicationTuning>, + cuda::AtomicReplicationTuning>>>; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. 
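// ----------------------------------------------------------------------------
// A hedged usage sketch for the reduce policy aliases re-added above (they
// move here unchanged from the policy namespace): cuda_reduce gives the same
// answer run to run, while cuda_reduce_atomic may reorder floating-point
// combines. Assumes "RAJA/RAJA.hpp" is included and a CUDA build; the helper
// name is hypothetical.
double example_sum(const double* x, const int n)
{
  RAJA::ReduceSum<RAJA::cuda_reduce, double>        repeatable_sum(0.0); // consistent run to run
  RAJA::ReduceSum<RAJA::cuda_reduce_atomic, double> atomic_sum(0.0);     // may reorder FP adds

  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) {
        repeatable_sum += x[i];
        atomic_sum     += x[i];
      });

  // Both results are correct; only the floating-point rounding may differ.
  return repeatable_sum.get();
}
// ----------------------------------------------------------------------------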
+// - *no_replication* policies use the minimum amount of resources. The +// lack of resources means they may perform poorly. These policies are +// intended for use cases where low overhead is more important than high +// performance such as error flags that are rarely set. +// - *host_init* policies initialize memory used with atomics on the host. +// This is faster overall than other policies on HW with direct host access +// to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen +// systems. +using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + cuda::SharedAtomicReplicationMaxPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<16>>, + cuda::thread_xyz<>, + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// special policy to test that multi-reducers work if there is not enough shmem +using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + cuda::SharedAtomicReplicationMaxPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<0>>, + cuda::thread_xyz<>, + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// +using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<2>>, + cuda::warp_global_xyz<>>; +// +using cuda_multi_reduce_atomic_global_no_replication_host_init = cuda_multi_reduce_tuning< + cuda::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + cuda::GlobalAtomicReplicationMinPow2Concretizer< + cuda::ConstantPreferredReplicationConcretizer<1>>, + cuda::block_xyz<>>; + +// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_multi_reduce_atomic = cuda_multi_reduce_atomic_block_then_atomic_grid_host_init; +// Similar to above but optimized for low overhead in cases where it is rarely used +using cuda_multi_reduce_atomic_low_performance_low_overhead = + cuda_multi_reduce_atomic_global_no_replication_host_init; + // policies usable with kernel using policy::cuda::cuda_block_reduce; @@ -1266,11 +1557,11 @@ using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, - cuda::thread_x>; + cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< iteration_mapping::StridedLoop, kernel_sync_requirement::none, - cuda::thread_x>; + cuda::thread_x>; using policy::cuda::cuda_warp_masked_direct; using policy::cuda::cuda_warp_masked_loop; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 516b02383c..8d55698af8 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -206,15 +206,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) const int numThreads = ThreadIterationGetter::size(); const int threadId = 
ThreadIterationGetter::index(); - const int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; - const int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; + const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE; + const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE; T temp = val; - if (numThreads % RAJA::policy::cuda::WARP_SIZE == 0) { + if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < RAJA::policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) { T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -222,7 +222,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < RAJA::policy::cuda::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -232,18 +232,18 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } } - static_assert(RAJA::policy::cuda::MAX_WARPS <= RAJA::policy::cuda::WARP_SIZE, + static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <= RAJA::policy::cuda::device_constants.WARP_SIZE, "Max Warps must be less than or equal to Warp Size for this algorithm to work"); // reduce per warp values - if (numThreads > RAJA::policy::cuda::WARP_SIZE) { + if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE) { // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -255,13 +255,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * RAJA::policy::cuda::WARP_SIZE < numThreads) { + if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < RAJA::policy::cuda::MAX_WARPS; i *= 2) { + for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS; i *= 2) { T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -887,8 +887,8 @@ class Reduce : 1; static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) ? tuning::atomic_stride - : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : ((policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp index dbcb5e06eb..71bf429079 100644 --- a/include/RAJA/policy/desul/atomic.hpp +++ b/include/RAJA/policy/desul/atomic.hpp @@ -30,8 +30,32 @@ RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE T -atomicAdd(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_add(const_cast(acc), +atomicLoad(AtomicPolicy, T *acc) +{ + return desul::atomic_load(acc, + raja_default_desul_order{}, + raja_default_desul_scope{}); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE void +atomicStore(AtomicPolicy, T *acc, T value) +{ + desul::atomic_store(acc, + value, + raja_default_desul_order{}, + raja_default_desul_scope{}); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T +atomicAdd(AtomicPolicy, T *acc, T value) +{ + return desul::atomic_fetch_add(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -41,8 +65,9 @@ RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE T -atomicSub(AtomicPolicy, T volatile *acc, T value) { - return desul::atomic_fetch_sub(const_cast(acc), +atomicSub(AtomicPolicy, T *acc, T value) +{ + return desul::atomic_fetch_sub(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -51,9 +76,9 @@ atomicSub(AtomicPolicy, T volatile *acc, T value) { RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_min(const_cast(acc), + return desul::atomic_fetch_min(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -62,9 +87,9 @@ RAJA_INLINE T atomicMin(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_max(const_cast(acc), + return desul::atomic_fetch_max(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -73,9 +98,9 @@ RAJA_INLINE T atomicMax(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) +RAJA_INLINE T atomicInc(AtomicPolicy, T *acc) { - return desul::atomic_fetch_inc(const_cast(acc), + return desul::atomic_fetch_inc(acc, raja_default_desul_order{}, raja_default_desul_scope{}); } @@ -83,22 +108,22 @@ RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val) +RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val) { // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc - return desul::atomic_fetch_inc_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_inc_mod(acc, + val, + raja_default_desul_order{}, + raja_default_desul_scope{}); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) +RAJA_INLINE T atomicDec(AtomicPolicy, T *acc) { - return desul::atomic_fetch_dec(const_cast(acc), + return desul::atomic_fetch_dec(acc, raja_default_desul_order{}, raja_default_desul_scope{}); } @@ -106,22 
+131,22 @@ RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val) +RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val) { // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec - return desul::atomic_fetch_dec_mod(const_cast(acc), - val, - raja_default_desul_order{}, - raja_default_desul_scope{}); + return desul::atomic_fetch_dec_mod(acc, + val, + raja_default_desul_order{}, + raja_default_desul_scope{}); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_and(const_cast(acc), + return desul::atomic_fetch_and(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -130,9 +155,9 @@ RAJA_INLINE T atomicAnd(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_or(const_cast(acc), + return desul::atomic_fetch_or(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -141,9 +166,9 @@ RAJA_INLINE T atomicOr(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value) { - return desul::atomic_fetch_xor(const_cast(acc), + return desul::atomic_fetch_xor(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -152,9 +177,9 @@ RAJA_INLINE T atomicXor(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value) { - return desul::atomic_exchange(const_cast(acc), + return desul::atomic_exchange(acc, value, raja_default_desul_order{}, raja_default_desul_scope{}); @@ -163,9 +188,9 @@ RAJA_INLINE T atomicExchange(AtomicPolicy, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(AtomicPolicy, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value) { - return desul::atomic_compare_exchange(const_cast(acc), + return desul::atomic_compare_exchange(acc, compare, value, raja_default_desul_order{}, diff --git a/include/RAJA/policy/hip.hpp b/include/RAJA/policy/hip.hpp index a1578fd9df..ab7e922c0f 100644 --- a/include/RAJA/policy/hip.hpp +++ b/include/RAJA/policy/hip.hpp @@ -33,6 +33,7 @@ #include "RAJA/policy/hip/forall.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/reduce.hpp" +#include "RAJA/policy/hip/multi_reduce.hpp" #include "RAJA/policy/hip/scan.hpp" #include "RAJA/policy/hip/sort.hpp" #include "RAJA/policy/hip/kernel.hpp" diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 84c6d1fa38..f1f69eab5e 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -50,6 +50,26 @@ namespace RAJA namespace hip { +//! Get the properties of the current device +RAJA_INLINE +hipDeviceProp_t get_device_prop() +{ + int device; + hipErrchk(hipGetDevice(&device)); + hipDeviceProp_t prop; + hipErrchk(hipGetDeviceProperties(&prop, device)); + return prop; +} + +//! 
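// ----------------------------------------------------------------------------
// A minimal usage sketch for the pointer-based desul wrappers in the hunk
// above: the public entry points now take plain T* (the volatile qualifier
// and const_cast are gone) and a policy tag as the first argument, matching
// the signatures shown. The helper name is hypothetical; assumes a CUDA build
// with RAJA atomics included.
template <typename T>
__device__ inline T example_fetch_add(T* counter, T value)
{
  // Forwards to desul::atomic_fetch_add when desul atomics are enabled.
  return RAJA::atomicAdd(RAJA::cuda_atomic{}, counter, value);
}
// ----------------------------------------------------------------------------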
Get a reference to a static cached copy of the current device properties. +// This caches a copy on first use to speedup later calls. +RAJA_INLINE +hipDeviceProp_t& device_prop() +{ + static thread_local hipDeviceProp_t prop = get_device_prop(); + return prop; +} + //! Allocator for pinned memory for use in basic_mempool struct PinnedAllocator { @@ -143,36 +163,22 @@ namespace detail //! struct containing data necessary to coordinate kernel launches with reducers struct hipInfo { + const void* func = nullptr; hip_dim_t gridDim{0, 0, 0}; hip_dim_t blockDim{0, 0, 0}; + size_t* dynamic_smem = nullptr; ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0,0)}; bool setup_reducers = false; +}; +struct hipStatusInfo : hipInfo { #if defined(RAJA_ENABLE_OPENMP) - hipInfo* thread_states = nullptr; omp::mutex lock; #endif }; -//! class that changes a value on construction then resets it at destruction -template -class SetterResetter -{ -public: - SetterResetter(T& val, T new_val) : m_val(val), m_old_val(val) - { - m_val = new_val; - } - SetterResetter(const SetterResetter&) = delete; - ~SetterResetter() { m_val = m_old_val; } - -private: - T& m_val; - T m_old_val; -}; - -extern hipInfo g_status; +extern hipStatusInfo g_status; -extern hipInfo tl_status; +extern hipStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif @@ -272,54 +278,94 @@ bool setupReducers() { return detail::tl_status.setup_reducers; } RAJA_INLINE hip_dim_t currentGridDim() { return detail::tl_status.gridDim; } +//! get grid size of current launch +RAJA_INLINE +hip_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x * + detail::tl_status.gridDim.y * + detail::tl_status.gridDim.z; } + //! get blockDim of current launch RAJA_INLINE hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; } +//! get block size of current launch +RAJA_INLINE +hip_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x * + detail::tl_status.blockDim.y * + detail::tl_status.blockDim.z; } + +//! get dynamic shared memory usage for current launch +RAJA_INLINE +size_t currentDynamicShmem() { return *detail::tl_status.dynamic_smem; } + +//! get maximum dynamic shared memory for current launch +RAJA_INLINE +size_t maxDynamicShmem() +{ + hipFuncAttributes func_attr; + hipErrchk(hipFuncGetAttributes(&func_attr, detail::tl_status.func)); + return func_attr.maxDynamicSharedSizeBytes; +} + +constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits::max(); + +//! Allocate dynamic shared memory for current launch +// +// The first argument is a functional object that takes the maximum number of +// objects that can fit into the dynamic shared memory available and returns +// the number of objects to allocate. +// The second argument is the required alignment. +// +// Returns an offset into dynamic shared memory aligned to align on success, +// or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory +// takes the failure return path. +template < typename T, typename GetNFromMax > +RAJA_INLINE +size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T)) +{ + const size_t unaligned_shmem = *detail::tl_status.dynamic_smem; + const size_t align_offset = ((unaligned_shmem % align) != size_t(0)) + ? 
align - (unaligned_shmem % align) + : size_t(0); + const size_t aligned_shmem = unaligned_shmem + align_offset; + + const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem; + const size_t n_bytes = sizeof(T) * + std::forward(get_n_from_max)(max_shmem_bytes / sizeof(T)); + + if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) { + *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes; + return aligned_shmem; + } else { + return dynamic_smem_allocation_failure; + } +} + //! get resource for current launch RAJA_INLINE ::RAJA::resources::Hip currentResource() { return detail::tl_status.res; } //! create copy of loop_body that is setup for device execution +// +// Note: This is done to setup the Reducer and MultiReducer objects through +// their copy constructors. Both look at tl_status to setup per kernel launch +// resources. template RAJA_INLINE typename std::remove_reference::type make_launch_body( + const void* func, hip_dim_t gridDim, hip_dim_t blockDim, - size_t RAJA_UNUSED_ARG(dynamic_smem), + size_t& dynamic_smem, ::RAJA::resources::Hip res, LOOP_BODY&& loop_body) { - detail::SetterResetter setup_reducers_srer( - detail::tl_status.setup_reducers, true); - detail::SetterResetter<::RAJA::resources::Hip> res_srer( - detail::tl_status.res, res); - - detail::tl_status.gridDim = gridDim; - detail::tl_status.blockDim = blockDim; + ::RAJA::detail::ScopedAssignment info_sa(detail::tl_status, + detail::hipInfo{func, gridDim, blockDim, &dynamic_smem, res, true}); using return_type = typename std::remove_reference::type; return return_type(std::forward(loop_body)); } -//! Get the properties of the current device -RAJA_INLINE -hipDeviceProp_t get_device_prop() -{ - int device; - hipErrchk(hipGetDevice(&device)); - hipDeviceProp_t prop; - hipErrchk(hipGetDeviceProperties(&prop, device)); - return prop; -} - -//! 
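// ----------------------------------------------------------------------------
// A worked example of the offset round-up performed by allocateDynamicShmem
// above, with illustrative numbers: if 20 bytes of dynamic shared memory have
// already been handed out and the request needs 16-byte alignment, the next
// allocation starts at offset 32. The namespace is hypothetical.
namespace example_shmem_offset {
constexpr size_t unaligned = 20;  // bytes already handed out
constexpr size_t align     = 16;  // required alignment of the new request
constexpr size_t offset    =
    (unaligned % align) != size_t(0) ? align - (unaligned % align) : size_t(0);
static_assert(unaligned + offset == 32, "aligned start of the new allocation");
}
// ----------------------------------------------------------------------------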
Get a copy of the device properties, this copy is cached on first use to speedup later calls -RAJA_INLINE -hipDeviceProp_t& device_prop() -{ - static thread_local hipDeviceProp_t prop = get_device_prop(); - return prop; -} - static constexpr int hip_occupancy_uninitialized_int = -1; static constexpr size_t hip_occupancy_uninitialized_size_t = diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp index 389b24e35a..26d45d7bd9 100644 --- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp @@ -291,7 +291,7 @@ struct WorkRunner< // // TODO: Privatize the loop_body, using make_launch_body to setup reductions // - // LOOP_BODY body = RAJA::hip::make_launch_body( + // LOOP_BODY body = RAJA::hip::make_launch_body(func, // gridSize, blockSize, shmem, stream, std::forward(loop_body)); storage.template emplace( diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp index e16f31bb5b..b4f0d7faa7 100644 --- a/include/RAJA/policy/hip/atomic.hpp +++ b/include/RAJA/policy/hip/atomic.hpp @@ -22,447 +22,691 @@ #if defined(RAJA_ENABLE_HIP) +#include #include #include #include "hip/hip_runtime.h" +#include "camp/list.hpp" + #include "RAJA/policy/sequential/atomic.hpp" #include "RAJA/policy/atomic_builtin.hpp" #if defined(RAJA_ENABLE_OPENMP) #include "RAJA/policy/openmp/atomic.hpp" #endif -#include "RAJA/util/camp_aliases.hpp" -#include "RAJA/util/concepts.hpp" +#include "RAJA/util/EnableIf.hpp" #include "RAJA/util/Operators.hpp" #include "RAJA/util/TypeConvert.hpp" #include "RAJA/util/macros.hpp" +// TODO: When we can use if constexpr in C++17, this file can be cleaned up namespace RAJA { + namespace detail { +using hip_atomicCommon_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long +>; + /*! - * Generic impementation of atomic 32-bit or 64-bit compare and swap primitive. - * Implementation uses the existing HIP supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the value that was stored before this operation. + * Type trait for determining if atomic operators should be implemented + * using builtin functions. This type trait can be used for a lot of atomic + * operators. More specific type traits are added when needed, such as + * hip_useBuiltinExchange below. 
*/ -RAJA_INLINE __device__ unsigned hip_atomic_CAS( - unsigned volatile *acc, - unsigned compare, - unsigned value) -{ - return ::atomicCAS((unsigned *)acc, compare, value); -} -/// -RAJA_INLINE __device__ unsigned long long hip_atomic_CAS( - unsigned long long volatile *acc, - unsigned long long compare, - unsigned long long value) -{ - return ::atomicCAS((unsigned long long *)acc, compare, value); -} -/// template -RAJA_INLINE __device__ -typename std::enable_if::type -hip_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - hip_atomic_CAS((unsigned volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} -/// -template -RAJA_INLINE __device__ -typename std::enable_if::type -hip_atomic_CAS(T volatile *acc, T compare, T value) -{ - return RAJA::util::reinterp_A_as_B( - hip_atomic_CAS((unsigned long long volatile *)acc, - RAJA::util::reinterp_A_as_B(compare), - RAJA::util::reinterp_A_as_B(value))); -} - -template -struct HipAtomicCAS { +struct hip_useBuiltinCommon { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value; }; -template <> -struct HipAtomicCAS<4> { - - /*! - * Generic impementation of any atomic 32-bit operator. - * Implementation uses the existing HIP supplied unsigned 32-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 32-bit T - unsigned oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ((readback = hip_atomic_CAS((unsigned volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } +/*! + * Type trait for determining if atomic operators should be implemented + * by reinterpreting inputs to types that the builtin functions support. + * This type trait can be used for a lot of atomic operators. More specific + * type traits are added when needed, such as hip_useReinterpretExchange + * below. + */ +template +struct hip_useReinterpretCommon { + static constexpr bool value = + !hip_useBuiltinCommon::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; }; -template <> -struct HipAtomicCAS<8> { - - /*! - * Generic impementation of any atomic 64-bit operator. - * Implementation uses the existing HIP supplied unsigned 64-bit CAS - * operator. Returns the OLD value that was replaced by the result of this - * operation. - */ - template - RAJA_INLINE __device__ T operator()(T volatile *acc, OPER const &oper) const - { - // asserts in RAJA::util::reinterp_T_as_u and RAJA::util::reinterp_u_as_T - // will enforce 64-bit T - unsigned long long oldval, newval, readback; - oldval = RAJA::util::reinterp_A_as_B(*acc); - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - while ( - (readback = hip_atomic_CAS((unsigned long long volatile*)acc, oldval, newval)) != - oldval) { - oldval = readback; - newval = RAJA::util::reinterp_A_as_B( - oper(RAJA::util::reinterp_A_as_B(oldval))); - } - return RAJA::util::reinterp_A_as_B(oldval); - } -}; + +/*! 
+ * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretCommon_t = typename hip_useReinterpretCommon::type; /*! - * Generic impementation of any atomic 32-bit or 64-bit operator that can be - * implemented using a compare and swap primitive. - * Implementation uses the existing HIP supplied unsigned 32-bit and 64-bit - * CAS operators. - * Returns the OLD value that was replaced by the result of this operation. + * Performs an atomic bitwise or using a builtin function. Stores the new value + * in the given address and returns the old value. + * + * This overload using builtin functions is used to implement atomic loads + * under some build configurations. */ -template -RAJA_INLINE __device__ T hip_atomic_CAS_oper(T volatile *acc, OPER &&oper) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value) { - HipAtomicCAS cas; - return cas(acc, std::forward(oper)); + return ::atomicOr(acc, value); } -template < typename T, typename TypeList > -struct is_any_of; +/*! + * Type trait for determining if the exchange operator should be implemented + * using a builtin + */ +template +struct hip_useBuiltinExchange { + static constexpr bool value = + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value; +}; -template < typename T, typename... Types > -struct is_any_of> - : concepts::any_of...> -{}; +/*! + * Type trait for determining if the exchange operator should be implemented + * by reinterpreting inputs to types that the builtin exchange supports + */ +template +struct hip_useReinterpretExchange { + static constexpr bool value = + !hip_useBuiltinExchange::value && + (sizeof(T) == sizeof(unsigned int) || + sizeof(T) == sizeof(unsigned long long)); + + using type = + std::conditional_t; +}; -template < typename T, typename TypeList > -using enable_if_is_any_of = std::enable_if_t::value, T>; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange::type; -template < typename T, typename TypeList > -using enable_if_is_none_of = std::enable_if_t>::value, T>; +/*! + * Performs an atomic exchange using a builtin function. Stores the new value + * in the given address and returns the old value. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value) +{ + return ::atomicExch(acc, value); +} +/*! + * Performs an atomic exchange using a reinterpret cast. Stores the new value + * in the given address and returns the old value. + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value) +{ + using R = hip_useReinterpretExchange_t; -using hip_atomicCommon_builtin_types = list< - int - ,unsigned int - ,unsigned long long - >; + return RAJA::util::reinterp_A_as_B( + hip_atomicExchange(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value))); +} -using hip_atomicAdd_builtin_types = list< - int - ,unsigned int - ,unsigned long long - ,float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double -#endif - >; +#if defined(__has_builtin) && \ + (__has_builtin(__hip_atomic_load) || __has_builtin(__hip_atomic_store)) /*! - * List of types where HIP builtin atomics are used to implement atomicSub. 
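// The traits above route every atomic either to a native builtin or to the same
// operation on a same-sized unsigned integer. The standalone, host-only analogue
// below (hypothetical names, not RAJA's) only demonstrates which path a few types
// take; the real hip_useBuiltinExchange / hip_useReinterpretExchange select the
// same way for these examples.
#include <type_traits>

template <typename T>
struct use_builtin_exchange
    : std::integral_constant<bool, std::is_same<T, int>::value ||
                                   std::is_same<T, unsigned int>::value ||
                                   std::is_same<T, unsigned long long>::value ||
                                   std::is_same<T, float>::value> {};

template <typename T>
struct use_reinterpret_exchange
    : std::integral_constant<bool, !use_builtin_exchange<T>::value &&
                                   (sizeof(T) == sizeof(unsigned int) ||
                                    sizeof(T) == sizeof(unsigned long long))> {
  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
                                  unsigned int, unsigned long long>;
};

// float has a native ::atomicExch, so it takes the builtin overload ...
static_assert(use_builtin_exchange<float>::value, "float uses the builtin path");
// ... while double is exchanged through a same-sized unsigned long long.
static_assert(use_reinterpret_exchange<double>::value, "double is reinterpreted");
static_assert(std::is_same<use_reinterpret_exchange<double>::type,
                           unsigned long long>::value, "double maps to ull");
// A 16-byte type matches neither trait, so no exchange overload exists for it.
struct TwoDoubles { double x, y; };
static_assert(!use_builtin_exchange<TwoDoubles>::value &&
              !use_reinterpret_exchange<TwoDoubles>::value, "no overload");

int main() { return 0; }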
+ * Type trait for determining if the operator should be implemented + * using an intrinsic */ -using hip_atomicSub_types = list< - int - ,unsigned int - ,float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double -#endif - >; +template +struct hip_useBuiltinLoad { + static constexpr bool value = + (std::is_integral::value || std::is_enum::value) && + (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8); +}; + +template +using hip_useBuiltinStore = hip_useBuiltinLoad; -using hip_atomicSub_builtin_types = list< - int - ,unsigned int - >; /*! - * List of types where HIP builtin atomicAdd is used to implement atomicSub. - * - * Avoid multiple definition errors by including the previous list type here - * to ensure these lists have different types. + * Type trait for determining if the operator should be implemented + * by reinterpreting inputs to types that intrinsics support */ -using hip_atomicSub_via_Add_builtin_types = list< - float -#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD - ,double +template +struct hip_useReinterpretLoad { + static constexpr bool value = + !std::is_integral::value && + !std::is_enum::value && + ((sizeof(T) == 1 +#if !defined(UINT8_MAX) + && sizeof(unsigned char) == 1 #endif - >; - -using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types; + ) || + (sizeof(T) == 2 +#if !defined(UINT16_MAX) + && sizeof(unsigned short) == 2 +#endif + ) || + (sizeof(T) == 4 +#if !defined(UINT32_MAX) + && sizeof(unsigned int) == 4 +#endif + ) || + (sizeof(T) == 8 +#if !defined(UINT64_MAX) + && sizeof(unsigned long long) == 8 +#endif + )); -using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types; + using type = + std::conditional_t>>; +#else + unsigned long long>>>; +#endif +}; -using hip_atomicIncReset_builtin_types = list< - unsigned int - >; +template +using hip_useReinterpretStore = hip_useReinterpretLoad; -using hip_atomicInc_builtin_types = list< >; +#else -using hip_atomicDecReset_builtin_types = list< - unsigned int - >; +template +using hip_useBuiltinLoad = hip_useBuiltinCommon; -using hip_atomicDec_builtin_types = list< >; +template +using hip_useBuiltinStore = hip_useBuiltinExchange; -using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretLoad = hip_useReinterpretCommon; -using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types; +template +using hip_useReinterpretStore = hip_useReinterpretExchange; -using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types; +#endif -using hip_atomicExch_builtin_types = list< - int - ,unsigned int - ,unsigned long long - ,float - >; +/*! + * Alias for determining the integral type of the same size as the given type + */ +template +using hip_useReinterpretLoad_t = typename hip_useReinterpretLoad::type; -using hip_atomicCAS_builtin_types = hip_atomicCommon_builtin_types; +template +using hip_useReinterpretStore_t = typename hip_useReinterpretStore::type; -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAdd(T volatile *acc, T value) +/*! 
+ * Atomic load + */ +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicLoad(T *acc) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a + value; - }); +#if defined(__has_builtin) && __has_builtin(__hip_atomic_load) + return __hip_atomic_load(acc, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + return hip_atomicOr(acc, static_cast(0)); +#endif } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAdd(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicLoad(T *acc) { - return ::atomicAdd((T *)acc, value); + using R = hip_useReinterpretLoad_t; + + return RAJA::util::reinterp_A_as_B( + hip_atomicLoad(reinterpret_cast(acc))); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +/*! + * Atomic store + */ +template ::value, bool> = true> +RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a - value; - }); +#if defined(__has_builtin) && __has_builtin(__hip_atomic_store) + __hip_atomic_store(acc, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + hip_atomicExchange(acc, value); +#endif +} + +template ::value, bool> = true> +RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value) +{ + using R = hip_useReinterpretStore_t; + + hip_atomicStore(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(value)); } + /*! - * HIP atomicSub builtin implementation. + * Hip atomicCAS using builtin function + * + * Returns the old value in memory before this operation. */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value) { - return ::atomicSub((T *)acc, value); + return ::atomicCAS(acc, compare, value); } /*! - * HIP atomicSub via atomicAdd builtin implementation. + * Hip atomicCAS using reinterpret cast + * + * Returns the old value in memory before this operation. */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicSub(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value) { - return ::atomicAdd((T *)acc, -value); + using R = hip_useReinterpretCommon_t; + + return RAJA::util::reinterp_A_as_B( + hip_atomicCAS(reinterpret_cast(acc), + RAJA::util::reinterp_A_as_B(compare), + RAJA::util::reinterp_A_as_B(value))); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMin(T volatile *acc, T value) +/*! + * Equality comparison for compare and swap loop. Converts to the underlying + * integral type to avoid cases where the values will never compare equal + * (most notably, NaNs). + */ +template ::value, bool> = true> +RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return value < a ? value : a; - }); + return a == b; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMin(T volatile *acc, T value) +template ::value, bool> = true> +RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b) { - return ::atomicMin((T *)acc, value); + using R = hip_useReinterpretCommon_t; + + return hip_atomicCAS_equal(RAJA::util::reinterp_A_as_B(a), + RAJA::util::reinterp_A_as_B(b)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMax(T volatile *acc, T value) +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator. 
+ * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc, + Oper&& oper) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return value > a ? value : a; - }); + T old = hip_atomicLoad(acc); + T expected; + + do { + expected = old; + old = hip_atomicCAS(acc, expected, oper(expected)); + } while (!hip_atomicCAS_equal(old, expected)); + + return old; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicMax(T volatile *acc, T value) + +/*! + * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting. + * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS + * operator. Returns the OLD value that was replaced by the result of this + * operation. + */ +template +RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc, + Oper&& oper, + ShortCircuit&& sc) { - return ::atomicMax((T *)acc, value); + T old = hip_atomicLoad(acc); + + if (sc(old)) { + return old; + } + + T expected; + + do { + expected = old; + old = hip_atomicCAS(acc, expected, oper(expected)); + } while (!hip_atomicCAS_equal(old, expected) && !sc(old)); + + return old; } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc, T val) +/*! + * Atomic addition + */ + +/*! + * List of types where HIP builtin atomics are used to implement atomicAdd. + */ +using hip_atomicAdd_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T old) { - return ((old >= val) ? (T)0 : (old + (T)1)); + return hip_atomicCAS_loop(acc, [value] (T old) { + return old + value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value) { - return ::atomicInc((T *)acc, val); + return ::atomicAdd(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc) +/*! + * Atomic subtraction + */ + +/*! + * List of types where HIP builtin atomics are used to implement atomicSub. + */ +using hip_atomicSub_builtin_types = ::camp::list< + int, + unsigned int, + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +/*! + * List of types where HIP builtin atomicSub is used to implement atomicSub. + * + * Avoid multiple definition errors by including the previous list type here + * to ensure these lists have different types. + */ +using hip_atomicSub_via_Sub_builtin_types = ::camp::list< + int, + unsigned int +>; + +/*! + * List of types where HIP builtin atomicAdd is used to implement atomicSub. + * + * Avoid multiple definition errors by including the previous list type here + * to ensure these lists have different types. + */ +using hip_atomicSub_via_Add_builtin_types = ::camp::list< + unsigned long long, + float +#ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD + , + double +#endif +>; + +/*! + * HIP atomicSub compare and swap loop implementation. + */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) +{ + return hip_atomicCAS_loop(acc, [value] (T old) { + return old - value; + }); +} + +/*! + * HIP atomicSub builtin implementation. 
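// hip_atomicCAS_loop also makes it easy to express operators that have no
// hardware builtin at all. As a sketch only (RAJA does not provide an atomic
// multiply; the name hip_atomicMul is hypothetical), a new operator would follow
// the same shape as the CAS fallback of hip_atomicAdd above:
template <typename T>
RAJA_INLINE __device__ T hip_atomicMul(T *acc, T value)
{
  return hip_atomicCAS_loop(acc, [value] (T old) {
    return old * value;
  });
}
// Types with no same-sized unsigned counterpart find no usable hip_atomicCAS
// overload, so misuse is caught at compile time.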
+ */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) { - return hip_atomicAdd(acc, (T)1); + return ::atomicSub(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicInc(T volatile *acc) +/*! + * HIP atomicSub via atomicAdd builtin implementation. + */ +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value) { - return ::atomicInc((T *)acc); + return ::atomicAdd(acc, -value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc, T val) +/*! + * Atomic minimum + */ +using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value) { - // See: - // http://docs.nvidia.com/hip/hip-c-programming-guide/index.html#atomicdec - return hip_atomic_CAS_oper(acc, [=] __device__(T old) { - return (((old == (T)0) | (old > val)) ? val : (old - (T)1)); - }); + return hip_atomicCAS_loop( + acc, + [value] (T old) { + return value < old ? value : old; + }, + [value] (T current) { + return current <= value; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value) { - return ::atomicDec((T *)acc, val); + return ::atomicMin(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc) +/*! + * Atomic maximum + */ +using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value) { - return hip_atomicSub(acc, (T)1); + return hip_atomicCAS_loop( + acc, + [value] (T old) { + return old < value ? value : old; + }, + [value] (T current) { + return value <= current; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicDec(T volatile *acc) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value) { - return ::atomicDec((T *)acc); + return ::atomicMax(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAnd(T volatile *acc, T val) +/*! + * Atomic increment with reset + */ +template +RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a & val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return value <= old ? static_cast(0) : old + static_cast(1); }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicAnd(T volatile *acc, T val) + +/*! + * Atomic increment (implemented in terms of atomic addition) + */ +template +RAJA_INLINE __device__ T hip_atomicInc(T *acc) { - return ::atomicAnd((T *)acc, val); + return hip_atomicAdd(acc, static_cast(1)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicOr(T volatile *acc, T val) +/*! + * Atomic decrement with reset + */ +template +RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a | val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old == static_cast(0) || value < old ? value : old - static_cast(1); }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicOr(T volatile *acc, T val) + +/*! + * Atomic decrement (implemented in terms of atomic subtraction) + */ +template +RAJA_INLINE __device__ T hip_atomicDec(T *acc) { - return ::atomicOr((T *)acc, val); + return hip_atomicSub(acc, static_cast(1)); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicXor(T volatile *acc, T val) +/*! 
+ * Atomic and + */ +using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T a) { - return a ^ val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old & value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicXor(T volatile *acc, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value) { - return ::atomicXor((T *)acc, val); + return ::atomicAnd(acc, value); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicExchange(T volatile *acc, T val) +/*! + * Atomic or + */ +using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value) { - return hip_atomic_CAS_oper(acc, [=] __device__(T) { - return val; + return hip_atomicCAS_loop(acc, [value] (T old) { + return old | value; }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicExchange(T volatile *acc, T val) -{ - return ::atomicExch((T *)acc, val); -} +/*! + * Atomic or via builtin functions was implemented much earlier since atomicLoad + * may depend on it. + */ -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicCAS(T volatile *acc, T compare, T val) +/*! + * Atomic xor + */ +using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types; + +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value) { - return hip_atomic_CAS(acc, compare, val); + return hip_atomicCAS_loop(acc, [value] (T old) { + return old ^ value; + }); } -template * = nullptr> -RAJA_INLINE __device__ T hip_atomicCAS( T volatile *acc, T compare, T val) +template * = nullptr> +RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value) { - return ::atomicCAS((T *)acc, compare, val); + return ::atomicXor(acc, value); } + } // namespace detail @@ -474,10 +718,35 @@ RAJA_INLINE __device__ T hip_atomicCAS( T volatile *acc, T compare, T val) * * These are atomic in hip device code and non-atomic otherwise */ + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE T +atomicLoad(hip_atomic_explicit, T *acc) +{ +#if defined(__HIP_DEVICE_COMPILE__) + return detail::hip_atomicLoad(acc); +#else + return RAJA::atomicLoad(host_policy{}, acc); +#endif +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_INLINE RAJA_HOST_DEVICE void +atomicStore(hip_atomic_explicit, T *acc, T value) +{ +#if defined(__HIP_DEVICE_COMPILE__) + detail::hip_atomicStore(acc, value); +#else + RAJA::atomicStore(host_policy{}, acc, value); +#endif +} + RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAdd(hip_atomic_explicit, T volatile *acc, T value) +atomicAdd(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicAdd(acc, value); @@ -489,7 +758,7 @@ atomicAdd(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicSub(hip_atomic_explicit, T volatile *acc, T value) +atomicSub(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicSub(acc, value); @@ -501,7 +770,7 @@ atomicSub(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMin(hip_atomic_explicit, T volatile *acc, T value) +atomicMin(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicMin(acc, value); @@ -513,7 +782,7 @@ 
atomicMin(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicMax(hip_atomic_explicit, T volatile *acc, T value) +atomicMax(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicMax(acc, value); @@ -525,19 +794,19 @@ atomicMax(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(hip_atomic_explicit, T volatile *acc, T val) +atomicInc(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) - return detail::hip_atomicInc(acc, val); + return detail::hip_atomicInc(acc, value); #else - return RAJA::atomicInc(host_policy{}, acc, val); + return RAJA::atomicInc(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicInc(hip_atomic_explicit, T volatile *acc) +atomicInc(hip_atomic_explicit, T *acc) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicInc(acc); @@ -549,19 +818,19 @@ atomicInc(hip_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(hip_atomic_explicit, T volatile *acc, T val) +atomicDec(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) - return detail::hip_atomicDec(acc, val); + return detail::hip_atomicDec(acc, value); #else - return RAJA::atomicDec(host_policy{}, acc, val); + return RAJA::atomicDec(host_policy{}, acc, value); #endif } RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicDec(hip_atomic_explicit, T volatile *acc) +atomicDec(hip_atomic_explicit, T *acc) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicDec(acc); @@ -573,7 +842,7 @@ atomicDec(hip_atomic_explicit, T volatile *acc) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicAnd(hip_atomic_explicit, T volatile *acc, T value) +atomicAnd(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicAnd(acc, value); @@ -585,7 +854,7 @@ atomicAnd(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicOr(hip_atomic_explicit, T volatile *acc, T value) +atomicOr(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicOr(acc, value); @@ -597,7 +866,7 @@ atomicOr(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicXor(hip_atomic_explicit, T volatile *acc, T value) +atomicXor(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicXor(acc, value); @@ -609,7 +878,7 @@ atomicXor(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicExchange(hip_atomic_explicit, T volatile *acc, T value) +atomicExchange(hip_atomic_explicit, T *acc, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicExchange(acc, value); @@ -621,7 +890,7 @@ atomicExchange(hip_atomic_explicit, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_INLINE RAJA_HOST_DEVICE T -atomicCAS(hip_atomic_explicit, T volatile *acc, T compare, T value) +atomicCAS(hip_atomic_explicit, T *acc, T compare, T value) { #if defined(__HIP_DEVICE_COMPILE__) return detail::hip_atomicCAS(acc, compare, value); diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6fa21f9217..a8c4cf53b9 100644 --- a/include/RAJA/policy/hip/forall.hpp 
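// Caller-facing view of the wrappers above: the hip_atomic_explicit overloads now
// take plain T* instead of T volatile*, but user code is unchanged. A minimal
// sketch, assuming RAJA's public forall/atomic interface (RAJA::hip_atomic,
// RAJA::hip_exec, RAJA::RangeSegment) and that d_bins/d_hist point to
// device-accessible memory; the function name and block size are illustrative only.
#include "RAJA/RAJA.hpp"

void histogram(const int* d_bins, double* d_hist, int n)
{
  RAJA::forall<RAJA::hip_exec<256>>(RAJA::RangeSegment(0, n),
    [=] RAJA_DEVICE (int i) {
      // Dispatches to detail::hip_atomicAdd when compiled for the device and to
      // the host policy otherwise, per the __HIP_DEVICE_COMPILE__ switch above.
      RAJA::atomicAdd<RAJA::hip_atomic>(&d_hist[d_bins[i]], 1.0);
    });
}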
+++ b/include/RAJA/policy/hip/forall.hpp @@ -560,7 +560,7 @@ forall_impl(resources::Hip hip_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::hip::make_launch_body( + LOOP_BODY body = RAJA::hip::make_launch_body(func, dims.blocks, dims.threads, shmem, hip_res, std::forward(loop_body)); // @@ -610,7 +610,8 @@ forall_impl(resources::Hip hip_res, if (len > 0) { auto func = reinterpret_cast( - &impl::forallp_hip_kernel>); + &impl::forallp_hip_kernel>); // // Setup shared memory buffers @@ -636,7 +637,7 @@ forall_impl(resources::Hip hip_res, // // Privatize the loop_body, using make_launch_body to setup reductions // - LOOP_BODY body = RAJA::hip::make_launch_body( + LOOP_BODY body = RAJA::hip::make_launch_body(func, dims.blocks, dims.threads, shmem, hip_res, std::forward(loop_body)); // diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index 354e5d7278..c72a0b5c4f 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -233,10 +233,10 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) T temp = val; - if (numThreads % policy::hip::WARP_SIZE == 0) { + if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -244,7 +244,7 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) } else { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -269,7 +269,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) { T temp = val; - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -287,15 +287,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; + int warpId = threadId % policy::hip::device_constants.WARP_SIZE; + int warpNum = threadId / policy::hip::device_constants.WARP_SIZE; T temp = val; - if (numThreads % policy::hip::WARP_SIZE == 0) { + if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } @@ -303,7 +303,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -314,14 +314,14 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { + if (numThreads > policy::hip::device_constants.WARP_SIZE) { - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - 
"Max Warps must be less than or equal to Warp Size for this algorithm to work"); + static_assert(policy::hip::device_constants.MAX_WARPS <= policy::hip::device_constants.WARP_SIZE, + "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values"); - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -333,13 +333,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { + if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2) { T rhs = shfl_xor_sync(temp, i); Combiner{}(temp, rhs); } diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 848ea42edf..39e7104c16 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -283,7 +283,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static @@ -312,7 +312,7 @@ struct HipStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::hip::WARP_SIZE; + const diff_t len = RAJA::policy::hip::device_constants.WARP_SIZE; // request one thread per element in the segment set_hip_dim(dims.dims.threads, len); @@ -352,7 +352,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static @@ -391,7 +391,7 @@ struct HipStatementExecutor< // we always get EXACTLY one warp by allocating one warp in the X // dimension - const diff_t len = RAJA::policy::hip::WARP_SIZE; + const diff_t len = RAJA::policy::hip::device_constants.WARP_SIZE; // request one thread per element in the segment set_hip_dim(dims.dims.threads, len); diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 014b4db3ac..ba6642f248 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -273,7 +273,7 @@ struct HipStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static inline RAJA_DEVICE @@ -332,7 +332,7 @@ struct HipStatementExecutor< using mask_t = Mask; - static_assert(mask_t::max_masked_size <= RAJA::policy::hip::WARP_SIZE, + static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE, "BitMask is too large for HIP warp size"); static inline RAJA_DEVICE diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 68156600b2..1ed7740008 100644 --- 
a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -216,7 +216,7 @@ struct HipKernelLauncherGetter using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncherFixed; + return &internal::HipKernelLauncherFixed; } }; @@ -230,7 +230,7 @@ struct HipKernelLauncherGetter<0, Data, executor_t> using type = camp::decay)>; static constexpr type get() noexcept { - return internal::HipKernelLauncher; + return &internal::HipKernelLauncher; } }; @@ -260,10 +260,15 @@ struct HipLaunchHelper,Stmt using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>; + inline static const void* get_func() + { + return reinterpret_cast(kernelGetter_t::get()); + } + inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -342,7 +347,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = reinterpret_cast(kernelGetter_t::get()); + auto func = Self::get_func(); if (num_blocks <= 0) { @@ -379,17 +384,6 @@ struct HipLaunchHelper,Stmt } } - - static void launch(Data &&data, - internal::LaunchDims launch_dims, - size_t shmem, - RAJA::resources::Hip res) - { - auto func = kernelGetter_t::get(); - - void *args[] = {(void*)&data}; - RAJA::hip::launch((const void*)func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, async); - } }; /*! @@ -571,17 +565,23 @@ struct StatementExecutor< } { + auto func = launch_t::get_func(); + // // Privatize the LoopData, using make_launch_body to setup reductions // - auto hip_data = RAJA::hip::make_launch_body( + // Note that there is a circular dependency between the previous setup + // of the launch_dims and potential changes to shmem here that is + // currently an unresolved issue. 
+ // + auto hip_data = RAJA::hip::make_launch_body(func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data); - // - // Launch the kernels + // Launch the kernel // - launch_t::launch(std::move(hip_data), launch_dims, shmem, res); + void *args[] = {(void*)&hip_data}; + RAJA::hip::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async); } } } diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 76f592d20b..6823647b48 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -75,7 +75,8 @@ struct LaunchExecute; - auto func = launch_global_fcn; + auto func = reinterpret_cast( + &launch_global_fcn); resources::Hip hip_res = res.get(); @@ -99,17 +100,19 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); } RAJA_FT_END; @@ -129,7 +132,8 @@ struct LaunchExecute; - auto func = reinterpret_cast(launch_new_reduce_global_fcn >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn>); resources::Hip hip_res = res.get(); @@ -152,9 +156,11 @@ struct LaunchExecute(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } @@ -235,7 +241,8 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = launch_global_fcn_fixed; + auto func = reinterpret_cast( + &launch_global_fcn_fixed); resources::Hip hip_res = res.get(); @@ -259,17 +266,18 @@ struct LaunchExecute> { RAJA_FT_BEGIN; { + size_t shared_mem_size = params.shared_mem_size; // // Privatize the loop_body, using make_launch_body to setup reductions // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, params.shared_mem_size, hip_res, std::forward(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); } RAJA_FT_END; @@ -288,7 +296,8 @@ struct LaunchExecute> { { using BODY = camp::decay; - auto func = reinterpret_cast(launch_new_reduce_global_fcn_fixed >); + auto func = reinterpret_cast( + &launch_new_reduce_global_fcn_fixed>); resources::Hip hip_res = res.get(); @@ -311,9 +320,11 @@ struct LaunchExecute> { RAJA_FT_BEGIN; + size_t shared_mem_size = launch_params.shared_mem_size; RAJA::hip::detail::hipInfo launch_info; launch_info.gridDim = gridSize; launch_info.blockDim = blockSize; + launch_info.dynamic_smem = &shared_mem_size; launch_info.res = hip_res; { @@ -323,14 +334,14 @@ struct LaunchExecute> { // // Privatize the 
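// Taken together, the launch-site changes in this patch settle on one sequence:
// resolve the kernel function pointer first, pass it and a mutable shared-memory
// size into make_launch_body so reducers and multi-reducers can inspect the kernel
// (maxDynamicShmem) and grow the dynamic shared memory request, then launch with
// the possibly-updated size. Condensed from the hunks above (error handling and
// template arguments elided):
//
//   const void* func = /* launch_t::get_func(), or &launch_global_fcn<BODY>, ... */;
//   size_t shared_mem_size = params.shared_mem_size;
//
//   BODY body = RAJA::hip::make_launch_body(
//       func, gridSize, blockSize, shared_mem_size, hip_res,
//       std::forward<BODY_IN>(body_in));
//
//   void* args[] = {(void*)&body};
//   RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
//                     hip_res, async, kernel_name);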
loop_body, using make_launch_body to setup reductions // - BODY body = RAJA::hip::make_launch_body( - gridSize, blockSize, launch_params.shared_mem_size, hip_res, std::forward(body_in)); + BODY body = RAJA::hip::make_launch_body(func, + gridSize, blockSize, shared_mem_size, hip_res, std::forward(body_in)); // // Launch the kernel // void *args[] = {(void*)&body, (void*)&launch_reducers}; - RAJA::hip::launch((const void*)func, gridSize, blockSize, args, launch_params.shared_mem_size, hip_res, async, kernel_name); + RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name); RAJA::expt::ParamMultiplexer::resolve(launch_reducers, launch_info); } diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp new file mode 100644 index 0000000000..0d9d3899d8 --- /dev/null +++ b/include/RAJA/policy/hip/multi_reduce.hpp @@ -0,0 +1,764 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_multi_reduce_HPP +#define RAJA_hip_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include +#include +#include +#include + +#include "hip/hip_runtime.h" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/mutex.hpp" +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/OffsetOperators.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" + +#if defined(RAJA_ENABLE_DESUL_ATOMICS) + #include "RAJA/policy/desul/atomic.hpp" +#else + #include "RAJA/policy/hip/atomic.hpp" +#endif + +#include "RAJA/policy/hip/policy.hpp" +#include "RAJA/policy/hip/raja_hiperrchk.hpp" + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction algorithms. +// +////////////////////////////////////////////////////////////////////// +// + +//! combine value into global memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins), + T identity, + int bin, + T value, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + if (value == identity) { return; } + + int tally_index = GetTallyIndex::template index(); // globalWarpId by default + int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::hip::atomic{}(tally_mem[tally_offset], value); +} + + +//! 
initialize shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins, + T identity, + T* shared_mem, + int shared_replication) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + for (int shmem_offset = threadId; + shmem_offset < shared_replication * num_bins; + shmem_offset += numThreads) { + shared_mem[shmem_offset] = identity; + } + __syncthreads(); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins, + T identity, + int bin, + T value, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication) +{ + if (value == identity) { return; } + + int shared_index = GetSharedIndex::template index(); // threadId by default + int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication); + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + + RAJA::reduce::hip::atomic{}(shared_mem[shmem_offset], value); +} + +//! combine value into shared memory +template +RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins, + T identity, + T* shared_mem, + GetSharedOffset get_shared_offset, + int shared_replication, + T* tally_mem, + GetTallyOffset get_tally_offset, + int tally_replication, + int tally_bins) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + __syncthreads(); + for (int bin = threadId; bin < num_bins; bin += numThreads) { + + T value = identity; + for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) { + int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication); + Combiner{}(value, shared_mem[shmem_offset]); + } + + if (value != identity) { + int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication); + int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication); + RAJA::reduce::hip::atomic{}(tally_mem[tally_offset], value); + } + + } +} + +} // namespace impl + +// +////////////////////////////////////////////////////////////////////// +// +// MultiReduction classes. +// +////////////////////////////////////////////////////////////////////// +// + +//! MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_TallyData +{ + //! 
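// The routines above implement a two-level combine: each value is first folded
// atomically into one of several replicated copies of its bin in shared memory,
// and each block then folds those copies once into a replicated global tally.
// The host-side analogue below (plain C++, not RAJA code) shows the data flow;
// the real offset layout comes from the tuning's OffsetCalculator, while a simple
// bin-major layout is assumed here for illustration.
#include <iostream>
#include <vector>

int main()
{
  const int num_bins = 3, shared_replication = 4;
  std::vector<double> shared(num_bins * shared_replication, 0.0);  // init to identity
  std::vector<double> tally(num_bins, 0.0);

  // "combine": thread t adds value v to bin b using replica (t % replication),
  // which spreads atomic traffic across the replicas.
  auto combine = [&](int t, int b, double v) {
    shared[b * shared_replication + (t % shared_replication)] += v;  // atomic on device
  };
  for (int t = 0; t < 64; ++t) { combine(t, t % num_bins, 1.0); }

  // "finalize": fold each bin's replicas, then push the result to the global tally.
  for (int b = 0; b < num_bins; ++b) {
    for (int r = 0; r < shared_replication; ++r) {
      tally[b] += shared[b * shared_replication + r];                // atomic on device
    }
  }

  for (int b = 0; b < num_bins; ++b) {
    std::cout << "bin " << b << ": " << tally[b] << "\n";            // prints 22, 21, 21
  }
}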
setup permanent settings, allocate and initialize tally memory + template < typename Container > + MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity) + : m_tally_mem(nullptr) + , m_identity(identity) + , m_num_bins(container.size()) + , m_tally_bins(get_tally_bins(m_num_bins)) + , m_tally_replication(get_tally_replication()) + { + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } + + MultiReduceGridAtomicHostInit_TallyData() = delete; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default; + MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete; + ~MultiReduceGridAtomicHostInit_TallyData() = default; + + + //! reset permanent settings, reallocate and reset tally memory + template < typename Container > + void reset_permanent(Container const& container, T const& identity) + { + int new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + teardown_permanent(); + m_num_bins = new_num_bins; + m_tally_bins = get_tally_bins(m_num_bins); + m_tally_replication = get_tally_replication(); + m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication); + } else { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value; + ++bin; + } + } + for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) { + for (int bin = 0; bin < m_num_bins; ++bin) { + m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity; + } + } + } + m_identity = identity; + } + + //! teardown permanent settings, free tally memory + void teardown_permanent() + { + destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication); + } + + + //! 
get value for bin, assumes synchronization occurred elsewhere + T get(int bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) { + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + reducer.combine(m_tally_mem[tally_offset]); + } + return reducer.get_and_clear(); + } + + + int num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + +private: + static constexpr size_t s_tally_alignment = std::max(size_t(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE), + size_t(RAJA::DATA_ALIGN)); + static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T)); + + using tally_mempool_type = device_pinned_mempool_type; + using tally_tuning = typename tuning::GlobalAtomicReplicationTuning; + using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer; + using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator; + using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch; + + + static int get_tally_bins(int num_bins) + { + return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size; + } + + static int get_tally_replication() + { + int min_tally_replication = 1; +#if defined(RAJA_ENABLE_OPENMP) + min_tally_replication = omp_get_max_threads(); +#endif + + struct { + int func_min_global_replication; + } func_data{min_tally_replication}; + + return TallyAtomicReplicationConcretizer{}.template + get_global_replication(func_data); + } + + template < typename Container > + static T* create_tally(Container const& container, T const& identity, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + T* tally_mem = tally_mempool_type::getInstance().template malloc( + tally_replication*tally_bins, s_tally_alignment); + + if (tally_replication > 0) { + { + int tally_rep = 0; + int bin = 0; + for (auto const& value : container) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(value); + ++bin; + } + } + for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) { + for (int bin = 0; bin < num_bins; ++bin) { + int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication); + new(&tally_mem[tally_offset]) T(identity); + } + } + } + return tally_mem; + } + + static void destroy_tally(T*& tally_mem, + int num_bins, int tally_bins, int tally_replication) + { + if (num_bins == size_t(0)) { + return; + } + + for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) { + for (int bin = num_bins; bin > 0; --bin) { + int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication); + tally_mem[tally_offset].~T(); + } + } + tally_mempool_type::getInstance().free(tally_mem); + tally_mem = nullptr; + } + +protected: + using GetTallyIndex = typename tally_tuning::ReplicationIndexer; + using GetTallyOffset = typename GetTallyOffset_rebind::template rebind; + + T* m_tally_mem; + T m_identity; + int m_num_bins; + int m_tally_bins; + int m_tally_replication; // power of 2, at least the max number of omp threads +}; + + +//! 
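// Worked example of the tally sizing used above. The concrete alignment is
// platform-dependent (s_tally_alignment is the larger of the atomic destructive
// interference size and RAJA::DATA_ALIGN); 128 bytes and 4 tally replicas are
// assumptions made only for this illustration.
#include <cstddef>
#include <iostream>

constexpr std::size_t divide_ceiling(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

int main()
{
  constexpr std::size_t tally_alignment = 128;                                        // assumed
  constexpr std::size_t bunch_size = divide_ceiling(tally_alignment, sizeof(double)); // 16
  constexpr std::size_t num_bins = 10;
  constexpr std::size_t tally_bins = divide_ceiling(num_bins, bunch_size) * bunch_size; // 16
  constexpr std::size_t tally_replication = 4;  // e.g. next power of 2 >= omp thread count

  // Rounding the bin count up to whole "bunches" keeps each replica's bins in
  // fully aligned blocks, so different replicas do not false-share.
  std::cout << tally_replication * tally_bins << " doubles in the tally\n";           // 64
}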
MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! defer to tally data for some functions + using TallyData::TallyData; + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! setup per launch, do nothing + void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) + { } + + //! teardown per launch, do nothing + void teardown_launch() + { } + + + //! setup on device, do nothing + RAJA_DEVICE + void setup_device() + { } + + //! finalize on device, do nothing + RAJA_DEVICE + void finalize_device() + { } + + + //! combine value on device, combine a value into the tally atomically + RAJA_DEVICE + void combine_device(int bin, T value) + { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + + //! combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; +}; + + +//! MultiReduction data for Hip Offload -- stores value, host pointer +template +struct MultiReduceBlockThenGridAtomicHostInit_Data + : MultiReduceGridAtomicHostInit_TallyData +{ + using TallyData = MultiReduceGridAtomicHostInit_TallyData; + + //! setup permanent settings, defer to tally data + template < typename Container > + MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity) + : TallyData(container, identity) + , m_shared_offset(s_shared_offset_unknown) + , m_shared_replication(0) + { } + + MultiReduceBlockThenGridAtomicHostInit_Data() = delete; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default; + MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete; + ~MultiReduceBlockThenGridAtomicHostInit_Data() = default; + + + //! defer to tally data for some functions + using TallyData::reset_permanent; + using TallyData::teardown_permanent; + using TallyData::get; + using TallyData::num_bins; + using TallyData::identity; + + //! 
setup per launch, setup shared memory parameters + void setup_launch(size_t block_size) + { + if (m_num_bins == size_t(0)) { + m_shared_offset = s_shared_offset_invalid; + return; + } + + size_t shared_replication = 0; + const size_t shared_offset = allocateDynamicShmem( + [&](size_t max_shmem_size) { + + struct { + size_t func_threads_per_block; + size_t func_max_shared_replication_per_block; + } func_data{block_size, max_shmem_size / m_num_bins}; + + shared_replication = SharedAtomicReplicationConcretizer{}.template + get_shared_replication(func_data); + return m_num_bins * shared_replication; + }); + + if (shared_offset != dynamic_smem_allocation_failure) { + m_shared_replication = static_cast(shared_replication); + m_shared_offset = static_cast(shared_offset); + } else { + m_shared_offset = s_shared_offset_invalid; + } + } + + //! teardown per launch, unset shared memory parameters + void teardown_launch() + { + m_shared_replication = 0; + m_shared_offset = s_shared_offset_unknown; + } + + + //! setup on device, initialize shared memory + RAJA_DEVICE + void setup_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_init_shmem( + m_num_bins, m_identity, + shared_mem, m_shared_replication); + } + } + + //! finalize on device, combine values in shared memory into the tally + RAJA_DEVICE + void finalize_device() + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::grid_multi_reduce_shmem_to_global_atomic( + m_num_bins, m_identity, + shared_mem, GetSharedOffset{}, m_shared_replication, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + + //! combine value on device, combine a value into shared memory + RAJA_DEVICE + void combine_device(int bin, T value) + { + T* shared_mem = get_shared_mem(); + if (shared_mem != nullptr) { + impl::block_multi_reduce_combine_shmem_atomic( + m_num_bins, m_identity, + bin, value, + shared_mem, GetSharedOffset{}, m_shared_replication); + } else { + impl::block_multi_reduce_combine_global_atomic( + m_num_bins, m_identity, + bin, value, + m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins); + } + } + + //! 
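setup_launch above sizes the per-block shared-memory replication by asking how many copies of the bins fit in the dynamic shared memory budget, then clamping to a power of two (falling back to global atomics when nothing fits). A standalone sketch of that sizing logic follows; prev_pow2_sketch and pick_shared_replication are hypothetical helpers, and the numbers in main are illustrative only.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Largest power of two <= n; a stand-in for the prev_pow2 used by the
// shared-replication concretizers.
static std::size_t prev_pow2_sketch(std::size_t n) {
  std::size_t p = 1;
  while (p * 2 <= n) { p *= 2; }
  return p;
}

// Replication per block is capped by how many copies of the bins fit in the
// available dynamic shared memory; 0 means "fall back to global atomics".
static std::size_t pick_shared_replication(std::size_t num_bins,
                                           std::size_t bytes_per_bin,
                                           std::size_t max_shmem_bytes,
                                           std::size_t preferred_replication) {
  std::size_t max_rep = max_shmem_bytes / (num_bins * bytes_per_bin);
  std::size_t rep = std::min(preferred_replication, max_rep);
  return rep ? prev_pow2_sketch(rep) : 0;
}

int main() {
  // e.g. 10 bins of double, a 64 KiB shared memory budget, prefer 4 replicas
  std::printf("%zu\n",
              pick_shared_replication(10, sizeof(double), 64 * 1024, 4));  // 4
  return 0;
}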
combine value on host, combine a value into the tally + void combine_host(int bin, T value) + { + int tally_rep = 0; +#if defined(RAJA_ENABLE_OPENMP) + tally_rep = omp_get_thread_num(); +#endif + int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication); + Combiner{}(m_tally_mem[tally_offset], value); + } + +private: + using shared_tuning = typename tuning::SharedAtomicReplicationTuning; + using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer; + using GetSharedIndex = typename shared_tuning::ReplicationIndexer; + using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator; + using GetSharedOffset = typename GetSharedOffset_rebind::template rebind; + + using typename TallyData::GetTallyIndex; + using typename TallyData::GetTallyOffset; + + + static constexpr int s_shared_offset_unknown = std::numeric_limits::max(); + static constexpr int s_shared_offset_invalid = std::numeric_limits::max() - 1; + + + using TallyData::m_tally_mem; + using TallyData::m_identity; + using TallyData::m_num_bins; + using TallyData::m_tally_bins; + using TallyData::m_tally_replication; + + int m_shared_offset; // in bytes + int m_shared_replication; // power of 2 + + + RAJA_DEVICE + T* get_shared_mem() const + { + if (m_shared_offset == s_shared_offset_invalid) { + return nullptr; + } + extern __shared__ char shared_mem[]; + return reinterpret_cast(&shared_mem[m_shared_offset]); + } +}; + + +/*! + ************************************************************************** + * + * \brief Hip multi-reduce data class template. + * + * This class manages synchronization, data lifetimes, and interaction with + * the runtime kernel launch info passing facilities. + * + * This class manages the lifetime of underlying reduce_data_type using + * calls to setup and teardown methods. This includes storage durations: + * - permanent, the lifetime of the parent object + * - launch, setup before a launch using the launch parameters and + * teardown after the launch + * - device, setup all device threads in a kernel before any block work and + * teardown all device threads after all block work is finished + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataHip +{ + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = + std::conditional_t<(atomic_available), + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic), + hip::MultiReduceBlockThenGridAtomicHostInit_Data, + std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic), + hip::MultiReduceGridAtomicHostInit_Data, + void>>, + void>; + + + using SyncList = std::vector; + +public: + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataHip() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataHip(Container const& container, T identity) + : m_parent(this) + , m_sync_list(new SyncList) + , m_data(container, identity) + , m_own_launch_data(false) + { + } + + //! copy and on host attempt to setup for device + // init val_ptr to avoid uninitialized read caused by host copy of + // reducer in host device lambda not being used on device. 
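The class comment above distinguishes permanent, launch, and device storage durations. The tiny sketch below only illustrates the order in which those hooks are invoked around a launch; DemoData is a hypothetical type with the same hook names, not the RAJA reduce_data_type.

#include <cstddef>
#include <cstdio>

struct DemoData {
  void setup_launch(std::size_t block_size) { std::printf("setup_launch(%zu)\n", block_size); }
  void teardown_launch() { std::printf("teardown_launch()\n"); }
  // Device-duration hooks (setup_device/finalize_device) would run inside the
  // kernel, per thread, before and after any block-level work.
};

int main() {
  DemoData d;           // permanent duration: lifetime of the parent object
  d.setup_launch(256);  // launch duration: before the kernel launch
  // ... kernel runs; device threads would call combine_device(bin, value) ...
  d.teardown_launch();  // launch duration: after the launch completes
  return 0;
}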
+ RAJA_HOST_DEVICE + MultiReduceDataHip(MultiReduceDataHip const& other) +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + : m_parent(other.m_parent) +#else + : m_parent(&other) +#endif + , m_sync_list(other.m_sync_list) + , m_data(other.m_data) + , m_own_launch_data(false) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent) { + if (setupReducers()) { + // the copy made in make_launch_body does this setup + add_resource_to_synchronization_list(currentResource()); + m_data.setup_launch(currentBlockSize()); + m_own_launch_data = true; + m_parent = nullptr; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device enters this branch + m_data.setup_device(); + } +#endif + } + + MultiReduceDataHip(MultiReduceDataHip &&) = delete; + MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete; + MultiReduceDataHip& operator=(MultiReduceDataHip &&) = delete; + + //! cleanup resources owned by this copy + // on device store in pinned buffer on host + RAJA_HOST_DEVICE + ~MultiReduceDataHip() + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + if (m_parent == this) { + // the original object, owns permanent storage + synchronize_resources_and_clear_list(); + delete m_sync_list; + m_sync_list = nullptr; + m_data.teardown_permanent(); + } else if (m_parent) { + // do nothing + } else { + if (m_own_launch_data) { + // the copy made in make_launch_body, owns launch data + m_data.teardown_launch(); + m_own_launch_data = false; + } + } +#else + if (!m_parent->m_parent) { + // the first copy on device, does finalization on the device + m_data.finalize_device(); + } +#endif + } + + + template < typename Container > + void reset(Container const& container, T identity) + { + synchronize_resources_and_clear_list(); + m_data.reset_permanent(container, identity); + } + + + //! apply reduction (const version) -- still combines internal values + RAJA_HOST_DEVICE + void combine(int bin, T const& value) + { +#if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + m_data.combine_host(bin, value); +#else + m_data.combine_device(bin, value); +#endif + } + + + //! 
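The copy constructor and destructor above use a parent pointer to decide who owns what: the original object owns permanent storage, a designated host copy owns launch data, and later copies own nothing. Here is a simplified host-only sketch of that ownership protocol; Tracked is a hypothetical type and the rule it uses ("first copy of the original takes launch ownership") is a simplification of the actual setupReducers()/make_launch_body logic.

#include <cstdio>

struct Tracked {
  const Tracked* parent;
  bool owns_launch_data = false;

  Tracked() : parent(this) {}                        // original: owns permanent data
  Tracked(const Tracked& other) : parent(other.parent) {
    if (parent == &other) {
      owns_launch_data = true;                       // first copy: sets up launch data
    }
  }
  ~Tracked() {
    if (parent == this) {
      std::puts("original: teardown permanent storage");
    } else if (owns_launch_data) {
      std::puts("launch copy: teardown launch data");
    }
  }
};

int main() {
  Tracked original;
  Tracked launch_copy(original);     // owns launch data
  Tracked worker_copy(launch_copy);  // owns nothing
  (void)worker_copy;
  return 0;
}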
map result value back to host if not done already; return aggregate value + T get(int bin) + { + synchronize_resources_and_clear_list(); + return m_data.get(bin); + } + + + size_t num_bins() const { return m_data.num_bins(); } + + T identity() const { return m_data.identity(); } + + +private: + MultiReduceDataHip const *m_parent; + SyncList* m_sync_list; + reduce_data_type m_data; + bool m_own_launch_data; + + void add_resource_to_synchronization_list(resources::Hip res) + { + for (resources::Hip& list_res : *m_sync_list) { + if (list_res.get_stream() == res.get_stream()) { + return; + } + } + m_sync_list->emplace_back(res); + } + + void synchronize_resources_and_clear_list() + { + for (resources::Hip& list_res : *m_sync_list) { + ::RAJA::hip::synchronize(list_res); + } + m_sync_list->clear(); + } +}; + +} // end namespace hip + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy, hip::MultiReduceDataHip) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp new file mode 100644 index 0000000000..30269f8406 --- /dev/null +++ b/include/RAJA/policy/hip/params/kernel_name.hpp @@ -0,0 +1,52 @@ +#ifndef HIP_KERNELNAME_HPP +#define HIP_KERNELNAME_HPP + +#if defined(RAJA_HIP_ACTIVE) + +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/pattern/params/kernel_name.hpp" + +#if defined(RAJA_ENABLE_ROCTX) +#include "hip/hip_runtime_api.h" +#include "roctx.h" +#endif + +namespace RAJA { +namespace expt { +namespace detail { + + // Init + template + camp::concepts::enable_if< type_traits::is_hip_policy > + init(KernelName& kn, const RAJA::hip::detail::hipInfo &) + { +#if defined(RAJA_ENABLE_ROCTX) + roctxRangePush(kn.name); +#else + RAJA_UNUSED_VAR(kn); +#endif + } + + // Combine + template + RAJA_HOST_DEVICE + camp::concepts::enable_if< type_traits::is_hip_policy > + combine(KernelName&) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_hip_policy > + resolve(KernelName&, const RAJA::hip::detail::hipInfo &) + { +#if defined(RAJA_ENABLE_ROCTX) + roctxRangePop(); +#endif + } + +} // namespace detail +} // namespace expt +} // namespace RAJA + +#endif + +#endif // NEW_REDUCE_HIP_REDUCE_HPP diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c359a68de0..a9f9027675 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -31,7 +31,9 @@ #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/util/Operators.hpp" +#include "RAJA/util/OffsetOperators.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -74,6 +76,13 @@ struct IndexGlobal; template struct IndexFlatten; +template +struct IndexDivide; + +template +struct IndexModulo; + + /*! * Use the max occupancy of a kernel on the current device when launch * parameters are not fully determined. @@ -155,6 +164,84 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer }; +/*! + * Get an amount of replication that is preferred_replication. + */ +template < size_t preferred_replication > +struct ConstantPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data)) + { + return IdxT(preferred_replication); + } +}; + +/*! 
+ * Get an amount of replication that is preferred_replication_before_cutoff if + * data.func_threads_per_block is less than t_cutoff or + * preferred_replication_after_cutoff otherwise. + */ +template < size_t t_cutoff, size_t preferred_replication_before_cutoff, + size_t preferred_replication_after_cutoff > +struct ThreadsPerBlockCutoffPreferredReplicationConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_preferred_replication(Data const& data) + { + IdxT cutoff = t_cutoff; + IdxT func_threads_per_block = data.func_threads_per_block; + + if (func_threads_per_block < cutoff) { + return IdxT(preferred_replication_before_cutoff); + } else { + return IdxT(preferred_replication_after_cutoff); + } + } +}; + +/*! + * Get an amount of shared atomic replication that is a power of 2 that is at + * most the amount given by data.func_max_shared_replication_per_block or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct SharedAtomicReplicationMaxPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_shared_replication(Data const& data) + { + IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return prev_pow2(std::min(preferred_replication, + func_max_shared_replication_per_block)); + } +}; + +/*! + * Get an amount of global atomic replication that is a power of 2 that is at + * least the amount given by data.func_min_global_replication or the + * amount given by GetPreferredReplication. + */ +template < typename GetPreferredReplication > +struct GlobalAtomicReplicationMinPow2Concretizer +{ + template < typename IdxT, typename Data > + static IdxT get_global_replication(Data const& data) + { + IdxT func_min_global_replication = data.func_min_global_replication; + + IdxT preferred_replication = GetPreferredReplication{}.template + get_preferred_replication(data); + + return next_pow2(std::max(preferred_replication, func_min_global_replication)); + } +}; + + enum struct reduce_algorithm : int { combine_last_block, @@ -176,6 +263,36 @@ struct ReduceTuning static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool consistent = + (algorithm == reduce_algorithm::combine_last_block); +}; + + +enum struct multi_reduce_algorithm : int +{ + init_host_combine_block_atomic_then_grid_atomic, + init_host_combine_global_atomic +}; + +template < typename t_AtomicReplicationConcretizer, + typename t_ReplicationIndexer, + typename t_OffsetCalculator > +struct AtomicReplicationTuning +{ + using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer; + using ReplicationIndexer = t_ReplicationIndexer; + using OffsetCalculator = t_OffsetCalculator; +}; + +template < multi_reduce_algorithm t_algorithm, + typename t_SharedAtomicReplicationTuning, + typename t_GlobalAtomicReplicationTuning > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning; + using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning; + static constexpr bool consistent = false; }; } // namespace hip @@ -185,6 +302,40 @@ namespace policy namespace hip { +struct DeviceConstants +{ + RAJA::Index_type WARP_SIZE; + RAJA::Index_type 
MAX_BLOCK_SIZE; + RAJA::Index_type MAX_WARPS; + RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics + + constexpr DeviceConstants(RAJA::Index_type warp_size, + RAJA::Index_type max_block_size, + RAJA::Index_type atomic_cache_line_bytes) noexcept + : WARP_SIZE(warp_size) + , MAX_BLOCK_SIZE(max_block_size) + , MAX_WARPS(max_block_size / warp_size) + , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes) + { } +}; + +// +// Operations in the included files are parametrized using the following +// values for HIP warp size and max block size. +// +#if defined(__HIP_PLATFORM_AMD__) +constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A +// constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X +#elif defined(__HIP_PLATFORM_NVIDIA__) +constexpr DeviceConstants device_constants(32, 1024, 32); // V100 +#endif +static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS, + "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS"); +static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0, + "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not " + "a multiple of device_constants.WARP_SIZE"); + + template struct hip_indexer {}; @@ -260,7 +411,22 @@ struct hip_reduce_policy make_policy_pattern_launch_platform_t::value, - RAJA::Platform::hip> { + RAJA::Platform::hip, + std::conditional_t> { +}; + +template < typename tuning > +struct hip_multi_reduce_policy + : public RAJA:: + make_policy_pattern_launch_platform_t::value, + RAJA::Platform::hip, + std::conditional_t> { }; /*! @@ -277,74 +443,6 @@ struct hip_atomic_explicit{}; using hip_atomic = hip_atomic_explicit; -template < RAJA::hip::reduce_algorithm algorithm, - RAJA::hip::block_communication_mode comm_mode, - size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified > -using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< - algorithm, comm_mode, replication, atomic_stride> >; - -// Policies for RAJA::Reduce* objects with specific behaviors. -// - *atomic* policies may use atomics to combine partial results and falls back -// on a non-atomic policy when atomics can't be used with the given type. The -// use of atomics leads to order of operation differences which change the -// results of floating point sum reductions run to run. The memory used with -// atomics is initialized on the device which can be expensive on some HW. -// On some HW this is faster overall than the non-atomic policies. -// - *atomic_host* policies are similar to the atomic policies above. However -// the memory used with atomics is initialized on the host which is -// significantly cheaper on some HW. On some HW this is faster overall than -// the non-atomic and atomic policies. -// - *device_fence policies use normal memory accesses with device scope fences -// in the implementation. This works on all HW. -// - *block_fence policies use special (atomic) memory accesses that only cache -// in a cache shared by the whole device to avoid having to use -// device scope fences. This improves performance on some HW but -// is more difficult to code correctly. 
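ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE above is a byte count; reduce.hpp later converts it into an element stride so that replicated atomic slots land on different "atomic cache lines". A small worked sketch of that arithmetic follows; divide_ceiling and atomic_stride_elems are hypothetical helpers mirroring the RAJA_DIVIDE_CEILING_INT expression, and 64 bytes is just an example value.

#include <cstddef>
#include <cstdio>

constexpr std::size_t divide_ceiling(std::size_t n, std::size_t d) {
  return (n + d - 1) / d;
}

// Stride, in elements of T, needed to keep adjacent atomic slots on different
// interference-size-aligned lines.
template <typename T>
constexpr std::size_t atomic_stride_elems(std::size_t interference_bytes) {
  return interference_bytes > sizeof(T)
             ? divide_ceiling(interference_bytes, sizeof(T))
             : 1;
}

int main() {
  // With a 64-byte interference size: doubles get a stride of 8 elements,
  // 4-byte ints a stride of 16.
  std::printf("%zu %zu\n",
              atomic_stride_elems<double>(64),
              atomic_stride_elems<int>(64));
  return 0;
}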
-using hip_reduce_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::combine_last_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::combine_last_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, - RAJA::hip::block_communication_mode::device_fence, - named_usage::unspecified, named_usage::unspecified>; -/// -using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, - RAJA::hip::block_communication_mode::block_fence, - named_usage::unspecified, named_usage::unspecified>; - -// Policy for RAJA::Reduce* objects that gives the same answer every time when -// used in the same way -using hip_reduce = hip_reduce_block_fence; - -// Policy for RAJA::Reduce* objects that may use atomics and may not give the -// same answer every time when used in the same way -using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; - -// Policy for RAJA::Reduce* objects that lets you select the default atomic or -// non-atomic policy with a bool -template < bool with_atomic > -using hip_reduce_base = std::conditional_t; - - // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 struct hip_block_reduce{}; @@ -392,25 +490,6 @@ template struct hip_thread_masked_loop {}; - -// -// Operations in the included files are parametrized using the following -// values for HIP warp size and max block size. 
-// -constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 64; // 128 on gfx90a -#if defined(__HIP_PLATFORM_AMD__) -constexpr const RAJA::Index_type WARP_SIZE = 64; -#elif defined(__HIP_PLATFORM_NVIDIA__) -constexpr const RAJA::Index_type WARP_SIZE = 32; -#endif -constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; -constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; -static_assert(WARP_SIZE >= MAX_WARPS, - "RAJA Assumption Broken: WARP_SIZE < MAX_WARPS"); -static_assert(MAX_BLOCK_SIZE % WARP_SIZE == 0, - "RAJA Assumption Broken: MAX_BLOCK_SIZE not " - "a multiple of WARP_SIZE"); - struct hip_synchronize : make_policy_pattern_launch_t { @@ -988,6 +1067,38 @@ struct IndexFlatten }; +template +struct IndexDivide +{ + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() / static_cast(divisor); + } + + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return RAJA_DIVIDE_CEILING_INT(indexer::template size(), static_cast(divisor)); + } +}; + +template +struct IndexModulo +{ + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT index() + { + return indexer::template index() % static_cast(divisor); + } + + template < typename IdxT = hip_dim_member_t > + RAJA_DEVICE static inline IdxT size() + { + return static_cast(divisor); + } +}; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > @@ -1033,6 +1144,13 @@ using thread_y = IndexGlobal; template using thread_z = IndexGlobal; +template +using thread_xyz = IndexFlatten, + thread_y, + thread_z>; + template using block_x = IndexGlobal; template @@ -1040,6 +1158,13 @@ using block_y = IndexGlobal; template using block_z = IndexGlobal; +template +using block_xyz = IndexFlatten, + block_y, + block_z>; + template using global_x = IndexGlobal; template @@ -1047,6 +1172,42 @@ using global_y = IndexGlobal; template using global_z = IndexGlobal; + +template +using global_xyz = IndexFlatten, + global_y, + global_z>; + + +template +using warp_xyz = IndexDivide>; + +template +using warp_global_xyz = IndexFlatten, + block_xyz>; + } // namespace hip // contretizers used in forall, scan, and sort policies @@ -1156,16 +1317,146 @@ using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average; using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; + // policies usable with reducers -using policy::hip::hip_reduce_device_fence; -using policy::hip::hip_reduce_block_fence; -using policy::hip::hip_reduce_atomic_device_init_device_fence; -using policy::hip::hip_reduce_atomic_device_init_block_fence; -using policy::hip::hip_reduce_atomic_host_init_device_fence; -using policy::hip::hip_reduce_atomic_host_init_block_fence; -using policy::hip::hip_reduce_base; -using policy::hip::hip_reduce; -using policy::hip::hip_reduce_atomic; +template < hip::reduce_algorithm algorithm, + hip::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = policy::hip::hip_reduce_policy< + hip::ReduceTuning>; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - non-atomic policies store partial results and combine them in the same +// order every time, leading to consistent results for a loop run to run. +// - *atomic* policies may use atomics to combine partial results. 
The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. Falls back +// on a non-atomic implementation if atomics can't be used with the given +// type. The memory used with atomics is initialized on the device using +// atomics which adds overhead. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host. This is faster +// overall than other policies on HW with direct host access to device memory +// such as the AMD MI300A El Capitan/Tuolumne systems. +// - *device_fence* policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence* policies use special (atomic) memory accesses that use +// a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using hip_reduce_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::combine_last_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::combine_last_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_device_combine_atomic_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_device_combine_atomic_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_host_combine_atomic_block, + hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + hip::reduce_algorithm::init_host_combine_atomic_block, + hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using hip_reduce = hip_reduce_block_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; + + +// policies usable with multi_reducers +template < hip::multi_reduce_algorithm algorithm, + typename SharedAtomicReplicationConcretizer, + typename SharedAtomicReplicationIndexer, + typename GlobalAtomicReplicationConcretizer, + typename GlobalAtomicReplicationIndexer > +using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy< + hip::MultiReduceTuning< + algorithm, + hip::AtomicReplicationTuning>, + hip::AtomicReplicationTuning>>>; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results. 
The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions for a loop run to run. +// - *no_replication* policies use the minimum amount of resources. The +// lack of resources means they may perform poorly. These policies are +// intended for use cases where low overhead is more important than high +// performance such as error flags that are rarely set. +// - *host_init* policies initialize memory used with atomics on the host. +// This is faster overall than other policies on HW with direct host access +// to device memory such as the AMD MI300A El Capitan/Tuolumne systems. +using hip_multi_reduce_atomic_block_then_atomic_grid_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + hip::SharedAtomicReplicationMaxPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<4>>, + hip::thread_xyz<>, + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// special policy to test that multi-reducers work if there is not enough shmem +using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic, + hip::SharedAtomicReplicationMaxPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<0>>, + hip::thread_xyz<>, + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// +using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<32>>, + hip::warp_global_xyz<>>; +// +using hip_multi_reduce_atomic_global_no_replication_host_init = hip_multi_reduce_tuning< + hip::multi_reduce_algorithm::init_host_combine_global_atomic, + void, // unused with this algorithm + void, // unused with this algorithm + hip::GlobalAtomicReplicationMinPow2Concretizer< + hip::ConstantPreferredReplicationConcretizer<1>>, + hip::block_xyz<>>; + +// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_multi_reduce_atomic = hip_multi_reduce_atomic_block_then_atomic_grid_host_init; +// Similar to above but optimized for low overhead in cases where it is rarely used +using hip_multi_reduce_atomic_low_performance_low_overhead = + hip_multi_reduce_atomic_global_no_replication_host_init; + // policies usable with kernel using policy::hip::hip_block_reduce; @@ -1174,11 +1465,11 @@ using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< iteration_mapping::Direct, kernel_sync_requirement::none, - hip::thread_x>; + hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< iteration_mapping::StridedLoop, kernel_sync_requirement::none, - hip::thread_x>; + hip::thread_x>; using policy::hip::hip_warp_masked_direct; using policy::hip::hip_warp_masked_loop; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 2dbaf9f7e5..e8e67029ef 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -200,15 +200,15 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) const int 
numThreads = ThreadIterationGetter::size(); const int threadId = ThreadIterationGetter::index(); - const int warpId = threadId % RAJA::policy::hip::WARP_SIZE; - const int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; + const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE; + const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE; T temp = val; - if (numThreads % RAJA::policy::hip::WARP_SIZE == 0) { + if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0) { // reduce each warp - for (int i = 1; i < RAJA::policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) { T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -216,7 +216,7 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } else { // reduce each warp - for (int i = 1; i < RAJA::policy::hip::WARP_SIZE; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) { int srcLane = threadId ^ i; T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane); // only add from threads that exist (don't double count own value) @@ -226,18 +226,18 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } } - static_assert(RAJA::policy::hip::MAX_WARPS <= RAJA::policy::hip::WARP_SIZE, + static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <= RAJA::policy::hip::device_constants.WARP_SIZE, "Max Warps must be less than or equal to Warp Size for this algorithm to work"); // reduce per warp values - if (numThreads > RAJA::policy::hip::WARP_SIZE) { + if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE) { // Need to separate declaration and initialization for clang-hip - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; // Partial placement new: Should call new(tmpsd) here but recasting memory // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); + RAJA::detail::SoAArray * sd = reinterpret_cast *>(tmpsd); // write per warp values to shared memory if (warpId == 0) { @@ -249,13 +249,13 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) if (warpNum == 0) { // read per warp values - if (warpId * RAJA::policy::hip::WARP_SIZE < numThreads) { + if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads) { temp = sd->get(warpId); } else { temp = identity; } - for (int i = 1; i < RAJA::policy::hip::MAX_WARPS; i *= 2) { + for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2) { T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i); temp = Combiner{}(temp, rhs); } @@ -882,8 +882,8 @@ class Reduce : 32; static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) ? tuning::atomic_stride - : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : ((policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp index ae0f70a37f..fc29dabcbf 100644 --- a/include/RAJA/policy/openmp.hpp +++ b/include/RAJA/policy/openmp.hpp @@ -37,6 +37,7 @@ #include "RAJA/policy/openmp/kernel.hpp" #include "RAJA/policy/openmp/policy.hpp" #include "RAJA/policy/openmp/reduce.hpp" +#include "RAJA/policy/openmp/multi_reduce.hpp" #include "RAJA/policy/openmp/region.hpp" #include "RAJA/policy/openmp/scan.hpp" #include "RAJA/policy/openmp/sort.hpp" diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp index 4eea77722e..2dc047dd95 100644 --- a/include/RAJA/policy/openmp/atomic.hpp +++ b/include/RAJA/policy/openmp/atomic.hpp @@ -36,163 +36,217 @@ namespace RAJA RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAdd(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicLoad(omp_atomic, T *acc) { T ret; #pragma omp atomic capture { ret = *acc; // capture old for return value - *acc += value; + *acc += (T)0; } return ret; } - RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicSub(omp_atomic, T volatile *acc, T value) +RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value) { T ret; #pragma omp atomic capture { - ret = *acc; // capture old for return value + ret = *acc; + *acc = value; + } + RAJA_UNUSED_VAR(ret); +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value) +{ + T old; +#pragma omp atomic capture + { + old = *acc; // capture old for return value + *acc += value; + } + return old; +} + + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value) +{ + T old; +#pragma omp atomic capture + { + old = *acc; // capture old for return value *acc -= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics +#if _OPENMP >= 202011 + T old; + #pragma omp atomic capture compare + { + old = *acc; + if ( value < *acc ) + { + *acc = value; + } + } + return old; +#else + // OpenMP doesn't define atomic ternary operators so use builtin atomics return atomicMin(builtin_atomic{}, acc, value); +#endif } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics +#if _OPENMP >= 202011 + T old; + #pragma omp atomic capture compare + { + old = *acc; + if ( value > *acc ) + { + *acc = value; + } + } + return old; +#else + // OpenMP doesn't define atomic ternary operators so use builtin atomics return atomicMax(builtin_atomic{}, acc, value); +#endif } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(omp_atomic, T volatile *acc) +RAJA_INLINE T atomicInc(omp_atomic, T *acc) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value - *acc += 1; + old = *acc; // capture old for return value + *acc += T(1); } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(omp_atomic, T volatile 
*acc, T val) +RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics - return RAJA::atomicInc(builtin_atomic{}, acc, val); + // OpenMP doesn't define needed operations, so use builtin atomics + return RAJA::atomicInc(builtin_atomic{}, acc, value); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(omp_atomic, T volatile *acc) +RAJA_INLINE T atomicDec(omp_atomic, T *acc) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value - *acc -= 1; + old = *acc; // capture old for return value + *acc -= T(1); } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(omp_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics - return RAJA::atomicDec(builtin_atomic{}, acc, val); + // OpenMP doesn't define needed operations, so use builtin atomics + return RAJA::atomicDec(builtin_atomic{}, acc, value); } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc &= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc |= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc ^= value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(omp_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value) { - T ret; + T old; #pragma omp atomic capture { - ret = *acc; // capture old for return value + old = *acc; // capture old for return value *acc = value; } - return ret; + return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(omp_atomic, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value) { - // OpenMP doesn't define atomic trinary operators so use builtin atomics + // OpenMP doesn't define atomic ternary operators so use builtin atomics return RAJA::atomicCAS(builtin_atomic{}, acc, compare, value); } diff --git a/include/RAJA/policy/openmp/multi_reduce.hpp b/include/RAJA/policy/openmp/multi_reduce.hpp new file mode 100644 index 0000000000..22b09a7722 --- /dev/null +++ b/include/RAJA/policy/openmp/multi_reduce.hpp @@ -0,0 +1,360 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for + * OpenMP execution. + * + * These methods should work on any platform that supports OpenMP. 
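The OpenMP atomics above all follow the same "capture the old value, then update" pattern via #pragma omp atomic capture. The standalone example below shows that pattern in isolation; fetch_add is a hypothetical free function, not a RAJA API, and the code compiles with or without -fopenmp (serially in the latter case).

#include <cstdio>
#if defined(_OPENMP)
#include <omp.h>
#endif

// Atomically add `value` to *acc and return the previous value.
template <typename T>
T fetch_add(T* acc, T value) {
  T old;
#if defined(_OPENMP)
#pragma omp atomic capture
  { old = *acc; *acc += value; }
#else
  old = *acc; *acc += value;
#endif
  return old;
}

int main() {
  long counter = 0;
#if defined(_OPENMP)
#pragma omp parallel for
#endif
  for (int i = 0; i < 1000; ++i) {
    fetch_add(&counter, 1L);
  }
  std::printf("%ld\n", counter);  // 1000
  return 0;
}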
+ * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_omp_multi_reduce_HPP +#define RAJA_omp_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_OPENMP) + +#include +#include + +#include + +#include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" +#include "RAJA/util/RepeatView.hpp" + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/openmp/policy.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataOMP; + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template using combine on destruction. + * + * In this class memory is owned by each copy of the object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataOMP> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataOMP() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataOMP(Container const& container, T identity) + : m_parent(nullptr) + , m_num_bins(container.size()) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, m_num_bins); + } + + MultiReduceDataOMP(MultiReduceDataOMP const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_identity(other.m_identity) + , m_data(nullptr) + { + m_data = create_data(RepeatView(other.m_identity, other.m_num_bins), other.m_num_bins); + } + + MultiReduceDataOMP(MultiReduceDataOMP &&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete; + + ~MultiReduceDataOMP() + { + if (m_data) { + if (m_parent && (m_num_bins != size_t(0))) { +#pragma omp critical(ompMultiReduceCritical) + { + for (size_t bin = 0; bin < m_num_bins; ++bin) { + MultiReduceOp{}(m_parent->m_data[bin], m_data[bin]); + } + } + } + destroy_data(m_data, m_num_bins); + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins); + m_num_bins = new_num_bins; + m_data = create_data(container, m_num_bins); + } else { + size_t bin = 0; + for (auto const& value : container) { + m_data[bin] = value; + ++bin; + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); } + + T get(size_t bin) const { return m_data[bin]; } + +private: + MultiReduceDataOMP const *m_parent; + size_t m_num_bins; + T m_identity; + T* m_data; + + template < typename Container > + static T* create_data(Container const& container, size_t num_bins) + { + if (num_bins == size_t(0)) { + return nullptr; + } + auto data = RAJA::allocate_aligned_type( RAJA::DATA_ALIGN, num_bins * sizeof(T) ); + size_t bin = 0; + for (auto const& value : container) { + new(&data[bin]) T(value); + ++bin; + } + return data; + } + + static void destroy_data(T*& data, size_t num_bins) + { + if (num_bins == size_t(0)) { + return; + } + for (size_t bin = num_bins; bin > 0; --bin) { + data[bin-1].~T(); + } + RAJA::free_aligned(data); + data = nullptr; + } +}; + +/*! + ************************************************************************** + * + * \brief OMP multi-reduce data class template using combine on get. + * + * In this class memory is owned by each copy of the object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataOMP> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataOMP() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataOMP(Container const& container, T identity) + : m_parent(nullptr) + , m_max_threads(omp_get_max_threads()) + , m_num_bins(container.size()) + , m_padded_threads(pad_threads(m_max_threads)) + , m_padded_bins(pad_bins(m_num_bins)) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } + + MultiReduceDataOMP(MultiReduceDataOMP const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_padded_threads(other.m_padded_threads) + , m_padded_bins(other.m_padded_bins) + , m_identity(other.m_identity) + , m_data(other.m_data) + { } + + MultiReduceDataOMP(MultiReduceDataOMP &&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete; + MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete; + + ~MultiReduceDataOMP() + { + if (m_data) { + if (!m_parent) { + destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + m_num_bins = new_num_bins; + m_padded_bins = pad_bins(m_num_bins); + m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads); + } else { + if (m_max_threads > 0) { + { + size_t thread_idx = 0; + size_t bin = 0; + for (auto const& value : container) { + m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = value; + ++bin; + } + } + for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx) { + for (size_t bin = 0; bin < m_num_bins; ++bin) { + m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = identity; + } + } + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) + { + size_t thread_idx = omp_get_thread_num(); + MultiReduceOp{}(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)], val); + } + + T get(size_t bin) const + { + ::RAJA::detail::HighAccuracyReduce + reducer(m_identity); + for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx) { + reducer.combine(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]); + } + return reducer.get_and_clear(); + } + +private: + MultiReduceDataOMP const *m_parent; + size_t m_max_threads; + size_t m_num_bins; + size_t m_padded_threads; + size_t m_padded_bins; + T m_identity; + T* m_data; + + static constexpr size_t pad_bins(size_t num_bins) + { + size_t num_cache_lines = RAJA_DIVIDE_CEILING_INT(num_bins*sizeof(T), RAJA::DATA_ALIGN); + return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN, sizeof(T)); + } + + static constexpr size_t pad_threads(size_t max_threads) + { + return max_threads; + } + + static constexpr size_t index_data(size_t bin, size_t thread_idx, + size_t padded_bins, size_t RAJA_UNUSED_ARG(padded_threads)) + { + return bin + thread_idx * padded_bins; + } + + template < typename Container > + static T* create_data(Container const& container, T identity, + size_t num_bins, size_t max_threads, + size_t padded_bins, size_t padded_threads) + { + if (num_bins == size_t(0)) { + return nullptr; + } + auto data = RAJA::allocate_aligned_type( RAJA::DATA_ALIGN, padded_threads*padded_bins*sizeof(T) ); + if (max_threads > 0) { + { + size_t thread_idx = 0; + size_t bin = 0; + for (auto const& value : container) { + new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(value); + ++bin; + } + } + for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx) { + for (size_t bin = 0; bin < num_bins; ++bin) { + new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(identity); + } + } + } + return data; + } + + static void destroy_data(T*& data, + 
size_t num_bins, size_t max_threads, + size_t padded_bins, size_t padded_threads) + { + if (num_bins == size_t(0)) { + return; + } + for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx) { + for (size_t bin = num_bins; bin > 0; --bin) { + data[index_data(bin-1, thread_idx-1, padded_bins, padded_threads)].~T(); + } + } + RAJA::free_aligned(data); + data = nullptr; + } +}; + +} // namespace detail + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy, detail::MultiReduceDataOMP) + +} // namespace RAJA + +#endif // closing endif for RAJA_ENABLE_OPENMP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/openmp/params/kernel_name.hpp b/include/RAJA/policy/openmp/params/kernel_name.hpp new file mode 100644 index 0000000000..65a5f7a329 --- /dev/null +++ b/include/RAJA/policy/openmp/params/kernel_name.hpp @@ -0,0 +1,40 @@ +#ifndef OPENMP_KERNELNAME_HPP +#define OPENMP_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_OPENMP) + + // Init + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + combine(KernelName&, T& /*place holder argument*/) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_openmp_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp index af5bdf2df7..aff2567474 100644 --- a/include/RAJA/policy/openmp/policy.hpp +++ b/include/RAJA/policy/openmp/policy.hpp @@ -42,6 +42,25 @@ typedef enum omp_sched_t { namespace RAJA { +namespace omp +{ + +enum struct multi_reduce_algorithm : int +{ + combine_on_destruction, + combine_on_get +}; + +template < multi_reduce_algorithm t_algorithm > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_algorithm; + static constexpr bool consistent = + (algorithm == multi_reduce_algorithm::combine_on_get); +}; + +} // namspace omp + namespace policy { namespace omp @@ -283,6 +302,18 @@ struct omp_reduce_ordered : make_policy_pattern_t { }; +/// +template < typename tuning > +struct omp_multi_reduce_policy + : make_policy_pattern_launch_platform_t> { +}; + /// struct omp_synchronize : make_policy_pattern_launch_t +using omp_multi_reduce_tuning = omp_multi_reduce_policy< + RAJA::omp::MultiReduceTuning >; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - combine_on_destruction policies combine new values into a single value for +// each object then each object combines its values into the parent object's +// values on destruction in a critical region. +using omp_multi_reduce_combine_on_destruction = omp_multi_reduce_tuning< + RAJA::omp::multi_reduce_algorithm::combine_on_destruction>; +// - combine_on_get policies combine new values into a single value for +// each thread then when get is called those values are combined. 
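The combine_on_destruction strategy described in the policy comments below boils down to: each thread accumulates into a private copy of the bins, and that copy folds itself into the shared result inside a critical section when it is destroyed. A standalone sketch of that idea follows; BinSums is a hypothetical type, not the RAJA MultiReduceDataOMP class.

#include <cstddef>
#include <cstdio>
#include <vector>
#if defined(_OPENMP)
#include <omp.h>
#endif

struct BinSums {
  std::vector<double>* shared;   // parent storage to fold into on destruction
  std::vector<double> local;     // per-thread bins

  explicit BinSums(std::vector<double>& parent)
    : shared(&parent), local(parent.size(), 0.0) {}

  void combine(std::size_t bin, double v) { local[bin] += v; }

  ~BinSums() {
#if defined(_OPENMP)
#pragma omp critical(BinSumsCombine)
#endif
    for (std::size_t b = 0; b < local.size(); ++b) { (*shared)[b] += local[b]; }
  }
};

int main() {
  std::vector<double> totals(3, 0.0);
#if defined(_OPENMP)
#pragma omp parallel
#endif
  {
    BinSums mine(totals);                              // per-thread copy
    for (int i = 0; i < 100; ++i) { mine.combine(i % 3, 1.0); }
  }                                                    // destructor folds into totals
  std::printf("%g %g %g\n", totals[0], totals[1], totals[2]);
  return 0;
}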
+using omp_multi_reduce_combine_on_get = omp_multi_reduce_tuning< + RAJA::omp::multi_reduce_algorithm::combine_on_get>; + +// Policy for RAJA::MultiReduce* objects that gives the +// same answer every time when used in the same way +using omp_multi_reduce_ordered = omp_multi_reduce_combine_on_get; + +// Policy for RAJA::MultiReduce* objects that may not give the +// same answer every time when used in the same way +using omp_multi_reduce_unordered = omp_multi_reduce_combine_on_destruction; + +using omp_multi_reduce = omp_multi_reduce_unordered; + } // namespace omp } // namespace policy @@ -389,6 +446,10 @@ using policy::omp::omp_launch_t; using policy::omp::omp_reduce; /// using policy::omp::omp_reduce_ordered; +/// +using policy::omp::omp_multi_reduce; +/// +using policy::omp::omp_multi_reduce_ordered; /// /// Type aliases for omp reductions diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp index 6b90282e6d..af88127636 100644 --- a/include/RAJA/policy/openmp_target.hpp +++ b/include/RAJA/policy/openmp_target.hpp @@ -30,6 +30,7 @@ #include "RAJA/policy/openmp_target/kernel.hpp" #include "RAJA/policy/openmp_target/forall.hpp" #include "RAJA/policy/openmp_target/reduce.hpp" +//#include "RAJA/policy/openmp_target/multi_reduce.hpp" #include "RAJA/policy/openmp_target/WorkGroup.hpp" diff --git a/include/RAJA/policy/openmp_target/params/kernel_name.hpp b/include/RAJA/policy/openmp_target/params/kernel_name.hpp new file mode 100644 index 0000000000..5e9edb4b6c --- /dev/null +++ b/include/RAJA/policy/openmp_target/params/kernel_name.hpp @@ -0,0 +1,40 @@ +#ifndef OPENMP_TARGET_KERNELNAME_HPP +#define OPENMP_TARGET_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + + // Init + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + combine(KernelName&, T& /*place holder argument*/) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_target_openmp_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp index e9c1f8e570..0963b31a01 100644 --- a/include/RAJA/policy/sequential.hpp +++ b/include/RAJA/policy/sequential.hpp @@ -28,6 +28,7 @@ #include "RAJA/policy/sequential/kernel.hpp" #include "RAJA/policy/sequential/policy.hpp" #include "RAJA/policy/sequential/reduce.hpp" +#include "RAJA/policy/sequential/multi_reduce.hpp" #include "RAJA/policy/sequential/scan.hpp" #include "RAJA/policy/sequential/sort.hpp" #include "RAJA/policy/sequential/launch.hpp" diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp index 58777cd9ef..046e52e1c1 100644 --- a/include/RAJA/policy/sequential/atomic.hpp +++ b/include/RAJA/policy/sequential/atomic.hpp @@ -28,7 +28,23 @@ namespace RAJA RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAdd(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicLoad(seq_atomic, T *acc) +{ + return *acc; +} + +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE +RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value) +{ + *acc = value; +} + +RAJA_SUPPRESS_HD_WARN 
+template +RAJA_HOST_DEVICE +RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value) { T ret = *acc; *acc += value; @@ -39,7 +55,7 @@ RAJA_INLINE T atomicAdd(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicSub(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value) { T ret = *acc; *acc -= value; @@ -50,7 +66,7 @@ RAJA_INLINE T atomicSub(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMin(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value) { T ret = *acc; *acc = ret < value ? ret : value; @@ -60,10 +76,10 @@ RAJA_INLINE T atomicMin(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicMax(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value) { T ret = *acc; - *acc = ret > value ? ret : value; + *acc = value < ret ? ret : value; return ret; } @@ -71,47 +87,47 @@ RAJA_INLINE T atomicMax(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(seq_atomic, T volatile *acc) +RAJA_INLINE T atomicInc(seq_atomic, T *acc) { T ret = *acc; - (*acc) += 1; + (*acc) += T(1); return ret; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicInc(seq_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val) { T old = *acc; - (*acc) = ((old >= val) ? 0 : (old + 1)); + *acc = val <= old ? T(0) : old + T(1); return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(seq_atomic, T volatile *acc) +RAJA_INLINE T atomicDec(seq_atomic, T *acc) { T ret = *acc; - (*acc) -= 1; + (*acc) -= T(1); return ret; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicDec(seq_atomic, T volatile *acc, T val) +RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val) { T old = *acc; - (*acc) = (((old == 0) | (old > val)) ? val : (old - 1)); + *acc = old == T(0) || val < old ? val : old - T(1); return old; } RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicAnd(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value) { T ret = *acc; *acc &= value; @@ -121,7 +137,7 @@ RAJA_INLINE T atomicAnd(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicOr(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value) { T ret = *acc; *acc |= value; @@ -131,7 +147,7 @@ RAJA_INLINE T atomicOr(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicXor(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value) { T ret = *acc; *acc ^= value; @@ -141,7 +157,7 @@ RAJA_INLINE T atomicXor(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicExchange(seq_atomic, T volatile *acc, T value) +RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value) { T ret = *acc; *acc = value; @@ -151,7 +167,7 @@ RAJA_INLINE T atomicExchange(seq_atomic, T volatile *acc, T value) RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE -RAJA_INLINE T atomicCAS(seq_atomic, T volatile *acc, T compare, T value) +RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value) { T ret = *acc; *acc = ret == compare ? 
value : ret; diff --git a/include/RAJA/policy/sequential/multi_reduce.hpp b/include/RAJA/policy/sequential/multi_reduce.hpp new file mode 100644 index 0000000000..be3a3860f8 --- /dev/null +++ b/include/RAJA/policy/sequential/multi_reduce.hpp @@ -0,0 +1,171 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA reduction templates for + * sequential execution. + * + * These methods should work on any platform. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_sequential_multi_reduce_HPP +#define RAJA_sequential_multi_reduce_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "RAJA/pattern/detail/multi_reduce.hpp" +#include "RAJA/pattern/multi_reduce.hpp" + +#include "RAJA/policy/sequential/policy.hpp" + +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + ************************************************************************** + * + * \brief Seq multi-reduce data class template. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp, typename tuning > +struct MultiReduceDataSeq; + +/*! + ************************************************************************** + * + * \brief Seq multi-reduce data class template using left_fold reductions. + * + * In this class memory is owned by the parent object + * + ************************************************************************** + */ +template < typename T, typename t_MultiReduceOp > +struct MultiReduceDataSeq> +{ + using value_type = T; + using MultiReduceOp = t_MultiReduceOp; + + MultiReduceDataSeq() = delete; + + template < typename Container, + std::enable_if_t::value>* = nullptr > + MultiReduceDataSeq(Container const& container, T identity) + : m_parent(nullptr) + , m_num_bins(container.size()) + , m_identity(identity) + , m_data(nullptr) + { + m_data = create_data(container, m_num_bins); + } + + MultiReduceDataSeq(MultiReduceDataSeq const &other) + : m_parent(other.m_parent ? 
other.m_parent : &other) + , m_num_bins(other.m_num_bins) + , m_identity(other.m_identity) + , m_data(other.m_data) + { } + + MultiReduceDataSeq(MultiReduceDataSeq &&) = delete; + MultiReduceDataSeq& operator=(MultiReduceDataSeq const&) = delete; + MultiReduceDataSeq& operator=(MultiReduceDataSeq &&) = delete; + + ~MultiReduceDataSeq() + { + if (m_data) { + if (!m_parent) { + destroy_data(m_data, m_num_bins); + } + } + } + + template < typename Container > + void reset(Container const& container, T identity) + { + m_identity = identity; + size_t new_num_bins = container.size(); + if (new_num_bins != m_num_bins) { + destroy_data(m_data, m_num_bins); + m_num_bins = new_num_bins; + m_data = create_data(container, m_num_bins); + } else { + size_t bin = 0; + for (auto const& value : container) { + m_data[bin] = value; + ++bin; + } + } + } + + size_t num_bins() const { return m_num_bins; } + + T identity() const { return m_identity; } + + void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); } + + T get(size_t bin) const { return m_data[bin]; } + +private: + MultiReduceDataSeq const *m_parent; + size_t m_num_bins; + T m_identity; + T* m_data; + + template < typename Container > + static T* create_data(Container const& container, size_t num_bins) + { + if (num_bins == size_t(0)) { + return nullptr; + } + + auto data = static_cast(malloc(num_bins*sizeof(T))); + size_t bin = 0; + for (auto const& value : container) { + new(&data[bin]) T(value); + ++bin; + } + return data; + } + + static void destroy_data(T*& data, size_t num_bins) + { + if (num_bins == size_t(0)) { + return; + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + data[bin].~T(); + } + free(data); + data = nullptr; + } +}; + +} // namespace detail + +RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy, detail::MultiReduceDataSeq) + +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/sequential/params/kernel_name.hpp b/include/RAJA/policy/sequential/params/kernel_name.hpp new file mode 100644 index 0000000000..00e6a1dc52 --- /dev/null +++ b/include/RAJA/policy/sequential/params/kernel_name.hpp @@ -0,0 +1,37 @@ +#ifndef SEQ_KERNELNAME_HPP +#define SEQ_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + + // Init + template + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + RAJA_HOST_DEVICE + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + combine(KernelName&, T) {} + + // Resolve + template + camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SEQ_REDUCE_HPP diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp index f59bee50e4..287af42502 100644 --- a/include/RAJA/policy/sequential/policy.hpp +++ b/include/RAJA/policy/sequential/policy.hpp @@ -22,6 +22,24 @@ namespace RAJA { +namespace sequential +{ + +enum struct multi_reduce_algorithm : int +{ + left_fold +}; + +template < multi_reduce_algorithm t_multi_algorithm > +struct MultiReduceTuning +{ + static constexpr multi_reduce_algorithm algorithm = t_multi_algorithm; + static constexpr bool consistent = + (algorithm == multi_reduce_algorithm::left_fold); +}; + +} // 
namspace sequential + namespace policy { namespace sequential @@ -79,11 +97,23 @@ struct seq_work : make_policy_pattern_launch_platform_t { }; +/// +template < typename tuning > +struct seq_multi_reduce_policy + : make_policy_pattern_launch_platform_t> { +}; + /// /////////////////////////////////////////////////////////////////////// /// @@ -94,12 +124,27 @@ struct seq_reduce : make_policy_pattern_launch_platform_t +using seq_multi_reduce_tuning = seq_multi_reduce_policy< + RAJA::sequential::MultiReduceTuning >; + +// Policies for RAJA::MultiReduce* objects with specific behaviors. +// - left_fold policies combine new values into a single value. +using seq_multi_reduce_left_fold = seq_multi_reduce_tuning< + RAJA::sequential::multi_reduce_algorithm::left_fold>; + +// Policy for RAJA::MultiReduce* objects that gives the +// same answer every time when used in the same way +using seq_multi_reduce = seq_multi_reduce_left_fold; + } // namespace sequential } // namespace policy using policy::sequential::seq_atomic; using policy::sequential::seq_exec; using policy::sequential::seq_reduce; +using policy::sequential::seq_multi_reduce; using policy::sequential::seq_region; using policy::sequential::seq_segit; using policy::sequential::seq_work; diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp index dc4112d8a7..491e39910c 100644 --- a/include/RAJA/policy/sycl.hpp +++ b/include/RAJA/policy/sycl.hpp @@ -24,11 +24,12 @@ #if defined(RAJA_SYCL_ACTIVE) -#include +#include "RAJA/util/sycl_compat.hpp" #include "RAJA/policy/sycl/forall.hpp" #include "RAJA/policy/sycl/policy.hpp" #include "RAJA/policy/sycl/reduce.hpp" +//#include "RAJA/policy/sycl/multi_reduce.hpp" //#include "RAJA/policy/sycl/scan.hpp" //#include "RAJA/policy/sycl/sort.hpp" #include "RAJA/policy/sycl/kernel.hpp" diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp index c158bd2801..27d3209ae3 100644 --- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp +++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp @@ -23,7 +23,7 @@ #if defined(RAJA_ENABLE_SYCL) -#include +#include "RAJA/util/sycl_compat.hpp" #include #include diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp index 4a33ab3bd4..901cc694f0 100644 --- a/include/RAJA/policy/sycl/forall.hpp +++ b/include/RAJA/policy/sycl/forall.hpp @@ -26,10 +26,11 @@ #if defined(RAJA_ENABLE_SYCL) -#include #include #include +#include "RAJA/util/sycl_compat.hpp" + #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/params/forall.hpp" @@ -121,12 +122,7 @@ forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); q->submit([&](::sycl::handler& h) { @@ -168,6 +164,7 @@ resources::EventProxy forall_impl(resources::Sycl &sycl_res, // Only launch kernel if we have something to iterate over if (len > 0 && BlockSize > 0) { + // Note: We could fix an incorrect workgroup size. // It would change what was specified. // For now, leave the device compiler to error with invalid WG size. 
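Note on the queue handling changes in this SYCL forall file: the hunks above and below drop the fallback to a globally cached queue, so each kernel now runs on the queue owned by the camp resource passed to forall. A minimal caller-side sketch, assuming a SYCL-enabled build; the function name, the USM pointers, and the work-group size of 256 are illustrative assumptions, not part of this patch:

#include "RAJA/RAJA.hpp"

// Sketch only: x and y are assumed to be USM allocations reachable from the
// device behind sycl_res; sycl_exec<256, true> uses an arbitrary work-group
// size and runs asynchronously on the resource's queue.
void axpy_on_resource(RAJA::resources::Sycl sycl_res,
                      double* y, const double* x, double a, int N)
{
  RAJA::forall<RAJA::sycl_exec<256, /*Async=*/true>>(
      sycl_res,
      RAJA::TypedRangeSegment<int>(0, N),
      [=] RAJA_DEVICE (int i) { y[i] += a * x[i]; });

  // With the global-queue fallback removed, synchronization also goes
  // through the same resource.
  sycl_res.wait();
}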
@@ -178,14 +175,11 @@ resources::EventProxy forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); + LOOP_BODY* lbody; Iterator* beg; + RAJA_FT_BEGIN; // // Setup shared memory buffers @@ -250,18 +244,14 @@ forall_impl(resources::Sycl &sycl_res, // Only launch kernel if we have something to iterate over if (len > 0 && BlockSize > 0) { + // // Compute the number of blocks // sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); auto combiner = []( ForallParam x, ForallParam y ) { RAJA::expt::ParamMultiplexer::combine( x, y ); @@ -332,12 +322,7 @@ forall_impl(resources::Sycl &sycl_res, sycl_dim_t blockSize{BlockSize}; sycl_dim_t gridSize = impl::getGridDim(static_cast(len), BlockSize); - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + ::sycl::queue* q = sycl_res.get_queue(); auto combiner = []( ForallParam x, ForallParam y ) { RAJA::expt::ParamMultiplexer::combine( x, y ); @@ -414,29 +399,6 @@ forall_impl(resources::Sycl &sycl_res, * ****************************************************************************** */ -template -RAJA_INLINE void forall_impl(ExecPolicy>, - const TypedIndexSet& iset, - LoopBody&& loop_body) -{ - int num_seg = iset.getNumSegments(); - for (int isi = 0; isi < num_seg; ++isi) { - iset.segmentCall(isi, - detail::CallForall(), - sycl_exec(), - loop_body); - } // iterate over segments of index set - - if (!Async) { - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - q->wait(); - }; -} - - template forall_impl(resources::Sycl & loop_body); } // iterate over segments of index set - if (!Async) { - ::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - q->wait(); + if ( !Async ) { + ::sycl::queue* q = r.get_queue(); + q->wait(); } return resources::EventProxy(r); diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp index 0b7fa5f253..88c789c062 100644 --- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp +++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp @@ -48,19 +48,22 @@ namespace RAJA * work group and work items per group. */ template -struct sycl_launch {}; +struct sycl_launch : public RAJA::make_policy_pattern_launch_platform_t< + RAJA::Policy::sycl, + RAJA::Pattern::forall, + detail::get_launch::value, + RAJA::Platform::sycl>{ +}; namespace statement { - -/*! RAJA::kernel statement that launches a SYCL kernel. - * - * +/* + * ! RAJA::kernel statement that launches a SYCL kernel. */ template struct SyclKernelExt - : public internal::Statement, EnclosedStmts...> { + : public internal::Statement { }; /* @@ -87,11 +90,7 @@ namespace internal { /*! 
- * SYCL global function for launching SyclKernel policies - * This is annotated to guarantee that device code generated - * can be launched by a kernel with BlockSize number of threads. - * - * This launcher is used by the SyclKernel policies. + * SYCL global function for launching SyclKernel policies. */ template void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item) @@ -142,7 +141,7 @@ struct SyclLaunchHelper,StmtList,Data,Types> qu->submit([&](cl::sycl::handler& h) { - h.parallel_for(launch_dims.fit_nd_range(), + h.parallel_for(launch_dims.fit_nd_range(qu), [=] (cl::sycl::nd_item<3> item) { SyclKernelLauncher(*m_data, item); @@ -178,7 +177,7 @@ struct SyclLaunchHelper,StmtList,Data,Types> qu->submit([&](cl::sycl::handler& h) { - h.parallel_for(launch_dims.fit_nd_range(), + h.parallel_for(launch_dims.fit_nd_range(qu), [=] (cl::sycl::nd_item<3> item) { SyclKernelLauncher(data, item); @@ -211,20 +210,15 @@ struct StatementExecutor< using launch_t = SyclLaunchHelper::value, LaunchConfig, stmt_list_t, data_t, Types>; + camp::resources::Sycl res = data.get_resource(); + ::sycl::queue* q = res.get_queue();; + // // Compute the requested kernel dimensions // LaunchDims launch_dims = executor_t::calculateDimensions(data); int shmem = 0; - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - camp::resources::Resource res = camp::resources::Sycl(); - q = res.get().get_queue(); - } // // Launch the kernels diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp index 3fe37efe0b..56e3a9aa1e 100644 --- a/include/RAJA/policy/sycl/kernel/internal.hpp +++ b/include/RAJA/policy/sycl/kernel/internal.hpp @@ -86,7 +86,7 @@ struct LaunchDims { return result; } - cl::sycl::nd_range<3> fit_nd_range() { + cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) { sycl_dim_3_t launch_global; @@ -95,14 +95,6 @@ struct LaunchDims { launch_local.y = std::max(launch_local.y, local.y); launch_local.z = std::max(launch_local.z, local.z); - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - camp::resources::Resource sycl_res = camp::resources::Sycl(); - q = sycl_res.get().get_queue(); - } - cl::sycl::device dev = q->get_device(); auto max_work_group_size = dev.get_info< ::cl::sycl::info::device::max_work_group_size>(); diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 9176444cd4..ad9fecc222 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -41,16 +41,8 @@ struct LaunchExecute> { BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers)) { - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - /*Get the concrete resource */ - resources::Sycl sycl_res = res.get(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); // // Compute the number of blocks and threads @@ -91,6 +83,8 @@ struct LaunchExecute> { }); + if (!async) { q->wait(); } + RAJA_FT_END; } @@ -105,10 +99,76 @@ struct LaunchExecute> { RAJA::expt::type_traits::is_ForallParamPack, concepts::negate>> 
exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name, - BODY_IN &&body_in, ReduceParams &&launch_reducers) + BODY_IN &&body_in, ReduceParams launch_reducers) { - RAJA_ABORT_OR_THROW("SYCL trivially copyable lambda backend currently not supported in RAJA launch"); + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); + + using EXEC_POL = RAJA::sycl_launch_t; + RAJA::expt::ParamMultiplexer::init(launch_reducers); + + // + // Compute the number of blocks and threads + // + const ::sycl::range<3> blockSize(launch_params.threads.value[2], + launch_params.threads.value[1], + launch_params.threads.value[0]); + + const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2], + launch_params.threads.value[1] * launch_params.teams.value[1], + launch_params.threads.value[0] * launch_params.teams.value[0]); + + // Only launch kernel if we have something to iterate over + constexpr size_t zero = 0; + if ( launch_params.threads.value[0] > zero && launch_params.threads.value[1] > zero && launch_params.threads.value[2] > zero && + launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) { + + + auto combiner = []( ReduceParams x, ReduceParams y ) { + RAJA::expt::ParamMultiplexer::combine( x, y ); + return x; + }; + + RAJA_FT_BEGIN; + + ReduceParams* res = ::sycl::malloc_shared(1,*q); + RAJA::expt::ParamMultiplexer::init(*res); + auto reduction = ::sycl::reduction(res, launch_reducers, combiner); + + q->submit([&](cl::sycl::handler& h) { + + auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + reduction, + [=] (cl::sycl::nd_item<3> itm, auto & red) { + + LaunchContext ctx; + ctx.itm = &itm; + + //Point to shared memory + ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + ReduceParams fp; + RAJA::expt::ParamMultiplexer::init(fp); + + RAJA::expt::invoke_body(fp, body_in, ctx); + + red.combine(fp); + + }); + + }).wait(); // Need to wait for completion to free memory + + RAJA::expt::ParamMultiplexer::combine( launch_reducers, *res ); + ::sycl::free(res, *q); + + RAJA_FT_END; + } + + RAJA::expt::ParamMultiplexer::resolve(launch_reducers); return resources::EventProxy(res); } @@ -123,16 +183,8 @@ struct LaunchExecute> { BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers)) { - cl::sycl::queue* q = ::RAJA::sycl::detail::getQueue(); - - /*Get the concrete resource */ - resources::Sycl sycl_res = res.get(); - - // Global resource was not set, use the resource that was passed to forall - // Determine if the default SYCL res is being used - if (!q) { - q = sycl_res.get_queue(); - } + /*Get the queue from concrete resource */ + ::sycl::queue* q = res.get().get_queue(); // // Compute the number of blocks and threads @@ -180,7 +232,9 @@ struct LaunchExecute> { }); - }); + }).wait(); // Need to wait for completion to free memory + + cl::sycl::free(lbody, *q); RAJA_FT_END; @@ -197,15 +251,90 @@ struct LaunchExecute> { RAJA::expt::type_traits::is_ForallParamPack, concepts::negate>> exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name, - BODY_IN &&body_in, ReduceParams &&launch_reducers) + BODY_IN &&body_in, ReduceParams launch_reducers) { - RAJA_ABORT_OR_THROW("SYCL non-trivially copyable lambda backend currently not supported in RAJA launch"); + /*Get the queue from concrete resource 
*/ + ::sycl::queue* q = res.get().get_queue(); + + using EXEC_POL = RAJA::sycl_launch_t; + RAJA::expt::ParamMultiplexer::init(launch_reducers); + + // + // Compute the number of blocks and threads + // + const ::sycl::range<3> blockSize(launch_params.threads.value[2], + launch_params.threads.value[1], + launch_params.threads.value[0]); + + const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2], + launch_params.threads.value[1] * launch_params.teams.value[1], + launch_params.threads.value[0] * launch_params.teams.value[0]); + + // Only launch kernel if we have something to iterate over + constexpr size_t zero = 0; + if ( launch_params.threads.value[0] > zero && launch_params.threads.value[1] > zero && launch_params.threads.value[2] > zero && + launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) { + + + auto combiner = []( ReduceParams x, ReduceParams y ) { + RAJA::expt::ParamMultiplexer::combine( x, y ); + return x; + }; + + RAJA_FT_BEGIN; + + // + // Kernel body is nontrivially copyable, create space on device and copy to + // Workaround until "is_device_copyable" is supported + // + using LOOP_BODY = camp::decay; + LOOP_BODY* lbody; + lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q); + q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait(); + + ReduceParams* res = ::sycl::malloc_shared(1,*q); + RAJA::expt::ParamMultiplexer::init(*res); + auto reduction = ::sycl::reduction(res, launch_reducers, combiner); + + q->submit([&](cl::sycl::handler& h) { + + auto s_vec = ::sycl::local_accessor (launch_params.shared_mem_size, h); + + h.parallel_for + (cl::sycl::nd_range<3>(gridSize, blockSize), + reduction, + [=] (cl::sycl::nd_item<3> itm, auto & red) { + + LaunchContext ctx; + ctx.itm = &itm; + + //Point to shared memory + ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get(); + + ReduceParams fp; + RAJA::expt::ParamMultiplexer::init(fp); + + RAJA::expt::invoke_body(fp, *lbody, ctx); + + red.combine(fp); + + }); + + }).wait(); // Need to wait for completion to free memory + + RAJA::expt::ParamMultiplexer::combine( launch_reducers, *res ); + ::sycl::free(res, *q); + cl::sycl::free(lbody, *q); + + RAJA_FT_END; + } + + RAJA::expt::ParamMultiplexer::resolve(launch_reducers); return resources::EventProxy(res); } - }; /* diff --git a/include/RAJA/policy/sycl/params/kernel_name.hpp b/include/RAJA/policy/sycl/params/kernel_name.hpp new file mode 100644 index 0000000000..1f33be19bb --- /dev/null +++ b/include/RAJA/policy/sycl/params/kernel_name.hpp @@ -0,0 +1,41 @@ +#ifndef SYCL_KERNELNAME_HPP +#define SYCL_KERNELNAME_HPP + +#include "RAJA/pattern/params/kernel_name.hpp" + +namespace RAJA { +namespace expt { +namespace detail { + +#if defined(RAJA_ENABLE_SYCL) + + // Init + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + init(KernelName&) + { + //TODO: Define kernel naming + } + + // Combine + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + SYCL_EXTERNAL + combine(KernelName&, T) {} + + // Resolve + template + camp::concepts::enable_if< type_traits::is_sycl_policy > + resolve(KernelName&) + { + //TODO: Define kernel naming + } + +#endif + +} // namespace detail +} // namespace expt +} // namespace RAJA + + +#endif // NEW_REDUCE_SYCL_REDUCE_HPP diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp index a2ab44e3f7..0f92fe27e1 100644 --- a/include/RAJA/policy/sycl/policy.hpp +++ 
b/include/RAJA/policy/sycl/policy.hpp @@ -22,7 +22,7 @@ #if defined(RAJA_SYCL_ACTIVE) -#include +#include "RAJA/util/sycl_compat.hpp" #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/policy/sequential/policy.hpp" @@ -96,7 +96,7 @@ template struct sycl_atomic_explicit{}; // -// Default cuda atomic policy uses cuda atomics on the device and non-atomics +// Default sycl atomic policy uses sycl atomics on the device and non-atomics // on the host // using sycl_atomic = sycl_atomic_explicit; diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp index 72cdbaeb6f..58cb83d295 100644 --- a/include/RAJA/policy/sycl/reduce.hpp +++ b/include/RAJA/policy/sycl/reduce.hpp @@ -73,7 +73,7 @@ struct maxloc // Ideally, MaxNumTeams = ThreadsPerTeam in omp_target_parallel_for_exec. static int MaxNumTeams = 1; -//! Information necessary for OpenMP offload to be considered +//! Information necessary for SYCL offload to be considered struct Offload_Info { int hostID{1}; @@ -88,7 +88,7 @@ struct Offload_Info } }; -//! Reduction data for OpenMP Offload -- stores value, host pointer, and device +//! Reduction data for SYCL Offload -- stores value, host pointer, and device //! pointer template struct Reduce_Data @@ -195,7 +195,7 @@ struct Reduce_Data } // end namespace sycl -//! OpenMP Target Reduction entity -- generalize on # of teams, reduction, and +//! SYCL Target Reduction entity -- generalize on # of teams, reduction, and //! type template struct TargetReduce @@ -285,7 +285,7 @@ struct TargetReduce T finalVal; }; -//! OpenMP Target Reduction Location entity -- generalize on # of teams, +//! SYCL Target Reduction Location entity -- generalize on # of teams, //! reduction, and type template struct TargetReduceLoc diff --git a/include/RAJA/util/EnableIf.hpp b/include/RAJA/util/EnableIf.hpp new file mode 100644 index 0000000000..257e852bf9 --- /dev/null +++ b/include/RAJA/util/EnableIf.hpp @@ -0,0 +1,57 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file for enable_if helpers. + * + * These type functions are used heavily by the atomic operators. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2024, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_EnableIf_HPP +#define RAJA_util_EnableIf_HPP + +#include "RAJA/config.hpp" + +#include + +#include "camp/list.hpp" +#include "camp/type_traits.hpp" + +#include "RAJA/util/concepts.hpp" + + +namespace RAJA +{ +namespace util +{ + + +template +struct is_any_of; + +template +struct is_any_of> + : ::RAJA::concepts::any_of<::camp::is_same...> +{}; + +template +using enable_if_is_any_of = std::enable_if_t::value, T>; + +template +using enable_if_is_none_of = std::enable_if_t<::RAJA::concepts::negate>::value, T>; + + +} // namespace util +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/util/OffsetOperators.hpp b/include/RAJA/util/OffsetOperators.hpp new file mode 100644 index 0000000000..150aaeee34 --- /dev/null +++ b/include/RAJA/util/OffsetOperators.hpp @@ -0,0 +1,88 @@ +/*! 
+ ****************************************************************************** + * + * \file + * + * \brief RAJA header file defining Simple Offset Calculators + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_OFFSETOPERATORS_HPP +#define RAJA_OFFSETOPERATORS_HPP + +#include "RAJA/config.hpp" + +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +template +struct GetOffsetLeft +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetLeft; + + template < size_t > + using rebunch = GetOffsetLeft; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& num_i, + Arg2 const& j, Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept + { + return i + j * num_i; + } +}; + +template +struct GetOffsetRight +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetRight; + + template < size_t > + using rebunch = GetOffsetRight; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i), + Arg2 const& j, Arg2 const& num_j) const noexcept + { + return i * num_j + j; + } +}; + +template +struct GetOffsetLeftBunched +{ + template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret> + using rebind = GetOffsetLeftBunched; + + template < size_t new_bunch_num_i > + using rebunch = GetOffsetLeftBunched; + + static constexpr Arg1 bunch_num_i{t_bunch_num_i}; + + RAJA_INLINE RAJA_HOST_DEVICE constexpr + Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i), + Arg2 const& j, Arg2 const& num_j) const noexcept + { + // assert(num_i >= bunch_num_i) + Arg1 i_inner = i % bunch_num_i; + Arg1 i_outer = i / bunch_num_i; + return i_inner + j * bunch_num_i + i_outer * num_j * bunch_num_i; + } +}; + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/RepeatView.hpp b/include/RAJA/util/RepeatView.hpp new file mode 100644 index 0000000000..618913f794 --- /dev/null +++ b/include/RAJA/util/RepeatView.hpp @@ -0,0 +1,141 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file for RAJA RepeatView constructs. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_REPEATVIEW_HPP +#define RAJA_REPEATVIEW_HPP + +#include +#include +#include + +#include "RAJA/util/macros.hpp" + +namespace RAJA +{ + +/*! + * @brief A view of a single object repeated a certain number of times. + * + * Creates a view or container object given an object and length. + * Allows use of container interface functions if you want to repeat a + * single object. 
+ * + * For example: + * + * // Create a repeat view object for the int 2 repeated int_len times + * RepeatView int_repeated(2, int_len); + * + * // Use with RAJA for_each + * RAJA::for_each(int_repeated, [&](int val) { + * sum += val; + * }); + * + * Based on the std::ranges::repeat_view template. + * Differs in that it does not support: + * compile time extents + * unbounded extents + * + */ +template < typename T > +struct RepeatView +{ + struct iterator + { + using difference_type = std::ptrdiff_t; + using value_type = T; + using reference = value_type const&; + + iterator() = default; + + constexpr iterator(const T* base, size_t index) + : m_value(base), m_index(index) + { } + + constexpr reference operator*() const noexcept { return *m_value; } + constexpr reference operator[](difference_type index) const noexcept { return *(*this + index); } + + constexpr iterator& operator++() { ++m_index; return *this; } + constexpr iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; } + + constexpr iterator& operator--() { --m_index; return *this; } + constexpr iterator operator--(int) { auto tmp = *this; --(*this); return tmp; } + + constexpr iterator& operator+=(difference_type rhs) { m_index += rhs; return *this; } + constexpr iterator& operator-=(difference_type rhs) { m_index -= rhs; return *this; } + + friend constexpr iterator operator+(iterator lhs, difference_type rhs) + { lhs += rhs; return lhs; } + friend constexpr iterator operator+(difference_type lhs, iterator rhs) + { rhs += lhs; return rhs; } + + friend constexpr iterator operator-(iterator lhs, difference_type rhs) + { lhs -= rhs; return lhs; } + friend constexpr difference_type operator-(iterator const& lhs, iterator const& rhs) + { return static_cast(lhs.m_index) - static_cast(rhs.m_index); } + + friend constexpr bool operator==(iterator const& lhs, iterator const& rhs) + { return lhs.m_index == rhs.m_index; } + friend constexpr bool operator!=(iterator const& lhs, iterator const& rhs) + { return !(lhs == rhs); } + + friend constexpr bool operator<(iterator const& lhs, iterator const& rhs) + { return lhs.m_index < rhs.m_index; } + friend constexpr bool operator<=(iterator const& lhs, iterator const& rhs) + { return !(rhs < lhs); } + friend constexpr bool operator>(iterator const& lhs, iterator const& rhs) + { return rhs < lhs; } + friend constexpr bool operator>=(iterator const& lhs, iterator const& rhs) + { return !(lhs < rhs); } + + private: + const T* m_value = nullptr; + size_t m_index = 0; + }; + + RepeatView() = delete; + + constexpr RepeatView(T const& value, size_t bound) + : m_bound(bound), m_value(value) + { } + + constexpr RepeatView(T&& value, size_t bound) + : m_bound(bound), m_value(std::move(value)) + { } + + constexpr T const& front() const { return m_value; } + constexpr T const& back() const { return m_value; } + constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const { return m_value; } + + constexpr iterator begin() const { return iterator(&m_value, 0); } + constexpr iterator cbegin() const { return iterator(&m_value, 0); } + + constexpr iterator end() const { return iterator(&m_value, m_bound); } + constexpr iterator cend() const { return iterator(&m_value, m_bound); } + + constexpr explicit operator bool() const { return m_bound != 0; } + constexpr bool empty() const { return m_bound == 0; } + + constexpr size_t size() const { return m_bound; } + +private: + size_t m_bound = 0; + T m_value; +}; + +} // end namespace RAJA + +#endif /* RAJA_REPEATVIEW_HPP */ diff --git 
a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp index 1486207712..5cdc019259 100644 --- a/include/RAJA/util/TypeConvert.hpp +++ b/include/RAJA/util/TypeConvert.hpp @@ -26,6 +26,8 @@ #include "RAJA/util/macros.hpp" +#include + namespace RAJA { @@ -37,17 +39,13 @@ namespace util * Reinterpret any datatype as another datatype of the same size */ template -RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &val) +RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &a) { - static_assert(sizeof(A) == sizeof(B), "A and B must be same size"); - return reinterpret_cast(val); -} + static_assert(sizeof(A) == sizeof(B), "A and B must be the same size"); -template -RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A volatile const &val) -{ - static_assert(sizeof(A) == sizeof(B), "A and B must be same size"); - return reinterpret_cast(val); + B b; + memcpy(&b, &a, sizeof(A)); + return b; } diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp index b279ec29ff..25783b2a0a 100644 --- a/include/RAJA/util/for_each.hpp +++ b/include/RAJA/util/for_each.hpp @@ -37,6 +37,7 @@ namespace detail { // runtime loop applying func to each element in the range in order +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) @@ -49,6 +50,7 @@ UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) } // compile time expansion applying func to a each type in the list in order +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, UnaryFunc func) @@ -60,6 +62,20 @@ UnaryFunc for_each_type(camp::list const&, UnaryFunc func) return func; } +// compile time expansion applying func to a each type in the tuple in order +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq) +{ + using camp::get; + // braced init lists are evaluated in order + int seq_unused_array[] = {0, (func(get(std::forward(t))), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + } // namespace detail @@ -68,6 +84,7 @@ UnaryFunc for_each_type(camp::list const&, UnaryFunc func) using a sequential for loop in O(N) operations and O(1) extra memory see https://en.cppreference.com/w/cpp/algorithm/for_each */ +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE concepts::enable_if_t> @@ -83,6 +100,7 @@ concepts::enable_if_t> \brief Apply func to each type in the given list in order using a compile-time expansion in O(N) operations and O(1) extra memory */ +RAJA_SUPPRESS_HD_WARN template RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) @@ -90,6 +108,19 @@ UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) return detail::for_each_type(c, std::move(func)); } +/*! 
+ \brief Apply func to each object in the given tuple or tuple like type in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +RAJA_SUPPRESS_HD_WARN +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func) +{ + return detail::for_each_tuple(std::forward(t), std::move(func), + camp::make_idx_seq_t>::value>{}); +} + } // namespace RAJA #endif diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index 55e90010d8..9ddb5bebb7 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -153,7 +153,8 @@ RAJA_HOST_DEVICE inline void RAJA_ABORT_OR_THROW(const char *str) { #if defined(__SYCL_DEVICE_ONLY__) - abort(); + //segfault here ran into linking problems + *((volatile char *)0) = 0; // write to address 0 #else printf ( "%s\n", str ); #if defined(RAJA_ENABLE_TARGET_OPENMP) && (_OPENMP >= 201511) diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp index 36c7cca1a0..66b0c9058c 100644 --- a/include/RAJA/util/math.hpp +++ b/include/RAJA/util/math.hpp @@ -70,6 +70,37 @@ constexpr T next_pow2(T n) noexcept return n; } +/*! + \brief "round down" to the largest power of 2 that is less than or equal to n + + For an integer n, + if n is negative, return 0 + else + if n is a power of 2, return n + else return the largest power of 2 that is less than n +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T prev_pow2(T n) noexcept +{ + if ( n < 0 ) return 0; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + return n - (n >> 1); +} + +/*! + \brief compute lhs mod rhs where lhs is non-negative and rhs is a power of 2 +*/ +template < typename L, typename R, + std::enable_if_t::value && std::is_integral::value>* = nullptr > +constexpr auto power_of_2_mod(L lhs, R rhs) noexcept +{ + return lhs & (rhs-R(1)); +} + } // namespace RAJA #endif diff --git a/include/RAJA/util/sycl_compat.hpp b/include/RAJA/util/sycl_compat.hpp new file mode 100644 index 0000000000..7754caa273 --- /dev/null +++ b/include/RAJA/util/sycl_compat.hpp @@ -0,0 +1,29 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file for handling different SYCL header include paths + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_sycl_compat_HPP +#define RAJA_util_sycl_compat_HPP + +#if (__INTEL_CLANG_COMPILER && __INTEL_CLANG_COMPILER < 20230000) +// older version, use legacy header locations +#include +#else +// SYCL 2020 standard header +#include +#endif + +#endif // RAJA_util_sycl_compat_HPP diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 7e331ef00e..310217bde5 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -956,6 +956,43 @@ struct AsIntegerArray } }; + +/*! + * \brief Assign a new value to an object and restore the object's previous + * value at the end of the current scope. 
+ */ +template +struct ScopedAssignment +{ + ScopedAssignment(T& val, T const& new_val) + : m_ref_to_val(val) + , m_prev_val(std::move(val)) + { + m_ref_to_val = new_val; + } + + ScopedAssignment(T& val, T&& new_val) + : m_ref_to_val(val) + , m_prev_val(std::move(val)) + { + m_ref_to_val = std::move(new_val); + } + + ScopedAssignment(ScopedAssignment const&) = delete; + ScopedAssignment(ScopedAssignment &&) = delete; + ScopedAssignment& operator=(ScopedAssignment const&) = delete; + ScopedAssignment& operator=(ScopedAssignment &&) = delete; + + ~ScopedAssignment() + { + m_ref_to_val = std::move(m_prev_val); + } + +private: + T& m_ref_to_val; + T m_prev_val; +}; + } // namespace detail } // namespace RAJA diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh index ec7c8a6408..15d27e64a0 100755 --- a/scripts/gitlab/build_and_test.sh +++ b/scripts/gitlab/build_and_test.sh @@ -26,6 +26,34 @@ spec=${SPEC:-""} module_list=${MODULE_LIST:-""} job_unique_id=${CI_JOB_ID:-""} use_dev_shm=${USE_DEV_SHM:-true} +spack_debug=${SPACK_DEBUG:-false} +debug_mode=${DEBUG_MODE:-false} + +# REGISTRY_TOKEN allows you to provide your own personal access token to the CI +# registry. Be sure to set the token with at least read access to the registry. +registry_token=${REGISTRY_TOKEN:-""} +ci_registry_user=${CI_REGISTRY_USER:-"${USER}"} +ci_registry_image=${CI_REGISTRY_IMAGE:-"czregistry.llnl.gov:5050/radiuss/raja"} +ci_registry_token=${CI_JOB_TOKEN:-"${registry_token}"} + +timed_message () +{ + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ $(date --rfc-3339=seconds) ~ ${1}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +} + +if [[ ${debug_mode} == true ]] +then + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Debug mode:" + echo "~~~~~ - Spack debug mode." + echo "~~~~~ - Deactivated shared memory." + echo "~~~~~ - Do not push to buildcache." + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + use_dev_shm=false + spack_debug=true +fi if [[ -n ${module_list} ]] then @@ -49,27 +77,33 @@ then fi prefix="${prefix}-${job_unique_id}" - mkdir -p ${prefix} else # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree. prefix="$(pwd)/../spack-and-build-root" - mkdir -p ${prefix} +fi + +echo "Creating directory ${prefix}" +echo "project_dir: ${project_dir}" + +mkdir -p ${prefix} + +spack_cmd="${prefix}/spack/bin/spack" +spack_env_path="${prefix}/spack_env" +uberenv_cmd="./scripts/uberenv/uberenv.py" +if [[ ${spack_debug} == true ]] +then + spack_cmd="${spack_cmd} --debug --stacktrace" + uberenv_cmd="${uberenv_cmd} --spack-debug" fi # Dependencies -date -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ Build and test started" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" if [[ "${option}" != "--build-only" && "${option}" != "--test-only" ]] then - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Building dependencies" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "Building dependencies" if [[ -z ${spec} ]] then - echo "SPEC is undefined, aborting..." + echo "[Error]: SPEC is undefined, aborting..." 
exit 1 fi @@ -83,15 +117,29 @@ then export SPACK_USER_CACHE_PATH="${spack_user_cache}" mkdir -p ${spack_user_cache} - ./scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + # generate cmake cache file with uberenv and radiuss spack package + timed_message "Spack setup and environment" + ${uberenv_cmd} --setup-and-env-only --spec="${spec}" ${prefix_opt} + + if [[ -n ${ci_registry_token} ]] + then + timed_message "GitLab registry as Spack Buildcache" + ${spack_cmd} -D ${spack_env_path} mirror add --unsigned --oci-username ${ci_registry_user} --oci-password ${ci_registry_token} gitlab_ci oci://${ci_registry_image} + fi + + timed_message "Spack build of dependencies" + ${uberenv_cmd} --skip-setup-and-env --spec="${spec}" ${prefix_opt} + if [[ -n ${ci_registry_token} && ${debug_mode} == false ]] + then + timed_message "Push dependencies to buildcache" + ${spack_cmd} -D ${spack_env_path} buildcache push --only dependencies gitlab_ci + fi + + timed_message "Dependencies built" fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Dependencies built" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -date -# Host config file +# Find cmake cache file (hostconfig) if [[ -z ${hostconfig} ]] then # If no host config file was provided, we assume it was generated. @@ -100,24 +148,24 @@ then if [[ ${#hostconfigs[@]} == 1 ]] then hostconfig_path=${hostconfigs[0]} - echo "Found host config file: ${hostconfig_path}" elif [[ ${#hostconfigs[@]} == 0 ]] then - echo "No result for: ${project_dir}/*.cmake" - echo "Spack generated host-config not found." + echo "[Error]: No result for: ${project_dir}/*.cmake" + echo "[Error]: Spack generated host-config not found." exit 1 else - echo "More than one result for: ${project_dir}/*.cmake" - echo "${hostconfigs[@]}" - echo "Please specify one with HOST_CONFIG variable" + echo "[Error]: More than one result for: ${project_dir}/*.cmake" + echo "[Error]: ${hostconfigs[@]}" + echo "[Error]: Please specify one with HOST_CONFIG variable" exit 1 fi else # Using provided host-config file. - hostconfig_path="${project_dir}/host-configs/${hostconfig}" + hostconfig_path="${project_dir}/${hostconfig}" fi hostconfig=$(basename ${hostconfig_path}) +echo "[Information]: Found hostconfig ${hostconfig_path}" # Build Directory # When using /dev/shm, we use prefix for both spack builds and source build, unless BUILD_ROOT was defined @@ -131,17 +179,15 @@ cmake_exe=`grep 'CMake executable' ${hostconfig_path} | cut -d ':' -f 2 | xargs` # Build if [[ "${option}" != "--deps-only" && "${option}" != "--test-only" ]] then - date echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Prefix: ${prefix}" echo "~~~~~ Host-config: ${hostconfig_path}" echo "~~~~~ Build Dir: ${build_dir}" echo "~~~~~ Project Dir: ${project_dir}" echo "~~~~~ Install Dir: ${install_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" echo "" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Building RAJA" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "Cleaning working directory" # Map CPU core allocations declare -A core_counts=(["lassen"]=40 ["ruby"]=28 ["poodle"]=28 ["corona"]=32 ["rzansel"]=48 ["tioga"]=32) @@ -153,8 +199,8 @@ then rm -rf ${build_dir} 2>/dev/null mkdir -p ${build_dir} && cd ${build_dir} - date - if [[ "${truehostname}" == "corona" || "${truehostname}" == "tioga" ]] + timed_message "Building RAJA" + if [[ "${truehostname}" == "tioga" ]] then module unload rocm fi @@ -164,28 +210,20 @@ then ${project_dir} if ! 
$cmake_exe --build . -j ${core_counts[$truehostname]} then - echo "[Error]: compilation failed, building with verbose output..." - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Running make VERBOSE=1" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "[Error]: Compilation failed, building with verbose output..." + timed_message "Re-building with --verbose" $cmake_exe --build . --verbose -j 1 else + timed_message "Installing" $cmake_exe --install . fi - date - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ RAJA built" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + timed_message "RAJA built and installed" fi # Test if [[ "${option}" != "--build-only" ]] && grep -q -i "ENABLE_TESTS.*ON" ${hostconfig_path} then - date - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ Testing RAJA" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" if [[ ! -d ${build_dir} ]] then @@ -194,9 +232,8 @@ then cd ${build_dir} - date - ctest --output-on-failure -T test 2>&1 | tee tests_output.txt - date + timed_message "Testing RAJA" + ctest --output-on-failure --no-compress-output -T test -VV 2>&1 | tee tests_output.txt no_test_str="No tests were found!!!" if [[ "$(tail -n 1 tests_output.txt)" == "${no_test_str}" ]] @@ -204,48 +241,40 @@ then echo "[Error]: No tests were found" && exit 1 fi - echo "Copying Testing xml reports for export" + timed_message "Preparing tests xml reports for export" tree Testing xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml mv junit.xml ${project_dir}/junit.xml if grep -q "Errors while running CTest" ./tests_output.txt then - echo "[Error]: failure(s) while running CTest" && exit 1 + echo "[Error]: Failure(s) while running CTest" && exit 1 fi if grep -q -i "ENABLE_HIP.*ON" ${hostconfig_path} then - echo "[Warning]: not testing install with HIP" + echo "[Warning]: Not testing install with HIP" else if [[ ! -d ${install_dir} ]] then - echo "[Error]: install directory not found : ${install_dir}" && exit 1 + echo "[Error]: Install directory not found : ${install_dir}" && exit 1 fi cd ${install_dir}/examples/RAJA/using-with-cmake mkdir build && cd build if ! $cmake_exe -C ../host-config.cmake ..; then - echo "[Error]: running $cmake_exe for using-with-cmake test" && exit 1 + echo "[Error]: Running $cmake_exe for using-with-cmake test" && exit 1 fi if ! 
make; then - echo "[Error]: running make for using-with-cmake test" && exit 1 + echo "[Error]: Running make for using-with-cmake test" && exit 1 fi fi - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - echo "~~~~~ RAJA tests complete" - echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - date + timed_message "RAJA tests completed" fi -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ CLEAN UP" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +timed_message "Cleaning up" make clean -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -echo "~~~~~ Build and test completed" -echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -date +timed_message "Build and test completed" diff --git a/scripts/lc-builds/blueos_nvcc_clang.sh b/scripts/lc-builds/blueos_nvcc_clang.sh index 4d55ef1b3a..ae8ded8431 100755 --- a/scripts/lc-builds/blueos_nvcc_clang.sh +++ b/scripts/lc-builds/blueos_nvcc_clang.sh @@ -44,6 +44,7 @@ cmake \ -C ../host-configs/lc-builds/blueos/nvcc_clang_X.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ + -DRAJA_ENABLE_NV_TOOLS_EXT=ON \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \ -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \ -DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \ diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh index 6e1bb2af75..f7342e474c 100755 --- a/scripts/lc-builds/toss4_hipcc.sh +++ b/scripts/lc-builds/toss4_hipcc.sh @@ -63,6 +63,7 @@ cmake \ -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \ -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \ -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \ + -DRAJA_ENABLE_ROCTX=ON \ -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index a8d22367e0..54c09b5dcf 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 +Subproject commit 54c09b5dcf45decaac2b1e6d1048671cde17f7e5 diff --git a/scripts/uberenv b/scripts/uberenv index cf91883ef0..205672b8b2 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 +Subproject commit 205672b8b2520d7dc69acefe8738960cd5db0937 diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp index d077e8af8f..85ead614d9 100644 --- a/src/MemUtils_CUDA.cpp +++ b/src/MemUtils_CUDA.cpp @@ -42,10 +42,10 @@ namespace detail // //! State of the host code globally -cudaInfo g_status; +cudaStatusInfo g_status; //! State of the host code in this thread -cudaInfo tl_status; +cudaStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif diff --git a/src/MemUtils_HIP.cpp b/src/MemUtils_HIP.cpp index bf44264132..97bd82775e 100644 --- a/src/MemUtils_HIP.cpp +++ b/src/MemUtils_HIP.cpp @@ -42,10 +42,10 @@ namespace detail // //! State of the host code globally -hipInfo g_status; +hipStatusInfo g_status; //! 
State of the host code in this thread -hipInfo tl_status; +hipStatusInfo tl_status; #if defined(RAJA_ENABLE_OPENMP) #pragma omp threadprivate(tl_status) #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4aa1294d07..8f8e65be8f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,4 +17,4 @@ add_subdirectory(old-tests) add_subdirectory(install) -configure_file(${CMAKE_SOURCE_DIR}/test/CTestCustom.cmake ${CMAKE_BINARY_DIR}) +configure_file(${PROJECT_SOURCE_DIR}/test/CTestCustom.cmake ${CMAKE_BINARY_DIR}) diff --git a/test/functional/forall/CMakeLists.txt b/test/functional/forall/CMakeLists.txt index eb9cc5ad19..435f0bbfcb 100644 --- a/test/functional/forall/CMakeLists.txt +++ b/test/functional/forall/CMakeLists.txt @@ -37,6 +37,8 @@ add_subdirectory(reduce-basic) add_subdirectory(reduce-multiple-segment) add_subdirectory(reduce-multiple-indexset) +add_subdirectory(multi-reduce-basic) + add_subdirectory(resource-indexset) add_subdirectory(resource-segment) diff --git a/test/functional/forall/atomic-basic/CMakeLists.txt b/test/functional/forall/atomic-basic/CMakeLists.txt index 9c2c12d76f..4c7973b0a3 100644 --- a/test/functional/forall/atomic-basic/CMakeLists.txt +++ b/test/functional/forall/atomic-basic/CMakeLists.txt @@ -11,7 +11,6 @@ # Note: FORALL_ATOMIC_BACKENDS is defined in ../CMakeLists.txt # foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) - # Signed Tests configure_file( test-forall-atomic-basic.cpp.in test-forall-atomic-basic-${ATOMIC_BACKEND}.cpp ) raja_add_test( NAME test-forall-atomic-basic-${ATOMIC_BACKEND} @@ -19,12 +18,4 @@ foreach( ATOMIC_BACKEND ${FORALL_ATOMIC_BACKENDS} ) target_include_directories(test-forall-atomic-basic-${ATOMIC_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) - # Unsigned Tests - configure_file( test-forall-atomic-basic-unsigned.cpp.in - test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.cpp ) - raja_add_test( NAME test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.cpp ) - - target_include_directories(test-forall-atomic-basic-unsigned-${ATOMIC_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp deleted file mode 100644 index e318c3847f..0000000000 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic-unsigned.hpp +++ /dev/null @@ -1,147 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// Header file containing basic functional tests for atomic operations with forall. 
-/// - -#ifndef __TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ -#define __TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ - -#include - -// segment multiplexer -template< typename IdxType, typename SegType > -struct RSMultiplexer {}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedRangeSegment > -{ - RAJA::TypedRangeSegment - makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) ) - { - return RAJA::TypedRangeSegment( 0, N ); - } -}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedRangeStrideSegment > -{ - RAJA::TypedRangeStrideSegment - makeseg( IdxType N, camp::resources::Resource RAJA_UNUSED_ARG(work_res) ) - { - return RAJA::TypedRangeStrideSegment( 0, N, 1 ); - } -}; - -template< typename IdxType > -struct RSMultiplexer < IdxType, RAJA::TypedListSegment > -{ - RAJA::TypedListSegment - makeseg( IdxType N, camp::resources::Resource work_res ) - { - std::vector temp(N); - std::iota( std::begin(temp), std::end(temp), 0 ); - return RAJA::TypedListSegment( &temp[0], static_cast(temp.size()), work_res ); - } -}; -// end segment multiplexer - -template -void ForallAtomicBasicUnsignedTestImpl( IdxType seglimit ) -{ - // initialize an array - const int len = 2; - - camp::resources::Resource work_res{WORKINGRES()}; - - SegmentType seg = - RSMultiplexer().makeseg(seglimit, work_res); - - T * work_array; - T * test_array; - T * check_array; - - allocateForallTestData( len, - work_res, - &work_array, - &check_array, - &test_array ); - - work_res.memcpy( work_array, test_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - test_array[0] = (T)0; - test_array[1] = (T)0; - - work_res.memcpy( work_array, test_array, sizeof(T) * len ); - - RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IdxType RAJA_UNUSED_ARG(i)) { - RAJA::atomicInc(work_array + 0, (T)16); - RAJA::atomicDec(work_array + 1, (T)16); - }); - - work_res.memcpy( check_array, work_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - EXPECT_EQ((T)4, check_array[0]); - EXPECT_EQ((T)13, check_array[1]); - - deallocateForallTestData( work_res, - work_array, - check_array, - test_array ); -} - -TYPED_TEST_SUITE_P(ForallAtomicBasicUnsignedTest); -template -class ForallAtomicBasicUnsignedTest : public ::testing::Test -{ -}; - -TYPED_TEST_P(ForallAtomicBasicUnsignedTest, AtomicBasicUnsignedForall) -{ - using AExec = typename camp::at>::type; - using APol = typename camp::at>::type; - using ResType = typename camp::at>::type; - using IdxType = typename camp::at>::type; - using DType = typename camp::at>::type; - - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); - ForallAtomicBasicUnsignedTestImpl, - DType>( 10000 ); -} - -REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicUnsignedTest, - AtomicBasicUnsignedForall); - -#endif //__TEST_FORALL_ATOMIC_BASIC_UNSIGNED_HPP__ diff --git a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp index ab2f0a89e7..a9e2c5a9f8 100644 --- a/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp +++ b/test/functional/forall/atomic-basic/tests/test-forall-atomic-basic.hpp @@ -61,7 +61,7 @@ template (0); + test_array[1] = static_cast(seglimit); + 
test_array[2] = static_cast(seglimit); + test_array[3] = static_cast(0); + test_array[4] = static_cast(0); + test_array[5] = static_cast(seglimit + 1); + test_array[6] = static_cast(seglimit); + test_array[7] = static_cast(0); + test_array[8] = static_cast(0); + test_array[9] = static_cast(0); + test_array[10] = static_cast(0); + test_array[11] = static_cast(0); + + work_res.memcpy(work_array, test_array, sizeof(T) * len); RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IdxType i) { - RAJA::atomicAdd(work_array + 0, (T)1); - RAJA::atomicSub(work_array + 1, (T)1); - RAJA::atomicMin(work_array + 2, (T)i); - RAJA::atomicMax(work_array + 3, (T)i); + RAJA::atomicAdd(work_array + 0, static_cast(1)); + RAJA::atomicSub(work_array + 1, static_cast(1)); + RAJA::atomicMin(work_array + 2, static_cast(i)); + RAJA::atomicMax(work_array + 3, static_cast(i)); RAJA::atomicInc(work_array + 4); RAJA::atomicDec(work_array + 5); - RAJA::atomicExchange(work_array + 6, (T)i); - RAJA::atomicCAS(work_array + 7, (T)i, (T)(i+1)); + RAJA::atomicExchange(work_array + 6, static_cast(i)); + RAJA::atomicCAS(work_array + 7, static_cast(i), static_cast(i+1)); + RAJA::atomicLoad(work_array + 8); + RAJA::atomicStore(work_array + 9, static_cast(1)); + RAJA::atomicInc(work_array + 10, static_cast(16)); + RAJA::atomicDec(work_array + 11, static_cast(16)); }); work_res.memcpy( check_array, work_array, sizeof(T) * len ); - -#if defined(RAJA_ENABLE_CUDA) - cudaErrchk(cudaDeviceSynchronize()); -#endif - -#if defined(RAJA_ENABLE_HIP) - hipErrchk(hipDeviceSynchronize()); -#endif - - EXPECT_EQ((T)seglimit, check_array[0]); - EXPECT_EQ((T)0, check_array[1]); - EXPECT_EQ((T)0, check_array[2]); - EXPECT_EQ((T)seglimit - 1, check_array[3]); - EXPECT_EQ((T)seglimit, check_array[4]); - EXPECT_EQ((T)1, check_array[5]); - EXPECT_LE((T)0, check_array[6]); - EXPECT_GT((T)seglimit, check_array[6]); - EXPECT_LT((T)0, check_array[7]); - EXPECT_GE((T)seglimit, check_array[7]); - - deallocateForallTestData( work_res, - work_array, - check_array, - test_array ); + work_res.wait(); + + EXPECT_EQ(static_cast(seglimit), check_array[0]); + EXPECT_EQ(static_cast(0), check_array[1]); + EXPECT_EQ(static_cast(0), check_array[2]); + EXPECT_EQ(static_cast(seglimit - 1), check_array[3]); + EXPECT_EQ(static_cast(seglimit), check_array[4]); + EXPECT_EQ(static_cast(1), check_array[5]); + EXPECT_LE(static_cast(0), check_array[6]); + EXPECT_GT(static_cast(seglimit), check_array[6]); + EXPECT_LT(static_cast(0), check_array[7]); + EXPECT_GE(static_cast(seglimit), check_array[7]); + EXPECT_EQ(static_cast(0), check_array[8]); + EXPECT_EQ(static_cast(1), check_array[9]); + EXPECT_EQ(static_cast(4), check_array[10]); + EXPECT_EQ(static_cast(13), check_array[11]); + + deallocateForallTestData(work_res, + work_array, + check_array, + test_array); } TYPED_TEST_SUITE_P(ForallAtomicBasicTest); @@ -154,13 +149,13 @@ TYPED_TEST_P(ForallAtomicBasicTest, AtomicBasicForall) ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); ForallAtomicBasicTestImpl, - DType>( 10000 ); + DType>(10000); } REGISTER_TYPED_TEST_SUITE_P(ForallAtomicBasicTest, diff --git a/test/functional/forall/multi-reduce-basic/CMakeLists.txt b/test/functional/forall/multi-reduce-basic/CMakeLists.txt new file mode 100644 index 0000000000..31ec872c0f --- /dev/null +++ b/test/functional/forall/multi-reduce-basic/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, 
Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM FORALL_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM FORALL_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: FORALL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${FORALL_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-forall-basic-multi-reduce.cpp.in + test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. +# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-forall-basic-multi-reduce.cpp.in +# test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-forall-basic-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in b/test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in similarity index 51% rename from test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in rename to test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in index 5c4ef05e5d..cd03109a9c 100644 --- a/test/functional/forall/atomic-basic/test-forall-atomic-basic-unsigned.cpp.in +++ b/test/functional/forall/multi-reduce-basic/test-forall-basic-multi-reduce.cpp.in @@ -12,21 +12,29 @@ #include "RAJA_test-camp.hpp" #include "RAJA_test-index-types.hpp" -#include "RAJA_test-atomic-types.hpp" -#include "RAJA_test-atomicpol.hpp" - -#include "RAJA_test-forall-execpol.hpp" #include "RAJA_test-forall-data.hpp" +#include "RAJA_test-forall-execpol.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + // // Header for tests in ./tests directory // // Note: CMake adds ./tests as an include dir for these tests. // -#include "test-forall-atomic-basic-unsigned.hpp" +#include "test-forall-basic-MultiReduce.hpp" // -// These tests exercise only one index type. 
We parameterize here to +// Data types for core reduction basic tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. We parameterize here to // make it easier to expand types in the future if needed. // using TestIdxTypeList = camp::list< RAJA::Index_type >; @@ -34,17 +42,17 @@ using TestIdxTypeList = camp::list< RAJA::Index_type >; // // Cartesian product of types used in parameterized tests // -using @ATOMIC_BACKEND@ForallAtomicBasicUnsignedTypes = - Test< camp::cartesian_product<@ATOMIC_BACKEND@ForallAtomicExecPols, - @ATOMIC_BACKEND@AtomicPols, - @ATOMIC_BACKEND@ResourceList, - TestIdxTypeList, - AtomicDataUnsignedTypeList > >::Types; +using @BACKEND@ForallMultiReduceBasicTypes = + Test< camp::cartesian_product>::Types; // // Instantiate parameterized test // - -INSTANTIATE_TYPED_TEST_SUITE_P(@ATOMIC_BACKEND@, - ForallAtomicBasicUnsignedTest, - @ATOMIC_BACKEND@ForallAtomicBasicUnsignedTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + ForallMultiReduceBasicTest, + @BACKEND@ForallMultiReduceBasicTypes); diff --git a/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp new file mode 100644 index 0000000000..7c187464e8 --- /dev/null +++ b/test/functional/forall/multi-reduce-basic/tests/test-forall-basic-MultiReduce.hpp @@ -0,0 +1,299 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_FORALL_BASIC_REDUCESUM_HPP__ +#define __TEST_FORALL_BASIC_REDUCESUM_HPP__ + +#include +#include +#include +#include +#include +#include + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +ForallMultiReduceBasicTestImpl(const SEG_TYPE&, + const Container&, + const std::vector&, + camp::resources::Resource, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +ForallMultiReduceBasicTestImpl(const SEG_TYPE& seg, + const Container& multi_init, + const std::vector& seg_idx, + camp::resources::Resource working_res, + RandomGenerator& rngen) +{ + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + const IDX_TYPE idx_range = seg_idx[seg_idx.size() - 1] + 1; + const IDX_TYPE idx_len = static_cast( seg_idx.size() ); + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE i = 0; i < idx_len; ++i) { + IDX_TYPE idx = seg_idx[i]; + test_range[idx] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[idx+1] = data_len; + } + + 
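// At this point test_range holds CSR-style offsets: for each index idx visited by
// the segment, the half-open slice [test_range[idx], test_range[idx+1]) of the data
// and bin arrays belongs to that iterate (a random amount of work, between 0 and
// num_bins entries per iterate). Indices the segment never visits keep the
// ~IDX_TYPE(0) sentinel and are not read by the kernels below.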
allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + RAJA::forall(seg, [=] RAJA_HOST_DEVICE(IDX_TYPE ii) { + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + 
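// Note on the consistency block above: the std::conditional_t only keeps the
// integral instantiations of this function compilable; the re-randomization with
// real-valued data is reached solely for floating-point DATA_TYPEs, where the
// combine order can change the result and run-to-run consistency is worth checking.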
deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest); +template +class ForallMultiReduceBasicTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(ForallMultiReduceBasicTest, MultiReduceBasicForall) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + camp::resources::Resource working_res{WORKING_RES::get_default()}; + + std::vector seg_idx; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + RAJA::TypedRangeSegment r1( 0, 28 ); + RAJA::getIndices(seg_idx, r1); + ForallMultiReduceBasicTestImpl( + r1, container, seg_idx, working_res, rngen); + + seg_idx.clear(); + RAJA::TypedRangeSegment r3( 3, 2060 ); + RAJA::getIndices(seg_idx, r3); + ForallMultiReduceBasicTestImpl( + r3, container, seg_idx, working_res, rngen); + + // Range-stride segment test + seg_idx.clear(); + RAJA::TypedRangeStrideSegment r5( 3, 1029, 3 ); + RAJA::getIndices(seg_idx, r5); + ForallMultiReduceBasicTestImpl( + r5, container, seg_idx, working_res, rngen); + + // List segment test + seg_idx.clear(); + IDX_TYPE last = 10567; + std::uniform_int_distribution dist(0, last-1); + for (IDX_TYPE i = 0; i < last; ++i) { + IDX_TYPE randval = dist(rngen); + if ( i < randval ) { + seg_idx.push_back(i); + } + } + RAJA::TypedListSegment l1( &seg_idx[0], seg_idx.size(), + working_res ); + ForallMultiReduceBasicTestImpl( + l1, container, seg_idx, working_res, rngen); + } +} + +REGISTER_TYPED_TEST_SUITE_P(ForallMultiReduceBasicTest, + MultiReduceBasicForall); + +#endif // __TEST_FORALL_BASIC_REDUCESUM_HPP__ diff --git a/test/functional/forall/reduce-basic/CMakeLists.txt b/test/functional/forall/reduce-basic/CMakeLists.txt index 2537ea1ff9..42f03f04c8 100644 --- a/test/functional/forall/reduce-basic/CMakeLists.txt +++ b/test/functional/forall/reduce-basic/CMakeLists.txt @@ -22,16 +22,6 @@ if(RAJA_ENABLE_TARGET_OPENMP) endif() endif() -# -# If building SYCL tests, remove the back-end from -# from the list of tests to generate here for the -# expt-reduce tests. 
-# -if(RAJA_ENABLE_SYCL) - list(REMOVE_ITEM REDUCETYPES ReduceMaxLoc) - list(REMOVE_ITEM REDUCETYPES ReduceMinLoc) -endif() - # # Generate core reduction tests for each enabled RAJA back-end diff --git a/test/functional/kernel/CMakeLists.txt b/test/functional/kernel/CMakeLists.txt index cd577a45b4..76771724c9 100644 --- a/test/functional/kernel/CMakeLists.txt +++ b/test/functional/kernel/CMakeLists.txt @@ -37,6 +37,8 @@ add_subdirectory(conditional-fission-fusion-loop) add_subdirectory(hyperplane) +add_subdirectory(multi-reduce-nested) + add_subdirectory(nested-loop) add_subdirectory(nested-loop-reducesum) diff --git a/test/functional/kernel/hyperplane/CMakeLists.txt b/test/functional/kernel/hyperplane/CMakeLists.txt index 2e74129160..c01c9c2231 100644 --- a/test/functional/kernel/hyperplane/CMakeLists.txt +++ b/test/functional/kernel/hyperplane/CMakeLists.txt @@ -13,7 +13,7 @@ set(TESTTYPES 2D 3D) foreach( BACKEND ${KERNEL_BACKENDS} ) foreach( TEST_TYPE ${TESTTYPES} ) # Removing Sycl backend, implementation of Hyperplane does not exist - if( NOT ((BACKEND STREQUAL "Sycl")) ) + if( NOT ((BACKEND STREQUAL "Sycl")) AND NOT ((BACKEND STREQUAL "OpenMPTarget")) ) configure_file( test-kernel-hyperplane-${TEST_TYPE}.cpp.in test-kernel-hyperplane-${TEST_TYPE}-${BACKEND}.cpp ) raja_add_test( NAME test-kernel-hyperplane-${TEST_TYPE}-${BACKEND} diff --git a/test/functional/kernel/multi-reduce-nested/CMakeLists.txt b/test/functional/kernel/multi-reduce-nested/CMakeLists.txt new file mode 100644 index 0000000000..9efda7d133 --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM KERNEL_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM KERNEL_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${KERNEL_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-kernel-nested-multi-reduce.cpp.in + test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. 
+# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-kernel-nested-multi-reduce.cpp.in +# test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-kernel-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in b/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in new file mode 100644 index 0000000000..6816bb6ad7 --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/test-kernel-nested-multi-reduce.cpp.in @@ -0,0 +1,123 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-kernel-nested-loop-types.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-kernel-nested-MultiReduce.hpp" + +// +// Data types for core reduction nested tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. We parameterize here to +// make it easier to expand types in the future if needed. 
+// +using TestIdxTypeList = camp::list< RAJA::Index_type >; + + +using SequentialKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData + >; + +#if defined(RAJA_ENABLE_OPENMP) + +using OpenMPKernelNestedLoopExecPols = camp::list< + + // Collapse Exec Pols + NestedLoopData, + + // Depth 3 Exec Pols + NestedLoopData + >; + +#endif // RAJA_ENABLE_OPENMP + + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +using OpenMPTargetKernelNestedLoopExecPols = camp::list< + + // Collapse Exec Pols + NestedLoopData, + + // Depth 3 Exec Pols + NestedLoopData, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_TARGET_OPENMP + + +#if defined(RAJA_ENABLE_CUDA) +using CudaKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, RAJA::cuda_global_size_y_direct<16>, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_CUDA + +#if defined(RAJA_ENABLE_HIP) + +using HipKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, RAJA::hip_global_size_y_direct<8>, RAJA::seq_exec > + >; + +#endif // RAJA_ENABLE_HIP + +#if defined(RAJA_ENABLE_SYCL) + +using SyclKernelNestedLoopExecPols = camp::list< + + // Depth 3 Exec Pols + NestedLoopData, + NestedLoopData + >; + +#endif // RAJA_ENABLE_SYCL + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@KernelMultiReduceNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + KernelMultiReduceNestedTest, + @BACKEND@KernelMultiReduceNestedTypes); diff --git a/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp new file mode 100644 index 0000000000..30c102684b --- /dev/null +++ b/test/functional/kernel/multi-reduce-nested/tests/test-kernel-nested-MultiReduce.hpp @@ -0,0 +1,361 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ +#define __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ + +#include +#include +#include +#include +#include +#include + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE&, + const Container&, + WORKING_RES, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +KernelMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments, + const Container& multi_init, + WORKING_RES working_res, + RandomGenerator& rngen) +{ + using RAJA::get; + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE dimi = begin_si[distance_si-1] + 1; + IDX_TYPE dimj = begin_sj[distance_sj-1] + 1; + IDX_TYPE dimk = begin_sk[distance_sk-1] + 1; + + const IDX_TYPE idx_range = dimi * dimj * dimk; + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE k : sk) { + for (IDX_TYPE j : sj) { + for (IDX_TYPE i : si) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + test_range[ii] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[ii+1] = data_len; + } + } + } + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for 
(IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + RAJA::kernel_resource(segments, working_res, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(KernelMultiReduceNestedTest); +template +class KernelMultiReduceNestedTest : public ::testing::Test +{ +}; + +// +// +// Defining the Kernel Loop structure for MultiReduce Nested Loop Tests. 
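// For orientation, the depth-3 specialization defined below produces a kernel
// policy of this general shape (sketched here with sequential execution at every
// level; the actual per-level execution policies come from the NestedLoopData
// entries in the test .cpp.in sources):
//
//   using SeqDepth3Policy = RAJA::KernelPolicy<
//     RAJA::statement::For<0, RAJA::seq_exec,        // k
//       RAJA::statement::For<1, RAJA::seq_exec,      // j
//         RAJA::statement::For<2, RAJA::seq_exec,    // i
//           RAJA::statement::Lambda<0>
//         >
//       >
//     >
//   >;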
+// +// +template +struct MultiReduceNestedLoopExec; + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::For<0, typename camp::at>::type, + RAJA::statement::For<1, typename camp::at>::type, + RAJA::statement::For<2, typename camp::at>::type, + RAJA::statement::Lambda<0> + > + > + > + >; +}; + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::Collapse< typename camp::at>::type, + RAJA::ArgList<0,1,2>, + RAJA::statement::Lambda<0> + > + >; +}; + +#if defined(RAJA_ENABLE_CUDA) or defined(RAJA_ENABLE_HIP) or defined(RAJA_ENABLE_SYCL) + +template +struct MultiReduceNestedLoopExec { + using type = + RAJA::KernelPolicy< + RAJA::statement::DEVICE_KERNEL< + RAJA::statement::For<0, typename camp::at>::type, + RAJA::statement::For<1, typename camp::at>::type, + RAJA::statement::For<2, typename camp::at>::type, + RAJA::statement::Lambda<0> + > + > + > + > // end DEVICE_KERNEL + >; +}; + +#endif // RAJA_ENABLE_CUDA or RAJA_ENABLE_HIP or RAJA_ENABLE_SYCL + +TYPED_TEST_P(KernelMultiReduceNestedTest, MultiReduceNestedKernel) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POL_DATA = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + using LOOP_TYPE = typename EXEC_POL_DATA::LoopType; + using LOOP_POLS = typename EXEC_POL_DATA::type; + using EXEC_POLICY = typename MultiReduceNestedLoopExec::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + WORKING_RES working_res{WORKING_RES::get_default()}; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment( 0, 2 ), + RAJA::TypedRangeSegment( 0, 7 ), + RAJA::TypedRangeSegment( 0, 3 )); + KernelMultiReduceNestedTestImpl( + s1, container, working_res, rngen); + + auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment( 2, 35 ), + RAJA::TypedRangeSegment( 0, 19 ), + RAJA::TypedRangeSegment( 3, 13 )); + KernelMultiReduceNestedTestImpl( + s2, container, working_res, rngen); + + // Range-stride segment tests + auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment( 0, 6, 2 ), + RAJA::TypedRangeStrideSegment( 1, 38, 3 ), + RAJA::TypedRangeStrideSegment( 5, 17, 1 )); + KernelMultiReduceNestedTestImpl( + s3, container, working_res, rngen); + + } +} + +REGISTER_TYPED_TEST_SUITE_P(KernelMultiReduceNestedTest, + MultiReduceNestedKernel); + +#endif // __TEST_KERNEL_NESTED_MULTIREDUCE_HPP__ diff --git a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt b/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt index 482a78297d..adc04fb1bc 100644 --- a/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt +++ b/test/functional/kernel/single-loop-tile-icount-tcount/CMakeLists.txt @@ -17,16 +17,19 @@ set(TILESIZES 8 32) # Note: KERNEL_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${KERNEL_BACKENDS} ) - foreach( TESTTYPE ${TESTTYPES} ) - foreach( TILESIZE ${TILESIZES} ) - configure_file( 
test-kernel-single-loop-tile-count.cpp.in - test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - raja_add_test( NAME test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) - target_include_directories(test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + # using omp target crashes the compiler with this one + if( NOT ((BACKEND STREQUAL "OpenMPTarget")) ) + foreach( TESTTYPE ${TESTTYPES} ) + foreach( TILESIZE ${TILESIZES} ) + configure_file( test-kernel-single-loop-tile-count.cpp.in + test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) + raja_add_test( NAME test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.cpp ) + target_include_directories(test-kernel-single-loop-${TESTTYPE}-${TILESIZE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() endforeach() - endforeach() + endif() endforeach() unset( TILESIZES ) diff --git a/test/functional/kernel/tile-variants/CMakeLists.txt b/test/functional/kernel/tile-variants/CMakeLists.txt index 02d5aa2fd2..ac5ba913e9 100644 --- a/test/functional/kernel/tile-variants/CMakeLists.txt +++ b/test/functional/kernel/tile-variants/CMakeLists.txt @@ -12,13 +12,16 @@ set(TILETYPES Fixed2D Fixed2DSum Fixed2DMinMax) foreach( TILE_BACKEND ${KERNEL_BACKENDS} ) foreach( TILE_TYPE ${TILETYPES} ) - configure_file( test-kernel-tilefixed.cpp.in - test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) - raja_add_test( NAME test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) + # OpenMPTarget crashes the xl compiler when building this test... + if( NOT((TILE_BACKEND STREQUAL "OpenMPTarget")) ) + configure_file( test-kernel-tilefixed.cpp.in + test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) + raja_add_test( NAME test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.cpp ) - target_include_directories(test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.exe - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + target_include_directories(test-kernel-tile-${TILE_TYPE}-${TILE_BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endif() endforeach() endforeach() diff --git a/test/functional/launch/CMakeLists.txt b/test/functional/launch/CMakeLists.txt index 3e83383833..a8fcdfd8ce 100644 --- a/test/functional/launch/CMakeLists.txt +++ b/test/functional/launch/CMakeLists.txt @@ -26,6 +26,8 @@ endif() add_subdirectory(run-time-switch) #Adapted from forall test +add_subdirectory(multi-reduce-nested) + add_subdirectory(reduce-basic) add_subdirectory(reduce-params) diff --git a/test/functional/launch/multi-reduce-nested/CMakeLists.txt b/test/functional/launch/multi-reduce-nested/CMakeLists.txt new file mode 100644 index 0000000000..f5a916344d --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/CMakeLists.txt @@ -0,0 +1,73 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# List of core reduction types for generating test files. +# +set(REDUCETYPES Sum Min Max BitAnd BitOr) + +# +# If building openmp target tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_TARGET_OPENMP) + #if(RAJA_TEST_OPENMP_TARGET_SUBSET) + list(REMOVE_ITEM LAUNCH_BACKENDS OpenMPTarget) + #endif() +endif() + +# +# If building SYCL tests, remove the back-end to +# from the list of tests to generate here. +# +if(RAJA_ENABLE_SYCL) + list(REMOVE_ITEM LAUNCH_BACKENDS Sycl) +endif() + +# +# Generate core reduction tests for each enabled RAJA back-end +# +# Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt +# +foreach( BACKEND ${LAUNCH_BACKENDS} ) + foreach( REDUCETYPE ${REDUCETYPES} ) + configure_file( test-launch-nested-multi-reduce.cpp.in + test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + raja_add_test( NAME test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) + + target_include_directories(test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) + endforeach() +endforeach() + +unset( REDUCETYPES ) + + +# +# If building a subset of openmp target tests, add tests to build here. +# +#if(RAJA_ENABLE_TARGET_OPENMP) +# if(RAJA_TEST_OPENMP_TARGET_SUBSET) +# +# set(BACKEND OpenMPTarget) +# set(REDUCETYPES ReduceSum) +# +# foreach( REDUCETYPE ${REDUCETYPES} ) +# configure_file( test-launch-nested-multi-reduce.cpp.in +# test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# raja_add_test( NAME test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND} +# SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.cpp ) +# +# target_include_directories(test-launch-nested-MultiReduce${REDUCETYPE}-${BACKEND}.exe +# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) +# endforeach() +# +# endif() +#endif() + +unset( REDUCETYPES ) diff --git a/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in b/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in new file mode 100644 index 0000000000..df097a896f --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/test-launch-nested-multi-reduce.cpp.in @@ -0,0 +1,58 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-index-types.hpp" + +#include "RAJA_test-forall-data.hpp" +#include "RAJA_test-launch-direct-teams-threads-3D-execpol.hpp" +#include "RAJA_test-multi-reducepol.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-launch-nested-MultiReduce.hpp" + +// +// Data types for core reduction nested tests +// +using ReductionDataTypeList = camp::list< int, + float, + double >; + + +// +// These tests exercise only one index type. 
We parameterize here to +// make it easier to expand types in the future if needed. +// +using TestIdxTypeList = camp::list< RAJA::Index_type >; + +// +// Cartesian product of types used in parameterized tests +// +using @BACKEND@LaunchMultiReduceNestedTypes = + Test< camp::cartesian_product>::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + LaunchMultiReduceNestedTest, + @BACKEND@LaunchMultiReduceNestedTypes); diff --git a/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp new file mode 100644 index 0000000000..867b826df3 --- /dev/null +++ b/test/functional/launch/multi-reduce-nested/tests/test-launch-nested-MultiReduce.hpp @@ -0,0 +1,375 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ +#define __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ + +#include +#include +#include +#include +#include +#include + + +// +// +// Defining the Launch Loop structure for MultiReduce Nested Loop Tests. +// +// +template +void Launch(const SEGMENTS_TYPE& segments, + Lambda&& lambda) +{ + using RAJA::get; + + using LAUNCH_POLICY = typename camp::at>::type; + + using TEAM_Z_POLICY = typename camp::at>::type; + using TEAM_Y_POLICY = typename camp::at>::type; + using TEAM_X_POLICY = typename camp::at>::type; + + using THREAD_Z_POLICY = typename camp::at>::type; + using THREAD_Y_POLICY = typename camp::at>::type; + using THREAD_X_POLICY = typename camp::at>::type; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE threads_i = 16; + IDX_TYPE threads_j = 4; + IDX_TYPE threads_k = 4; + + IDX_TYPE blocks_i = RAJA_DIVIDE_CEILING_INT(distance_si, threads_i); + IDX_TYPE blocks_j = RAJA_DIVIDE_CEILING_INT(distance_sj, threads_j); + IDX_TYPE blocks_k = RAJA_DIVIDE_CEILING_INT(distance_sk, threads_k); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(blocks_i, blocks_j, blocks_k), + RAJA::Threads(threads_i, threads_j,threads_k)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_k), [&](IDX_TYPE bk) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_j), [&](IDX_TYPE bj) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, blocks_i), [&](IDX_TYPE bi) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_k), [&](IDX_TYPE tk) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_j), [&](IDX_TYPE tj) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, threads_i), [&](IDX_TYPE ti) { + + IDX_TYPE i = ti + threads_i * bi; + IDX_TYPE j = tj + threads_j * bj; + IDX_TYPE k = tk + threads_k * bk; + + if (i < distance_si && j < distance_sj && k < distance_sk) { + lambda(begin_sk[k], begin_sj[j], begin_si[i]); + } + }); + }); + }); + + }); + }); + }); + + }); +} + +template +// use enable_if in return type to appease nvcc 11.2 +// add bool return type to disambiguate signatures of these functions for MSVC +std::enable_if_t(), bool> +LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE&, + const Container&, + 
WORKING_RES, + RandomGenerator&) +{ return false; } +/// +template +// use enable_if in return type to appease nvcc 11.2 +std::enable_if_t()> +LaunchMultiReduceNestedTestImpl(const SEGMENTS_TYPE& segments, + const Container& multi_init, + WORKING_RES working_res, + RandomGenerator& rngen) +{ + using RAJA::get; + using MULTIREDUCER = typename ABSTRACTION::template multi_reducer; + + auto si = get<2>(segments); + auto sj = get<1>(segments); + auto sk = get<0>(segments); + + RAJA_EXTRACT_BED_SUFFIXED(si, _si); + RAJA_EXTRACT_BED_SUFFIXED(sj, _sj); + RAJA_EXTRACT_BED_SUFFIXED(sk, _sk); + + IDX_TYPE dimi = begin_si[distance_si-1] + 1; + IDX_TYPE dimj = begin_sj[distance_sj-1] + 1; + IDX_TYPE dimk = begin_sk[distance_sk-1] + 1; + + const IDX_TYPE idx_range = dimi * dimj * dimk; + + const int modval = 100; + const size_t num_bins = multi_init.size(); + + IDX_TYPE* working_range; + IDX_TYPE* check_range; + IDX_TYPE* test_range; + + DATA_TYPE* working_array; + DATA_TYPE* check_array; + DATA_TYPE* test_array; + + IDX_TYPE* working_bins; + IDX_TYPE* check_bins; + IDX_TYPE* test_bins; + + IDX_TYPE data_len = 0; + + allocateForallTestData(idx_range+1, + working_res, + &working_range, + &check_range, + &test_range); + + for (IDX_TYPE i = 0; i < idx_range+1; ++i) { + test_range[i] = ~IDX_TYPE(0); + } + + std::uniform_int_distribution work_per_iterate_distribution(0, num_bins); + + for (IDX_TYPE k : sk) { + for (IDX_TYPE j : sj) { + for (IDX_TYPE i : si) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + test_range[ii] = data_len; + data_len += work_per_iterate_distribution(rngen); + test_range[ii+1] = data_len; + } + } + } + + allocateForallTestData(data_len, + working_res, + &working_array, + &check_array, + &test_array); + + allocateForallTestData(data_len, + working_res, + &working_bins, + &check_bins, + &test_bins); + + // use ints to initialize array here to avoid floating point precision issues + std::uniform_int_distribution array_int_distribution(0, modval-1); + std::uniform_int_distribution bin_distribution(0, num_bins-1); + + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_int_distribution(rngen)); + + // this may use the same bin multiple times per iterate + test_bins[i] = bin_distribution(rngen); + } + + working_res.memcpy(working_range, test_range, sizeof(IDX_TYPE) * (idx_range+1)); + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + working_res.memcpy(working_bins, test_bins, sizeof(IDX_TYPE) * data_len); + + + MULTIREDUCER red(num_bins); + MULTIREDUCER red2(multi_init); + + // basic test with two multi reducers in the same loop + { + std::vector ref_vals(num_bins, ABSTRACTION::identity(red)); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + ABSTRACTION::reduce(red2[working_bins[idx]], working_array[idx]); + } + }); + + size_t bin = 0; + for (auto init_val : multi_init) { + ASSERT_EQ(DATA_TYPE(red[bin].get()), ref_vals[bin]); + ASSERT_EQ(red2.get(bin), ABSTRACTION::combine(ref_vals[bin], init_val)); + ++bin; + } + } + + + red.reset(); + + // basic multiple use test, ensure same reducer can combine values from multiple loops + { + std::vector ref_vals(num_bins, 
ABSTRACTION::identity(red)); + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + + for (IDX_TYPE i = 0; i < data_len; ++i) { + ref_vals[test_bins[i]] = ABSTRACTION::combine(ref_vals[test_bins[i]], test_array[i]); + } + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + } + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(static_cast(red[bin].get()), ref_vals[bin]); + } + } + + + // test the consistency of answers, if we expect them to be consistent + if (ABSTRACTION::consistent(red)) { + + if /* constexpr */ (std::is_floating_point::value) { + + // use floating point values to accentuate floating point precision issues + std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution> array_flt_distribution(0, modval-1); + + for (IDX_TYPE i = 0; i < data_len; ++i) { + test_array[i] = DATA_TYPE(array_flt_distribution(rngen)); + } + working_res.memcpy(working_array, test_array, sizeof(DATA_TYPE) * data_len); + } + + + std::vector ref_vals; + bool got_ref_vals = false; + + const int nloops = 2; + for (int j = 0; j < nloops; ++j) { + red.reset(); + + Launch(segments, + [=] RAJA_HOST_DEVICE (IDX_TYPE k, IDX_TYPE j, IDX_TYPE i) { + IDX_TYPE ii = (dimi * dimj * k) + (dimi * j) + i; + for (IDX_TYPE idx = working_range[ii]; idx < working_range[ii+1]; ++idx) { + ABSTRACTION::reduce(red[working_bins[idx]], working_array[idx]); + } + }); + + if (!got_ref_vals) { + ref_vals.resize(num_bins); + red.get_all(ref_vals); + got_ref_vals = true; + } else { + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(red.get(bin), ref_vals[bin]); + } + } + } + } + + + deallocateForallTestData(working_res, + working_bins, + check_bins, + test_bins); + deallocateForallTestData(working_res, + working_array, + check_array, + test_array); + deallocateForallTestData(working_res, + working_range, + check_range, + test_range); +} + + +TYPED_TEST_SUITE_P(LaunchMultiReduceNestedTest); +template +class LaunchMultiReduceNestedTest : public ::testing::Test +{ +}; + +TYPED_TEST_P(LaunchMultiReduceNestedTest, MultiReduceNestedLaunch) +{ + using IDX_TYPE = typename camp::at>::type; + using DATA_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POL_DATA = typename camp::at>::type; + using REDUCE_POLICY = typename camp::at>::type; + using ABSTRACTION = typename camp::at>::type; + + // for setting random values in arrays + auto random_seed = std::random_device{}(); + std::mt19937 rngen(random_seed); + + WORKING_RES working_res{WORKING_RES::get_default()}; + + std::vector container; + + std::vector num_bins_max_container({0, 1, 100}); + size_t num_bins_min = 0; + for (size_t num_bins_max : num_bins_max_container) { + + std::uniform_int_distribution num_bins_dist(num_bins_min, num_bins_max); + num_bins_min = num_bins_max+1; + size_t num_bins = num_bins_dist(rngen); + + container.resize(num_bins, DATA_TYPE(2)); + + // Range segment tests + auto s1 = RAJA::make_tuple(RAJA::TypedRangeSegment( 0, 2 ), + RAJA::TypedRangeSegment( 0, 7 ), + RAJA::TypedRangeSegment( 0, 3 )); + LaunchMultiReduceNestedTestImpl( + s1, container, working_res, rngen); + + auto s2 = RAJA::make_tuple(RAJA::TypedRangeSegment( 2, 35 ), + RAJA::TypedRangeSegment( 0, 19 ), + RAJA::TypedRangeSegment( 3, 13 )); + 
LaunchMultiReduceNestedTestImpl( + s2, container, working_res, rngen); + + // Range-stride segment tests + auto s3 = RAJA::make_tuple(RAJA::TypedRangeStrideSegment( 0, 6, 2 ), + RAJA::TypedRangeStrideSegment( 1, 38, 3 ), + RAJA::TypedRangeStrideSegment( 5, 17, 1 )); + LaunchMultiReduceNestedTestImpl( + s3, container, working_res, rngen); + + } +} + +REGISTER_TYPED_TEST_SUITE_P(LaunchMultiReduceNestedTest, + MultiReduceNestedLaunch); + +#endif // __TEST_LAUNCH_NESTED_MULTIREDUCE_HPP__ diff --git a/test/functional/launch/reduce-params/CMakeLists.txt b/test/functional/launch/reduce-params/CMakeLists.txt index 630f78eb9b..42135b265c 100644 --- a/test/functional/launch/reduce-params/CMakeLists.txt +++ b/test/functional/launch/reduce-params/CMakeLists.txt @@ -20,7 +20,6 @@ set(DATATYPES CoreReductionDataTypeList) # Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${LAUNCH_BACKENDS} ) - if( NOT (BACKEND STREQUAL "Sycl")) foreach( REDUCETYPE ${REDUCETYPES} ) configure_file( test-launch-basic-param-expt-reduce.cpp.in test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.cpp) @@ -30,7 +29,6 @@ foreach( BACKEND ${LAUNCH_BACKENDS} ) target_include_directories(test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - endif() endforeach() unset( DATATYPES ) @@ -52,7 +50,6 @@ set(DATATYPES BitwiseReductionDataTypeList) # Note: LAUNCH_BACKENDS is defined in ../CMakeLists.txt # foreach( BACKEND ${LAUNCH_BACKENDS} ) - if( NOT (BACKEND STREQUAL "Sycl")) foreach( REDUCETYPE ${REDUCETYPES} ) configure_file( test-launch-basic-param-expt-reduce.cpp.in test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.cpp ) @@ -62,7 +59,6 @@ foreach( BACKEND ${LAUNCH_BACKENDS} ) target_include_directories(test-launch-basic-param-expt-${REDUCETYPE}-${BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - endif() endforeach() unset( DATATYPES ) diff --git a/test/include/RAJA_test-atomic-types.hpp b/test/include/RAJA_test-atomic-types.hpp index 9cf4c21355..90a1be4024 100644 --- a/test/include/RAJA_test-atomic-types.hpp +++ b/test/include/RAJA_test-atomic-types.hpp @@ -6,7 +6,7 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // -// Types and type lists for loop indexing used throughout RAJA tests. +// Type list for testing RAJA atomics. // // Note that in the type lists, a subset of types is used by default. // For more comprehensive type testing define the macro RAJA_TEST_EXHAUSTIVE. @@ -25,18 +25,11 @@ using AtomicDataTypeList = camp::list< RAJA::Index_type, int, #if defined(RAJA_TEST_EXHAUSTIVE) - unsigned, + unsigned int, long long, unsigned long long, float, #endif double >; -using AtomicDataUnsignedTypeList = - camp::list< unsigned, -#if defined(RAJA_TEST_EXHAUSTIVE) - unsigned long, -#endif - unsigned long long>; - #endif // __RAJA_test_atomic_types_HPP__ diff --git a/test/include/RAJA_test-multi-reduce-abstractor.hpp b/test/include/RAJA_test-multi-reduce-abstractor.hpp new file mode 100644 index 0000000000..2c5412893c --- /dev/null +++ b/test/include/RAJA_test-multi-reduce-abstractor.hpp @@ -0,0 +1,170 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Reduction policies used for reduction tests +// + +#ifndef __RAJA_test_multi_reduce_abstractor_HPP__ +#define __RAJA_test_multi_reduce_abstractor_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + +// +// Get the identity value for the operation used by the given multi reducer +// +template < typename MultiReducer > +inline auto get_op_identity(MultiReducer const& RAJA_UNUSED_ARG(multi_reduce)) +{ + return MultiReducer::MultiReduceOp::identity(); +} + + +struct SumAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static bool consistent(Reducer const&) + { + return RAJA::policy_has_trait::value || + !std::is_floating_point::value; + } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceSum; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs + rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) += rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct MinAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceMin; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs > rhs) ? rhs : lhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs).min(rhs); } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct MaxAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_arithmetic::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceMax; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return (lhs < rhs) ? 
rhs : lhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs).max(rhs); } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct BitAndAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_integral::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceBitAnd; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs & rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) &= rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + +struct BitOrAbstractor +{ + template < typename DATA_TYPE > + static constexpr bool supports() { return std::is_integral::value; } + + template < typename Reducer > + static constexpr bool consistent(Reducer const&) { return true; } + + template < typename policy, typename DATA_TYPE > + using reducer = RAJA::ReduceSum; + + template < typename policy, typename DATA_TYPE > + using multi_reducer = RAJA::MultiReduceBitOr; + + template < typename Lhs, typename Rhs > + RAJA_HOST_DEVICE + static auto combine(Lhs const& lhs, Rhs const& rhs) { return lhs | rhs; } + + template < typename Reducer, typename Rhs > + RAJA_HOST_DEVICE + static decltype(auto) reduce(Reducer&& lhs, Rhs const& rhs) { return std::forward(lhs) |= rhs; } + + template < typename Reducer > + static auto identity(Reducer const&) { return Reducer::MultiReduceOp::identity(); } +}; + + +// Sequential reduction policy types +using ReduceSumAbstractors = camp::list< SumAbstractor >; +using ReduceMinAbstractors = camp::list< MinAbstractor >; +using ReduceMaxAbstractors = camp::list< MaxAbstractor >; +using ReduceBitAndAbstractors = camp::list< BitAndAbstractor >; +using ReduceBitOrAbstractors = camp::list< BitOrAbstractor >; + +#endif // __RAJA_test_multi_reduce_abstractor_HPP__ diff --git a/test/include/RAJA_test-multi-reducepol.hpp b/test/include/RAJA_test-multi-reducepol.hpp new file mode 100644 index 0000000000..e024ef70aa --- /dev/null +++ b/test/include/RAJA_test-multi-reducepol.hpp @@ -0,0 +1,43 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
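As a rough illustration of how the abstractors above let test code stay operation-agnostic, here is a host-only sketch; the helper name reference_reduce and the vals/init arguments are hypothetical and not part of the tests, which only rely on the combine/reduce/identity hooks shown in the header.

#include <vector>
#include "RAJA_test-multi-reduce-abstractor.hpp"

// Accumulate a reference result for one bin using only the abstractor's
// generic combine hook, so the same helper covers sum, min, max, and the
// bitwise operations.
template < typename Abstractor, typename T >
T reference_reduce(std::vector<T> const& vals, T init)
{
  T ref = init;
  for (T v : vals) {
    ref = Abstractor::combine(ref, v);   // e.g. lhs + rhs for SumAbstractor
  }
  return ref;
}

// Usage sketch: reference_reduce<SumAbstractor>(vals, 0.0);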
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// Reduction policies used for reduction tests +// + +#ifndef __RAJA_test_multi_reducepol_HPP__ +#define __RAJA_test_multi_reducepol_HPP__ + +#include "RAJA/RAJA.hpp" +#include "camp/list.hpp" + +// Sequential reduction policy types +using SequentialMultiReducePols = camp::list< RAJA::seq_multi_reduce >; + +#if defined(RAJA_ENABLE_OPENMP) +using OpenMPMultiReducePols = + camp::list< RAJA::omp_multi_reduce, + RAJA::omp_multi_reduce_ordered >; +#endif + +#if defined(RAJA_ENABLE_CUDA) +using CudaMultiReducePols = + camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::cuda_multi_reduce_atomic_global_host_init, + RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#if defined(RAJA_ENABLE_HIP) +using HipMultiReducePols = + camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::hip_multi_reduce_atomic_global_host_init, + RAJA::hip_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#endif // __RAJA_test_multi_reducepol_HPP__ diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index cb82636e2e..20b3015c5d 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(index) add_subdirectory(internal) add_subdirectory(util) add_subdirectory(reducer) +add_subdirectory(multi_reducer) add_subdirectory(resource) add_subdirectory(atomic) add_subdirectory(view-layout) diff --git a/test/unit/multi_reducer/CMakeLists.txt b/test/unit/multi_reducer/CMakeLists.txt new file mode 100644 index 0000000000..6453fa66cb --- /dev/null +++ b/test/unit/multi_reducer/CMakeLists.txt @@ -0,0 +1,60 @@ +############################################################################### +# Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
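For readers unfamiliar with the multi-reduce API these policy lists feed, a minimal sketch of the usage pattern the tests exercise, using the sequential policy; the histogram_example function and the bins/values arrays are made up for illustration, while the MultiReduceSum operations (operator[], get, reset) are the ones exercised by the unit tests below.

#include "RAJA/RAJA.hpp"

void histogram_example(int const* bins, double const* values, int N, int num_bins)
{
  // One reducer object manages num_bins independent sum reductions.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, double> sums(num_bins, 0.0);

  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
      [=](RAJA::Index_type i) {
    sums[bins[i]] += values[i];          // accumulate into the bin for index i
  });

  for (int b = 0; b < num_bins; ++b) {
    double total = sums.get(b);          // read back each bin's result
    (void)total;
  }
  sums.reset();                          // reuse the reducer for another pass
}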
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +# +# macro that generates test file and build target for each backend +# this must be a macro or the linker variable set by FindHIP won't be set in +# the right scope and linking will fail with a weird error from +# hipcc_cmake_linker_helper because it expects the path to hipcc as the first +# argument +# +macro( buildunitmultireducetest TESTNAME BACKENDS ) + foreach( BACKEND ${BACKENDS} ) + + configure_file( test-multi-reducer-${TESTNAME}.cpp.in + test-multi-reducer-${TESTNAME}-${BACKEND}.cpp ) + + raja_add_test( NAME test-multi-reducer-${TESTNAME}-${BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-multi-reducer-${TESTNAME}-${BACKEND}.cpp ) + + target_include_directories( test-multi-reducer-${TESTNAME}-${BACKEND}.exe + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests ) + + endforeach() +endmacro() + + +set(BACKENDS Sequential) + +# only need to test WorkStorage once +set(WorkStorage_BACKENDS Sequential) + +if(RAJA_ENABLE_OPENMP) + list(APPEND BACKENDS OpenMP) +endif() + +if(RAJA_ENABLE_TARGET_OPENMP) + list(APPEND BACKENDS OpenMPTarget) +endif() + +if(RAJA_ENABLE_CUDA) + list(APPEND BACKENDS Cuda) +endif() + +if(RAJA_ENABLE_HIP) + list(APPEND BACKENDS Hip) +endif() + + + +buildunitmultireducetest(constructors "${BACKENDS}") + +buildunitmultireducetest(reset "${BACKENDS}") + + + +unset(BACKENDS) diff --git a/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in b/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in new file mode 100644 index 0000000000..f7bf87e092 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer-constructors.cpp.in @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for RAJA multi-reducer constructors and initialization. +/// + +#include "test-multi-reducer-constructors.hpp" + +using @BACKEND@MultiReducerConstructorTypes = + Test< camp::cartesian_product< @BACKEND@MultiReducerPolicyList, + DataTypeList > >::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@BasicTest, + MultiReducerBasicConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@InitTest, + MultiReducerSingleInitConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@InitTest, + MultiReducerContainerInitConstructorUnitTest, + @BACKEND@MultiReducerConstructorTypes); + + diff --git a/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in b/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in new file mode 100644 index 0000000000..ea033161e1 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer-reset.cpp.in @@ -0,0 +1,30 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for RAJA multi-reducer reset. 
+/// + +#include "test-multi-reducer-reset.hpp" + +using @BACKEND@MultiReducerResetTypes = + Test< camp::cartesian_product< @BACKEND@MultiReducerPolicyList, + DataTypeList, + @BACKEND@UnitTestPolicyList > >::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerBasicResetUnitTest, + @BACKEND@MultiReducerResetTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerSingleResetUnitTest, + @BACKEND@MultiReducerResetTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@ResetTest, + MultiReducerContainerResetUnitTest, + @BACKEND@MultiReducerResetTypes); + diff --git a/test/unit/multi_reducer/test-multi-reducer.hpp b/test/unit/multi_reducer/test-multi-reducer.hpp new file mode 100644 index 0000000000..a1f94e0895 --- /dev/null +++ b/test/unit/multi_reducer/test-multi-reducer.hpp @@ -0,0 +1,47 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef __TEST_MULTI_REDUCER_UTILS_HPP__ +#define __TEST_MULTI_REDUCER_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +#include "RAJA_unit-test-forone.hpp" +#include "RAJA_test-multi-reduce-abstractor.hpp" + +// +// Data types +// +using DataTypeList = camp::list< int, + float, + double >; + +using SequentialMultiReducerPolicyList = camp::list< RAJA::seq_multi_reduce >; + +#if defined(RAJA_ENABLE_OPENMP) +using OpenMPMultiReducerPolicyList = camp::list< RAJA::omp_multi_reduce, + RAJA::omp_multi_reduce_ordered >; +#endif + +#if defined(RAJA_ENABLE_CUDA) +using CudaMultiReducerPolicyList = + camp::list< RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::cuda_multi_reduce_atomic_global_host_init, + RAJA::cuda_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#if defined(RAJA_ENABLE_HIP) +using HipMultiReducerPolicyList = + camp::list< RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init, + RAJA::hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing, + RAJA::hip_multi_reduce_atomic_global_host_init, + RAJA::hip_multi_reduce_atomic_global_no_replication_host_init >; +#endif + +#endif // __TEST_MULTI_REDUCER_UTILS_HPP__ diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp new file mode 100644 index 0000000000..1104ae1e28 --- /dev/null +++ b/test/unit/multi_reducer/tests/test-multi-reducer-constructors.hpp @@ -0,0 +1,282 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing tests for RAJA multi reducer constructors and initialization. 
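For illustration, with BACKEND set to Sequential the configured constructor source above amounts to roughly the following; this is only a sketch of what configure_file substitutes for @BACKEND@, not an additional file in the patch.

#include "test-multi-reducer-constructors.hpp"

using SequentialMultiReducerConstructorTypes =
    Test< camp::cartesian_product< SequentialMultiReducerPolicyList,
                                   DataTypeList > >::Types;

INSTANTIATE_TYPED_TEST_SUITE_P(SequentialBasicTest,
                               MultiReducerBasicConstructorUnitTest,
                               SequentialMultiReducerConstructorTypes);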
+/// + +#ifndef __TEST_MULTI_REDUCER_CONSTRUCTOR__ +#define __TEST_MULTI_REDUCER_CONSTRUCTOR__ + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "../test-multi-reducer.hpp" + +#include +#include +#include + +template +class MultiReducerBasicConstructorUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerSingleInitConstructorUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerContainerInitConstructorUnitTest : public ::testing::Test +{ +}; + +TYPED_TEST_SUITE_P(MultiReducerBasicConstructorUnitTest); +TYPED_TEST_SUITE_P(MultiReducerSingleInitConstructorUnitTest); +TYPED_TEST_SUITE_P(MultiReducerContainerInitConstructorUnitTest); + + +template +void testBasicMultiReducerConstructorRegular(size_t num_bins) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins); + RAJA::MultiReduceMin multi_reduce_min(num_bins); + RAJA::MultiReduceMax multi_reduce_max(num_bins); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum)); + ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min)); + ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max)); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum)); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min)); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max)); + } +} + +template +void testBasicMultiReducerConstructorBitwise(size_t num_bins) +{ + RAJA::MultiReduceBitOr multi_reduce_or(num_bins); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins); + + ASSERT_EQ(multi_reduce_or.size(), num_bins); + ASSERT_EQ(multi_reduce_and.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or)); + ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and)); + + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or)); + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and)); + } +} + +template ::value>* = nullptr> +void testBasicMultiReducerConstructor(size_t num_bins) +{ + testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins); + testBasicMultiReducerConstructorBitwise< MultiReducePolicy, NumericType >(num_bins); +} +/// +template ::value>* = nullptr> +void testBasicMultiReducerConstructor(size_t num_bins) +{ + testBasicMultiReducerConstructorRegular< MultiReducePolicy, NumericType >(num_bins); +} + +TYPED_TEST_P(MultiReducerBasicConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(0); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(1); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(2); + testBasicMultiReducerConstructor< MultiReducePolicy, NumericType >(10); +} + + +template +void testMultiReducerSingleInitConstructorRegular(size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceSum multi_reduce_sum(num_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(num_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(num_bins, initVal); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + 
ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), initVal); + ASSERT_EQ(multi_reduce_min.get(bin), initVal); + ASSERT_EQ(multi_reduce_max.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), initVal); + } +} + +template +void testMultiReducerSingleInitConstructorBitwise(size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceBitOr multi_reduce_or(num_bins, initVal); + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins, initVal); + + ASSERT_EQ(multi_reduce_or.size(), num_bins); + ASSERT_EQ(multi_reduce_and.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_or.get(bin), initVal); + ASSERT_EQ(multi_reduce_and.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), initVal); + } +} + +template ::value>* = nullptr > +void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal); + testMultiReducerSingleInitConstructorBitwise< MultiReducePolicy, NumericType >(num_bins, initVal); +} +/// +template ::value>* = nullptr > +void testMultiReducerSingleInitConstructor(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleInitConstructorRegular< MultiReducePolicy, NumericType >(num_bins, initVal); +} + +TYPED_TEST_P(MultiReducerSingleInitConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(0, NumericType(2)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(1, NumericType(4)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(2, NumericType(0)); + testMultiReducerSingleInitConstructor< MultiReducePolicy, NumericType >(10, NumericType(9)); +} + + +template +void testMultiReducerContainerInitConstructorRegular(Container const& container) +{ + RAJA::MultiReduceSum multi_reduce_sum(container); + RAJA::MultiReduceMin multi_reduce_min(container); + RAJA::MultiReduceMax multi_reduce_max(container); + + ASSERT_EQ(multi_reduce_sum.size(), container.size()); + ASSERT_EQ(multi_reduce_min.size(), container.size()); + ASSERT_EQ(multi_reduce_max.size(), container.size()); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_sum.get(bin), val); + ASSERT_EQ(multi_reduce_min.get(bin), val); + ASSERT_EQ(multi_reduce_max.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), val); + ++bin; + } +} + +template +void testMultiReducerContainerInitConstructorBitwise(Container const& container) +{ + RAJA::MultiReduceBitAnd multi_reduce_and(container); + RAJA::MultiReduceBitOr multi_reduce_or(container); + + ASSERT_EQ(multi_reduce_and.size(), container.size()); + ASSERT_EQ(multi_reduce_or.size(), container.size()); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_and.get(bin), val); + ASSERT_EQ(multi_reduce_or.get(bin), val); + + 
ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), val); + ++bin; + } +} + +template ::value>* = nullptr> +void testMultiReducerContainerInitConstructor(Container const& container) +{ + testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container); + testMultiReducerContainerInitConstructorBitwise< MultiReducePolicy, NumericType >(container); +} +/// +template ::value>* = nullptr> +void testMultiReducerContainerInitConstructor(Container const& container) +{ + testMultiReducerContainerInitConstructorRegular< MultiReducePolicy, NumericType >(container); +} + +TYPED_TEST_P(MultiReducerContainerInitConstructorUnitTest, MultiReducerConstructor) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + + std::vector c0(0); + std::vector c1(1, 3); + std::set c2; + c2.emplace(5); + c2.emplace(8); + std::list c10; + for (size_t bin = 0; bin < size_t(10); ++bin) { + c10.emplace_front(NumericType(bin)); + } + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c0); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c1); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c2); + testMultiReducerContainerInitConstructor< MultiReducePolicy, NumericType >(c10); +} + + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicConstructorUnitTest, + MultiReducerConstructor); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleInitConstructorUnitTest, + MultiReducerConstructor); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerContainerInitConstructorUnitTest, + MultiReducerConstructor); + +#endif //__TEST_MULTI_REDUCER_CONSTRUCTOR__ diff --git a/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp new file mode 100644 index 0000000000..0eb1eb6eb6 --- /dev/null +++ b/test/unit/multi_reducer/tests/test-multi-reducer-reset.hpp @@ -0,0 +1,431 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing tests for RAJA multi reducer reset. 
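A small sketch of the constructor and reset overloads these unit tests cover, shown with the sequential policy; the reset_example function and init_vals values are arbitrary placeholders, while the overload behaviors restate what the assertions above and below verify.

#include <vector>
#include "RAJA/RAJA.hpp"

void reset_example()
{
  std::vector<int> init_vals{5, 8, 13};

  // Container constructor: one bin per element, each initialized to that element.
  RAJA::MultiReduceSum<RAJA::seq_multi_reduce, int> sums(init_vals);

  // reset(num_bins, value): resize to num_bins and set every bin to value.
  sums.reset(4, 0);

  // reset(container): resize to container.size() and copy its values in.
  sums.reset(init_vals);

  // reset(): keep the current number of bins, restore the operation identity.
  sums.reset();
}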
+/// + +#ifndef __TEST_MULTI_REDUCER_RESET__ +#define __TEST_MULTI_REDUCER_RESET__ + +#include "RAJA/internal/MemUtils_CPU.hpp" + +#include "../test-multi-reducer.hpp" + +#include +#include +#include + +template +class MultiReducerBasicResetUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerSingleResetUnitTest : public ::testing::Test +{ +}; + +template +class MultiReducerContainerResetUnitTest : public ::testing::Test +{ +}; + +TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest); +TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest); +TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest); + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerBasicResetRegular(bool use_reducer, size_t num_bins) +{ + NumericType initVal = NumericType(5); + + RAJA::MultiReduceSum multi_reduce_sum(num_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(num_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(num_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < num_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(); + multi_reduce_min.reset(); + multi_reduce_max.reset(); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), get_op_identity(multi_reduce_sum)); + ASSERT_EQ(multi_reduce_min.get(bin), get_op_identity(multi_reduce_min)); + ASSERT_EQ(multi_reduce_max.get(bin), get_op_identity(multi_reduce_max)); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), get_op_identity(multi_reduce_sum)); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), get_op_identity(multi_reduce_min)); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), get_op_identity(multi_reduce_max)); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerBasicResetBitwise(bool use_reducer, size_t num_bins) +{ + NumericType initVal = NumericType(5); + + RAJA::MultiReduceBitAnd multi_reduce_and(num_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(num_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < num_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(); + multi_reduce_or.reset(); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_and.get(bin), get_op_identity(multi_reduce_and)); + ASSERT_EQ(multi_reduce_or.get(bin), get_op_identity(multi_reduce_or)); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), get_op_identity(multi_reduce_and)); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), get_op_identity(multi_reduce_or)); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerBasicReset(size_t num_bins) +{ + testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + // avoid using the reducer as forone does not handle reducers correctly + 
// forone does not make_lambda_body or privatize the body + // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); + // testMultiReducerBasicResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerBasicReset(size_t num_bins) +{ + testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, num_bins); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerBasicResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, num_bins); +} + +TYPED_TEST_P(MultiReducerBasicResetUnitTest, MultiReducerReset) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(0); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(1); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(2); + testMultiReducerBasicReset< MultiReducePolicy, NumericType, ForOnePol >(10); +} + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleResetRegular(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceSum multi_reduce_sum(init_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(init_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(num_bins, initVal); + multi_reduce_min.reset(num_bins, initVal); + multi_reduce_max.reset(num_bins, initVal); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_sum.get(bin), initVal); + ASSERT_EQ(multi_reduce_min.get(bin), initVal); + ASSERT_EQ(multi_reduce_max.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), initVal); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), initVal); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleResetBitwise(bool use_reducer, size_t init_bins, size_t num_bins, NumericType initVal) +{ + RAJA::MultiReduceBitAnd multi_reduce_and(init_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(num_bins, initVal); + multi_reduce_or.reset(num_bins, initVal); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + for (size_t bin = 0; bin < num_bins; ++bin) { + ASSERT_EQ(multi_reduce_and.get(bin), initVal); + ASSERT_EQ(multi_reduce_or.get(bin), initVal); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), 
initVal); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), initVal); + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); + // testMultiReducerSingleResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + std::enable_if_t::value>* = nullptr > +void testMultiReducerSingleResetSize(size_t init_bins, size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, num_bins, initVal); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerSingleResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, num_bins, initVal); +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol > +void testMultiReducerSingleReset(size_t num_bins, NumericType initVal) +{ + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, num_bins, initVal); + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, num_bins, initVal); + testMultiReducerSingleResetSize< MultiReducePolicy, NumericType, ForOnePol >(num_bins, num_bins, initVal); +} + +TYPED_TEST_P(MultiReducerSingleResetUnitTest, MultiReducerReset) +{ + using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(0, NumericType(3)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(1, NumericType(5)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(2, NumericType(0)); + testMultiReducerSingleReset< MultiReducePolicy, NumericType, ForOnePol >(10, NumericType(8)); +} + + + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerResetRegular(bool use_reducer, size_t init_bins, Container const& container) +{ + const size_t num_bins = container.size(); + NumericType initVal = NumericType(5); + + RAJA::MultiReduceSum multi_reduce_sum(init_bins, initVal); + RAJA::MultiReduceMin multi_reduce_min(init_bins, initVal); + RAJA::MultiReduceMax multi_reduce_max(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_sum[bin] += initVal; + multi_reduce_min[bin].min(initVal-1); + multi_reduce_max[bin].max(initVal+1); + } + }); + } + + multi_reduce_sum.reset(container); + multi_reduce_min.reset(container); + multi_reduce_max.reset(container); + + ASSERT_EQ(multi_reduce_sum.size(), num_bins); + ASSERT_EQ(multi_reduce_min.size(), 
num_bins); + ASSERT_EQ(multi_reduce_max.size(), num_bins); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_sum.get(bin), val); + ASSERT_EQ(multi_reduce_min.get(bin), val); + ASSERT_EQ(multi_reduce_max.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_sum[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_min[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_max[bin].get(), val); + ++bin; + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerResetBitwise(bool use_reducer, size_t init_bins, Container const& container) +{ + const size_t num_bins = container.size(); + NumericType initVal = NumericType(5); + + RAJA::MultiReduceBitAnd multi_reduce_and(init_bins, initVal); + RAJA::MultiReduceBitOr multi_reduce_or(init_bins, initVal); + + if (use_reducer) { + forone( [=] RAJA_HOST_DEVICE() { + for (size_t bin = 0; bin < init_bins; ++bin) { + multi_reduce_and[bin] &= initVal-1; + multi_reduce_or[bin] |= initVal+1; + } + }); + } + + multi_reduce_and.reset(container); + multi_reduce_or.reset(container); + + ASSERT_EQ(multi_reduce_and.size(), num_bins); + ASSERT_EQ(multi_reduce_or.size(), num_bins); + + size_t bin = 0; + for (NumericType val : container) { + ASSERT_EQ(multi_reduce_and.get(bin), val); + ASSERT_EQ(multi_reduce_or.get(bin), val); + + ASSERT_EQ((NumericType)multi_reduce_and[bin].get(), val); + ASSERT_EQ((NumericType)multi_reduce_or[bin].get(), val); + ++bin; + } +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container, + std::enable_if_t::value>* = nullptr > +void testMultiReducerContainerResetSize(size_t init_bins, Container const& container) +{ + testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); + // testMultiReducerContainerResetBitwise< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); +} +/// +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container, + std::enable_if_t::value>* = nullptr > +void testMultiReducerContainerResetSize(size_t init_bins, Container const& container) +{ + testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(false, init_bins, container); + // avoid using the reducer as forone does not handle reducers correctly + // forone does not make_lambda_body or privatize the body + // testMultiReducerContainerResetRegular< MultiReducePolicy, NumericType, ForOnePol >(true, init_bins, container); +} + +template < typename MultiReducePolicy, + typename NumericType, + typename ForOnePol, + typename Container > +void testMultiReducerContainerReset(Container const& container) +{ + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(0, container); + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(4, container); + testMultiReducerContainerResetSize< MultiReducePolicy, NumericType, ForOnePol >(container.size(), container); +} + +TYPED_TEST_P(MultiReducerContainerResetUnitTest, MultiReducerReset) +{ + 
using MultiReducePolicy = typename camp::at>::type; + using NumericType = typename camp::at>::type; + using ForOnePol = typename camp::at>::type; + + std::vector c0(0); + std::vector c1(1, 3); + std::set c2; + c2.emplace(5); + c2.emplace(8); + std::list c10; + for (size_t bin = 0; bin < size_t(10); ++bin) { + c10.emplace_front(NumericType(bin)); + } + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c0); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c1); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c2); + testMultiReducerContainerReset< MultiReducePolicy, NumericType, ForOnePol >(c10); +} + + + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerBasicResetUnitTest, + MultiReducerReset); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerSingleResetUnitTest, + MultiReducerReset); + +REGISTER_TYPED_TEST_SUITE_P(MultiReducerContainerResetUnitTest, + MultiReducerReset); + +#endif //__TEST_MULTI_REDUCER_RESET__ diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index 869b897714..175d2c07bb 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -25,4 +25,8 @@ raja_add_test( NAME test-fraction SOURCES test-fraction.cpp) +raja_add_test( + NAME test-math + SOURCES test-math.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-math.cpp b/test/unit/util/test-math.cpp new file mode 100644 index 0000000000..39572ad3a0 --- /dev/null +++ b/test/unit/util/test-math.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for Fraction +/// + +#include +#include "RAJA_gtest.hpp" +#include + +template < typename T > +void test_log2() +{ + ASSERT_EQ(RAJA::log2(T(257)), T(8)); + ASSERT_EQ(RAJA::log2(T(256)), T(8)); + ASSERT_EQ(RAJA::log2(T(255)), T(7)); + ASSERT_EQ(RAJA::log2(T(4)), T(2)); + ASSERT_EQ(RAJA::log2(T(3)), T(1)); + ASSERT_EQ(RAJA::log2(T(2)), T(1)); + ASSERT_EQ(RAJA::log2(T(1)), T(0)); + ASSERT_EQ(RAJA::log2(T(0)), T(0)); + if (std::is_signed::value) { + ASSERT_EQ(RAJA::log2(T(-1)), T(0)); + ASSERT_EQ(RAJA::log2(T(-100)), T(0)); + } +} + +TEST(math, log2) +{ + test_log2(); + test_log2(); +} + + +template < typename T > +void test_next_pow2() +{ + ASSERT_EQ(RAJA::next_pow2(T(257)), T(512)); + ASSERT_EQ(RAJA::next_pow2(T(256)), T(256)); + ASSERT_EQ(RAJA::next_pow2(T(255)), T(256)); + ASSERT_EQ(RAJA::next_pow2(T(4)), T(4)); + ASSERT_EQ(RAJA::next_pow2(T(3)), T(4)); + ASSERT_EQ(RAJA::next_pow2(T(2)), T(2)); + ASSERT_EQ(RAJA::next_pow2(T(1)), T(1)); + ASSERT_EQ(RAJA::next_pow2(T(0)), T(0)); + if (std::is_signed::value) { + ASSERT_EQ(RAJA::next_pow2(T(-1)), T(0)); + ASSERT_EQ(RAJA::next_pow2(T(-100)), T(0)); + } +} + +TEST(math, next_pow2) +{ + test_next_pow2(); + test_next_pow2(); +} + + +template < typename T > +void test_prev_pow2() +{ + ASSERT_EQ(RAJA::prev_pow2(T(257)), T(256)); + ASSERT_EQ(RAJA::prev_pow2(T(256)), T(256)); + ASSERT_EQ(RAJA::prev_pow2(T(255)), T(128)); + ASSERT_EQ(RAJA::prev_pow2(T(4)), T(4)); + ASSERT_EQ(RAJA::prev_pow2(T(3)), T(2)); + ASSERT_EQ(RAJA::prev_pow2(T(2)), T(2)); + ASSERT_EQ(RAJA::prev_pow2(T(1)), T(1)); + ASSERT_EQ(RAJA::prev_pow2(T(0)), T(0)); + if (std::is_signed::value) { + 
ASSERT_EQ(RAJA::prev_pow2(T(-1)), T(0)); + ASSERT_EQ(RAJA::prev_pow2(T(-100)), T(0)); + } +} + +TEST(math, prev_pow2) +{ + test_prev_pow2(); + test_prev_pow2(); +} + + +template < typename T > +void test_power_of_2_mod() +{ + ASSERT_EQ(RAJA::power_of_2_mod(T(257), T(256)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(256), T(256)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(255), T(256)), T(255)); + ASSERT_EQ(RAJA::power_of_2_mod(T(128), T(256)), T(128)); + ASSERT_EQ(RAJA::power_of_2_mod(T(256), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(95), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(94), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(93), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(92), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(7), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(6), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(5), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(4), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(3), T(4)), T(3)); + ASSERT_EQ(RAJA::power_of_2_mod(T(2), T(4)), T(2)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(4)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(0), T(4)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(3), T(2)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(2), T(2)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(2)), T(1)); + ASSERT_EQ(RAJA::power_of_2_mod(T(0), T(2)), T(0)); + ASSERT_EQ(RAJA::power_of_2_mod(T(1), T(1)), T(0)); +} + +TEST(math, power_of_2_mod) +{ + test_power_of_2_mod(); + test_power_of_2_mod(); +} diff --git a/test/unit/workgroup/CMakeLists.txt b/test/unit/workgroup/CMakeLists.txt index dce610d954..0815ffda5e 100644 --- a/test/unit/workgroup/CMakeLists.txt +++ b/test/unit/workgroup/CMakeLists.txt @@ -62,6 +62,11 @@ if(RAJA_TEST_EXHAUSTIVE OR NOT RAJA_COMPILER MATCHES "RAJA_COMPILER_Intel") set(Constructor_SUBTESTS Single) buildunitworkgrouptest(Constructor "${Constructor_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") + if(RAJA_ENABLE_TARGET_OPENMP) + # WorkGroup dispatcher for OpenMPTarget not implemented yet + list(REMOVE_ITEM BACKENDS OpenMPTarget) + endif() + set(Enqueue_SUBTESTS Single Multiple) buildunitworkgrouptest(Enqueue "${Enqueue_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") @@ -70,10 +75,12 @@ if(RAJA_TEST_EXHAUSTIVE OR NOT RAJA_COMPILER MATCHES "RAJA_COMPILER_Intel") endif() set(Dispatcher_SUBTESTS Single) + if(RAJA_ENABLE_TARGET_OPENMP) # WorkGroup dispatcher for OpenMPTarget not implemented yet list(REMOVE_ITEM BACKENDS OpenMPTarget) endif() + buildunitworkgrouptest(Dispatcher "${Dispatcher_SUBTESTS}" "${DISPATCHERS}" "${BACKENDS}") set(WorkStorage_SUBTESTS Constructor Iterator InsertCall Multiple) diff --git a/tpl/camp b/tpl/camp index 79c320fa09..d580fd8feb 160000 --- a/tpl/camp +++ b/tpl/camp @@ -1 +1 @@ -Subproject commit 79c320fa09db987923b56884afdc9f82f4b70fc4 +Subproject commit d580fd8feb10ddb7a63a784b4afcd857ac686e39
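As a closing illustration of the behavior the new math tests pin down, a host-only sketch of the intended relationships; the patch does not show which header provides these utilities, so including RAJA/RAJA.hpp here is an assumption made for brevity.

#include <cassert>
#include "RAJA/RAJA.hpp"  // assumed entry point for RAJA's math utilities

int main()
{
  // log2 is the floor of the base-2 logarithm; non-positive inputs map to 0.
  assert(RAJA::log2(255) == 7);

  // next_pow2 rounds up to a power of 2, prev_pow2 rounds down.
  assert(RAJA::next_pow2(255) == 256);
  assert(RAJA::prev_pow2(255) == 128);

  // power_of_2_mod(x, p) matches x % p when p is a power of 2 (i.e. x & (p - 1)).
  assert(RAJA::power_of_2_mod(95, 4) == 95 % 4);
  return 0;
}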