Merge pull request #895 from LLNL/v0.12.1-rc

V0.12.1 rc

rhornung67 authored Sep 9, 2020
2 parents 32d92e3 + a5d6fe1 commit 9cb6370
Showing 28 changed files with 148 additions and 134 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -15,7 +15,7 @@ endif()
# Set version number
set(RAJA_VERSION_MAJOR 0)
set(RAJA_VERSION_MINOR 12)
-set(RAJA_VERSION_PATCHLEVEL 0)
+set(RAJA_VERSION_PATCHLEVEL 1)

if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
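Downstream projects can mirror the RAJA_LOADED configure-time check above at compile time. A minimal sketch, assuming the generated RAJA/config.hpp exposes RAJA_VERSION_MAJOR/MINOR/PATCHLEVEL macros filled in from these CMake variables:

#include "RAJA/config.hpp"

// Fail early if an incompatible RAJA is picked up, mirroring the
// RAJA_LOADED guard that CMake applies at configure time.
static_assert(RAJA_VERSION_MAJOR == 0 && RAJA_VERSION_MINOR == 12,
              "this project expects RAJA v0.12.x");
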
8 changes: 8 additions & 0 deletions RELEASE_NOTES.md
@@ -10,6 +10,14 @@
Version vxx.yy.zz -- Release date 20yy-mm-dd
============================================

+Version v0.12.1 -- Release date 2020-09-09
+============================================
+
+This release fixes compilation errors that occurred when a CUDA-enabled
+build used a non-CUDA compiler, silences compiler warnings, and includes
+other bug fixes related to OpenMP target compilation.
+
+
Version v0.12.0 -- Release date 2020-09-03
============================================

2 changes: 1 addition & 1 deletion docs/sphinx/user_guide/conf.py
@@ -68,7 +68,7 @@
# The short X.Y version.
version = u'0.12'
# The full version, including alpha/beta/rc tags.
-release = u'0.12.0'
+release = u'0.12.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
14 changes: 7 additions & 7 deletions examples/CMakeLists.txt
@@ -102,13 +102,13 @@ raja_add_executable(
SOURCES multiview.cpp)

if(ENABLE_TARGET_OPENMP)
-  raja_add_executable(
-    NAME target-kernel
-    SOURCES omp-target-kernel.cpp)
-
-  raja_add_executable(
-    NAME omp-target-ltimes
-    SOURCES omp-target-ltimes.cpp)
+#  raja_add_executable(
+#    NAME target-kernel
+#    SOURCES omp-target-kernel.cpp)
+#
+#  raja_add_executable(
+#    NAME omp-target-ltimes
+#    SOURCES omp-target-ltimes.cpp)
endif()

raja_add_executable(
39 changes: 20 additions & 19 deletions examples/raja-teams.cpp
@@ -102,7 +102,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

std::cout << "\n Running RAJA-Teams examples...\n";
int num_of_backends = 1;
-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
num_of_backends++;
#endif

@@ -119,13 +119,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// Allocate memory for either host or device
int N_tri = 5;

-  int *Ddat;
-  if (select_cpu_or_gpu == RAJA::expt::HOST)
+  int* Ddat = nullptr;
+  if (select_cpu_or_gpu == RAJA::expt::HOST) {
     Ddat = host_res.allocate<int>(N_tri * N_tri);
+  }

-#if defined(RAJA_ENABLE_DEVICE)
-  if (select_cpu_or_gpu == RAJA::expt::DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
+  if (select_cpu_or_gpu == RAJA::expt::DEVICE) {
     Ddat = device_res.allocate<int>(N_tri * N_tri);
+  }
#endif

/*
@@ -143,9 +145,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
*/

if (select_cpu_or_gpu == RAJA::expt::HOST){
-    std::cout << "\n Running Upper triangular pattern example on the host...\n";
+    std::cout << "\n Running upper triangular pattern example on the host...\n";
}else {
-    std::cout << "\n Running Upper triangular pattern example on the device...\n";
+    std::cout << "\n Running upper triangular pattern example on the device...\n";
}


@@ -157,29 +159,28 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::expt::loop<teams_x>(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) {

-      // Array shared within threads of the same team
-      TEAM_SHARED int s_A[1];
-
-      RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) {
-        if (c == r) s_A[0] = r;
-        D(r, c) = r * N_tri + c;
-      }); // loop j
-
-      ctx.teamSync();
-
-      RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) {
-        printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]);
-      }); // loop c
+      // Array shared within threads of the same team
+      RAJA_TEAM_SHARED int s_A[1];
+
+      RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, 1), [&](int c) {
+        s_A[c] = r;
+      }); // loop c
+
+      ctx.teamSync();
+
+      RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) {
+        D(r, c) = r * N_tri + c;
+        printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]);
+      }); // loop c
}); // loop r
}); // outer lambda

if (select_cpu_or_gpu == RAJA::expt::HOST) {
host_res.deallocate(Ddat);
}

-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
if (select_cpu_or_gpu == RAJA::expt::DEVICE) {
device_res.deallocate(Ddat);
}
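The reworked loop body follows the usual shared-memory discipline: one thread writes the team-shared value, the team synchronizes, then every thread reads it. Distilled from the new code above (same identifiers as the diff; threads_x is the thread policy the example defines):

// Inside a team: publish a value to team-shared memory, then consume it.
RAJA_TEAM_SHARED int s_A[1];

// A one-iteration loop, so exactly one thread in the team performs the write.
RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, 1), [&](int c) {
  s_A[c] = r;
});

ctx.teamSync();  // make the write visible to the whole team before any read

// Every thread may now read s_A[0] safely.
RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) {
  D(r, c) = r * N_tri + c;
});
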
12 changes: 7 additions & 5 deletions include/RAJA/config.hpp.in
@@ -155,12 +155,14 @@ namespace RAJA {
#endif // _OPENMP
#endif // RAJA_ENABLE_OPENMP

-#if defined(RAJA_ENABLE_CUDA)
-#if not defined(__CUDACC__)
-#error RAJA configured with ENABLE_CUDA, but CUDA not supported by current compiler
-#endif //
-#endif // RAJA_ENABLE_CUDA
+#if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
+#define RAJA_CUDA_ACTIVE
+#endif // RAJA_ENABLE_CUDA && __CUDACC__
+
+#if defined(RAJA_CUDA_ACTIVE) || \
+    defined(RAJA_ENABLE_HIP)
+#define RAJA_DEVICE_ACTIVE
+#endif

/*!
******************************************************************************
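With this change, a CUDA-enabled configuration no longer hard-errors when a non-CUDA compiler processes the headers; code keys off the derived macros instead. A minimal sketch of the guard style, matching how the hunks below use the new macros (backend_name is a hypothetical helper, not RAJA API):

#include "RAJA/config.hpp"

// RAJA_CUDA_ACTIVE is defined only when RAJA_ENABLE_CUDA is set and the
// current compiler actually understands CUDA (__CUDACC__), so CUDA-only
// headers stay out of plain host-compiler passes.
#if defined(RAJA_CUDA_ACTIVE)
#include "RAJA/policy/cuda.hpp"
#endif

// RAJA_DEVICE_ACTIVE covers any GPU backend (active CUDA or HIP).
inline const char* backend_name()
{
#if defined(RAJA_DEVICE_ACTIVE)
  return "gpu";
#else
  return "host";
#endif
}
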
6 changes: 3 additions & 3 deletions include/RAJA/index/ListSegment.hpp
@@ -31,7 +31,7 @@
#include "RAJA/util/Span.hpp"
#include "RAJA/util/types.hpp"

-#if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_CUDA_ACTIVE)
#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
#else
#define cudaErrchk(...)
@@ -70,7 +70,7 @@ class TypedListSegment
* won't see any different usage or behavior.
*/

-#if ((defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)) || defined(RAJA_ENABLE_HIP)
+#if defined(RAJA_DEVICE_ACTIVE)
static constexpr bool Has_GPU = true;
#else
static constexpr bool Has_GPU = false;
@@ -117,7 +117,7 @@ class TypedListSegment
//! specialization for allocation of CPU_memory
void allocate(CPU_memory) { m_data = new T[m_size]; }

-#if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_CUDA_ACTIVE)
//! copy data from container using BlockCopy
template <typename Container>
void copy(Container&& src, BlockCopy)
2 changes: 1 addition & 1 deletion include/RAJA/pattern/teams.hpp
@@ -25,7 +25,7 @@
//
#include "RAJA/pattern/teams/teams_sequential.hpp"

-#if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_CUDA_ACTIVE)
#include "RAJA/pattern/teams/teams_cuda.hpp"
#endif

20 changes: 7 additions & 13 deletions include/RAJA/pattern/teams/teams_core.hpp
@@ -28,16 +28,10 @@
#include "camp/concepts.hpp"
#include "camp/tuple.hpp"

-#if ((defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && \
-     defined(RAJA_ENABLE_CUDA)) || \
-    defined(RAJA_ENABLE_HIP)
-#define RAJA_ENABLE_DEVICE
-#endif
-
 #if defined(RAJA_DEVICE_CODE)
-#define TEAM_SHARED __shared__
+#define RAJA_TEAM_SHARED __shared__
 #else
-#define TEAM_SHARED
+#define RAJA_TEAM_SHARED
#endif

namespace RAJA
Expand All @@ -54,27 +48,27 @@ struct null_launch_t {

// Support for host, and device
template <typename HOST_POLICY
-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
,
typename DEVICE_POLICY
#endif
>
struct LoopPolicy {
using host_policy_t = HOST_POLICY;
-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
using device_policy_t = DEVICE_POLICY;
#endif
};

template <typename HOST_POLICY
-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
,
typename DEVICE_POLICY
#endif
>
struct LaunchPolicy {
using host_policy_t = HOST_POLICY;
-#if defined(RAJA_ENABLE_DEVICE)
+#if defined(RAJA_DEVICE_ACTIVE)
using device_policy_t = DEVICE_POLICY;
#endif
};
@@ -193,7 +187,7 @@ void launch(ExecPlace place, Resources const &team_resources, BODY const &body)
launch_t::exec(LaunchContext(team_resources, HOST), body);
break;
}
-#ifdef RAJA_ENABLE_DEVICE
+#ifdef RAJA_DEVICE_ACTIVE
case DEVICE: {
using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
launch_t::exec(LaunchContext(team_resources, DEVICE), body);
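LoopPolicy and LaunchPolicy pair a host policy with an optional device policy, and launch() dispatches between them on the runtime ExecPlace. A sketch of the intended declarations, modeled on examples/raja-teams.cpp; the specific CUDA policy names are assumptions for a CUDA build:

#include "RAJA/RAJA.hpp"

// Host policy first; the device policy is compiled in only when
// RAJA_DEVICE_ACTIVE is defined (see the guards above).
using launch_policy = RAJA::expt::LaunchPolicy<RAJA::expt::seq_launch_t
#if defined(RAJA_DEVICE_ACTIVE)
                                               ,
                                               RAJA::expt::cuda_launch_t<false>
#endif
                                               >;

using teams_x = RAJA::expt::LoopPolicy<RAJA::loop_exec
#if defined(RAJA_DEVICE_ACTIVE)
                                       ,
                                       RAJA::cuda_block_x_direct
#endif
                                       >;
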
3 changes: 2 additions & 1 deletion include/RAJA/pattern/teams/teams_sequential.hpp
@@ -35,7 +35,8 @@ struct seq_launch_t {
template <>
struct LaunchExecute<RAJA::expt::null_launch_t> {
template <typename BODY>
-  static void exec(LaunchContext const &ctx, BODY const &body)
+  static void exec(LaunchContext const& RAJA_UNUSED_ARG(ctx),
+                   BODY const& RAJA_UNUSED_ARG(body))
{
RAJA_ABORT_OR_THROW("NULL Launch");
}
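This hunk is one of the compiler-warning fixes called out in the release notes: RAJA_UNUSED_ARG keeps a parameter documented in the source while dropping its name from the compiled signature, which silences unused-parameter warnings. A small illustrative sketch (null_handler is hypothetical, not RAJA API):

#include "RAJA/RAJA.hpp"

// The macro removes the parameter name, so -Wunused-parameter stays quiet
// even though the body never touches event_code.
static void null_handler(int RAJA_UNUSED_ARG(event_code))
{
  // intentionally empty
}
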
2 changes: 1 addition & 1 deletion include/RAJA/policy/cuda.hpp
@@ -22,7 +22,7 @@

#include "RAJA/config.hpp"

-#if (defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))) && defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_CUDA_ACTIVE)

#include <cuda.h>
#include <cuda_runtime.h>
2 changes: 1 addition & 1 deletion include/RAJA/policy/cuda/policy.hpp
@@ -20,7 +20,7 @@

#include "RAJA/config.hpp"

-#if defined(RAJA_ENABLE_CUDA)
+#if defined(RAJA_CUDA_ACTIVE)

#include <utility>

3 changes: 0 additions & 3 deletions include/RAJA/policy/openmp/policy.hpp
@@ -66,9 +66,6 @@ namespace internal
struct Parallel {
};

-struct Collapse {
-};
-
struct For {
};

6 changes: 3 additions & 3 deletions include/RAJA/policy/openmp_target/forall.hpp
@@ -68,11 +68,11 @@ RAJA_INLINE resources::EventProxy<resources::Omp> forall_impl(resources::Omp &om
ib(begin_it[i]);
}

-  return resources::EventProxy<resources::Omp>(&res);
+  return resources::EventProxy<resources::Omp>(&omp_res);
}

template <typename Iterable, typename Func>
-RAJA_INLINE resources::EventProxy<resources::Omp> forall_impl(resources::Resource &omp_res,
+RAJA_INLINE resources::EventProxy<resources::Omp> forall_impl(resources::Omp &omp_res,
const omp_target_parallel_for_exec_nt&,
Iterable&& iter,
Func&& loop_body)
@@ -89,7 +89,7 @@ RAJA_INLINE resources::EventProxy<resources::Omp> forall_impl(resources::Resourc
ib(begin_it[i]);
}

-  return RAJA::resources::EventProxy<resources::Omp>(&res);
+  return resources::EventProxy<resources::Omp>(&omp_res);
}

} // namespace omp
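Both overloads now take resources::Omp and build the returned EventProxy from the omp_res parameter they actually received; the old bodies referenced a nonexistent res. A minimal caller-side sketch, assuming the resource-aware forall overload introduced in v0.12 and a build configured with ENABLE_TARGET_OPENMP; saxpy and its pointers are illustrative only:

#include "RAJA/RAJA.hpp"

void saxpy(double* x, double* y, double a, int N)
{
  // An explicit OpenMP-target resource; forall returns an event proxy
  // tied to it, which is what the fixed overloads now construct.
  RAJA::resources::Omp omp_res;

  RAJA::forall<RAJA::omp_target_parallel_for_exec<256>>(
      omp_res, RAJA::RangeSegment(0, N),
      [=](int i) { y[i] += a * x[i]; });
}
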
3 changes: 2 additions & 1 deletion include/RAJA/policy/openmp_target/kernel/For.hpp
@@ -53,7 +53,8 @@ struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec
auto len = segment_length<ArgumentId>(data);
using len_t = decltype(len);

-    forall_impl(omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper);
+    auto r = resources::Omp::get_default();
+    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper);
}
};

3 changes: 3 additions & 0 deletions include/RAJA/policy/openmp_target/policy.hpp
@@ -30,6 +30,9 @@ struct Target {
struct Distribute {
};

+struct Collapse {
+};
+
template <size_t ThreadsPerTeam>
struct omp_target_parallel_for_exec
: make_policy_pattern_t<Policy::target_openmp,