From 0ccc2cf207585400dc5002b13ae124f556530668 Mon Sep 17 00:00:00 2001 From: Anderson Date: Tue, 30 Jul 2024 09:10:35 -0600 Subject: [PATCH 01/13] Updated deprecated packages' CXX_FLAGS to turn off all warnings Since deprecated packages will no longer be supported soon, turn off all warnings generated by their codes. Signed-off-by: Anderson --- packages/framework/ini-files/config-specs.ini | 69 +++++++++++-------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 44cbdcd65edd..8320a0b7991f 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -1738,42 +1738,51 @@ opt-set-cmake-var KokkosKernels_batched_dla_cuda_MPI_1_SET_RUN_SERIAL BOOL FORCE opt-set-cmake-var Intrepid2_unit-test_MonolithicExecutable_Intrepid2_Tests_MPI_1_SET_RUN_SERIAL BOOL FORCE : ON [GCC_PACKAGE_SPECIFIC_WARNING_FLAGS] -opt-set-cmake-var Amesos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Amesos2_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Belos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Domi_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Epetra_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var EpetraExt_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var FEI_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Galeri_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Ifpack_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Intrepid_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Intrepid2_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror -Wno-error=shadow -opt-set-cmake-var Isorropia_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var ML_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Moertel_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var NOX_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Pamgen_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Phalanx_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Pike_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror -opt-set-cmake-var Piro_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var ROL_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Sacado_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Shards_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var ShyLU_Node_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var STK_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Stokhos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +# Supported packages +opt-set-cmake-var Amesos2_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Belos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Domi_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var FEI_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Galeri_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Intrepid2_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror -Wno-error=shadow +opt-set-cmake-var Moertel_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var NOX_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Pamgen_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Phalanx_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Pike_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror +opt-set-cmake-var Piro_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var ROL_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Sacado_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Shards_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var ShyLU_Node_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var STK_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Stokhos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow opt-set-cmake-var Stratimikos_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -# opt-set-cmake-var Tempus_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror +# opt-set-cmake-var Tempus_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Werror +opt-set-cmake-var Zoltan_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow opt-set-cmake-var TrilinosCouplings_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Zoltan_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow + +# Deprecated packages +opt-set-cmake-var Amesos_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var AztecOO_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Epetra_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var EpetraExt_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Ifpack_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Intrepid_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Isorropia_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var ML_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Pliris_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var PyTrilinos_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var ShyLU_DD_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var Triutils_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var ThyraEpetraAdapters_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w +opt-set-cmake-var ThyraEpetraExtAdapters_CXX_FLAGS STRING FORCE : ${CMAKE_CXX_FLAGS|CMAKE} -w + [GCC_OPENMP_PACKAGE_SPECIFIC_WARNING_FLAGS] use GCC_PACKAGE_SPECIFIC_WARNING_FLAGS -opt-set-cmake-var Panzer_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow +opt-set-cmake-var Panzer_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow opt-set-cmake-var Percept_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var Pliris_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow -opt-set-cmake-var ShyLU_DD_CXX_FLAGS STRING : ${CMAKE_CXX_FLAGS|CMAKE} -Wno-error=shadow # Full configurations intended to be loaded. From f7617cc6b0bd3816af00d535960157c6c6cb3e4f Mon Sep 17 00:00:00 2001 From: Anderson Date: Tue, 30 Jul 2024 15:56:10 -0600 Subject: [PATCH 02/13] Added more warnings-as-errors flags Added `-Werror=sign-compare -Werror=unused-variable -Werror=parentheses` to the rhel8_gcc-8.5.0-serial Trilinos configuration. These three warnings-as-errors flags serve as the starting steps to introduce more warnings-as-errors flags going forward without overwhelming package developers too much off the bat. NOTE: Trilinos cannot currently enable `-Werror` due to how deprecated package header warnings are implemented in PR #12828 (controlled by -Wcpp which is controls a lot of other warnings). Signed-off-by: Anderson --- packages/framework/ini-files/config-specs.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 8320a0b7991f..598f43292e3e 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -3110,7 +3110,7 @@ opt-set-cmake-var TPL_ENABLE_Scotch BOOL FORCE : OFF opt-set-cmake-var TPL_Netcdf_LIBRARIES STRING FORCE : ${NETCDF_C_LIB|ENV}/libnetcdf.so opt-set-cmake-var Trilinos_ENABLE_Fortran OFF BOOL : OFF -opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unused-label -Werror=shadow -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS +opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-unused-label -Werror=parentheses -Werror=sign-compare -Werror=unused-variable -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS use GCC_PACKAGE_SPECIFIC_WARNING_FLAGS From 77efb156f9c863a3446b6b7f663b7d08e252f4ec Mon Sep 17 00:00:00 2001 From: Anderson Date: Wed, 31 Jul 2024 09:19:16 -0600 Subject: [PATCH 03/13] Updated CXX_FLAGS of PR configs with warnings-as-errors flags Updated the CXX_FLAGS of gcc-openmpi-openmp and gcc-openmpi PR testing configurations with additional warnings-as-errors flags of `-Werror=sign-compare -Werror=unused-variable -Werror=parentheses` Signed-off-by: Anderson --- packages/framework/ini-files/config-specs.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 598f43292e3e..d2539b524388 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -3144,7 +3144,7 @@ use COMMON_SPACK_TPLS opt-set-cmake-var MPI_EXEC_PRE_NUMPROCS_FLAGS STRING : --bind-to;none --mca btl vader,self opt-set-cmake-var CMAKE_CXX_EXTENSIONS BOOL : OFF opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL : ON -opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-parentheses -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS +opt-set-cmake-var CMAKE_CXX_FLAGS STRING : -fno-strict-aliasing -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-inline -Wno-nonnull-compare -Wno-address -Werror=sign-compare -Werror=unused-variable -Werror=parentheses -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS # TPL_BLAS_LIBRARIES is redefined here with libm for SuperLU to properly link opt-set-cmake-var TPL_BLAS_LIBRARIES STRING FORCE : -L${BLAS_ROOT|ENV}/lib;-lblas;-lgfortran;-lgomp;-lm @@ -3187,7 +3187,7 @@ opt-set-cmake-var ROL_example_PDE-OPT_helmholtz_example_02_MPI_1_DISABLE BOOL opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE BOOL : ON opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE BOOL : ON -opt-set-cmake-var CMAKE_CXX_FLAGS STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Wno-error -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS +opt-set-cmake-var CMAKE_CXX_FLAGS STRING FORCE : -Wall -Wno-clobbered -Wno-vla -Wno-pragmas -Wno-unknown-pragmas -Wno-unused-local-typedefs -Wno-literal-suffix -Wno-deprecated-declarations -Wno-misleading-indentation -Wno-int-in-bool-context -Wno-maybe-uninitialized -Wno-nonnull-compare -Wno-address -Wno-inline -Werror=sign-compare -Werror=unused-variable -Werror=parentheses -DTRILINOS_HIDE_DEPRECATED_HEADER_WARNINGS # Test failures as of 11-28-22 opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE BOOL : ON From d97c9c558bf4bfa04da87d62622f389603831d5e Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Thu, 1 Aug 2024 10:09:54 -0600 Subject: [PATCH 04/13] Tacho: move workspace allocation outsie levelset-loop, and use subview of workspace at each level --- .../src/impl/Tacho_NumericTools_Base.hpp | 1 + .../src/impl/Tacho_NumericTools_LevelSet.hpp | 436 +++++++++++------- .../tacho/src/impl/Tacho_SupernodeInfo.hpp | 21 +- .../src/impl/Tacho_TeamFunctor_ExtractCRS.hpp | 108 +++-- .../impl/Tacho_TeamFunctor_FactorizeChol.hpp | 4 - 5 files changed, 337 insertions(+), 233 deletions(-) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp index eaf27c8742c7..62419467186d 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp @@ -44,6 +44,7 @@ template class NumericToolsBase { using ordinal_type_array = typename supernode_info_type::ordinal_type_array; using size_type_array = typename supernode_info_type::size_type_array; using value_type_array = typename supernode_info_type::value_type_array; + using int_type_array = typename supernode_info_type::int_type_array; using ordinal_pair_type_array = typename supernode_info_type::ordinal_pair_type_array; using value_type_matrix = typename supernode_info_type::value_type_matrix; diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index eca229bf91e7..980ea1e4fb7a 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -112,6 +112,7 @@ class NumericToolsLevelSet : public NumericToolsBase { using typename base_type::supernode_info_type; using typename base_type::supernode_type_array_host; using typename base_type::value_type; + using typename base_type::int_type_array; using typename base_type::value_type_array; using typename base_type::value_type_matrix; @@ -138,6 +139,10 @@ class NumericToolsLevelSet : public NumericToolsBase { using base_type::track_alloc; using base_type::track_free; + using rowptr_view = Kokkos::View; + using colind_view = Kokkos::View; + using nzvals_view = Kokkos::View; + // supernode host information for level kernels launching supernode_type_array_host _h_supernodes; @@ -175,6 +180,15 @@ class NumericToolsLevelSet : public NumericToolsBase { size_type _bufsize_factorize, _bufsize_solve; value_type_array _buf; + // for using SpMV + rowptr_view rowptrU; + colind_view colindU; + nzvals_view nzvalsU; + + rowptr_view rowptrL; + colind_view colindL; + nzvals_view nzvalsL; + // common for host and cuda int _status; @@ -182,6 +196,7 @@ class NumericToolsLevelSet : public NumericToolsBase { int _nstreams; // workspace for SpMV + bool _is_spmv_extracted; value_type_matrix _w_vec; value_type_array buffer_U; value_type_array buffer_L; @@ -191,8 +206,9 @@ class NumericToolsLevelSet : public NumericToolsBase { cusolverDnHandle_t _handle_lapack; #if defined(TACHO_HAVE_CUSPARSE) // workspace for SpMV - cusparseDnMatDescr_t matT, matW; - cusparseDnVecDescr_t vecT, vecW; + // (separte for U and L, so that we can "destroy" without waiting for the other) + cusparseDnMatDescr_t matL, matU, matW; + cusparseDnVecDescr_t vecL, vecU, vecW; #endif using blas_handle_type = cublasHandle_t; @@ -579,14 +595,14 @@ class NumericToolsLevelSet : public NumericToolsBase { const ordinal_type sid = _h_level_sids(p); const auto s = _h_supernodes(sid); const ordinal_type m = s.m; //, n_m = s.n-s.m; - if (m > _device_solve_thres) { // || n > _device_solve_thres) { + if (m > _device_solve_thres) { // || n > _device_solve_thres) _h_solve_mode(sid) = 0; ++stat_level.n_device_solve; } else { _h_solve_mode(sid) = 1; ++stat_level.n_team_solve; } - if (m > _device_factorize_thres) { // || n_m > _device_factorize_thres) { + if (m > _device_factorize_thres) { // || n_m > _device_factorize_thres) _h_factorize_mode(sid) = 0; ++stat_level.n_device_factorize; } else { @@ -628,6 +644,10 @@ class NumericToolsLevelSet : public NumericToolsBase { inline void release(const ordinal_type verbose = 0) override { base_type::release(false); + if (variant == 3 && _is_spmv_extracted) { + Kokkos::fence(); + this->releaseCRS(); + } track_free(_buf_factor_ptr.span() * sizeof(size_type)); track_free(_buf_solve_ptr.span() * sizeof(size_type)); track_free(_buf_solve_nrhs_ptr.span() * sizeof(size_type)); @@ -666,6 +686,7 @@ class NumericToolsLevelSet : public NumericToolsBase { : base_type(method, m, ap, aj, perm, peri, nsupernodes, supernodes, gid_ptr, gid_colidx, sid_ptr, sid_colidx, blk_colidx, stree_parent, stree_ptr, stree_children, stree_level, stree_roots) { _nstreams = 0; + _is_spmv_extracted = 0; #if defined(KOKKOS_ENABLE_CUDA) _is_cublas_created = 0; _is_cusolver_dn_created = 0; @@ -1559,6 +1580,11 @@ class NumericToolsLevelSet : public NumericToolsBase { const value_type one(1); const value_type zero(0); + // ======================== + // free CRS, + // if it has been extracted + this->releaseCRS(); + // ======================== // workspace Kokkos::resize(_w_vec, m, nrhs); @@ -1576,6 +1602,11 @@ class NumericToolsLevelSet : public NumericToolsBase { // attach to Cusparse/Rocsparse data struct cusparseCreateDnMat(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); cusparseCreateDnVec(&vecW, m, (void*)(_w_vec.data()), computeType); + // also to T, to be destroyed before each SpMV call + cusparseCreateDnMat(&matL, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); + cusparseCreateDnVec(&vecL, m, (void*)(_w_vec.data()), computeType); + cusparseCreateDnMat(&matU, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); + cusparseCreateDnVec(&vecU, m, (void*)(_w_vec.data()), computeType); // vectors used for preprocessing #ifdef USE_SPMM_FOR_WORKSPACE_SIZE cusparseDnMatDescr_t vecX, vecY; @@ -1601,11 +1632,25 @@ class NumericToolsLevelSet : public NumericToolsBase { rocsparse_create_dnvec_descr(&vecY, m, (void*)_w_vec.data(), rocsparse_compute_type); #endif + // allocate rowptrs + Kokkos::resize(rowptrU, _team_serial_level_cut*(1+m)); + Kokkos::resize(rowptrL, _team_serial_level_cut*(1+m)); + Kokkos::deep_copy(rowptrL, 0); + // counting nnz, first, so that we can allocate in NumericalTool + size_t ptr = 0; + size_t nnzU = 0; + size_t nnzL = 0; + typedef TeamFunctor_ExtractCrs functor_type; for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { const ordinal_type pbeg = _h_level_ptr(lvl), pend = _h_level_ptr(lvl + 1); // the first supernode in this lvl (where the CRS matrix is stored) auto &s0 = _h_supernodes(_h_level_sids(pbeg)); +#if defined(KOKKOS_ENABLE_HIP) + s0.spmv_explicit_transpose = true; +#else + s0.spmv_explicit_transpose = false; // okay for SpMV, though may not for SpMM +#endif #define TACHO_INSERT_DIAGONALS // NOTE: this needs extra vector-entry copy for the non-active rows at each level for solve (copy t to w, and w back to t) @@ -1613,8 +1658,8 @@ class NumericToolsLevelSet : public NumericToolsBase { #define TACHO_INSERT_DIAGONALS // ======================== // count nnz / row - Kokkos::resize(s0.rowptrU, 1+m); - typedef TeamFunctor_ExtractCrs functor_type; + auto d_rowptrU = Kokkos::subview(rowptrU, range_type(ptr, ptr+m+1)); + s0.rowptrU = d_rowptrU.data(); functor_type extractor_crs(_info, _solve_mode, _level_sids); extractor_crs.setGlobalSize(m); @@ -1632,24 +1677,88 @@ class NumericToolsLevelSet : public NumericToolsBase { // ======================== // shift to generate rowptr using range_type = Kokkos::pair; - ordinal_type nnz = 0; { using range_policy_type = Kokkos::RangePolicy; Kokkos::parallel_scan("shiftRowptr", range_policy_type(0, m+1), rowptr_sum(s0.rowptrU)); exec_space().fence(); // get nnz - auto d_nnz = Kokkos::subview(s0.rowptrU, range_type(m, m+1)); + auto d_nnz = Kokkos::subview(d_rowptrU, range_type(m, m+1)); auto h_nnz = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nnz); - nnz = h_nnz(0); + s0.nnzU = h_nnz(0); + nnzU += s0.nnzU; } + if (lu) { + // get nnz per row (L is stored by column) + auto d_rowptrL = Kokkos::subview(rowptrL, range_type(ptr, ptr+m+1)); + s0.rowptrL = d_rowptrL.data(); + { + using team_policy_type = Kokkos::TeamPolicy, exec_space, + typename functor_type::ExtractPtrColTag>; + team_policy_type team_policy((pend-pbeg)+1, Kokkos::AUTO()); + + extractor_crs.setRowPtr(s0.rowptrL); + Kokkos::parallel_for("extract rowptr L", team_policy, extractor_crs); + exec_space().fence(); + } + { + // convert to offset + using range_policy_type = Kokkos::RangePolicy; + Kokkos::parallel_scan("shiftRowptr L", range_policy_type(0, m+1), rowptr_sum(s0.rowptrL)); + exec_space().fence(); + // get nnz (on CPU for now) + auto d_nnz = Kokkos::subview(d_rowptrL, range_type(m, m+1)); + auto h_nnz = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nnz); + s0.nnzL = h_nnz(0); + nnzL += s0.nnzL; + } + s0.spmv_explicit_transpose = true; + } else if (s0.spmv_explicit_transpose) { + // ======================== + // explicitly form transpose + s0.nnzL = s0.nnzU; + auto d_rowptrL = Kokkos::subview(rowptrL, range_type(ptr, ptr+m+1)); + s0.rowptrL = d_rowptrL.data(); + nnzL += s0.nnzL; + } + ptr += (1+m); + } + + // allocate (TODO: move to symbolic) + if (nnzU) { + Kokkos::resize(colindU, nnzU); + Kokkos::resize(nzvalsU, nnzU); + } + if (nnzL) { + Kokkos::resize(colindL, nnzL); + Kokkos::resize(nzvalsL, nnzL); + } + + // load nonzero val/ind + ptr = 0; + nnzU = 0; + nnzL = 0; + for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { + const ordinal_type pbeg = _h_level_ptr(lvl), pend = _h_level_ptr(lvl + 1); + + // the first supernode in this lvl (where the CRS matrix is stored) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); + // ======================== - // allocate (TODO: move to symbolic) - Kokkos::resize(s0.colindU, nnz); - Kokkos::resize(s0.nzvalsU, nnz); + // assign memory + auto d_rowptrU = Kokkos::subview(rowptrU, range_type(ptr, ptr+m+1)); + auto d_colindU = Kokkos::subview(colindU, range_type(nnzU, nnzU+s0.nnzU)); + auto d_nzvalsU = Kokkos::subview(nzvalsU, range_type(nnzU, nnzU+s0.nnzU)); + s0.colindU = d_colindU.data(); + s0.nzvalsU = d_nzvalsU.data(); + nnzU += s0.nnzU; // ======================== // extract nonzero element + functor_type extractor_crs(_info, _solve_mode, _level_sids); + extractor_crs.setGlobalSize(m); + extractor_crs.setRange(pbeg, pend); + extractor_crs.setRowPtr(s0.rowptrU); extractor_crs.setCrsView(s0.colindU, s0.nzvalsU); { using team_policy_type = Kokkos::TeamPolicy, exec_space, @@ -1661,53 +1770,27 @@ class NumericToolsLevelSet : public NumericToolsBase { exec_space().fence(); } + // ======================== + // shift back (TODO: shift first to avoid this) { - // ======================== - // shift back (TODO: shift first to avoid this) // copy to CPU, for now - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrU); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrU); for (ordinal_type i = m; i > 0 ; i--) h_rowptr(i) = h_rowptr(i-1); h_rowptr(0) = 0; - Kokkos::deep_copy(s0.rowptrU, h_rowptr); + Kokkos::deep_copy(d_rowptrU, h_rowptr); } -#if defined(KOKKOS_ENABLE_HIP) - s0.spmv_explicit_transpose = true; -#else - s0.spmv_explicit_transpose = false; // okay for SpMV, though may not for SpMM -#endif if (lu) { - s0.spmv_explicit_transpose = true; - - // get nnz per row (L is stored by column) - Kokkos::resize(s0.rowptrL, 1+m); - Kokkos::deep_copy(s0.rowptrL, 0); - ordinal_type nnz = 0; - { - using team_policy_type = Kokkos::TeamPolicy, exec_space, - typename functor_type::ExtractPtrColTag>; - team_policy_type team_policy((pend-pbeg)+1, Kokkos::AUTO()); - - extractor_crs.setRowPtr(s0.rowptrL); - Kokkos::parallel_for("extract rowptr L", team_policy, extractor_crs); - exec_space().fence(); - } - { - // convert to offset (on CPU for now) - using range_policy_type = Kokkos::RangePolicy; - Kokkos::parallel_scan("shiftRowptr L", range_policy_type(0, m+1), rowptr_sum(s0.rowptrL)); - exec_space().fence(); - // get nnz - auto d_nnz = Kokkos::subview(s0.rowptrL, range_type(m, m+1)); - auto h_nnz = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nnz); - nnz = h_nnz(0); - } - - // allocate (TODO: move to symbolic) - Kokkos::resize(s0.colindL, nnz); - Kokkos::resize(s0.nzvalsL, nnz); + auto d_rowptrL = Kokkos::subview(rowptrL, range_type(ptr, ptr+m+1)); + auto d_colindL = Kokkos::subview(colindL, range_type(nnzL, nnzL+s0.nnzL)); + auto d_nzvalsL = Kokkos::subview(nzvalsL, range_type(nnzL, nnzL+s0.nnzL)); + s0.colindL = d_colindL.data(); + s0.nzvalsL = d_nzvalsL.data(); + nnzL += s0.nnzL; + // ======================== // insert nonzeros + extractor_crs.setRowPtr(s0.rowptrL); extractor_crs.setCrsView(s0.colindL, s0.nzvalsL); extractor_crs.setPivPtr(_piv); { @@ -1719,25 +1802,21 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::parallel_for("extract nzvals L", team_policy, extractor_crs); exec_space().fence(); } + // ======================== + // shift back + // (TODO: shift first to avoid this) { - // ======================== - // shift back (TODO: shift first to avoid this) // copy to CPU, for now - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrL); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrL); for (ordinal_type i = m; i > 0 ; i--) h_rowptr(i) = h_rowptr(i-1); h_rowptr(0) = 0; - Kokkos::deep_copy(s0.rowptrL, h_rowptr); + Kokkos::deep_copy(d_rowptrL, h_rowptr); } } else if (s0.spmv_explicit_transpose) { // ======================== // transpose - // >> allocate - Kokkos::resize(s0.rowptrL, 1+m); - Kokkos::resize(s0.colindL, nnz); - Kokkos::resize(s0.nzvalsL, nnz); - - Kokkos::deep_copy(s0.rowptrL, 0); - extractor_crs.setRowPtrT(s0.rowptrL); + // >> generate rowptr + extractor_crs.setRowPtrT(s0.rowptrL); { // >> count nnz / row (transpose) using team_policy_type = Kokkos::RangePolicy; @@ -1750,26 +1829,41 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::parallel_scan("shiftRowptrT", range_policy_type(0, m+1), rowptr_sum(s0.rowptrL)); exec_space().fence(); } + + s0.nnzL = s0.nnzU; + auto d_colindL = Kokkos::subview(colindL, range_type(nnzL, nnzL+s0.nnzL)); + auto d_nzvalsL = Kokkos::subview(nzvalsL, range_type(nnzL, nnzL+s0.nnzL)); + s0.colindL = d_colindL.data(); + s0.nzvalsL = d_nzvalsL.data(); + nnzL += s0.nnzL; + + // ======================== + // >> copy into transpose-matrix + extractor_crs.setRowPtrT(s0.rowptrL); extractor_crs.setCrsViewT(s0.colindL, s0.nzvalsL); { - // >> copy into transpose-matrix using team_policy_type = Kokkos::RangePolicy; team_policy_type team_policy(0, m); Kokkos::parallel_for("transpose pointer", team_policy, extractor_crs); } + // ======================== + // shift back + // (TODO: shift first to avoid this) { - // ======================== // copy to CPU, for now - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrL); + auto d_rowptrL = Kokkos::subview(rowptrL, range_type(ptr, ptr+m+1)); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrL); for (ordinal_type i = m; i > 0 ; i--) h_rowptr(i) = h_rowptr(i-1); h_rowptr(0) = 0; - Kokkos::deep_copy(s0.rowptrL, h_rowptr); + Kokkos::deep_copy(d_rowptrL, h_rowptr); } } + ptr += (1+m); + // ======================== + // create NVIDIA/AMD data structures for SpMV size_t buffer_size_L = 0; size_t buffer_size_U = 0; - nnz = s0.colindU.extent(0); value_type alpha = one; #ifdef TACHO_INSERT_DIAGONALS value_type beta = zero; @@ -1779,8 +1873,8 @@ class NumericToolsLevelSet : public NumericToolsBase { #if defined(KOKKOS_ENABLE_CUDA) cusparseCreate(&s0.cusparseHandle); // create matrix - cusparseCreateCsr(&s0.U_cusparse, m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + cusparseCreateCsr(&s0.U_cusparse, m, m, s0.nnzU, + s0.rowptrU, s0.colindU, s0.nzvalsU, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); @@ -1793,10 +1887,9 @@ class NumericToolsLevelSet : public NumericToolsBase { computeType, TACHO_CUSPARSE_SPMV_ALG, &buffer_size_U); #endif if (s0.spmv_explicit_transpose) { - // create matrix (transpose or L) - nnz = s0.colindL.extent(0); - cusparseCreateCsr(&s0.L_cusparse, m, m, nnz, - s0.rowptrL.data(), s0.colindL.data(), s0.nzvalsL.data(), + // create matrix (transpose(U) or L) + cusparseCreateCsr(&s0.L_cusparse, m, m, s0.nnzL, + s0.rowptrL, s0.colindL, s0.nzvalsL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); // workspace size @@ -1810,8 +1903,9 @@ class NumericToolsLevelSet : public NumericToolsBase { #endif } else { // create matrix (L_cusparse stores the same ptrs as descrU, but optimized for trans) - cusparseCreateCsr(&s0.L_cusparse, m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + s0.nnzL = s0.nnzU; + cusparseCreateCsr(&s0.L_cusparse, m, m, s0.nnzL, + s0.rowptrU, s0.colindU, s0.nzvalsU, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); // workspace size for transpose SpMV @@ -1834,8 +1928,8 @@ class NumericToolsLevelSet : public NumericToolsBase { #elif defined(KOKKOS_ENABLE_HIP) rocsparse_create_handle(&s0.rocsparseHandle); // create matrix - rocsparse_create_csr_descr(&(s0.descrU), m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + rocsparse_create_csr_descr(&(s0.descrU), m, m, s0.nnzU, + s0.rowptrU, s0.colindU, s0.nzvalsU, rocsparse_indextype_i32, rocsparse_indextype_i32, rocsparse_index_base_zero, rocsparse_compute_type); // workspace #if ROCM_VERSION >= 50400 @@ -1866,9 +1960,8 @@ class NumericToolsLevelSet : public NumericToolsBase { #endif if (s0.spmv_explicit_transpose) { // create matrix (transpose) - nnz = s0.colindL.extent(0); - rocsparse_create_csr_descr(&(s0.descrL), m, m, nnz, - s0.rowptrL.data(), s0.colindL.data(), s0.nzvalsL.data(), + rocsparse_create_csr_descr(&(s0.descrL), m, m, s0.nnzL, + s0.rowptrL, s0.colindL, s0.nzvalsL, rocsparse_indextype_i32, rocsparse_indextype_i32, rocsparse_index_base_zero, rocsparse_compute_type); // workspace #if ROCM_VERSION >= 50400 @@ -1899,9 +1992,8 @@ class NumericToolsLevelSet : public NumericToolsBase { #endif } else { // create matrix, transpose (L_cusparse stores the same ptrs as descrU, but optimized for trans) - nnz = s0.colindL.extent(0); - rocsparse_create_csr_descr(&(s0.descrL), m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + rocsparse_create_csr_descr(&(s0.descrL), m, m, s0.nnzL, + s0.rowptrU, s0.colindU, s0.nzvalsU, rocsparse_indextype_i32, rocsparse_indextype_i32, rocsparse_index_base_zero, rocsparse_compute_type); // workspace (transpose) #if ROCM_VERSION >= 50400 @@ -1933,6 +2025,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } #endif } + #if defined(KOKKOS_ENABLE_CUDA) #ifdef USE_SPMM_FOR_WORKSPACE_SIZE cusparseDestroyDnMat(vecX); @@ -1945,9 +2038,32 @@ class NumericToolsLevelSet : public NumericToolsBase { rocsparse_destroy_dnvec_descr(vecX); rocsparse_destroy_dnvec_descr(vecY); #endif + _is_spmv_extracted = 1; #endif } + inline void releaseCRS() { + if(_is_spmv_extracted) { +#if defined(KOKKOS_ENABLE_CUDA) + for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { + const ordinal_type pbeg = _h_level_ptr(lvl); + // the first supernode in this lvl (where the CRS matrix is stored) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); + cusparseDestroySpMat(s0.U_cusparse); + cusparseDestroySpMat(s0.L_cusparse); + cusparseDestroy(s0.cusparseHandle); + } + cusparseDestroyDnMat(matL); + cusparseDestroyDnVec(vecL); + cusparseDestroyDnMat(matU); + cusparseDestroyDnVec(vecU); + cusparseDestroyDnMat(matW); + cusparseDestroyDnVec(vecW); +#endif + _is_spmv_extracted = 0; + } + } + /// /// Level set factorize /// @@ -2235,6 +2351,10 @@ class NumericToolsLevelSet : public NumericToolsBase { // attach to Cusparse/Rocsparse data struct int ldw = _w_vec.stride(1); #if defined(KOKKOS_ENABLE_CUDA) + // destroy previous + cusparseDestroyDnMat(matW); + cusparseDestroyDnVec(vecW); + // create new cusparseCreateDnMat(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); cusparseCreateDnVec(&vecW, m, (void*)(_w_vec.data()), computeType); #elif defined(KOKKOS_ENABLE_HIP) @@ -2249,74 +2369,65 @@ class NumericToolsLevelSet : public NumericToolsBase { exit(0); #endif #if defined(KOKKOS_ENABLE_CUDA) + // Desctory old CSR + cusparseDestroySpMat(s0.L_cusparse); // Re-create CuSparse CSR if (s0.spmv_explicit_transpose) { - size_t nnz = s0.nzvalsL.extent(0); - cusparseCreateCsr(&s0.L_cusparse, m, m, nnz, - s0.rowptrL.data(), s0.colindL.data(), s0.nzvalsL.data(), + cusparseCreateCsr(&s0.L_cusparse, m, m, s0.nnzL, + s0.rowptrL, s0.colindL, s0.nzvalsL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); } else { - size_t nnz = s0.nzvalsU.extent(0); - cusparseCreateCsr(&s0.L_cusparse, m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + cusparseCreateCsr(&s0.L_cusparse, m, m, s0.nnzU, + s0.rowptrU, s0.colindU, s0.nzvalsU, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); } // Call SpMV/SPMM cusparseStatus_t status; + cusparseOperation_t opL = (s0.spmv_explicit_transpose ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE); if (nrhs > 1) { if (lvl == nlvls-1) { + // start : destroy previous + cusparseDestroyDnMat(matL); // start : create DnMat for T - cusparseCreateDnMat(&matT, m, nrhs, ldt, (void*)(t.data()), computeType, CUSPARSE_ORDER_COL); + cusparseCreateDnMat(&matL, m, nrhs, ldt, (void*)(t.data()), computeType, CUSPARSE_ORDER_COL); } // create vectors - auto matX = ((nlvls-1-lvl)%2 == 0 ? matT : matW); - auto matY = ((nlvls-1-lvl)%2 == 0 ? matW : matT); + auto matX = ((nlvls-1-lvl)%2 == 0 ? matL : matW); + auto matY = ((nlvls-1-lvl)%2 == 0 ? matW : matL); // SpMM - if (s0.spmv_explicit_transpose) { - status = cusparseSpMM(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, s0.L_cusparse, - matX, - &beta, matY, - computeType, TACHO_CUSPARSE_SPMM_ALG, (void*)buffer_L.data()); - } else { - status = cusparseSpMM(s0.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, s0.L_cusparse, // L_cusparse stores the same ptrs as descrU, but optimized for trans - matX, - &beta, matY, - computeType, TACHO_CUSPARSE_SPMM_ALG, (void*)buffer_L.data()); - } + status = cusparseSpMM(s0.cusparseHandle, opL, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, s0.L_cusparse, + matX, + &beta, matY, + computeType, TACHO_CUSPARSE_SPMM_ALG, (void*)buffer_L.data()); } else { if (lvl == nlvls-1) { + // start : destroy previous + cusparseDestroyDnVec(vecL); // start : create DnMat for T - cusparseCreateDnVec(&vecT, m, (void*)(t.data()), computeType); + cusparseCreateDnVec(&vecL, m, (void*)(t.data()), computeType); } // create vectors - auto vecX = ((nlvls-1-lvl)%2 == 0 ? vecT : vecW); - auto vecY = ((nlvls-1-lvl)%2 == 0 ? vecW : vecT); + auto vecX = ((nlvls-1-lvl)%2 == 0 ? vecL : vecW); + auto vecY = ((nlvls-1-lvl)%2 == 0 ? vecW : vecL); // SpMV - if (s0.spmv_explicit_transpose) { - status = cusparseSpMV(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, s0.L_cusparse, - vecX, - &beta, vecY, - computeType, TACHO_CUSPARSE_SPMV_ALG, (void*)buffer_L.data()); - } else { - status = cusparseSpMV(s0.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, - &alpha, s0.L_cusparse, // L_cusparse stores the same ptrs as descrU, but optimized for trans - vecX, - &beta, vecY, - computeType, TACHO_CUSPARSE_SPMV_ALG, (void*)buffer_L.data()); - } + status = cusparseSpMV(s0.cusparseHandle, opL, + &alpha, s0.L_cusparse, + vecX, + &beta, vecY, + computeType, TACHO_CUSPARSE_SPMV_ALG, (void*)buffer_L.data()); } if (CUSPARSE_STATUS_SUCCESS != status) { - printf( " Failed cusparseSpMV for SpMV\n" ); + printf( " Failed cusparseSpMV for SpMV (lower)\n" ); } #elif defined(KOKKOS_ENABLE_HIP) rocsparse_status status; if (nrhs > 1) { if (lvl == nlvls-1) { + // start : destroy previous + rocsparse_destroy_dnmat_descr(matT); // start : create DnMat for T rocsparse_create_dnmat_descr(&matT, m, nrhs, ldt, (void*)(t.data()), rocsparse_compute_type, rocsparse_order_column); } @@ -2340,6 +2451,8 @@ class NumericToolsLevelSet : public NumericToolsBase { } } else { if (lvl == nlvls-1) { + // start : destroy previous + rocsparse_destroy_dnvec_descr(vecT); // start : create DnVec for T rocsparse_create_dnvec_descr(&vecT, m, (void*)(t.data()), rocsparse_compute_type); } @@ -2386,9 +2499,12 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::deep_copy(h_t, zero); if (s0.spmv_explicit_transpose) { - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrL); - auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.colindL); - auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.nzvalsL); + UnmanagedViewType d_rowptrL(s0.rowptrL, m+1); + UnmanagedViewType d_colindL(s0.colindL, s0.nnzL); + UnmanagedViewType d_nzvalsL(s0.nzvalsL, s0.nnzL); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrL); + auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_colindL); + auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nzvalsL); for (ordinal_type i = 0; i < m ; i++) { for (int k = h_rowptr(i); k < h_rowptr(i+1); k++) { for (int j = 0; j < nrhs; j++) { @@ -2397,9 +2513,12 @@ class NumericToolsLevelSet : public NumericToolsBase { } } } else { - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrU); - auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.colindU); - auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.nzvalsU); + UnmanagedViewType d_rowptrU(s0.rowptrU, m+1); + UnmanagedViewType d_colindU(s0.colindU, s0.nnzU); + UnmanagedViewType d_nzvalsU(s0.nzvalsU, s0.nnzU); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrU); + auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_colindU); + auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nzvalsU); for (ordinal_type i = 0; i < m ; i++) { for (int k = h_rowptr(i); k < h_rowptr(i+1); k++) { for (int j = 0; j < nrhs; j++) { @@ -2410,18 +2529,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } #endif if (lvl == 0) { - // end : destroy vectors -#if defined(KOKKOS_ENABLE_CUDA) && defined(TACHO_HAVE_CUSPARSE) - if (nrhs > 1) - cusparseDestroyDnMat(matT); - else - cusparseDestroyDnVec(vecT); -#elif defined(KOKKOS_ENABLE_HIP) - if (nrhs > 1) - rocsparse_destroy_dnmat_descr(matT); - else - rocsparse_destroy_dnvec_descr(vecT); -#endif + // end : copy to output if ((nlvls-1)%2 == 0) { Kokkos::deep_copy(t, _w_vec); } @@ -2615,20 +2723,24 @@ class NumericToolsLevelSet : public NumericToolsBase { } cusparseStatus_t status; + // Desctory old CSR + cusparseDestroySpMat(s0.U_cusparse); // Re-create CuSparse CSR - size_t nnz = s0.nzvalsU.extent(0); - cusparseCreateCsr(&s0.U_cusparse, m, m, nnz, - s0.rowptrU.data(), s0.colindU.data(), s0.nzvalsU.data(), + cusparseCreateCsr(&s0.U_cusparse, m, m, s0.nnzU, + s0.rowptrU, s0.colindU, s0.nzvalsU, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, computeType); + // Call SpMV/SPMM if (nrhs > 1) { if (lvl == 0) { + // start : destroy previous + cusparseDestroyDnMat(matU); // start : create DnMat for T - cusparseCreateDnMat(&matT, m, nrhs, ldt, (void*)(t.data()), computeType, CUSPARSE_ORDER_COL); + cusparseCreateDnMat(&matU, m, nrhs, ldt, (void*)(t.data()), computeType, CUSPARSE_ORDER_COL); } - auto vecX = (lvl%2 == 0 ? matT : matW); - auto vecY = (lvl%2 == 0 ? matW : matT); + auto vecX = (lvl%2 == 0 ? matU : matW); + auto vecY = (lvl%2 == 0 ? matW : matU); // SpMM status = cusparseSpMM(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, @@ -2637,11 +2749,13 @@ class NumericToolsLevelSet : public NumericToolsBase { computeType, TACHO_CUSPARSE_SPMM_ALG, (void*)buffer_U.data()); } else { if (lvl == 0) { + // start : destroy previous + cusparseDestroyDnVec(vecU); // start : create DnMat for T - cusparseCreateDnVec(&vecT, m, (void*)(t.data()), computeType); + cusparseCreateDnVec(&vecU, m, (void*)(t.data()), computeType); } - auto vecX = (lvl%2 == 0 ? vecT : vecW); - auto vecY = (lvl%2 == 0 ? vecW : vecT); + auto vecX = (lvl%2 == 0 ? vecU : vecW); + auto vecY = (lvl%2 == 0 ? vecW : vecU); // SpMV status = cusparseSpMV(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, @@ -2650,7 +2764,7 @@ class NumericToolsLevelSet : public NumericToolsBase { computeType, TACHO_CUSPARSE_SPMV_ALG, (void*)buffer_U.data()); } if (CUSPARSE_STATUS_SUCCESS != status) { - printf( " Failed cusparseSpMV for SpMV\n" ); + printf( " Failed cusparseSpMV for SpMV (upper)\n" ); } #elif defined(KOKKOS_ENABLE_HIP) rocsparse_datatype rocsparse_compute_type = rocsparse_datatype_f64_r; @@ -2704,9 +2818,12 @@ class NumericToolsLevelSet : public NumericToolsBase { auto h_t = Kokkos::create_mirror_view(host_memory_space(), (lvl%2 == 0 ? _w_vec : t)); Kokkos::deep_copy(h_t, zero); - auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.rowptrU); - auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.colindU); - auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), s0.nzvalsU); + UnmanagedViewType d_rowptrU(s0.rowptrU, m+1); + UnmanagedViewType d_colindU(s0.colindU, s0.nnzU); + UnmanagedViewType d_nzvalsU(s0.nzvalsU, s0.nnzU); + auto h_rowptr = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_rowptrU); + auto h_colind = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_colindU); + auto h_nzvals = Kokkos::create_mirror_view_and_copy(host_memory_space(), d_nzvalsU); for (ordinal_type i = 0; i < m ; i++) { for (int k = h_rowptr(i); k < h_rowptr(i+1); k++) { @@ -2718,21 +2835,16 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::deep_copy(t, h_t); #endif if (lvl == nlvls-1) { - // end : destroy vectors -#if defined(KOKKOS_ENABLE_CUDA) && defined(TACHO_HAVE_CUSPARSE) - if (nrhs > 1) - cusparseDestroyDnMat(matT); - else - cusparseDestroyDnVec(vecT); -#elif defined(KOKKOS_ENABLE_HIP) + // end : copy to output + if (lvl%2 == 0) { + Kokkos::deep_copy(t, _w_vec); + } +#if defined(KOKKOS_ENABLE_HIP) if (nrhs > 1) rocsparse_destroy_dnmat_descr(matT); else rocsparse_destroy_dnvec_descr(vecT); #endif - if (lvl%2 == 0) { - Kokkos::deep_copy(t, _w_vec); - } } #endif } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp index dfdf28851be2..f25f397955e6 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp @@ -105,14 +105,11 @@ template struct SupernodeInfo { using ordinal_type_array = Kokkos::View; using size_type_array = Kokkos::View; using value_type_array = Kokkos::View; + using int_type_array = Kokkos::View; using ordinal_pair_type = Kokkos::pair; using ordinal_pair_type_array = Kokkos::View; using value_type_matrix = Kokkos::View; - - using rowptr_view = Kokkos::View; - using colind_view = Kokkos::View; - using nzvals_view = Kokkos::View; using range_type = Kokkos::pair; struct Supernode { @@ -133,13 +130,15 @@ template struct SupernodeInfo { bool do_not_apply_pivots; // for using SpMV - rowptr_view rowptrU; - colind_view colindU; - nzvals_view nzvalsU; - - rowptr_view rowptrL; - colind_view colindL; - nzvals_view nzvalsL; + size_t nnzU; + int* rowptrU; + int* colindU; + value_type* nzvalsU; + + size_t nnzL; + int* rowptrL; + int* colindL; + value_type* nzvalsL; bool spmv_explicit_transpose; #if defined(KOKKOS_ENABLE_CUDA) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp index 99d7aa2b6fa7..de7419e7757c 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp @@ -29,24 +29,23 @@ Sandia National Laboratories, Albuquerque, NM, USA namespace Tacho { -template struct rowptr_sum { - rowptr_view _rowptr; + int* _rowptr; - rowptr_sum(rowptr_view rowptr) + rowptr_sum(int* rowptr) : _rowptr(rowptr) {} KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i, ordinal_type& update, const bool is_final) const { - const ordinal_type val_i = _rowptr(i); + const ordinal_type val_i = _rowptr[i]; update += val_i; if (is_final) { - _rowptr(i) = update; + _rowptr[i] = update; } } - ordinal_type nnz() { return _rowptr(_rowptr.extent(0)); } + //ordinal_type nnz() { return _rowptr[_rowptr.extent(0)]; } }; template struct TeamFunctor_ExtractCrs { @@ -56,16 +55,13 @@ template struct TeamFunctor_ExtractCrs { typedef SupernodeInfoType supernode_info_type; typedef typename supernode_info_type::supernode_type supernode_type; - typedef typename supernode_info_type::ordinal_type_array ordinal_type_array; - typedef typename supernode_info_type::size_type_array size_type_array; - typedef typename supernode_info_type::value_type value_type; - typedef typename supernode_info_type::value_type_array value_type_array; typedef typename supernode_info_type::value_type_matrix value_type_matrix; + typedef typename supernode_info_type::ordinal_type_array ordinal_type_array; - typedef typename supernode_info_type::rowptr_view rowptr_view; - typedef typename supernode_info_type::colind_view colind_view; - typedef typename supernode_info_type::nzvals_view nzvals_view; + //typedef typename supernode_info_type::rowptr_view rowptr_view; + //typedef typename supernode_info_type::colind_view colind_view; + //typedef typename supernode_info_type::nzvals_view nzvals_view; private: supernode_info_type _info; @@ -74,13 +70,13 @@ template struct TeamFunctor_ExtractCrs { ordinal_type _m; // in CRS format - rowptr_view _rowptr; - colind_view _colind; - nzvals_view _nzvals; + int* _rowptr; + int* _colind; + value_type* _nzvals; // in CRS format, transpose - rowptr_view _rowptrT; - colind_view _colindT; - nzvals_view _nzvalsT; + int* _rowptrT; + int* _colindT; + value_type* _nzvalsT; // pivot ordinal_type_array _piv; @@ -102,13 +98,13 @@ template struct TeamFunctor_ExtractCrs { _pend = pend; } - inline void setRowPtr(rowptr_view &rowptr) { _rowptr = rowptr; } - inline void setCrsView(colind_view &colind, nzvals_view &nzvals) { + inline void setRowPtr(int* rowptr) { _rowptr = rowptr; } + inline void setCrsView(int *colind, value_type *nzvals) { _colind = colind; _nzvals = nzvals; } - inline void setRowPtrT(rowptr_view &rowptrT) { _rowptrT = rowptrT; } - inline void setCrsViewT(colind_view &colindT, nzvals_view &nzvalsT) { + inline void setRowPtrT(int* rowptrT) { _rowptrT = rowptrT; } + inline void setCrsViewT(int *colindT, value_type *nzvalsT) { _colindT = colindT; _nzvalsT = nzvalsT; } @@ -148,7 +144,7 @@ template struct TeamFunctor_ExtractCrs { } // add diagonal entry Kokkos::parallel_for(Kokkos::TeamThreadRange(member, offm-row_id), - [&](const int& i) { _rowptr(row_id+i+1) = 1; }); + [&](const int& i) { _rowptr[row_id+i+1] = 1; }); #endif if (p < _pend) { if (s.m > 0) { @@ -158,10 +154,10 @@ template struct TeamFunctor_ExtractCrs { UnmanagedViewType AT(aptr, s.m, s.n); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, s.m), [&](const int& i) { - _rowptr(1+i+offm) = 0; + _rowptr[1+i+offm] = 0; for (ordinal_type j = 0; j < s.n; j++) { if (AT(i,j) != zero) { - _rowptr(1+i+offm) ++; + _rowptr[1+i+offm] ++; } } }); @@ -195,10 +191,10 @@ template struct TeamFunctor_ExtractCrs { // insert diagonals for the missing rows between previous and this block Kokkos::parallel_for(Kokkos::TeamThreadRange(member, offm-row_id), [&](const int& i) { - int nnz = _rowptr(row_id+i); - _colind(nnz) = row_id+i; - _nzvals(nnz) = one; - _rowptr(row_id+i)++; + int nnz = _rowptr[row_id+i]; + _colind[nnz] = row_id+i; + _nzvals[nnz] = one; + _rowptr[row_id+i]++; }); #endif if (p < _pend) { @@ -212,10 +208,10 @@ template struct TeamFunctor_ExtractCrs { ordinal_type j; for (ordinal_type j = i; j < s.m; j++) { if (AT(i,j) != zero) { - int nnz = _rowptr(i+offm); - _colind(nnz) = j+offm; - _nzvals(nnz) = AT(i,j); - _rowptr(i+offm) ++; + int nnz = _rowptr[i+offm]; + _colind[nnz] = j+offm; + _nzvals[nnz] = AT(i,j); + _rowptr[i+offm] ++; } } // off-diagonal blocksa @@ -223,10 +219,10 @@ template struct TeamFunctor_ExtractCrs { for (ordinal_type id = s.sid_col_begin + 1; id < s.sid_col_end - 1; id++) { for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { if (AT(i,j) != zero) { - int nnz = _rowptr(i+offm); - _colind(nnz) = _info.gid_colidx(k+offn); - _nzvals(nnz) = AT(i,j); - _rowptr(i+offm) ++; + int nnz = _rowptr[i+offm]; + _colind[nnz] = _info.gid_colidx(k+offn); + _nzvals[nnz] = AT(i,j); + _rowptr[i+offm] ++; } j++; } @@ -263,7 +259,7 @@ template struct TeamFunctor_ExtractCrs { } // add diagonal entry Kokkos::parallel_for(Kokkos::TeamThreadRange(member, offm-row_id), - [&](const int& i) { Kokkos::atomic_add(&(_rowptr(row_id+i+1)), 1); }); + [&](const int& i) { Kokkos::atomic_add(&(_rowptr[row_id+i+1]), 1); }); #endif if (p < _pend) { // extract this supernode (AL is stored by col) @@ -275,7 +271,7 @@ template struct TeamFunctor_ExtractCrs { // first extract diagnal block (each thread extract row in parallel) for (ordinal_type i = 0; i < s.m; i++) { if (AL(i,j) != zero) { - Kokkos::atomic_add(&(_rowptr(1+i+offm)), 1); + Kokkos::atomic_add(&(_rowptr[1+i+offm]), 1); } } // off-diagonals (each thread extract col, needing atomic-add) @@ -284,7 +280,7 @@ template struct TeamFunctor_ExtractCrs { for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { if (AL(i, j) != zero) { ordinal_type gid_i = _info.gid_colidx(k+offn); - Kokkos::atomic_add(&(_rowptr(1+gid_i)), 1); + Kokkos::atomic_add(&(_rowptr[1+gid_i]), 1); } i++; } @@ -320,9 +316,9 @@ template struct TeamFunctor_ExtractCrs { // add diagonal entry Kokkos::parallel_for(Kokkos::TeamThreadRange(member, offm-row_id), [&](const int& i) { - ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr(row_id+i)), 1); - _colind(nnz) = row_id+i; - _nzvals(nnz) = one; + ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr[row_id+i]), 1); + _colind[nnz] = row_id+i; + _nzvals[nnz] = one; }); #endif if (p < _pend) { @@ -340,9 +336,9 @@ template struct TeamFunctor_ExtractCrs { // diagnal block for (ordinal_type i = 0; i < s.m; i++) { if (AL(i,j) != zero) { - ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr(offm+i)), 1); - _colind(nnz) = gid_j; - _nzvals(nnz) = AL(i,j); + ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr[offm+i]), 1); + _colind[nnz] = gid_j; + _nzvals[nnz] = AL(i,j); } } // off-diagonals (each thread extract col, needing atomic-add) @@ -351,9 +347,9 @@ template struct TeamFunctor_ExtractCrs { for (ordinal_type k = _info.sid_block_colidx(id).second; k < _info.sid_block_colidx(id + 1).second; k++) { if (AL(i, j) != zero) { ordinal_type gid_i = _info.gid_colidx(k+offn); - ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr(gid_i)), 1); - _colind(nnz) = gid_j; - _nzvals(nnz) = AL(i,j); + ordinal_type nnz = Kokkos::atomic_fetch_add(&(_rowptr[gid_i]), 1); + _colind[nnz] = gid_j; + _nzvals[nnz] = AL(i,j); } i++; } @@ -369,17 +365,17 @@ template struct TeamFunctor_ExtractCrs { // Functors to transpose KOKKOS_INLINE_FUNCTION void operator()(const TransPtrTag &, const int i) const { // count offset rowptrT - for (ordinal_type k = _rowptr(i); k < _rowptr(i+1); k++) { - Kokkos::atomic_add(&(_rowptrT(_colind(k)+1)), 1); + for (ordinal_type k = _rowptr[i]; k < _rowptr[i+1]; k++) { + Kokkos::atomic_add(&(_rowptrT[_colind[k]+1]), 1); } } KOKKOS_INLINE_FUNCTION void operator()(const TransMatTag &, const int i) const { // count offset rowptrT - for (ordinal_type k = _rowptr(i); k < _rowptr(i+1); k++) { - int nnz = Kokkos::atomic_fetch_add(&(_rowptrT(_colind(k))), 1); - _colindT(nnz) = i; - _nzvalsT(nnz) = _nzvals(k); + for (ordinal_type k = _rowptr[i]; k < _rowptr[i+1]; k++) { + int nnz = Kokkos::atomic_fetch_add(&(_rowptrT[_colind[k]]), 1); + _colindT[nnz] = i; + _nzvalsT[nnz] = _nzvals[k]; } } }; diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeChol.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeChol.hpp index f97103664def..f03ce9a1f035 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeChol.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_FactorizeChol.hpp @@ -42,10 +42,6 @@ template struct TeamFunctor_FactorizeChol { typedef typename supernode_info_type::value_type_array value_type_array; typedef typename supernode_info_type::value_type_matrix value_type_matrix; - typedef typename supernode_info_type::rowptr_view rowptr_view; - typedef typename supernode_info_type::colind_view colind_view; - typedef typename supernode_info_type::nzvals_view nzvals_view; - private: supernode_info_type _info; ordinal_type_array _compute_mode, _level_sids; From 46a900eaa366b6359b13bd332139f044426fb3ca Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 2 Aug 2024 00:11:01 -0400 Subject: [PATCH 05/13] Tacho : workspace on AMD GPU --- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 70 ++++++++++++------- .../src/impl/Tacho_TeamFunctor_ExtractCRS.hpp | 6 -- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 980ea1e4fb7a..c1ba449d5f21 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -222,8 +222,8 @@ class NumericToolsLevelSet : public NumericToolsBase { rocblas_handle _handle_lapack; std::vector _handles; // workspace for SpMV - rocsparse_dnmat_descr matT, matW; - rocsparse_dnvec_descr vecT, vecW; + rocsparse_dnmat_descr matL, matU, matW; + rocsparse_dnvec_descr vecL, vecU, vecW; using blas_handle_type = rocblas_handle; using lapack_handle_type = rocblas_handle; @@ -1599,7 +1599,7 @@ class NumericToolsLevelSet : public NumericToolsBase { TACHO_TEST_FOR_EXCEPTION(true, std::logic_error, "LevelSetTools::solveCholeskyLowerOnDevice: ComputeSPMV only supported double or float"); } - // attach to Cusparse/Rocsparse data struct + // attach to Cusparse data struct cusparseCreateDnMat(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); cusparseCreateDnVec(&vecW, m, (void*)(_w_vec.data()), computeType); // also to T, to be destroyed before each SpMV call @@ -1623,9 +1623,14 @@ class NumericToolsLevelSet : public NumericToolsBase { if (std::is_same::value) { rocsparse_compute_type = rocsparse_datatype_f32_r; } - // attach to Cusparse/Rocsparse data struct + // attach to Rocsparse data struct rocsparse_create_dnmat_descr(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), rocsparse_compute_type, rocsparse_order_column); rocsparse_create_dnvec_descr(&vecW, m, (void*)(_w_vec.data()), rocsparse_compute_type); + // also to T, to be destroyed before each SpMV call + rocsparse_create_dnmat_descr(&matL, m, nrhs, ldw, (void*)(_w_vec.data()), rocsparse_compute_type, rocsparse_order_column); + rocsparse_create_dnvec_descr(&vecL, m, (void*)(_w_vec.data()), rocsparse_compute_type); + rocsparse_create_dnmat_descr(&matU, m, nrhs, ldw, (void*)(_w_vec.data()), rocsparse_compute_type, rocsparse_order_column); + rocsparse_create_dnvec_descr(&vecU, m, (void*)(_w_vec.data()), rocsparse_compute_type); // vectors used for preprocessing rocsparse_dnvec_descr vecX, vecY; rocsparse_create_dnvec_descr(&vecX, m, (void*)_w_vec.data(), rocsparse_compute_type); @@ -2059,6 +2064,21 @@ class NumericToolsLevelSet : public NumericToolsBase { cusparseDestroyDnVec(vecU); cusparseDestroyDnMat(matW); cusparseDestroyDnVec(vecW); +#elif defined(KOKKOS_ENABLE_HIP) + for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { + const ordinal_type pbeg = _h_level_ptr(lvl); + // the first supernode in this lvl (where the CRS matrix is stored) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); + rocsparse_destroy_spmat_descr(s0.descrU); + rocsparse_destroy_spmat_descr(s0.descrL); + rocsparse_destroy_handle(s0.rocsparseHandle); + } + rocsparse_destroy_dnmat_descr(matL); + rocsparse_destroy_dnvec_descr(vecL); + rocsparse_destroy_dnmat_descr(matU); + rocsparse_destroy_dnvec_descr(vecU); + rocsparse_destroy_dnmat_descr(matW); + rocsparse_destroy_dnvec_descr(vecW); #endif _is_spmv_extracted = 0; } @@ -2358,12 +2378,15 @@ class NumericToolsLevelSet : public NumericToolsBase { cusparseCreateDnMat(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); cusparseCreateDnVec(&vecW, m, (void*)(_w_vec.data()), computeType); #elif defined(KOKKOS_ENABLE_HIP) + // destroy previous + rocsparse_destroy_dnmat_descr(matW); + rocsparse_destroy_dnvec_descr(vecW); + // create new rocsparse_create_dnmat_descr(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), rocsparse_compute_type, rocsparse_order_column); rocsparse_create_dnvec_descr(&vecW, m, (void*)(_w_vec.data()), rocsparse_compute_type); #endif } const ordinal_type ldt = t.stride(1); - const ordinal_type ldw = _w_vec.stride(1); auto &s0 = _h_supernodes(_h_level_sids(pbeg)); #else exit(0); @@ -2427,13 +2450,13 @@ class NumericToolsLevelSet : public NumericToolsBase { if (nrhs > 1) { if (lvl == nlvls-1) { // start : destroy previous - rocsparse_destroy_dnmat_descr(matT); + rocsparse_destroy_dnmat_descr(matL); // start : create DnMat for T - rocsparse_create_dnmat_descr(&matT, m, nrhs, ldt, (void*)(t.data()), rocsparse_compute_type, rocsparse_order_column); + rocsparse_create_dnmat_descr(&matL, m, nrhs, ldt, (void*)(t.data()), rocsparse_compute_type, rocsparse_order_column); } // create vectors - auto vecX = ((nlvls-1-lvl)%2 == 0 ? matT : matW); - auto vecY = ((nlvls-1-lvl)%2 == 0 ? matW : matT); + auto vecX = ((nlvls-1-lvl)%2 == 0 ? matL : matW); + auto vecY = ((nlvls-1-lvl)%2 == 0 ? matW : matL); if (s0.spmv_explicit_transpose) { size_t buffer_size_L = buffer_L.extent(0); status = rocsparse_spmm(s0.rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, @@ -2452,13 +2475,13 @@ class NumericToolsLevelSet : public NumericToolsBase { } else { if (lvl == nlvls-1) { // start : destroy previous - rocsparse_destroy_dnvec_descr(vecT); + rocsparse_destroy_dnvec_descr(vecL); // start : create DnVec for T - rocsparse_create_dnvec_descr(&vecT, m, (void*)(t.data()), rocsparse_compute_type); + rocsparse_create_dnvec_descr(&vecL, m, (void*)(t.data()), rocsparse_compute_type); } size_t buffer_size_L = buffer_L.extent(0); - auto vecX = ((nlvls-1-lvl)%2 == 0 ? vecT : vecW); - auto vecY = ((nlvls-1-lvl)%2 == 0 ? vecW : vecT); + auto vecX = ((nlvls-1-lvl)%2 == 0 ? vecL : vecW); + auto vecY = ((nlvls-1-lvl)%2 == 0 ? vecW : vecL); if (s0.spmv_explicit_transpose) { status = #if ROCM_VERSION >= 50400 @@ -2709,7 +2732,6 @@ class NumericToolsLevelSet : public NumericToolsBase { const value_type alpha (1); const value_type beta (0); const ordinal_type ldt = t.stride(1); - const ordinal_type ldw = _w_vec.stride(1); #else exit(0); #endif @@ -2779,10 +2801,11 @@ class NumericToolsLevelSet : public NumericToolsBase { if (nrhs > 1) { if (lvl == 0) { // start : create DnMat for T - rocsparse_create_dnmat_descr(&matT, m, nrhs, ldt, (void*)(t.data()), rocsparse_compute_type, rocsparse_order_column); + rocsparse_destroy_dnmat_descr(matU); + rocsparse_create_dnmat_descr(&matU, m, nrhs, ldt, (void*)(t.data()), rocsparse_compute_type, rocsparse_order_column); } - auto vecX = (lvl%2 == 0 ? matT : matW); - auto vecY = (lvl%2 == 0 ? matW : matT); + auto vecX = (lvl%2 == 0 ? matU : matW); + auto vecY = (lvl%2 == 0 ? matW : matU); status = rocsparse_spmm(s0.rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, &alpha, s0.descrU, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmm_alg_default, @@ -2791,10 +2814,11 @@ class NumericToolsLevelSet : public NumericToolsBase { } else { if (lvl == 0) { // start : create DnVec for T - rocsparse_create_dnvec_descr(&vecT, m, (void*)(t.data()), rocsparse_compute_type); + rocsparse_destroy_dnvec_descr(vecU); + rocsparse_create_dnvec_descr(&vecU, m, (void*)(t.data()), rocsparse_compute_type); } - auto vecX = (lvl%2 == 0 ? vecT : vecW); - auto vecY = (lvl%2 == 0 ? vecW : vecT); + auto vecX = (lvl%2 == 0 ? vecU : vecW); + auto vecY = (lvl%2 == 0 ? vecW : vecU); status = #if ROCM_VERSION >= 50400 rocsparse_spmv_ex @@ -2839,12 +2863,6 @@ class NumericToolsLevelSet : public NumericToolsBase { if (lvl%2 == 0) { Kokkos::deep_copy(t, _w_vec); } -#if defined(KOKKOS_ENABLE_HIP) - if (nrhs > 1) - rocsparse_destroy_dnmat_descr(matT); - else - rocsparse_destroy_dnvec_descr(vecT); -#endif } #endif } diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp index de7419e7757c..cff6e5b78f29 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_TeamFunctor_ExtractCRS.hpp @@ -44,8 +44,6 @@ struct rowptr_sum { _rowptr[i] = update; } } - - //ordinal_type nnz() { return _rowptr[_rowptr.extent(0)]; } }; template struct TeamFunctor_ExtractCrs { @@ -59,10 +57,6 @@ template struct TeamFunctor_ExtractCrs { typedef typename supernode_info_type::value_type_matrix value_type_matrix; typedef typename supernode_info_type::ordinal_type_array ordinal_type_array; - //typedef typename supernode_info_type::rowptr_view rowptr_view; - //typedef typename supernode_info_type::colind_view colind_view; - //typedef typename supernode_info_type::nzvals_view nzvals_view; - private: supernode_info_type _info; ordinal_type_array _compute_mode, _level_sids; From 4bf336b0b5124886d127c745664dfd7f97dcb413 Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Fri, 2 Aug 2024 22:30:56 -0600 Subject: [PATCH 06/13] Tacho : move sparse handle from SupernodeInfo to NumericalTools --- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 56 ++++++++++--------- .../tacho/src/impl/Tacho_SupernodeInfo.hpp | 2 - 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index c1ba449d5f21..520ec5554869 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -209,6 +209,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // (separte for U and L, so that we can "destroy" without waiting for the other) cusparseDnMatDescr_t matL, matU, matW; cusparseDnVecDescr_t vecL, vecU, vecW; + cusparseHandle_t cusparseHandle; #endif using blas_handle_type = cublasHandle_t; @@ -224,6 +225,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // workspace for SpMV rocsparse_dnmat_descr matL, matU, matW; rocsparse_dnvec_descr vecL, vecU, vecW; + rocsparse_handle rocsparseHandle; using blas_handle_type = rocblas_handle; using lapack_handle_type = rocblas_handle; @@ -1599,6 +1601,8 @@ class NumericToolsLevelSet : public NumericToolsBase { TACHO_TEST_FOR_EXCEPTION(true, std::logic_error, "LevelSetTools::solveCholeskyLowerOnDevice: ComputeSPMV only supported double or float"); } + // create cusparse handle + cusparseCreate(&cusparseHandle); // attach to Cusparse data struct cusparseCreateDnMat(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), computeType, CUSPARSE_ORDER_COL); cusparseCreateDnVec(&vecW, m, (void*)(_w_vec.data()), computeType); @@ -1623,6 +1627,8 @@ class NumericToolsLevelSet : public NumericToolsBase { if (std::is_same::value) { rocsparse_compute_type = rocsparse_datatype_f32_r; } + // create rocsparse handle + rocsparse_create_handle(&rocsparseHandle); // attach to Rocsparse data struct rocsparse_create_dnmat_descr(&matW, m, nrhs, ldw, (void*)(_w_vec.data()), rocsparse_compute_type, rocsparse_order_column); rocsparse_create_dnvec_descr(&vecW, m, (void*)(_w_vec.data()), rocsparse_compute_type); @@ -1876,7 +1882,6 @@ class NumericToolsLevelSet : public NumericToolsBase { value_type beta = one; #endif #if defined(KOKKOS_ENABLE_CUDA) - cusparseCreate(&s0.cusparseHandle); // create matrix cusparseCreateCsr(&s0.U_cusparse, m, m, s0.nnzU, s0.rowptrU, s0.colindU, s0.nzvalsU, @@ -1884,11 +1889,11 @@ class NumericToolsLevelSet : public NumericToolsBase { CUSPARSE_INDEX_BASE_ZERO, computeType); #ifdef USE_SPMM_FOR_WORKSPACE_SIZE - cusparseSpMM_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + cusparseSpMM_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMM_ALG, &buffer_size_U); #else - cusparseSpMV_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, vecX, &beta, vecY, + cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMV_ALG, &buffer_size_U); #endif if (s0.spmv_explicit_transpose) { @@ -1899,11 +1904,11 @@ class NumericToolsLevelSet : public NumericToolsBase { CUSPARSE_INDEX_BASE_ZERO, computeType); // workspace size #ifdef USE_SPMM_FOR_WORKSPACE_SIZE - cusparseSpMM_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + cusparseSpMM_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMM_ALG, &buffer_size_L); #else - cusparseSpMV_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, + cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMV_ALG, &buffer_size_L); #endif } else { @@ -1915,11 +1920,11 @@ class NumericToolsLevelSet : public NumericToolsBase { CUSPARSE_INDEX_BASE_ZERO, computeType); // workspace size for transpose SpMV #ifdef USE_SPMM_FOR_WORKSPACE_SIZE - cusparseSpMM_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + cusparseSpMM_bufferSize(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMM_ALG, &buffer_size_L); #else - cusparseSpMV_bufferSize(s0.cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, + cusparseSpMV_bufferSize(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, &alpha, s0.L_cusparse, vecX, &beta, vecY, computeType, TACHO_CUSPARSE_SPMV_ALG, &buffer_size_L); #endif } @@ -1931,7 +1936,6 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::resize(buffer_L, buffer_size_L); } #elif defined(KOKKOS_ENABLE_HIP) - rocsparse_create_handle(&s0.rocsparseHandle); // create matrix rocsparse_create_csr_descr(&(s0.descrU), m, m, s0.nnzU, s0.rowptrU, s0.colindU, s0.nzvalsU, @@ -1942,7 +1946,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrU, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 @@ -1957,7 +1961,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // preprocess buffer_size_U = buffer_U.extent(0); rocsparse_spmv_ex - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrU, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, rocsparse_spmv_stage_preprocess, @@ -1974,7 +1978,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 @@ -1989,7 +1993,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // preprocess buffer_size_L = buffer_L.extent(0); rocsparse_spmv_ex - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, rocsparse_spmv_stage_preprocess, @@ -2006,7 +2010,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_transpose, + (rocsparseHandle, rocsparse_operation_transpose, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 @@ -2021,7 +2025,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // preprocess buffer_size_L = buffer_L.extent(0); rocsparse_spmv_ex - (s0.rocsparseHandle, rocsparse_operation_transpose, + (rocsparseHandle, rocsparse_operation_transpose, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, rocsparse_spmv_stage_preprocess, @@ -2056,7 +2060,6 @@ class NumericToolsLevelSet : public NumericToolsBase { auto &s0 = _h_supernodes(_h_level_sids(pbeg)); cusparseDestroySpMat(s0.U_cusparse); cusparseDestroySpMat(s0.L_cusparse); - cusparseDestroy(s0.cusparseHandle); } cusparseDestroyDnMat(matL); cusparseDestroyDnVec(vecL); @@ -2064,6 +2067,7 @@ class NumericToolsLevelSet : public NumericToolsBase { cusparseDestroyDnVec(vecU); cusparseDestroyDnMat(matW); cusparseDestroyDnVec(vecW); + cusparseDestroy(cusparseHandle); #elif defined(KOKKOS_ENABLE_HIP) for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { const ordinal_type pbeg = _h_level_ptr(lvl); @@ -2071,7 +2075,6 @@ class NumericToolsLevelSet : public NumericToolsBase { auto &s0 = _h_supernodes(_h_level_sids(pbeg)); rocsparse_destroy_spmat_descr(s0.descrU); rocsparse_destroy_spmat_descr(s0.descrL); - rocsparse_destroy_handle(s0.rocsparseHandle); } rocsparse_destroy_dnmat_descr(matL); rocsparse_destroy_dnvec_descr(vecL); @@ -2079,6 +2082,7 @@ class NumericToolsLevelSet : public NumericToolsBase { rocsparse_destroy_dnvec_descr(vecU); rocsparse_destroy_dnmat_descr(matW); rocsparse_destroy_dnvec_descr(vecW); + rocsparse_destroy_handle(rocsparseHandle); #endif _is_spmv_extracted = 0; } @@ -2420,7 +2424,7 @@ class NumericToolsLevelSet : public NumericToolsBase { auto matX = ((nlvls-1-lvl)%2 == 0 ? matL : matW); auto matY = ((nlvls-1-lvl)%2 == 0 ? matW : matL); // SpMM - status = cusparseSpMM(s0.cusparseHandle, opL, CUSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseSpMM(cusparseHandle, opL, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.L_cusparse, matX, &beta, matY, @@ -2436,7 +2440,7 @@ class NumericToolsLevelSet : public NumericToolsBase { auto vecX = ((nlvls-1-lvl)%2 == 0 ? vecL : vecW); auto vecY = ((nlvls-1-lvl)%2 == 0 ? vecW : vecL); // SpMV - status = cusparseSpMV(s0.cusparseHandle, opL, + status = cusparseSpMV(cusparseHandle, opL, &alpha, s0.L_cusparse, vecX, &beta, vecY, @@ -2459,14 +2463,14 @@ class NumericToolsLevelSet : public NumericToolsBase { auto vecY = ((nlvls-1-lvl)%2 == 0 ? matW : matL); if (s0.spmv_explicit_transpose) { size_t buffer_size_L = buffer_L.extent(0); - status = rocsparse_spmm(s0.rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, + status = rocsparse_spmm(rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmm_alg_default, rocsparse_spmm_stage_compute, &buffer_size_L, (void*)buffer_L.data()); } else { size_t buffer_size_L = buffer_L.extent(0); - status = rocsparse_spmm(s0.rocsparseHandle, rocsparse_operation_transpose, rocsparse_operation_none, + status = rocsparse_spmm(rocsparseHandle, rocsparse_operation_transpose, rocsparse_operation_none, &alpha, s0.descrL, vecX, &beta, vecY, // dscrL stores the same ptrs as descrU, but optimized for trans rocsparse_compute_type, rocsparse_spmm_alg_default, rocsparse_spmm_stage_compute, @@ -2489,7 +2493,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrL, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 @@ -2503,7 +2507,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_transpose, + (rocsparseHandle, rocsparse_operation_transpose, &alpha, s0.descrL, vecX, &beta, vecY, // dscrL stores the same ptrs as descrU, but optimized for trans rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 @@ -2764,7 +2768,7 @@ class NumericToolsLevelSet : public NumericToolsBase { auto vecX = (lvl%2 == 0 ? matU : matW); auto vecY = (lvl%2 == 0 ? matW : matU); // SpMM - status = cusparseSpMM(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseSpMM(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, vecX, &beta, vecY, @@ -2779,7 +2783,7 @@ class NumericToolsLevelSet : public NumericToolsBase { auto vecX = (lvl%2 == 0 ? vecU : vecW); auto vecY = (lvl%2 == 0 ? vecW : vecU); // SpMV - status = cusparseSpMV(s0.cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + status = cusparseSpMV(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, s0.U_cusparse, vecX, &beta, vecY, @@ -2806,7 +2810,7 @@ class NumericToolsLevelSet : public NumericToolsBase { } auto vecX = (lvl%2 == 0 ? matU : matW); auto vecY = (lvl%2 == 0 ? matW : matU); - status = rocsparse_spmm(s0.rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, + status = rocsparse_spmm(rocsparseHandle, rocsparse_operation_none, rocsparse_operation_none, &alpha, s0.descrU, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmm_alg_default, rocsparse_spmm_stage_compute, @@ -2825,7 +2829,7 @@ class NumericToolsLevelSet : public NumericToolsBase { #else rocsparse_spmv #endif - (s0.rocsparseHandle, rocsparse_operation_none, + (rocsparseHandle, rocsparse_operation_none, &alpha, s0.descrU, vecX, &beta, vecY, rocsparse_compute_type, rocsparse_spmv_alg_default, #if ROCM_VERSION >= 50400 diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp index f25f397955e6..265f376632d9 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_SupernodeInfo.hpp @@ -142,11 +142,9 @@ template struct SupernodeInfo { bool spmv_explicit_transpose; #if defined(KOKKOS_ENABLE_CUDA) - cusparseHandle_t cusparseHandle; cusparseSpMatDescr_t U_cusparse; cusparseSpMatDescr_t L_cusparse; #elif defined(KOKKOS_ENABLE_HIP) - rocsparse_handle rocsparseHandle; rocsparse_spmat_descr descrU; rocsparse_spmat_descr descrL; #endif From 704b851b02a12bfe6e667513ed53f76a1491f97f Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Sat, 3 Aug 2024 23:16:28 -0400 Subject: [PATCH 07/13] Tacho : an option to release CRS, or not --- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 52 ++++++++++++------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 520ec5554869..1ed93f26ba1f 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -641,14 +641,14 @@ class NumericToolsLevelSet : public NumericToolsBase { } } print_stat_init(); + fflush(stdout); } } inline void release(const ordinal_type verbose = 0) override { base_type::release(false); - if (variant == 3 && _is_spmv_extracted) { - Kokkos::fence(); - this->releaseCRS(); + if (variant == 3) { + this->releaseCRS(true); } track_free(_buf_factor_ptr.span() * sizeof(size_type)); track_free(_buf_solve_ptr.span() * sizeof(size_type)); @@ -661,6 +661,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("Summary: LevelSetTools-Variant-%d (Release)\n", variant); printf("===========================================\n"); print_stat_memory(); + fflush(stdout); } } @@ -789,6 +790,7 @@ class NumericToolsLevelSet : public NumericToolsBase { if (verbose) { printf("Summary: CreateStream : %3d\n", _nstreams); printf("===========================\n"); + fflush(stdout); } #endif } @@ -1585,7 +1587,11 @@ class NumericToolsLevelSet : public NumericToolsBase { // ======================== // free CRS, // if it has been extracted - this->releaseCRS(); +#if defined(KOKKOS_ENABLE_HIP) + this->releaseCRS(!lu); +#else + this->releaseCRS(true); +#endif // ======================== // workspace @@ -2051,38 +2057,39 @@ class NumericToolsLevelSet : public NumericToolsBase { #endif } - inline void releaseCRS() { + inline void releaseCRS(bool release_all) { if(_is_spmv_extracted) { + Kokkos::fence(); + if (release_all) { + for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { + const ordinal_type pbeg = _h_level_ptr(lvl); + // the first supernode in this lvl (where the CRS matrix is stored) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); #if defined(KOKKOS_ENABLE_CUDA) - for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { - const ordinal_type pbeg = _h_level_ptr(lvl); - // the first supernode in this lvl (where the CRS matrix is stored) - auto &s0 = _h_supernodes(_h_level_sids(pbeg)); - cusparseDestroySpMat(s0.U_cusparse); - cusparseDestroySpMat(s0.L_cusparse); + cusparseDestroySpMat(s0.U_cusparse); + cusparseDestroySpMat(s0.L_cusparse); +#elif defined(KOKKOS_ENABLE_HIP) + rocsparse_destroy_spmat_descr(s0.descrU); + rocsparse_destroy_spmat_descr(s0.descrL); +#endif + } } +#if defined(KOKKOS_ENABLE_CUDA) + cusparseDestroy(cusparseHandle); cusparseDestroyDnMat(matL); cusparseDestroyDnVec(vecL); cusparseDestroyDnMat(matU); cusparseDestroyDnVec(vecU); cusparseDestroyDnMat(matW); cusparseDestroyDnVec(vecW); - cusparseDestroy(cusparseHandle); #elif defined(KOKKOS_ENABLE_HIP) - for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { - const ordinal_type pbeg = _h_level_ptr(lvl); - // the first supernode in this lvl (where the CRS matrix is stored) - auto &s0 = _h_supernodes(_h_level_sids(pbeg)); - rocsparse_destroy_spmat_descr(s0.descrU); - rocsparse_destroy_spmat_descr(s0.descrL); - } + rocsparse_destroy_handle(rocsparseHandle); rocsparse_destroy_dnmat_descr(matL); rocsparse_destroy_dnvec_descr(vecL); rocsparse_destroy_dnmat_descr(matU); rocsparse_destroy_dnvec_descr(vecU); rocsparse_destroy_dnmat_descr(matW); rocsparse_destroy_dnvec_descr(vecW); - rocsparse_destroy_handle(rocsparseHandle); #endif _is_spmv_extracted = 0; } @@ -2240,6 +2247,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("=====================================================\n"); printf( "\n ** Team = %f s, Device = %f s, Update = %f s **\n\n",time_parallel,time_device,time_update ); print_stat_factor(); + fflush(stdout); } } @@ -3826,6 +3834,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("Summary: LevelSetTools-Variant-%d (Cholesky Solve: %3d)\n", variant, nrhs); printf("=======================================================\n"); print_stat_solve(); + fflush(stdout); } } @@ -3987,6 +3996,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("=================================================\n"); printf( "\n ** Team = %f s, Device = %f s, Update = %f s **\n\n",time_parallel,time_device,time_update ); print_stat_factor(); + fflush(stdout); } } @@ -4179,6 +4189,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("Summary: LevelSetTools-Variant-%d (LDL Solve: %3d)\n", variant, nrhs); printf("==================================================\n"); print_stat_solve(); + fflush(stdout); } } @@ -4345,6 +4356,7 @@ class NumericToolsLevelSet : public NumericToolsBase { printf("================================================\n"); printf( "\n ** Team = %f s, Device = %f s, Update = %f s **\n\n",time_parallel,time_device,time_update ); print_stat_factor(); + fflush(stdout); } } From 2247b3c297905592de3613050467387790f9143c Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Wed, 7 Aug 2024 19:28:20 -0500 Subject: [PATCH 08/13] Xpetra: split MatrixFactory into two files This works around an issue where Xpetra's library was missing symbols for MatrixFactory2. This occurs with the Intel OneAPI compiler 2024.2.0. --- .../src/Matrix/Xpetra_MatrixFactory2_decl.hpp | 147 ++++++++++++++++++ .../src/Matrix/Xpetra_MatrixFactory2_def.hpp | 63 ++++++++ .../src/Matrix/Xpetra_MatrixFactory_decl.hpp | 120 +------------- .../src/Matrix/Xpetra_MatrixFactory_def.hpp | 42 +---- .../src/Utils/ClassList/SC-LO-GO-NO.classList | 1 + .../ETI_SC_LO_GO_NO_classes.cmake | 1 + .../Xpetra_MatrixFactory2_fwd.hpp | 22 +++ .../Xpetra_MatrixFactory_fwd.hpp | 2 - 8 files changed, 236 insertions(+), 162 deletions(-) create mode 100644 packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_decl.hpp create mode 100644 packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_def.hpp create mode 100644 packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory2_fwd.hpp diff --git a/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_decl.hpp b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_decl.hpp new file mode 100644 index 000000000000..384046030aea --- /dev/null +++ b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_decl.hpp @@ -0,0 +1,147 @@ +// @HEADER +// ***************************************************************************** +// Xpetra: A linear algebra interface package +// +// Copyright 2012 NTESS and the Xpetra contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +// WARNING: This code is experimental. Backwards compatibility should not be expected. + +#ifndef XPETRA_MATRIXFACTORY2_DECL_HPP +#define XPETRA_MATRIXFACTORY2_DECL_HPP + +#include "Xpetra_ConfigDefs.hpp" +#include "Xpetra_MapExtractor_fwd.hpp" +#include "Xpetra_Matrix.hpp" +#include "Xpetra_CrsMatrixWrap.hpp" +#include "Xpetra_BlockedCrsMatrix_fwd.hpp" +#include "Xpetra_Map.hpp" +#include "Xpetra_BlockedMap.hpp" +#include "Xpetra_Vector.hpp" +#include "Xpetra_BlockedVector.hpp" +#include "Xpetra_Exceptions.hpp" + +namespace Xpetra { + +template +class MatrixFactory2 { +#undef XPETRA_MATRIXFACTORY2_SHORT +#include "Xpetra_UseShortNames.hpp" + + public: + static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true); +}; +#define XPETRA_MATRIXFACTORY2_SHORT + +// template<> +// class MatrixFactory2::node_type> { +template +class MatrixFactory2 { + typedef double Scalar; + typedef int LocalOrdinal; + typedef int GlobalOrdinal; + // typedef Matrix::node_type Node; +#undef XPETRA_MATRIXFACTORY2_SHORT +#include "Xpetra_UseShortNames.hpp" + public: + static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true) { + RCP oldOp = Teuchos::rcp_dynamic_cast(A); + if (oldOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + + RCP oldCrsOp = oldOp->getCrsMatrix(); + +#ifdef HAVE_XPETRA_EPETRA +#ifndef XPETRA_EPETRA_NO_32BIT_GLOBAL_INDICES + RCP > oldECrsOp = Teuchos::rcp_dynamic_cast >(oldCrsOp); + if (oldECrsOp != Teuchos::null) { + // Underlying matrix is Epetra + RCP newECrsOp(new EpetraCrsMatrixT(*oldECrsOp)); + RCP newOp(new CrsMatrixWrap(newECrsOp)); + if (setFixedBlockSize) + newOp->SetFixedBlockSize(A->GetFixedBlockSize()); + return newOp; + } +#endif +#endif + +#ifdef HAVE_XPETRA_TPETRA + // Underlying matrix is Tpetra + RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); + if (oldTCrsOp != Teuchos::null) { + RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); + RCP newOp(new CrsMatrixWrap(newTCrsOp)); + if (setFixedBlockSize) + newOp->SetFixedBlockSize(A->GetFixedBlockSize()); + return newOp; + } + return Teuchos::null; +#else + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::EpetraCrsMatrix or Xpetra::TpetraCrsMatrix failed"); + TEUCHOS_UNREACHABLE_RETURN(Teuchos::null); // make compiler happy +#endif + + } // BuildCopy +}; + +#define XPETRA_MATRIXFACTORY2_SHORT + +#ifdef HAVE_XPETRA_INT_LONG_LONG +template +class MatrixFactory2 { + typedef double Scalar; + typedef int LocalOrdinal; + typedef long long GlobalOrdinal; + // typedef Matrix::node_type Node; +#undef XPETRA_MATRIXFACTORY2_SHORT +#include "Xpetra_UseShortNames.hpp" + public: + static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true) { + RCP oldOp = Teuchos::rcp_dynamic_cast(A); + if (oldOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + + RCP oldCrsOp = oldOp->getCrsMatrix(); + +#ifdef HAVE_XPETRA_EPETRA +#ifndef XPETRA_EPETRA_NO_64BIT_GLOBAL_INDICES + RCP > oldECrsOp = Teuchos::rcp_dynamic_cast >(oldCrsOp); + if (oldECrsOp != Teuchos::null) { + // Underlying matrix is Epetra + RCP newECrsOp(new EpetraCrsMatrixT(*oldECrsOp)); + RCP newOp(new CrsMatrixWrap(newECrsOp)); + if (setFixedBlockSize) + newOp->SetFixedBlockSize(A->GetFixedBlockSize()); + return newOp; + } +#endif +#endif + +#ifdef HAVE_XPETRA_TPETRA + // Underlying matrix is Tpetra + RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); + if (oldTCrsOp != Teuchos::null) { + RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); + RCP newOp(new CrsMatrixWrap(newTCrsOp)); + if (setFixedBlockSize) + newOp->SetFixedBlockSize(A->GetFixedBlockSize()); + return newOp; + } +#else + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::EpetraCrsMatrix or Xpetra::TpetraCrsMatrix failed"); +#endif + + return Teuchos::null; // make compiler happy + } +}; +#endif // HAVE_XPETRA_INT_LONG_LONG + +#define XPETRA_MATRIXFACTORY2_SHORT + +} // namespace Xpetra + +#define XPETRA_MATRIXFACTORY2_SHORT + +#endif // ifndef XPETRA_MATRIXFACTORY2_DECL_HPP diff --git a/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_def.hpp b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_def.hpp new file mode 100644 index 000000000000..1e3b15de0342 --- /dev/null +++ b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory2_def.hpp @@ -0,0 +1,63 @@ +// @HEADER +// ***************************************************************************** +// Xpetra: A linear algebra interface package +// +// Copyright 2012 NTESS and the Xpetra contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +// WARNING: This code is experimental. Backwards compatibility should not be expected. + +#ifndef XPETRA_MATRIXFACTORY2_DEF_HPP +#define XPETRA_MATRIXFACTORY2_DEF_HPP + +#include "Xpetra_MatrixFactory2_decl.hpp" +#include "Xpetra_BlockedCrsMatrix.hpp" + +namespace Xpetra { + +template +RCP> MatrixFactory2::BuildCopy(const RCP> A, bool setFixedBlockSize) { + RCP oldOp = Teuchos::rcp_dynamic_cast(A); + if (oldOp == Teuchos::null) + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); + + RCP oldCrsOp = oldOp->getCrsMatrix(); + + UnderlyingLib lib = A->getRowMap()->lib(); + + TEUCHOS_TEST_FOR_EXCEPTION(lib != UseEpetra && lib != UseTpetra, Exceptions::RuntimeError, + "Not Epetra or Tpetra matrix"); + +#ifdef HAVE_XPETRA_EPETRA + if (lib == UseEpetra) { + // NOTE: The proper Epetra conversion in Xpetra_MatrixFactory.cpp + throw Exceptions::RuntimeError("Xpetra::BuildCopy(): matrix templates are incompatible with Epetra"); + } +#endif + +#ifdef HAVE_XPETRA_TPETRA + if (lib == UseTpetra) { + // Underlying matrix is Tpetra + RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); + + if (oldTCrsOp != Teuchos::null) { + RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); + RCP newOp(new CrsMatrixWrap(Teuchos::as>(newTCrsOp))); + if (setFixedBlockSize) + newOp->SetFixedBlockSize(A->GetFixedBlockSize()); + + return newOp; + } else { + throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::TpetraCrsMatrix failed"); + } + } +#endif + + return Teuchos::null; +} + +} // namespace Xpetra + +#endif diff --git a/packages/xpetra/src/Matrix/Xpetra_MatrixFactory_decl.hpp b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory_decl.hpp index 691e3a6c5d5a..1ed9072c6a78 100644 --- a/packages/xpetra/src/Matrix/Xpetra_MatrixFactory_decl.hpp +++ b/packages/xpetra/src/Matrix/Xpetra_MatrixFactory_decl.hpp @@ -25,123 +25,6 @@ namespace Xpetra { -template -class MatrixFactory2 { -#undef XPETRA_MATRIXFACTORY2_SHORT -#include "Xpetra_UseShortNames.hpp" - - public: - static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true); -}; -#define XPETRA_MATRIXFACTORY2_SHORT - -// template<> -// class MatrixFactory2::node_type> { -template -class MatrixFactory2 { - typedef double Scalar; - typedef int LocalOrdinal; - typedef int GlobalOrdinal; - // typedef Matrix::node_type Node; -#undef XPETRA_MATRIXFACTORY2_SHORT -#include "Xpetra_UseShortNames.hpp" - public: - static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true) { - RCP oldOp = Teuchos::rcp_dynamic_cast(A); - if (oldOp == Teuchos::null) - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); - - RCP oldCrsOp = oldOp->getCrsMatrix(); - -#ifdef HAVE_XPETRA_EPETRA -#ifndef XPETRA_EPETRA_NO_32BIT_GLOBAL_INDICES - RCP > oldECrsOp = Teuchos::rcp_dynamic_cast >(oldCrsOp); - if (oldECrsOp != Teuchos::null) { - // Underlying matrix is Epetra - RCP newECrsOp(new EpetraCrsMatrixT(*oldECrsOp)); - RCP newOp(new CrsMatrixWrap(newECrsOp)); - if (setFixedBlockSize) - newOp->SetFixedBlockSize(A->GetFixedBlockSize()); - return newOp; - } -#endif -#endif - -#ifdef HAVE_XPETRA_TPETRA - // Underlying matrix is Tpetra - RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); - if (oldTCrsOp != Teuchos::null) { - RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); - RCP newOp(new CrsMatrixWrap(newTCrsOp)); - if (setFixedBlockSize) - newOp->SetFixedBlockSize(A->GetFixedBlockSize()); - return newOp; - } - return Teuchos::null; -#else - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::EpetraCrsMatrix or Xpetra::TpetraCrsMatrix failed"); - TEUCHOS_UNREACHABLE_RETURN(Teuchos::null); // make compiler happy -#endif - - } // BuildCopy -}; - -#define XPETRA_MATRIXFACTORY2_SHORT - -#ifdef HAVE_XPETRA_INT_LONG_LONG -// template<> -// class MatrixFactory2::node_type> { -template -class MatrixFactory2 { - typedef double Scalar; - typedef int LocalOrdinal; - typedef long long GlobalOrdinal; - // typedef Matrix::node_type Node; -#undef XPETRA_MATRIXFACTORY2_SHORT -#include "Xpetra_UseShortNames.hpp" - public: - static RCP > BuildCopy(const RCP > A, bool setFixedBlockSize = true) { - RCP oldOp = Teuchos::rcp_dynamic_cast(A); - if (oldOp == Teuchos::null) - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); - - RCP oldCrsOp = oldOp->getCrsMatrix(); - -#ifdef HAVE_XPETRA_EPETRA -#ifndef XPETRA_EPETRA_NO_64BIT_GLOBAL_INDICES - RCP > oldECrsOp = Teuchos::rcp_dynamic_cast >(oldCrsOp); - if (oldECrsOp != Teuchos::null) { - // Underlying matrix is Epetra - RCP newECrsOp(new EpetraCrsMatrixT(*oldECrsOp)); - RCP newOp(new CrsMatrixWrap(newECrsOp)); - if (setFixedBlockSize) - newOp->SetFixedBlockSize(A->GetFixedBlockSize()); - return newOp; - } -#endif -#endif - -#ifdef HAVE_XPETRA_TPETRA - // Underlying matrix is Tpetra - RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); - if (oldTCrsOp != Teuchos::null) { - RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); - RCP newOp(new CrsMatrixWrap(newTCrsOp)); - if (setFixedBlockSize) - newOp->SetFixedBlockSize(A->GetFixedBlockSize()); - return newOp; - } -#else - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::EpetraCrsMatrix or Xpetra::TpetraCrsMatrix failed"); -#endif - - return Teuchos::null; // make compiler happy - } -}; -#endif // HAVE_XPETRA_INT_LONG_LONG - -#define XPETRA_MATRIXFACTORY2_SHORT - template -RCP> MatrixFactory2::BuildCopy(const RCP> A, bool setFixedBlockSize) { - RCP oldOp = Teuchos::rcp_dynamic_cast(A); - if (oldOp == Teuchos::null) - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::CrsMatrixWrap failed"); - - RCP oldCrsOp = oldOp->getCrsMatrix(); - - UnderlyingLib lib = A->getRowMap()->lib(); - - TEUCHOS_TEST_FOR_EXCEPTION(lib != UseEpetra && lib != UseTpetra, Exceptions::RuntimeError, - "Not Epetra or Tpetra matrix"); - -#ifdef HAVE_XPETRA_EPETRA - if (lib == UseEpetra) { - // NOTE: The proper Epetra conversion in Xpetra_MatrixFactory.cpp - throw Exceptions::RuntimeError("Xpetra::BuildCopy(): matrix templates are incompatible with Epetra"); - } -#endif - -#ifdef HAVE_XPETRA_TPETRA - if (lib == UseTpetra) { - // Underlying matrix is Tpetra - RCP oldTCrsOp = Teuchos::rcp_dynamic_cast(oldCrsOp); - - if (oldTCrsOp != Teuchos::null) { - RCP newTCrsOp(new TpetraCrsMatrix(*oldTCrsOp)); - RCP newOp(new CrsMatrixWrap(Teuchos::as>(newTCrsOp))); - if (setFixedBlockSize) - newOp->SetFixedBlockSize(A->GetFixedBlockSize()); - - return newOp; - } else { - throw Exceptions::BadCast("Cast from Xpetra::Matrix to Xpetra::TpetraCrsMatrix failed"); - } - } -#endif - - return Teuchos::null; -} - template RCP> MatrixFactory::Build(const RCP& rowMap) { return rcp(new CrsMatrixWrap(rowMap)); diff --git a/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList b/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList index c7fe4e0280f4..0eae66e42377 100644 --- a/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList +++ b/packages/xpetra/src/Utils/ClassList/SC-LO-GO-NO.classList @@ -12,6 +12,7 @@ IO Matrix MatrixUtils MatrixFactory +MatrixFactory2 MatrixMatrix #MultiVectorFactory #Operator diff --git a/packages/xpetra/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake b/packages/xpetra/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake index da204626023f..facd488f2794 100644 --- a/packages/xpetra/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake +++ b/packages/xpetra/src/Utils/ExplicitInstantiation/ETI_SC_LO_GO_NO_classes.cmake @@ -6,6 +6,7 @@ APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::IO ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::Matrix ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::MatrixUtils ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::MatrixFactory ) +APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::MatrixFactory2 ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::MatrixMatrix ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::ThyraUtils-.?if.defined[HAVE_XPETRA_THYRA] ) APPEND_SET(XPETRA_SC_LO_GO_NO_ETI_CLASSES Xpetra::TpetraBlockCrsMatrix ) diff --git a/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory2_fwd.hpp b/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory2_fwd.hpp new file mode 100644 index 000000000000..2b9967c47033 --- /dev/null +++ b/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory2_fwd.hpp @@ -0,0 +1,22 @@ +// @HEADER +// ***************************************************************************** +// Xpetra: A linear algebra interface package +// +// Copyright 2012 NTESS and the Xpetra contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef XPETRA_MATRIXFACTORY2_FWD_HPP +#define XPETRA_MATRIXFACTORY2_FWD_HPP + +namespace Xpetra { +template +class MatrixFactory2; +} // namespace Xpetra + +#ifndef XPETRA_MATRIXFACTORY2_SHORT +#define XPETRA_MATRIXFACTORY2_SHORT +#endif + +#endif // XPETRA_MATRIXFACTORY2_FWD_HPP diff --git a/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory_fwd.hpp b/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory_fwd.hpp index 88603b364014..7561319b07cc 100644 --- a/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory_fwd.hpp +++ b/packages/xpetra/src/Utils/ForwardDeclaration/Xpetra_MatrixFactory_fwd.hpp @@ -14,8 +14,6 @@ namespace Xpetra { template class MatrixFactory; -template -class MatrixFactory2; } // namespace Xpetra #ifndef XPETRA_MATRIXFACTORY_SHORT From 28c1d13965b2336ab65d47b1ab923bb91bf03c55 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Wed, 7 Aug 2024 19:32:49 -0500 Subject: [PATCH 09/13] MueLu: use new Xpetra file MatrixFactory2 --- packages/muelu/src/Misc/MueLu_RAPShiftFactory_decl.hpp | 2 +- packages/muelu/src/Misc/MueLu_RAPShiftFactory_def.hpp | 1 + .../Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp | 1 + .../Energy-Minimization/Solvers/MueLu_GMRESSolver_def.hpp | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/muelu/src/Misc/MueLu_RAPShiftFactory_decl.hpp b/packages/muelu/src/Misc/MueLu_RAPShiftFactory_decl.hpp index f8398c676402..7c85fed9931c 100644 --- a/packages/muelu/src/Misc/MueLu_RAPShiftFactory_decl.hpp +++ b/packages/muelu/src/Misc/MueLu_RAPShiftFactory_decl.hpp @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/packages/muelu/src/Misc/MueLu_RAPShiftFactory_def.hpp b/packages/muelu/src/Misc/MueLu_RAPShiftFactory_def.hpp index 4960f319a9e2..fb8343e6c148 100644 --- a/packages/muelu/src/Misc/MueLu_RAPShiftFactory_def.hpp +++ b/packages/muelu/src/Misc/MueLu_RAPShiftFactory_def.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include "MueLu_RAPShiftFactory_decl.hpp" #include "MueLu_MasterList.hpp" diff --git a/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp b/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp index 627886f591a6..d9da924f5a49 100644 --- a/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp +++ b/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_CGSolver_def.hpp @@ -11,6 +11,7 @@ #define MUELU_CGSOLVER_DEF_HPP #include +#include #include #include "MueLu_Utilities.hpp" diff --git a/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_GMRESSolver_def.hpp b/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_GMRESSolver_def.hpp index ea52e370c1dd..b82f7d9fd2af 100644 --- a/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_GMRESSolver_def.hpp +++ b/packages/muelu/src/Transfers/Energy-Minimization/Solvers/MueLu_GMRESSolver_def.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include From a693e0213d6e758e449c78fc29679e488ca22982 Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Thu, 8 Aug 2024 12:06:33 -0600 Subject: [PATCH 10/13] Xpetra: MueLu: update tests to use new header --- packages/muelu/example/advanced/multiplesolve/StandardReuse.cpp | 2 ++ packages/xpetra/test/Matrix/Matrix_UnitTests.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/packages/muelu/example/advanced/multiplesolve/StandardReuse.cpp b/packages/muelu/example/advanced/multiplesolve/StandardReuse.cpp index c640c1e9ef4d..681c84a1cb91 100644 --- a/packages/muelu/example/advanced/multiplesolve/StandardReuse.cpp +++ b/packages/muelu/example/advanced/multiplesolve/StandardReuse.cpp @@ -18,6 +18,8 @@ #include #include +#include + #include #include #include diff --git a/packages/xpetra/test/Matrix/Matrix_UnitTests.cpp b/packages/xpetra/test/Matrix/Matrix_UnitTests.cpp index f4a0cc20f808..42eabaaae53e 100644 --- a/packages/xpetra/test/Matrix/Matrix_UnitTests.cpp +++ b/packages/xpetra/test/Matrix/Matrix_UnitTests.cpp @@ -17,6 +17,7 @@ #include "Xpetra_Matrix.hpp" #include "Xpetra_MatrixUtils.hpp" #include "Xpetra_MatrixFactory.hpp" +#include "Xpetra_MatrixFactory2.hpp" #include "Xpetra_MultiVectorFactory.hpp" #include "Xpetra_CrsMatrixWrap.hpp" #ifdef HAVE_XPETRA_TPETRA From 9ef46d72e58ae103ed599bd5644254b6f65461f1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 8 Aug 2024 12:33:40 -0600 Subject: [PATCH 11/13] tacho: fix unused variable triggering -Werror Signed-off-by: Nathan Ellingwood --- .../shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 44b577df0175..87b4718fb568 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -2056,11 +2056,12 @@ class NumericToolsLevelSet : public NumericToolsBase { for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { const ordinal_type pbeg = _h_level_ptr(lvl); // the first supernode in this lvl (where the CRS matrix is stored) - auto &s0 = _h_supernodes(_h_level_sids(pbeg)); #if defined(KOKKOS_ENABLE_CUDA) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); cusparseDestroySpMat(s0.U_cusparse); cusparseDestroySpMat(s0.L_cusparse); #elif defined(KOKKOS_ENABLE_HIP) + auto &s0 = _h_supernodes(_h_level_sids(pbeg)); rocsparse_destroy_spmat_descr(s0.descrU); rocsparse_destroy_spmat_descr(s0.descrL); #endif From a0874dc171e9ba9bddcce52f7c4b19274816205a Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 8 Aug 2024 16:22:53 -0600 Subject: [PATCH 12/13] amesos2: fix -Werror=sign-compare in Solver_Test.cpp Signed-off-by: Nathan Ellingwood --- packages/amesos2/test/solvers/Solver_Test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/amesos2/test/solvers/Solver_Test.cpp b/packages/amesos2/test/solvers/Solver_Test.cpp index 41a3c485d248..856671b96334 100644 --- a/packages/amesos2/test/solvers/Solver_Test.cpp +++ b/packages/amesos2/test/solvers/Solver_Test.cpp @@ -1515,7 +1515,7 @@ bool do_kokkos_test_with_types(const string& mm_file, auto row_map = A2->graph.row_map; Kokkos::RangePolicy policy(0, vals.size()); Kokkos::parallel_for(policy, KOKKOS_LAMBDA(size_t i) { - if(i < row_map(1)) { // just do 1st row + if(i < size_t(row_map(1))) { // just do 1st row vals(i) = vals(i) * vals(i); } }); From e6c551c8f36a5ef0152b6302a31a07e4072f3f09 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 9 Aug 2024 10:45:36 -0600 Subject: [PATCH 13/13] tacho: guard potentially unused variable Signed-off-by: Nathan Ellingwood --- .../shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 87b4718fb568..9b98c80ec399 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -2054,7 +2054,9 @@ class NumericToolsLevelSet : public NumericToolsBase { Kokkos::fence(); if (release_all) { for (ordinal_type lvl = 0; lvl < _team_serial_level_cut; ++lvl) { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) const ordinal_type pbeg = _h_level_ptr(lvl); +#endif // the first supernode in this lvl (where the CRS matrix is stored) #if defined(KOKKOS_ENABLE_CUDA) auto &s0 = _h_supernodes(_h_level_sids(pbeg));