diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index 1141f4e8f35a..1aa6c7249eb5 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -2,8 +2,10 @@ name: Check packages with clang-format on: [pull_request] -permissions: read-all - +permissions: + contents: read + issues: write + jobs: build: runs-on: ubuntu-latest diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index fd4b38be7479..c97fa64481e6 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -62,7 +62,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5 + uses: github/codeql-action/init@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} @@ -85,6 +85,6 @@ jobs: make -j 2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5 + uses: github/codeql-action/analyze@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 6ad35f1f80fe..7141d0933730 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 with: egress-policy: audit diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 79be3dec93a5..f6cefad9914b 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -66,6 +66,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5 + uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: sarif_file: results.sarif diff --git a/packages/amesos2/src/Amesos2_Tacho_decl.hpp b/packages/amesos2/src/Amesos2_Tacho_decl.hpp index 44eaa41e5d70..ec52b7775b98 100644 --- a/packages/amesos2/src/Amesos2_Tacho_decl.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_decl.hpp @@ -227,6 +227,7 @@ class TachoSolver : public SolverCore // TODO: Implement the paramter options - confirm which we want and which have been implemented int method; int variant; + int small_problem_threshold_size; // int num_kokkos_threads; // int max_num_superblocks; } data_; diff --git a/packages/amesos2/src/Amesos2_Tacho_def.hpp b/packages/amesos2/src/Amesos2_Tacho_def.hpp index 5fe53a7a4f71..d2519c8e5b3e 100644 --- a/packages/amesos2/src/Amesos2_Tacho_def.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_def.hpp @@ -107,6 +107,7 @@ TachoSolver::symbolicFactorization_impl() data_.solver.setSolutionMethod(data_.method); data_.solver.setLevelSetOptionAlgorithmVariant(data_.variant); + data_.solver.setSmallProblemThresholdsize(data_.small_problem_threshold_size); // TODO: Confirm param options // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks); @@ -247,6 +248,8 @@ TachoSolver::setParameters_impl(const Teuchos::RCPget ("variant", 2); + // small problem threshold + data_.small_problem_threshold_size = parameterList->get ("small problem threshold size", 1024); // TODO: Confirm param options // data_.num_kokkos_threads = parameterList->get("kokkos-threads", 1); // data_.max_num_superblocks = parameterList->get("max-num-superblocks", 4); @@ -264,6 +267,7 @@ TachoSolver::getValidParameters_impl() const pl->set("method", "chol", "Type of factorization, chol, ldl, or lu"); pl->set("variant", 2, "Type of solver variant, 0, 1, or 2"); + pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK."); // TODO: Confirm param options // pl->set("kokkos-threads", 1, "Number of threads"); diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index 0d7196b38293..f917ccefa47a 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2046,7 +2046,7 @@ use PACKAGE-ENABLES|ALL use rhel7_sems-clang-11.0.1-openmpi-1.10.1-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables use PACKAGE-ENABLES|ALL -[rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables] +[rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables] # uses sems-v2 modules use RHEL7_SEMS_COMPILER|CLANG use NODE-TYPE|SERIAL @@ -2067,11 +2067,17 @@ opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL : ON use RHEL7_TEST_DISABLES|CLANG +opt-set-cmake-var SuperLU_LIBRARY_NAMES STRING : superlu;m +opt-set-cmake-var ML_ENABLE_SuperLU BOOL FORCE : OFF + +opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE BOOL : ON +opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE BOOL : ON + use RHEL7_POST -[rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_all] +[rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_all] # uses sems-v2 modules -use rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables +use rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables use PACKAGE-ENABLES|ALL [rhel7_sems-intel-19.0.5-mpich-3.2-serial_release-debug_static_no-kokkos-arch_no-asan_no-complex_fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables] diff --git a/packages/framework/ini-files/supported-envs.ini b/packages/framework/ini-files/supported-envs.ini index b543e510fccb..d3a27cfcb82e 100644 --- a/packages/framework/ini-files/supported-envs.ini +++ b/packages/framework/ini-files/supported-envs.ini @@ -178,6 +178,7 @@ gcc-openmpi gcc-serial aue-gcc-openmpi sems-cuda-11.4.2-sems-gnu-10.1.0-sems-openmpi-4.1.4 +sems-clang-11.0.1-openmpi-4.0.5-serial [ats2] cuda-11.2.152-gnu-8.3.1-spmpi-rolling diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py index bb038e3e1926..54cbcd1227f3 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ This file contains the base class for the Pull Request test driver. diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py index 82ab88a015fc..ba2c87c9a576 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ Custom PR Executor for Installation testing diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py index 0bd0d7071950..cb3e5af13552 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py +++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ Custom PR Executor for Standard testing diff --git a/packages/framework/pr_tools/trilinosprhelpers/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/__init__.py index 1b8eb9136cdb..d2718d766803 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- from .sysinfo import * diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py index ff6bd5831f33..82ad13c414ed 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py +++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ This class contains a set of utilities for using and querying diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py index 36f13ba9b20b..b6bf6bef5324 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This file is required to enable Python to locate the package files in the parent diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py index 5dbd6f3497b3..3390974955e5 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py +++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py index 38067dca1d85..55655574d102 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py index 2aaeba25e3fe..9e328b779f80 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- from .EnvvarHelper import EnvvarHelper diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py index 95b4469742e6..eb35e6fa4374 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- from .JenkinsEnv import JenkinsEnv diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py index f83d145c1517..fe4cf1d0cbef 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- from .EnvvarHelper import EnvvarHelper diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py index 36f13ba9b20b..b6bf6bef5324 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This file is required to enable Python to locate the package files in the parent diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py index 430f27175874..5ece6c0c6b48 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py index e4b8e19bff6b..ead3e02e444e 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py index e4b9e7a2613f..6a8844c406d4 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py +++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py index 839f846900e4..60334dd1526d 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py +++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ System information helpers are contained in this file. diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py index 36f13ba9b20b..b6bf6bef5324 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This file is required to enable Python to locate the package files in the parent diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py index 45fad390822d..74e1e0deb7b2 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py +++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py index 4d6bf4f0d47d..8f5ea0df155c 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py +++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py index 36f13ba9b20b..b6bf6bef5324 100644 --- a/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py +++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ This file is required to enable Python to locate the package files in the parent diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py index ee90c1dd431e..6f933f5d751a 100755 --- a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py +++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py index 09c311c156a6..8d6b71abef7c 100755 --- a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py +++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- """ """ diff --git a/packages/framework/pr_tools/unittests/__init__.py b/packages/framework/pr_tools/unittests/__init__.py index efdebb2e61f8..9f023287705a 100644 --- a/packages/framework/pr_tools/unittests/__init__.py +++ b/packages/framework/pr_tools/unittests/__init__.py @@ -1,3 +1,3 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*- diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp index 85bb469a4bf1..c51a3868fb7f 100644 --- a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp @@ -238,7 +238,8 @@ namespace Ifpack2 { // block crs graph information // for cuda (kokkos crs graph uses a different size_type from size_t) - const ConstUnmanaged > A_rowptr; + const ConstUnmanaged > A_block_rowptr; + const ConstUnmanaged > A_point_rowptr; const ConstUnmanaged > A_colind; // blocksize @@ -252,19 +253,22 @@ namespace Ifpack2 { const ConstUnmanaged lclrow; const ConstUnmanaged dm2cm; const bool is_dm2cm_active; + const bool hasBlockCrsMatrix; public: template ComputeResidualVector(const AmD &amd, - const LocalCrsGraphType &graph, + const LocalCrsGraphType &block_graph, + const LocalCrsGraphType &point_graph, const local_ordinal_type &blocksize_requested_, const PartInterface &interf, const local_ordinal_type_1d_view &dm2cm_) : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote), colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote), tpetra_values(amd.tpetra_values), - A_rowptr(graph.row_map), - A_colind(graph.entries), + A_block_rowptr(block_graph.row_map), + A_point_rowptr(point_graph.row_map), + A_colind(block_graph.entries), blocksize_requested(blocksize_requested_), part2packrowidx0(interf.part2packrowidx0), part2rowidx0(interf.part2rowidx0), @@ -272,8 +276,32 @@ namespace Ifpack2 { partptr(interf.partptr), lclrow(interf.lclrow), dm2cm(dm2cm_), - is_dm2cm_active(dm2cm_.span() > 0) - {} + is_dm2cm_active(dm2cm_.span() > 0), + hasBlockCrsMatrix(A_block_rowptr.extent(0) == A_point_rowptr.extent(0)) + { } + + inline + void + SerialDot(const local_ordinal_type &blocksize, + const local_ordinal_type &lclRowID, + const local_ordinal_type &lclColID, + const local_ordinal_type &ii, + const ConstUnmanagedcolindsub_, + const impl_scalar_type * const KOKKOS_RESTRICT xx, + /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const { + const size_type Aj_c = colindsub_(lclColID); + auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize; + impl_scalar_type val = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) +# pragma ivdep +#endif +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +# pragma unroll +#endif + for (local_ordinal_type k1=0;k1 + KOKKOS_INLINE_FUNCTION + void + VectorDot(const member_type &member, + const local_ordinal_type &blocksize, + const local_ordinal_type &lclRowID, + const local_ordinal_type &lclColID, + const local_ordinal_type &ii, + const ConstUnmanagedcolindsub_, + const xxViewType &xx, + const yyViewType &yy) const { + const size_type Aj_c = colindsub_(lclColID); + auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize; + impl_scalar_type val = 0; + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type &k1) { + val += tpetra_values(point_row_offset + k1) *xx(k1); + }); + Kokkos::atomic_fetch_add(&yy(ii), typename yyViewType::const_value_type(-val)); + } + template KOKKOS_INLINE_FUNCTION void @@ -387,12 +437,18 @@ namespace Ifpack2 { memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize); // y -= Rx - const size_type A_k0 = A_rowptr[i]; + const size_type A_k0 = A_block_rowptr[i]; for (size_type k=rowptr[i];k(NULL, blocksize, blocksize); + auto A_block_cst = ConstUnmanaged(NULL, blocksize, blocksize); const local_ordinal_type row = lr*blocksize; for (local_ordinal_type col=0;col(NULL, blocksize, blocksize); + auto A_block_cst = ConstUnmanaged(NULL, blocksize, blocksize); const local_ordinal_type lr = lclrow(rowidx); const local_ordinal_type row = lr*blocksize; @@ -524,24 +611,46 @@ namespace Ifpack2 { member.team_barrier(); // y -= Rx - const size_type A_k0 = A_rowptr[lr]; - Kokkos::parallel_for - (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]), - [&](const local_ordinal_type &k) { - const size_type j = A_k0 + colindsub[k]; - A_block.assign_data( &tpetra_values(j*blocksize_square) ); - - const local_ordinal_type A_colind_at_j = A_colind[j]; - if (A_colind_at_j < num_local_rows) { - const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; - xx.assign_data( &x(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx, yy); - } else { - const auto loc = A_colind_at_j - num_local_rows; - xx_remote.assign_data( &x_remote(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx_remote, yy); - } - }); + const size_type A_k0 = A_block_rowptr[lr]; + if(hasBlockCrsMatrix) { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]), + [&](const local_ordinal_type &k) { + const size_type j = A_k0 + colindsub[k]; + A_block_cst.assign_data( &tpetra_values(j*blocksize_square) ); + + const local_ordinal_type A_colind_at_j = A_colind[j]; + if (A_colind_at_j < num_local_rows) { + const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; + xx.assign_data( &x(loc*blocksize, col) ); + VectorGemv(member, blocksize, A_block_cst, xx, yy); + } else { + const auto loc = A_colind_at_j - num_local_rows; + xx_remote.assign_data( &x_remote(loc*blocksize, col) ); + VectorGemv(member, blocksize, A_block_cst, xx_remote, yy); + } + }); + } + else { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]), + [&](const local_ordinal_type &k) { + const size_type j = A_k0 + colindsub[k]; + + const local_ordinal_type A_colind_at_j = A_colind[j]; + if (A_colind_at_j < num_local_rows) { + const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; + xx.assign_data( &x(loc*blocksize, col) ); + for (local_ordinal_type k0=0;k0(NULL, blocksize, blocksize); + auto A_block_cst = ConstUnmanaged(NULL, blocksize, blocksize); auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); auto rowptr_used = (P == 0 ? rowptr : rowptr_remote); @@ -648,24 +772,46 @@ namespace Ifpack2 { } // y -= Rx - const size_type A_k0 = A_rowptr[lr]; - Kokkos::parallel_for - (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]), - [&](const local_ordinal_type &k) { - const size_type j = A_k0 + colindsub_used[k]; - A_block.assign_data( &tpetra_values(j*blocksize_square) ); - - const local_ordinal_type A_colind_at_j = A_colind[j]; - if (P == 0) { - const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; - xx.assign_data( &x(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx, yy); - } else { - const auto loc = A_colind_at_j - num_local_rows; - xx_remote.assign_data( &x_remote(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx_remote, yy); - } - }); + const size_type A_k0 = A_block_rowptr[lr]; + if(hasBlockCrsMatrix) { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]), + [&](const local_ordinal_type &k) { + const size_type j = A_k0 + colindsub_used[k]; + A_block_cst.assign_data( &tpetra_values(j*blocksize_square) ); + + const local_ordinal_type A_colind_at_j = A_colind[j]; + if (P == 0) { + const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; + xx.assign_data( &x(loc*blocksize, col) ); + VectorGemv(member, blocksize, A_block_cst, xx, yy); + } else { + const auto loc = A_colind_at_j - num_local_rows; + xx_remote.assign_data( &x_remote(loc*blocksize, col) ); + VectorGemv(member, blocksize, A_block_cst, xx_remote, yy); + } + }); + } + else { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]), + [&](const local_ordinal_type &k) { + const size_type j = A_k0 + colindsub_used[k]; + + const local_ordinal_type A_colind_at_j = A_colind[j]; + if (P == 0) { + const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; + xx.assign_data( &x(loc*blocksize, col) ); + for (local_ordinal_type k0=0;k0 tpetra_map_type; typedef Tpetra::Import tpetra_import_type; typedef Tpetra::RowMatrix tpetra_row_matrix_type; + typedef Tpetra::CrsMatrix tpetra_crs_matrix_type; + typedef Tpetra::CrsGraph tpetra_crs_graph_type; typedef Tpetra::BlockCrsMatrix tpetra_block_crs_matrix_type; typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type; typedef Tpetra::BlockMultiVector tpetra_block_multivector_type; @@ -375,6 +377,7 @@ namespace Ifpack2 { typedef Kokkos::View internal_vector_type_3d_view; typedef Kokkos::View internal_vector_type_4d_view; typedef Kokkos::View internal_vector_type_5d_view; + typedef Kokkos::View btdm_scalar_type_2d_view; typedef Kokkos::View btdm_scalar_type_3d_view; typedef Kokkos::View btdm_scalar_type_4d_view; typedef Kokkos::View btdm_scalar_type_5d_view; diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp index 21c15108496f..30c9d79f1ea8 100644 --- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp @@ -179,6 +179,7 @@ getValidParameters () const validParams->set("partitioner: subparts per part", 1); validParams->set("partitioner: block size", -1); validParams->set("partitioner: print level", false); + validParams->set("partitioner: explicit convert to BlockCrs", false); return validParams; } @@ -633,14 +634,23 @@ initialize () Teuchos::rcp_dynamic_cast (A_); hasBlockCrsMatrix_ = !A_bcrs.is_null(); + Teuchos::RCP graph = A_->getGraph (); + if(!hasBlockCrsMatrix_ && List_.isParameter("relaxation: container") && List_.get("relaxation: container") == "BlockTriDi" ) { TEUCHOS_FUNC_TIME_MONITOR("Ifpack2::BlockRelaxation::initialize::convertToBlockCrsMatrix"); int block_size = List_.get("partitioner: block size"); + bool use_explicit_conversion = List_.isParameter("partitioner: explicit convert to BlockCrs") && List_.get("partitioner: explicit convert to BlockCrs"); TEUCHOS_TEST_FOR_EXCEPT_MSG - (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); - A_bcrs = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(A_), block_size, false); - A_ = A_bcrs; - hasBlockCrsMatrix_ = true; + (use_explicit_conversion && block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); + if(use_explicit_conversion) { + A_bcrs = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(A_), block_size, false); + A_ = A_bcrs; + hasBlockCrsMatrix_ = true; + graph = A_->getGraph (); + } + else { + graph = Tpetra::getBlockCrsGraph(*Teuchos::rcp_dynamic_cast(A_), block_size, true); + } Kokkos::DefaultExecutionSpace().fence(); } @@ -655,19 +665,19 @@ initialize () if (PartitionerType_ == "linear") { Partitioner_ = - rcp (new Ifpack2::LinearPartitioner (A_->getGraph ())); + rcp (new Ifpack2::LinearPartitioner (graph)); } else if (PartitionerType_ == "line") { Partitioner_ = - rcp (new Ifpack2::LinePartitioner (A_->getGraph ())); + rcp (new Ifpack2::LinePartitioner (graph)); } else if (PartitionerType_ == "user") { Partitioner_ = - rcp (new Ifpack2::Details::UserPartitioner (A_->getGraph () ) ); + rcp (new Ifpack2::Details::UserPartitioner (graph ) ); } else if (PartitionerType_ == "zoltan2") { #if defined(HAVE_IFPACK2_ZOLTAN2) - if (A_->getGraph ()->getComm ()->getSize () == 1) { + if (graph->getComm ()->getSize () == 1) { // Only one MPI, so call zoltan2 with global graph Partitioner_ = - rcp (new Ifpack2::Zoltan2Partitioner (A_->getGraph ()) ); + rcp (new Ifpack2::Zoltan2Partitioner (graph) ); } else { // Form local matrix to get local graph for calling zoltan2 Teuchos::RCP A_local = rcp (new LocalFilter (A_)); diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp index e72565179cc4..067f85e34b04 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp @@ -235,7 +235,8 @@ namespace Ifpack2 { const int n_subparts_per_part = 1, bool overlapCommAndComp = false, bool useSequentialMethod = false, - const int block_size = -1); + const int block_size = -1, + const bool explicitConversion = false); //! Destructor (declared virtual for memory safety of derived classes). ~BlockTriDiContainer () override; @@ -397,13 +398,15 @@ namespace Ifpack2 { // hide details of impl using ImplObj; finally I understand why AMB did that way. Teuchos::RCP > impl_; int n_subparts_per_part_; + int block_size_ = -1; // initialize distributed and local objects void initInternal (const Teuchos::RCP& matrix, const Teuchos::RCP &importer, const bool overlapCommAndComp, const bool useSeqMethod, - const int block_size = -1); + const int block_size = -1, + const bool explicitConversion = false); void clearInternal(); }; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index 4c872368a43c..dcbe1ee76fce 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -83,7 +83,8 @@ namespace Ifpack2 { const Teuchos::RCP& importer, const bool overlapCommAndComp, const bool useSeqMethod, - const int block_size) + const int block_size, + const bool explicitConversion) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initInternal"); @@ -99,16 +100,21 @@ namespace Ifpack2 { { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA"); - impl_->A = Teuchos::rcp_dynamic_cast(matrix); - if (impl_->A.is_null()) { - TEUCHOS_TEST_FOR_EXCEPT_MSG - (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); - { - IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA::convertToBlockCrsMatrix"); - impl_->A = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(matrix), block_size, false); - IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + if (explicitConversion) { + impl_->A = Teuchos::rcp_dynamic_cast(matrix); + if (impl_->A.is_null()) { + TEUCHOS_TEST_FOR_EXCEPT_MSG + (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA::convertToBlockCrsMatrix"); + impl_->A = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(matrix), block_size, false); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } } } + else { + impl_->A = matrix; + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -197,6 +203,7 @@ namespace Ifpack2 { const bool overlapCommAndComp = false; initInternal(matrix, importer, overlapCommAndComp, useSeqMethod); n_subparts_per_part_ = -1; + block_size_ = -1; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -207,12 +214,14 @@ namespace Ifpack2 { const int n_subparts_per_part, const bool overlapCommAndComp, const bool useSeqMethod, - const int block_size) + const int block_size, + const bool explicitConversion) : Container(matrix, partitions, false), partitions_(partitions) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer"); - initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size); + initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size, explicitConversion); n_subparts_per_part_ = n_subparts_per_part; + block_size_ = block_size; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -229,6 +238,8 @@ namespace Ifpack2 { { if (List.isType("partitioner: subparts per part")) n_subparts_per_part_ = List.get("partitioner: subparts per part"); + if (List.isType("partitioner: block size")) + block_size_ = List.get("partitioner: block size"); } template @@ -238,13 +249,31 @@ namespace Ifpack2 { { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize"); this->IsInitialized_ = true; + { + auto bA = Teuchos::rcp_dynamic_cast(impl_->A); + if (bA.is_null()) { + TEUCHOS_TEST_FOR_EXCEPT_MSG + (block_size_ == -1, "A pointwise matrix and block_size = -1 were given as inputs."); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize::getBlockCrsGraph"); + auto A = Teuchos::rcp_dynamic_cast(impl_->A); + impl_->blockGraph = Tpetra::getBlockCrsGraph(*A, block_size_, true); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } + } + else { + impl_->blockGraph = Teuchos::rcpFromRef(bA->getCrsGraph()); + } + } + { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createPartInterfaceBlockTridiagsNormManager"); - impl_->part_interface = BlockTriDiContainerDetails::createPartInterface(impl_->A, partitions_, n_subparts_per_part_); + impl_->part_interface = BlockTriDiContainerDetails::createPartInterface(impl_->A, impl_->blockGraph, partitions_, n_subparts_per_part_); impl_->block_tridiags = BlockTriDiContainerDetails::createBlockTridiags(impl_->part_interface); impl_->norm_manager = BlockHelperDetails::NormManager(impl_->A->getComm()); IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } + // We assume that if you called this method, you intend to recompute // everything. this->IsComputed_ = false; @@ -252,7 +281,9 @@ namespace Ifpack2 { { BlockTriDiContainerDetails::performSymbolicPhase (impl_->A, - impl_->part_interface, impl_->block_tridiags, + impl_->blockGraph, + impl_->part_interface, + impl_->block_tridiags, impl_->a_minus_d, impl_->overlap_communication_and_computation); } @@ -270,7 +301,8 @@ namespace Ifpack2 { this->initialize(); { BlockTriDiContainerDetails::performNumericPhase - (impl_->A, + (impl_->A, + impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, Kokkos::ArithTraits::zero()); } @@ -303,6 +335,7 @@ namespace Ifpack2 { BlockTriDiContainerDetails::applyInverseJacobi (impl_->A, + impl_->blockGraph, impl_->tpetra_importer, impl_->async_importer, impl_->overlap_communication_and_computation, @@ -337,7 +370,8 @@ namespace Ifpack2 { this->initialize(); { BlockTriDiContainerDetails::performNumericPhase - (impl_->A, + (impl_->A, + impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, in.addRadiallyToDiagonal); } @@ -366,6 +400,7 @@ namespace Ifpack2 { { r_val = BlockTriDiContainerDetails::applyInverseJacobi (impl_->A, + impl_->blockGraph, impl_->tpetra_importer, impl_->async_importer, impl_->overlap_communication_and_computation, diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 8d47c85d1a50..311fa80f88f4 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -193,15 +193,24 @@ namespace Ifpack2 { /// template typename Teuchos::RCP::tpetra_import_type> - createBlockCrsTpetraImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { + createBlockCrsTpetraImporter(const Teuchos::RCP::tpetra_row_matrix_type> &A) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter"); using impl_type = BlockHelperDetails::ImplType; using tpetra_map_type = typename impl_type::tpetra_map_type; using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type; using tpetra_import_type = typename impl_type::tpetra_import_type; + using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; + using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + + auto A_crs = Teuchos::rcp_dynamic_cast(A); + auto A_bcrs = Teuchos::rcp_dynamic_cast(A); + + bool hasBlockCrsMatrix = ! A_bcrs.is_null (); + + // This is OK here to use the graph of the A_crs matrix and a block size of 1 + const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object - const auto g = A->getCrsGraph(); // tpetra crs graph object - const auto blocksize = A->getBlockSize(); + const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1; const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize))); const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize))); IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -883,15 +892,24 @@ namespace Ifpack2 { /// template Teuchos::RCP > - createBlockCrsAsyncImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { + createBlockCrsAsyncImporter(const Teuchos::RCP::tpetra_row_matrix_type> &A) { using impl_type = BlockHelperDetails::ImplType; using tpetra_map_type = typename impl_type::tpetra_map_type; using local_ordinal_type = typename impl_type::local_ordinal_type; using global_ordinal_type = typename impl_type::global_ordinal_type; using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; + using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + + auto A_crs = Teuchos::rcp_dynamic_cast(A); + auto A_bcrs = Teuchos::rcp_dynamic_cast(A); + + bool hasBlockCrsMatrix = ! A_bcrs.is_null (); - const auto g = A->getCrsGraph(); // tpetra crs graph object - const auto blocksize = A->getBlockSize(); + // This is OK here to use the graph of the A_crs matrix and a block size of 1 + const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object + + const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1; const auto domain_map = g.getDomainMap(); const auto column_map = g.getColMap(); @@ -1004,7 +1022,8 @@ namespace Ifpack2 { /// template BlockHelperDetails::PartInterface - createPartInterface(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + createPartInterface(const Teuchos::RCP::tpetra_row_matrix_type> &A, + const Teuchos::RCP::tpetra_crs_graph_type> &G, const Teuchos::Array::local_ordinal_type> > &partitions, const typename BlockHelperDetails::ImplType::local_ordinal_type n_subparts_per_part_in) { IFPACK2_BLOCKHELPER_TIMER("createPartInterface"); @@ -1014,7 +1033,10 @@ namespace Ifpack2 { using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; using size_type = typename impl_type::size_type; - const auto blocksize = A->getBlockSize(); + auto bA = Teuchos::rcp_dynamic_cast::tpetra_block_crs_matrix_type>(A); + + TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0); + const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize(); constexpr int vector_length = impl_type::vector_length; constexpr int internal_vector_length = impl_type::internal_vector_length; @@ -1023,7 +1045,7 @@ namespace Ifpack2 { BlockHelperDetails::PartInterface interf; const bool jacobi = partitions.size() == 0; - const local_ordinal_type A_n_lclrows = A->getLocalNumRows(); + const local_ordinal_type A_n_lclrows = G->getLocalNumRows(); const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size(); typedef std::pair size_idx_pair_type; @@ -1050,11 +1072,13 @@ namespace Ifpack2 { SolveTridiagsDefaultModeAndAlgo:: recommended_team_size(blocksize, vector_length, internal_vector_length); - const local_ordinal_type num_teams = execution_space().concurrency() / (team_size * vector_length); + const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length)); n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize); +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize); +#endif } else { n_subparts_per_part = n_subparts_per_part_in; @@ -1817,7 +1841,8 @@ namespace Ifpack2 { /// template void - performSymbolicPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + performSymbolicPhase(const Teuchos::RCP::tpetra_row_matrix_type> &A, + const Teuchos::RCP::tpetra_crs_graph_type> &g, const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, BlockHelperDetails::AmD &amd, @@ -1836,15 +1861,19 @@ namespace Ifpack2 { using size_type_1d_view = typename impl_type::size_type_1d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; using vector_type_4d_view = typename impl_type::vector_type_4d_view; + using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; constexpr int vector_length = impl_type::vector_length; const auto comm = A->getRowMap()->getComm(); - const auto& g = A->getCrsGraph(); + auto A_crs = Teuchos::rcp_dynamic_cast(A); + auto A_bcrs = Teuchos::rcp_dynamic_cast(A); - const auto blocksize = A->getBlockSize(); + bool hasBlockCrsMatrix = ! A_bcrs.is_null (); + TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0); + const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows(); // mirroring to host const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr); @@ -1861,9 +1890,9 @@ namespace Ifpack2 { Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits::invalid()); { - const auto rowmap = g.getRowMap(); - const auto colmap = g.getColMap(); - const auto dommap = g.getDomainMap(); + const auto rowmap = g->getRowMap(); + const auto colmap = g->getColMap(); + const auto dommap = g->getDomainMap(); TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null())); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__) @@ -1888,7 +1917,7 @@ namespace Ifpack2 { // construct the D and R graphs in A = D + R. { - const auto local_graph = g.getLocalGraphHost(); + const auto local_graph = g->getLocalGraphHost(); const auto local_graph_rowptr = local_graph.row_map; TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast(nrows + 1)); const auto local_graph_colidx = local_graph.entries; @@ -2116,8 +2145,11 @@ namespace Ifpack2 { } // Allocate or view values. - amd.tpetra_values = (const_cast(A.get())->getValuesDeviceNonConst()); - + if (hasBlockCrsMatrix) + amd.tpetra_values = (const_cast(A_bcrs.get())->getValuesDeviceNonConst()); + else { + amd.tpetra_values = (const_cast(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite); + } } // Allocate view for E and initialize the values with B: @@ -2713,7 +2745,8 @@ namespace Ifpack2 { using impl_scalar_type = typename impl_type::impl_scalar_type; using magnitude_type = typename impl_type::magnitude_type; /// tpetra interface - using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + using row_matrix_type = typename impl_type::tpetra_row_matrix_type; + using crs_graph_type = typename impl_type::tpetra_crs_graph_type; /// views using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; @@ -2746,8 +2779,9 @@ namespace Ifpack2 { const local_ordinal_type max_partsz; // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int) using size_type_1d_view_tpetra = Kokkos::View; - const ConstUnmanaged A_rowptr; - const ConstUnmanaged A_values; + ConstUnmanaged A_block_rowptr; + ConstUnmanaged A_point_rowptr; + ConstUnmanaged A_values; // block tridiags const ConstUnmanaged pack_td_ptr, flat_td_ptr, pack_td_ptr_schur; const ConstUnmanaged A_colindsub; @@ -2762,10 +2796,13 @@ namespace Ifpack2 { const local_ordinal_type vector_loop_size; const local_ordinal_type vector_length_value; + bool hasBlockCrsMatrix; + public: ExtractAndFactorizeTridiags(const BlockTridiags &btdm_, const BlockHelperDetails::PartInterface &interf_, - const Teuchos::RCP &A_, + const Teuchos::RCP &A_, + const Teuchos::RCP &G_, const magnitude_type& tiny_) : // interface partptr(interf_.partptr), @@ -2777,9 +2814,6 @@ namespace Ifpack2 { part2packrowidx0_sub(interf_.part2packrowidx0_sub), packindices_schur(interf_.packindices_schur), max_partsz(interf_.max_partsz), - // block crs matrix - A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map), - A_values(const_cast(A_.get())->getValuesDeviceNonConst()), // block tridiags pack_td_ptr(btdm_.pack_td_ptr), flat_td_ptr(btdm_.flat_td_ptr), @@ -2822,7 +2856,24 @@ namespace Ifpack2 { // diagonal weight to avoid zero pivots tiny(tiny_), vector_loop_size(vector_length/internal_vector_length), - vector_length_value(vector_length) {} + vector_length_value(vector_length) { + using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; + using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + + auto A_crs = Teuchos::rcp_dynamic_cast(A_); + auto A_bcrs = Teuchos::rcp_dynamic_cast(A_); + + hasBlockCrsMatrix = ! A_bcrs.is_null (); + + A_block_rowptr = G_->getLocalGraphDevice().row_map; + if (hasBlockCrsMatrix) { + A_values = const_cast(A_bcrs.get())->getValuesDeviceNonConst(); + } + else { + A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map; + A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly); + } + } private: @@ -2861,25 +2912,44 @@ namespace Ifpack2 { #endif for (local_ordinal_type tr=tr_min,j=0;tr(block[vi][idx]); + ++j; + for (local_ordinal_type ii=0;ii(block[vi][idx]); + } + } } } + else { + const size_type pi = kps + j; + + for (local_ordinal_type vi=0;vi(block[tlb::getFlatIndex(ii,jj,blocksize)]); - } - }); + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,blocksize), + [&](const local_ordinal_type &ii) { + for (local_ordinal_type jj=0;jj(block[tlb::getFlatIndex(ii,jj,blocksize)]); + } + }); + } + } + else { + for (local_ordinal_type l=lbeg;l void - performNumericPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + performNumericPhase(const Teuchos::RCP::tpetra_row_matrix_type> &A, + const Teuchos::RCP::tpetra_crs_graph_type> &G, const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, const typename BlockHelperDetails::ImplType::magnitude_type tiny) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase"); - ExtractAndFactorizeTridiags function(btdm, interf, A, tiny); + ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); function.run(); IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -4736,7 +4823,8 @@ namespace Ifpack2 { template int applyInverseJacobi(// importer - const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + const Teuchos::RCP::tpetra_row_matrix_type> &A, + const Teuchos::RCP::tpetra_crs_graph_type> &G, const Teuchos::RCP::tpetra_import_type> &tpetra_importer, const Teuchos::RCP > &async_importer, const bool overlap_communication_and_computation, @@ -4838,8 +4926,18 @@ namespace Ifpack2 { damping_factor, is_norm_manager_active); const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view; + + + auto A_crs = Teuchos::rcp_dynamic_cast(A); + auto A_bcrs = Teuchos::rcp_dynamic_cast(A); + + bool hasBlockCrsMatrix = ! A_bcrs.is_null (); + + // This is OK here to use the graph of the A_crs matrix and a block size of 1 + const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object + BlockHelperDetails::ComputeResidualVector - compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf, + compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf, is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view); // norm manager workspace resize @@ -4923,7 +5021,8 @@ namespace Ifpack2 { using async_import_type = AsyncableImport; // distructed objects - Teuchos::RCP A; + Teuchos::RCP A; + Teuchos::RCP blockGraph; Teuchos::RCP tpetra_importer; Teuchos::RCP async_importer; bool overlap_communication_and_computation; diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp index 1e26b4996649..ae707b80d916 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp @@ -280,32 +280,36 @@ static LO run_teuchos_tests (const Input& in, Teuchos::FancyOStream& out, bool& if (seq_method && overlap_comm) continue; for (const bool nonuniform_lines : {false, true}) { for (const bool pointwise : {false, true}) { - if (jacobi && nonuniform_lines) continue; - for (const int nvec : {1, 3}) { - std::stringstream ss; - ss << "test_BR_BTDC:" - << " bs " << bs - << (contiguous ? " contig" : " noncontig") - << (jacobi ? " jacobi" : " tridiag") - << (seq_method ? " seq_method" : "") - << (overlap_comm ? " overlap_comm" : "") - << (pointwise ? " point_wise" : "") - << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines") - << " nvec " << nvec; - const std::string details = ss.str(); - bool threw = false; - try { - ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines, - different_maps, jacobi, overlap_comm, seq_method, pointwise, - details); - nerr += ne; - } catch (const std::exception& e) { - threw = true; - } - if (threw) - printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str()); + for (const bool explicitConversion : {false, true}) { + if (jacobi && nonuniform_lines) continue; + if (!pointwise && explicitConversion) continue; + for (const int nvec : {1, 3}) { + std::stringstream ss; + ss << "test_BR_BTDC:" + << " bs " << bs + << (contiguous ? " contig" : " noncontig") + << (jacobi ? " jacobi" : " tridiag") + << (seq_method ? " seq_method" : "") + << (overlap_comm ? " overlap_comm" : "") + << (pointwise ? " point_wise" : "") + << (explicitConversion ? " explicit_block_conversion" : "") + << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines") + << " nvec " << nvec; + const std::string details = ss.str(); + bool threw = false; + try { + ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines, + different_maps, jacobi, overlap_comm, seq_method, pointwise, + explicitConversion, details); + nerr += ne; + } catch (const std::exception& e) { + threw = true; + } + if (threw) + printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str()); - TEUCHOS_TEST(ne == 0 && ! threw, details); + TEUCHOS_TEST(ne == 0 && ! threw, details); + } } } } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp index 7b9c8a28bb39..516c0ab8d304 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp @@ -159,7 +159,8 @@ struct BlockTriDiContainerTester { const bool nonuniform_lines = false, const bool zero_starting_soln = true, const int num_sweeps = 1, - const bool jacobi = false) { + const bool jacobi = false, + const bool explicitConversion = false) { Teuchos::Array > parts; // make_parts modifies entries of A so the call to convertToCrsMatrix // needs to happen after make_parts @@ -178,6 +179,7 @@ struct BlockTriDiContainerTester { p.set("partitioner: parts", parts); p.set("partitioner: subparts per part", 1); p.set("partitioner: block size", A->getBlockSize()); + p.set("partitioner: explicit convert to BlockCrs", explicitConversion); T->setParameters(p); } return T; @@ -207,6 +209,7 @@ struct BlockTriDiContainerTester { p.set("partitioner: parts", parts); p.set("partitioner: subparts per part", 1); p.set("partitioner: block size", -1); + p.set("partitioner: explicit convert to BlockCrs", false); T->setParameters(p); } return T; @@ -219,7 +222,8 @@ struct BlockTriDiContainerTester { make_BTDC_PW (const StructuredBlock& sb, const StructuredBlockPart& sbp, const Teuchos::RCP& A, const bool overlap_comm = false, const bool nonuniform_lines = false, - const bool jacobi = false, const bool seq_method = false) { + const bool jacobi = false, const bool seq_method = false, + const bool explicitConversion = false) { Teuchos::Array > parts; // make_parts modifies entries of A so the call to convertToCrsMatrix // needs to happen after make_parts @@ -227,7 +231,7 @@ struct BlockTriDiContainerTester { auto A_pw = Tpetra::convertToCrsMatrix(*A); return Teuchos::rcp(new Ifpack2::BlockTriDiContainer( - A_pw, parts, 1, overlap_comm, seq_method, A->getBlockSize())); + A_pw, parts, 1, overlap_comm, seq_method, A->getBlockSize(), explicitConversion)); } // Make a bare BlockTriDiContainer. @@ -248,7 +252,8 @@ struct BlockTriDiContainerTester { const StructuredBlock& sb, const StructuredBlockPart& sbp, const Int bs, const Int nvec, const bool nonuniform_lines, const bool different_maps, const bool jacobi, const bool overlap_comm, - const bool seq_method, const bool pointwise, const std::string& details) { + const bool seq_method, const bool pointwise, const bool explicitConversion, + const std::string& details) { #define TEST_BR_BTDC_FAIL(msg) do { \ ++nerr; \ if (comm->getRank() == 0) { \ @@ -286,13 +291,13 @@ struct BlockTriDiContainerTester { const Magnitude tol = 1e-3; const auto T_br = use_br ? ( pointwise ? - make_BR_BTDC_PW(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi): + make_BR_BTDC_PW(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi, explicitConversion): make_BR_BTDC(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi) ): Teuchos::null; const auto T_bare = use_br ? Teuchos::null : ( pointwise ? - make_BTDC_PW(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method): + make_BTDC_PW(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method, explicitConversion): make_BTDC(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method) ); if ( ! T_br.is_null()) { diff --git a/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp b/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp index 6155dd7f092f..5a04f8ebe20f 100644 --- a/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp +++ b/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp @@ -142,7 +142,9 @@ void MultiPhys::compute(bool reuse) { // Generate the (iii,iii) Hierarchy for (int iii = 0; iii < nBlks_; iii++) { - arrayOfParamLists_[iii]->sublist("user data").set("Coordinates", arrayOfCoords_[iii]); + if (arrayOfCoords_ != Teuchos::null) { + arrayOfParamLists_[iii]->sublist("user data").set("Coordinates", arrayOfCoords_[iii]); + } bool wantToRepartition = false; if (paramListMultiphysics_->isParameter("repartition: enable")) @@ -312,6 +314,8 @@ void MultiPhys:: } oss << std::endl; + out << oss.str(); + for (int i = 0; i < nBlks_; i++) { arrayOfHierarchies_[i]->describe(out, GetVerbLevel()); } diff --git a/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp b/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp index 3b3a6fe66184..22ef99a1fba2 100644 --- a/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp +++ b/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp @@ -2074,7 +2074,13 @@ void RefMaxwell::setupSubSolve(Teucho } if ((coarseType == "" || coarseType == "Klu" || - coarseType == "Klu2") && + coarseType == "Klu2" || + coarseType == "Superlu" || + coarseType == "Superlu_dist" || + coarseType == "Superludist" || + coarseType == "Basker" || + coarseType == "Cusolver" || + coarseType == "Tacho") && (!params.isSublist("coarse: params") || !params.sublist("coarse: params").isParameter("fix nullspace"))) params.sublist("coarse: params").set("fix nullspace", true); diff --git a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp index ba72e83f2e0c..2a7c55e84b33 100644 --- a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp +++ b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp @@ -333,7 +333,7 @@ void Amesos2Smoother::Setup(Level& cu amesos2_params->setName("Amesos2"); if ((rowMap->getGlobalNumElements() != as((rowMap->getMaxAllGlobalIndex() - rowMap->getMinAllGlobalIndex()) + 1)) || (!rowMap->isContiguous() && (rowMap->getComm()->getSize() == 1))) { - if ((type_ != "Cusolver") && !(amesos2_params->sublist(prec_->name()).template isType("IsContiguous"))) + if (((type_ != "Cusolver") && (type_ != "Tacho")) && !(amesos2_params->sublist(prec_->name()).template isType("IsContiguous"))) amesos2_params->sublist(prec_->name()).set("IsContiguous", false, "Are GIDs Contiguous"); } prec_->setParameters(amesos2_params); diff --git a/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp b/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp index 18dd726a8d10..f1ea5162cd89 100644 --- a/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp +++ b/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp @@ -155,8 +155,10 @@ void SubBlockAFactory::Build(Level& c RCP rangeMapExtractor = A->getRangeMapExtractor(); RCP domainMapExtractor = A->getDomainMapExtractor(); - RCP rangeMap = rangeMapExtractor->getMap(row); - RCP domainMap = domainMapExtractor->getMap(col); + bool thyraMode = rangeMapExtractor->getThyraMode(); + + RCP rangeMap = rangeMapExtractor->getMap(row, thyraMode); + RCP domainMap = domainMapExtractor->getMap(col, thyraMode); // use user-specified striding information if available. Otherwise try to use internal striding information from the submaps! if (bRangeUserSpecified) diff --git a/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt index 8d47cfa1a732..fb144ce6e870 100644 --- a/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt @@ -60,7 +60,7 @@ FOREACH( ORDER 1 2 3 4 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-32 - TEST_4 CMND python + TEST_4 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Quad-${ORDER}- @@ -92,7 +92,7 @@ FOREACH( ORDER 1 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-Multiblock-ConvTest-Quad-${ORDER}-32 - TEST_3 CMND python + TEST_3 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-Multiblock-ConvTest-Quad-${ORDER}- @@ -128,7 +128,7 @@ ENDFOREACH() # PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra" # NUM_MPI_PROCS 4 # OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-32 -# TEST_4 CMND python +# TEST_4 CMND python3 # ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py # ${ORDER} # MPE-ConvTest-Tri-${ORDER}- @@ -165,7 +165,7 @@ ENDFOREACH() # PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra" # NUM_MPI_PROCS 4 # OUTPUT_FILE MPE-ConvTest-Hex-${ORDER}-32 -# TEST_4 CMND python +# TEST_4 CMND python3 # ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py # ${ORDER} # MPE-ConvTest-Hex-${ORDER}- diff --git a/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt index 7768714c4deb..39908f32745d 100644 --- a/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt @@ -60,7 +60,7 @@ FOREACH( ORDER 1 2 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-32 - TEST_4 CMND python + TEST_4 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Quad-${ORDER}- @@ -92,7 +92,7 @@ FOREACH( ORDER 3 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-16 - TEST_4 CMND python + TEST_4 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Quad-${ORDER}- @@ -123,7 +123,7 @@ FOREACH( ORDER 1 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-Multiblock-ConvTest-Quad-${ORDER}-32 - TEST_3 CMND python + TEST_3 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-Multiblock-ConvTest-Quad-${ORDER}- @@ -159,7 +159,7 @@ FOREACH( ORDER 1 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-64 - TEST_4 CMND python + TEST_4 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Tri-${ORDER}- @@ -191,7 +191,7 @@ FOREACH( ORDER 2 ) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-16 - TEST_3 CMND python + TEST_3 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Tri-${ORDER}- @@ -227,7 +227,7 @@ ENDFOREACH() # PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra" # NUM_MPI_PROCS 4 # OUTPUT_FILE MPE-ConvTest-Hex-${ORDER}-32 -# TEST_4 CMND python +# TEST_4 CMND python3 # ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py # ${ORDER} # MPE-ConvTest-Hex-${ORDER}- diff --git a/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt index 0cc77eef4476..996c0723ad45 100644 --- a/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt @@ -59,7 +59,7 @@ FOREACH( ORDER 1 2 3) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Hex-p${ORDER}-12 - TEST_3 CMND python + TEST_3 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Hex-p${ORDER}- @@ -90,7 +90,7 @@ FOREACH( ORDER 1) PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-Multiblock-ConvTest-Hex-p${ORDER}-12 - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-Multiblock-ConvTest-Hex-p${ORDER}- diff --git a/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py b/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py index 517e27047d5d..90785008ec7b 100755 --- a/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py +++ b/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 """ Script for analyzing Panzer kernel performance on next-generation @@ -12,7 +12,6 @@ # Import python modules for command-line options, the operating system, regular # expressions, and system functions -import commands import argparse import os import re diff --git a/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt index 4ea8c6e0dff1..e1201f5e0c3b 100644 --- a/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt @@ -47,7 +47,7 @@ FOREACH( ORDER 1 2 3 4 ) PASS_REGULAR_EXPRESSION "ALL PASSED" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Quad-p${ORDER}-80 - TEST_5 CMND python + TEST_5 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Quad-p${ORDER}- @@ -91,7 +91,7 @@ FOREACH( ORDER 1 2 3 4 ) PASS_REGULAR_EXPRESSION "ALL PASSED" NUM_MPI_PROCS 4 OUTPUT_FILE MPE-ConvTest-Tri-p${ORDER}-80 - TEST_5 CMND python + TEST_5 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py ${ORDER} MPE-ConvTest-Tri-p${ORDER}- @@ -119,7 +119,7 @@ TRIBITS_ADD_ADVANCED_TEST( NUM_MPI_PROCS 2 OUTPUT_FILE PoissonExampleCurvilinear_3_Q1.out COMM mpi - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/compareWithGold_Curvilinear.py ${CMAKE_CURRENT_SOURCE_DIR}/PoissonExampleCurvilinear_3_Q2.gold ${CMAKE_CURRENT_SOURCE_DIR}/PoissonExampleCurvilinear_3_Q1.gold diff --git a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt index 6ff12ed190a2..d6d4b217a376 100644 --- a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt @@ -61,7 +61,7 @@ TRIBITS_ADD_ADVANCED_TEST( PASS_REGULAR_EXPRESSION "PASS BASICS" NUM_MPI_PROCS 4 OUTPUT_FILE PIE-ConvTest-20 - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py PIE-ConvTest- 10 @@ -82,7 +82,7 @@ TRIBITS_ADD_ADVANCED_TEST( PASS_REGULAR_EXPRESSION "PASS BASICS" NUM_MPI_PROCS 4 OUTPUT_FILE PIE-ConvTest-8 - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py PIE-ConvTest- 4 diff --git a/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt index 8cf8e79a965d..31587e48442d 100644 --- a/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt +++ b/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt @@ -61,7 +61,7 @@ TRIBITS_ADD_ADVANCED_TEST( PASS_REGULAR_EXPRESSION "PASS BASICS" NUM_MPI_PROCS 4 OUTPUT_FILE PIE-ConvTest-20 - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py PIE-ConvTest- 10 @@ -82,7 +82,7 @@ TRIBITS_ADD_ADVANCED_TEST( PASS_REGULAR_EXPRESSION "PASS BASICS" NUM_MPI_PROCS 4 OUTPUT_FILE PIE-ConvTest-8 - TEST_2 CMND python + TEST_2 CMND python3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py PIE-ConvTest- 4 diff --git a/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py b/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py index 606f67b9b489..90afaf9f7d26 100755 --- a/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py +++ b/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # This script will read a file and replace any Panzer Evaluator macros it # finds with their contents. diff --git a/packages/panzer/mini-em/cmake/Dependencies.cmake b/packages/panzer/mini-em/cmake/Dependencies.cmake index e4cad2ef1488..72ad76e29743 100644 --- a/packages/panzer/mini-em/cmake/Dependencies.cmake +++ b/packages/panzer/mini-em/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ SET(LIB_REQUIRED_DEP_PACKAGES PanzerCore PanzerDofMgr PanzerDiscFE PanzerAdaptersSTK Phalanx Belos Teko MueLu) -SET(LIB_OPTIONAL_DEP_PACKAGES ML) +SET(LIB_OPTIONAL_DEP_PACKAGES ML ShyLU_NodeTacho) SET(TEST_REQUIRED_DEP_PACKAGES) SET(TEST_OPTIONAL_DEP_PACKAGES) SET(LIB_REQUIRED_DEP_TPLS MPI) diff --git a/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt b/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt index e76043dd1382..a31d3475dece 100644 --- a/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt +++ b/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt @@ -23,7 +23,7 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(CopyBlockPrecFiles darcyTetAnalytic.xml darcyHexAnalytic.xml solverCG.xml solverGMRES.xml solverMueLu.xml solverMueLu2D.xml solverMueLuEpetra.xml - solverMueLuOpenMP.xml solverMueLuCuda.xml solverMueLuTruncated.xml + solverMueLuOpenMP.xml solverMueLuCuda.xml solverMueLuTruncated.xml solverMueLuTPL.xml solverMueLuHO.xml solverMueLuHOCuda.xml solverAugmentationUseILU.xml solverML.xml @@ -99,6 +99,17 @@ TRIBITS_ADD_TEST( ) +IF (${PACKAGE_NAME}_ENABLE_ShyLU_NodeTacho) + TRIBITS_ADD_TEST( + BlockPrec + NAME "Maxwell_MueLu" + POSTFIX_AND_ARGS_0 "order1_tpl" --solver=MueLu --numTimeSteps=1 --linAlgebra=Tpetra --tpl + ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF + NUM_MPI_PROCS 4 + ) + ENDIF() + + # Large performance-testing runs of 3D BlockPrec_RefMaxwell (4 and 16 ranks) TRIBITS_ADD_TEST( BlockPrec @@ -120,6 +131,32 @@ TRIBITS_ADD_TEST( RUN_SERIAL ) +IF (${PACKAGE_NAME}_ENABLE_ShyLU_NodeTacho) + + TRIBITS_ADD_TEST( + BlockPrec + NAME "MiniEM-BlockPrec_RefMaxwell_Performance_4_tpl" + ARGS "--stacked-timer --solver=MueLu --numTimeSteps=3 --linAlgebra=Tpetra --inputFile=maxwell-large.xml --tpl --test-name=\"MiniEM 3D RefMaxwell TPL\"" + ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF + COMM mpi + NUM_MPI_PROCS 4 + CATEGORIES PERFORMANCE + RUN_SERIAL + ) + + TRIBITS_ADD_TEST( + BlockPrec + NAME "MiniEM-BlockPrec_RefMaxwell_Performance_16_tpl" + ARGS "--stacked-timer --solver=MueLu --numTimeSteps=3 --linAlgebra=Tpetra --inputFile=maxwell-large.xml --tpl --test-name=\"MiniEM 3D RefMaxwell TPL\"" + ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF + COMM mpi + NUM_MPI_PROCS 16 + CATEGORIES PERFORMANCE + RUN_SERIAL + ) + +ENDIF() + TRIBITS_ADD_TEST( BlockPrec NAME "MiniEM-BlockPrec_RefMaxwell_Performance_16_reuse" diff --git a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp index d60d387ee1df..ca0f217a0cc0 100644 --- a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp @@ -100,6 +100,7 @@ namespace mini_em { Teuchos::RCP> &comm, Teuchos::RCP &out, std::string &xml, int basis_order, + const bool preferTPLs, const bool truncateMueLuHierarchy) { using Teuchos::RCP; using Teuchos::rcp; @@ -161,7 +162,9 @@ namespace mini_em { updateParams("solverMueLu2D.xml", lin_solver_pl, comm, out); } if (truncateMueLuHierarchy) - updateParams("solverMueLuTruncated.xml", lin_solver_pl, comm, out); + updateParams("solverMueLuTruncated.xml", lin_solver_pl, comm, out); + if (preferTPLs) + updateParams("solverMueLuTPL.xml", lin_solver_pl, comm, out); if (basis_order > 1) { RCP lin_solver_pl_lo = lin_solver_pl; lin_solver_pl = rcp(new Teuchos::ParameterList("Linear Solver")); diff --git a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp index ff5eb74b2833..cdbb8a82ee9d 100644 --- a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp +++ b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp @@ -59,6 +59,7 @@ namespace mini_em { Teuchos::RCP &out, std::string &xml, int basis_order, + const bool preferTPLs = false, const bool truncateMueLuHierarchy = false); void setClosureParameters(physicsType physics, diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp index 34cfc46df97c..2857b7af46b2 100644 --- a/packages/panzer/mini-em/example/BlockPrec/main.cpp +++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp @@ -110,7 +110,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) { // defaults for command-line options int x_elements=-1,y_elements=-1,z_elements=-1,basis_order=1; - int workset_size=20; + int workset_size=2000; std::string pCoarsenScheduleStr = "1"; double dt=0.0; std::string meshFile = ""; @@ -120,6 +120,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) std::string xml = ""; solverType solverValues[5] = {AUGMENTATION, MUELU, ML, CG, GMRES}; const char * solverNames[5] = {"Augmentation", "MueLu", "ML", "CG", "GMRES"}; + bool preferTPLs = false; bool truncateMueLuHierarchy = false; solverType solver = MUELU; int numTimeSteps = 1; @@ -147,6 +148,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) clp.setOption("inputFile",&input_file,"XML file with the problem definitions"); clp.setOption("solverFile",&xml,"XML file with the solver params"); clp.setOption("solver",&solver,5,solverValues,solverNames,"Solver that is used"); + clp.setOption("tpl", "no-tpl", &preferTPLs, "Prefer TPL usage over fused kernels"); clp.setOption("truncateMueLuHierarchy", "no-truncateMueLuHierarchy", &truncateMueLuHierarchy, "Truncate the MueLu hierarchy"); clp.setOption("numTimeSteps",&numTimeSteps); clp.setOption("finalTime",&finalTime); @@ -292,7 +294,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[]) throw; } - RCP lin_solver_pl = mini_em::getSolverParameters(linAlgebra, physics, solver, dim, comm, out, xml, basis_order, truncateMueLuHierarchy); + RCP lin_solver_pl = mini_em::getSolverParameters(linAlgebra, physics, solver, dim, comm, out, xml, basis_order, preferTPLs, truncateMueLuHierarchy); if (lin_solver_pl->sublist("Preconditioner Types").isSublist("Teko") && lin_solver_pl->sublist("Preconditioner Types").sublist("Teko").isSublist("Inverse Factory Library")) { diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml index 6fc4966b38a6..78709fd571ba 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml @@ -56,7 +56,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml index 68ed5f2abf1a..5dda86d61a0d 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml @@ -33,7 +33,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml index 3a8584084e82..8ea32881b78d 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml index 4a3c9eb2e884..1d8d7e47eae8 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml index 87758fc0173b..9856f4fc37f3 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml index dee0f8077980..45e2454f27f0 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml @@ -24,7 +24,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml index e3da35320350..0a83103c0b81 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml @@ -24,7 +24,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml index 1a582547ec88..b69f15a758f8 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml @@ -24,7 +24,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml index 508639541ab6..b4f11120c050 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml @@ -24,7 +24,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml index 88ec43e607b5..d31bc2e4b0e7 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml @@ -24,7 +24,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml index 0e7623d9f925..57b54ec058b2 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml @@ -41,7 +41,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml index 4913f1f5fc41..cf2809ffdfd3 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml index 7cb156f0771e..1fdc977dc9ed 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml index 9ce84b7349d7..141ff1349ce9 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml @@ -28,7 +28,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml index a64500279c77..867b31679a0c 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml @@ -56,7 +56,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell.xml index 95c32435ba5c..567e7f60d831 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell.xml @@ -56,7 +56,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml index 8e9fb32117c8..f4452b2fc93f 100644 --- a/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml +++ b/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml @@ -34,7 +34,6 @@ - diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml index df5b9ad49160..794d8ad984df 100644 --- a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml +++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml @@ -157,6 +157,8 @@ + + @@ -175,6 +177,7 @@ + @@ -201,6 +204,8 @@ + + @@ -219,6 +224,7 @@ + @@ -258,6 +264,8 @@ + + @@ -276,6 +284,7 @@ + diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml index a3ab680e2f5e..47034e77c794 100644 --- a/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml +++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml @@ -1,3 +1,5 @@ + + diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml new file mode 100644 index 000000000000..98242888745c --- /dev/null +++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml @@ -0,0 +1,106 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp b/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp index 8199145715ad..ba501e10b930 100644 --- a/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp +++ b/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp @@ -340,9 +340,7 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP(worksetSize); DynRankDeviceView ho_dofCoords_d("ho_dofCoords_d", hoCardinality, dim); DynRankDeviceView basisCoeffsLIOriented_d("basisCoeffsLIOriented_d", numCells, hoCardinality, loCardinality); - typename Kokkos::DynRankView elemOrts_d ("elemOrts_d", numCells); - typename Kokkos::DynRankView::HostMirror elemNodes_h("elemNodes_h", numCells, numElemVertices); - typename Kokkos::DynRankView elemNodes_d("elemNodes_d", numCells, numElemVertices); + // the ranks of these depend on dimension DynRankDeviceView ho_dofCoeffs_d; @@ -370,6 +368,13 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP::eps(); + + // HO dof coordinates and coefficients + ho_basis->getDofCoords(ho_dofCoords_d); + + // compute values of op * (LO basis) at HO dof coords on reference element + lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); + // loop over element blocks std::vector elementBlockIds; blockedDOFMngr->getElementBlockIds(elementBlockIds); @@ -381,44 +386,48 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP elementIds_d("elementIds_d", elementIds_h.extent(0)); Kokkos::deep_copy(elementIds_d, elementIds_h); - for(std::size_t elemIter = 0; elemIter < elementIds.size(); elemIter += numCells) { + // get element orientations + typename Kokkos::DynRankView elemOrts_d ("elemOrts_d", elementIds.size()); + { + typename Kokkos::DynRankView::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices); + typename Kokkos::DynRankView elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices); - // get element orientations - for (int cellNo = 0; cellNo < numCells; cellNo++) { - if (elemIter+cellNo >= elementIds.size()) - continue; - const GlobalOrdinal* node_ids = node_conn->getConnectivity(elementIds[elemIter+cellNo]); + for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) { + const GlobalOrdinal* node_ids = node_conn->getConnectivity(elementIds[cellNo]); for(int i = 0; i < numElemVertices; i++) elemNodes_h(cellNo, i) = node_ids[i]; } Kokkos::deep_copy(elemNodes_d, elemNodes_h); ots::getOrientation(elemOrts_d, elemNodes_d, topology); + } - // HO dof coordinates and coefficients - ho_basis->getDofCoords(ho_dofCoords_d); + for(std::size_t elemIter = 0; elemIter < elementIds.size(); elemIter += numCells) { - // compute values of op * (LO basis) at HO dof coords on reference element - lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); + int endCellRange = + std::min(numCells, Teuchos::as(elementIds.size()) - + Teuchos::as(elemIter)); + + // get subviews on workset + auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size()))); + auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL()); // apply orientations for LO basis // shuffles things in the second dimension, i.e. wrt LO basis - ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d, + ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d, valuesAtDofCoordsNonOriented_d, - elemOrts_d, + elemOrtsWorkset_d, lo_basis.get()); //get basis coefficients of LO basis functions wrt HO basis for(size_t loIter=0; loIter(elementIds.size()) - - Teuchos::as(elemIter)); + li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), loIter), + Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), loIter, Kokkos::ALL(), Kokkos::ALL()), + ho_basis.get(), + elemOrtsWorkset_d); #ifdef PANZER_HAVE_EPETRA_STACK if (tblof.is_null()) { // Epetra fill @@ -426,7 +435,7 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP indices_d("indices", loCardinality); Kokkos::View values_d ("values", loCardinality); - + for (int cellNo = 0; cellNo < endCellRange; cellNo++) { auto elemId = elementIds_d(elemIter+cellNo); diff --git a/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp b/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp index 8302820fcc1d..cc54d77fdd4e 100644 --- a/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp +++ b/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp @@ -289,6 +289,12 @@ namespace mini_em { Kokkos::View node_elementLidToConn_d("node_elementLidToConn_d", node_elementLidToConn_h.extent(0)); Kokkos::deep_copy(node_elementLidToConn_d, node_elementLidToConn_h); + // HO dof coordinates and coefficients + ho_basis->getDofCoords(ho_dofCoords_d); + + // compute values of op * (LO basis) at HO dof coords on reference element + lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); + // loop over element blocks std::vector elementBlockIds; blockedDOFMngr->getElementBlockIds(elementBlockIds); @@ -299,31 +305,39 @@ namespace mini_em { Kokkos::View::HostMirror elementIds_h(elementIds.data(), elementIds.size()); Kokkos::View elementIds_d("elementIds_d", elementIds_h.extent(0)); Kokkos::deep_copy(elementIds_d, elementIds_h); - for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) { - // get element orientations - Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp::connectivity", - range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as(elemIter))), - KOKKOS_LAMBDA(const LocalOrdinal cellNo) { - LocalOrdinal cellNo2 = elemIter+cellNo; - LocalOrdinal elementID = elementIds_d(cellNo2); - LocalOrdinal k = node_elementLidToConn_d(elementID); - for(int i = 0; i < node_connectivitySize_d(elementID); i++) - elemNodes_d(cellNo, i) = node_connectivity_d(k+i); - }); + // get element orientations + typename Kokkos::DynRankView elemOrts_d ("elemOrts_d", elementIds.size()); + { + typename Kokkos::DynRankView::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices); + typename Kokkos::DynRankView elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices); + + for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) { + const GlobalOrdinal* node_ids = node_conn_->getConnectivity(elementIds[cellNo]); + for(int i = 0; i < numElemVertices; i++) + elemNodes_h(cellNo, i) = node_ids[i]; + } + Kokkos::deep_copy(elemNodes_d, elemNodes_h); + ots::getOrientation(elemOrts_d, elemNodes_d, topology); + } + + for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) { - // HO dof coordinates - ho_basis->getDofCoords(ho_dofCoords_d); + int endCellRange = + std::min(numCells, Teuchos::as(elementIds.size()) - + Teuchos::as(elemIter)); - // compute values of op * (LO basis) at HO dof coords on reference element - lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); + // get subviews on workset + auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size()))); + auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL()); // apply orientations for LO basis // shuffles things in the second dimension, i.e. wrt LO basis - ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d, + ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d, valuesAtDofCoordsNonOriented_d, - elemOrts_d, + elemOrtsWorkset_d, lo_basis.get()); Kokkos::deep_copy(reducedValuesAtDofCoordsOriented_d, 0.0); @@ -346,10 +360,10 @@ namespace mini_em { for (size_t j = 0; jgetLIDs(); Kokkos::fence(); + // HO dof coordinates and coefficients + ho_basis->getDofCoords(ho_dofCoords_d); + + // compute values of op * (LO basis) at HO dof coords on reference element + lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); + // loop over element blocks std::vector elementBlockIds; blockedDOFMngr->getElementBlockIds(elementBlockIds); @@ -490,46 +510,40 @@ namespace mini_em { Kokkos::View elementIds_d("elementIds_d", elementIds_h.extent(0)); Kokkos::deep_copy(elementIds_d, elementIds_h); Kokkos::fence(); - for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) { - // get element orientations - auto node_connectivity_h = node_conn_->getConnectivityView(); - Kokkos::View node_connectivity_d("node_connectivity_d", node_connectivity_h.extent(0)); - Kokkos::deep_copy(node_connectivity_d, node_connectivity_h); - auto node_connectivitySize_h = node_conn_->getConnectivitySizeView(); - Kokkos::View node_connectivitySize_d("node_connectivitySize_d", node_connectivitySize_h.extent(0)); - Kokkos::deep_copy(node_connectivitySize_d, node_connectivitySize_h); - auto node_elementLidToConn_h = node_conn_->getElementLidToConnView(); - Kokkos::View node_elementLidToConn_d("node_elementLidToConn_d", node_elementLidToConn_h.extent(0)); - Kokkos::deep_copy(node_elementLidToConn_d, node_elementLidToConn_h); - Kokkos::fence(); + // get element orientations + typename Kokkos::DynRankView elemOrts_d ("elemOrts_d", elementIds.size()); + { + typename Kokkos::DynRankView::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices); + typename Kokkos::DynRankView elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices); + + for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) { + const GlobalOrdinal* node_ids = node_conn_->getConnectivity(elementIds[cellNo]); + for(int i = 0; i < numElemVertices; i++) + elemNodes_h(cellNo, i) = node_ids[i]; + } + Kokkos::deep_copy(elemNodes_d, elemNodes_h); - Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp::connectivity", - range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as(elemIter))), - KOKKOS_LAMBDA(const LocalOrdinal cellNo) { - LocalOrdinal cellNo2 = elemIter+cellNo; - LocalOrdinal elementID = elementIds_d(cellNo2); - LocalOrdinal k = node_elementLidToConn_d(elementID); - for(int i = 0; i < node_connectivitySize_d(elementID); i++) - elemNodes_d(cellNo, i) = node_connectivity_d(k+i); - }); - Kokkos::fence(); ots::getOrientation(elemOrts_d, elemNodes_d, topology); - Kokkos::fence(); + } - // HO dof coordinates and coefficients - ho_basis->getDofCoords(ho_dofCoords_d); - Kokkos::fence(); + for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) { + + int endCellRange = + std::min(numCells, Teuchos::as(elementIds.size()) - + Teuchos::as(elemIter)); + + // get subviews on workset + auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size()))); + auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL()); - // compute values of op * (LO basis) at HO dof coords on reference element - lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op); - Kokkos::fence(); // apply orientations for LO basis // shuffles things in the second dimension, i.e. wrt LO basis - ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d, + ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d, valuesAtDofCoordsNonOriented_d, - elemOrts_d, + elemOrtsWorkset_d, lo_basis.get()); Kokkos::fence(); @@ -537,9 +551,9 @@ namespace mini_em { for(size_t loIter=0; loIter& rows) { + const int nrows = rows.size(); + TEUCHOS_ASSERT(blockCount(blo) >= nrows); + std::vector subVectors; + for (int row_id = 0; row_id < nrows; ++row_id) { + const auto row = rows[row_id]; + subVectors.push_back(getBlock(row, blo)); + } + + return buildBlockedMultiVector(subVectors); +} + +BlockedLinearOp extractSubblockMatrix(const BlockedLinearOp& blo, const std::vector& rows, + const std::vector& cols) { + int nOuterBlockRows = blockRowCount(blo); + for (auto&& row : rows) { + TEUCHOS_ASSERT(row < nOuterBlockRows); + } + int nOuterBlockCols = blockColCount(blo); + for (auto&& col : cols) { + TEUCHOS_ASSERT(col < nOuterBlockCols); + } + + // allocate new operator + auto subblock = createBlockedOp(); + + const int nInnerBlockRows = rows.size(); + const int nInnerBlockCols = cols.size(); + + // build new operator + subblock->beginBlockFill(nInnerBlockRows, nInnerBlockCols); + for (int innerBlockRow = 0U; innerBlockRow < nInnerBlockRows; ++innerBlockRow) { + for (int innerBlockCol = 0U; innerBlockCol < nInnerBlockCols; ++innerBlockCol) { + auto [outerBlockRow, outerBlockCol] = + std::make_tuple(rows[innerBlockRow], cols[innerBlockCol]); + auto A_row_col = blo->getBlock(outerBlockRow, outerBlockCol); + + if (A_row_col != Teuchos::null) { + subblock->setBlock(innerBlockRow, innerBlockCol, A_row_col); + } else { + // scan to find first non-null range/domain, construct zero operator + VectorSpace range; + VectorSpace domain; + + for (int outerBlock = 0; outerBlock < nOuterBlockCols; ++outerBlock) { + auto A_ij = blo->getBlock(outerBlockRow, outerBlock); + if (A_ij != Teuchos::null) { + range = A_ij->range(); + break; + } + } + + for (int outerBlock = 0; outerBlock < nOuterBlockRows; ++outerBlock) { + auto A_ij = blo->getBlock(outerBlock, outerBlockCol); + if (A_ij != Teuchos::null) { + domain = A_ij->domain(); + break; + } + } + + TEUCHOS_ASSERT(range != Teuchos::null); + TEUCHOS_ASSERT(domain != Teuchos::null); + + subblock->setBlock(innerBlockRow, innerBlockCol, zero(range, domain)); + } + } + } + + subblock->endBlockFill(); + + return subblock; +} + +std::vector tokenize_input(std::string input, const std::string& delimiter) { + size_t pos = 0; + std::vector tokens; + while ((pos = input.find(delimiter)) != std::string::npos) { + tokens.push_back(input.substr(0, pos)); + input.erase(0, pos + delimiter.length()); + } + tokens.push_back(input); + return tokens; +} + +// Parse hierarchical block number from: +// +// +std::optional extract_hierarchical_block_number(const Teuchos::ParameterList& params, + const std::string& key) { + const std::string blockHierarchy = "Hierarchical Block "; + if (key.find(blockHierarchy) == std::string::npos) return {}; + if (!params.isSublist(key)) return {}; + + int blockNumber = -1; + try { + auto tokens = tokenize_input(key, " "); + blockNumber = std::stoi(tokens.back()); + } catch (std::exception& err) { + std::ostringstream ss; + ss << "Error occured when trying to parse entry with key " << key << "\n"; + ss << "It said:\n" << err.what() << "\n"; + TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, ss.str()); + } + return blockNumber - 1; // adjust to 0-based indexing +} + +std::vector extract_block_numbers(const Teuchos::ParameterList& params) { + const std::string includedBlocks = "Included Subblocks"; + + std::ostringstream ss; + ss << "Parameter 'Included Subblocks' is missing for params:\n" << params; + TEUCHOS_TEST_FOR_EXCEPTION(!params.isParameter(includedBlocks), std::runtime_error, ss.str()); + std::vector blocks; + try { + auto blockStrs = tokenize_input(params.get(includedBlocks), ","); + for (const auto& blockStr : blockStrs) { + blocks.emplace_back(std::stoi(blockStr) - 1); // adjust to 0-based indexing + } + } catch (std::exception& err) { + std::ostringstream errSS; + errSS << "Error occured when trying to parse 'Included SubBlocks' for params:\n" + << params << "\n"; + errSS << "It said:\n" << err.what() << "\n"; + TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, errSS.str()); + } + return blocks; +} + +unsigned index(unsigned i, unsigned j, unsigned N) { return i * N + j; } + +} // namespace + +NestedBlockGS::NestedBlockGS(const std::map>& blockToRow_, + const std::map& blockToInvOp_, BlockedLinearOp& A_, + bool useLowerTriangle_) + : blockToRow(blockToRow_), + blockToInvOp(blockToInvOp_), + A(A_), + useLowerTriangle(useLowerTriangle_) { + productRange_ = A->productRange(); + productDomain_ = A->productDomain(); + const auto Nrows = blockToRow.size(); + Ab.resize(Nrows * Nrows); + for (auto [hierchicalBlockRow, rows] : blockToRow) { + for (auto [hierchicalBlockCol, cols] : blockToRow) { + Ab[index(hierchicalBlockRow, hierchicalBlockCol, Nrows)] = + extractSubblockMatrix(A, rows, cols); + } + } +} + +void NestedBlockGS::implicitApply(const BlockedMultiVector& r, BlockedMultiVector& z, + const double alpha, const double beta) const { + const int blocks = blockToRow.size(); + + auto src = deepcopy(r); + std::vector rVec; + std::vector zVec; + + for (int b = 0; b < blocks; b++) { + rVec.push_back(extractSubBlockVector(src, blockToRow.at(b))); + zVec.push_back(extractSubBlockVector(z, blockToRow.at(b))); + } + + if (useLowerTriangle) { + lowerTriangularImplicitApply(rVec, zVec, alpha, beta); + } else { + upperTriangularImplicitApply(rVec, zVec, alpha, beta); + } +} + +void NestedBlockGS::upperTriangularImplicitApply(std::vector& r, + std::vector& z, + const double /* alpha */, + const double /* beta */) const { + const int blocks = blockToRow.size(); + + for (int b = blocks - 1; b >= 0; b--) { + applyOp(blockToInvOp.at(b), r[b], z[b]); + + for (int i = 0; i < b; i++) { + auto u_ib = Ab[index(i, b, blocks)]; + if (u_ib != Teuchos::null) { + applyOp(u_ib, z[b], r[i], -1.0, 1.0); + } + } + } +} + +void NestedBlockGS::lowerTriangularImplicitApply(std::vector& r, + std::vector& z, + const double /* alpha */, + const double /* beta */) const { + const int blocks = blockToRow.size(); + + for (int b = 0; b < blocks; b++) { + applyOp(blockToInvOp.at(b), r[b], z[b]); + + // loop over each row + for (int i = b + 1; i < blocks; i++) { + auto l_ib = Ab[index(i, b, blocks)]; + if (l_ib != Teuchos::null) { + applyOp(l_ib, z[b], r[i], -1.0, 1.0); + } + } + } +} + +HierarchicalGaussSeidelPreconditionerFactory::HierarchicalGaussSeidelPreconditionerFactory() = + default; + +LinearOp HierarchicalGaussSeidelPreconditionerFactory::buildPreconditionerOperator( + BlockedLinearOp& blo, BlockPreconditionerState& state) const { + for (auto [hierarchicalBlockNum, blocks] : blockToRow) { + auto A_bb = extractSubblockMatrix(blo, blocks, blocks); + blockToInvOp[hierarchicalBlockNum] = this->buildBlockInverse( + *blockToInverse.at(hierarchicalBlockNum), A_bb, state, hierarchicalBlockNum); + } + + return Teuchos::rcp(new NestedBlockGS(blockToRow, blockToInvOp, blo, useLowerTriangle)); +} + +LinearOp HierarchicalGaussSeidelPreconditionerFactory::buildBlockInverse( + const InverseFactory& invFact, const BlockedLinearOp& matrix, BlockPreconditionerState& state, + int hierarchicalBlockNum) const { + std::stringstream ss; + ss << "hierarchical_block_" << hierarchicalBlockNum; + + ModifiableLinearOp& invOp = state.getModifiableOp(ss.str()); + + if (invOp == Teuchos::null) { + invOp = buildInverse(invFact, matrix); + } else { + rebuildInverse(invFact, matrix, invOp); + } + + return invOp; +} + +void HierarchicalGaussSeidelPreconditionerFactory::initializeFromParameterList( + const Teuchos::ParameterList& pl) { + RCP invLib = getInverseLibrary(); + + for (const auto& entry : pl) { + const auto key = entry.first; + const auto value = entry.second; + auto hierarchicalBlockNumber = extract_hierarchical_block_number(pl, key); + if (!hierarchicalBlockNumber) continue; + const auto& hierarchicalParams = pl.sublist(key); + blockToRow[*hierarchicalBlockNumber] = extract_block_numbers(hierarchicalParams); + + std::ostringstream ss; + ss << "Missing required parameter \"Inverse Type\" for hierarchical block " + << *hierarchicalBlockNumber << "\n"; + TEUCHOS_TEST_FOR_EXCEPTION(!hierarchicalParams.isParameter("Inverse Type"), std::runtime_error, + ss.str()); + auto invStr = hierarchicalParams.get("Inverse Type"); + blockToInverse[*hierarchicalBlockNumber] = invLib->getInverseFactory(invStr); + } + + useLowerTriangle = false; + if (pl.isParameter("Use Upper Triangle")) { + useLowerTriangle = !pl.get("Use Upper Triangle"); + } +} + +} // namespace Teko diff --git a/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp new file mode 100644 index 000000000000..b0ab109c7607 --- /dev/null +++ b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp @@ -0,0 +1,118 @@ +/* +// @HEADER +// +// *********************************************************************** +// +// Teko: A package for block and physics based preconditioning +// Copyright 2010 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Eric C. Cyr (eccyr@sandia.gov) +// +// *********************************************************************** +// +// @HEADER + +*/ + +#ifndef __Teko_HierarchicalGaussSeidelPreconditionerFactory_hpp__ +#define __Teko_HierarchicalGaussSeidelPreconditionerFactory_hpp__ + +#include "Teuchos_RCP.hpp" + +#include "Teko_BlockPreconditionerFactory.hpp" +#include "Teko_BlockImplicitLinearOp.hpp" +#include "Teko_Utilities.hpp" +#include +#include + +namespace Teko { + +class NestedBlockGS : public BlockImplicitLinearOp { + public: + NestedBlockGS(const std::map>& blockToRow_, + const std::map& blockToInvOp_, BlockedLinearOp& A_, + bool useLowerTriangle_ = false); + + VectorSpace range() const override { return productRange_; } + VectorSpace domain() const override { return productDomain_; } + + void implicitApply(const BlockedMultiVector& r, BlockedMultiVector& z, const double alpha = 1.0, + const double beta = 0.0) const override; + + private: + void upperTriangularImplicitApply(std::vector& r, + std::vector& z, const double alpha = 1.0, + const double beta = 0.0) const; + + void lowerTriangularImplicitApply(std::vector& r, + std::vector& z, const double alpha = 1.0, + const double beta = 0.0) const; + + // block operators + std::map> blockToRow; + std::map blockToInvOp; + std::vector invOps; + BlockedLinearOp A; + std::vector Ab; + bool useLowerTriangle = false; + + Teuchos::RCP> productRange_; ///< Range vector space. + Teuchos::RCP> + productDomain_; ///< Domain vector space. +}; + +class HierarchicalGaussSeidelPreconditionerFactory : public BlockPreconditionerFactory { + public: + ~HierarchicalGaussSeidelPreconditionerFactory() override = default; + HierarchicalGaussSeidelPreconditionerFactory(); + + LinearOp buildPreconditionerOperator(BlockedLinearOp& blo, + BlockPreconditionerState& state) const override; + + protected: + void initializeFromParameterList(const Teuchos::ParameterList& pl) override; + using BlockPreconditionerFactory::buildPreconditionerOperator; + + private: + LinearOp buildBlockInverse(const InverseFactory& invFact, const BlockedLinearOp& matrix, + BlockPreconditionerState& state, int hierarchicalBlockNum) const; + + std::map> blockToRow; + std::map> blockToInverse; + mutable std::map blockToInvOp; + + bool useLowerTriangle = false; +}; +} // end namespace Teko + +#endif diff --git a/packages/teko/src/Teko_PreconditionerFactory.cpp b/packages/teko/src/Teko_PreconditionerFactory.cpp index f58867056b9e..31801b9665ac 100644 --- a/packages/teko/src/Teko_PreconditionerFactory.cpp +++ b/packages/teko/src/Teko_PreconditionerFactory.cpp @@ -53,6 +53,7 @@ // Specific preconditioners included for dynamic creation #include "Teko_JacobiPreconditionerFactory.hpp" #include "Teko_GaussSeidelPreconditionerFactory.hpp" +#include "Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp" #include "Teko_AddPreconditionerFactory.hpp" #include "Teko_MultPreconditionerFactory.hpp" #include "Teko_LU2x2PreconditionerFactory.hpp" @@ -278,6 +279,9 @@ void PreconditionerFactory::initializePrecFactoryBuilder() { clone = rcp(new AutoClone()); precFactoryBuilder_.addClone("Block Gauss-Seidel", clone); + clone = rcp(new AutoClone()); + precFactoryBuilder_.addClone("Hierarchical Block Gauss-Seidel", clone); + clone = rcp(new AutoClone()); precFactoryBuilder_.addClone("Block Add", clone); diff --git a/packages/teko/src/Teko_Utilities.hpp b/packages/teko/src/Teko_Utilities.hpp index bf86d9eaf535..f5543898171a 100644 --- a/packages/teko/src/Teko_Utilities.hpp +++ b/packages/teko/src/Teko_Utilities.hpp @@ -343,6 +343,10 @@ typedef Teuchos::RCP > ModifiableLinearOp; //! Build a square zero operator from a single vector space inline LinearOp zero(const VectorSpace &vs) { return Thyra::zero(vs, vs); } +inline LinearOp zero(const VectorSpace &range, const VectorSpace &domain) { + return Thyra::zero(range, domain); +} + //! Replace nonzeros with a scalar value, used to zero out an operator #ifdef TEKO_HAVE_EPETRA void putScalar(const ModifiableLinearOp &op, double scalar); diff --git a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp index 84f04d3e28a5..6f880c95d84b 100644 --- a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp +++ b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp @@ -279,6 +279,13 @@ int tSIMPLEPreconditionerFactory_tpetra::runTest(int verbosity, std::ostream &st failcount += status ? 0 : 1; totalrun++; + status = test_hierarchicalSolves(verbosity, failstrm); + Teko_TEST_MSG_tpetra(stdstrm, 1, " \"hierarchicalSolves\" ... PASSED", + " \"hierarchicalSolves\" ... FAILED"); + allTests &= status; + failcount += status ? 0 : 1; + totalrun++; + status = test_diagonal(verbosity, failstrm, 0); Teko_TEST_MSG_tpetra(stdstrm, 1, " \"diagonal(diag)\" ... PASSED", " \"diagonal(diag)\" ... FAILED"); @@ -751,5 +758,56 @@ bool tSIMPLEPreconditionerFactory_tpetra::test_iterativeSolves(int verbosity, st return true; } +bool tSIMPLEPreconditionerFactory_tpetra::test_hierarchicalSolves(int verbosity, std::ostream &os) { + bool status = false; + bool allPassed = true; + + const RCP > blkOp = Thyra::defaultBlockedLinearOp(); + blkOp->beginBlockFill(3, 3); + blkOp->setBlock(0, 0, F_); + blkOp->setBlock(0, 1, Bt_); + blkOp->setBlock(1, 0, B_); + blkOp->setBlock(1, 1, C_); + blkOp->setBlock(1, 2, B_); + blkOp->setBlock(2, 1, Bt_); + blkOp->setBlock(2, 2, F_); + blkOp->endBlockFill(); + + RCP params = Teuchos::rcp(new ParameterList()); + ParameterList &tekoList = params->sublist("Preconditioner Types").sublist("Teko"); + tekoList.set("Inverse Type", "HBGS"); + ParameterList &ifl = tekoList.sublist("Inverse Factory Library"); + + ifl.sublist("HBGS").set("Type", "Hierarchical Block Gauss-Seidel"); + + ifl.sublist("HBGS").sublist("Hierarchical Block 1").set("Included Subblocks", "1,2"); + ifl.sublist("HBGS").sublist("Hierarchical Block 1").set("Inverse Type", "SIMPLE"); + + ifl.sublist("HBGS").sublist("Hierarchical Block 2").set("Included Subblocks", "3"); + ifl.sublist("HBGS").sublist("Hierarchical Block 2").set("Inverse Type", "SINGLE_BGS"); + + ifl.sublist("SIMPLE").set("Type", "NS SIMPLE"); + ifl.sublist("SIMPLE").set("Inverse Velocity Type", "Belos"); + ifl.sublist("SIMPLE").set("Preconditioner Velocity Type", "Ifpack2"); + ifl.sublist("SIMPLE").set("Inverse Pressure Type", "Belos"); + ifl.sublist("SIMPLE").set("Preconditioner Pressure Type", "Ifpack2"); + + ifl.sublist("SINGLE_BGS").set("Type", "Block Gauss-Seidel"); + ifl.sublist("SINGLE_BGS").set("Inverse Type", "Ifpack2"); + + RCP invLib = Teko::InverseLibrary::buildFromParameterList(ifl); + RCP invFact = invLib->getInverseFactory("HBGS"); + + Teko::ModifiableLinearOp inv = Teko::buildInverse(*invFact, blkOp); + TEST_ASSERT(!inv.is_null(), "Constructed preconditioner is null"); + + if (!inv.is_null()) { + Teko::rebuildInverse(*invFact, blkOp, inv); + TEST_ASSERT(!inv.is_null(), "Constructed preconditioner is null"); + } + + return true; +} + } // end namespace Test } // end namespace Teko diff --git a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp index c31a423bf3ec..8a1676a82c96 100644 --- a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp +++ b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp @@ -73,6 +73,7 @@ class tSIMPLEPreconditionerFactory_tpetra : public UnitTest { bool test_uninitializePrec(int verbosity, std::ostream& os); bool test_isCompatable(int verbosity, std::ostream& os); bool test_iterativeSolves(int verbosity, std::ostream& os); + bool test_hierarchicalSolves(int verbosity, std::ostream& os); // non-member tests bool test_result(int verbosity, std::ostream& os, int use_blocking); diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index febf5fc28ddd..df302e9f1d8e 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5093,13 +5093,14 @@ CrsMatrix:: // Decide now whether to use the imbalanced row path, or the default. bool useMergePath = false; LocalOrdinal nrows = getLocalNumRows(); +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE LocalOrdinal maxRowImbalance = 0; if(nrows != 0) maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows); //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and, //we can use SPMV_MERGE_PATH for the native spmv as well. //Take out this ifdef to enable that. -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold()) useMergePath = true; #endif diff --git a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp index 7dd95c747adb..a78d4294e231 100644 --- a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp +++ b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp @@ -38,11 +38,13 @@ // ************************************************************************ // @HEADER */ +// clang-format off #include "Tpetra_Details_KokkosCounter.hpp" #include "TpetraCore_config.h" #include "Kokkos_Core.hpp" #include "Teuchos_TestForException.hpp" #include +#include namespace Tpetra { namespace Details { @@ -201,6 +203,54 @@ namespace Details { TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,std::string("Error: ") + device + std::string(" is not a device known to Tpetra")); } +// clang-format on +namespace KokkosRegionCounterDetails { +std::vector regions; + +void push_region_callback(const char *label) { regions.push_back(label); } +static_assert(std::is_same_v, + "Unexpected Kokkos profiling interface API. This is an internal " + "Tpetra developer error, please report this."); + +} // namespace KokkosRegionCounterDetails + +void KokkosRegionCounter::start() { + Kokkos::Tools::Experimental::set_push_region_callback( + KokkosRegionCounterDetails::push_region_callback); +} + +void KokkosRegionCounter::reset() { + KokkosRegionCounterDetails::regions.clear(); +} + +void KokkosRegionCounter::stop() { + Kokkos::Tools::Experimental::set_push_region_callback(nullptr); +} + +size_t +KokkosRegionCounter::get_count_region_contains(const std::string &needle) { + size_t count = 0; + for (const auto ®ion : KokkosRegionCounterDetails::regions) { + count += (region.find(needle) != std::string::npos); + } + return count; +} + +void KokkosRegionCounter::dump_regions(Teuchos::FancyOStream &os) { + for (const auto ®ion : KokkosRegionCounterDetails::regions) { + os << region << "\n"; + } +} + +void KokkosRegionCounter::dump_regions(std::ostream &os) { + for (const auto ®ion : KokkosRegionCounterDetails::regions) { + os << region << "\n"; + } +} + + +// clang-format off } // namespace Details diff --git a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp index e6a8bccb9375..702819fad61d 100644 --- a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp @@ -38,6 +38,7 @@ // ************************************************************************ // @HEADER */ +// clang-format off #ifndef TPETRA_DETAILS_KOKKOS_COUNTER_HPP #define TPETRA_DETAILS_KOKKOS_COUNTER_HPP @@ -46,6 +47,7 @@ /// types using the Kokkos Profiling Library #include +#include namespace Tpetra { namespace Details { @@ -87,6 +89,30 @@ namespace FenceCounter { size_t get_count_global(const std::string & device); } +// clang-format on + +/// \brief Counter for Kokkos regions representing third-party library usage +namespace KokkosRegionCounter { +/// \brief Start the counter +void start(); + +/// \brief Reset the counter +void reset(); + +/// \brief Stop the counter +void stop(); + +/// \brief How many regions containing `substr` have been seen +size_t get_count_region_contains(const std::string &substr); + +/// \brief Print all observed region labels, separated by newline +void dump_regions(std::ostream &os); +void dump_regions(Teuchos::FancyOStream &os); +} // namespace KokkosRegionCounter + +// clang-format off + + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp index 302301cd38da..aa6bc75753d0 100644 --- a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp @@ -76,6 +76,8 @@ class LeftScaleLocalCrsMatrix { static_assert (ScalingFactorsViewType::rank == 1, "scalingFactors must be a rank-1 Kokkos::View."); using device_type = typename LocalSparseMatrixType::device_type; + using LO = typename LocalSparseMatrixType::ordinal_type; + using policy_type = Kokkos::TeamPolicy; /// \param A_lcl [in/out] The local sparse matrix. /// @@ -94,11 +96,11 @@ class LeftScaleLocalCrsMatrix { {} KOKKOS_INLINE_FUNCTION void - operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const + operator () (const typename policy_type::member_type & team) const { - using LO = typename LocalSparseMatrixType::ordinal_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const mag_type curRowNorm = scalingFactors_(lclRow); // Users are responsible for any divisions or multiplications by // zero. @@ -106,14 +108,14 @@ class LeftScaleLocalCrsMatrix { KAM::sqrt (curRowNorm) : curRowNorm; auto curRow = A_lcl_.row (lclRow); const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { if (divide) { // constexpr, so should get compiled out curRow.value (k) = curRow.value(k) / scalingFactor; } else { curRow.value (k) = curRow.value(k) * scalingFactor; } - } + }); } private: @@ -145,7 +147,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, using device_type = typename LocalSparseMatrixType::device_type; using execution_space = typename device_type::execution_space; using LO = typename LocalSparseMatrixType::ordinal_type; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; const LO lclNumRows = A_lcl.numRows (); if (divide) { @@ -154,7 +156,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, true>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("leftScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } else { using functor_type = @@ -162,7 +164,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, false>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("leftScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } } diff --git a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp index 219888164161..a6a48e8b2f8e 100644 --- a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp @@ -76,6 +76,8 @@ class RightScaleLocalCrsMatrix { static_assert (ScalingFactorsViewType::rank == 1, "scalingFactors must be a rank-1 Kokkos::View."); using device_type = typename LocalSparseMatrixType::device_type; + using LO = typename LocalSparseMatrixType::ordinal_type; + using policy_type = Kokkos::TeamPolicy; /// \param A_lcl [in/out] The local sparse matrix. /// @@ -96,14 +98,14 @@ class RightScaleLocalCrsMatrix { {} KOKKOS_INLINE_FUNCTION void - operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const + operator () (const typename policy_type::member_type & team) const { - using LO = typename LocalSparseMatrixType::ordinal_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); auto curRow = A_lcl_.row (lclRow); const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { const LO lclColInd = curRow.colidx(k); const mag_type curColNorm = scalingFactors_(lclColInd); // Users are responsible for any divisions or multiplications by @@ -116,7 +118,7 @@ class RightScaleLocalCrsMatrix { else { curRow.value(k) = curRow.value(k) * scalingFactor; } - } + }); } private: @@ -148,7 +150,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, using device_type = typename LocalSparseMatrixType::device_type; using execution_space = typename device_type::execution_space; using LO = typename LocalSparseMatrixType::ordinal_type; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; const LO lclNumRows = A_lcl.numRows (); if (divide) { @@ -157,7 +159,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, true>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("rightScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } else { using functor_type = @@ -165,7 +167,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl, typename ScalingFactorsViewType::const_type, false>; functor_type functor (A_lcl, scalingFactors, assumeSymmetric); Kokkos::parallel_for ("rightScaleLocalCrsMatrix", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } } diff --git a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp index a526d4438f9e..872e7ea23a6e 100644 --- a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp +++ b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp @@ -50,6 +50,7 @@ /// Tpetra_computeRowAndColumnOneNorms_decl.hpp in this directory. #include "Tpetra_Details_copyConvert.hpp" +#include "Tpetra_Details_EquilibrationInfo.hpp" #include "Tpetra_CrsMatrix.hpp" #include "Tpetra_Export.hpp" #include "Tpetra_Map.hpp" @@ -310,6 +311,7 @@ class ComputeLocalRowScaledColumnNorms { using val_type = typename Kokkos::ArithTraits::val_type; using mag_type = typename Kokkos::ArithTraits::mag_type; using device_type = typename crs_matrix_type::device_type; + using policy_type = Kokkos::TeamPolicy; ComputeLocalRowScaledColumnNorms (const Kokkos::View& rowScaledColNorms, const Kokkos::View& rowNorms, @@ -319,18 +321,19 @@ class ComputeLocalRowScaledColumnNorms { A_lcl_ (A.getLocalMatrixDevice ()) {} - KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const { + KOKKOS_INLINE_FUNCTION void operator () (const typename policy_type::member_type &team) const { using KAT = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const auto curRow = A_lcl_.rowConst (lclRow); const mag_type rowNorm = rowNorms_[lclRow]; const LO numEnt = curRow.length; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) { const mag_type matrixAbsVal = KAT::abs (curRow.value(k)); const LO lclCol = curRow.colidx(k); Kokkos::atomic_add (&rowScaledColNorms_[lclCol], matrixAbsVal / rowNorm); - } + }); } static void @@ -339,14 +342,13 @@ class ComputeLocalRowScaledColumnNorms { const crs_matrix_type& A) { using execution_space = typename device_type::execution_space; - using range_type = Kokkos::RangePolicy; using functor_type = ComputeLocalRowScaledColumnNorms; functor_type functor (rowScaledColNorms, rowNorms, A); const LO lclNumRows = static_cast (A.getRowMap ()->getLocalNumElements ()); Kokkos::parallel_for ("computeLocalRowScaledColumnNorms", - range_type (0, lclNumRows), functor); + policy_type (lclNumRows, Kokkos::AUTO), functor); } private: @@ -409,6 +411,7 @@ class ComputeLocalRowOneNorms { using local_matrix_device_type = typename ::Tpetra::CrsMatrix::local_matrix_device_type; using local_map_type = typename ::Tpetra::Map::local_map_type; + using policy_type = Kokkos::TeamPolicy; ComputeLocalRowOneNorms (const equib_info_type& equib, // in/out const local_matrix_device_type& A_lcl, // in @@ -441,12 +444,13 @@ class ComputeLocalRowOneNorms { } KOKKOS_INLINE_FUNCTION void - operator () (const LO lclRow, value_type& dst) const + operator () (const typename policy_type::member_type& team, value_type& dst) const { using KAT = Kokkos::ArithTraits; using mag_type = typename KAT::mag_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const GO gblRow = rowMap_.getGlobalElement (lclRow); // OK if invalid(); then we simply won't find the diagonal entry. const GO lclDiagColInd = colMap_.getLocalElement (gblRow); @@ -456,33 +460,37 @@ class ComputeLocalRowOneNorms { mag_type rowNorm {0.0}; val_type diagVal {0.0}; + value_type dstThread {0}; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) { const val_type matrixVal = curRow.value (k); if (KAT::isInf (matrixVal)) { - dst |= 1; + dstContrib |= 1; } if (KAT::isNan (matrixVal)) { - dst |= 2; + dstContrib |= 2; } const mag_type matrixAbsVal = KAT::abs (matrixVal); - rowNorm += matrixAbsVal; + normContrib += matrixAbsVal; const LO lclCol = curRow.colidx (k); if (lclCol == lclDiagColInd) { - diagVal = curRow.value (k); // assume no repeats + diagContrib = curRow.value (k); // assume no repeats } - } // for each entry in row + }, Kokkos::Sum(rowNorm), Kokkos::Sum(diagVal), Kokkos::BOr(dstThread)); // for each entry in row // This is a local result. If the matrix has an overlapping // row Map, then the global result might differ. - if (diagVal == KAT::zero ()) { - dst |= 4; - } - if (rowNorm == KAM::zero ()) { - dst |= 8; - } - equib_.rowDiagonalEntries[lclRow] = diagVal; - equib_.rowNorms[lclRow] = rowNorm; + Kokkos::single(Kokkos::PerTeam(team), [&](){ + dst |= dstThread; + if (diagVal == KAT::zero ()) { + dst |= 4; + } + if (rowNorm == KAM::zero ()) { + dst |= 8; + } + equib_.rowDiagonalEntries[lclRow] = diagVal; + equib_.rowNorms[lclRow] = rowNorm; + }); } private: @@ -501,6 +509,7 @@ class ComputeLocalRowAndColumnOneNorms { using equib_info_type = EquilibrationInfo; using local_matrix_device_type = typename ::Tpetra::CrsMatrix::local_matrix_device_type; using local_map_type = typename ::Tpetra::Map::local_map_type; + using policy_type = Kokkos::TeamPolicy; public: ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib, // in/out @@ -534,12 +543,13 @@ class ComputeLocalRowAndColumnOneNorms { } KOKKOS_INLINE_FUNCTION void - operator () (const LO lclRow, value_type& dst) const + operator () (const typename policy_type::member_type& team, value_type& dst) const { using KAT = Kokkos::ArithTraits; using mag_type = typename KAT::mag_type; using KAM = Kokkos::ArithTraits; + const LO lclRow = team.league_rank(); const GO gblRow = rowMap_.getGlobalElement (lclRow); // OK if invalid(); then we simply won't find the diagonal entry. const GO lclDiagColInd = colMap_.getLocalElement (gblRow); @@ -549,46 +559,50 @@ class ComputeLocalRowAndColumnOneNorms { mag_type rowNorm {0.0}; val_type diagVal {0.0}; + value_type dstThread {0}; - for (LO k = 0; k < numEnt; ++k) { + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) { const val_type matrixVal = curRow.value (k); if (KAT::isInf (matrixVal)) { - dst |= 1; + dstContrib |= 1; } if (KAT::isNan (matrixVal)) { - dst |= 2; + dstContrib |= 2; } const mag_type matrixAbsVal = KAT::abs (matrixVal); - rowNorm += matrixAbsVal; + normContrib += matrixAbsVal; const LO lclCol = curRow.colidx (k); if (lclCol == lclDiagColInd) { - diagVal = curRow.value (k); // assume no repeats + diagContrib = curRow.value (k); // assume no repeats } if (! equib_.assumeSymmetric) { Kokkos::atomic_add (&(equib_.colNorms[lclCol]), matrixAbsVal); } - } // for each entry in row + }, Kokkos::Sum(rowNorm), Kokkos::Sum(diagVal), Kokkos::BOr(dstThread)); // for each entry in row // This is a local result. If the matrix has an overlapping // row Map, then the global result might differ. - if (diagVal == KAT::zero ()) { - dst |= 4; - } - if (rowNorm == KAM::zero ()) { - dst |= 8; - } - // NOTE (mfh 24 May 2018) We could actually compute local - // rowScaledColNorms in situ at this point, if ! assumeSymmetric - // and row Map is the same as range Map (so that the local row - // norms are the same as the global row norms). - equib_.rowDiagonalEntries[lclRow] = diagVal; - equib_.rowNorms[lclRow] = rowNorm; - if (! equib_.assumeSymmetric && - lclDiagColInd != Tpetra::Details::OrdinalTraits::invalid ()) { - // Don't need an atomic update here, since this lclDiagColInd is - // a one-to-one function of lclRow. - equib_.colDiagonalEntries[lclDiagColInd] += diagVal; - } + Kokkos::single(Kokkos::PerTeam(team), [&](){ + dst |= dstThread; + if (diagVal == KAT::zero ()) { + dst |= 4; + } + if (rowNorm == KAM::zero ()) { + dst |= 8; + } + // NOTE (mfh 24 May 2018) We could actually compute local + // rowScaledColNorms in situ at this point, if ! assumeSymmetric + // and row Map is the same as range Map (so that the local row + // norms are the same as the global row norms). + equib_.rowDiagonalEntries[lclRow] = diagVal; + equib_.rowNorms[lclRow] = rowNorm; + if (! equib_.assumeSymmetric && + lclDiagColInd != Tpetra::Details::OrdinalTraits::invalid ()) { + // Don't need an atomic update here, since this lclDiagColInd is + // a one-to-one function of lclRow. + equib_.colDiagonalEntries[lclDiagColInd] += diagVal; + } + }); } private: @@ -605,7 +619,7 @@ EquilibrationInfo::val_type, typename NT::devic computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix& A) { using execution_space = typename NT::device_type::execution_space; - using range_type = Kokkos::RangePolicy; + using policy_type = Kokkos::TeamPolicy; using functor_type = ComputeLocalRowOneNorms; using val_type = typename Kokkos::ArithTraits::val_type; using device_type = typename NT::device_type; @@ -621,7 +635,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix& A) A.getColMap ()->getLocalMap ()); int result = 0; Kokkos::parallel_reduce ("computeLocalRowOneNorms", - range_type (0, lclNumRows), functor, + policy_type (lclNumRows, Kokkos::AUTO), functor, result); equib.foundInf = (result & 1) != 0; equib.foundNan = (result & 2) != 0; @@ -638,7 +652,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix; + using policy_type = Kokkos::TeamPolicy; using functor_type = ComputeLocalRowAndColumnOneNorms; using val_type = typename Kokkos::ArithTraits::val_type; using device_type = typename NT::device_type; @@ -653,7 +667,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrixgetLocalMap ()); int result = 0; Kokkos::parallel_reduce ("computeLocalRowAndColumnOneNorms", - range_type (0, lclNumRows), functor, + policy_type (lclNumRows, Kokkos::AUTO), functor, result); equib.foundInf = (result & 1) != 0; equib.foundNan = (result & 2) != 0; diff --git a/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp b/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp index ae747e94194c..70e8472cf3d4 100644 --- a/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp +++ b/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp @@ -81,7 +81,7 @@ namespace { // (anonymous) typedef Kokkos::Device test_device_type; typedef Tpetra::KokkosCompat::KokkosHIPWrapperNode test_node_type; #elif defined(KOKKOS_ENABLE_SYCL) - typedef Kokkos::Device test_device_type; + typedef Kokkos::Device test_device_type; typedef Tpetra::KokkosCompat::KokkosSYCLWrapperNode test_node_type; #endif typedef Tpetra::Map<>::local_ordinal_type LO; diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt index 646ff80848a0..8c79f3c14bd3 100644 --- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt +++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt @@ -490,8 +490,29 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( STANDARD_PASS_OUTPUT ) +if ( + # supported TPLs + ( + (Tpetra_ENABLE_CUDA AND TPL_ENABLE_CUSPARSE ) OR + (Tpetra_ENABLE_HIP AND TPL_ENABLE_ROCSPARSE) + ) + AND + # supported type combos + ( + (Tpetra_INST_DOUBLE OR Tpetra_INST_FLOAT) + ) +) + TRIBITS_ADD_EXECUTABLE_AND_TEST( + CrsMatrix_ApplyUsesTPLs + SOURCES + CrsMatrix_ApplyUsesTPLs.cpp + ${TEUCHOS_STD_UNIT_TEST_MAIN} + COMM serial mpi + STANDARD_PASS_OUTPUT + ) +endif() SET(TIMING_INSTALLS "") diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp new file mode 100644 index 000000000000..dff7928d1f12 --- /dev/null +++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp @@ -0,0 +1,286 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#include "Tpetra_TestingUtilities.hpp" +#include "Tpetra_MultiVector.hpp" +#include "Tpetra_CrsMatrix.hpp" +#include "Tpetra_Details_KokkosCounter.hpp" + +// TODO: add test where some nodes have zero rows +// TODO: add test where non-"zero" graph is used to build matrix; if no values are added to matrix, the operator effect should be zero. This tests that matrix values are initialized properly. +// TODO: add test where dynamic profile initially has no allocation, then entries are added. this will test new view functionality. + +namespace Teuchos { + template <> + ScalarTraits::magnitudeType + relErr( const int &s1, const int &s2 ) + { + typedef ScalarTraits ST; + return ST::magnitude(s1-s2); + } + + template <> + ScalarTraits::magnitudeType + relErr( const char &s1, const char &s2 ) + { + typedef ScalarTraits ST; + return ST::magnitude(s1-s2); + } +} + +namespace { + + // no ScalarTraits<>::eps() for integer types + + template struct TestingTolGuts {}; + + template + struct TestingTolGuts { + static typename Teuchos::ScalarTraits::magnitudeType testingTol() + { return Teuchos::ScalarTraits::eps(); } + }; + + template + struct TestingTolGuts { + static typename Teuchos::ScalarTraits::magnitudeType testingTol() + { return 0; } + }; + + template + static typename Teuchos::ScalarTraits::magnitudeType testingTol() + { + return TestingTolGuts::hasMachineParameters>:: + testingTol(); + } + + using Tpetra::TestingUtilities::getDefaultComm; + + using std::endl; + using std::swap; + + using std::string; + + using Teuchos::as; + using Teuchos::FancyOStream; + using Teuchos::RCP; + using Teuchos::ArrayRCP; + using Teuchos::rcp; + using Teuchos::arcp; + using Teuchos::outArg; + using Teuchos::arcpClone; + using Teuchos::arrayView; + using Teuchos::broadcast; + using Teuchos::OrdinalTraits; + using Teuchos::ScalarTraits; + using Teuchos::Comm; + using Teuchos::Array; + using Teuchos::ArrayView; + using Teuchos::tuple; + using Teuchos::null; + using Teuchos::VERB_NONE; + using Teuchos::VERB_LOW; + using Teuchos::VERB_MEDIUM; + using Teuchos::VERB_HIGH; + using Teuchos::VERB_EXTREME; + using Teuchos::ETransp; + using Teuchos::NO_TRANS; + using Teuchos::TRANS; + using Teuchos::CONJ_TRANS; + using Teuchos::EDiag; + using Teuchos::UNIT_DIAG; + using Teuchos::NON_UNIT_DIAG; + using Teuchos::EUplo; + using Teuchos::UPPER_TRI; + using Teuchos::LOWER_TRI; + using Teuchos::ParameterList; + using Teuchos::parameterList; + + using Tpetra::Map; + using Tpetra::MultiVector; + using Tpetra::Vector; + using Tpetra::Operator; + using Tpetra::CrsMatrix; + using Tpetra::CrsGraph; + using Tpetra::RowMatrix; + using Tpetra::Import; + using Tpetra::global_size_t; + using Tpetra::createContigMapWithNode; + using Tpetra::createLocalMapWithNode; + using Tpetra::createVector; + using Tpetra::OptimizeOption; + using Tpetra::DoOptimizeStorage; + using Tpetra::DoNotOptimizeStorage; + using Tpetra::GloballyDistributed; + using Tpetra::INSERT; + + // + // UNIT TESTS + // + + //// + TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonSquare, LO, GO, Scalar, Node ) + { + + // skip test if Scalar is not (float or double) + if constexpr (!(std::is_same_v || std::is_same_v)) { + out << "SKIP: unsupported scalar type" << std::endl; + TEST_EQUALITY_CONST(1,1); // SKIP + return; + } + // skip test if LO != int + if constexpr (!std::is_same_v) { + out << "SKIP: unsupported local ordinal type" << std::endl; + TEST_EQUALITY_CONST(1,1); // SKIP + return; + } + // skip test if CUDA enables and not CUDA space + #if defined(HAVE_TPETRA_CUDA) || defined(HAVE_TPETRACORE_CUDA) + if constexpr (!std::is_same_v) { + out << "SKIP: non-CUDA exec space" << std::endl; + TEST_EQUALITY_CONST(1,1); // SKIP + return; + } + #endif + // skip test if HIP enabled and not HIP space + #if defined(HAVE_TPETRA_HIP) || defined(HAVE_TPETRACORE_HIP) + if constexpr (!std::is_same_v) { + out << "SKIP: non-HIP exec space" << std::endl; + TEST_EQUALITY_CONST(1,1); // SKIP + return; + } + #endif + + typedef CrsMatrix MAT; + typedef MultiVector MV; + typedef Map map_type; + const global_size_t INVALID = OrdinalTraits::invalid(); + // get a comm + RCP > comm = getDefaultComm(); + const int M = 3; + const int P = 5; + const int N = comm->getSize(); + const int myImageID = comm->getRank(); + // create Maps + // matrix is M*N-by-P + // col + // 0 1 P-1 + // 0 [0 MN ... (P-1)MN ] + // . [... ... ... ] + // 0 [M-1 MN+M-1 (P-1)MN+M-1 ] + //p 1 [M MN+M ] + //r . [... ... ] = [A_ij], where A_ij = i+jMN + //o 1 [2M-1 MN+2M-1 ] + //c . [... ] + // N-1 [(N-1)M MN+(N-1)(M-1) ] + // . [... ... ] + // N-1 [MN-1 MN+MN-1 ] + // + // row map, range map is [0,M-1] [M,2M-1] [2M,3M-1] ... [MN-M,MN-1] + // domain map will be map for X (lclmap) + // + // input multivector X is not distributed: + // + // X = [ 0 P ... (numVecs-1)P ] + // [ ... .... ... ... ] = [X_ji], where X_ij = i+jP + // [ P-1 2P-1 ... numVecs*P-1 ] + // + // the result of the non-transpose multiplication should be + // P-1 + // (A*X)_ij = sum_k A_ik X_kj = sum (i+kMN)(k+jP) = jiP^2 + (i+jMNP)(P^2-P)/2 + MNP(P-1)(2P-1)/6 + // k=0 + // + // + // + const int numVecs = 3; + RCP rowmap (new map_type (INVALID, M, 0, comm)); + RCP lclmap = createLocalMapWithNode (P, comm); + + // create the matrix + MAT A(rowmap,P); + for (GO i=0; i(M); ++i) { + for (GO j=0; j(P); ++j) { + A.insertGlobalValues( M*myImageID+i, tuple(j), tuple(M*myImageID+i + j*M*N) ); + } + } + // call fillComplete() + A.fillComplete(lclmap,rowmap); + // build the input multivector X + MV X(lclmap,numVecs); + for (GO i=0; i(P); ++i) { + for (GO j=0; j(numVecs); ++j) { + X.replaceGlobalValue(i,j,static_cast(i+j*P)); + } + } + // allocate output multivec + MV Bout(rowmap,numVecs); + // test the action + Bout.randomize(); + Tpetra::Details::KokkosRegionCounter::reset(); + Tpetra::Details::KokkosRegionCounter::start(); + A.apply(X,Bout); + Tpetra::Details::KokkosRegionCounter::stop(); + + TEST_COMPARE(Tpetra::Details::KokkosRegionCounter::get_count_region_contains("spmv[TPL_"), ==, 1); + + using Teuchos::outArg; + using Teuchos::REDUCE_MIN; + using Teuchos::reduceAll; + const int lclSuccess = success ? 1 : 0; + int gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY_CONST( gblSuccess, 1 ); + if (gblSuccess != 1) { + out << "KokkosKernels TPL use was not detected where it was expected!" << endl; + } + } + +// +// INSTANTIATIONS +// +#define UNIT_TEST_GROUP( SCALAR, LO, GO, NODE ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, NonSquare, LO, GO, SCALAR, NODE ) + + TPETRA_ETI_MANGLING_TYPEDEFS() + + TPETRA_INSTANTIATE_SLGN( UNIT_TEST_GROUP ) +} diff --git a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp index 3011db577316..b5483a86bd80 100644 --- a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp +++ b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp @@ -89,21 +89,26 @@ bool cg_solve (Teuchos::RCP A, Teuchos::RCP b, Teuchos::RCPupdate(1.0,*x,0.0,*x,0.0); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(matvecTimerName)); A->apply(*p, *Ap); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName)); r->update(1.0,*b,-1.0,*Ap,0.0); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName)); rtrans = r->dot(*r); + Kokkos::fence(); } normr = std::sqrt(rtrans); @@ -118,17 +123,20 @@ bool cg_solve (Teuchos::RCP A, Teuchos::RCP b, Teuchos::RCPupdate(1.0,*r,0.0); + Kokkos::fence(); } else { oldrtrans = rtrans; { TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName)); rtrans = r->dot(*r); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName)); magnitude_type beta = rtrans/oldrtrans; p->update(1.0,*r,beta); + Kokkos::fence(); } } normr = std::sqrt(rtrans); @@ -141,10 +149,12 @@ bool cg_solve (Teuchos::RCP A, Teuchos::RCP b, Teuchos::RCPapply(*p, *Ap); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName)); p_ap_dot = Ap->dot(*p); + Kokkos::fence(); } { TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName)); @@ -158,11 +168,14 @@ bool cg_solve (Teuchos::RCP A, Teuchos::RCP b, Teuchos::RCPupdate(alpha,*p,1.0); r->update(-alpha,*Ap,1.0); + Kokkos::fence(); } } { + Kokkos::fence(); TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName)); rtrans = r->dot(*r); + Kokkos::fence(); } normr = std::sqrt(rtrans); @@ -267,6 +280,11 @@ int run() // On output, it is the approximate solution. RCP x (new vec_type (A->getDomainMap ())); + // Untimed warm-up apply + A->apply(*b, *x); + // Zero out x again + x->putScalar(0); + // Solve the linear system Ax=b using CG. RCP timer = rcp(new StackedTimer("CG: global")); TimeMonitor::setStackedTimer(timer);