diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
index 1141f4e8f35a..1aa6c7249eb5 100644
--- a/.github/workflows/clang_format.yml
+++ b/.github/workflows/clang_format.yml
@@ -2,8 +2,10 @@ name: Check packages with clang-format
 
 on: [pull_request]
 
-permissions: read-all
-
+permissions:
+  contents: read
+  issues: write
+  
 jobs:
   build:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index fd4b38be7479..c97fa64481e6 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -62,7 +62,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5
+      uses: github/codeql-action/init@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6
       with:
         languages: ${{ matrix.language }}
         build-mode: ${{ matrix.build-mode }}
@@ -85,6 +85,6 @@ jobs:
           make -j 2
       
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5
+      uses: github/codeql-action/analyze@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6
       with:
         category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
index 6ad35f1f80fe..7141d0933730 100644
--- a/.github/workflows/dependency-review.yml
+++ b/.github/workflows/dependency-review.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Harden Runner
-        uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1
+        uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0
         with:
           egress-policy: audit
 
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 79be3dec93a5..f6cefad9914b 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -66,6 +66,6 @@ jobs:
 
       # Upload the results to GitHub's code scanning dashboard.
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@b7cec7526559c32f1616476ff32d17ba4c59b2d6 # v3.25.5
+        uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6
         with:
           sarif_file: results.sarif
diff --git a/packages/amesos2/src/Amesos2_Tacho_decl.hpp b/packages/amesos2/src/Amesos2_Tacho_decl.hpp
index 44eaa41e5d70..ec52b7775b98 100644
--- a/packages/amesos2/src/Amesos2_Tacho_decl.hpp
+++ b/packages/amesos2/src/Amesos2_Tacho_decl.hpp
@@ -227,6 +227,7 @@ class TachoSolver : public SolverCore<Amesos2::TachoSolver, Matrix, Vector>
     // TODO: Implement the paramter options - confirm which we want and which have been implemented
     int method;
     int variant;
+    int small_problem_threshold_size;
     // int num_kokkos_threads;
     // int max_num_superblocks;
   } data_;
diff --git a/packages/amesos2/src/Amesos2_Tacho_def.hpp b/packages/amesos2/src/Amesos2_Tacho_def.hpp
index 5fe53a7a4f71..d2519c8e5b3e 100644
--- a/packages/amesos2/src/Amesos2_Tacho_def.hpp
+++ b/packages/amesos2/src/Amesos2_Tacho_def.hpp
@@ -107,6 +107,7 @@ TachoSolver<Matrix,Vector>::symbolicFactorization_impl()
 
     data_.solver.setSolutionMethod(data_.method);
     data_.solver.setLevelSetOptionAlgorithmVariant(data_.variant);
+    data_.solver.setSmallProblemThresholdsize(data_.small_problem_threshold_size);
 
     // TODO: Confirm param options
     // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks);
@@ -247,6 +248,8 @@ TachoSolver<Matrix,Vector>::setParameters_impl(const Teuchos::RCP<Teuchos::Param
   }
   // solver type
   data_.variant = parameterList->get<int> ("variant", 2);
+  // small problem threshold
+  data_.small_problem_threshold_size = parameterList->get<int> ("small problem threshold size", 1024);
   // TODO: Confirm param options
   // data_.num_kokkos_threads = parameterList->get<int>("kokkos-threads", 1);
   // data_.max_num_superblocks = parameterList->get<int>("max-num-superblocks", 4);
@@ -264,6 +267,7 @@ TachoSolver<Matrix,Vector>::getValidParameters_impl() const
 
     pl->set("method", "chol", "Type of factorization, chol, ldl, or lu");
     pl->set("variant", 2, "Type of solver variant, 0, 1, or 2");
+    pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK.");
 
     // TODO: Confirm param options
     // pl->set("kokkos-threads", 1, "Number of threads");
diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini
index 0d7196b38293..f917ccefa47a 100644
--- a/packages/framework/ini-files/config-specs.ini
+++ b/packages/framework/ini-files/config-specs.ini
@@ -2046,7 +2046,7 @@ use PACKAGE-ENABLES|ALL
 use rhel7_sems-clang-11.0.1-openmpi-1.10.1-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
 use PACKAGE-ENABLES|ALL
 
-[rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables]
+[rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables]
 # uses sems-v2 modules
 use RHEL7_SEMS_COMPILER|CLANG
 use NODE-TYPE|SERIAL
@@ -2067,11 +2067,17 @@ opt-set-cmake-var Teko_DISABLE_LSCSTABALIZED_TPETRA_ALPAH_INV_D BOOL       : ON
 
 use RHEL7_TEST_DISABLES|CLANG
 
+opt-set-cmake-var SuperLU_LIBRARY_NAMES STRING : superlu;m
+opt-set-cmake-var ML_ENABLE_SuperLU BOOL FORCE : OFF
+
+opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE BOOL : ON
+opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE BOOL : ON
+
 use RHEL7_POST
 
-[rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_all]
+[rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_all]
 # uses sems-v2 modules
-use rhel7_sems-clang-11.0.1-openmpi-1.10.7-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
+use rhel8_sems-clang-11.0.1-openmpi-4.0.5-serial_release-debug_shared_no-kokkos-arch_no-asan_no-complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables
 use PACKAGE-ENABLES|ALL
 
 [rhel7_sems-intel-19.0.5-mpich-3.2-serial_release-debug_static_no-kokkos-arch_no-asan_no-complex_fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables]
diff --git a/packages/framework/ini-files/supported-envs.ini b/packages/framework/ini-files/supported-envs.ini
index b543e510fccb..d3a27cfcb82e 100644
--- a/packages/framework/ini-files/supported-envs.ini
+++ b/packages/framework/ini-files/supported-envs.ini
@@ -178,6 +178,7 @@ gcc-openmpi
 gcc-serial
 aue-gcc-openmpi
 sems-cuda-11.4.2-sems-gnu-10.1.0-sems-openmpi-4.1.4
+sems-clang-11.0.1-openmpi-4.0.5-serial
 
 [ats2]
 cuda-11.2.152-gnu-8.3.1-spmpi-rolling
diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py
index bb038e3e1926..54cbcd1227f3 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationBase.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 This file contains the base class for the Pull Request test driver.
diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py
index 82ab88a015fc..ba2c87c9a576 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationInstallation.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 Custom PR Executor for Installation testing
diff --git a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py
index 0bd0d7071950..cb3e5af13552 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/TrilinosPRConfigurationStandard.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 Custom PR Executor for Standard testing
diff --git a/packages/framework/pr_tools/trilinosprhelpers/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/__init__.py
index 1b8eb9136cdb..d2718d766803 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 
 from .sysinfo import *
diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py
index ff6bd5831f33..82ad13c414ed 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/GitUtility.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 This class contains a set of utilities for using and querying
diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py
index 36f13ba9b20b..b6bf6bef5324 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 This file is required to enable Python to locate the package files in the parent
diff --git a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py
index 5dbd6f3497b3..3390974955e5 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/gitutility/unittests/test_GitUtility.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py
index 38067dca1d85..55655574d102 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/EnvvarHelper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py
index 2aaeba25e3fe..9e328b779f80 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/JenkinsEnv.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 from .EnvvarHelper import EnvvarHelper
 
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py
index 95b4469742e6..eb35e6fa4374 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/TrilinosJenkinsEnv.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 from .JenkinsEnv import JenkinsEnv
 
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py
index f83d145c1517..fe4cf1d0cbef 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 
 from .EnvvarHelper import EnvvarHelper
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py
index 36f13ba9b20b..b6bf6bef5324 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 This file is required to enable Python to locate the package files in the parent
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py
index 430f27175874..5ece6c0c6b48 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_EnvvarHelper.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py
index e4b8e19bff6b..ead3e02e444e 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_JenkinsEnv.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py
index e4b9e7a2613f..6a8844c406d4 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/jenkinsenv/unittests/test_TrilinosJenkinsEnv.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py
index 839f846900e4..60334dd1526d 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/SysInfo.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 System information helpers are contained in this file.
diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py
index 36f13ba9b20b..b6bf6bef5324 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 This file is required to enable Python to locate the package files in the parent
diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py
index 45fad390822d..74e1e0deb7b2 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_SysInfo.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py
index 4d6bf4f0d47d..8f5ea0df155c 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/sysinfo/unittests/test_gpu_utils.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py
index 36f13ba9b20b..b6bf6bef5324 100644
--- a/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/__init__.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 This file is required to enable Python to locate the package files in the parent
diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py
index ee90c1dd431e..6f933f5d751a 100755
--- a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationBase.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py
index 09c311c156a6..8d6b71abef7c 100755
--- a/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py
+++ b/packages/framework/pr_tools/trilinosprhelpers/unittests/test_TrilinosPRConfigurationInstallation.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8; mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 """
 """
diff --git a/packages/framework/pr_tools/unittests/__init__.py b/packages/framework/pr_tools/unittests/__init__.py
index efdebb2e61f8..9f023287705a 100644
--- a/packages/framework/pr_tools/unittests/__init__.py
+++ b/packages/framework/pr_tools/unittests/__init__.py
@@ -1,3 +1,3 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- mode: python; py-indent-offset: 4; py-continuation-offset: 4 -*-
 
diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp
index 85bb469a4bf1..c51a3868fb7f 100644
--- a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp
@@ -238,7 +238,8 @@ namespace Ifpack2 {
 
       // block crs graph information
       // for cuda (kokkos crs graph uses a different size_type from size_t)
-      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
+      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
+      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
       const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
 
       // blocksize
@@ -252,19 +253,22 @@ namespace Ifpack2 {
       const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
       const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
       const bool is_dm2cm_active;
+      const bool hasBlockCrsMatrix;
 
     public:
       template<typename LocalCrsGraphType>
       ComputeResidualVector(const AmD<MatrixType> &amd,
-                            const LocalCrsGraphType &graph,
+                            const LocalCrsGraphType &block_graph,
+                            const LocalCrsGraphType &point_graph,
                             const local_ordinal_type &blocksize_requested_,
                             const PartInterface<MatrixType> &interf,
                             const local_ordinal_type_1d_view &dm2cm_)
         : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
           colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
           tpetra_values(amd.tpetra_values),
-          A_rowptr(graph.row_map),
-          A_colind(graph.entries),
+          A_block_rowptr(block_graph.row_map),
+          A_point_rowptr(point_graph.row_map),
+          A_colind(block_graph.entries),
           blocksize_requested(blocksize_requested_),
           part2packrowidx0(interf.part2packrowidx0),
           part2rowidx0(interf.part2rowidx0),
@@ -272,8 +276,32 @@ namespace Ifpack2 {
           partptr(interf.partptr),
           lclrow(interf.lclrow),
           dm2cm(dm2cm_),
-          is_dm2cm_active(dm2cm_.span() > 0)
-      {}
+          is_dm2cm_active(dm2cm_.span() > 0),
+          hasBlockCrsMatrix(A_block_rowptr.extent(0) == A_point_rowptr.extent(0))
+      { }
+
+      inline
+      void
+      SerialDot(const local_ordinal_type &blocksize,
+                const local_ordinal_type &lclRowID,
+                const local_ordinal_type &lclColID,
+                const local_ordinal_type &ii,
+                const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
+                const impl_scalar_type * const KOKKOS_RESTRICT xx,
+                /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
+        const size_type Aj_c = colindsub_(lclColID);
+        auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
+        impl_scalar_type val = 0;
+#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#   pragma ivdep
+#endif
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#   pragma unroll
+#endif
+        for (local_ordinal_type k1=0;k1<blocksize;++k1)
+          val += tpetra_values(point_row_offset + k1) * xx[k1];
+        yy[ii] -= val;
+      }
 
       inline
       void
@@ -329,6 +357,28 @@ namespace Ifpack2 {
           });
       }
 
+      template<typename xxViewType, typename yyViewType>
+      KOKKOS_INLINE_FUNCTION
+      void
+      VectorDot(const member_type &member,
+                 const local_ordinal_type &blocksize,
+                 const local_ordinal_type &lclRowID,
+                 const local_ordinal_type &lclColID,
+                 const local_ordinal_type &ii,
+                 const ConstUnmanaged<local_ordinal_type_1d_view>colindsub_,
+                 const xxViewType &xx,
+                 const yyViewType &yy) const {
+        const size_type Aj_c = colindsub_(lclColID);
+        auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
+        impl_scalar_type val = 0;
+        Kokkos::parallel_for
+          (Kokkos::ThreadVectorRange(member, blocksize),
+           [&](const local_ordinal_type &k1) {
+              val += tpetra_values(point_row_offset + k1) *xx(k1);
+          });
+        Kokkos::atomic_fetch_add(&yy(ii), typename yyViewType::const_value_type(-val));
+      }
+
       template<typename AAViewType, typename xxViewType, typename yyViewType>
       KOKKOS_INLINE_FUNCTION
       void
@@ -387,12 +437,18 @@ namespace Ifpack2 {
           memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[i];
+          const size_type A_k0 = A_block_rowptr[i];
           for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
             const size_type j = A_k0 + colindsub[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
             const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
-            SerialGemv(blocksize,AA,xx,yy);
+            if (hasBlockCrsMatrix) {
+              const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+              SerialGemv(blocksize,AA,xx,yy);
+            }
+            else {
+              for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
+            }
           }
         }
       }
@@ -413,7 +469,7 @@ namespace Ifpack2 {
         auto bb = Kokkos::subview(b, block_range, 0);
         auto xx = bb;
         auto yy = Kokkos::subview(y, block_range, 0);
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
 
         const local_ordinal_type row = lr*blocksize;
         for (local_ordinal_type col=0;col<num_vectors;++col) {
@@ -425,15 +481,30 @@ namespace Ifpack2 {
           member.team_barrier();
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-              xx.assign_data( &x(A_colind[j]*blocksize, col) );
-              VectorGemv(member, blocksize, A_block, xx, yy);
-            });
+          const size_type A_k0 = A_block_rowptr[lr];
+
+          if(hasBlockCrsMatrix) {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+              [&](const local_ordinal_type &k) {
+                const size_type j = A_k0 + colindsub[k];
+                xx.assign_data( &x(A_colind[j]*blocksize, col) );
+                A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
+                VectorGemv(member, blocksize, A_block_cst, xx, yy);
+              });
+          }
+          else {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+              [&](const local_ordinal_type &k) {
+
+                const size_type j = A_k0 + colindsub[k];
+                xx.assign_data( &x(A_colind[j]*blocksize, col) );
+
+                for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                  VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
+              });
+          }
         }
       }
 
@@ -466,19 +537,35 @@ namespace Ifpack2 {
           memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
+          const size_type A_k0 = A_block_rowptr[lr];
           for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
             const size_type j = A_k0 + colindsub[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
             const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (A_colind_at_j < num_local_rows) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              SerialGemv(blocksize, AA,xx,yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              SerialGemv(blocksize, AA,xx_remote,yy);
+            if (hasBlockCrsMatrix) {
+              const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+              
+              if (A_colind_at_j < num_local_rows) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                const impl_scalar_type * const xx = &x(loc*blocksize, col);
+                SerialGemv(blocksize, AA,xx,yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+                SerialGemv(blocksize, AA,xx_remote,yy);
+              }
+            }
+            else {
+              if (A_colind_at_j < num_local_rows) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                const impl_scalar_type * const xx = &x(loc*blocksize, col);
+                for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                  SerialDot(blocksize, lr, k, k0, colindsub, xx, yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+                for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                  SerialDot(blocksize, lr, k, k0, colindsub, xx_remote, yy);
+              }
             }
           }
           // move yy to y_packed
@@ -511,7 +598,7 @@ namespace Ifpack2 {
         subview_1D_right_t xx_remote(nullptr, blocksize);
         using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
         subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
 
         const local_ordinal_type lr = lclrow(rowidx);
         const local_ordinal_type row = lr*blocksize;
@@ -524,24 +611,46 @@ namespace Ifpack2 {
           member.team_barrier();
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-
-              const local_ordinal_type A_colind_at_j = A_colind[j];
-              if (A_colind_at_j < num_local_rows) {
-                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-                xx.assign_data( &x(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx, yy);
-              } else {
-                const auto loc = A_colind_at_j - num_local_rows;
-                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx_remote, yy);
-              }
-            });
+          const size_type A_k0 = A_block_rowptr[lr];
+          if(hasBlockCrsMatrix) {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+              [&](const local_ordinal_type &k) {
+                const size_type j = A_k0 + colindsub[k];
+                A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
+
+                const local_ordinal_type A_colind_at_j = A_colind[j];
+                if (A_colind_at_j < num_local_rows) {
+                  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                  xx.assign_data( &x(loc*blocksize, col) );
+                  VectorGemv(member, blocksize, A_block_cst, xx, yy);
+                } else {
+                  const auto loc = A_colind_at_j - num_local_rows; 
+                  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                  VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
+                }
+              });
+          }
+          else {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+              [&](const local_ordinal_type &k) {
+                const size_type j = A_k0 + colindsub[k];
+
+                const local_ordinal_type A_colind_at_j = A_colind[j];
+                if (A_colind_at_j < num_local_rows) {
+                  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                  xx.assign_data( &x(loc*blocksize, col) );
+                  for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                    VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
+                } else {
+                  const auto loc = A_colind_at_j - num_local_rows; 
+                  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                  for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                    VectorDot(member, blocksize, lr, k, k0, colindsub, xx_remote, yy);
+                }
+              });
+          }
         }
       }
 
@@ -581,19 +690,34 @@ namespace Ifpack2 {
           }
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
+          const size_type A_k0 = A_block_rowptr[lr];
           for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
             const size_type j = A_k0 + colindsub_used[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
             const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (P == 0) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              SerialGemv(blocksize,AA,xx,yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              SerialGemv(blocksize,AA,xx_remote,yy);
+            if (hasBlockCrsMatrix) {
+              const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+              if (P == 0) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                const impl_scalar_type * const xx = &x(loc*blocksize, col);
+                SerialGemv(blocksize,AA,xx,yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+                SerialGemv(blocksize,AA,xx_remote,yy);
+              }
+            }
+            else {
+              if (P == 0) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                const impl_scalar_type * const xx = &x(loc*blocksize, col);
+                for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+                for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
+              }
             }
           }
           // move yy to y_packed
@@ -631,7 +755,7 @@ namespace Ifpack2 {
         subview_1D_right_t xx_remote(nullptr, blocksize);
         using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
         subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
         auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
         auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
 
@@ -648,24 +772,46 @@ namespace Ifpack2 {
           }
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub_used[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-
-              const local_ordinal_type A_colind_at_j = A_colind[j];
-              if (P == 0) {
-                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-                xx.assign_data( &x(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx, yy);
-              } else {
-                const auto loc = A_colind_at_j - num_local_rows;
-                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx_remote, yy);
-              }
-            });
+          const size_type A_k0 = A_block_rowptr[lr];
+          if(hasBlockCrsMatrix) {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
+              [&](const local_ordinal_type &k) {
+                const size_type j = A_k0 + colindsub_used[k];
+                A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
+
+                const local_ordinal_type A_colind_at_j = A_colind[j];
+                if (P == 0) {
+                  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                  xx.assign_data( &x(loc*blocksize, col) );
+                  VectorGemv(member, blocksize, A_block_cst, xx, yy);
+                } else {
+                  const auto loc = A_colind_at_j - num_local_rows;
+                  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                  VectorGemv(member, blocksize, A_block_cst, xx_remote, yy);
+                }
+              });
+          }
+          else {
+            Kokkos::parallel_for
+              (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
+              [&](const local_ordinal_type &k) {
+                const size_type j = A_k0 + colindsub_used[k];
+
+                const local_ordinal_type A_colind_at_j = A_colind[j];
+                if (P == 0) {
+                  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                  xx.assign_data( &x(loc*blocksize, col) );
+                  for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                    VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
+                } else {
+                  const auto loc = A_colind_at_j - num_local_rows;
+                  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                  for (local_ordinal_type k0=0;k0<blocksize;++k0)
+                    VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
+                }
+              });
+          }
         }
       }
 
diff --git a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp
index db3bddac1a7f..9814501c85bf 100644
--- a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp
@@ -335,6 +335,8 @@ namespace Ifpack2 {
       typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
       typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
       typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
+      typedef Tpetra::CrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_crs_matrix_type;
+      typedef Tpetra::CrsGraph<local_ordinal_type,global_ordinal_type,node_type> tpetra_crs_graph_type;
       typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
       typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
       typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
@@ -375,6 +377,7 @@ namespace Ifpack2 {
       typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
       typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
       typedef Kokkos::View<internal_vector_type*****,Kokkos::LayoutRight,device_type> internal_vector_type_5d_view;
+      typedef Kokkos::View<btdm_scalar_type**,Kokkos::LayoutRight,device_type> btdm_scalar_type_2d_view;
       typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
       typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
       typedef Kokkos::View<btdm_scalar_type*****,Kokkos::LayoutRight,device_type> btdm_scalar_type_5d_view;
diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
index 21c15108496f..30c9d79f1ea8 100644
--- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp
@@ -179,6 +179,7 @@ getValidParameters () const
   validParams->set("partitioner: subparts per part", 1);
   validParams->set("partitioner: block size", -1);
   validParams->set("partitioner: print level", false);
+  validParams->set("partitioner: explicit convert to BlockCrs", false);
 
   return validParams;
 }
@@ -633,14 +634,23 @@ initialize ()
       Teuchos::rcp_dynamic_cast<const block_crs_matrix_type> (A_);
     hasBlockCrsMatrix_ = !A_bcrs.is_null();
 
+    Teuchos::RCP<const row_graph_type> graph = A_->getGraph ();
+
     if(!hasBlockCrsMatrix_ && List_.isParameter("relaxation: container") && List_.get<std::string>("relaxation: container") == "BlockTriDi" ) {
       TEUCHOS_FUNC_TIME_MONITOR("Ifpack2::BlockRelaxation::initialize::convertToBlockCrsMatrix");
       int block_size = List_.get<int>("partitioner: block size");
+      bool use_explicit_conversion = List_.isParameter("partitioner: explicit convert to BlockCrs") && List_.get<bool>("partitioner: explicit convert to BlockCrs");
       TEUCHOS_TEST_FOR_EXCEPT_MSG
-        (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs.");
-      A_bcrs = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_), block_size, false);
-      A_ = A_bcrs;
-      hasBlockCrsMatrix_ = true;
+        (use_explicit_conversion && block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs.");
+      if(use_explicit_conversion) {
+        A_bcrs = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_), block_size, false);
+        A_ = A_bcrs;
+        hasBlockCrsMatrix_ = true;
+        graph = A_->getGraph ();
+      }
+      else {
+        graph = Tpetra::getBlockCrsGraph(*Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_), block_size, true);
+      }
       Kokkos::DefaultExecutionSpace().fence();
     }
 
@@ -655,19 +665,19 @@ initialize ()
 
     if (PartitionerType_ == "linear") {
       Partitioner_ =
-        rcp (new Ifpack2::LinearPartitioner<row_graph_type> (A_->getGraph ()));
+        rcp (new Ifpack2::LinearPartitioner<row_graph_type> (graph));
     } else if (PartitionerType_ == "line") {
       Partitioner_ =
-        rcp (new Ifpack2::LinePartitioner<row_graph_type,typename MatrixType::scalar_type> (A_->getGraph ()));
+        rcp (new Ifpack2::LinePartitioner<row_graph_type,typename MatrixType::scalar_type> (graph));
     } else if (PartitionerType_ == "user") {
       Partitioner_ =
-        rcp (new Ifpack2::Details::UserPartitioner<row_graph_type> (A_->getGraph () ) );
+        rcp (new Ifpack2::Details::UserPartitioner<row_graph_type> (graph ) );
     } else if (PartitionerType_ == "zoltan2") {
       #if defined(HAVE_IFPACK2_ZOLTAN2)
-      if (A_->getGraph ()->getComm ()->getSize () == 1) {
+      if (graph->getComm ()->getSize () == 1) {
         // Only one MPI, so call zoltan2 with global graph
         Partitioner_ =
-          rcp (new Ifpack2::Zoltan2Partitioner<row_graph_type> (A_->getGraph ()) );
+          rcp (new Ifpack2::Zoltan2Partitioner<row_graph_type> (graph) );
       } else {
         // Form local matrix to get local graph for calling zoltan2
         Teuchos::RCP<const row_matrix_type> A_local = rcp (new LocalFilter<row_matrix_type> (A_));
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
index e72565179cc4..067f85e34b04 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
@@ -235,7 +235,8 @@ namespace Ifpack2 {
                          const int n_subparts_per_part = 1,
                          bool overlapCommAndComp = false, 
                          bool useSequentialMethod = false,
-                         const int block_size = -1);
+                         const int block_size = -1,
+                         const bool explicitConversion = false);
 
     //! Destructor (declared virtual for memory safety of derived classes).
     ~BlockTriDiContainer () override;
@@ -397,13 +398,15 @@ namespace Ifpack2 {
     // hide details of impl using ImplObj; finally I understand why AMB did that way.
     Teuchos::RCP<BlockTriDiContainerDetails::ImplObject<MatrixType> > impl_;
     int n_subparts_per_part_;
+    int block_size_ = -1;
     
     // initialize distributed and local objects
     void initInternal (const Teuchos::RCP<const row_matrix_type>& matrix,
                        const Teuchos::RCP<const import_type> &importer,
                        const bool overlapCommAndComp,
                        const bool useSeqMethod,
-                       const int block_size = -1);
+                       const int block_size = -1,
+                       const bool explicitConversion = false);
 
     void clearInternal();
   };
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
index 4c872368a43c..dcbe1ee76fce 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
@@ -83,7 +83,8 @@ namespace Ifpack2 {
                   const Teuchos::RCP<const import_type>& importer,
                   const bool overlapCommAndComp,
                   const bool useSeqMethod,
-                  const int block_size) 
+                  const int block_size,
+                  const bool explicitConversion) 
   {
     IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initInternal");
 
@@ -99,16 +100,21 @@ namespace Ifpack2 {
 
     {
       IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA");
-      impl_->A = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(matrix);
-      if (impl_->A.is_null()) {
-        TEUCHOS_TEST_FOR_EXCEPT_MSG
-          (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs.");
-        {
-          IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA::convertToBlockCrsMatrix");
-          impl_->A = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast<const crs_matrix_type>(matrix), block_size, false);
-          IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+      if (explicitConversion) {
+        impl_->A = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(matrix);
+        if (impl_->A.is_null()) {
+          TEUCHOS_TEST_FOR_EXCEPT_MSG
+            (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs.");
+          {
+            IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA::convertToBlockCrsMatrix");
+            impl_->A = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast<const crs_matrix_type>(matrix), block_size, false);
+            IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+          }
         }
       }
+      else {
+        impl_->A = matrix;
+      }
       IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
 
@@ -197,6 +203,7 @@ namespace Ifpack2 {
     const bool overlapCommAndComp = false;
     initInternal(matrix, importer, overlapCommAndComp, useSeqMethod);
     n_subparts_per_part_ = -1;
+    block_size_ = -1;
     IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
@@ -207,12 +214,14 @@ namespace Ifpack2 {
                        const int n_subparts_per_part,
                        const bool overlapCommAndComp, 
                        const bool useSeqMethod,
-                       const int block_size)
+                       const int block_size,
+                       const bool explicitConversion)
     : Container<MatrixType>(matrix, partitions, false), partitions_(partitions)
   {
     IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer");
-    initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size);
+    initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size, explicitConversion);
     n_subparts_per_part_ = n_subparts_per_part;
+    block_size_ = block_size;
     IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
@@ -229,6 +238,8 @@ namespace Ifpack2 {
   {
     if (List.isType<int>("partitioner: subparts per part"))
       n_subparts_per_part_ = List.get<int>("partitioner: subparts per part");
+    if (List.isType<int>("partitioner: block size"))
+      block_size_ = List.get<int>("partitioner: block size");
   }
 
   template <typename MatrixType>
@@ -238,13 +249,31 @@ namespace Ifpack2 {
   {
     IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize");
     this->IsInitialized_ = true;
+    {
+      auto bA = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(impl_->A);
+      if (bA.is_null()) {
+        TEUCHOS_TEST_FOR_EXCEPT_MSG
+          (block_size_ == -1, "A pointwise matrix and block_size = -1 were given as inputs.");
+        {
+          IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize::getBlockCrsGraph");
+          auto A = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(impl_->A);
+          impl_->blockGraph = Tpetra::getBlockCrsGraph(*A, block_size_, true);
+          IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+        }
+      }
+      else {
+        impl_->blockGraph = Teuchos::rcpFromRef(bA->getCrsGraph());
+      }
+    }
+
     {
       IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createPartInterfaceBlockTridiagsNormManager");
-      impl_->part_interface  = BlockTriDiContainerDetails::createPartInterface<MatrixType>(impl_->A, partitions_, n_subparts_per_part_);
+      impl_->part_interface  = BlockTriDiContainerDetails::createPartInterface<MatrixType>(impl_->A, impl_->blockGraph, partitions_, n_subparts_per_part_);
       impl_->block_tridiags  = BlockTriDiContainerDetails::createBlockTridiags<MatrixType>(impl_->part_interface);
       impl_->norm_manager    = BlockHelperDetails::NormManager<MatrixType>(impl_->A->getComm());
       IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
+
     // We assume that if you called this method, you intend to recompute
     // everything.
     this->IsComputed_ = false;
@@ -252,7 +281,9 @@ namespace Ifpack2 {
     {
       BlockTriDiContainerDetails::performSymbolicPhase<MatrixType>
         (impl_->A, 
-         impl_->part_interface, impl_->block_tridiags, 
+         impl_->blockGraph, 
+         impl_->part_interface, 
+         impl_->block_tridiags, 
          impl_->a_minus_d, 
          impl_->overlap_communication_and_computation);    
     }
@@ -270,7 +301,8 @@ namespace Ifpack2 {
       this->initialize();
     {
       BlockTriDiContainerDetails::performNumericPhase<MatrixType>
-        (impl_->A, 
+        (impl_->A,
+         impl_->blockGraph, 
          impl_->part_interface, impl_->block_tridiags, 
          Kokkos::ArithTraits<magnitude_type>::zero());
     }
@@ -303,6 +335,7 @@ namespace Ifpack2 {
 
     BlockTriDiContainerDetails::applyInverseJacobi<MatrixType>
       (impl_->A,
+       impl_->blockGraph,
        impl_->tpetra_importer, 
        impl_->async_importer, 
        impl_->overlap_communication_and_computation,
@@ -337,7 +370,8 @@ namespace Ifpack2 {
       this->initialize();
     {
       BlockTriDiContainerDetails::performNumericPhase<MatrixType>
-        (impl_->A, 
+        (impl_->A,
+         impl_->blockGraph, 
          impl_->part_interface, impl_->block_tridiags, 
          in.addRadiallyToDiagonal);
     }
@@ -366,6 +400,7 @@ namespace Ifpack2 {
     {
       r_val = BlockTriDiContainerDetails::applyInverseJacobi<MatrixType>
         (impl_->A,
+         impl_->blockGraph,
          impl_->tpetra_importer, 
          impl_->async_importer,
          impl_->overlap_communication_and_computation,
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
index 8d47c85d1a50..311fa80f88f4 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
@@ -193,15 +193,24 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
-    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
+    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
       IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
       using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using tpetra_map_type = typename impl_type::tpetra_map_type;
       using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
       using tpetra_import_type = typename impl_type::tpetra_import_type;
+      using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
+      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
+
+      auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
+      auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
+
+      bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
+
+      // This is OK here to use the graph of the A_crs matrix and a block size of 1
+      const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
 
-      const auto g = A->getCrsGraph();  // tpetra crs graph object
-      const auto blocksize = A->getBlockSize();
+      const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
       const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
       const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));
       IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
@@ -883,15 +892,24 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     Teuchos::RCP<AsyncableImport<MatrixType> >
-    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
+    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
       using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using tpetra_map_type = typename impl_type::tpetra_map_type;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
       using global_ordinal_type = typename impl_type::global_ordinal_type;
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
+      using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
+      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
+
+      auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
+      auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
+
+      bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
 
-      const auto g = A->getCrsGraph();  // tpetra crs graph object
-      const auto blocksize = A->getBlockSize();
+      // This is OK here to use the graph of the A_crs matrix and a block size of 1
+      const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
+
+      const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
       const auto domain_map = g.getDomainMap();
       const auto column_map = g.getColMap();
 
@@ -1004,7 +1022,8 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     BlockHelperDetails::PartInterface<MatrixType>
-    createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+    createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
+                        const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                         const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
                         const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
       IFPACK2_BLOCKHELPER_TIMER("createPartInterface");
@@ -1014,7 +1033,10 @@ namespace Ifpack2 {
       using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
       using size_type = typename impl_type::size_type;
 
-      const auto blocksize = A->getBlockSize();
+      auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
+
+      TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
+      const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
       constexpr int vector_length = impl_type::vector_length;
       constexpr int internal_vector_length = impl_type::internal_vector_length;
 
@@ -1023,7 +1045,7 @@ namespace Ifpack2 {
       BlockHelperDetails::PartInterface<MatrixType> interf;
 
       const bool jacobi = partitions.size() == 0;
-      const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
+      const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
       const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
 
       typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
@@ -1050,11 +1072,13 @@ namespace Ifpack2 {
           SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
           recommended_team_size(blocksize, vector_length, internal_vector_length);
 
-        const local_ordinal_type num_teams = execution_space().concurrency() / (team_size * vector_length);
+        const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
 
         n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
 
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
         printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
+#endif
       }
       else {
         n_subparts_per_part = n_subparts_per_part_in;
@@ -1817,7 +1841,8 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     void
-    performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+    performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
+                         const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
                          const BlockHelperDetails::PartInterface<MatrixType> &interf,
                          BlockTridiags<MatrixType> &btdm,
                          BlockHelperDetails::AmD<MatrixType> &amd,
@@ -1836,15 +1861,19 @@ namespace Ifpack2 {
       using size_type_1d_view = typename impl_type::size_type_1d_view;
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
       using vector_type_4d_view = typename impl_type::vector_type_4d_view;
+      using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
       using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
 
       constexpr int vector_length = impl_type::vector_length;
 
       const auto comm = A->getRowMap()->getComm();
 
-      const auto& g = A->getCrsGraph();
+      auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
+      auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
 
-      const auto blocksize = A->getBlockSize();
+      bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
+      TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
+      const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
 
       // mirroring to host
       const auto partptr = Kokkos::create_mirror_view_and_copy     (Kokkos::HostSpace(), interf.partptr);
@@ -1861,9 +1890,9 @@ namespace Ifpack2 {
       
       Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
       {
-        const auto rowmap = g.getRowMap();
-        const auto colmap = g.getColMap();
-        const auto dommap = g.getDomainMap();
+        const auto rowmap = g->getRowMap();
+        const auto colmap = g->getColMap();
+        const auto dommap = g->getDomainMap();
         TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
 
 #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
@@ -1888,7 +1917,7 @@ namespace Ifpack2 {
 
       // construct the D and R graphs in A = D + R.
       {
-        const auto local_graph = g.getLocalGraphHost();
+        const auto local_graph = g->getLocalGraphHost();
         const auto local_graph_rowptr = local_graph.row_map;
         TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
         const auto local_graph_colidx = local_graph.entries;
@@ -2116,8 +2145,11 @@ namespace Ifpack2 {
           }
 
           // Allocate or view values.
-          amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
-                               
+          if (hasBlockCrsMatrix)
+            amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
+          else {
+            amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
+          }
         }
 
         // Allocate view for E and initialize the values with B:
@@ -2713,7 +2745,8 @@ namespace Ifpack2 {
       using impl_scalar_type = typename impl_type::impl_scalar_type;
       using magnitude_type = typename impl_type::magnitude_type;
       /// tpetra interface
-      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
+      using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
+      using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
       /// views
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
       using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
@@ -2746,8 +2779,9 @@ namespace Ifpack2 {
       const local_ordinal_type max_partsz;
       // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
       using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
-      const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
-      const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
+      ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
+      ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
+      ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
       // block tridiags
       const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
       const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
@@ -2762,10 +2796,13 @@ namespace Ifpack2 {
       const local_ordinal_type vector_loop_size;
       const local_ordinal_type vector_length_value;
 
+      bool hasBlockCrsMatrix;
+
     public:
       ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
                                   const BlockHelperDetails::PartInterface<MatrixType> &interf_,
-                                  const Teuchos::RCP<const block_crs_matrix_type> &A_,
+                                  const Teuchos::RCP<const row_matrix_type> &A_,
+                                  const Teuchos::RCP<const crs_graph_type> &G_,
                                   const magnitude_type& tiny_) :
         // interface
         partptr(interf_.partptr),
@@ -2777,9 +2814,6 @@ namespace Ifpack2 {
         part2packrowidx0_sub(interf_.part2packrowidx0_sub),
         packindices_schur(interf_.packindices_schur),
         max_partsz(interf_.max_partsz),
-        // block crs matrix
-        A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
-        A_values(const_cast<block_crs_matrix_type*>(A_.get())->getValuesDeviceNonConst()),
         // block tridiags
         pack_td_ptr(btdm_.pack_td_ptr),
         flat_td_ptr(btdm_.flat_td_ptr),
@@ -2822,7 +2856,24 @@ namespace Ifpack2 {
         // diagonal weight to avoid zero pivots
         tiny(tiny_),
         vector_loop_size(vector_length/internal_vector_length),
-        vector_length_value(vector_length) {}
+        vector_length_value(vector_length) {
+          using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
+          using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
+
+          auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
+          auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
+
+          hasBlockCrsMatrix = ! A_bcrs.is_null ();
+
+          A_block_rowptr = G_->getLocalGraphDevice().row_map;
+          if (hasBlockCrsMatrix) {
+            A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
+          }
+          else {
+            A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
+            A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
+          }
+        }
 
     private:
 
@@ -2861,25 +2912,44 @@ namespace Ifpack2 {
 #endif
         for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
           for (local_ordinal_type e=0;e<3;++e) {
-            const impl_scalar_type* block[vector_length] = {};
-            for (local_ordinal_type vi=0;vi<npacks;++vi) {
-              const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
-              block[vi] = &A_values(Aj*blocksize_square);
-            }
-            const size_type pi = kps + j;
+            if (hasBlockCrsMatrix) {
+              const impl_scalar_type* block[vector_length] = {};
+              for (local_ordinal_type vi=0;vi<npacks;++vi) {
+                const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
+
+                block[vi] = &A_values(Aj*blocksize_square);
+              }
+              const size_type pi = kps + j;
 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
-            printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
+              printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
 #endif            
-            ++j;
-            for (local_ordinal_type ii=0;ii<blocksize;++ii) {
-              for (local_ordinal_type jj=0;jj<blocksize;++jj) {
-                const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
-                auto& v = internal_vector_values(pi, ii, jj, 0);
-                for (local_ordinal_type vi=0;vi<npacks;++vi)
-                  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
+              ++j;            
+              for (local_ordinal_type ii=0;ii<blocksize;++ii) {
+                for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
+                  auto& v = internal_vector_values(pi, ii, jj, 0);
+                  for (local_ordinal_type vi=0;vi<npacks;++vi) {
+                    v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
+                  }
+                }
               }
             }
+            else {
+              const size_type pi = kps + j;
+
+              for (local_ordinal_type vi=0;vi<npacks;++vi) {
+                const size_type Aj_c = A_colindsub(kfs[vi] + j);
+
+                for (local_ordinal_type ii=0;ii<blocksize;++ii) {
+                  auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
 
+                  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                    scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
+                  }
+                }
+              }
+              ++j;
+            }
             if (nrows[0] == 1) break;
             if (local_subpartidx % 2 == 0) {
               if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
@@ -2965,20 +3035,36 @@ namespace Ifpack2 {
                   lend = 1;
                 }
               }
-              for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
-                const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
-                const impl_scalar_type* block = &A_values(Aj*blocksize_square);
-                const size_type pi = kps + j;
+              if (hasBlockCrsMatrix) {
+                for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
+                  const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
+                  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
+                  const size_type pi = kps + j;
 #ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
-                printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
+                  printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
 #endif
-                Kokkos::parallel_for
-                  (Kokkos::TeamThreadRange(member,blocksize),
-                   [&](const local_ordinal_type &ii) {
-                    for (local_ordinal_type jj=0;jj<blocksize;++jj) {
-                      scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
-                    }
-                  });
+                  Kokkos::parallel_for
+                    (Kokkos::TeamThreadRange(member,blocksize),
+                    [&](const local_ordinal_type &ii) {
+                      for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                        scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
+                      }
+                    });
+                }
+              }
+              else {
+                for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
+                  const size_type Aj_c = A_colindsub(kfs + j);
+                  const size_type pi = kps + j;
+                  Kokkos::parallel_for
+                    (Kokkos::TeamThreadRange(member,blocksize),
+                    [&](const local_ordinal_type &ii) {
+                      auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
+                      for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                        scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
+                      }
+                    });
+                }
               }
             }
           }
@@ -3478,12 +3564,13 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     void
-    performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+    performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
+                        const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                         const BlockHelperDetails::PartInterface<MatrixType> &interf,
                         BlockTridiags<MatrixType> &btdm,
                         const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
       IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase");
-      ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
+      ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, G, tiny);
       function.run();
       IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
@@ -4736,7 +4823,8 @@ namespace Ifpack2 {
     template<typename MatrixType>
     int
     applyInverseJacobi(// importer
-                       const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+                       const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
+                       const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
                        const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                        const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                        const bool overlap_communication_and_computation,
@@ -4838,8 +4926,18 @@ namespace Ifpack2 {
                                                damping_factor, is_norm_manager_active);
 
       const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
+
+
+      auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
+      auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
+
+      bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
+
+      // This is OK here to use the graph of the A_crs matrix and a block size of 1
+      const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
+
       BlockHelperDetails::ComputeResidualVector<MatrixType>
-        compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
+        compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
                                 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
 
       // norm manager workspace resize
@@ -4923,7 +5021,8 @@ namespace Ifpack2 {
       using async_import_type = AsyncableImport<MatrixType>;
 
       // distructed objects
-      Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
+      Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
+      Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
       Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
       Teuchos::RCP<async_import_type> async_importer;
       bool overlap_communication_and_computation;
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
index 1e26b4996649..ae707b80d916 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
@@ -280,32 +280,36 @@ static LO run_teuchos_tests (const Input& in, Teuchos::FancyOStream& out, bool&
             if (seq_method && overlap_comm) continue;
             for (const bool nonuniform_lines : {false, true}) {
               for (const bool pointwise : {false, true}) {
-                if (jacobi && nonuniform_lines) continue;
-                for (const int nvec : {1, 3}) {
-                  std::stringstream ss;
-                  ss << "test_BR_BTDC:"
-                    << " bs " << bs
-                    << (contiguous ? " contig" : " noncontig")
-                    << (jacobi ? " jacobi" : " tridiag")
-                    << (seq_method ? " seq_method" : "")
-                    << (overlap_comm ? " overlap_comm" : "")
-                    << (pointwise ? " point_wise" : "")
-                    << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines")
-                    << " nvec " << nvec;
-                  const std::string details = ss.str();
-                  bool threw = false;
-                  try {
-                    ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines,
-                                            different_maps, jacobi, overlap_comm, seq_method, pointwise,
-                                            details);
-                    nerr += ne;
-                  } catch (const std::exception& e) {
-                    threw = true;
-                  }
-                  if (threw)
-                    printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str());
+                for (const bool explicitConversion : {false, true}) {
+                  if (jacobi && nonuniform_lines) continue;
+                  if (!pointwise && explicitConversion) continue;
+                  for (const int nvec : {1, 3}) {
+                    std::stringstream ss;
+                    ss << "test_BR_BTDC:"
+                      << " bs " << bs
+                      << (contiguous ? " contig" : " noncontig")
+                      << (jacobi ? " jacobi" : " tridiag")
+                      << (seq_method ? " seq_method" : "")
+                      << (overlap_comm ? " overlap_comm" : "")
+                      << (pointwise ? " point_wise" : "")
+                      << (explicitConversion ? " explicit_block_conversion" : "")
+                      << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines")
+                      << " nvec " << nvec;
+                    const std::string details = ss.str();
+                    bool threw = false;
+                    try {
+                      ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines,
+                                              different_maps, jacobi, overlap_comm, seq_method, pointwise,
+                                              explicitConversion, details);
+                      nerr += ne;
+                    } catch (const std::exception& e) {
+                      threw = true;
+                    }
+                    if (threw)
+                      printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str());
 
-                  TEUCHOS_TEST(ne == 0 && ! threw, details);
+                    TEUCHOS_TEST(ne == 0 && ! threw, details);
+                  }
                 }
               }
             }
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
index 7b9c8a28bb39..516c0ab8d304 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
@@ -159,7 +159,8 @@ struct BlockTriDiContainerTester {
                 const bool nonuniform_lines = false,
                 const bool zero_starting_soln = true,
                 const int num_sweeps = 1,
-                const bool jacobi = false) {
+                const bool jacobi = false,
+                const bool explicitConversion = false) {
     Teuchos::Array<Teuchos::ArrayRCP<LO> > parts;
     // make_parts modifies entries of A so the call to convertToCrsMatrix
     // needs to happen after make_parts
@@ -178,6 +179,7 @@ struct BlockTriDiContainerTester {
       p.set("partitioner: parts", parts);
       p.set("partitioner: subparts per part", 1);
       p.set("partitioner: block size", A->getBlockSize());
+      p.set("partitioner: explicit convert to BlockCrs", explicitConversion);
       T->setParameters(p);
     }
     return T;
@@ -207,6 +209,7 @@ struct BlockTriDiContainerTester {
       p.set("partitioner: parts", parts);
       p.set("partitioner: subparts per part", 1);
       p.set("partitioner: block size", -1);
+      p.set("partitioner: explicit convert to BlockCrs", false);
       T->setParameters(p);
     }
     return T;
@@ -219,7 +222,8 @@ struct BlockTriDiContainerTester {
   make_BTDC_PW (const StructuredBlock& sb, const StructuredBlockPart& sbp,
              const Teuchos::RCP<Tpetra_BlockCrsMatrix>& A,
              const bool overlap_comm = false, const bool nonuniform_lines = false,
-             const bool jacobi = false, const bool seq_method = false) {
+             const bool jacobi = false, const bool seq_method = false,
+             const bool explicitConversion = false) {
     Teuchos::Array<Teuchos::Array<LO> > parts;
     // make_parts modifies entries of A so the call to convertToCrsMatrix
     // needs to happen after make_parts
@@ -227,7 +231,7 @@ struct BlockTriDiContainerTester {
     auto A_pw = Tpetra::convertToCrsMatrix(*A);
 
     return Teuchos::rcp(new Ifpack2::BlockTriDiContainer<Tpetra_RowMatrix>(
-                          A_pw, parts, 1, overlap_comm, seq_method, A->getBlockSize()));
+                          A_pw, parts, 1, overlap_comm, seq_method, A->getBlockSize(), explicitConversion));
   }
 
   // Make a bare BlockTriDiContainer.
@@ -248,7 +252,8 @@ struct BlockTriDiContainerTester {
                 const StructuredBlock& sb, const StructuredBlockPart& sbp,
                 const Int bs, const Int nvec, const bool nonuniform_lines,
                 const bool different_maps, const bool jacobi, const bool overlap_comm,
-                const bool seq_method, const bool pointwise, const std::string& details) {
+                const bool seq_method, const bool pointwise, const bool explicitConversion, 
+                const std::string& details) {
 #define TEST_BR_BTDC_FAIL(msg) do {             \
       ++nerr;                                   \
       if (comm->getRank() == 0) {               \
@@ -286,13 +291,13 @@ struct BlockTriDiContainerTester {
       const Magnitude tol = 1e-3;
       const auto T_br = use_br ?
         ( pointwise ?
-          make_BR_BTDC_PW(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi): 
+          make_BR_BTDC_PW(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi, explicitConversion): 
           make_BR_BTDC(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi) 
         ):
         Teuchos::null;
       const auto T_bare = use_br ? Teuchos::null :
         ( pointwise ?
-          make_BTDC_PW(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method): 
+          make_BTDC_PW(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method, explicitConversion): 
           make_BTDC(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method) 
         );
       if ( ! T_br.is_null()) {
diff --git a/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp b/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp
index 6155dd7f092f..5a04f8ebe20f 100644
--- a/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp
+++ b/packages/muelu/src/Operators/MueLu_MultiPhys_def.hpp
@@ -142,7 +142,9 @@ void MultiPhys<Scalar, LocalOrdinal, GlobalOrdinal, Node>::compute(bool reuse) {
   // Generate the (iii,iii) Hierarchy
 
   for (int iii = 0; iii < nBlks_; iii++) {
-    arrayOfParamLists_[iii]->sublist("user data").set("Coordinates", arrayOfCoords_[iii]);
+    if (arrayOfCoords_ != Teuchos::null) {
+      arrayOfParamLists_[iii]->sublist("user data").set("Coordinates", arrayOfCoords_[iii]);
+    }
 
     bool wantToRepartition = false;
     if (paramListMultiphysics_->isParameter("repartition: enable"))
@@ -312,6 +314,8 @@ void MultiPhys<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
   }
   oss << std::endl;
 
+  out << oss.str();
+
   for (int i = 0; i < nBlks_; i++) {
     arrayOfHierarchies_[i]->describe(out, GetVerbLevel());
   }
diff --git a/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp b/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp
index 3b3a6fe66184..22ef99a1fba2 100644
--- a/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp
+++ b/packages/muelu/src/Operators/MueLu_RefMaxwell_def.hpp
@@ -2074,7 +2074,13 @@ void RefMaxwell<Scalar, LocalOrdinal, GlobalOrdinal, Node>::setupSubSolve(Teucho
         }
         if ((coarseType == "" ||
              coarseType == "Klu" ||
-             coarseType == "Klu2") &&
+             coarseType == "Klu2" ||
+             coarseType == "Superlu" ||
+             coarseType == "Superlu_dist" ||
+             coarseType == "Superludist" ||
+             coarseType == "Basker" ||
+             coarseType == "Cusolver" ||
+             coarseType == "Tacho") &&
             (!params.isSublist("coarse: params") ||
              !params.sublist("coarse: params").isParameter("fix nullspace")))
           params.sublist("coarse: params").set("fix nullspace", true);
diff --git a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp
index ba72e83f2e0c..2a7c55e84b33 100644
--- a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp
+++ b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp
@@ -333,7 +333,7 @@ void Amesos2Smoother<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Setup(Level& cu
   amesos2_params->setName("Amesos2");
   if ((rowMap->getGlobalNumElements() != as<size_t>((rowMap->getMaxAllGlobalIndex() - rowMap->getMinAllGlobalIndex()) + 1)) ||
       (!rowMap->isContiguous() && (rowMap->getComm()->getSize() == 1))) {
-    if ((type_ != "Cusolver") && !(amesos2_params->sublist(prec_->name()).template isType<bool>("IsContiguous")))
+    if (((type_ != "Cusolver") && (type_ != "Tacho")) && !(amesos2_params->sublist(prec_->name()).template isType<bool>("IsContiguous")))
       amesos2_params->sublist(prec_->name()).set("IsContiguous", false, "Are GIDs Contiguous");
   }
   prec_->setParameters(amesos2_params);
diff --git a/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp b/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp
index 18dd726a8d10..f1ea5162cd89 100644
--- a/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp
+++ b/packages/muelu/src/Transfers/BlockedTransfers/MueLu_SubBlockAFactory_def.hpp
@@ -155,8 +155,10 @@ void SubBlockAFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& c
   RCP<const MapExtractor> rangeMapExtractor  = A->getRangeMapExtractor();
   RCP<const MapExtractor> domainMapExtractor = A->getDomainMapExtractor();
 
-  RCP<const Map> rangeMap  = rangeMapExtractor->getMap(row);
-  RCP<const Map> domainMap = domainMapExtractor->getMap(col);
+  bool thyraMode = rangeMapExtractor->getThyraMode();
+
+  RCP<const Map> rangeMap  = rangeMapExtractor->getMap(row, thyraMode);
+  RCP<const Map> domainMap = domainMapExtractor->getMap(col, thyraMode);
 
   // use user-specified striding information if available. Otherwise try to use internal striding information from the submaps!
   if (bRangeUserSpecified)
diff --git a/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt
index 8d47cfa1a732..fb144ce6e870 100644
--- a/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/CurlLaplacianExample/CMakeLists.txt
@@ -60,7 +60,7 @@ FOREACH( ORDER 1 2 3 4 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-32
-    TEST_4 CMND python
+    TEST_4 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-ConvTest-Quad-${ORDER}-
@@ -92,7 +92,7 @@ FOREACH( ORDER 1 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-Multiblock-ConvTest-Quad-${ORDER}-32
-    TEST_3 CMND python
+    TEST_3 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-Multiblock-ConvTest-Quad-${ORDER}-
@@ -128,7 +128,7 @@ ENDFOREACH()
 #       PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra"
 #       NUM_MPI_PROCS 4
 #       OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-32
-#     TEST_4 CMND python
+#     TEST_4 CMND python3
 #       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
 #            ${ORDER}
 #            MPE-ConvTest-Tri-${ORDER}-
@@ -165,7 +165,7 @@ ENDFOREACH()
 #       PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra"
 #       NUM_MPI_PROCS 4
 #       OUTPUT_FILE MPE-ConvTest-Hex-${ORDER}-32
-#     TEST_4 CMND python
+#     TEST_4 CMND python3
 #       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
 #            ${ORDER}
 #            MPE-ConvTest-Hex-${ORDER}-
diff --git a/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt
index 7768714c4deb..39908f32745d 100644
--- a/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/MixedCurlLaplacianExample/CMakeLists.txt
@@ -60,7 +60,7 @@ FOREACH( ORDER 1 2 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-32
-    TEST_4 CMND python
+    TEST_4 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-ConvTest-Quad-${ORDER}-
@@ -92,7 +92,7 @@ FOREACH( ORDER 3 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Quad-${ORDER}-16
-    TEST_4 CMND python
+    TEST_4 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-ConvTest-Quad-${ORDER}-
@@ -123,7 +123,7 @@ FOREACH( ORDER 1 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-Multiblock-ConvTest-Quad-${ORDER}-32
-    TEST_3 CMND python
+    TEST_3 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-Multiblock-ConvTest-Quad-${ORDER}-
@@ -159,7 +159,7 @@ FOREACH( ORDER 1 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-64
-    TEST_4 CMND python
+    TEST_4 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-ConvTest-Tri-${ORDER}-
@@ -191,7 +191,7 @@ FOREACH( ORDER 2 )
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Tri-${ORDER}-16
-    TEST_3 CMND python
+    TEST_3 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
            ${ORDER}
            MPE-ConvTest-Tri-${ORDER}-
@@ -227,7 +227,7 @@ ENDFOREACH()
 #       PASS_REGULAR_EXPRESSION "ALL PASSED: Epetra"
 #       NUM_MPI_PROCS 4
 #       OUTPUT_FILE MPE-ConvTest-Hex-${ORDER}-32
-#     TEST_4 CMND python
+#     TEST_4 CMND python3
 #       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
 #            ${ORDER}
 #            MPE-ConvTest-Hex-${ORDER}-
diff --git a/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt
index 0cc77eef4476..996c0723ad45 100644
--- a/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/MixedPoissonExample/CMakeLists.txt
@@ -59,7 +59,7 @@ FOREACH( ORDER 1 2 3)
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Hex-p${ORDER}-12
-    TEST_3 CMND python
+    TEST_3 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          ${ORDER}
          MPE-ConvTest-Hex-p${ORDER}-
@@ -90,7 +90,7 @@ FOREACH( ORDER 1)
       PASS_REGULAR_EXPRESSION "ALL PASSED: Tpetra"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-Multiblock-ConvTest-Hex-p${ORDER}-12
-    TEST_2 CMND python
+    TEST_2 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          ${ORDER}
          MPE-Multiblock-ConvTest-Hex-p${ORDER}-
diff --git a/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py b/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py
index 517e27047d5d..90785008ec7b 100755
--- a/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py
+++ b/packages/panzer/adapters-stk/example/MixedPoissonExample/analyze_performance.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 
 """
 Script for analyzing Panzer kernel performance on next-generation
@@ -12,7 +12,6 @@
 
 # Import python modules for command-line options, the operating system, regular
 # expressions, and system functions
-import commands
 import argparse
 import os
 import re
diff --git a/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt
index 4ea8c6e0dff1..e1201f5e0c3b 100644
--- a/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/PoissonExample/CMakeLists.txt
@@ -47,7 +47,7 @@ FOREACH( ORDER 1 2 3 4 )
       PASS_REGULAR_EXPRESSION "ALL PASSED"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Quad-p${ORDER}-80
-    TEST_5 CMND python
+    TEST_5 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          ${ORDER}
          MPE-ConvTest-Quad-p${ORDER}-
@@ -91,7 +91,7 @@ FOREACH( ORDER 1 2 3 4 )
       PASS_REGULAR_EXPRESSION "ALL PASSED"
       NUM_MPI_PROCS 4
       OUTPUT_FILE MPE-ConvTest-Tri-p${ORDER}-80
-    TEST_5 CMND python
+    TEST_5 CMND python3
       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          ${ORDER}
          MPE-ConvTest-Tri-p${ORDER}-
@@ -119,7 +119,7 @@ TRIBITS_ADD_ADVANCED_TEST(
     NUM_MPI_PROCS 2
     OUTPUT_FILE PoissonExampleCurvilinear_3_Q1.out
   COMM mpi
-  TEST_2 CMND python
+  TEST_2 CMND python3
   ARGS   ${CMAKE_CURRENT_SOURCE_DIR}/compareWithGold_Curvilinear.py
          ${CMAKE_CURRENT_SOURCE_DIR}/PoissonExampleCurvilinear_3_Q2.gold
          ${CMAKE_CURRENT_SOURCE_DIR}/PoissonExampleCurvilinear_3_Q1.gold
diff --git a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt
index 6ff12ed190a2..d6d4b217a376 100644
--- a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/CMakeLists.txt
@@ -61,7 +61,7 @@ TRIBITS_ADD_ADVANCED_TEST(
     PASS_REGULAR_EXPRESSION "PASS BASICS"
     NUM_MPI_PROCS 4
     OUTPUT_FILE PIE-ConvTest-20
-  TEST_2 CMND python
+  TEST_2 CMND python3
     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          PIE-ConvTest-
          10
@@ -82,7 +82,7 @@ TRIBITS_ADD_ADVANCED_TEST(
     PASS_REGULAR_EXPRESSION "PASS BASICS"
     NUM_MPI_PROCS 4
     OUTPUT_FILE PIE-ConvTest-8
-  TEST_2 CMND python
+  TEST_2 CMND python3
     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          PIE-ConvTest-
          4
diff --git a/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt b/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt
index 8cf8e79a965d..31587e48442d 100644
--- a/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt
+++ b/packages/panzer/adapters-stk/example/PoissonInterfaceTpetra/CMakeLists.txt
@@ -61,7 +61,7 @@ TRIBITS_ADD_ADVANCED_TEST(
     PASS_REGULAR_EXPRESSION "PASS BASICS"
     NUM_MPI_PROCS 4
     OUTPUT_FILE PIE-ConvTest-20
-  TEST_2 CMND python
+  TEST_2 CMND python3
     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          PIE-ConvTest-
          10
@@ -82,7 +82,7 @@ TRIBITS_ADD_ADVANCED_TEST(
     PASS_REGULAR_EXPRESSION "PASS BASICS"
     NUM_MPI_PROCS 4
     OUTPUT_FILE PIE-ConvTest-8
-  TEST_2 CMND python
+  TEST_2 CMND python3
     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/convergence_rate.py
          PIE-ConvTest-
          4
diff --git a/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py b/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py
index 606f67b9b489..90afaf9f7d26 100755
--- a/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py
+++ b/packages/panzer/maintenance/replacePanzerEvaluatorMacros.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # This script will read a file and replace any Panzer Evaluator macros it
 # finds with their contents.
diff --git a/packages/panzer/mini-em/cmake/Dependencies.cmake b/packages/panzer/mini-em/cmake/Dependencies.cmake
index e4cad2ef1488..72ad76e29743 100644
--- a/packages/panzer/mini-em/cmake/Dependencies.cmake
+++ b/packages/panzer/mini-em/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 SET(LIB_REQUIRED_DEP_PACKAGES PanzerCore PanzerDofMgr PanzerDiscFE PanzerAdaptersSTK Phalanx Belos Teko MueLu)
-SET(LIB_OPTIONAL_DEP_PACKAGES ML)
+SET(LIB_OPTIONAL_DEP_PACKAGES ML ShyLU_NodeTacho)
 SET(TEST_REQUIRED_DEP_PACKAGES)
 SET(TEST_OPTIONAL_DEP_PACKAGES)
 SET(LIB_REQUIRED_DEP_TPLS MPI)
diff --git a/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt b/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt
index e76043dd1382..a31d3475dece 100644
--- a/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt
+++ b/packages/panzer/mini-em/example/BlockPrec/CMakeLists.txt
@@ -23,7 +23,7 @@ TRIBITS_COPY_FILES_TO_BINARY_DIR(CopyBlockPrecFiles
   darcyTetAnalytic.xml darcyHexAnalytic.xml
   solverCG.xml solverGMRES.xml
   solverMueLu.xml solverMueLu2D.xml solverMueLuEpetra.xml
-  solverMueLuOpenMP.xml solverMueLuCuda.xml solverMueLuTruncated.xml
+  solverMueLuOpenMP.xml solverMueLuCuda.xml solverMueLuTruncated.xml solverMueLuTPL.xml
   solverMueLuHO.xml solverMueLuHOCuda.xml
   solverAugmentationUseILU.xml
   solverML.xml
@@ -99,6 +99,17 @@ TRIBITS_ADD_TEST(
    )
 
 
+IF (${PACKAGE_NAME}_ENABLE_ShyLU_NodeTacho)
+  TRIBITS_ADD_TEST(
+    BlockPrec
+    NAME "Maxwell_MueLu"
+    POSTFIX_AND_ARGS_0 "order1_tpl"    --solver=MueLu --numTimeSteps=1 --linAlgebra=Tpetra --tpl
+    ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF
+    NUM_MPI_PROCS 4
+  )
+ ENDIF()
+
+
  # Large performance-testing runs of 3D BlockPrec_RefMaxwell (4 and 16 ranks)
 TRIBITS_ADD_TEST(
    BlockPrec
@@ -120,6 +131,32 @@ TRIBITS_ADD_TEST(
    RUN_SERIAL
    )
 
+IF (${PACKAGE_NAME}_ENABLE_ShyLU_NodeTacho)
+
+  TRIBITS_ADD_TEST(
+    BlockPrec
+    NAME "MiniEM-BlockPrec_RefMaxwell_Performance_4_tpl"
+    ARGS "--stacked-timer --solver=MueLu --numTimeSteps=3 --linAlgebra=Tpetra --inputFile=maxwell-large.xml --tpl --test-name=\"MiniEM 3D RefMaxwell TPL\""
+    ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF
+    COMM mpi
+    NUM_MPI_PROCS 4
+    CATEGORIES PERFORMANCE
+    RUN_SERIAL
+  )
+
+  TRIBITS_ADD_TEST(
+    BlockPrec
+    NAME "MiniEM-BlockPrec_RefMaxwell_Performance_16_tpl"
+    ARGS "--stacked-timer --solver=MueLu --numTimeSteps=3 --linAlgebra=Tpetra --inputFile=maxwell-large.xml --tpl --test-name=\"MiniEM 3D RefMaxwell TPL\""
+    ENVIRONMENT TPETRA_FUSED_RESIDUAL=OFF
+    COMM mpi
+    NUM_MPI_PROCS 16
+    CATEGORIES PERFORMANCE
+    RUN_SERIAL
+  )
+
+ENDIF()
+
  TRIBITS_ADD_TEST(
    BlockPrec
    NAME "MiniEM-BlockPrec_RefMaxwell_Performance_16_reuse"
diff --git a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp
index d60d387ee1df..ca0f217a0cc0 100644
--- a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp
+++ b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.cpp
@@ -100,6 +100,7 @@ namespace mini_em {
                       Teuchos::RCP<const Teuchos::MpiComm<int>> &comm,
                       Teuchos::RCP<Teuchos::FancyOStream> &out,
                       std::string &xml, int basis_order,
+                      const bool preferTPLs,
                       const bool truncateMueLuHierarchy) {
     using Teuchos::RCP;
     using Teuchos::rcp;
@@ -161,7 +162,9 @@ namespace mini_em {
               updateParams("solverMueLu2D.xml", lin_solver_pl, comm, out);
           }
           if (truncateMueLuHierarchy)
-          updateParams("solverMueLuTruncated.xml", lin_solver_pl, comm, out);
+            updateParams("solverMueLuTruncated.xml", lin_solver_pl, comm, out);
+          if (preferTPLs)
+            updateParams("solverMueLuTPL.xml", lin_solver_pl, comm, out);
           if (basis_order > 1) {
             RCP<Teuchos::ParameterList> lin_solver_pl_lo = lin_solver_pl;
             lin_solver_pl = rcp(new Teuchos::ParameterList("Linear Solver"));
diff --git a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp
index ff5eb74b2833..cdbb8a82ee9d 100644
--- a/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp
+++ b/packages/panzer/mini-em/example/BlockPrec/MiniEM_helpers.hpp
@@ -59,6 +59,7 @@ namespace mini_em {
                                                            Teuchos::RCP<Teuchos::FancyOStream> &out,
                                                            std::string &xml,
                                                            int basis_order,
+                                                           const bool preferTPLs = false,
                                                            const bool truncateMueLuHierarchy = false);
 
   void setClosureParameters(physicsType physics,
diff --git a/packages/panzer/mini-em/example/BlockPrec/main.cpp b/packages/panzer/mini-em/example/BlockPrec/main.cpp
index 34cfc46df97c..2857b7af46b2 100644
--- a/packages/panzer/mini-em/example/BlockPrec/main.cpp
+++ b/packages/panzer/mini-em/example/BlockPrec/main.cpp
@@ -110,7 +110,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
   {
     // defaults for command-line options
     int x_elements=-1,y_elements=-1,z_elements=-1,basis_order=1;
-    int workset_size=20;
+    int workset_size=2000;
     std::string pCoarsenScheduleStr = "1";
     double dt=0.0;
     std::string meshFile = "";
@@ -120,6 +120,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
     std::string xml = "";
     solverType solverValues[5] = {AUGMENTATION, MUELU, ML, CG, GMRES};
     const char * solverNames[5] = {"Augmentation", "MueLu", "ML", "CG", "GMRES"};
+    bool preferTPLs = false;
     bool truncateMueLuHierarchy = false;
     solverType solver = MUELU;
     int numTimeSteps = 1;
@@ -147,6 +148,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
     clp.setOption("inputFile",&input_file,"XML file with the problem definitions");
     clp.setOption("solverFile",&xml,"XML file with the solver params");
     clp.setOption<solverType>("solver",&solver,5,solverValues,solverNames,"Solver that is used");
+    clp.setOption("tpl", "no-tpl", &preferTPLs, "Prefer TPL usage over fused kernels");
     clp.setOption("truncateMueLuHierarchy", "no-truncateMueLuHierarchy", &truncateMueLuHierarchy, "Truncate the MueLu hierarchy");
     clp.setOption("numTimeSteps",&numTimeSteps);
     clp.setOption("finalTime",&finalTime);
@@ -292,7 +294,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
         throw;
     }
 
-    RCP<Teuchos::ParameterList> lin_solver_pl = mini_em::getSolverParameters(linAlgebra, physics, solver, dim, comm, out, xml, basis_order, truncateMueLuHierarchy);
+    RCP<Teuchos::ParameterList> lin_solver_pl = mini_em::getSolverParameters(linAlgebra, physics, solver, dim, comm, out, xml, basis_order, preferTPLs, truncateMueLuHierarchy);
 
     if (lin_solver_pl->sublist("Preconditioner Types").isSublist("Teko") &&
         lin_solver_pl->sublist("Preconditioner Types").sublist("Teko").isSublist("Inverse Factory Library")) {
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml
index 6fc4966b38a6..78709fd571ba 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-1stOrder.xml
@@ -56,7 +56,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml
index 68ed5f2abf1a..5dda86d61a0d 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-analyticSolution.xml
@@ -33,7 +33,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"  type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml
index 3a8584084e82..8ea32881b78d 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-large.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml
index 4a3c9eb2e884..1d8d7e47eae8 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-medium.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml
index 87758fc0173b..9856f4fc37f3 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-bdot-small.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml
index dee0f8077980..45e2454f27f0 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R0.xml
@@ -24,7 +24,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml
index e3da35320350..0a83103c0b81 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R1.xml
@@ -24,7 +24,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml
index 1a582547ec88..b69f15a758f8 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R2.xml
@@ -24,7 +24,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml
index 508639541ab6..b4f11120c050 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R3.xml
@@ -24,7 +24,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml
index 88ec43e607b5..d31bc2e4b0e7 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-blob-R4.xml
@@ -24,7 +24,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml
index 0e7623d9f925..57b54ec058b2 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-donut.xml
@@ -41,7 +41,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"  type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml
index 4913f1f5fc41..cf2809ffdfd3 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-large.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml
index 7cb156f0771e..1fdc977dc9ed 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-medium.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml
index 9ce84b7349d7..141ff1349ce9 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-exterior-small.xml
@@ -28,7 +28,6 @@
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
      <Parameter name="Auxiliary Field Order" type="string" value="blocked: AUXILIARY_NODE AUXILIARY_EDGE"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml
index a64500279c77..867b31679a0c 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell-large.xml
@@ -56,7 +56,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell.xml
index 95c32435ba5c..567e7f60d831 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell.xml
@@ -56,7 +56,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"  type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml b/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml
index 8e9fb32117c8..f4452b2fc93f 100644
--- a/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/maxwell2D.xml
@@ -34,7 +34,6 @@
 
    <ParameterList name="Assembly">
      <Parameter name="Field Order"           type="string" value="blocked: B_face E_edge"/>
-     <Parameter name="Workset Size" type="int" value="2000"/>
    </ParameterList>
 
   <ParameterList name="Physics Blocks">
diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml
index df5b9ad49160..794d8ad984df 100644
--- a/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLu.xml
@@ -157,6 +157,8 @@
                 <Parameter name="refmaxwell: subsolves on subcommunicators" type="bool" value="true"/>
                 <Parameter name="refmaxwell: subsolves striding" type="int" value="1"/>
 
+                <Parameter name="fuse prolongation and update" type="bool" value="true"/>
+
                 <!-- <Parameter name="rap: triple product" type="bool" value="true"/> -->
                 <!-- <Parameter name="transpose: use implicit" type="bool" value="true"/> -->
 
@@ -175,6 +177,7 @@
                   <Parameter name="chebyshev: ratio eigenvalue" type="double" value="20.0"/>
                   <Parameter name="chebyshev: eigenvalue max iterations" type="int" value="100"/>
                   <Parameter name="chebyshev: algorithm" type="string" value="opt_fourth"/>
+                  <Parameter name="chebyshev: use native spmv" type="bool" value="false"/>
                 </ParameterList>
 
                 <!-- <Parameter name="smoother: pre type" type="string" value="RELAXATION"/> -->
@@ -201,6 +204,8 @@
                   <Parameter name="number of equations" type="int" value="3"/>
                   <Parameter name="multigrid algorithm" type="string" value="unsmoothed"/>
 
+                  <Parameter name="fuse prolongation and update" type="bool" value="true"/>
+
                   <Parameter name="aggregation: type" type="string" value="uncoupled"/>
                   <Parameter name="aggregation: drop scheme" type="string" value="distance laplacian"/>
                   <Parameter name="aggregation: distance laplacian algo" type="string" value="scaled cut"/>
@@ -219,6 +224,7 @@
                     <Parameter name="chebyshev: ratio eigenvalue" type="double" value="5.4"/>
                     <Parameter name="chebyshev: eigenvalue max iterations" type="int" value="100"/>
                     <Parameter name="chebyshev: algorithm" type="string" value="opt_fourth"/>
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="false"/>
                   </ParameterList>
 
                   <!-- <Parameter name="smoother: pre type" type="string" value="RELAXATION"/> -->
@@ -258,6 +264,8 @@
                   <Parameter name="coarse: max size" type="int" value="2500"/>
                   <Parameter name="multigrid algorithm" type="string" value="unsmoothed"/>
 
+                  <Parameter name="fuse prolongation and update" type="bool" value="true"/>
+
                   <Parameter name="aggregation: type" type="string" value="uncoupled"/>
                   <Parameter name="aggregation: drop scheme" type="string" value="distance laplacian"/>
                   <Parameter name="aggregation: distance laplacian algo" type="string" value="scaled cut"/>
@@ -276,6 +284,7 @@
                     <Parameter name="chebyshev: ratio eigenvalue" type="double" value="7.0"/>
                     <Parameter name="chebyshev: eigenvalue max iterations" type="int" value="100"/>
                     <Parameter name="chebyshev: algorithm" type="string" value="opt_fourth"/>
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="false"/>
                   </ParameterList>
 
                   <!-- <Parameter name="smoother: pre type" type="string" value="RELAXATION"/> -->
diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml
index a3ab680e2f5e..47034e77c794 100644
--- a/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml
+++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLuCuda.xml
@@ -1,3 +1,5 @@
+<!-- Tuned for V100 -->
+
 <ParameterList name="Linear Solver">
 
   <ParameterList name="Preconditioner Types">
diff --git a/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml b/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml
new file mode 100644
index 000000000000..98242888745c
--- /dev/null
+++ b/packages/panzer/mini-em/example/BlockPrec/solverMueLuTPL.xml
@@ -0,0 +1,106 @@
+<!-- Prefers TPL -->
+
+<ParameterList name="Linear Solver">
+
+  <ParameterList name="Preconditioner Types">
+    <ParameterList name="Teko">
+      <ParameterList name="Inverse Factory Library">
+        <ParameterList name="Maxwell">
+
+          <ParameterList name="S_E Preconditioner">
+            <ParameterList name="Preconditioner Types">
+              <ParameterList name="MueLuRefMaxwell">
+
+                <ParameterList name="smoother: params">
+                  <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                </ParameterList>
+
+                <ParameterList name="refmaxwell: 11list">
+
+                  <Parameter name="coarse: type" type="string" value="tacho"/>
+                  <ParameterList name="coarse: params">
+                    <ParameterList name="Amesos2">
+                      <ParameterList name="Tacho">
+                        <Parameter name="small problem threshold size" type="int" value="100"/>
+                      </ParameterList>
+                    </ParameterList>
+                  </ParameterList>
+
+                  <ParameterList name="smoother: params">
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                  </ParameterList>
+
+                </ParameterList>
+
+                <ParameterList name="refmaxwell: 22list">
+
+                  <Parameter name="coarse: type" type="string" value="tacho"/>
+                  <ParameterList name="coarse: params">
+                    <ParameterList name="Amesos2">
+                      <ParameterList name="Tacho">
+                        <Parameter name="small problem threshold size" type="int" value="100"/>
+                      </ParameterList>
+                    </ParameterList>
+                  </ParameterList>
+
+                  <ParameterList name="smoother: params">
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                  </ParameterList>
+
+                </ParameterList>
+              </ParameterList>
+            </ParameterList>
+          </ParameterList>
+        </ParameterList>
+
+        <ParameterList name="Darcy">
+
+          <ParameterList name="S_sigma Preconditioner">
+            <ParameterList name="Preconditioner Types">
+              <ParameterList name="MueLuRefMaxwell">
+
+                <ParameterList name="smoother: params">
+                  <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                </ParameterList>
+
+                <ParameterList name="refmaxwell: 11list">
+
+                  <Parameter name="coarse: type" type="string" value="tacho"/>
+                  <ParameterList name="coarse: params">
+                    <ParameterList name="Amesos2">
+                      <ParameterList name="Tacho">
+                        <Parameter name="small problem threshold size" type="int" value="100"/>
+                      </ParameterList>
+                    </ParameterList>
+                  </ParameterList>
+
+                  <ParameterList name="smoother: params">
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                  </ParameterList>
+
+                </ParameterList>
+
+                <ParameterList name="refmaxwell: 22list">
+
+                  <Parameter name="coarse: type" type="string" value="tacho"/>
+                  <ParameterList name="coarse: params">
+                    <ParameterList name="Amesos2">
+                      <ParameterList name="Tacho">
+                        <Parameter name="small problem threshold size" type="int" value="100"/>
+                      </ParameterList>
+                    </ParameterList>
+                  </ParameterList>
+
+                  <ParameterList name="smoother: params">
+                    <Parameter name="chebyshev: use native spmv" type="bool" value="true"/>
+                  </ParameterList>
+
+                </ParameterList>
+              </ParameterList>
+            </ParameterList>
+          </ParameterList>
+        </ParameterList>
+      </ParameterList>
+    </ParameterList>
+  </ParameterList>
+</ParameterList>
diff --git a/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp b/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp
index 8199145715ad..ba501e10b930 100644
--- a/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp
+++ b/packages/panzer/mini-em/src/solvers/MiniEM_Interpolation.cpp
@@ -340,9 +340,7 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP<const panzer::LinearObjFact
     numCells = static_cast<int>(worksetSize);
   DynRankDeviceView             ho_dofCoords_d("ho_dofCoords_d", hoCardinality, dim);
   DynRankDeviceView             basisCoeffsLIOriented_d("basisCoeffsLIOriented_d", numCells, hoCardinality, loCardinality);
-  typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace>     elemOrts_d ("elemOrts_d",  numCells);
-  typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>::HostMirror elemNodes_h("elemNodes_h", numCells, numElemVertices);
-  typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>             elemNodes_d("elemNodes_d", numCells, numElemVertices);
+
 
   // the ranks of these depend on dimension
   DynRankDeviceView ho_dofCoeffs_d;
@@ -370,6 +368,13 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP<const panzer::LinearObjFact
 
   auto entryFilterTol = 100*Teuchos::ScalarTraits<typename STS::magnitudeType>::eps();
 
+
+  // HO dof coordinates and coefficients
+  ho_basis->getDofCoords(ho_dofCoords_d);
+
+  // compute values of op * (LO basis) at HO dof coords on reference element
+  lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
+
   // loop over element blocks
   std::vector<std::string> elementBlockIds;
   blockedDOFMngr->getElementBlockIds(elementBlockIds);
@@ -381,44 +386,48 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP<const panzer::LinearObjFact
     Kokkos::View<int *, DeviceSpace> elementIds_d("elementIds_d", elementIds_h.extent(0));
     Kokkos::deep_copy(elementIds_d, elementIds_h);
 
-    for(std::size_t elemIter = 0; elemIter < elementIds.size(); elemIter += numCells) {
+    // get element orientations
+    typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace>     elemOrts_d ("elemOrts_d",  elementIds.size());
+    {
+      typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices);
+      typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>             elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices);
 
-      // get element orientations
-      for (int cellNo = 0; cellNo < numCells; cellNo++) {
-        if (elemIter+cellNo >= elementIds.size())
-          continue;
-        const GlobalOrdinal* node_ids = node_conn->getConnectivity(elementIds[elemIter+cellNo]);
+      for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) {
+        const GlobalOrdinal* node_ids = node_conn->getConnectivity(elementIds[cellNo]);
         for(int i = 0; i < numElemVertices; i++)
           elemNodes_h(cellNo, i) = node_ids[i];
       }
       Kokkos::deep_copy(elemNodes_d, elemNodes_h);
 
       ots::getOrientation(elemOrts_d, elemNodes_d, topology);
+    }
 
-      // HO dof coordinates and coefficients
-      ho_basis->getDofCoords(ho_dofCoords_d);
+    for(std::size_t elemIter = 0; elemIter < elementIds.size(); elemIter += numCells) {
 
-      // compute values of op * (LO basis) at HO dof coords on reference element
-      lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
+      int endCellRange =
+          std::min(numCells, Teuchos::as<int>(elementIds.size()) -
+                                 Teuchos::as<int>(elemIter));
+
+      // get subviews on workset
+      auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size())));
+      auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+      auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL());
 
       // apply orientations for LO basis
       // shuffles things in the second dimension, i.e. wrt LO basis
-      ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d,
+      ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                     valuesAtDofCoordsNonOriented_d,
-                                    elemOrts_d,
+                                    elemOrtsWorkset_d,
                                     lo_basis.get());
 
       //get basis coefficients of LO basis functions wrt HO basis
       for(size_t loIter=0; loIter<loCardinality; loIter++)
         // Get basis coeffs wrt HO basis on reference element.
         // basisCoeffsLI has dimensions (numCells, numFields=hoCardinality, loCardinality)
-        li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::ALL(), Kokkos::ALL(), loIter),
-                           Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::ALL(), loIter, Kokkos::ALL(), Kokkos::ALL()),
-                           ho_basis.get(), elemOrts_d);
-
-      int endCellRange =
-          std::min(numCells, Teuchos::as<int>(elementIds.size()) -
-                                 Teuchos::as<int>(elemIter));
+        li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), loIter),
+                           Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), loIter, Kokkos::ALL(), Kokkos::ALL()),
+                           ho_basis.get(),
+                           elemOrtsWorkset_d);
 
 #ifdef PANZER_HAVE_EPETRA_STACK
       if (tblof.is_null()) { // Epetra fill
@@ -426,7 +435,7 @@ Teko::LinearOp buildInterpolation(const Teuchos::RCP<const panzer::LinearObjFact
 	Kokkos::View<LocalOrdinal*,DeviceSpace> indices_d("indices", loCardinality);
 	Kokkos::View<Scalar*,      DeviceSpace> values_d ("values",  loCardinality);
 
-	
+
         for (int cellNo = 0; cellNo < endCellRange; cellNo++) {
           auto elemId = elementIds_d(elemIter+cellNo);
 
diff --git a/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp b/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp
index 8302820fcc1d..cc54d77fdd4e 100644
--- a/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp
+++ b/packages/panzer/mini-em/src/solvers/MiniEM_MatrixFreeInterpolationOp.cpp
@@ -289,6 +289,12 @@ namespace mini_em {
     Kokkos::View<LocalOrdinal*,DeviceSpace> node_elementLidToConn_d("node_elementLidToConn_d", node_elementLidToConn_h.extent(0));
     Kokkos::deep_copy(node_elementLidToConn_d, node_elementLidToConn_h);
 
+    // HO dof coordinates and coefficients
+    ho_basis->getDofCoords(ho_dofCoords_d);
+
+    // compute values of op * (LO basis) at HO dof coords on reference element
+    lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
+
     // loop over element blocks
     std::vector<std::string> elementBlockIds;
     blockedDOFMngr->getElementBlockIds(elementBlockIds);
@@ -299,31 +305,39 @@ namespace mini_em {
       Kokkos::View<int*,DeviceSpace>::HostMirror elementIds_h(elementIds.data(), elementIds.size());
       Kokkos::View<int*,DeviceSpace> elementIds_d("elementIds_d", elementIds_h.extent(0));
       Kokkos::deep_copy(elementIds_d, elementIds_h);
-      for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
 
-        // get element orientations
-        Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp::connectivity",
-                             range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
-                             KOKKOS_LAMBDA(const LocalOrdinal cellNo) {
-                               LocalOrdinal cellNo2 = elemIter+cellNo;
-                               LocalOrdinal elementID = elementIds_d(cellNo2);
-                               LocalOrdinal k = node_elementLidToConn_d(elementID);
-                               for(int i = 0; i < node_connectivitySize_d(elementID); i++)
-                                 elemNodes_d(cellNo, i) = node_connectivity_d(k+i);
-                             });
+      // get element orientations
+      typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace>     elemOrts_d ("elemOrts_d",  elementIds.size());
+      {
+        typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices);
+        typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>             elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices);
+
+        for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) {
+          const GlobalOrdinal* node_ids = node_conn_->getConnectivity(elementIds[cellNo]);
+          for(int i = 0; i < numElemVertices; i++)
+            elemNodes_h(cellNo, i) = node_ids[i];
+        }
+        Kokkos::deep_copy(elemNodes_d, elemNodes_h);
+
         ots::getOrientation(elemOrts_d, elemNodes_d, topology);
+      }
+
+      for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
 
-        // HO dof coordinates
-        ho_basis->getDofCoords(ho_dofCoords_d);
+        int endCellRange =
+          std::min(numCells, Teuchos::as<int>(elementIds.size()) -
+                             Teuchos::as<int>(elemIter));
 
-        // compute values of op * (LO basis) at HO dof coords on reference element
-        lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
+        // get subviews on workset
+        auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size())));
+        auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL());
 
         // apply orientations for LO basis
         // shuffles things in the second dimension, i.e. wrt LO basis
-        ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d,
+        ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                       valuesAtDofCoordsNonOriented_d,
-                                      elemOrts_d,
+                                      elemOrtsWorkset_d,
                                       lo_basis.get());
 
         Kokkos::deep_copy(reducedValuesAtDofCoordsOriented_d, 0.0);
@@ -346,10 +360,10 @@ namespace mini_em {
 
 
         for (size_t j = 0; j<numVectors; ++j)
-          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::ALL(), Kokkos::ALL(), j),
-                             Kokkos::subview(reducedValuesAtDofCoordsOriented_d, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), j),
+          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), j),
+                             Kokkos::subview(reducedValuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), j),
                              ho_basis.get(),
-                             elemOrts_d
+                             elemOrtsWorkset_d
                              );
 
         auto owner_d = owner_d_;
@@ -479,6 +493,12 @@ namespace mini_em {
     auto loLIDs_d = lo_ugi->getLIDs();
     Kokkos::fence();
 
+    // HO dof coordinates and coefficients
+    ho_basis->getDofCoords(ho_dofCoords_d);
+
+    // compute values of op * (LO basis) at HO dof coords on reference element
+    lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
+
     // loop over element blocks
     std::vector<std::string> elementBlockIds;
     blockedDOFMngr->getElementBlockIds(elementBlockIds);
@@ -490,46 +510,40 @@ namespace mini_em {
       Kokkos::View<int*,DeviceSpace> elementIds_d("elementIds_d", elementIds_h.extent(0));
       Kokkos::deep_copy(elementIds_d, elementIds_h);
       Kokkos::fence();
-      for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
 
-        // get element orientations
-        auto node_connectivity_h = node_conn_->getConnectivityView();
-        Kokkos::View<GlobalOrdinal*,DeviceSpace> node_connectivity_d("node_connectivity_d", node_connectivity_h.extent(0));
-        Kokkos::deep_copy(node_connectivity_d, node_connectivity_h);
-        auto node_connectivitySize_h = node_conn_->getConnectivitySizeView();
-        Kokkos::View<LocalOrdinal*,DeviceSpace> node_connectivitySize_d("node_connectivitySize_d", node_connectivitySize_h.extent(0));
-        Kokkos::deep_copy(node_connectivitySize_d, node_connectivitySize_h);
-        auto node_elementLidToConn_h = node_conn_->getElementLidToConnView();
-        Kokkos::View<LocalOrdinal*,DeviceSpace> node_elementLidToConn_d("node_elementLidToConn_d", node_elementLidToConn_h.extent(0));
-        Kokkos::deep_copy(node_elementLidToConn_d, node_elementLidToConn_h);
-        Kokkos::fence();
+      // get element orientations
+      typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace>     elemOrts_d ("elemOrts_d",  elementIds.size());
+      {
+        typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>::HostMirror elemNodes_h("elemNodes_h", elementIds.size(), numElemVertices);
+        typename Kokkos::DynRankView<GlobalOrdinal, DeviceSpace>             elemNodes_d("elemNodes_d", elementIds.size(), numElemVertices);
+
+        for (int cellNo = 0; cellNo < elementIds.size(); cellNo++) {
+          const GlobalOrdinal* node_ids = node_conn_->getConnectivity(elementIds[cellNo]);
+          for(int i = 0; i < numElemVertices; i++)
+            elemNodes_h(cellNo, i) = node_ids[i];
+        }
+        Kokkos::deep_copy(elemNodes_d, elemNodes_h);
 
-        Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp::connectivity",
-                             range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
-                             KOKKOS_LAMBDA(const LocalOrdinal cellNo) {
-                               LocalOrdinal cellNo2 = elemIter+cellNo;
-                               LocalOrdinal elementID = elementIds_d(cellNo2);
-                               LocalOrdinal k = node_elementLidToConn_d(elementID);
-                               for(int i = 0; i < node_connectivitySize_d(elementID); i++)
-                                 elemNodes_d(cellNo, i) = node_connectivity_d(k+i);
-                             });
-        Kokkos::fence();
         ots::getOrientation(elemOrts_d, elemNodes_d, topology);
-        Kokkos::fence();
+      }
 
-        // HO dof coordinates and coefficients
-        ho_basis->getDofCoords(ho_dofCoords_d);
-        Kokkos::fence();
+      for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
+
+        int endCellRange =
+          std::min(numCells, Teuchos::as<int>(elementIds.size()) -
+                             Teuchos::as<int>(elemIter));
+
+        // get subviews on workset
+        auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elementIds.size())));
+        auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::make_pair(0, endCellRange), Kokkos::ALL(), Kokkos::ALL());
 
-        // compute values of op * (LO basis) at HO dof coords on reference element
-        lo_basis->getValues(valuesAtDofCoordsNonOriented_d, ho_dofCoords_d, op);
-        Kokkos::fence();
 
         // apply orientations for LO basis
         // shuffles things in the second dimension, i.e. wrt LO basis
-        ots::modifyBasisByOrientation(valuesAtDofCoordsOriented_d,
+        ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                       valuesAtDofCoordsNonOriented_d,
-                                      elemOrts_d,
+                                      elemOrtsWorkset_d,
                                       lo_basis.get());
         Kokkos::fence();
 
@@ -537,9 +551,9 @@ namespace mini_em {
         for(size_t loIter=0; loIter<loCardinality; loIter++)
           // Get basis coeffs wrt HO basis on reference element.
           // basisCoeffsLI has dimensions (numCells, numFields=hoCardinality, loCardinality)
-          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOriented_d, Kokkos::ALL(), Kokkos::ALL(), loIter),
-                             Kokkos::subview(valuesAtDofCoordsOriented_d, Kokkos::ALL(), loIter, Kokkos::ALL(), Kokkos::ALL()),
-                             ho_basis.get(), elemOrts_d);
+          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), loIter),
+                             Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), loIter, Kokkos::ALL(), Kokkos::ALL()),
+                             ho_basis.get(), elemOrtsWorkset_d);
         Kokkos::fence();
 
         auto owner_d = owner_d_;
diff --git a/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.cpp b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.cpp
new file mode 100644
index 000000000000..144b82069411
--- /dev/null
+++ b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.cpp
@@ -0,0 +1,324 @@
+/*
+// @HEADER
+//
+// ***********************************************************************
+//
+//      Teko: A package for block and physics based preconditioning
+//                  Copyright 2010 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Eric C. Cyr (eccyr@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+
+*/
+
+#include "Teko_BlockImplicitLinearOp.hpp"
+#include "Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp"
+
+using Teuchos::rcp;
+using Teuchos::RCP;
+
+namespace Teko {
+
+namespace {
+
+BlockedMultiVector extractSubBlockVector(const BlockedMultiVector& blo,
+                                         const std::vector<int>& rows) {
+  const int nrows = rows.size();
+  TEUCHOS_ASSERT(blockCount(blo) >= nrows);
+  std::vector<MultiVector> subVectors;
+  for (int row_id = 0; row_id < nrows; ++row_id) {
+    const auto row = rows[row_id];
+    subVectors.push_back(getBlock(row, blo));
+  }
+
+  return buildBlockedMultiVector(subVectors);
+}
+
+BlockedLinearOp extractSubblockMatrix(const BlockedLinearOp& blo, const std::vector<int>& rows,
+                                      const std::vector<int>& cols) {
+  int nOuterBlockRows = blockRowCount(blo);
+  for (auto&& row : rows) {
+    TEUCHOS_ASSERT(row < nOuterBlockRows);
+  }
+  int nOuterBlockCols = blockColCount(blo);
+  for (auto&& col : cols) {
+    TEUCHOS_ASSERT(col < nOuterBlockCols);
+  }
+
+  // allocate new operator
+  auto subblock = createBlockedOp();
+
+  const int nInnerBlockRows = rows.size();
+  const int nInnerBlockCols = cols.size();
+
+  // build new operator
+  subblock->beginBlockFill(nInnerBlockRows, nInnerBlockCols);
+  for (int innerBlockRow = 0U; innerBlockRow < nInnerBlockRows; ++innerBlockRow) {
+    for (int innerBlockCol = 0U; innerBlockCol < nInnerBlockCols; ++innerBlockCol) {
+      auto [outerBlockRow, outerBlockCol] =
+          std::make_tuple(rows[innerBlockRow], cols[innerBlockCol]);
+      auto A_row_col = blo->getBlock(outerBlockRow, outerBlockCol);
+
+      if (A_row_col != Teuchos::null) {
+        subblock->setBlock(innerBlockRow, innerBlockCol, A_row_col);
+      } else {
+        // scan to find first non-null range/domain, construct zero operator
+        VectorSpace range;
+        VectorSpace domain;
+
+        for (int outerBlock = 0; outerBlock < nOuterBlockCols; ++outerBlock) {
+          auto A_ij = blo->getBlock(outerBlockRow, outerBlock);
+          if (A_ij != Teuchos::null) {
+            range = A_ij->range();
+            break;
+          }
+        }
+
+        for (int outerBlock = 0; outerBlock < nOuterBlockRows; ++outerBlock) {
+          auto A_ij = blo->getBlock(outerBlock, outerBlockCol);
+          if (A_ij != Teuchos::null) {
+            domain = A_ij->domain();
+            break;
+          }
+        }
+
+        TEUCHOS_ASSERT(range != Teuchos::null);
+        TEUCHOS_ASSERT(domain != Teuchos::null);
+
+        subblock->setBlock(innerBlockRow, innerBlockCol, zero(range, domain));
+      }
+    }
+  }
+
+  subblock->endBlockFill();
+
+  return subblock;
+}
+
+std::vector<std::string> tokenize_input(std::string input, const std::string& delimiter) {
+  size_t pos = 0;
+  std::vector<std::string> tokens;
+  while ((pos = input.find(delimiter)) != std::string::npos) {
+    tokens.push_back(input.substr(0, pos));
+    input.erase(0, pos + delimiter.length());
+  }
+  tokens.push_back(input);
+  return tokens;
+}
+
+// Parse hierarchical block number from:
+// <ParameterList name="Hierarchical Block 1">
+// </ParameterList>
+std::optional<int> extract_hierarchical_block_number(const Teuchos::ParameterList& params,
+                                                     const std::string& key) {
+  const std::string blockHierarchy = "Hierarchical Block ";
+  if (key.find(blockHierarchy) == std::string::npos) return {};
+  if (!params.isSublist(key)) return {};
+
+  int blockNumber = -1;
+  try {
+    auto tokens = tokenize_input(key, " ");
+    blockNumber = std::stoi(tokens.back());
+  } catch (std::exception& err) {
+    std::ostringstream ss;
+    ss << "Error occured when trying to parse entry with key " << key << "\n";
+    ss << "It said:\n" << err.what() << "\n";
+    TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, ss.str());
+  }
+  return blockNumber - 1;  // adjust to 0-based indexing
+}
+
+std::vector<int> extract_block_numbers(const Teuchos::ParameterList& params) {
+  const std::string includedBlocks = "Included Subblocks";
+
+  std::ostringstream ss;
+  ss << "Parameter 'Included Subblocks' is missing for params:\n" << params;
+  TEUCHOS_TEST_FOR_EXCEPTION(!params.isParameter(includedBlocks), std::runtime_error, ss.str());
+  std::vector<int> blocks;
+  try {
+    auto blockStrs = tokenize_input(params.get<std::string>(includedBlocks), ",");
+    for (const auto& blockStr : blockStrs) {
+      blocks.emplace_back(std::stoi(blockStr) - 1);  // adjust to 0-based indexing
+    }
+  } catch (std::exception& err) {
+    std::ostringstream errSS;
+    errSS << "Error occured when trying to parse 'Included SubBlocks' for params:\n"
+          << params << "\n";
+    errSS << "It said:\n" << err.what() << "\n";
+    TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, errSS.str());
+  }
+  return blocks;
+}
+
+unsigned index(unsigned i, unsigned j, unsigned N) { return i * N + j; }
+
+}  // namespace
+
+NestedBlockGS::NestedBlockGS(const std::map<int, std::vector<int>>& blockToRow_,
+                             const std::map<int, LinearOp>& blockToInvOp_, BlockedLinearOp& A_,
+                             bool useLowerTriangle_)
+    : blockToRow(blockToRow_),
+      blockToInvOp(blockToInvOp_),
+      A(A_),
+      useLowerTriangle(useLowerTriangle_) {
+  productRange_    = A->productRange();
+  productDomain_   = A->productDomain();
+  const auto Nrows = blockToRow.size();
+  Ab.resize(Nrows * Nrows);
+  for (auto [hierchicalBlockRow, rows] : blockToRow) {
+    for (auto [hierchicalBlockCol, cols] : blockToRow) {
+      Ab[index(hierchicalBlockRow, hierchicalBlockCol, Nrows)] =
+          extractSubblockMatrix(A, rows, cols);
+    }
+  }
+}
+
+void NestedBlockGS::implicitApply(const BlockedMultiVector& r, BlockedMultiVector& z,
+                                  const double alpha, const double beta) const {
+  const int blocks = blockToRow.size();
+
+  auto src = deepcopy(r);
+  std::vector<BlockedMultiVector> rVec;
+  std::vector<BlockedMultiVector> zVec;
+
+  for (int b = 0; b < blocks; b++) {
+    rVec.push_back(extractSubBlockVector(src, blockToRow.at(b)));
+    zVec.push_back(extractSubBlockVector(z, blockToRow.at(b)));
+  }
+
+  if (useLowerTriangle) {
+    lowerTriangularImplicitApply(rVec, zVec, alpha, beta);
+  } else {
+    upperTriangularImplicitApply(rVec, zVec, alpha, beta);
+  }
+}
+
+void NestedBlockGS::upperTriangularImplicitApply(std::vector<BlockedMultiVector>& r,
+                                                 std::vector<BlockedMultiVector>& z,
+                                                 const double /* alpha */,
+                                                 const double /* beta */) const {
+  const int blocks = blockToRow.size();
+
+  for (int b = blocks - 1; b >= 0; b--) {
+    applyOp(blockToInvOp.at(b), r[b], z[b]);
+
+    for (int i = 0; i < b; i++) {
+      auto u_ib = Ab[index(i, b, blocks)];
+      if (u_ib != Teuchos::null) {
+        applyOp(u_ib, z[b], r[i], -1.0, 1.0);
+      }
+    }
+  }
+}
+
+void NestedBlockGS::lowerTriangularImplicitApply(std::vector<BlockedMultiVector>& r,
+                                                 std::vector<BlockedMultiVector>& z,
+                                                 const double /* alpha */,
+                                                 const double /* beta */) const {
+  const int blocks = blockToRow.size();
+
+  for (int b = 0; b < blocks; b++) {
+    applyOp(blockToInvOp.at(b), r[b], z[b]);
+
+    // loop over each row
+    for (int i = b + 1; i < blocks; i++) {
+      auto l_ib = Ab[index(i, b, blocks)];
+      if (l_ib != Teuchos::null) {
+        applyOp(l_ib, z[b], r[i], -1.0, 1.0);
+      }
+    }
+  }
+}
+
+HierarchicalGaussSeidelPreconditionerFactory::HierarchicalGaussSeidelPreconditionerFactory() =
+    default;
+
+LinearOp HierarchicalGaussSeidelPreconditionerFactory::buildPreconditionerOperator(
+    BlockedLinearOp& blo, BlockPreconditionerState& state) const {
+  for (auto [hierarchicalBlockNum, blocks] : blockToRow) {
+    auto A_bb                          = extractSubblockMatrix(blo, blocks, blocks);
+    blockToInvOp[hierarchicalBlockNum] = this->buildBlockInverse(
+        *blockToInverse.at(hierarchicalBlockNum), A_bb, state, hierarchicalBlockNum);
+  }
+
+  return Teuchos::rcp(new NestedBlockGS(blockToRow, blockToInvOp, blo, useLowerTriangle));
+}
+
+LinearOp HierarchicalGaussSeidelPreconditionerFactory::buildBlockInverse(
+    const InverseFactory& invFact, const BlockedLinearOp& matrix, BlockPreconditionerState& state,
+    int hierarchicalBlockNum) const {
+  std::stringstream ss;
+  ss << "hierarchical_block_" << hierarchicalBlockNum;
+
+  ModifiableLinearOp& invOp = state.getModifiableOp(ss.str());
+
+  if (invOp == Teuchos::null) {
+    invOp = buildInverse(invFact, matrix);
+  } else {
+    rebuildInverse(invFact, matrix, invOp);
+  }
+
+  return invOp;
+}
+
+void HierarchicalGaussSeidelPreconditionerFactory::initializeFromParameterList(
+    const Teuchos::ParameterList& pl) {
+  RCP<const InverseLibrary> invLib = getInverseLibrary();
+
+  for (const auto& entry : pl) {
+    const auto key               = entry.first;
+    const auto value             = entry.second;
+    auto hierarchicalBlockNumber = extract_hierarchical_block_number(pl, key);
+    if (!hierarchicalBlockNumber) continue;
+    const auto& hierarchicalParams       = pl.sublist(key);
+    blockToRow[*hierarchicalBlockNumber] = extract_block_numbers(hierarchicalParams);
+
+    std::ostringstream ss;
+    ss << "Missing required parameter \"Inverse Type\" for hierarchical block "
+       << *hierarchicalBlockNumber << "\n";
+    TEUCHOS_TEST_FOR_EXCEPTION(!hierarchicalParams.isParameter("Inverse Type"), std::runtime_error,
+                               ss.str());
+    auto invStr                              = hierarchicalParams.get<std::string>("Inverse Type");
+    blockToInverse[*hierarchicalBlockNumber] = invLib->getInverseFactory(invStr);
+  }
+
+  useLowerTriangle = false;
+  if (pl.isParameter("Use Upper Triangle")) {
+    useLowerTriangle = !pl.get<bool>("Use Upper Triangle");
+  }
+}
+
+}  // namespace Teko
diff --git a/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp
new file mode 100644
index 000000000000..b0ab109c7607
--- /dev/null
+++ b/packages/teko/src/Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp
@@ -0,0 +1,118 @@
+/*
+// @HEADER
+//
+// ***********************************************************************
+//
+//      Teko: A package for block and physics based preconditioning
+//                  Copyright 2010 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Eric C. Cyr (eccyr@sandia.gov)
+//
+// ***********************************************************************
+//
+// @HEADER
+
+*/
+
+#ifndef __Teko_HierarchicalGaussSeidelPreconditionerFactory_hpp__
+#define __Teko_HierarchicalGaussSeidelPreconditionerFactory_hpp__
+
+#include "Teuchos_RCP.hpp"
+
+#include "Teko_BlockPreconditionerFactory.hpp"
+#include "Teko_BlockImplicitLinearOp.hpp"
+#include "Teko_Utilities.hpp"
+#include <map>
+#include <vector>
+
+namespace Teko {
+
+class NestedBlockGS : public BlockImplicitLinearOp {
+ public:
+  NestedBlockGS(const std::map<int, std::vector<int>>& blockToRow_,
+                const std::map<int, LinearOp>& blockToInvOp_, BlockedLinearOp& A_,
+                bool useLowerTriangle_ = false);
+
+  VectorSpace range() const override { return productRange_; }
+  VectorSpace domain() const override { return productDomain_; }
+
+  void implicitApply(const BlockedMultiVector& r, BlockedMultiVector& z, const double alpha = 1.0,
+                     const double beta = 0.0) const override;
+
+ private:
+  void upperTriangularImplicitApply(std::vector<BlockedMultiVector>& r,
+                                    std::vector<BlockedMultiVector>& z, const double alpha = 1.0,
+                                    const double beta = 0.0) const;
+
+  void lowerTriangularImplicitApply(std::vector<BlockedMultiVector>& r,
+                                    std::vector<BlockedMultiVector>& z, const double alpha = 1.0,
+                                    const double beta = 0.0) const;
+
+  // block operators
+  std::map<int, std::vector<int>> blockToRow;
+  std::map<int, LinearOp> blockToInvOp;
+  std::vector<LinearOp> invOps;
+  BlockedLinearOp A;
+  std::vector<BlockedLinearOp> Ab;
+  bool useLowerTriangle = false;
+
+  Teuchos::RCP<const Thyra::ProductVectorSpaceBase<double>> productRange_;  ///< Range vector space.
+  Teuchos::RCP<const Thyra::ProductVectorSpaceBase<double>>
+      productDomain_;  ///< Domain vector space.
+};
+
+class HierarchicalGaussSeidelPreconditionerFactory : public BlockPreconditionerFactory {
+ public:
+  ~HierarchicalGaussSeidelPreconditionerFactory() override = default;
+  HierarchicalGaussSeidelPreconditionerFactory();
+
+  LinearOp buildPreconditionerOperator(BlockedLinearOp& blo,
+                                       BlockPreconditionerState& state) const override;
+
+ protected:
+  void initializeFromParameterList(const Teuchos::ParameterList& pl) override;
+  using BlockPreconditionerFactory::buildPreconditionerOperator;
+
+ private:
+  LinearOp buildBlockInverse(const InverseFactory& invFact, const BlockedLinearOp& matrix,
+                             BlockPreconditionerState& state, int hierarchicalBlockNum) const;
+
+  std::map<int, std::vector<int>> blockToRow;
+  std::map<int, Teuchos::RCP<InverseFactory>> blockToInverse;
+  mutable std::map<int, LinearOp> blockToInvOp;
+
+  bool useLowerTriangle = false;
+};
+}  // end namespace Teko
+
+#endif
diff --git a/packages/teko/src/Teko_PreconditionerFactory.cpp b/packages/teko/src/Teko_PreconditionerFactory.cpp
index f58867056b9e..31801b9665ac 100644
--- a/packages/teko/src/Teko_PreconditionerFactory.cpp
+++ b/packages/teko/src/Teko_PreconditionerFactory.cpp
@@ -53,6 +53,7 @@
 // Specific preconditioners included for dynamic creation
 #include "Teko_JacobiPreconditionerFactory.hpp"
 #include "Teko_GaussSeidelPreconditionerFactory.hpp"
+#include "Teko_HierarchicalGaussSeidelPreconditionerFactory.hpp"
 #include "Teko_AddPreconditionerFactory.hpp"
 #include "Teko_MultPreconditionerFactory.hpp"
 #include "Teko_LU2x2PreconditionerFactory.hpp"
@@ -278,6 +279,9 @@ void PreconditionerFactory::initializePrecFactoryBuilder() {
   clone = rcp(new AutoClone<GaussSeidelPreconditionerFactory>());
   precFactoryBuilder_.addClone("Block Gauss-Seidel", clone);
 
+  clone = rcp(new AutoClone<HierarchicalGaussSeidelPreconditionerFactory>());
+  precFactoryBuilder_.addClone("Hierarchical Block Gauss-Seidel", clone);
+
   clone = rcp(new AutoClone<AddPreconditionerFactory>());
   precFactoryBuilder_.addClone("Block Add", clone);
 
diff --git a/packages/teko/src/Teko_Utilities.hpp b/packages/teko/src/Teko_Utilities.hpp
index bf86d9eaf535..f5543898171a 100644
--- a/packages/teko/src/Teko_Utilities.hpp
+++ b/packages/teko/src/Teko_Utilities.hpp
@@ -343,6 +343,10 @@ typedef Teuchos::RCP<Thyra::LinearOpBase<ST> > ModifiableLinearOp;
 //! Build a square zero operator from a single vector space
 inline LinearOp zero(const VectorSpace &vs) { return Thyra::zero<ST>(vs, vs); }
 
+inline LinearOp zero(const VectorSpace &range, const VectorSpace &domain) {
+  return Thyra::zero<ST>(range, domain);
+}
+
 //! Replace nonzeros with a scalar value, used to zero out an operator
 #ifdef TEKO_HAVE_EPETRA
 void putScalar(const ModifiableLinearOp &op, double scalar);
diff --git a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp
index 84f04d3e28a5..6f880c95d84b 100644
--- a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp
+++ b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.cpp
@@ -279,6 +279,13 @@ int tSIMPLEPreconditionerFactory_tpetra::runTest(int verbosity, std::ostream &st
   failcount += status ? 0 : 1;
   totalrun++;
 
+  status = test_hierarchicalSolves(verbosity, failstrm);
+  Teko_TEST_MSG_tpetra(stdstrm, 1, "   \"hierarchicalSolves\" ... PASSED",
+                       "   \"hierarchicalSolves\" ... FAILED");
+  allTests &= status;
+  failcount += status ? 0 : 1;
+  totalrun++;
+
   status = test_diagonal(verbosity, failstrm, 0);
   Teko_TEST_MSG_tpetra(stdstrm, 1, "   \"diagonal(diag)\" ... PASSED",
                        "   \"diagonal(diag)\" ... FAILED");
@@ -751,5 +758,56 @@ bool tSIMPLEPreconditionerFactory_tpetra::test_iterativeSolves(int verbosity, st
   return true;
 }
 
+bool tSIMPLEPreconditionerFactory_tpetra::test_hierarchicalSolves(int verbosity, std::ostream &os) {
+  bool status    = false;
+  bool allPassed = true;
+
+  const RCP<Thyra::PhysicallyBlockedLinearOpBase<ST> > blkOp = Thyra::defaultBlockedLinearOp<ST>();
+  blkOp->beginBlockFill(3, 3);
+  blkOp->setBlock(0, 0, F_);
+  blkOp->setBlock(0, 1, Bt_);
+  blkOp->setBlock(1, 0, B_);
+  blkOp->setBlock(1, 1, C_);
+  blkOp->setBlock(1, 2, B_);
+  blkOp->setBlock(2, 1, Bt_);
+  blkOp->setBlock(2, 2, F_);
+  blkOp->endBlockFill();
+
+  RCP<ParameterList> params = Teuchos::rcp(new ParameterList());
+  ParameterList &tekoList   = params->sublist("Preconditioner Types").sublist("Teko");
+  tekoList.set("Inverse Type", "HBGS");
+  ParameterList &ifl = tekoList.sublist("Inverse Factory Library");
+
+  ifl.sublist("HBGS").set("Type", "Hierarchical Block Gauss-Seidel");
+
+  ifl.sublist("HBGS").sublist("Hierarchical Block 1").set("Included Subblocks", "1,2");
+  ifl.sublist("HBGS").sublist("Hierarchical Block 1").set("Inverse Type", "SIMPLE");
+
+  ifl.sublist("HBGS").sublist("Hierarchical Block 2").set("Included Subblocks", "3");
+  ifl.sublist("HBGS").sublist("Hierarchical Block 2").set("Inverse Type", "SINGLE_BGS");
+
+  ifl.sublist("SIMPLE").set("Type", "NS SIMPLE");
+  ifl.sublist("SIMPLE").set("Inverse Velocity Type", "Belos");
+  ifl.sublist("SIMPLE").set("Preconditioner Velocity Type", "Ifpack2");
+  ifl.sublist("SIMPLE").set("Inverse Pressure Type", "Belos");
+  ifl.sublist("SIMPLE").set("Preconditioner Pressure Type", "Ifpack2");
+
+  ifl.sublist("SINGLE_BGS").set("Type", "Block Gauss-Seidel");
+  ifl.sublist("SINGLE_BGS").set("Inverse Type", "Ifpack2");
+
+  RCP<Teko::InverseLibrary> invLib        = Teko::InverseLibrary::buildFromParameterList(ifl);
+  RCP<const Teko::InverseFactory> invFact = invLib->getInverseFactory("HBGS");
+
+  Teko::ModifiableLinearOp inv = Teko::buildInverse(*invFact, blkOp);
+  TEST_ASSERT(!inv.is_null(), "Constructed preconditioner is null");
+
+  if (!inv.is_null()) {
+    Teko::rebuildInverse(*invFact, blkOp, inv);
+    TEST_ASSERT(!inv.is_null(), "Constructed preconditioner is null");
+  }
+
+  return true;
+}
+
 }  // end namespace Test
 }  // end namespace Teko
diff --git a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp
index c31a423bf3ec..8a1676a82c96 100644
--- a/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp
+++ b/packages/teko/tests/src/tSIMPLEPreconditionerFactory_tpetra.hpp
@@ -73,6 +73,7 @@ class tSIMPLEPreconditionerFactory_tpetra : public UnitTest {
   bool test_uninitializePrec(int verbosity, std::ostream& os);
   bool test_isCompatable(int verbosity, std::ostream& os);
   bool test_iterativeSolves(int verbosity, std::ostream& os);
+  bool test_hierarchicalSolves(int verbosity, std::ostream& os);
 
   // non-member tests
   bool test_result(int verbosity, std::ostream& os, int use_blocking);
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
index febf5fc28ddd..df302e9f1d8e 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
@@ -5093,13 +5093,14 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
       // Decide now whether to use the imbalanced row path, or the default.
       bool useMergePath = false;
       LocalOrdinal nrows = getLocalNumRows();
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
       LocalOrdinal maxRowImbalance = 0;
       if(nrows != 0)
         maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
       //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
       //we can use SPMV_MERGE_PATH for the native spmv as well.
       //Take out this ifdef to enable that.
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+
       if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
         useMergePath = true;
 #endif
diff --git a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp
index 7dd95c747adb..a78d4294e231 100644
--- a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp
+++ b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.cpp
@@ -38,11 +38,13 @@
 // ************************************************************************
 // @HEADER
 */
+// clang-format off
 #include "Tpetra_Details_KokkosCounter.hpp"
 #include "TpetraCore_config.h"
 #include "Kokkos_Core.hpp"
 #include "Teuchos_TestForException.hpp"
 #include <cstring>
+#include <string>
 
 namespace Tpetra {
 namespace Details {
@@ -201,6 +203,54 @@ namespace Details {
     TEUCHOS_TEST_FOR_EXCEPTION(1,std::runtime_error,std::string("Error: ") + device + std::string(" is not a device known to Tpetra"));
   }
 
+// clang-format on
+namespace KokkosRegionCounterDetails {
+std::vector<std::string> regions;
+
+void push_region_callback(const char *label) { regions.push_back(label); }
+static_assert(std::is_same_v<decltype(&push_region_callback),
+                             Kokkos_Profiling_pushFunction>,
+              "Unexpected Kokkos profiling interface API. This is an internal "
+              "Tpetra developer error, please report this.");
+
+} // namespace KokkosRegionCounterDetails
+
+void KokkosRegionCounter::start() {
+  Kokkos::Tools::Experimental::set_push_region_callback(
+      KokkosRegionCounterDetails::push_region_callback);
+}
+
+void KokkosRegionCounter::reset() {
+  KokkosRegionCounterDetails::regions.clear();
+}
+
+void KokkosRegionCounter::stop() {
+  Kokkos::Tools::Experimental::set_push_region_callback(nullptr);
+}
+
+size_t
+KokkosRegionCounter::get_count_region_contains(const std::string &needle) {
+  size_t count = 0;
+  for (const auto &region : KokkosRegionCounterDetails::regions) {
+    count += (region.find(needle) != std::string::npos);
+  }
+  return count;
+}
+
+void KokkosRegionCounter::dump_regions(Teuchos::FancyOStream &os) {
+  for (const auto &region : KokkosRegionCounterDetails::regions) {
+    os << region << "\n";
+  }
+}
+
+void KokkosRegionCounter::dump_regions(std::ostream &os) {
+  for (const auto &region : KokkosRegionCounterDetails::regions) {
+    os << region << "\n";
+  }
+}
+
+
+// clang-format off
 
 
 } // namespace Details
diff --git a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp
index e6a8bccb9375..702819fad61d 100644
--- a/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_KokkosCounter.hpp
@@ -38,6 +38,7 @@
 // ************************************************************************
 // @HEADER
 */
+// clang-format off
 #ifndef TPETRA_DETAILS_KOKKOS_COUNTER_HPP
 #define TPETRA_DETAILS_KOKKOS_COUNTER_HPP
 
@@ -46,6 +47,7 @@
 /// types using the Kokkos Profiling Library
 
 #include <string>
+#include <Teuchos_FancyOStream.hpp>
 
 namespace Tpetra {
 namespace Details {
@@ -87,6 +89,30 @@ namespace FenceCounter {
   size_t get_count_global(const std::string & device);
 }
 
+// clang-format on
+
+/// \brief Counter for Kokkos regions representing third-party library usage
+namespace KokkosRegionCounter {
+/// \brief Start the counter
+void start();
+
+/// \brief Reset the counter
+void reset();
+
+/// \brief Stop the counter
+void stop();
+
+/// \brief How many regions containing `substr` have been seen
+size_t get_count_region_contains(const std::string &substr);
+
+/// \brief Print all observed region labels, separated by newline
+void dump_regions(std::ostream &os);
+void dump_regions(Teuchos::FancyOStream &os);
+} // namespace KokkosRegionCounter
+
+// clang-format off
+
+
 
 } // namespace Details
 } // namespace Tpetra
diff --git a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp
index 302301cd38da..aa6bc75753d0 100644
--- a/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_leftScaleLocalCrsMatrix.hpp
@@ -76,6 +76,8 @@ class LeftScaleLocalCrsMatrix {
   static_assert (ScalingFactorsViewType::rank == 1,
                  "scalingFactors must be a rank-1 Kokkos::View.");
   using device_type = typename LocalSparseMatrixType::device_type;
+  using LO = typename LocalSparseMatrixType::ordinal_type;
+  using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;
 
   /// \param A_lcl [in/out] The local sparse matrix.
   ///
@@ -94,11 +96,11 @@ class LeftScaleLocalCrsMatrix {
   {}
 
   KOKKOS_INLINE_FUNCTION void
-  operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
+  operator () (const typename policy_type::member_type & team) const
   {
-    using LO = typename LocalSparseMatrixType::ordinal_type;
     using KAM = Kokkos::ArithTraits<mag_type>;
 
+    const LO lclRow = team.league_rank();
     const mag_type curRowNorm = scalingFactors_(lclRow);
     // Users are responsible for any divisions or multiplications by
     // zero.
@@ -106,14 +108,14 @@ class LeftScaleLocalCrsMatrix {
       KAM::sqrt (curRowNorm) : curRowNorm;
     auto curRow = A_lcl_.row (lclRow);
     const LO numEnt = curRow.length;
-    for (LO k = 0; k < numEnt; ++k) {
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
       if (divide) { // constexpr, so should get compiled out
         curRow.value (k) = curRow.value(k) / scalingFactor;
       }
       else {
         curRow.value (k) = curRow.value(k) * scalingFactor;
       }
-    }
+    });
   }
 
 private:
@@ -145,7 +147,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
   using device_type = typename LocalSparseMatrixType::device_type;
   using execution_space = typename device_type::execution_space;
   using LO = typename LocalSparseMatrixType::ordinal_type;
-  using range_type = Kokkos::RangePolicy<execution_space, LO>;
+  using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
 
   const LO lclNumRows = A_lcl.numRows ();
   if (divide) {
@@ -154,7 +156,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
         typename ScalingFactorsViewType::const_type, true>;
     functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
     Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
-                          range_type (0, lclNumRows), functor);
+                          policy_type (lclNumRows, Kokkos::AUTO), functor);
   }
   else {
     using functor_type =
@@ -162,7 +164,7 @@ leftScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
         typename ScalingFactorsViewType::const_type, false>;
     functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
     Kokkos::parallel_for ("leftScaleLocalCrsMatrix",
-                          range_type (0, lclNumRows), functor);
+                          policy_type (lclNumRows, Kokkos::AUTO), functor);
   }
 }
 
diff --git a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp
index 219888164161..a6a48e8b2f8e 100644
--- a/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_rightScaleLocalCrsMatrix.hpp
@@ -76,6 +76,8 @@ class RightScaleLocalCrsMatrix {
   static_assert (ScalingFactorsViewType::rank == 1,
                  "scalingFactors must be a rank-1 Kokkos::View.");
   using device_type = typename LocalSparseMatrixType::device_type;
+  using LO = typename LocalSparseMatrixType::ordinal_type;
+  using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;
 
   /// \param A_lcl [in/out] The local sparse matrix.
   ///
@@ -96,14 +98,14 @@ class RightScaleLocalCrsMatrix {
   {}
 
   KOKKOS_INLINE_FUNCTION void
-  operator () (const typename LocalSparseMatrixType::ordinal_type lclRow) const
+  operator () (const typename policy_type::member_type & team) const
   {
-    using LO = typename LocalSparseMatrixType::ordinal_type;
     using KAM = Kokkos::ArithTraits<mag_type>;
 
+    const LO lclRow = team.league_rank();
     auto curRow = A_lcl_.row (lclRow);
     const LO numEnt = curRow.length;
-    for (LO k = 0; k < numEnt; ++k) {
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
       const LO lclColInd = curRow.colidx(k);
       const mag_type curColNorm = scalingFactors_(lclColInd);
       // Users are responsible for any divisions or multiplications by
@@ -116,7 +118,7 @@ class RightScaleLocalCrsMatrix {
       else {
         curRow.value(k) = curRow.value(k) * scalingFactor;
       }
-    }
+    });
   }
 
 private:
@@ -148,7 +150,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
   using device_type = typename LocalSparseMatrixType::device_type;
   using execution_space = typename device_type::execution_space;
   using LO = typename LocalSparseMatrixType::ordinal_type;
-  using range_type = Kokkos::RangePolicy<execution_space, LO>;
+  using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
 
   const LO lclNumRows = A_lcl.numRows ();
   if (divide) {
@@ -157,7 +159,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
         typename ScalingFactorsViewType::const_type, true>;
     functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
     Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
-                          range_type (0, lclNumRows), functor);
+                          policy_type (lclNumRows, Kokkos::AUTO), functor);
   }
   else {
     using functor_type =
@@ -165,7 +167,7 @@ rightScaleLocalCrsMatrix (const LocalSparseMatrixType& A_lcl,
         typename ScalingFactorsViewType::const_type, false>;
     functor_type functor (A_lcl, scalingFactors, assumeSymmetric);
     Kokkos::parallel_for ("rightScaleLocalCrsMatrix",
-                          range_type (0, lclNumRows), functor);
+                          policy_type (lclNumRows, Kokkos::AUTO), functor);
   }
 }
 
diff --git a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
index a526d4438f9e..872e7ea23a6e 100644
--- a/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_computeRowAndColumnOneNorms_def.hpp
@@ -50,6 +50,7 @@
 /// Tpetra_computeRowAndColumnOneNorms_decl.hpp in this directory.
 
 #include "Tpetra_Details_copyConvert.hpp"
+#include "Tpetra_Details_EquilibrationInfo.hpp"
 #include "Tpetra_CrsMatrix.hpp"
 #include "Tpetra_Export.hpp"
 #include "Tpetra_Map.hpp"
@@ -310,6 +311,7 @@ class ComputeLocalRowScaledColumnNorms {
   using val_type = typename Kokkos::ArithTraits<SC>::val_type;
   using mag_type = typename Kokkos::ArithTraits<val_type>::mag_type;
   using device_type = typename crs_matrix_type::device_type;
+  using policy_type = Kokkos::TeamPolicy<typename device_type::execution_space, LO>;
 
   ComputeLocalRowScaledColumnNorms (const Kokkos::View<mag_type*, device_type>& rowScaledColNorms,
                                     const Kokkos::View<const mag_type*, device_type>& rowNorms,
@@ -319,18 +321,19 @@ class ComputeLocalRowScaledColumnNorms {
     A_lcl_ (A.getLocalMatrixDevice ())
   {}
 
-  KOKKOS_INLINE_FUNCTION void operator () (const LO lclRow) const {
+  KOKKOS_INLINE_FUNCTION void operator () (const typename policy_type::member_type &team) const {
     using KAT = Kokkos::ArithTraits<val_type>;
 
+    const LO lclRow = team.league_rank();
     const auto curRow = A_lcl_.rowConst (lclRow);
     const mag_type rowNorm = rowNorms_[lclRow];
     const LO numEnt = curRow.length;
-    for (LO k = 0; k < numEnt; ++k) {
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k) {
       const mag_type matrixAbsVal = KAT::abs (curRow.value(k));
       const LO lclCol = curRow.colidx(k);
 
       Kokkos::atomic_add (&rowScaledColNorms_[lclCol], matrixAbsVal / rowNorm);
-    }
+    });
   }
 
   static void
@@ -339,14 +342,13 @@ class ComputeLocalRowScaledColumnNorms {
        const crs_matrix_type& A)
   {
     using execution_space = typename device_type::execution_space;
-    using range_type = Kokkos::RangePolicy<execution_space, LO>;
     using functor_type = ComputeLocalRowScaledColumnNorms<SC, LO, GO, NT>;
 
     functor_type functor (rowScaledColNorms, rowNorms, A);
     const LO lclNumRows =
       static_cast<LO> (A.getRowMap ()->getLocalNumElements ());
     Kokkos::parallel_for ("computeLocalRowScaledColumnNorms",
-                          range_type (0, lclNumRows), functor);
+                          policy_type (lclNumRows, Kokkos::AUTO), functor);
   }
 
 private:
@@ -409,6 +411,7 @@ class ComputeLocalRowOneNorms {
   using local_matrix_device_type =
     typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
   using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
+  using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;
 
   ComputeLocalRowOneNorms (const equib_info_type& equib,   // in/out
                            const local_matrix_device_type& A_lcl, // in
@@ -441,12 +444,13 @@ class ComputeLocalRowOneNorms {
   }
 
   KOKKOS_INLINE_FUNCTION void
-  operator () (const LO lclRow, value_type& dst) const
+  operator () (const typename policy_type::member_type& team, value_type& dst) const
   {
     using KAT = Kokkos::ArithTraits<val_type>;
     using mag_type = typename KAT::mag_type;
     using KAM = Kokkos::ArithTraits<mag_type>;
 
+    const LO lclRow = team.league_rank();
     const GO gblRow = rowMap_.getGlobalElement (lclRow);
     // OK if invalid(); then we simply won't find the diagonal entry.
     const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
@@ -456,33 +460,37 @@ class ComputeLocalRowOneNorms {
 
     mag_type rowNorm {0.0};
     val_type diagVal {0.0};
+    value_type dstThread {0};
 
-    for (LO k = 0; k < numEnt; ++k) {
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
       const val_type matrixVal = curRow.value (k);
       if (KAT::isInf (matrixVal)) {
-        dst |= 1;
+        dstContrib |= 1;
       }
       if (KAT::isNan (matrixVal)) {
-        dst |= 2;
+        dstContrib |= 2;
       }
       const mag_type matrixAbsVal = KAT::abs (matrixVal);
-      rowNorm += matrixAbsVal;
+      normContrib += matrixAbsVal;
       const LO lclCol = curRow.colidx (k);
       if (lclCol == lclDiagColInd) {
-        diagVal = curRow.value (k); // assume no repeats
+        diagContrib = curRow.value (k); // assume no repeats
       }
-    } // for each entry in row
+    }, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row
 
     // This is a local result.  If the matrix has an overlapping
     // row Map, then the global result might differ.
-    if (diagVal == KAT::zero ()) {
-      dst |= 4;
-    }
-    if (rowNorm == KAM::zero ()) {
-      dst |= 8;
-    }
-    equib_.rowDiagonalEntries[lclRow] = diagVal;
-    equib_.rowNorms[lclRow] = rowNorm;
+    Kokkos::single(Kokkos::PerTeam(team), [&](){
+      dst |= dstThread;
+      if (diagVal == KAT::zero ()) {
+        dst |= 4;
+      }
+      if (rowNorm == KAM::zero ()) {
+        dst |= 8;
+      }
+      equib_.rowDiagonalEntries[lclRow] = diagVal;
+      equib_.rowNorms[lclRow] = rowNorm;
+    });
   }
 
 private:
@@ -501,6 +509,7 @@ class ComputeLocalRowAndColumnOneNorms {
   using equib_info_type = EquilibrationInfo<val_type, typename NT::device_type>;
   using local_matrix_device_type = typename ::Tpetra::CrsMatrix<SC, LO, GO, NT>::local_matrix_device_type;
   using local_map_type = typename ::Tpetra::Map<LO, GO, NT>::local_map_type;
+  using policy_type = Kokkos::TeamPolicy<typename local_matrix_device_type::execution_space, LO>;
 
 public:
   ComputeLocalRowAndColumnOneNorms (const equib_info_type& equib,   // in/out
@@ -534,12 +543,13 @@ class ComputeLocalRowAndColumnOneNorms {
   }
 
   KOKKOS_INLINE_FUNCTION void
-  operator () (const LO lclRow, value_type& dst) const
+  operator () (const typename policy_type::member_type& team, value_type& dst) const
   {
     using KAT = Kokkos::ArithTraits<val_type>;
     using mag_type = typename KAT::mag_type;
     using KAM = Kokkos::ArithTraits<mag_type>;
 
+    const LO lclRow = team.league_rank();
     const GO gblRow = rowMap_.getGlobalElement (lclRow);
     // OK if invalid(); then we simply won't find the diagonal entry.
     const GO lclDiagColInd = colMap_.getLocalElement (gblRow);
@@ -549,46 +559,50 @@ class ComputeLocalRowAndColumnOneNorms {
 
     mag_type rowNorm {0.0};
     val_type diagVal {0.0};
+    value_type dstThread {0};
 
-    for (LO k = 0; k < numEnt; ++k) {
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, numEnt), [&](const LO k, mag_type &normContrib, val_type& diagContrib, value_type& dstContrib) {
       const val_type matrixVal = curRow.value (k);
       if (KAT::isInf (matrixVal)) {
-        dst |= 1;
+        dstContrib |= 1;
       }
       if (KAT::isNan (matrixVal)) {
-        dst |= 2;
+        dstContrib |= 2;
       }
       const mag_type matrixAbsVal = KAT::abs (matrixVal);
-      rowNorm += matrixAbsVal;
+      normContrib += matrixAbsVal;
       const LO lclCol = curRow.colidx (k);
       if (lclCol == lclDiagColInd) {
-        diagVal = curRow.value (k); // assume no repeats
+        diagContrib = curRow.value (k); // assume no repeats
       }
       if (! equib_.assumeSymmetric) {
         Kokkos::atomic_add (&(equib_.colNorms[lclCol]), matrixAbsVal);
       }
-    } // for each entry in row
+    }, Kokkos::Sum<mag_type>(rowNorm), Kokkos::Sum<val_type>(diagVal), Kokkos::BOr<value_type>(dstThread)); // for each entry in row
 
     // This is a local result.  If the matrix has an overlapping
     // row Map, then the global result might differ.
-    if (diagVal == KAT::zero ()) {
-      dst |= 4;
-    }
-    if (rowNorm == KAM::zero ()) {
-      dst |= 8;
-    }
-    // NOTE (mfh 24 May 2018) We could actually compute local
-    // rowScaledColNorms in situ at this point, if ! assumeSymmetric
-    // and row Map is the same as range Map (so that the local row
-    // norms are the same as the global row norms).
-    equib_.rowDiagonalEntries[lclRow] = diagVal;
-    equib_.rowNorms[lclRow] = rowNorm;
-    if (! equib_.assumeSymmetric &&
-        lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
-      // Don't need an atomic update here, since this lclDiagColInd is
-      // a one-to-one function of lclRow.
-      equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
-    }
+    Kokkos::single(Kokkos::PerTeam(team), [&](){
+      dst |= dstThread;
+      if (diagVal == KAT::zero ()) {
+        dst |= 4;
+      }
+      if (rowNorm == KAM::zero ()) {
+        dst |= 8;
+      }
+      // NOTE (mfh 24 May 2018) We could actually compute local
+      // rowScaledColNorms in situ at this point, if ! assumeSymmetric
+      // and row Map is the same as range Map (so that the local row
+      // norms are the same as the global row norms).
+      equib_.rowDiagonalEntries[lclRow] = diagVal;
+      equib_.rowNorms[lclRow] = rowNorm;
+      if (! equib_.assumeSymmetric &&
+          lclDiagColInd != Tpetra::Details::OrdinalTraits<LO>::invalid ()) {
+        // Don't need an atomic update here, since this lclDiagColInd is
+        // a one-to-one function of lclRow.
+        equib_.colDiagonalEntries[lclDiagColInd] += diagVal;
+      }
+    });
   }
 
 private:
@@ -605,7 +619,7 @@ EquilibrationInfo<typename Kokkos::ArithTraits<SC>::val_type, typename NT::devic
 computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
 {
   using execution_space = typename NT::device_type::execution_space;
-  using range_type = Kokkos::RangePolicy<execution_space, LO>;
+  using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
   using functor_type = ComputeLocalRowOneNorms<SC, LO, GO, NT>;
   using val_type = typename Kokkos::ArithTraits<SC>::val_type;
   using device_type = typename NT::device_type;
@@ -621,7 +635,7 @@ computeLocalRowOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO, NT>& A)
                         A.getColMap ()->getLocalMap ());
   int result = 0;
   Kokkos::parallel_reduce ("computeLocalRowOneNorms",
-                           range_type (0, lclNumRows), functor,
+                           policy_type (lclNumRows, Kokkos::AUTO), functor,
                            result);
   equib.foundInf = (result & 1) != 0;
   equib.foundNan = (result & 2) != 0;
@@ -638,7 +652,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
                                             const bool assumeSymmetric)
 {
   using execution_space = typename NT::device_type::execution_space;
-  using range_type = Kokkos::RangePolicy<execution_space, LO>;
+  using policy_type = Kokkos::TeamPolicy<execution_space, LO>;
   using functor_type = ComputeLocalRowAndColumnOneNorms<SC, LO, GO, NT>;
   using val_type = typename Kokkos::ArithTraits<SC>::val_type;
   using device_type = typename NT::device_type;
@@ -653,7 +667,7 @@ computeLocalRowAndColumnOneNorms_CrsMatrix (const Tpetra::CrsMatrix<SC, LO, GO,
                         A.getColMap ()->getLocalMap ());
   int result = 0;
   Kokkos::parallel_reduce ("computeLocalRowAndColumnOneNorms",
-                           range_type (0, lclNumRows), functor,
+                           policy_type (lclNumRows, Kokkos::AUTO), functor,
                            result);
   equib.foundInf = (result & 1) != 0;
   equib.foundNan = (result & 2) != 0;
diff --git a/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp b/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp
index ae747e94194c..70e8472cf3d4 100644
--- a/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp
+++ b/packages/tpetra/core/test/Comm/gpuAwareMpi.cpp
@@ -81,7 +81,7 @@ namespace { // (anonymous)
   typedef Kokkos::Device<Kokkos::HIP, Kokkos::HIPSpace> test_device_type;
   typedef Tpetra::KokkosCompat::KokkosHIPWrapperNode test_node_type;
 #elif defined(KOKKOS_ENABLE_SYCL)
-  typedef Kokkos::Device<Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLSpace> test_device_type;
+  typedef Kokkos::Device<Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace> test_device_type;
   typedef Tpetra::KokkosCompat::KokkosSYCLWrapperNode test_node_type;
 #endif
   typedef Tpetra::Map<>::local_ordinal_type LO;
diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
index 646ff80848a0..8c79f3c14bd3 100644
--- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
+++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
@@ -490,8 +490,29 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   STANDARD_PASS_OUTPUT
   )
 
+if (
+  # supported TPLs
+  (
+    (Tpetra_ENABLE_CUDA AND TPL_ENABLE_CUSPARSE ) OR
+    (Tpetra_ENABLE_HIP  AND TPL_ENABLE_ROCSPARSE)
+  )
 
+  AND
 
+  # supported type combos
+  (
+    (Tpetra_INST_DOUBLE OR Tpetra_INST_FLOAT)
+  )
+)
+  TRIBITS_ADD_EXECUTABLE_AND_TEST(
+    CrsMatrix_ApplyUsesTPLs
+    SOURCES
+      CrsMatrix_ApplyUsesTPLs.cpp
+      ${TEUCHOS_STD_UNIT_TEST_MAIN}
+    COMM serial mpi
+    STANDARD_PASS_OUTPUT
+    )
+endif()
 
 SET(TIMING_INSTALLS "")
 
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp
new file mode 100644
index 000000000000..dff7928d1f12
--- /dev/null
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_ApplyUsesTPLs.cpp
@@ -0,0 +1,286 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//          Tpetra: Templated Linear Algebra Services Package
+//                 Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ************************************************************************
+// @HEADER
+*/
+
+#include "Tpetra_TestingUtilities.hpp"
+#include "Tpetra_MultiVector.hpp"
+#include "Tpetra_CrsMatrix.hpp"
+#include "Tpetra_Details_KokkosCounter.hpp"
+
+// TODO: add test where some nodes have zero rows
+// TODO: add test where non-"zero" graph is used to build matrix; if no values are added to matrix, the operator effect should be zero. This tests that matrix values are initialized properly.
+// TODO: add test where dynamic profile initially has no allocation, then entries are added. this will test new view functionality.
+
+namespace Teuchos {
+  template <>
+    ScalarTraits<int>::magnitudeType
+    relErr( const int &s1, const int &s2 )
+    {
+      typedef ScalarTraits<int> ST;
+      return ST::magnitude(s1-s2);
+    }
+
+  template <>
+    ScalarTraits<char>::magnitudeType
+    relErr( const char &s1, const char &s2 )
+    {
+      typedef ScalarTraits<char> ST;
+      return ST::magnitude(s1-s2);
+    }
+}
+
+namespace {
+
+  // no ScalarTraits<>::eps() for integer types
+
+  template <class Scalar, bool hasMachineParameters> struct TestingTolGuts {};
+
+  template <class Scalar>
+  struct TestingTolGuts<Scalar, true> {
+    static typename Teuchos::ScalarTraits<Scalar>::magnitudeType testingTol()
+      { return Teuchos::ScalarTraits<Scalar>::eps(); }
+  };
+
+  template <class Scalar>
+  struct TestingTolGuts<Scalar, false> {
+    static typename Teuchos::ScalarTraits<Scalar>::magnitudeType testingTol()
+      { return 0; }
+  };
+
+  template <class Scalar>
+  static typename Teuchos::ScalarTraits<Scalar>::magnitudeType testingTol()
+  {
+    return TestingTolGuts<Scalar, Teuchos::ScalarTraits<Scalar>::hasMachineParameters>::
+      testingTol();
+  }
+
+  using Tpetra::TestingUtilities::getDefaultComm;
+
+  using std::endl;
+  using std::swap;
+
+  using std::string;
+
+  using Teuchos::as;
+  using Teuchos::FancyOStream;
+  using Teuchos::RCP;
+  using Teuchos::ArrayRCP;
+  using Teuchos::rcp;
+  using Teuchos::arcp;
+  using Teuchos::outArg;
+  using Teuchos::arcpClone;
+  using Teuchos::arrayView;
+  using Teuchos::broadcast;
+  using Teuchos::OrdinalTraits;
+  using Teuchos::ScalarTraits;
+  using Teuchos::Comm;
+  using Teuchos::Array;
+  using Teuchos::ArrayView;
+  using Teuchos::tuple;
+  using Teuchos::null;
+  using Teuchos::VERB_NONE;
+  using Teuchos::VERB_LOW;
+  using Teuchos::VERB_MEDIUM;
+  using Teuchos::VERB_HIGH;
+  using Teuchos::VERB_EXTREME;
+  using Teuchos::ETransp;
+  using Teuchos::NO_TRANS;
+  using Teuchos::TRANS;
+  using Teuchos::CONJ_TRANS;
+  using Teuchos::EDiag;
+  using Teuchos::UNIT_DIAG;
+  using Teuchos::NON_UNIT_DIAG;
+  using Teuchos::EUplo;
+  using Teuchos::UPPER_TRI;
+  using Teuchos::LOWER_TRI;
+  using Teuchos::ParameterList;
+  using Teuchos::parameterList;
+
+  using Tpetra::Map;
+  using Tpetra::MultiVector;
+  using Tpetra::Vector;
+  using Tpetra::Operator;
+  using Tpetra::CrsMatrix;
+  using Tpetra::CrsGraph;
+  using Tpetra::RowMatrix;
+  using Tpetra::Import;
+  using Tpetra::global_size_t;
+  using Tpetra::createContigMapWithNode;
+  using Tpetra::createLocalMapWithNode;
+  using Tpetra::createVector;
+  using Tpetra::OptimizeOption;
+  using Tpetra::DoOptimizeStorage;
+  using Tpetra::DoNotOptimizeStorage;
+  using Tpetra::GloballyDistributed;
+  using Tpetra::INSERT;
+
+  //
+  // UNIT TESTS
+  //
+
+  ////
+  TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, NonSquare, LO, GO, Scalar, Node )
+  {
+
+    // skip test if Scalar is not (float or double)
+    if constexpr (!(std::is_same_v<Scalar, float> || std::is_same_v<Scalar, double>)) {
+      out << "SKIP: unsupported scalar type" << std::endl;
+      TEST_EQUALITY_CONST(1,1); // SKIP
+      return;
+    }
+    // skip test if LO != int
+    if constexpr (!std::is_same_v<LO, int>) {
+      out << "SKIP: unsupported local ordinal type" << std::endl;
+      TEST_EQUALITY_CONST(1,1); // SKIP
+      return;
+    }
+    // skip test if CUDA enables and not CUDA space
+  #if defined(HAVE_TPETRA_CUDA) || defined(HAVE_TPETRACORE_CUDA)
+    if constexpr (!std::is_same_v<typename Node::execution_space, Kokkos::Cuda>) {
+      out << "SKIP: non-CUDA exec space" << std::endl;
+      TEST_EQUALITY_CONST(1,1); // SKIP
+      return;
+    }
+  #endif
+  // skip test if HIP enabled and not HIP space
+  #if defined(HAVE_TPETRA_HIP) || defined(HAVE_TPETRACORE_HIP)
+    if constexpr (!std::is_same_v<typename Node::execution_space, Kokkos::HIP>) {
+      out << "SKIP: non-HIP exec space" << std::endl;
+      TEST_EQUALITY_CONST(1,1); // SKIP
+      return;
+    }
+  #endif
+
+    typedef CrsMatrix<Scalar,LO,GO,Node> MAT;
+    typedef MultiVector<Scalar,LO,GO,Node> MV;
+    typedef Map<LO,GO,Node> map_type;
+    const global_size_t INVALID = OrdinalTraits<global_size_t>::invalid();
+    // get a comm
+    RCP<const Comm<int> > comm = getDefaultComm();
+    const int M = 3;
+    const int P = 5;
+    const int N = comm->getSize();
+    const int myImageID = comm->getRank();
+    // create Maps
+    // matrix is M*N-by-P
+    //                  col
+    //            0        1                  P-1
+    //    0  [0        MN              ... (P-1)MN     ]
+    //    .  [...      ...                 ...         ]
+    //    0  [M-1      MN+M-1              (P-1)MN+M-1 ]
+    //p   1  [M        MN+M                            ]
+    //r   .  [...      ...                             ] = [A_ij], where A_ij = i+jMN
+    //o   1  [2M-1     MN+2M-1                         ]
+    //c   .  [...                                      ]
+    //   N-1 [(N-1)M   MN+(N-1)(M-1)                   ]
+    //    .  [...      ...                             ]
+    //   N-1 [MN-1     MN+MN-1                         ]
+    //
+    // row map, range map is [0,M-1] [M,2M-1] [2M,3M-1] ... [MN-M,MN-1]
+    // domain map will be map for X (lclmap)
+    //
+    // input multivector X is not distributed:
+    //
+    //   X = [  0    P    ...  (numVecs-1)P ]
+    //       [ ...  ....  ...       ...     ] = [X_ji], where X_ij = i+jP
+    //       [ P-1  2P-1  ...   numVecs*P-1 ]
+    //
+    // the result of the non-transpose multiplication should be
+    //                              P-1
+    // (A*X)_ij = sum_k A_ik X_kj = sum (i+kMN)(k+jP) = jiP^2 + (i+jMNP)(P^2-P)/2 + MNP(P-1)(2P-1)/6
+    //                              k=0
+    //
+    //
+    //
+    const int numVecs  = 3;
+    RCP<const map_type> rowmap (new map_type (INVALID, M, 0, comm));
+    RCP<const map_type> lclmap = createLocalMapWithNode<LO,GO,Node> (P, comm);
+
+    // create the matrix
+    MAT A(rowmap,P);
+    for (GO i=0; i<static_cast<GO>(M); ++i) {
+      for (GO j=0; j<static_cast<GO>(P); ++j) {
+        A.insertGlobalValues( M*myImageID+i, tuple<GO>(j), tuple<Scalar>(M*myImageID+i + j*M*N) );
+      }
+    }
+    // call fillComplete()
+    A.fillComplete(lclmap,rowmap);
+    // build the input multivector X
+    MV X(lclmap,numVecs);
+    for (GO i=0; i<static_cast<GO>(P); ++i) {
+      for (GO j=0; j<static_cast<GO>(numVecs); ++j) {
+        X.replaceGlobalValue(i,j,static_cast<Scalar>(i+j*P));
+      }
+    }
+    // allocate output multivec
+    MV Bout(rowmap,numVecs);
+    // test the action
+    Bout.randomize();
+    Tpetra::Details::KokkosRegionCounter::reset();
+    Tpetra::Details::KokkosRegionCounter::start();
+    A.apply(X,Bout);
+    Tpetra::Details::KokkosRegionCounter::stop();
+
+    TEST_COMPARE(Tpetra::Details::KokkosRegionCounter::get_count_region_contains("spmv[TPL_"), ==, 1);
+
+    using Teuchos::outArg;
+    using Teuchos::REDUCE_MIN;
+    using Teuchos::reduceAll;
+    const int lclSuccess = success ? 1 : 0;
+    int gblSuccess = 0; // output argument
+    reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
+    TEST_EQUALITY_CONST( gblSuccess, 1 );
+    if (gblSuccess != 1) {
+      out << "KokkosKernels TPL use was not detected where it was expected!" << endl;
+    }
+  }
+
+//
+// INSTANTIATIONS
+//
+#define UNIT_TEST_GROUP( SCALAR, LO, GO, NODE ) \
+      TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, NonSquare, LO, GO, SCALAR, NODE )
+
+  TPETRA_ETI_MANGLING_TYPEDEFS()
+
+  TPETRA_INSTANTIATE_SLGN( UNIT_TEST_GROUP )
+}
diff --git a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
index 3011db577316..b5483a86bd80 100644
--- a/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
+++ b/packages/tpetra/core/test/PerformanceCGSolve/cg_solve_file.cpp
@@ -89,21 +89,26 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
   LO print_freq = max_iter/10;
   print_freq = std::min(print_freq, 50);
   print_freq = std::max(print_freq, 1);
+  Kokkos::fence();
   {
     TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName));
     p->update(1.0,*x,0.0,*x,0.0);
+    Kokkos::fence();
   }
   {
     TimeMonitor t(*TimeMonitor::getNewTimer(matvecTimerName));
     A->apply(*p, *Ap);
+    Kokkos::fence();
   }
   {
     TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName));
     r->update(1.0,*b,-1.0,*Ap,0.0);
+    Kokkos::fence();
   }
   {
     TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName));
     rtrans = r->dot(*r);
+    Kokkos::fence();
   }
 
   normr = std::sqrt(rtrans);
@@ -118,17 +123,20 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
     if (k == 1) {
         TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName));
         p->update(1.0,*r,0.0);
+        Kokkos::fence();
     }
     else {
       oldrtrans = rtrans;
       {
         TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName));
         rtrans = r->dot(*r);
+        Kokkos::fence();
       }
       {
         TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName));
         magnitude_type beta = rtrans/oldrtrans;
         p->update(1.0,*r,beta);
+        Kokkos::fence();
       }
     }
     normr = std::sqrt(rtrans);
@@ -141,10 +149,12 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
     {
       TimeMonitor t(*TimeMonitor::getNewTimer(matvecTimerName));
       A->apply(*p, *Ap);
+      Kokkos::fence();
     }
     {
       TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName));
       p_ap_dot = Ap->dot(*p);
+      Kokkos::fence();
     }
     {
       TimeMonitor t(*TimeMonitor::getNewTimer(addTimerName));
@@ -158,11 +168,14 @@ bool cg_solve (Teuchos::RCP<CrsMatrix> A, Teuchos::RCP<Vector> b, Teuchos::RCP<V
       alpha = rtrans / p_ap_dot;
       x->update(alpha,*p,1.0);
       r->update(-alpha,*Ap,1.0);
+      Kokkos::fence();
     }
   }
   {
+    Kokkos::fence();
     TimeMonitor t(*TimeMonitor::getNewTimer(dotTimerName));
     rtrans = r->dot(*r);
+    Kokkos::fence();
   }
 
   normr = std::sqrt(rtrans);
@@ -267,6 +280,11 @@ int run()
   // On output, it is the approximate solution.
   RCP<vec_type> x (new vec_type (A->getDomainMap ()));
 
+  // Untimed warm-up apply
+  A->apply(*b, *x);
+  // Zero out x again
+  x->putScalar(0);
+
   // Solve the linear system Ax=b using CG.
   RCP<StackedTimer> timer = rcp(new StackedTimer("CG: global"));
   TimeMonitor::setStackedTimer(timer);