diff --git a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp index c050122f377b..b4038d65d9ee 100644 --- a/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp @@ -98,6 +98,8 @@ using enableIfConstData = std::enable_if_t::value>; template using enableIfNonConstData = std::enable_if_t::value>; +/* sync_host functions */ + template enableIfNonConstData sync_host(DualViewType dualView) { @@ -105,21 +107,46 @@ sync_host(DualViewType dualView) { dualView.sync_host(); } +template +enableIfNonConstData +sync_host(const typename DualViewType::t_host::execution_space& exec, DualViewType dualView) { + // This will sync, but only if needed + dualView.sync_host(); +} + template enableIfConstData sync_host(DualViewType dualView) { } +template +enableIfConstData +sync_host(const typename DualViewType::t_host::execution_space& exec, DualViewType dualView) { } + +/* sync_device functions */ + template enableIfNonConstData sync_device(DualViewType dualView) { // This will sync, but only if needed - dualView.sync_device(); + dualView.sync_device(); +} + +template +enableIfNonConstData +sync_device(const typename DualViewType::t_dev::execution_space& exec, DualViewType dualView) { + // This will sync, but only if needed + dualView.sync_device(exec); } template enableIfConstData sync_device(DualViewType dualView) { } +template +enableIfConstData +sync_device(const typename DualViewType::t_dev::execution_space& exec, DualViewType dualView) { } + + }// end namespace Impl /// \brief Whether WrappedDualView reference count checking is enabled. Initially true. @@ -320,6 +347,19 @@ class WrappedDualView { } return dualView.view_device(); } + + typename t_dev::const_type + getDeviceView(const typename DualViewType::t_dev::execution_space& exec, Access::ReadOnlyStruct + DEBUG_UVM_REMOVAL_ARGUMENT + ) const + { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceViewReadOnly"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec, originalDualView); + } + return dualView.view_device(); + } t_dev getDeviceView(Access::ReadWriteStruct @@ -337,6 +377,23 @@ class WrappedDualView { return dualView.view_device(); } + t_dev + getDeviceView(const typename DualViewType::t_dev::execution_space& exec, Access::ReadWriteStruct + DEBUG_UVM_REMOVAL_ARGUMENT + ) + { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceViewReadWrite"); + static_assert(dualViewHasNonConstData, + "ReadWrite views are not available for DualView with const data"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec,originalDualView); + originalDualView.modify_device(); + } + return dualView.view_device(); + } + + t_dev getDeviceView(Access::OverwriteAllStruct DEBUG_UVM_REMOVAL_ARGUMENT @@ -357,6 +414,21 @@ class WrappedDualView { return dualView.view_device(); } + + t_dev + getDeviceView(const typename DualViewType::t_dev::execution_space& exec, Access::OverwriteAllStruct s + DEBUG_UVM_REMOVAL_ARGUMENT + ) + { + // Since we're never syncing in this case, the execution_space is meaningless here +#ifdef DEBUG_UVM_REMOVAL + return getDeviceView(s,callerstr,filestr,linnum); +#else + return getDeviceView(s); +#endif + } + + template typename std::remove_reference().template view())>::type::const_type getView (Access::ReadOnlyStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { @@ -381,7 +453,31 @@ class WrappedDualView { return dualView.template view(); } + template + typename std::remove_reference().template view())>::type::const_type + getView (const typename TargetDeviceType::execution_space & exec, Access::ReadOnlyStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { + using ReturnViewType = typename std::remove_reference().template view())>::type::const_type; + using ReturnDeviceType = typename ReturnViewType::device_type; + constexpr bool returnDevice = std::is_same::value; + if(returnDevice) { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getViewReadOnly"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec,originalDualView); + } + } + else { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getViewReadOnly"); + if(needsSyncPath()) { + throwIfDeviceViewAlive(); + impl::sync_host(exec,originalDualView); + } + } + + return dualView.template view(); + } + template typename std::remove_reference().template view())>::type getView (Access::ReadWriteStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { @@ -414,6 +510,39 @@ class WrappedDualView { } + template + typename std::remove_reference().template view())>::type + getView (const typename TargetDeviceType::execution_space & exec,Access::ReadWriteStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { + using ReturnViewType = typename std::remove_reference().template view())>::type; + using ReturnDeviceType = typename ReturnViewType::device_type; + constexpr bool returnDevice = std::is_same::value; + + if(returnDevice) { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getViewReadWrite"); + static_assert(dualViewHasNonConstData, + "ReadWrite views are not available for DualView with const data"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec,originalDualView); + originalDualView.modify_device(); + } + } + else { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getViewReadWrite"); + static_assert(dualViewHasNonConstData, + "ReadWrite views are not available for DualView with const data"); + if(needsSyncPath()) { + throwIfDeviceViewAlive(); + impl::sync_host(exec,originalDualView); + originalDualView.modify_host(); + } + } + + return dualView.template view(); + } + + + template typename std::remove_reference().template view())>::type getView (Access::OverwriteAllStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { @@ -450,6 +579,21 @@ class WrappedDualView { } + template + typename std::remove_reference().template view())>::type + getView (const typename TargetDeviceType::execution_space & exec, Access::OverwriteAllStruct s DEBUG_UVM_REMOVAL_ARGUMENT) const { + using ReturnViewType = typename std::remove_reference().template view())>::type; + using ReturnDeviceType = typename ReturnViewType::device_type; + // Since nothing syncs here, the ExecSpace is meaningless +#ifdef DEBUG_UVM_REMOVAL + return getView(s,callerstr,filestr,linnum); +#else + return getView(s); +#endif + + } + + typename t_host::const_type getHostSubview(int offset, int numEntries, Access::ReadOnlyStruct DEBUG_UVM_REMOVAL_ARGUMENT @@ -503,6 +647,20 @@ class WrappedDualView { return getSubview(dualView.view_device(), offset, numEntries); } + typename t_dev::const_type + getDeviceSubview(const typename DualViewType::t_dev::execution_space& exec, int offset, int numEntries, Access::ReadOnlyStruct + DEBUG_UVM_REMOVAL_ARGUMENT + ) const + { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewReadOnly"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec,originalDualView); + } + return getSubview(dualView.view_device(), offset, numEntries); + } + + t_dev getDeviceSubview(int offset, int numEntries, Access::ReadWriteStruct DEBUG_UVM_REMOVAL_ARGUMENT @@ -519,6 +677,22 @@ class WrappedDualView { return getSubview(dualView.view_device(), offset, numEntries); } + t_dev + getDeviceSubview(const typename DualViewType::t_dev::execution_space& exec, int offset, int numEntries, Access::ReadWriteStruct + DEBUG_UVM_REMOVAL_ARGUMENT + ) + { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewReadWrite"); + static_assert(dualViewHasNonConstData, + "ReadWrite views are not available for DualView with const data"); + if(needsSyncPath()) { + throwIfHostViewAlive(); + impl::sync_device(exec, originalDualView); + originalDualView.modify_device(); + } + return getSubview(dualView.view_device(), offset, numEntries); + } + t_dev getDeviceSubview(int offset, int numEntries, Access::OverwriteAllStruct DEBUG_UVM_REMOVAL_ARGUMENT @@ -530,6 +704,17 @@ class WrappedDualView { return getDeviceSubview(offset, numEntries, Access::ReadWrite); } + t_dev + getDeviceSubview(const typename DualViewType::t_dev::execution_space& exec, int offset, int numEntries, Access::OverwriteAllStruct + DEBUG_UVM_REMOVAL_ARGUMENT + ) + { + DEBUG_UVM_REMOVAL_PRINT_CALLER("getDeviceSubviewOverwriteAll"); + static_assert(dualViewHasNonConstData, + "OverwriteAll views are not available for DualView with const data"); + return getDeviceSubview(exec,offset, numEntries, Access::ReadWrite); + } + // Debugging functions to get copies of the view state typename t_host::HostMirror getHostCopy() const { diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp index 0b7a6284d53d..80680103ed19 100644 --- a/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_MultiVector_decl.hpp @@ -594,7 +594,7 @@ namespace Tpetra { /// around with multiple memory spaces. MultiVector (const Teuchos::RCP& map, const typename dual_view_type::t_dev& d_view); - + /// \brief Expert mode constructor, that takes a Kokkos::DualView /// of the MultiVector's data and the "original" /// Kokkos::DualView of the data, and returns a MultiVector that @@ -1466,14 +1466,31 @@ namespace Tpetra { /// This requires that there are no live host-space views. typename dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const; + /// \brief Return a read-only, up-to-date view of this MultiVector's local data on device. + /// This requires that there are no live host-space views. + /// @warning This function will only synchronize the provided @c execution_space instance, which if not used correctly can lead to errors. + typename dual_view_type::t_dev::const_type getLocalViewDevice(const execution_space & exec, Access::ReadOnlyStruct) const; + /// \brief Return a mutable, up-to-date view of this MultiVector's local data on device. /// This requires that there are no live host-space views. typename dual_view_type::t_dev getLocalViewDevice(Access::ReadWriteStruct); + /// \brief Return a mutable, up-to-date view of this MultiVector's local data on device. + /// This requires that there are no live host-space views. + /// WARNING: This function will only synchronize the provided execution_space instance, which if not used correctly + /// can lead to errors. + typename dual_view_type::t_dev getLocalViewDevice(const execution_space & exec,Access::ReadWriteStruct); + /// \brief Return a mutable view of this MultiVector's local data on device, assuming all existing data will be overwritten. /// This requires that there are no live host-space views. typename dual_view_type::t_dev getLocalViewDevice(Access::OverwriteAllStruct); + /// \brief Return a mutable view of this MultiVector's local data on device, assuming all existing data will be overwritten. + /// This requires that there are no live host-space views. + /// WARNING: This function will only synchronize the provided execution_space instance, which if not used correctly + /// can lead to errors. + typename dual_view_type::t_dev getLocalViewDevice(const execution_space & exec, Access::OverwriteAllStruct); + /// \brief Return the wrapped dual view holding this MultiVector's local data. /// /// \warning This method is ONLY for use by experts. We highly recommend accessing the local data @@ -1671,9 +1688,21 @@ namespace Tpetra { //! Put element-wise absolute values of input Multi-vector in target: A = abs(this) void abs (const MultiVector& A); + //\brief Put element-wise absolute values of input Multi-vector in target: A = abs(this) + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void abs (const execution_space& exec, const MultiVector& A); + //! Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,j). void reciprocal (const MultiVector& A); + //\brief Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,j). + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void reciprocal (const execution_space& exec, const MultiVector& A); + /// \brief Scale in place: this = alpha*this. /// /// Replace this MultiVector with alpha times this MultiVector. @@ -1683,6 +1712,18 @@ namespace Tpetra { /// this method finishes. void scale (const Scalar& alpha); + /// \brief Scale in place: this = alpha*this. + /// + /// Replace this MultiVector with alpha times this MultiVector. + /// This method will always multiply, even if alpha is zero. That + /// means, for example, that if \c *this contains NaN entries + /// before calling this method, the NaN entries will remain after + /// this method finishes. + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void scale (const execution_space& exec, const Scalar& alpha); + /// \brief Scale each column in place: this[j] = alpha[j]*this[j]. /// /// Replace each column j of this MultiVector with @@ -1703,6 +1744,19 @@ namespace Tpetra { /// the NaN entries will remain after this method finishes. void scale (const Kokkos::View& alpha); + /// \brief Scale each column in place: this[j] = alpha[j]*this[j]. + /// + /// Replace each column j of this MultiVector with + /// alpha[j] times the current column j of this + /// MultiVector. This method will always multiply, even if all + /// the entries of alpha are zero. That means, for example, that + /// if \c *this contains NaN entries before calling this method, + /// the NaN entries will remain after this method finishes. + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void scale (const execution_space& exec, const Kokkos::View& alpha); + /// \brief Scale in place: this = alpha * A. /// /// Replace this MultiVector with scaled values of A. This method @@ -1715,6 +1769,21 @@ namespace Tpetra { scale (const Scalar& alpha, const MultiVector& A); + /// \brief Scale in place: this = alpha * A. + /// + /// Replace this MultiVector with scaled values of A. This method + /// will always multiply, even if alpha is zero. That means, for + /// example, that if \c *this contains NaN entries before calling + /// this method, the NaN entries will remain after this method + /// finishes. It is legal for the input A to alias this + /// MultiVector. + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void + scale (const execution_space& exec, const Scalar& alpha, + const MultiVector& A); + /// \brief Update: this = beta*this + alpha*A. /// /// Update this MultiVector with scaled values of A. If beta is @@ -1726,6 +1795,22 @@ namespace Tpetra { const MultiVector& A, const Scalar& beta); + /// \brief Update: this = beta*this + alpha*A. + /// + /// Update this MultiVector with scaled values of A. If beta is + /// zero, overwrite \c *this unconditionally, even if it contains + /// NaN entries. It is legal for the input A to alias this + /// MultiVector. + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void + update (const execution_space& exec, + const Scalar& alpha, + const MultiVector& A, + const Scalar& beta); + + /// \brief Update: this = gamma*this + alpha*A + beta*B. /// /// Update this MultiVector with scaled values of A and B. If @@ -1739,6 +1824,24 @@ namespace Tpetra { const MultiVector& B, const Scalar& gamma); + /// \brief Update: this = gamma*this + alpha*A + beta*B. + /// + /// Update this MultiVector with scaled values of A and B. If + /// gamma is zero, overwrite \c *this unconditionally, even if it + /// contains NaN entries. It is legal for the inputs A or B to + /// alias this MultiVector. + /// + /// WARNING: This will only synchronize the MultiVectors w.r.t. the used-provided execution space instance. + /// This can lead to incorrect behavior if other execution_space instances are attempting to modify the vectors. + void + update (const execution_space& exec, + const Scalar& alpha, + const MultiVector& A, + const Scalar& beta, + const MultiVector& B, + const Scalar& gamma); + + /// \brief Compute the one-norm of each vector (column), storing /// the result in a host view. /// @@ -2070,6 +2173,35 @@ namespace Tpetra { const Vector& A, const MultiVector& B, Scalar scalarThis); + + /// \brief Multiply a Vector A elementwise by a MultiVector B. + /// + /// Compute this = scalarThis * this + scalarAB * B @ A + /// where @ denotes element-wise multiplication. In + /// pseudocode, if C denotes *this MultiVector: + /// \code + /// C(i,j) = scalarThis * C(i,j) + scalarAB * B(i,j) * A(i,1); + /// \endcode + /// for all rows i and columns j of C. + /// + /// B must have the same dimensions as *this, while A + /// must have the same number of rows but a single column. + /// + /// We do not require that A, B, and *this have + /// compatible Maps, as long as the number of rows in A, B, and + /// *this on each process is the same. For example, one + /// or more of these vectors might have a locally replicated Map, + /// or a Map with a local communicator (MPI_COMM_SELF). + /// This case may occur in block relaxation algorithms when + /// applying a diagonal scaling. + /// WARNING: This function will only synchronize the provided execution_space instance, which if not used correctly + /// can lead to errors. + void + elementWiseMultiply (const execution_space& exec, Scalar scalarAB, + const Vector& A, + const MultiVector& B, + Scalar scalarThis); + //@} //! @name Attribute access functions //@{ diff --git a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp index a4bd3dff67fe..95a5a3cfc66b 100644 --- a/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp +++ b/packages/tpetra/core/src/Tpetra_MultiVector_def.hpp @@ -297,8 +297,8 @@ namespace { // (anonymous) // If you take a subview of a view with zero rows Kokkos::subview() // always returns a DualView with the same data pointers. This will break // pointer equality testing in between two subviews of the same 2D View if - // it has zero row extent. While the one (known) case where this was actually used - // has been fixed, that sort of check could very easily be reintroduced in the future, + // it has zero row extent. While the one (known) case where this was actually used + // has been fixed, that sort of check could very easily be reintroduced in the future, // hence I've added this if check here. // // This is not a bug in Kokkos::subview(), just some very subtle behavior which @@ -349,16 +349,16 @@ namespace { // (anonymous) template bool - runKernelOnHost ( - Kokkos::DualView imports + runKernelOnHost ( + Kokkos::DualView imports ) { if (! imports.need_sync_device ()) { return false; // most up-to-date on device } - else { // most up-to-date on host, + else { // most up-to-date on host, // but if large enough, worth running on device anyway - size_t localLengthThreshold = + size_t localLengthThreshold = Tpetra::Details::Behavior::multivectorKernelLocationThreshold(); return imports.extent(0) <= localLengthThreshold; } @@ -374,7 +374,7 @@ namespace { // (anonymous) } else { // most up-to-date on host // but if large enough, worth running on device anyway - size_t localLengthThreshold = + size_t localLengthThreshold = Tpetra::Details::Behavior::multivectorKernelLocationThreshold(); return X.getLocalLength () <= localLengthThreshold; } @@ -1219,19 +1219,19 @@ namespace Tpetra { auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol); auto src_j = Kokkos::subview (src_h, rows, srcCol); - if (CM == ADD_ASSIGN) { + if (CM == ADD_ASSIGN) { // Sum src_j into tgt_j - using range_t = + using range_t = Kokkos::RangePolicy; range_t rp(space, 0,numSameIDs); Tpetra::Details::AddAssignFunctor aaf(tgt_j, src_j); Kokkos::parallel_for(rp, aaf); } - else { + else { // Copy src_j into tgt_j // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR - Kokkos::deep_copy (space, tgt_j, src_j); + Kokkos::deep_copy (space, tgt_j, src_j); space.fence(); } } @@ -1247,19 +1247,19 @@ namespace Tpetra { auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol); auto src_j = Kokkos::subview (src_d, rows, srcCol); - if (CM == ADD_ASSIGN) { + if (CM == ADD_ASSIGN) { // Sum src_j into tgt_j - using range_t = + using range_t = Kokkos::RangePolicy; range_t rp(space, 0,numSameIDs); Tpetra::Details::AddAssignFunctor aaf(tgt_j, src_j); Kokkos::parallel_for(rp, aaf); } - else { + else { // Copy src_j into tgt_j // DEEP_COPY REVIEW - DEVICE-TO-DEVICE - Kokkos::deep_copy (space, tgt_j, src_j); + Kokkos::deep_copy (space, tgt_j, src_j); space.fence(); } } @@ -1618,7 +1618,7 @@ void MultiVector::copyAndPermute( // clears out the 'modified' flags. if (packOnHost) { // nde 06 Feb 2020: If 'exports' does not require resize - // when reallocDualViewIfNeeded is called, the modified flags + // when reallocDualViewIfNeeded is called, the modified flags // are not cleared out. This can result in host and device views // being out-of-sync, resuling in an error in exports.modify_* calls. // Clearing the sync flags prevents this possible case. @@ -1627,7 +1627,7 @@ void MultiVector::copyAndPermute( } else { // nde 06 Feb 2020: If 'exports' does not require resize - // when reallocDualViewIfNeeded is called, the modified flags + // when reallocDualViewIfNeeded is called, the modified flags // are not cleared out. This can result in host and device views // being out-of-sync, resuling in an error in exports.modify_* calls. // Clearing the sync flags prevents this possible case. @@ -1782,7 +1782,7 @@ void MultiVector::copyAndPermute( Kokkos::DualView& exports, Kokkos::DualView numExportPacketsPerLID, size_t& constantNumPackets) { - packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space()); + packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space()); } // clang-format off @@ -2898,6 +2898,60 @@ void MultiVector::copyAndPermute( } + template + void + MultiVector:: + scale (const execution_space& exec, const Scalar& alpha) + { + using Kokkos::ALL; + using IST = impl_scalar_type; + + const IST theAlpha = static_cast (alpha); + if (theAlpha == Kokkos::ArithTraits::one ()) { + return; // do nothing + } + const size_t lclNumRows = getLocalLength (); + const size_t numVecs = getNumVectors (); + const std::pair rowRng (0, lclNumRows); + const std::pair colRng (0, numVecs); + + // We can't substitute putScalar(0.0) for scale(0.0), because the + // former will overwrite NaNs present in the MultiVector. The + // semantics of this call require multiplying them by 0, which + // IEEE 754 requires to be NaN. + + // If we need sync to device, then host has the most recent version. + const bool useHostVersion = need_sync_device (); + if (useHostVersion) { + auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ()); + if (isConstantStride ()) { + KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl); + } + else { + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = whichVectors_[k]; + auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col); + KokkosBlas::scal (Y_k, theAlpha, Y_k); + } + } + } + else { // work on device + auto Y_lcl = Kokkos::subview (getLocalViewDevice(exec, Access::ReadWrite), rowRng, ALL ()); + if (isConstantStride ()) { + KokkosBlas::scal (exec, Y_lcl, theAlpha, Y_lcl); + } + else { + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = isConstantStride () ? k : whichVectors_[k]; + auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col); + KokkosBlas::scal (exec, Y_k, theAlpha, Y_k); + } + } + } + } + + + template void MultiVector:: @@ -2995,6 +3049,81 @@ void MultiVector::copyAndPermute( } } + template + void + MultiVector:: + scale (const execution_space& exec, const Kokkos::View& alphas) + { + using Kokkos::ALL; + using Kokkos::subview; + + const size_t lclNumRows = this->getLocalLength (); + const size_t numVecs = this->getNumVectors (); + TEUCHOS_TEST_FOR_EXCEPTION( + static_cast (alphas.extent (0)) != numVecs, + std::invalid_argument, "Tpetra::MultiVector::scale(alphas): " + "alphas.extent(0) = " << alphas.extent (0) + << " != this->getNumVectors () = " << numVecs << "."); + const std::pair rowRng (0, lclNumRows); + const std::pair colRng (0, numVecs); + + // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the + // type of the return value of subview. This is because if we + // switch the array layout from LayoutLeft to LayoutRight + // (preferred for performance of block operations), the types + // below won't be valid. (A view of a column of a LayoutRight + // multivector has LayoutStride, not LayoutLeft.) + + // If we need sync to device, then host has the most recent version. + const bool useHostVersion = this->need_sync_device (); + if (useHostVersion) { + // Work in host memory. This means we need to create a host + // mirror of the input View of coefficients. + auto alphas_h = Kokkos::create_mirror_view (alphas); + // DEEP_COPY REVIEW - NOT TESTED + Kokkos::deep_copy (exec, alphas_h, alphas); + exec.fence(); + + auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ()); + if (isConstantStride ()) { + KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl); + } + else { + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = this->isConstantStride () ? k : + this->whichVectors_[k]; + auto Y_k = subview (Y_lcl, ALL (), Y_col); + // We don't have to use the entire 1-D View here; we can use + // the version that takes a scalar coefficient. + KokkosBlas::scal (Y_k, alphas_h(k), Y_k); + } + } + } + else { // Work in device memory, using the input View 'alphas' directly. + auto Y_lcl = subview (this->getLocalViewDevice(exec, Access::ReadWrite), rowRng, ALL ()); + if (isConstantStride ()) { + KokkosBlas::scal (exec, Y_lcl, alphas, Y_lcl); + } + else { + // FIXME (mfh 15 Mar 2019) We need one coefficient at a time, + // as values on host, so copy them to host. Another approach + // would be to fix scal() so that it takes a 0-D View as the + // second argument. + auto alphas_h = Kokkos::create_mirror_view (alphas); + // DEEP_COPY REVIEW - NOT TESTED + Kokkos::deep_copy (exec, alphas_h, alphas); + + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = this->isConstantStride () ? k : + this->whichVectors_[k]; + auto Y_k = subview (Y_lcl, ALL (), Y_col); + KokkosBlas::scal (exec, Y_k, alphas_h(k), Y_k); + } + } + } + } + + template void MultiVector:: @@ -3042,6 +3171,52 @@ void MultiVector::copyAndPermute( } } + template + void + MultiVector:: + scale (const execution_space& exec, const Scalar& alpha, + const MultiVector& A) + { + using Kokkos::ALL; + using Kokkos::subview; + const char tfecfFuncName[] = "scale: "; + + const size_t lclNumRows = getLocalLength (); + const size_t numVecs = getNumVectors (); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + lclNumRows != A.getLocalLength (), std::invalid_argument, + "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = " + << A.getLocalLength () << "."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + numVecs != A.getNumVectors (), std::invalid_argument, + "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = " + << A.getNumVectors () << "."); + + const impl_scalar_type theAlpha = static_cast (alpha); + const std::pair rowRng (0, lclNumRows); + const std::pair colRng (0, numVecs); + + auto Y_lcl_orig = this->getLocalViewDevice(exec, Access::ReadWrite); + auto X_lcl_orig = A.getLocalViewDevice(exec, Access::ReadOnly); + auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ()); + auto X_lcl = subview (X_lcl_orig, rowRng, ALL ()); + + if (isConstantStride () && A.isConstantStride ()) { + KokkosBlas::scal (exec, Y_lcl, theAlpha, X_lcl); + } + else { + // Make sure that Kokkos only uses the local length for add. + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k]; + const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k]; + auto Y_k = subview (Y_lcl, ALL (), Y_col); + auto X_k = subview (X_lcl, ALL (), X_col); + + KokkosBlas::scal (exec, Y_k, theAlpha, X_k); + } + } + } template @@ -3083,6 +3258,46 @@ void MultiVector::copyAndPermute( } } + template + void + MultiVector:: + reciprocal (const execution_space& exec, const MultiVector& A) + { + const char tfecfFuncName[] = "reciprocal: "; + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + getLocalLength () != A.getLocalLength (), std::runtime_error, + "MultiVectors do not have the same local length. " + "this->getLocalLength() = " << getLocalLength () + << " != A.getLocalLength() = " << A.getLocalLength () << "."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + A.getNumVectors () != this->getNumVectors (), std::runtime_error, + ": MultiVectors do not have the same number of columns (vectors). " + "this->getNumVectors() = " << getNumVectors () + << " != A.getNumVectors() = " << A.getNumVectors () << "."); + + const size_t numVecs = getNumVectors (); + + auto this_view_dev = this->getLocalViewDevice(exec,Access::ReadWrite); + auto A_view_dev = A.getLocalViewDevice(exec,Access::ReadOnly); + + if (isConstantStride () && A.isConstantStride ()) { + KokkosBlas::reciprocal (exec,this_view_dev, A_view_dev); + } + else { + using Kokkos::ALL; + using Kokkos::subview; + for (size_t k = 0; k < numVecs; ++k) { + const size_t this_col = isConstantStride () ? k : whichVectors_[k]; + auto vector_k = subview (this_view_dev, ALL (), this_col); + const size_t A_col = isConstantStride () ? k : A.whichVectors_[k]; + auto vector_Ak = subview (A_view_dev, ALL (), A_col); + KokkosBlas::reciprocal (exec,vector_k, vector_Ak); + } + } + } + + template void MultiVector:: @@ -3121,6 +3336,45 @@ void MultiVector::copyAndPermute( } } } + template + void + MultiVector:: + abs (const execution_space& exec, const MultiVector& A) + { + const char tfecfFuncName[] = "abs"; + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + getLocalLength () != A.getLocalLength (), std::runtime_error, + ": MultiVectors do not have the same local length. " + "this->getLocalLength() = " << getLocalLength () + << " != A.getLocalLength() = " << A.getLocalLength () << "."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + A.getNumVectors () != this->getNumVectors (), std::runtime_error, + ": MultiVectors do not have the same number of columns (vectors). " + "this->getNumVectors() = " << getNumVectors () + << " != A.getNumVectors() = " << A.getNumVectors () << "."); + const size_t numVecs = getNumVectors (); + + auto this_view_dev = this->getLocalViewDevice(exec,Access::ReadWrite); + auto A_view_dev = A.getLocalViewDevice(exec,Access::ReadOnly); + + if (isConstantStride () && A.isConstantStride ()) { + KokkosBlas::abs (exec,this_view_dev, A_view_dev); + } + else { + using Kokkos::ALL; + using Kokkos::subview; + + for (size_t k=0; k < numVecs; ++k) { + const size_t this_col = isConstantStride () ? k : whichVectors_[k]; + auto vector_k = subview (this_view_dev, ALL (), this_col); + const size_t A_col = isConstantStride () ? k : A.whichVectors_[k]; + auto vector_Ak = subview (A_view_dev, ALL (), A_col); + KokkosBlas::abs (exec,vector_k, vector_Ak); + } + } + } + template void @@ -3129,6 +3383,8 @@ void MultiVector::copyAndPermute( const MultiVector& A, const Scalar& beta) { + // NOTE: This is intentionally not implemented with a call to the 4-arg update() which takes an execution space + // instance, because that has different synchronization behavior. const char tfecfFuncName[] = "update: "; using Kokkos::subview; using Kokkos::ALL; @@ -3176,6 +3432,61 @@ void MultiVector::copyAndPermute( } } + template + void + MultiVector:: + update (const execution_space& exec, + const Scalar& alpha, + const MultiVector& A, + const Scalar& beta) + { + const char tfecfFuncName[] = "update: "; + using Kokkos::subview; + using Kokkos::ALL; + + ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)"); + + const size_t lclNumRows = getLocalLength (); + const size_t numVecs = getNumVectors (); + + if (::Tpetra::Details::Behavior::debug ()) { + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + lclNumRows != A.getLocalLength (), std::invalid_argument, + "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = " + << A.getLocalLength () << "."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + numVecs != A.getNumVectors (), std::invalid_argument, + "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = " + << A.getNumVectors () << "."); + } + + const impl_scalar_type theAlpha = static_cast (alpha); + const impl_scalar_type theBeta = static_cast (beta); + const std::pair rowRng (0, lclNumRows); + const std::pair colRng (0, numVecs); + + auto Y_lcl_orig = this->getLocalViewDevice(exec, Access::ReadWrite); + auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ()); + auto X_lcl_orig = A.getLocalViewDevice(exec, Access::ReadOnly); + auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ()); + + // The device memory of *this is about to be modified + if (isConstantStride () && A.isConstantStride ()) { + KokkosBlas::axpby (exec, theAlpha, X_lcl, theBeta, Y_lcl); + } + else { + // Make sure that Kokkos only uses the local length for add. + for (size_t k = 0; k < numVecs; ++k) { + const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k]; + const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k]; + auto Y_k = subview (Y_lcl, ALL (), Y_col); + auto X_k = subview (X_lcl, ALL (), X_col); + + KokkosBlas::axpby (exec, theAlpha, X_k, theBeta, Y_k); + } + } + } + template void MultiVector:: @@ -3215,7 +3526,7 @@ void MultiVector::copyAndPermute( "The input MultiVector B has " << B.getNumVectors () << " column(s), " "but this MultiVector has " << numVecs << " column(s)."); } - + const impl_scalar_type theAlpha = static_cast (alpha); const impl_scalar_type theBeta = static_cast (beta); const impl_scalar_type theGamma = static_cast (gamma); @@ -3247,6 +3558,80 @@ void MultiVector::copyAndPermute( } } + + + template + void + MultiVector:: + update (const execution_space& exec, + const Scalar& alpha, + const MultiVector& A, + const Scalar& beta, + const MultiVector& B, + const Scalar& gamma) + { + using Kokkos::ALL; + using Kokkos::subview; + + const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): "; + + ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)"); + + const size_t lclNumRows = this->getLocalLength (); + const size_t numVecs = getNumVectors (); + + if (::Tpetra::Details::Behavior::debug ()) { + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + lclNumRows != A.getLocalLength (), std::invalid_argument, + "The input MultiVector A has " << A.getLocalLength () << " local " + "row(s), but this MultiVector has " << lclNumRows << " local " + "row(s)."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + lclNumRows != B.getLocalLength (), std::invalid_argument, + "The input MultiVector B has " << B.getLocalLength () << " local " + "row(s), but this MultiVector has " << lclNumRows << " local " + "row(s)."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + A.getNumVectors () != numVecs, std::invalid_argument, + "The input MultiVector A has " << A.getNumVectors () << " column(s), " + "but this MultiVector has " << numVecs << " column(s)."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + B.getNumVectors () != numVecs, std::invalid_argument, + "The input MultiVector B has " << B.getNumVectors () << " column(s), " + "but this MultiVector has " << numVecs << " column(s)."); + } + + const impl_scalar_type theAlpha = static_cast (alpha); + const impl_scalar_type theBeta = static_cast (beta); + const impl_scalar_type theGamma = static_cast (gamma); + + const std::pair rowRng (0, lclNumRows); + const std::pair colRng (0, numVecs); + + // Prefer 'auto' over specifying the type explicitly. This avoids + // issues with a subview possibly having a different type than the + // original view. + auto C_lcl = subview (this->getLocalViewDevice(exec,Access::ReadWrite), rowRng, ALL ()); + auto A_lcl = subview (A.getLocalViewDevice(exec,Access::ReadOnly), rowRng, ALL ()); + auto B_lcl = subview (B.getLocalViewDevice(exec,Access::ReadOnly), rowRng, ALL ()); + + if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) { + KokkosBlas::update (exec, theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl); + } + else { + // Some input (or *this) is not constant stride, + // so perform the update one column at a time. + for (size_t k = 0; k < numVecs; ++k) { + const size_t this_col = isConstantStride () ? k : whichVectors_[k]; + const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k]; + const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k]; + KokkosBlas::update (exec, theAlpha, subview (A_lcl, rowRng, A_col), + theBeta, subview (B_lcl, rowRng, B_col), + theGamma, subview (C_lcl, rowRng, this_col)); + } + } + } + template Teuchos::ArrayRCP MultiVector:: @@ -3830,7 +4215,7 @@ void MultiVector::copyAndPermute( /// can change the local data and we do not know which one the user want as a copy throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside"); } - else { + else { const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count(); if (this->isConstantStride ()) { if (useHostView) { @@ -3847,7 +4232,7 @@ void MultiVector::copyAndPermute( for (size_t j = 0; j < numCols; ++j) { const size_t srcCol = this->whichVectors_[j]; auto dstColView = Kokkos::subview (A_view, rowRange, j); - + if (useHostView) { auto srcView_host = this->getLocalViewHost(Access::ReadOnly); auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol); @@ -4006,6 +4391,14 @@ void MultiVector::copyAndPermute( return view_.getDeviceView(s); } + template + typename MultiVector::dual_view_type::t_dev::const_type + MultiVector:: + getLocalViewDevice(const execution_space &exec, Access::ReadOnlyStruct s) const + { + return view_.getDeviceView(exec,s); + } + template typename MultiVector::dual_view_type::t_dev MultiVector:: @@ -4014,6 +4407,14 @@ void MultiVector::copyAndPermute( return view_.getDeviceView(s); } + template + typename MultiVector::dual_view_type::t_dev + MultiVector:: + getLocalViewDevice(const execution_space &exec, Access::ReadWriteStruct s) + { + return view_.getDeviceView(exec,s); + } + template typename MultiVector::dual_view_type::t_dev MultiVector:: @@ -4023,7 +4424,16 @@ void MultiVector::copyAndPermute( } template - typename MultiVector::wrapped_dual_view_type + typename MultiVector::dual_view_type::t_dev + MultiVector:: + getLocalViewDevice(const execution_space &exec,Access::OverwriteAllStruct s) + { + return view_.getDeviceView(exec,s); + } + + + template + typename MultiVector::wrapped_dual_view_type MultiVector:: getWrappedDualView() const { return view_; @@ -4296,6 +4706,7 @@ void MultiVector::copyAndPermute( } } + template void MultiVector:: @@ -4348,6 +4759,61 @@ void MultiVector::copyAndPermute( } } + + template + void + MultiVector:: + elementWiseMultiply (const execution_space& exec, Scalar scalarAB, + const Vector& A, + const MultiVector& B, + Scalar scalarThis) + { + using Kokkos::ALL; + using Kokkos::subview; + const char tfecfFuncName[] = "elementWiseMultiply: "; + + const size_t lclNumRows = this->getLocalLength (); + const size_t numVecs = this->getNumVectors (); + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (), + std::runtime_error, "MultiVectors do not have the same local length."); + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( + numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors" + "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors () + << "."); + + auto this_view = this->getLocalViewDevice(exec,Access::ReadWrite); + auto A_view = A.getLocalViewDevice(exec, Access::ReadOnly); + auto B_view = B.getLocalViewDevice(exec, Access::ReadOnly); + + if (isConstantStride () && B.isConstantStride ()) { + // A is just a Vector; it only has one column, so it always has + // constant stride. + // + // If both *this and B have constant stride, we can do an + // element-wise multiply on all columns at once. + KokkosBlas::mult (exec, + scalarThis, + this_view, + scalarAB, + subview (A_view, ALL (), 0), + B_view); + } + else { + for (size_t j = 0; j < numVecs; ++j) { + const size_t C_col = isConstantStride () ? j : whichVectors_[j]; + const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j]; + KokkosBlas::mult (exec, + scalarThis, + subview (this_view, ALL (), C_col), + scalarAB, + subview (A_view, ALL (), 0), + subview (B_view, ALL (), B_col)); + } + } + } + template void MultiVector:: @@ -4659,7 +5125,7 @@ void MultiVector::copyAndPermute( // so we can't use our regular accessor functins // NOTE: This is an occasion where we do *not* want the auto-sync stuff - // to trigger (since this function is conceptually const). Thus, we + // to trigger (since this function is conceptually const). Thus, we // get *copies* of the view's data instead. auto X_dev = view_.getDeviceCopy(); auto X_host = view_.getHostCopy(); @@ -4668,12 +5134,12 @@ void MultiVector::copyAndPermute( // One single allocation Details::print_vector(out,"unified",X_host); } - else { + else { Details::print_vector(out,"host",X_host); Details::print_vector(out,"dev",X_dev); } } - } + } out.flush (); // make sure the ostringstream got everything return outStringP->str (); }