Serial eigensolver for GPU (elemental#130)

* Add cuSOLVER-based implementation of Hermitian eigensolves. * Add rocSOLVER implementation of Syev. * Reenable the HermitianEig test for sequential matrices
benson31 · Dec 10, 2021 · cacfa47 · cacfa47
1 parent 21123cb
commit cacfa47
Show file tree

Hide file tree

Showing 36 changed files with 971 additions and 502 deletions.
diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
@@ -112,15 +112,6 @@ void TranslateBetweenGrids
     mpi::Translate(
         owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());
 
-    // Have each member of A's grid individually send to all numRow x numCol
-    // processes in order, while the members of this grid receive from all
-    // necessary processes at each step.
-    Int requiredMemory = 0;
-    if(inAGrid)
-        requiredMemory += maxSendSize;
-    if(inBGrid)
-        requiredMemory += maxSendSize;
-
     SyncInfo<D1> syncInfoA = SyncInfoFromMatrix(A.LockedMatrix());
     SyncInfo<D2> syncInfoB = SyncInfoFromMatrix(B.LockedMatrix());
 
@@ -3676,15 +3667,6 @@ void TranslateBetweenGridsAsync
     mpi::Translate(
         owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());
 
-    Int requiredMemory = 0;
-    if(inAGrid)
-        requiredMemory += maxSendSize;
-    if(inBGrid)
-        requiredMemory += maxSendSize;
-
-
-
-
     std::vector<simple_buffer<T,D1>> sendBufVector(numRowSends);
 
     for(Int i=0; i<numRowSends; ++i)
@@ -3970,12 +3952,6 @@ void TranslateBetweenGrids
     mpi::Translate(
         owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());
 
-    Int requiredMemory = 0;
-    if(inAGrid)
-        requiredMemory += maxSendSize;
-    if(inBGrid)
-        requiredMemory += maxSendSize;
-
     simple_buffer<T,D1> send_buf(inAGrid ? maxSendSize : 0, syncInfoA);
     simple_buffer<T,D2> recv_buf(inBGrid ? maxSendSize : 0, syncInfoB);
 

diff --git a/include/El/blas_like/level1/CopyFromRoot.hpp b/include/El/blas_like/level1/CopyFromRoot.hpp
@@ -6,7 +6,7 @@ namespace El
 
 template<typename T>
 void CopyFromRoot(const Matrix<T>& A, DistMatrix<T,CIRC,CIRC>& B,
-                  bool includingViewers)
+                  bool includingViewers=false)
 {
     EL_DEBUG_CSE;
     if (B.CrossRank() != B.Root())
@@ -17,7 +17,7 @@ void CopyFromRoot(const Matrix<T>& A, DistMatrix<T,CIRC,CIRC>& B,
 }
 
 template<typename T>
-void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers)
+void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers=false)
 {
     EL_DEBUG_CSE;
     if (B.CrossRank() == B.Root())
@@ -27,7 +27,7 @@ void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers)
 
 template<typename T>
 void CopyFromRoot (const Matrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B,
-                   bool includingViewers)
+                   bool includingViewers=false)
 {
     EL_DEBUG_CSE;
     if (B.CrossRank() != B.Root())
@@ -38,7 +38,8 @@ void CopyFromRoot (const Matrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B,
 }
 
 template<typename T>
-void CopyFromNonRoot (DistMatrix<T,CIRC,CIRC,BLOCK>& B, bool includingViewers)
+void CopyFromNonRoot (DistMatrix<T,CIRC,CIRC,BLOCK>& B,
+                      bool includingViewers=false)
 {
     EL_DEBUG_CSE;
     if (B.CrossRank() == B.Root())

diff --git a/include/El/lapack_like/spectral.hpp b/include/El/lapack_like/spectral.hpp
@@ -430,8 +430,8 @@ template<typename Field>
 HermitianEigInfo
 HermitianEig
 (       UpperOrLower uplo,
-        Matrix<Field>& A,
-        Matrix<Base<Field>>& w,
+        Matrix<Field, El::Device::CPU>& A,
+        Matrix<Base<Field>, El::Device::CPU>& w,
   const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
 template<typename Field>
 HermitianEigInfo
@@ -447,9 +447,9 @@ template<typename Field>
 HermitianEigInfo
 HermitianEig
 (       UpperOrLower uplo,
-        Matrix<Field>& A,
-        Matrix<Base<Field>>& w,
-        Matrix<Field>& Q,
+        Matrix<Field, El::Device::CPU>& A,
+        Matrix<Base<Field>, El::Device::CPU>& w,
+        Matrix<Field, El::Device::CPU>& Q,
   const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
 template<typename Field>
 HermitianEigInfo
@@ -460,6 +460,22 @@ HermitianEig
         AbstractDistMatrix<Field>& Q,
   const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
 
+#ifdef HYDROGEN_HAVE_GPU
+template<typename Field>
+HermitianEigInfo
+HermitianEig
+(       UpperOrLower uplo,
+        Matrix<Field, El::Device::GPU>& A,
+        Matrix<Base<Field>, El::Device::GPU>& w,
+  const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
+template <typename Field>
+HermitianEigInfo
+HermitianEig(UpperOrLower uplo,
+             Matrix<Field, El::Device::GPU>& A,
+             Matrix<Base<Field>, El::Device::GPU>& w,
+             Matrix<Field, El::Device::GPU>& Q,
+             const HermitianEigCtrl<Field>& ctrl = HermitianEigCtrl<Field>());
+#endif // HDYROGEN_HAVE_GPU
 namespace herm_eig {
 
 template<typename Real,

diff --git a/include/hydrogen/blas/BLAS_Common.hpp b/include/hydrogen/blas/BLAS_Common.hpp
@@ -38,6 +38,7 @@ enum class BLAS_Op
 
 enum class LAPACK_Op
 {
+    HEEV,
     POTRF,
 }; // enum class LAPACK_Op
 

diff --git a/include/hydrogen/blas/GPU_BLAS_decl.hpp b/include/hydrogen/blas/GPU_BLAS_decl.hpp
@@ -587,6 +587,24 @@ void Dgmm(SideMode side,
 namespace gpu_lapack
 {
 
+/** @brief Compute the eigenvalues and eigenvectors of A.
+ *
+ *  At this time, we only support the computation of values AND vectors.
+ *
+ *  @note Some workspace memory is allocated internally, using a
+ *        caching allocator when possible.
+ *  @note The "info" parameter to the solver routine is only checked
+ *        in DEBUG builds.
+ */
+template <typename T, typename SizeT>
+void HermitianEig(
+    FillMode uplo,
+    SizeT n,
+    T* A,
+    SizeT lda,
+    TmpBase<T>* W,
+    SyncInfo<Device::GPU> const& si);
+
 /** @brief Compute the Cholesky factorization of A.
  *
  *  This routine will allocate the properly-sized workspace and info

diff --git a/include/hydrogen/blas/GPU_BLAS_impl.hpp b/include/hydrogen/blas/GPU_BLAS_impl.hpp
@@ -1070,6 +1070,7 @@ namespace gpu_lapack
 {
 namespace details
 {
+
 // These might be unique.
 using gpu_lapack_impl::ToSizeT;
 using gpu_lapack_impl::GetDenseLibraryHandle;
@@ -1083,18 +1084,82 @@ using gpu_blas_impl::ToNativeFillMode;
 using gpu_blas_impl::ToNativeSideMode;
 using gpu_blas_impl::ToNativeTransposeMode;
 
-template <typename T, typename SizeT, typename InfoT,
+template <typename T,
+          typename SizeT,
+          typename = EnableWhen<IsSupportedType<T, LAPACK_Op::HEEV>>>
+void HermitianEigImpl(
+    FillMode uplo,
+    SizeT n,
+    T* A,
+    SizeT lda,
+    TmpBase<T>* W,
+    SyncInfo<Device::GPU> const& si)
+{
+    using NTP = MakePointer<NativeType<T>>;
+    using RealNTP = MakePointer<NativeType<TmpBase<T>>>;
+
+    auto workspace_size = gpu_lapack_impl::GetHeevWorkspaceSize(
+        GetDenseLibraryHandle(),
+        ToNativeFillMode(uplo),
+        ToSizeT(n),
+        reinterpret_cast<NTP>(A),
+        ToSizeT(lda),
+        reinterpret_cast<RealNTP>(W));
+
+    SyncManager mgr(GetDenseLibraryHandle(), si);
+    simple_buffer<T, Device::GPU> workspace(workspace_size, si);
+    simple_buffer<gpu_lapack_impl::InfoT, Device::GPU> info(1, si);
+
+    // NOTE: This excludes the "jobz" parameter -- we ALWAYS compute
+    // the eigenvalues for now.
+    gpu_lapack_impl::Heev(
+        GetDenseLibraryHandle(),
+        ToNativeFillMode(uplo),
+        ToSizeT(n),
+        reinterpret_cast<NTP>(A),
+        ToSizeT(lda),
+        reinterpret_cast<RealNTP>(W),
+        reinterpret_cast<NTP>(workspace.data()),
+        workspace_size,
+        info.data());
+
+#ifndef EL_RELEASE
+    gpu_lapack_impl::InfoT host_info;
+    gpu::Copy1DToHost(info.data(), &host_info, 1, si);
+    Synchronize(si);
+    if (host_info > gpu_lapack_impl::InfoT(0))
+        throw std::runtime_error("HermitianEigImpl: Solver failed to converge.");
+    else if (host_info < gpu_lapack_impl::InfoT(0))
+        throw std::runtime_error("HermitianEigImpl: A parameter is bad.");
+#endif // EL_RELEASE
+}
+
+template <typename T, typename SizeT,
+          typename=EnableUnless<IsSupportedType<T,LAPACK_Op::HEEV>>,
+          typename=void>
+void HermitianEigImpl(
+    FillMode,
+    SizeT,
+    T*,
+    SizeT,
+    TmpBase<T>*,
+    SyncInfo<Device::GPU> const&)
+{
+    std::ostringstream oss;
+    oss << "No valid implementation of HermitianEig for T="
+        << TypeTraits<T>::Name();
+    throw std::logic_error(oss.str());
+}
+
+template <typename T, typename SizeT,
           typename=EnableWhen<IsSupportedType<T,LAPACK_Op::POTRF>>>
 void CholeskyFactorizeImpl(FillMode uplo,
                            SizeT n,
                            T* A, SizeT lda,
                            T* workspace, SizeT workspace_size,
-                           InfoT* info,
+                           gpu_lapack_impl::InfoT* info,
                            SyncInfo<Device::GPU> const& si)
 {
-    static_assert(IsSame<InfoT, gpu_lapack_impl::InfoT>::value,
-                  "Deduced InfoT must match gpu_lapack_impl::InfoT.");
-
     using NTP = MakePointer<NativeType<T>>;
 
     SyncManager mgr(GetDenseLibraryHandle(), si);
@@ -1194,6 +1259,18 @@ void CholeskyFactorizeImpl(FillMode const,
 }
 }// namespace details
 
+template <typename T, typename SizeT>
+void HermitianEig(
+    FillMode uplo,
+    SizeT n,
+    T* A,
+    SizeT lda,
+    TmpBase<T>* W,
+    SyncInfo<Device::GPU> const& si)
+{
+    details::HermitianEigImpl(uplo, n, A, lda, W, si);
+}
+
 template <typename T, typename SizeT>
 void CholeskyFactorize(FillMode uplo,
                        SizeT n,

diff --git a/include/hydrogen/device/gpu/cuda/cuSOLVER_API.hpp b/include/hydrogen/device/gpu/cuda/cuSOLVER_API.hpp
@@ -22,6 +22,26 @@ using DevicePtr = T*;
 
 using cusolver_int = int;
 
+template <typename T>
+struct RealTypeT
+{
+    using type = T;
+};
+
+template <>
+struct RealTypeT<cuComplex>
+    : RealTypeT<float>
+{
+};
+
+template <>
+struct RealTypeT<cuDoubleComplex>
+    : RealTypeT<double>
+{
+};
+
+template <typename T> using RealType = typename RealTypeT<T>::type;
+
 // Cholesky factorization
 #define ADD_POTRF_DECL(ScalarType)                      \
     cusolver_int GetPotrfWorkspaceSize(                 \
@@ -38,10 +58,33 @@ using cusolver_int = int;
         cusolver_int workspace_size,                    \
         DevicePtr<cusolver_int> devinfo)
 
-ADD_POTRF_DECL(float);
-ADD_POTRF_DECL(double);
-ADD_POTRF_DECL(cuComplex);
-ADD_POTRF_DECL(cuDoubleComplex);
+#define ADD_HEEV_DECL(ScalarType)                       \
+    cusolver_int GetHeevWorkspaceSize(                  \
+        cusolverDnHandle_t handle,                      \
+        cublasFillMode_t uplo,                          \
+        cusolver_int n,                                 \
+        DeviceArray<ScalarType> A,                      \
+        cusolver_int lda,                               \
+        DeviceArray<RealType<ScalarType>> W);           \
+    void Heev(                                          \
+        cusolverDnHandle_t handle,                      \
+        cublasFillMode_t uplo,                          \
+        cusolver_int n,                                 \
+        DeviceArray<ScalarType> A,                      \
+        cusolver_int lda,                               \
+        DeviceArray<RealType<ScalarType>> W,            \
+        DeviceArray<ScalarType> workspace,              \
+        cusolver_int workspace_size,                    \
+        DevicePtr<cusolver_int> devinfo)
+
+#define ADD_CUSOLVER_DECLS(ScalarType)          \
+    ADD_POTRF_DECL(ScalarType);                 \
+    ADD_HEEV_DECL(ScalarType)
+
+ADD_CUSOLVER_DECLS(float);
+ADD_CUSOLVER_DECLS(double);
+ADD_CUSOLVER_DECLS(cuComplex);
+ADD_CUSOLVER_DECLS(cuDoubleComplex);
 
 }// namespace cusolver
 }// namespace hydrogen
-Original file line number
+Diff line change
@@ Expand Up / @@ -38,6 +38,7 @@ enum class BLAS_Op @@
     enum class LAPACK_Op
     {
+        HEEV,
         POTRF,
     }; // enum class LAPACK_Op
@@ Expand Down @@