Skip to content

Commit

Permalink
Serial eigensolver for GPU (elemental#130)
Browse files Browse the repository at this point in the history
* Add cuSOLVER-based implementation of Hermitian eigensolves.
* Add rocSOLVER implementation of Syev.
* Reenable the HermitianEig test for sequential matrices
  • Loading branch information
benson31 authored Dec 10, 2021
1 parent 21123cb commit cacfa47
Show file tree
Hide file tree
Showing 36 changed files with 971 additions and 502 deletions.
24 changes: 0 additions & 24 deletions include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,6 @@ void TranslateBetweenGrids
mpi::Translate(
owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());

// Have each member of A's grid individually send to all numRow x numCol
// processes in order, while the members of this grid receive from all
// necessary processes at each step.
Int requiredMemory = 0;
if(inAGrid)
requiredMemory += maxSendSize;
if(inBGrid)
requiredMemory += maxSendSize;

SyncInfo<D1> syncInfoA = SyncInfoFromMatrix(A.LockedMatrix());
SyncInfo<D2> syncInfoB = SyncInfoFromMatrix(B.LockedMatrix());

Expand Down Expand Up @@ -3676,15 +3667,6 @@ void TranslateBetweenGridsAsync
mpi::Translate(
owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());

Int requiredMemory = 0;
if(inAGrid)
requiredMemory += maxSendSize;
if(inBGrid)
requiredMemory += maxSendSize;




std::vector<simple_buffer<T,D1>> sendBufVector(numRowSends);

for(Int i=0; i<numRowSends; ++i)
Expand Down Expand Up @@ -3970,12 +3952,6 @@ void TranslateBetweenGrids
mpi::Translate(
owningGroupA, sizeA, ranks.data(), viewingCommB, rankMap.data());

Int requiredMemory = 0;
if(inAGrid)
requiredMemory += maxSendSize;
if(inBGrid)
requiredMemory += maxSendSize;

simple_buffer<T,D1> send_buf(inAGrid ? maxSendSize : 0, syncInfoA);
simple_buffer<T,D2> recv_buf(inBGrid ? maxSendSize : 0, syncInfoB);

Expand Down
9 changes: 5 additions & 4 deletions include/El/blas_like/level1/CopyFromRoot.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace El

template<typename T>
void CopyFromRoot(const Matrix<T>& A, DistMatrix<T,CIRC,CIRC>& B,
bool includingViewers)
bool includingViewers=false)
{
EL_DEBUG_CSE;
if (B.CrossRank() != B.Root())
Expand All @@ -17,7 +17,7 @@ void CopyFromRoot(const Matrix<T>& A, DistMatrix<T,CIRC,CIRC>& B,
}

template<typename T>
void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers)
void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers=false)
{
EL_DEBUG_CSE;
if (B.CrossRank() == B.Root())
Expand All @@ -27,7 +27,7 @@ void CopyFromNonRoot(DistMatrix<T,CIRC,CIRC>& B, bool includingViewers)

template<typename T>
void CopyFromRoot (const Matrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B,
bool includingViewers)
bool includingViewers=false)
{
EL_DEBUG_CSE;
if (B.CrossRank() != B.Root())
Expand All @@ -38,7 +38,8 @@ void CopyFromRoot (const Matrix<T>& A, DistMatrix<T,CIRC,CIRC,BLOCK>& B,
}

template<typename T>
void CopyFromNonRoot (DistMatrix<T,CIRC,CIRC,BLOCK>& B, bool includingViewers)
void CopyFromNonRoot (DistMatrix<T,CIRC,CIRC,BLOCK>& B,
bool includingViewers=false)
{
EL_DEBUG_CSE;
if (B.CrossRank() == B.Root())
Expand Down
26 changes: 21 additions & 5 deletions include/El/lapack_like/spectral.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,8 @@ template<typename Field>
HermitianEigInfo
HermitianEig
( UpperOrLower uplo,
Matrix<Field>& A,
Matrix<Base<Field>>& w,
Matrix<Field, El::Device::CPU>& A,
Matrix<Base<Field>, El::Device::CPU>& w,
const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
template<typename Field>
HermitianEigInfo
Expand All @@ -447,9 +447,9 @@ template<typename Field>
HermitianEigInfo
HermitianEig
( UpperOrLower uplo,
Matrix<Field>& A,
Matrix<Base<Field>>& w,
Matrix<Field>& Q,
Matrix<Field, El::Device::CPU>& A,
Matrix<Base<Field>, El::Device::CPU>& w,
Matrix<Field, El::Device::CPU>& Q,
const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
template<typename Field>
HermitianEigInfo
Expand All @@ -460,6 +460,22 @@ HermitianEig
AbstractDistMatrix<Field>& Q,
const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );

#ifdef HYDROGEN_HAVE_GPU
template<typename Field>
HermitianEigInfo
HermitianEig
( UpperOrLower uplo,
Matrix<Field, El::Device::GPU>& A,
Matrix<Base<Field>, El::Device::GPU>& w,
const HermitianEigCtrl<Field>& ctrl=HermitianEigCtrl<Field>() );
template <typename Field>
HermitianEigInfo
HermitianEig(UpperOrLower uplo,
Matrix<Field, El::Device::GPU>& A,
Matrix<Base<Field>, El::Device::GPU>& w,
Matrix<Field, El::Device::GPU>& Q,
const HermitianEigCtrl<Field>& ctrl = HermitianEigCtrl<Field>());
#endif // HDYROGEN_HAVE_GPU
namespace herm_eig {

template<typename Real,
Expand Down
1 change: 1 addition & 0 deletions include/hydrogen/blas/BLAS_Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ enum class BLAS_Op

enum class LAPACK_Op
{
HEEV,
POTRF,
}; // enum class LAPACK_Op

Expand Down
18 changes: 18 additions & 0 deletions include/hydrogen/blas/GPU_BLAS_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,24 @@ void Dgmm(SideMode side,
namespace gpu_lapack
{

/** @brief Compute the eigenvalues and eigenvectors of A.
*
* At this time, we only support the computation of values AND vectors.
*
* @note Some workspace memory is allocated internally, using a
* caching allocator when possible.
* @note The "info" parameter to the solver routine is only checked
* in DEBUG builds.
*/
template <typename T, typename SizeT>
void HermitianEig(
FillMode uplo,
SizeT n,
T* A,
SizeT lda,
TmpBase<T>* W,
SyncInfo<Device::GPU> const& si);

/** @brief Compute the Cholesky factorization of A.
*
* This routine will allocate the properly-sized workspace and info
Expand Down
87 changes: 82 additions & 5 deletions include/hydrogen/blas/GPU_BLAS_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,7 @@ namespace gpu_lapack
{
namespace details
{

// These might be unique.
using gpu_lapack_impl::ToSizeT;
using gpu_lapack_impl::GetDenseLibraryHandle;
Expand All @@ -1083,18 +1084,82 @@ using gpu_blas_impl::ToNativeFillMode;
using gpu_blas_impl::ToNativeSideMode;
using gpu_blas_impl::ToNativeTransposeMode;

template <typename T, typename SizeT, typename InfoT,
template <typename T,
typename SizeT,
typename = EnableWhen<IsSupportedType<T, LAPACK_Op::HEEV>>>
void HermitianEigImpl(
FillMode uplo,
SizeT n,
T* A,
SizeT lda,
TmpBase<T>* W,
SyncInfo<Device::GPU> const& si)
{
using NTP = MakePointer<NativeType<T>>;
using RealNTP = MakePointer<NativeType<TmpBase<T>>>;

auto workspace_size = gpu_lapack_impl::GetHeevWorkspaceSize(
GetDenseLibraryHandle(),
ToNativeFillMode(uplo),
ToSizeT(n),
reinterpret_cast<NTP>(A),
ToSizeT(lda),
reinterpret_cast<RealNTP>(W));

SyncManager mgr(GetDenseLibraryHandle(), si);
simple_buffer<T, Device::GPU> workspace(workspace_size, si);
simple_buffer<gpu_lapack_impl::InfoT, Device::GPU> info(1, si);

// NOTE: This excludes the "jobz" parameter -- we ALWAYS compute
// the eigenvalues for now.
gpu_lapack_impl::Heev(
GetDenseLibraryHandle(),
ToNativeFillMode(uplo),
ToSizeT(n),
reinterpret_cast<NTP>(A),
ToSizeT(lda),
reinterpret_cast<RealNTP>(W),
reinterpret_cast<NTP>(workspace.data()),
workspace_size,
info.data());

#ifndef EL_RELEASE
gpu_lapack_impl::InfoT host_info;
gpu::Copy1DToHost(info.data(), &host_info, 1, si);
Synchronize(si);
if (host_info > gpu_lapack_impl::InfoT(0))
throw std::runtime_error("HermitianEigImpl: Solver failed to converge.");
else if (host_info < gpu_lapack_impl::InfoT(0))
throw std::runtime_error("HermitianEigImpl: A parameter is bad.");
#endif // EL_RELEASE
}

template <typename T, typename SizeT,
typename=EnableUnless<IsSupportedType<T,LAPACK_Op::HEEV>>,
typename=void>
void HermitianEigImpl(
FillMode,
SizeT,
T*,
SizeT,
TmpBase<T>*,
SyncInfo<Device::GPU> const&)
{
std::ostringstream oss;
oss << "No valid implementation of HermitianEig for T="
<< TypeTraits<T>::Name();
throw std::logic_error(oss.str());
}

template <typename T, typename SizeT,
typename=EnableWhen<IsSupportedType<T,LAPACK_Op::POTRF>>>
void CholeskyFactorizeImpl(FillMode uplo,
SizeT n,
T* A, SizeT lda,
T* workspace, SizeT workspace_size,
InfoT* info,
gpu_lapack_impl::InfoT* info,
SyncInfo<Device::GPU> const& si)
{
static_assert(IsSame<InfoT, gpu_lapack_impl::InfoT>::value,
"Deduced InfoT must match gpu_lapack_impl::InfoT.");

using NTP = MakePointer<NativeType<T>>;

SyncManager mgr(GetDenseLibraryHandle(), si);
Expand Down Expand Up @@ -1194,6 +1259,18 @@ void CholeskyFactorizeImpl(FillMode const,
}
}// namespace details

template <typename T, typename SizeT>
void HermitianEig(
FillMode uplo,
SizeT n,
T* A,
SizeT lda,
TmpBase<T>* W,
SyncInfo<Device::GPU> const& si)
{
details::HermitianEigImpl(uplo, n, A, lda, W, si);
}

template <typename T, typename SizeT>
void CholeskyFactorize(FillMode uplo,
SizeT n,
Expand Down
51 changes: 47 additions & 4 deletions include/hydrogen/device/gpu/cuda/cuSOLVER_API.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@ using DevicePtr = T*;

using cusolver_int = int;

template <typename T>
struct RealTypeT
{
using type = T;
};

template <>
struct RealTypeT<cuComplex>
: RealTypeT<float>
{
};

template <>
struct RealTypeT<cuDoubleComplex>
: RealTypeT<double>
{
};

template <typename T> using RealType = typename RealTypeT<T>::type;

// Cholesky factorization
#define ADD_POTRF_DECL(ScalarType) \
cusolver_int GetPotrfWorkspaceSize( \
Expand All @@ -38,10 +58,33 @@ using cusolver_int = int;
cusolver_int workspace_size, \
DevicePtr<cusolver_int> devinfo)

ADD_POTRF_DECL(float);
ADD_POTRF_DECL(double);
ADD_POTRF_DECL(cuComplex);
ADD_POTRF_DECL(cuDoubleComplex);
#define ADD_HEEV_DECL(ScalarType) \
cusolver_int GetHeevWorkspaceSize( \
cusolverDnHandle_t handle, \
cublasFillMode_t uplo, \
cusolver_int n, \
DeviceArray<ScalarType> A, \
cusolver_int lda, \
DeviceArray<RealType<ScalarType>> W); \
void Heev( \
cusolverDnHandle_t handle, \
cublasFillMode_t uplo, \
cusolver_int n, \
DeviceArray<ScalarType> A, \
cusolver_int lda, \
DeviceArray<RealType<ScalarType>> W, \
DeviceArray<ScalarType> workspace, \
cusolver_int workspace_size, \
DevicePtr<cusolver_int> devinfo)

#define ADD_CUSOLVER_DECLS(ScalarType) \
ADD_POTRF_DECL(ScalarType); \
ADD_HEEV_DECL(ScalarType)

ADD_CUSOLVER_DECLS(float);
ADD_CUSOLVER_DECLS(double);
ADD_CUSOLVER_DECLS(cuComplex);
ADD_CUSOLVER_DECLS(cuDoubleComplex);

}// namespace cusolver
}// namespace hydrogen
Expand Down
Loading

0 comments on commit cacfa47

Please sign in to comment.