Commit 9a1e9b62 authored by Andreas Marek

Start to add redistribute

parent 919e9ba0
@@ -104,30 +104,6 @@ if test x"${with_mpi}" = x"yes"; then
AC_DEFINE([WITH_MPI], [1], [use MPI])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPCK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
dnl C
AC_LANG_PUSH([C])
@@ -1411,6 +1387,100 @@ if test x"${enable_autotuning}" = x"yes"; then
AC_DEFINE([ENABLE_AUTOTUNING], [1], [enable autotuning functionality])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPACK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
AC_MSG_CHECKING(whether matrix redistribution should be considered in autotuning)
AC_ARG_ENABLE([autotune-redistribute-matrix],
AS_HELP_STRING([--enable-autotune-redistribute-matrix],
[allow ELPA to re-distribute the matrix during autotuning in order to find the best (ELPA-internal) block size for the block-cyclic distribution, needs ScaLAPACK functionality, default no.]),
[if test x"$enableval" = x"yes"; then
enable_autotune_redistribute_matrix=yes
else
enable_autotune_redistribute_matrix=no
fi],
[enable_autotune_redistribute_matrix=no])
AC_MSG_RESULT([${enable_autotune_redistribute_matrix}])
if test x"${enable_autotune_redistribute_matrix}" = x"yes" ; then
if test x"${enable_scalapack_tests}" = x"no"; then
AC_MSG_ERROR([Please also set --enable-scalapack-tests in this case])
fi
if test x"${with_mpi}" = x"no"; then
AC_MSG_ERROR([For this option ELPA must be built with MPI enabled])
fi
AC_DEFINE([REDISTRIBUTE_MATRIX],[1],[enable matrix re-distribution during autotuning])
fi
dnl Scalapack tests
AC_MSG_CHECKING(whether --enable-scalapack-tests is specified)
AC_ARG_ENABLE([scalapack-tests],
AS_HELP_STRING([--enable-scalapack-tests],
[build SCALAPACK test cases for performance comparison, needs MPI, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_scalapack_tests=yes
else
enable_scalapack_tests=no
fi
],
[enable_scalapack_tests="no"])
AC_MSG_RESULT([$enable_scalapack_tests])
if test x"${enable_scalapack_tests}" = x"yes"; then
if test x"$with_mpi" = x"no"; then
AC_MSG_ERROR([You cannot build the SCALAPACK test cases without MPI])
fi
AC_DEFINE([WITH_SCALAPACK_TESTS], [1], [build SCALAPACK test cases])
fi
AM_CONDITIONAL([WITH_SCALAPACK_TESTS], [test x"$enable_scalapack_tests" = x"yes"])
AC_MSG_CHECKING(whether matrix redistribution should be considered in autotuning)
AC_ARG_ENABLE([autotune-redistribute-matrix],
AS_HELP_STRING([--enable-autotune-redistribute-matrix],
[allow ELPA to re-distribute the matrix during autotuning in order to find the best (ELPA-internal) block size for the block-cyclic distribution, needs ScaLAPACK functionality, default no.]),
[if test x"$enableval" = x"yes"; then
enable_autotune_redistribute_matrix=yes
else
enable_autotune_redistribute_matrix=no
fi],
[enable_autotune_redistribute_matrix=no])
AC_MSG_RESULT([${enable_autotune_redistribute_matrix}])
if test x"${enable_autotune_redistribute_matrix}" = x"yes" ; then
if test x"${enable_scalapack_tests}" = x"no"; then
AC_MSG_ERROR([Please also set --enable-scalapack-tests in this case])
fi
if test x"${with_mpi}" = x"no"; then
AC_MSG_ERROR([For this option ELPA must be built with MPI enabled])
fi
AC_DEFINE([REDISTRIBUTE_MATRIX],[1],[enable matrix re-distribution during autotuning])
fi
AC_MSG_CHECKING(whether C tests should be provided)
AC_ARG_ENABLE([c-tests],
AS_HELP_STRING([--enable-c-tests],
@@ -9,6 +9,16 @@
name = value,
#define ELPA_ENUM_SUM(name, value, ...) +1
/* MATRIX layout */
#define ELPA_FOR_ALL_MATRIX_LAYOUTS(X) \
X(COLUMN_MAJOR_ORDER, 1) \
X(ROW_MAJOR_ORDER, 2)
enum MATRIX_LAYOUTS {
ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_ENTRY)
};
#define ELPA_NUMBER_OF_MATRIX_LAYOUTS (0 ELPA_FOR_ALL_MATRIX_LAYOUTS(ELPA_ENUM_SUM))
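/* Editorial note (not part of the commit): with ELPA_ENUM_ENTRY expanding to
   "name = value," and ELPA_ENUM_SUM expanding to "+1" (see the context lines
   above), the X-macro block is equivalent to the hand-written form

       enum MATRIX_LAYOUTS {
           COLUMN_MAJOR_ORDER = 1,
           ROW_MAJOR_ORDER = 2,
       };
       #define ELPA_NUMBER_OF_MATRIX_LAYOUTS 2   /* (0 +1 +1) */

   so new layouts only need to be added to ELPA_FOR_ALL_MATRIX_LAYOUTS(X). */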
/* Solver constants */
#define ELPA_FOR_ALL_SOLVERS(X) \
@@ -54,7 +54,7 @@
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*)
#else
@@ -94,7 +94,7 @@
#endif
na = obj%na
lda = obj%local_nrows
matrixRows = obj%local_nrows
nblk = obj%nblk
matrixCols = obj%local_ncols
@@ -200,7 +200,7 @@
call obj%timer%start("blas")
call PRECISION_POTRF('U', int(na-n+1,kind=BLAS_KIND), a(l_row1,l_col1), &
int(lda,kind=BLAS_KIND), infoBLAS )
int(matrixRows,kind=BLAS_KIND), infoBLAS )
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
@@ -233,7 +233,7 @@
call obj%timer%start("blas")
call PRECISION_POTRF('U', int(nblk,kind=BLAS_KIND), a(l_row1,l_col1), &
int(lda,kind=BLAS_KIND) , infoBLAS )
int(matrixRows,kind=BLAS_KIND) , infoBLAS )
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
@@ -283,7 +283,7 @@
if (l_cols-l_colx+1>0) &
call PRECISION_TRSM('L', 'U', BLAS_TRANS_OR_CONJ, 'N', int(nblk,kind=BLAS_KIND), &
int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, tmp2, &
int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND) )
int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(matrixRows,kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif
@@ -326,7 +326,7 @@
int(nblk,kind=BLAS_KIND), -ONE, &
tmatr(lrs,1), int(ubound(tmatr,dim=1),kind=BLAS_KIND), tmatc(lcs,1), &
int(ubound(tmatc,dim=1),kind=BLAS_KIND), &
ONE, a(lrs,lcs), int(lda,kind=BLAS_KIND))
ONE, a(lrs,lcs), int(matrixRows,kind=BLAS_KIND))
call obj%timer%stop("blas")
enddo
@@ -62,7 +62,7 @@
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: na, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a(obj%local_nrows,*)
#else
@@ -87,7 +87,7 @@
&")
na = obj%na
lda = obj%local_nrows
matrixRows = obj%local_nrows
nblk = obj%nblk
matrixCols = obj%local_ncols
@@ -185,7 +185,7 @@
if (my_pcol==pcol(n, nblk, np_cols)) then
call obj%timer%start("blas")
call PRECISION_TRTRI('U', 'N', int(nb,kind=BLAS_KIND), a(l_row1,l_col1), int(lda,kind=BLAS_KIND), &
call PRECISION_TRTRI('U', 'N', int(nb,kind=BLAS_KIND), a(l_row1,l_col1), int(matrixRows,kind=BLAS_KIND), &
infoBLAS)
info = int(infoBLAS,kind=ik)
call obj%timer%stop("blas")
@@ -231,7 +231,7 @@
call obj%timer%start("blas")
if (l_cols-l_colx+1>0) &
call PRECISION_TRMM('L', 'U', 'N', 'N', int(nb,kind=BLAS_KIND), int(l_cols-l_colx+1,kind=BLAS_KIND), ONE, &
tmp2, int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(lda,kind=BLAS_KIND))
tmp2, int(ubound(tmp2,dim=1),kind=BLAS_KIND), a(l_row1,l_colx), int(matrixRows,kind=BLAS_KIND))
call obj%timer%stop("blas")
if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0
@@ -269,7 +269,7 @@
int(nb,kind=BLAS_KIND), -ONE, &
tmat1, int(ubound(tmat1,dim=1),kind=BLAS_KIND), tmat2(1,l_col1), &
int(ubound(tmat2,dim=1),kind=BLAS_KIND), ONE, &
a(1,l_col1), int(lda,kind=BLAS_KIND) )
a(1,l_col1), int(matrixRows,kind=BLAS_KIND) )
call obj%timer%stop("blas")
@@ -87,7 +87,7 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: success
integer(kind=ik) :: nblk, mpi_comm_rows, mpi_comm_cols, lda, ldaCols, error
integer(kind=ik) :: nblk, mpi_comm_rows, mpi_comm_cols, matrixRows, matrixCols, error
call obj%timer%start("elpa_mult_at_b_&
&MATH_DATATYPE&
@@ -95,10 +95,10 @@
&PRECISION&
&")
na = obj%na
nblk = obj%nblk
lda = obj%local_nrows
ldaCols = obj%local_ncols
na = obj%na
nblk = obj%nblk
matrixRows = obj%local_nrows
matrixCols = obj%local_ncols
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
@@ -67,7 +67,7 @@
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: na, nev, matrixRows, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=REAL_DATATYPE) :: d(obj%na), e(obj%na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: q(obj%local_nrows,*)
@@ -89,7 +89,7 @@
na = obj%na
nev = obj%nev
nblk = obj%nblk
ldq = obj%local_nrows
matrixRows = obj%local_nrows
matrixCols = obj%local_ncols
#ifdef WITH_OPENMP
@@ -129,7 +129,7 @@
call solve_tridi_&
&PRECISION&
&_private_impl(obj, na, nev, d, e, q, ldq, nblk, matrixCols, &
&_private_impl(obj, na, nev, d, e, q, matrixRows, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols,.false., wantDebug, success, &
nrThreads)
@@ -128,9 +128,9 @@
&PRECISION&
&_&
&MATH_DATATYPE
integer(kind=ik) :: na, nev, lda, ldq, nblk, matrixCols, &
integer(kind=ik) :: na, nev, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, check_pd, error
mpi_comm_all, check_pd, error, matrixRows
logical :: do_bandred, do_tridiag, do_solve_tridi, &
do_trans_to_band, do_trans_to_full
@@ -187,10 +187,10 @@
na = obj%na
nev = obj%nev
lda = obj%local_nrows
ldq = obj%local_nrows
nblk = obj%nblk
matrixCols = obj%local_ncols
matrixRows = obj%local_nrows
call obj%get("mpi_comm_rows",mpi_comm_rows,error)
if (error .ne. ELPA_OK) then
@@ -558,10 +558,10 @@
if (.not. obj%eigenvalues_only) then
q_actual => q(1:obj%local_nrows,1:obj%local_ncols)
q_actual => q(1:matrixRows,1:matrixCols)
else
allocate(q_dummy(1:obj%local_nrows,1:obj%local_ncols))
q_actual => q_dummy(1:obj%local_nrows,1:obj%local_ncols)
allocate(q_dummy(1:matrixRows,1:matrixCols))
q_actual => q_dummy(1:matrixRows,1:matrixCols)
endif
@@ -701,7 +701,7 @@
&_&
&PRECISION &
(obj, na, a, &
a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
a_dev, matrixRows, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, do_useGPU_bandred, success, &
#if REALCASE == 1
useQRActual, &
@@ -734,7 +734,7 @@
&MATH_DATATYPE&
&_&
&PRECISION&
(obj, na, nbw, nblk, a, a_dev, lda, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
(obj, na, nbw, nblk, a, a_dev, matrixRows, ev, e, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
do_useGPU_tridiag_band, wantDebug, nrThreads)
#ifdef WITH_MPI
@@ -754,6 +754,24 @@
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
! test only: cross-check matrixCols/matrixRows against freshly computed local sizes
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
if (matrixCols .ne. l_cols) then
print *,"mismatch in local columns: ",matrixCols, l_cols
else
print *,"identical"
endif
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
if (matrixRows .ne. l_rows) then
print *,"mismatch in local rows: ",matrixRows, l_rows
else
print *,"identical rows"
endif
allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
@@ -774,7 +792,7 @@
&PRECISION &
(obj, na, nev, ev, e, &
#if REALCASE == 1
q_actual, ldq, &
q_actual, matrixRows, &
#endif
#if COMPLEXCASE == 1
q_real, ubound(q_real,dim=1), &
@@ -843,23 +861,23 @@
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
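! Editorial note: judging from the mod-4 cases below, this loop multiplies global row k
! of q by i**(k-1), i.e. it applies the diagonal matrix D = diag(1, i, -1, -i, 1, ...)
! from the left; the real part ends up in q(:,1:matrixCols) and the imaginary part in
! q(:,matrixCols+1:2*matrixCols).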
q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0
do i = 1, obj%local_nrows
q(1:matrixRows, matrixCols+1:2*matrixCols) = 0.0
do i = 1, matrixRows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1
if (mod(global_index-1,4) .eq. 0) then
! do nothing
end if
if (mod(global_index-1,4) .eq. 1) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
q(i,matrixCols+1:2*matrixCols) = q(i,1:matrixCols)
q(i,1:matrixCols) = 0
end if
if (mod(global_index-1,4) .eq. 2) then
q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:matrixCols) = -q(i,1:matrixCols)
end if
if (mod(global_index-1,4) .eq. 3) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
q(i,matrixCols+1:2*matrixCols) = -q(i,1:matrixCols)
q(i,1:matrixCols) = 0
end if
end do
endif
@@ -881,27 +899,9 @@
&PRECISION &
(obj, na, nev, nblk, nbw, q, &
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
matrixRows, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
nrThreads, success=success, kernel=kernel)
! if (isSkewsymmetric) then
! ! Transform imaginary part
! ! Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix.
! call trans_ev_tridi_to_band_&
! &MATH_DATATYPE&
! &_&
! &PRECISION &
! (obj, na, nev, nblk, nbw, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), &
! q_dev, &
! ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
! nrThreads, success=success, kernel=kernel)
! endif
! print * , "After trans_ev_tridi_to_band: real part of q="
! do i=1,na
! write(*,"(100g15.5)") ( q(i,j), j=1,na )
! enddo
! #ifdef DOUBLE_PRECISION_REAL
! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',0)
! #endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("trans_ev_to_band")
#endif
@@ -930,7 +930,8 @@
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, matrixRows*matrixCols* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
@@ -956,15 +957,16 @@
if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_malloc(q_dev, matrixRows*matrixCols*size_of_datatype)
! if (.not.(successCUDA)) then
! print *,"elpa2_template, error in cuda_malloc"
! stop 1
! endif
! print *, 'q_dev=', q_dev, 'loc(q)=', loc(q)&
! , 'ldq*matrixCols* size_of_datatype=', ldq*matrixCols* size_of_datatype, ', q(1,1)=', q(1,1)
! , 'matrixRows*matrixCols* size_of_datatype=', matrixRows*matrixCols* size_of_datatype, ', q(1,1)=', q(1,1)
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), matrixRows*matrixCols* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device", successCUDA
stop 1
@@ -979,9 +981,9 @@
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q, &
a_dev, matrixRows, tmat, tmat_dev, q, &
q_dev, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
matrixRows, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
, useQRActual &
#endif
@@ -993,7 +995,7 @@
call obj%timer%stop("trans_ev_to_full")
endif ! do_trans_to_full
! #ifdef DOUBLE_PRECISION_REAL
! call prmat(na,useGPU,q(1:obj%local_nrows, 1:obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1)
! call prmat(na,useGPU,q(1:matrixRows, 1:matrixCols),q_dev,matrixRows,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1)
! #endif
! New position:
if (do_trans_to_band) then
@@ -1004,9 +1006,9 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), &
(obj, na, nev, nblk, nbw, q(1:matrixRows, matrixCols+1:2*matrixCols), &
q_dev, &
ldq, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
matrixRows, matrixCols, hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, do_useGPU_trans_ev_tridi_to_band, &
nrThreads, success=success, kernel=kernel)
endif
! print * , "After trans_ev_tridi_to_band: imaginary part of q="
@@ -1014,7 +1016,7 @@
! write(*,"(100g15.5)") ( q(i,j+na), j=1,na )
! enddo
! #ifdef DOUBLE_PRECISION_REAL
! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1)
! call prmat(na,useGPU,q(1:matrixRows, matrixCols+1:2*matrixCols),q_dev,matrixRows,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'R',1)
! #endif
! We can now deallocate the stored householder vectors
deallocate(hh_trans, stat=istat, errmsg=errorMessage)
@@ -1032,7 +1034,8 @@
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q(1,obj%local_ncols+1)), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(loc(q(1,matrixCols+1)), q_dev, matrixRows*matrixCols* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
@@ -1052,19 +1055,20 @@
if (isSkewsymmetric) then
if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_malloc(q_dev, matrixRows*matrixCols*size_of_datatype)
! if (.not.(successCUDA)) then
! print *,"elpa2_template, error in cuda_malloc"
! stop 1
! endif
successCUDA = cuda_memcpy(q_dev, loc(q(1,obj%local_ncols+1)), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q(1,matrixCols+1)), matrixRows*matrixCols* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
endif
endif
! #ifdef DOUBLE_PRECISION_REAL
! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',0)
! call prmat(na,useGPU,q(matrixRows, matrixCols+1:2*matrixCols),q_dev,matrixRows,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',0)
! #endif
! Transform imaginary part
! Transformation of real and imaginary part could also be one call of trans_ev_band_to_full_ acting on the n x 2n matrix.
@@ -1074,9 +1078,9 @@
&_&
&PRECISION &
(obj, na, nev, nblk, nbw, a, &
a_dev, lda, tmat, tmat_dev, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), &
a_dev, matrixRows, tmat, tmat_dev, q(1:matrixRows, matrixCols+1:2*matrixCols), &
q_dev, &
ldq, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
matrixRows, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev_band_to_full &
#if REALCASE == 1
, useQRActual &
#endif
@@ -1086,7 +1090,7 @@
! write(*,"(100g15.5)") ( q(i,j+na), j=1,na )
! enddo
! #ifdef DOUBLE_PRECISION_REAL
! call prmat(na,useGPU,q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols),q_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',1)
! call prmat(na,useGPU,q(1:matrixRows, matrixCols+1:2*matrixCols),q_dev,matrixRows,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,'I',1)
! #endif
endif
@@ -177,7 +177,7 @@
np_rows = int(np_rowsMPI,kind=MPI_KIND)
my_pcol = int(my_pcolMPI,kind=MPI_KIND)
np_cols = int(np_colsMPI,kind=MPI_KIND)
if (wantDebug) call obj%timer%stop(",kind=MPI_KIND)mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
! Get global_id mapping 2D processor coordinates to global id
@@ -62,6 +62,11 @@ static int enumerate_identity(elpa_index_t index, int i);
static int cardinality_bool(elpa_index_t index);
static int valid_bool(elpa_index_t index, int n, int new_value);
static int number_of_matrix_layouts(elpa_index_t index);
static int matrix_layout_enumerate(elpa_index_t index, int i);
static int matrix_layout_is_valid(elpa_index_t index, int n, int new_value);
static const char* elpa_matrix_layout_name(int layout);
static int number_of_solvers(elpa_index_t index);
static int solver_enumerate(elpa_index_t index, int i);
static int solver_is_valid(elpa_index_t index, int n, int new_value);
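/* Editorial sketch (assumption, not part of the commit): the matrix-layout helpers
   declared above will presumably mirror the existing solver helpers, roughly along
   these lines; the actual definitions are not shown in this hunk:

       static int number_of_matrix_layouts(elpa_index_t index) {
               return ELPA_NUMBER_OF_MATRIX_LAYOUTS;
       }
       static int matrix_layout_enumerate(elpa_index_t index, int i) {
               return i + 1;   // layouts are numbered starting at 1
       }
       static int matrix_layout_is_valid(elpa_index_t index, int n, int new_value) {
               return COLUMN_MAJOR_ORDER <= new_value && new_value <= ROW_MAJOR_ORDER;
       }
*/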
@@ -85,6 +90,10 @@ static int stripewidth_real_cardinality(elpa_index_t index);
static int stripewidth_real_enumerate(elpa_index_t index, int i);
static int stripewidth_real_is_valid(elpa_index_t index, int n, int new_value);
static int internal_nblk_cardinality(elpa_index_t index);