Commit 136081c1 authored by Andreas Marek's avatar Andreas Marek

Further unfiy real/complex elpa1 trans ev

parent eb01a207
......@@ -901,6 +901,7 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
......
......@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
#if REALCASE == 1
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
......@@ -155,12 +154,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
#if REALCASE == 1
call timer%start("trans_ev_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
#endif
call timer%start("trans_ev_&
&MATH_DATATYPE&
&_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -179,54 +177,39 @@
max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmat", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmat", istat, errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h1", istat, errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvn", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvn", istat, errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm", istat, errorMessage)
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
......@@ -248,59 +231,45 @@
if (useGPU) then
! todo: this is used only for copying hmv to device.. it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
! q_dev = q_mat
#if REALCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
endif ! useGPU
do istep = 1, na, nblk
......@@ -330,12 +299,15 @@
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -375,12 +347,14 @@
enddo
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
......@@ -403,13 +377,16 @@
tmat, max_stored_rows, &
h2(nc+1),1)
#endif
call timer%stop("blas")
call timer%stop("blas")
tmat(n+1,1:n) = &
#if REALCASE == 1
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
-h2(nc+1:nc+n) &
#endif
#if COMPLEXCASE == 1
tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1)
-conjg(h2(nc+1:nc+n)) &
#endif
*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
enddo
......@@ -419,25 +396,22 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
#if REALCASE == 1
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
#endif
hvm_ubnd * nstor * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
#if REALCASE == 1
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
#endif
max_stored_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -479,12 +453,11 @@
else !l_rows>0
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_complex)
#endif
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_memcpy_cuda("trans_ev", successCUDA)
else
tmp1(1:l_cols*nstor) = 0
......@@ -495,34 +468,32 @@
! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyDeviceToHost)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
#endif
max_local_cols * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
call timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
#endif
max_local_cols * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -609,34 +580,26 @@
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"trans_ev_real: error when deallocating hvm "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm "//errorMessage
#endif
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm "//errorMessage
stop
endif
if (useGPU) then
!q_mat = q_dev
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyDeviceToHost)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#endif
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"trans_ev_real: error when deallocating hvm1 "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm1 "//errorMessage
#endif
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm1 "//errorMessage
stop
endif
......@@ -655,16 +618,13 @@
endif
#if REALCASE == 1
call timer%stop("trans_ev_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%stop("trans_ev_complex" // PRECISION_SUFFIX)
#endif
call timer%stop("trans_ev_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX&
)
#if REALCASE == 1
end subroutine trans_ev_real_PRECISION
#endif
#if COMPLEXCASE == 1
end subroutine trans_ev_complex_PRECISION
#endif
end subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE real
#ifdef DOUBLE_PRECISION_REAL
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_REAL
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
......@@ -7,7 +17,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION
#undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
......@@ -77,7 +86,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define band_band_real_PRECISION band_band_real_double
#define tridiag_real_PRECISION tridiag_real_double
#define trans_ev_real_PRECISION trans_ev_real_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
......@@ -140,6 +148,7 @@
#define size_of_PRECISION_real size_of_double_real_datatype
#define MPI_REAL_PRECISION MPI_REAL8
#else
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
#undef bandred_real_PRECISION
......@@ -148,7 +157,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION
#undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
......@@ -218,7 +226,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_single
#define band_band_real_PRECISION band_band_real_single
#define tridiag_real_PRECISION tridiag_real_single
#define trans_ev_real_PRECISION trans_ev_real_single
#define solve_tridi_PRECISION solve_tridi_single
#define solve_tridi_col_PRECISION solve_tridi_col_single
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single
......
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE complex
#ifdef DOUBLE_PRECISION_COMPLEX
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_reduce_add_vectors_complex_PRECISION
#undef bandred_complex_PRECISION
......@@ -7,7 +18,6 @@
#undef trans_ev_tridi_to_band_complex_PRECISION
#undef band_band_complex_PRECISION
#undef tridiag_complex_PRECISION
#undef trans_ev_complex_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
......@@ -96,7 +106,6 @@
#define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_double
#define band_band_complex_PRECISION band_band_complex_double
#define tridiag_complex_PRECISION tridiag_complex_double
#define trans_ev_complex_PRECISION trans_ev_complex_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
......@@ -178,6 +187,7 @@
#define CONST_COMPLEX_1_0 1.0_ck8
#define size_of_PRECISION_complex size_of_double_complex_datatype
#else
#undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_reduce_add_vectors_complex_PRECISION
#undef bandred_complex_PRECISION
......@@ -186,7 +196,6 @@
#undef trans_ev_tridi_to_band_complex_PRECISION
#undef band_band_complex_PRECISION
#undef tridiag_complex_PRECISION
#undef trans_ev_complex_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
......@@ -275,7 +284,6 @@
#define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_single
#define band_band_complex_PRECISION band_band_complex_single
#define tridiag_complex_PRECISION tridiag_complex_single
#define trans_ev_complex_PRECISION trans_ev_complex_single
#define solve_tridi_PRECISION solve_tridi_single
#define solve_tridi_col_PRECISION solve_tridi_col_single
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment