Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
136081c1
Commit
136081c1
authored
Dec 09, 2016
by
Andreas Marek
Browse files
Further unify real/complex elpa1 trans ev
parent
eb01a207
Changes
4
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
136081c1
...
@@ -901,6 +901,7 @@ EXTRA_DIST = \
...
@@ -901,6 +901,7 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90
\
src/elpa1_tridiag_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
...
...
src/elpa1_trans_ev_template.X90
View file @
136081c1
...
@@ -86,12 +86,11 @@
...
@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!> \param useGPU If true, GPU version of the subroutine will be used
!>
!>
#if REALCASE == 1
subroutine trans_ev_&
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
&MATH_DATATYPE&
#endif
&_&
#if COMPLEXCASE == 1
&PRECISION &
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
use cuda_functions
use cuda_functions
use iso_c_binding
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
#ifdef HAVE_DETAILED_TIMINGS
...
@@ -155,12 +154,11 @@
...
@@ -155,12 +154,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
logical :: successCUDA
#if REALCASE == 1
call timer%start("trans_ev_&
call timer%start("trans_ev_real" // PRECISION_SUFFIX)
&MATH_DATATYPE&
#endif
&_" // &
#if COMPLEXCASE == 1
&PRECISION_SUFFIX &
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
)
#endif
call timer%start("mpi_communication")
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...
@@ -179,54 +177,39 @@
...
@@ -179,54 +177,39 @@
max_stored_rows = (63/nblk+1)*nblk
max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmat", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "tmat", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h1", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "h1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h2", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "h2", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "tmp1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
#endif
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "tmp2", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
#endif
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvn", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "hvn", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
#endif
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvm", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "hvm", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
#endif
hvm = 0 ! Must be set to 0 !!!
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
hvb = 0 ! Safety only
...
@@ -248,59 +231,45 @@
...
@@ -248,59 +231,45 @@
if (useGPU) then
if (useGPU) then
! todo: this is used only for copying hvm to device; it should be possible to go without it
! todo: this is used only for copying hvm to device; it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
&MATH_DATATYPE&
#endif
&", "hvm1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
#endif
&PRECISION&
&_&
#if REALCASE == 1
&MATH_DATATYPE&
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real)
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real)
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real)
&PRECISION&
check_alloc_cuda("trans_ev", successCUDA)
&_&
#endif
&MATH_DATATYPE&
#if COMPLEXCASE == 1
&_datatype)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real)
&PRECISION&
check_alloc_cuda("trans_ev", successCUDA)
&_&
#endif
&MATH_DATATYPE&
#if COMPLEXCASE == 1
&_datatype)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
check_alloc_cuda("trans_ev", successCUDA)
#endif
! q_dev = q_mat
! q_dev = q_mat
#if REALCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
endif ! useGPU
endif ! useGPU
do istep = 1, na, nblk
do istep = 1, na, nblk
...
@@ -330,12 +299,15 @@
...
@@ -330,12 +299,15 @@
#ifdef WITH_MPI
#ifdef WITH_MPI
call timer%start("mpi_communication")
call timer%start("mpi_communication")
if (nb>0) &
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
#if REALCASE == 1
call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_REAL_PRECISION&
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
#endif /* WITH_MPI */
...
@@ -375,12 +347,14 @@
...
@@ -375,12 +347,14 @@
enddo
enddo
#ifdef WITH_MPI
#ifdef WITH_MPI
call timer%start("mpi_communication")
call timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
#if REALCASE == 1
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_REAL_PRECISION&
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
call timer%stop("mpi_communication")
#else /* WITH_MPI */
#else /* WITH_MPI */
...
@@ -403,13 +377,16 @@
...
@@ -403,13 +377,16 @@
tmat, max_stored_rows, &
tmat, max_stored_rows, &
h2(nc+1),1)
h2(nc+1),1)
#endif
#endif
call timer%stop("blas")
call timer%stop("blas")
tmat(n+1,1:n) = &
#if REALCASE == 1
#if REALCASE == 1
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
-h2(nc+1:nc+n) &
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
tmat(n+1,1:n) =
-conjg(h2(nc+1:nc+n))
*tau(ice-nstor+n+1)
-conjg(h2(nc+1:nc+n))
&
#endif
#endif
*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
nc = nc+n
enddo
enddo
...
@@ -419,25 +396,22 @@
...
@@ -419,25 +396,22 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
#if REALCASE == 1
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_
PRECISION_real, cudaMemcpyHostToDevice)
hvm_ubnd * nstor * size_of_
&
#endif
&PRECISION&
#if COMPLEXCASE == 1
&_&
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)),
&
&MATH_DATATYPE
&
hvm_ubnd * nstor * size_of_PRECISION_complex
, cudaMemcpyHostToDevice)
&_datatype
, cudaMemcpyHostToDevice)
#endif
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
!tmat_dev = tmat
#if REALCASE == 1
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice)
max_stored_rows * max_stored_rows * size_of_&
#endif
&PRECISION&
#if COMPLEXCASE == 1
&_&
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
&MATH_DATATYPE&
max_stored_rows * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
&_datatype, cudaMemcpyHostToDevice)
#endif
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
endif
endif
...
@@ -479,12 +453,11 @@
...
@@ -479,12 +453,11 @@
else !l_rows>0
else !l_rows>0
if (useGPU) then
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_&
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_real)
&PRECISION&
#endif
&_&
#if COMPLEXCASE == 1
&MATH_DATATYPE&
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_complex)
&_datatype)
#endif
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
else
else
tmp1(1:l_cols*nstor) = 0
tmp1(1:l_cols*nstor) = 0
...
@@ -495,34 +468,32 @@
...
@@ -495,34 +468,32 @@
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
if (useGPU) then
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyDeviceToHost)
max_local_cols * max_stored_rows * size_of_&
#endif
&PRECISION&
#if COMPLEXCASE == 1
&_&
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
&MATH_DATATYPE&
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
&_datatype, cudaMemcpyDeviceToHost)
#endif
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
endif
endif
call timer%start("mpi_communication")
call timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1
#if REALCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_REAL_PRECISION&
#endif
#endif
#if COMPLEXCASE == 1
#if COMPLEXCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
call timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
! copy back tmp2 - after reduction...
if (useGPU) then
if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
max_local_cols * max_stored_rows * size_of_&
#endif
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
endif ! useGPU
...
@@ -609,34 +580,26 @@
...
@@ -609,34 +580,26 @@
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (istat .ne. 0) then
#if REALCASE == 1
print *,"trans_ev_&
print *,"trans_ev_real: error when deallocating hvm "//errorMessage
&MATH_DATATYPE&
#endif
&: error when deallocating hvm "//errorMessage
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm "//errorMessage
#endif
stop
stop
endif
endif
if (useGPU) then
if (useGPU) then
!q_mat = q_dev
!q_mat = q_dev
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_&
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyDeviceToHost)
&PRECISION&
#endif
&_&
#if COMPLEXCASE == 1
&MATH_DATATYPE&
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, &
&_datatype, cudaMemcpyDeviceToHost)
cudaMemcpyDeviceToHost)
#endif
check_memcpy_cuda("trans_ev", successCUDA)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
if (istat .ne. 0) then
#if REALCASE == 1
print *,"trans_ev_&
print *,"trans_ev_real: error when deallocating hvm1 "//errorMessage
&MATH_DATATYPE&
#endif
&: error when deallocating hvm1 "//errorMessage
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm1 "//errorMessage
#endif
stop
stop
endif
endif
...
@@ -655,16 +618,13 @@
...
@@ -655,16 +618,13 @@
endif
endif
#if REALCASE == 1
call timer%stop("trans_ev_&
call timer%stop("trans_ev_real" // PRECISION_SUFFIX)
&MATH_DATATYPE&
#endif
&" // &
#if COMPLEXCASE == 1
&PRECISION_SUFFIX&
call timer%stop("trans_ev_complex" // PRECISION_SUFFIX)
)
#endif
#if REALCASE == 1
end subroutine trans_ev_&
end subroutine trans_ev_real_PRECISION
&MATH_DATATYPE&
#endif
&_&
#if COMPLEXCASE == 1
&PRECISION
end subroutine trans_ev_complex_PRECISION
#endif
src/precision_macros.h
View file @
136081c1
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE real
#ifdef DOUBLE_PRECISION_REAL
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_REAL
#ifdef DOUBLE_PRECISION_REAL
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
...
@@ -7,7 +17,6 @@
...
@@ -7,7 +17,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION
#undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION
#undef band_band_real_PRECISION
#undef tridiag_real_PRECISION
#undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
#undef solve_tridi_single_problem_PRECISION
...
@@ -77,7 +86,6 @@
...
@@ -77,7 +86,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define band_band_real_PRECISION band_band_real_double
#define band_band_real_PRECISION band_band_real_double
#define tridiag_real_PRECISION tridiag_real_double
#define tridiag_real_PRECISION tridiag_real_double
#define trans_ev_real_PRECISION trans_ev_real_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
...
@@ -140,6 +148,7 @@
...
@@ -140,6 +148,7 @@
#define size_of_PRECISION_real size_of_double_real_datatype
#define size_of_PRECISION_real size_of_double_real_datatype
#define MPI_REAL_PRECISION MPI_REAL8
#define MPI_REAL_PRECISION MPI_REAL8
#else
#else
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION
#undef bandred_real_PRECISION
#undef bandred_real_PRECISION
...
@@ -148,7 +157,6 @@
...
@@ -148,7 +157,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION
#undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION
#undef band_band_real_PRECISION
#undef tridiag_real_PRECISION
#undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
#undef solve_tridi_single_problem_PRECISION
...
@@ -218,7 +226,6 @@
...
@@ -218,7 +226,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_single
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_single
#define band_band_real_PRECISION band_band_real_single
#define band_band_real_PRECISION band_band_real_single
#define tridiag_real_PRECISION tridiag_real_single
#define tridiag_real_PRECISION tridiag_real_single
#define trans_ev_real_PRECISION trans_ev_real_single
#define solve_tridi_PRECISION solve_tridi_single
#define solve_tridi_PRECISION solve_tridi_single
#define solve_tridi_col_PRECISION solve_tridi_col_single
#define solve_tridi_col_PRECISION solve_tridi_col_single
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single
...
...
src/precision_macros_complex.h
View file @
136081c1
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE complex
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef DOUBLE_PRECISION_COMPLEX
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_reduce_add_vectors_complex_PRECISION
#undef elpa_reduce_add_vectors_complex_PRECISION
#undef bandred_complex_PRECISION
#undef bandred_complex_PRECISION
...
@@ -7,7 +18,6 @@
...
@@ -7,7 +18,6 @@
#undef trans_ev_tridi_to_band_complex_PRECISION
#undef trans_ev_tridi_to_band_complex_PRECISION
#undef band_band_complex_PRECISION
#undef band_band_complex_PRECISION
#undef tridiag_complex_PRECISION
#undef tridiag_complex_PRECISION
#undef trans_ev_complex_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION
#undef solve_tridi_single_problem_PRECISION
...
@@ -96,7 +106,6 @@
...
@@ -96,7 +106,6 @@
#define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_double
#define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_double
#define band_band_complex_PRECISION band_band_complex_double
#define band_band_complex_PRECISION band_band_complex_double
#define tridiag_complex_PRECISION tridiag_complex_double
#define tridiag_complex_PRECISION tridiag_complex_double
#define trans_ev_complex_PRECISION trans_ev_complex_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
...
@@ -178,6 +187,7 @@
...
@@ -178,6 +187,7 @@
#define CONST_COMPLEX_1_0 1.0_ck8
#define CONST_COMPLEX_1_0 1.0_ck8
#define size_of_PRECISION_complex size_of_double_complex_datatype
#define size_of_PRECISION_complex size_of_double_complex_datatype
#else
#else
#undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_transpose_vectors_complex_PRECISION