Commit 136081c1 authored by Andreas Marek
Browse files

Further unify real/complex elpa1 trans ev

parent eb01a207
...@@ -901,6 +901,7 @@ EXTRA_DIST = \ ...@@ -901,6 +901,7 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90 \ src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \ src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \ src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \ src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \ src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \ src/elpa2_trans_ev_band_to_full_complex_template.X90 \
......
...@@ -86,12 +86,11 @@ ...@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used !> \param useGPU If true, GPU version of the subroutine will be used
!> !>
#if REALCASE == 1 subroutine trans_ev_&
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) &MATH_DATATYPE&
#endif &_&
#if COMPLEXCASE == 1 &PRECISION &
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
use cuda_functions use cuda_functions
use iso_c_binding use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -155,12 +154,11 @@ ...@@ -155,12 +154,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA logical :: successCUDA
#if REALCASE == 1 call timer%start("trans_ev_&
call timer%start("trans_ev_real" // PRECISION_SUFFIX) &MATH_DATATYPE&
#endif &_" // &
#if COMPLEXCASE == 1 &PRECISION_SUFFIX &
call timer%start("trans_ev_complex" // PRECISION_SUFFIX) )
#endif
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -179,54 +177,39 @@ ...@@ -179,54 +177,39 @@
max_stored_rows = (63/nblk+1)*nblk max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmat", istat, errorMessage) &MATH_DATATYPE&
#endif &", "tmat", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h1", istat, errorMessage) &MATH_DATATYPE&
#endif &", "h1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h2", istat, errorMessage) &MATH_DATATYPE&
#endif &", "h2", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage) &MATH_DATATYPE&
#endif &", "tmp1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
#endif
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage) &MATH_DATATYPE&
#endif &", "tmp2", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
#endif
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvn", istat, errorMessage) &MATH_DATATYPE&
#endif &", "hvn", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
#endif
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvm", istat, errorMessage) &MATH_DATATYPE&
#endif &", "hvm", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
#endif
hvm = 0 ! Must be set to 0 !!! hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only hvb = 0 ! Safety only
...@@ -248,59 +231,45 @@ ...@@ -248,59 +231,45 @@
if (useGPU) then if (useGPU) then
! todo: this is used only for copying hvm to device.. it should be possible to go without it ! todo: this is used only for copying hvm to device.. it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage) &MATH_DATATYPE&
#endif &", "hvm1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage) successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
#endif &PRECISION&
&_&
#if REALCASE == 1 &MATH_DATATYPE&
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real) &_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA) check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1 successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real) &PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA) check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1 successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real) &PRECISION&
check_alloc_cuda("trans_ev", successCUDA) &_&
#endif &MATH_DATATYPE&
#if COMPLEXCASE == 1 &_datatype)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA) check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1 successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real) &PRECISION&
check_alloc_cuda("trans_ev", successCUDA) &_&
#endif &MATH_DATATYPE&
#if COMPLEXCASE == 1 &_datatype)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA) check_alloc_cuda("trans_ev", successCUDA)
#endif
! q_dev = q_mat ! q_dev = q_mat
#if REALCASE == 1 successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice) &PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
endif ! useGPU endif ! useGPU
do istep = 1, na, nblk do istep = 1, na, nblk
...@@ -330,12 +299,15 @@ ...@@ -330,12 +299,15 @@
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
if (nb>0) & if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1 #if REALCASE == 1
call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr) &MPI_REAL_PRECISION&
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr) &MPI_COMPLEX_PRECISION&
#endif #endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
#endif /* WITH_MPI */ #endif /* WITH_MPI */
...@@ -375,12 +347,14 @@ ...@@ -375,12 +347,14 @@
enddo enddo
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1 #if REALCASE == 1
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr) &MPI_REAL_PRECISION&
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr) &MPI_COMPLEX_PRECISION&
#endif #endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
#else /* WITH_MPI */ #else /* WITH_MPI */
...@@ -403,13 +377,16 @@ ...@@ -403,13 +377,16 @@
tmat, max_stored_rows, & tmat, max_stored_rows, &
h2(nc+1),1) h2(nc+1),1)
#endif #endif
call timer%stop("blas") call timer%stop("blas")
tmat(n+1,1:n) = &
#if REALCASE == 1 #if REALCASE == 1
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) -h2(nc+1:nc+n) &
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) -conjg(h2(nc+1:nc+n)) &
#endif #endif
*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n nc = nc+n
enddo enddo
...@@ -419,25 +396,22 @@ ...@@ -419,25 +396,22 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /)) hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor) !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
#if REALCASE == 1
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), & successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * size_of_PRECISION_real, cudaMemcpyHostToDevice) hvm_ubnd * nstor * size_of_&
#endif &PRECISION&
#if COMPLEXCASE == 1 &_&
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), & &MATH_DATATYPE&
hvm_ubnd * nstor * size_of_PRECISION_complex, cudaMemcpyHostToDevice) &_datatype, cudaMemcpyHostToDevice)
#endif
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat !tmat_dev = tmat
#if REALCASE == 1
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), & successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice) max_stored_rows * max_stored_rows * size_of_&
#endif &PRECISION&
#if COMPLEXCASE == 1 &_&
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), & &MATH_DATATYPE&
max_stored_rows * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice) &_datatype, cudaMemcpyHostToDevice)
#endif
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
endif endif
...@@ -479,12 +453,11 @@ ...@@ -479,12 +453,11 @@
else !l_rows>0 else !l_rows>0
if (useGPU) then if (useGPU) then
#if REALCASE == 1 successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_&
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_real) &PRECISION&
#endif &_&
#if COMPLEXCASE == 1 &MATH_DATATYPE&
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_PRECISION_complex) &_datatype)
#endif
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
else else
tmp1(1:l_cols*nstor) = 0 tmp1(1:l_cols*nstor) = 0
...@@ -495,34 +468,32 @@ ...@@ -495,34 +468,32 @@
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI ! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't be a part sufficient? ! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, & successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyDeviceToHost) max_local_cols * max_stored_rows * size_of_&
#endif &PRECISION&
#if COMPLEXCASE == 1 &_&
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, & &MATH_DATATYPE&
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyDeviceToHost) &_datatype, cudaMemcpyDeviceToHost)
#endif
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
endif endif
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1 #if REALCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr) &MPI_REAL_PRECISION&
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr) &MPI_COMPLEX_PRECISION&
#endif #endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
! copy back tmp2 - after reduction... ! copy back tmp2 - after reduction...
if (useGPU) then if (useGPU) then
#if REALCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), & successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice) max_local_cols * max_stored_rows * size_of_&
#endif &PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU endif ! useGPU
...@@ -609,34 +580,26 @@ ...@@ -609,34 +580,26 @@
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
#if REALCASE == 1 print *,"trans_ev_&
print *,"trans_ev_real: error when deallocating hvm "//errorMessage &MATH_DATATYPE&
#endif &: error when deallocating hvm "//errorMessage
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm "//errorMessage
#endif
stop stop
endif endif
if (useGPU) then if (useGPU) then
!q_mat = q_dev !q_mat = q_dev
#if REALCASE == 1 successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_&
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyDeviceToHost) &PRECISION&
#endif &_&
#if COMPLEXCASE == 1 &MATH_DATATYPE&
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, & &_datatype, cudaMemcpyDeviceToHost)
cudaMemcpyDeviceToHost)
#endif
check_memcpy_cuda("trans_ev", successCUDA) check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage) deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
#if REALCASE == 1 print *,"trans_ev_&
print *,"trans_ev_real: error when deallocating hvm1 "//errorMessage &MATH_DATATYPE&
#endif &: error when deallocating hvm1 "//errorMessage
#if COMPLEXCASE == 1
print *,"trans_ev_complex: error when deallocating hvm1 "//errorMessage
#endif
stop stop
endif endif
...@@ -655,16 +618,13 @@ ...@@ -655,16 +618,13 @@
endif endif
#if REALCASE == 1 call timer%stop("trans_ev_&
call timer%stop("trans_ev_real" // PRECISION_SUFFIX) &MATH_DATATYPE&
#endif &" // &
#if COMPLEXCASE == 1 &PRECISION_SUFFIX&
call timer%stop("trans_ev_complex" // PRECISION_SUFFIX) )
#endif
#if REALCASE == 1 end subroutine trans_ev_&
end subroutine trans_ev_real_PRECISION &MATH_DATATYPE&
#endif &_&
#if COMPLEXCASE == 1 &PRECISION
end subroutine trans_ev_complex_PRECISION
#endif
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE real
#ifdef DOUBLE_PRECISION_REAL
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
#undef elpa_transpose_vectors_real_PRECISION #undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION #undef elpa_reduce_add_vectors_real_PRECISION
...@@ -7,7 +17,6 @@ ...@@ -7,7 +17,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION #undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION #undef band_band_real_PRECISION
#undef tridiag_real_PRECISION #undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION #undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION #undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION #undef solve_tridi_single_problem_PRECISION
...@@ -77,7 +86,6 @@ ...@@ -77,7 +86,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double #define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define band_band_real_PRECISION band_band_real_double #define band_band_real_PRECISION band_band_real_double
#define tridiag_real_PRECISION tridiag_real_double #define tridiag_real_PRECISION tridiag_real_double
#define trans_ev_real_PRECISION trans_ev_real_double
#define solve_tridi_PRECISION solve_tridi_double #define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double #define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double #define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
...@@ -140,6 +148,7 @@ ...@@ -140,6 +148,7 @@
#define size_of_PRECISION_real size_of_double_real_datatype #define size_of_PRECISION_real size_of_double_real_datatype
#define MPI_REAL_PRECISION MPI_REAL8 #define MPI_REAL_PRECISION MPI_REAL8
#else #else
#undef elpa_transpose_vectors_real_PRECISION #undef elpa_transpose_vectors_real_PRECISION
#undef elpa_reduce_add_vectors_real_PRECISION #undef elpa_reduce_add_vectors_real_PRECISION
#undef bandred_real_PRECISION #undef bandred_real_PRECISION
...@@ -148,7 +157,6 @@ ...@@ -148,7 +157,6 @@
#undef trans_ev_tridi_to_band_real_PRECISION #undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_real_PRECISION #undef band_band_real_PRECISION
#undef tridiag_real_PRECISION #undef tridiag_real_PRECISION
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION #undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION #undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION #undef solve_tridi_single_problem_PRECISION
...@@ -218,7 +226,6 @@ ...@@ -218,7 +226,6 @@
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_single #define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_single
#define band_band_real_PRECISION band_band_real_single #define band_band_real_PRECISION band_band_real_single
#define tridiag_real_PRECISION tridiag_real_single #define tridiag_real_PRECISION tridiag_real_single
#define trans_ev_real_PRECISION trans_ev_real_single
#define solve_tridi_PRECISION solve_tridi_single #define solve_tridi_PRECISION solve_tridi_single
#define solve_tridi_col_PRECISION solve_tridi_col_single #define solve_tridi_col_PRECISION solve_tridi_col_single
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single #define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_single
......
#undef PRECISION
#undef MATH_DATATYPE
#define MATH_DATATYPE complex
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
#define PRECISION double
#else
#define PRECISION single
#endif
#ifdef DOUBLE_PRECISION_COMPLEX
#undef elpa_transpose_vectors_complex_PRECISION #undef elpa_transpose_vectors_complex_PRECISION
#undef elpa_reduce_add_vectors_complex_PRECISION #undef elpa_reduce_add_vectors_complex_PRECISION
#undef bandred_complex_PRECISION #undef bandred_complex_PRECISION
...@@ -7,7 +18,6 @@ ...@@ -7,7 +18,6 @@
#undef trans_ev_tridi_to_band_complex_PRECISION #undef trans_ev_tridi_to_band_complex_PRECISION
#undef band_band_complex_PRECISION #undef band_band_complex_PRECISION
#undef tridiag_complex_PRECISION #undef tridiag_complex_PRECISION
#undef trans_ev_complex_PRECISION
#undef solve_tridi_PRECISION #undef solve_tridi_PRECISION
#undef solve_tridi_col_PRECISION #undef solve_tridi_col_PRECISION
#undef solve_tridi_single_problem_PRECISION #undef solve_tridi_single_problem_PRECISION
...@@ -96,7 +106,6 @@ ...@@ -96,7 +106,6 @@
#define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_double #define trans_ev_tridi_to_band_complex_PRECISION trans_ev_tridi_to_band_complex_double
#define band_band_complex_PRECISION band_band_complex_double #define band_band_complex_PRECISION band_band_complex_double
#define tridiag_complex_PRECISION tridiag_complex_double #define tridiag_complex_PRECISION tridiag_complex_double
#define trans_ev_complex_PRECISION trans_ev_complex_double
#define solve_tridi_PRECISION solve_tridi_double #define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_col_PRECISION solve_tridi_col_double #define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double #define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
...@@ -178,6 +187,7 @@ ...@@ -178,6 +187,7 @@
#define CONST_COMPLEX_1_0 1.0_ck8 #define CONST_COMPLEX_1_0 1.0_ck8
#define size_of_PRECISION_complex size_of_double_complex_datatype #define size_of_PRECISION_complex size_of_double_complex_datatype
#else #else
#undef elpa_transpose_vectors_complex_PRECISION #undef elpa_transpose_vectors_complex_PRECISION