Commit a2b5931c authored by Andreas Marek

Start to clean up precision_macros.h

Due to a bug in the Intel compiler, some ifdefs are still necessary
parent 74a07ae8
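
The pattern behind this cleanup: cpp cannot expand a macro embedded in a single identifier such as solve_tridi_PRECISION, so every templated name previously needed its own definition in precision_macros.h (#define solve_tridi_PRECISION solve_tridi_double, and so on). Splitting the identifier with free-form continuation, solve_tridi_& on one line and &PRECISION on the next, leaves PRECISION standing alone as a preprocessor token; a single generic #define PRECISION double then expands every such name, which is why the per-name #define/#undef pairs disappear from precision_macros.h in the final hunks of this diff. A minimal, self-contained sketch of the trick (illustrative only, not part of the commit):

#define PRECISION double

program demo
  ! compile with: gfortran -cpp demo.F90
  implicit none
  ! cpp expands the standalone PRECISION token to "double"; Fortran
  ! continuation then glues solve_tridi_ and double into one name.
  call solve_tridi_&
       &PRECISION &
       (3)
contains
  subroutine solve_tridi_&
             &PRECISION &
             (na)
    integer, intent(in) :: na
    print *, 'solve_tridi_double called, na =', na
  end subroutine solve_tridi_&
                 &PRECISION
end program demo

As the commit message notes, the Intel compiler mishandles this construct in places, so some #if REALCASE / #if COMPLEXCASE guards are kept for now (see the tridiag_band hunks near the end of the diff).
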
@@ -52,7 +52,9 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
subroutine solve_tridi_&
&PRECISION &
( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success )
#ifdef HAVE_DETAILED_TIMINGS
@@ -141,7 +143,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
else
nev1 = MIN(nev,l_cols)
endif
call solve_tridi_col_PRECISION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
call solve_tridi_col_&
&PRECISION &
(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then
call timer%stop("solve_tridi" // PRECISION_SUFFIX)
@@ -215,7 +219,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
enddo
! Recursively merge sub problems
call merge_recursive_PRECISION(0, np_cols, wantDebug, success)
call merge_recursive_&
&PRECISION &
(0, np_cols, wantDebug, success)
if (.not.(success)) then
call timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
@@ -231,7 +237,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
return
contains
recursive subroutine merge_recursive_PRECISION(np_off, nprocs, wantDebug, success)
recursive subroutine merge_recursive_&
&PRECISION &
(np_off, nprocs, wantDebug, success)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
@@ -264,9 +272,13 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
np1 = nprocs/2
np2 = nprocs-np1
if (np1 > 1) call merge_recursive_PRECISION(np_off, np1, wantDebug, success)
if (np1 > 1) call merge_recursive_&
&PRECISION &
(np_off, np1, wantDebug, success)
if (.not.(success)) return
if (np2 > 1) call merge_recursive_PRECISION(np_off+np1, np2, wantDebug, success)
if (np2 > 1) call merge_recursive_&
&PRECISION &
(np_off+np1, np2, wantDebug, success)
if (.not.(success)) return
noff = limits(np_off)
@@ -316,22 +328,30 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
! Last merge, result distribution must be block cyclic, noff==0,
! p_col_bc is set so that only nev eigenvalues are calculated
call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
call merge_systems_&
&PRECISION &
(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success )
if (.not.(success)) return
else
! Not last merge, leave dense column distribution
call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
call merge_systems_&
&PRECISION &
(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success )
if (.not.(success)) return
endif
end subroutine merge_recursive_PRECISION
end subroutine merge_recursive_&
&PRECISION
end subroutine solve_tridi_PRECISION
end subroutine solve_tridi_&
&PRECISION
subroutine solve_tridi_col_PRECISION( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
subroutine solve_tridi_col_&
&PRECISION &
( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
@@ -427,7 +447,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
noff = limits(n) ! Start of subproblem
nlen = limits(n+1)-noff ! Size of subproblem
call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), &
call solve_tridi_single_problem_&
&PRECISION &
(nlen,d(noff+1),e(noff+1), &
q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
if (.not.(success)) return
@@ -456,7 +478,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
noff = limits(my_prow) ! Start of subproblem
nlen = limits(my_prow+1)-noff ! Size of subproblem
call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, &
call solve_tridi_single_problem_&
&PRECISION &
(nlen,d(noff+1),e(noff+1),qmat1, &
ubound(qmat1,dim=1), wantDebug, success)
if (.not.(success)) return
@@ -480,9 +504,13 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
do i=1,nlen
#ifdef WITH_MPI
call distribute_global_column_PRECISION(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
call distribute_global_column_&
&PRECISION &
(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
call distribute_global_column_PRECISION(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
call distribute_global_column_&
&PRECISION &
(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
enddo
@@ -525,7 +553,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
p_col_o(nev+1:na) = -1
endif
call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
call merge_systems_&
&PRECISION &
(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success)
if (.not.(success)) return
@@ -544,9 +574,12 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
call timer%stop("solve_tridi_col" // PRECISION_SUFFIX)
end subroutine solve_tridi_col_PRECISION
end subroutine solve_tridi_col_&
&PRECISION
recursive subroutine solve_tridi_single_problem_PRECISION(nlen, d, e, q, ldq, wantDebug, success)
recursive subroutine solve_tridi_single_problem_&
&PRECISION &
(nlen, d, e, q, ldq, wantDebug, success)
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
@@ -672,5 +705,6 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
enddo
call timer%stop("solve_tridi_single" // PRECISION_SUFFIX)
end subroutine solve_tridi_single_problem_PRECISION
end subroutine solve_tridi_single_problem_&
&PRECISION
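
The tridiag hunks that follow template over both the number type and the precision, assembling the name from two macros: tridiag_& / &MATH_DATATYPE& / &_& / &PRECISION. The same splitting trick works inside character literals, as in the timer%start call below: the quoted string is continued so that MATH_DATATYPE and PRECISION_SUFFIX each sit on a line containing no quote character, and the line-based preprocessor expands them even though, in the unpreprocessed source, they lie inside a continued literal. A sketch of that shape (illustrative; assumes a preprocessor that tolerates the unbalanced quotes on the continued lines, as gfortran's -cpp does):

#define MATH_DATATYPE real
#define PRECISION_SUFFIX "_double"

program demo2
  ! compile with: gfortran -cpp demo2.F90
  implicit none
  character(len=:), allocatable :: name
  ! MATH_DATATYPE and PRECISION_SUFFIX each get a quote-free line of
  ! their own, so cpp expands them before the compiler joins the literal.
  name = "tridiag_&
         &MATH_DATATYPE&
         &" // &
         PRECISION_SUFFIX
  print *, name   ! prints: tridiag_real_double
end program demo2

After preprocessing, the compiler sees an ordinary continued character literal, so no Fortran extension is required.
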
@@ -82,13 +82,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
#if REALCASE == 1
subroutine tridiag_real_PRECISION (na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
#endif
#if COMPLEXCASE == 1
subroutine tridiag_complex_PRECISION(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
d_vec, e_vec, tau, useGPU)
#endif
subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
@@ -211,12 +209,12 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
#if REALCASE == 1
call timer%start("tridiag_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("tridiag_complex" // PRECISION_SUFFIX)
#endif
call timer%start("tridiag_&
&MATH_DATATYPE&
&_" // &
PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
@@ -265,61 +263,36 @@
! todo: probably one should read it as v_row = vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "tmp", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "tmp", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row 1 element longer to allow storing and broadcasting tau together with it
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "v_row", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "v_row", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "u_row", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "u_row", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "v_col", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "v_col", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "u_col", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "u_col", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "ur_p", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "ur_p", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "ur_p", istat, errorMessage)
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "uc_p", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "uc_p", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */
tmp = 0
@@ -329,65 +302,65 @@
u_col = 0
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "vu_stored_rows", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "vu_stored_rows", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("tridiag_real", "uv_stored_cols", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("tridiag_complex", "uv_stored_cols", istat, errorMessage)
#endif
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * &
#if REALCASE == 1
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * &
#if REALCASE == 1
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * &
#if REALCASE == 1
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * &
#if REALCASE == 1
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * &
#if REALCASE == 1
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * &
#if REALCASE == 1
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
endif !useGPU
@@ -406,19 +379,22 @@
if (useGPU) then
! allocate memory for matrix A on the device and then copy the matrix
successCUDA = cuda_malloc(a_dev, lda * matrixCols * &
#if REALCASE == 1
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_PRECISION_real)
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * &
#if REALCASE == 1
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
size_of_PRECISION_real, cudaMemcpyHostToDevice)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
size_of_PRECISION_complex, cudaMemcpyHostToDevice)
#endif
check_alloc_cuda("tridiag", successCUDA)
endif
@@ -832,14 +808,14 @@
! global tile size is smaller than the global remaining matrix
if (tile_size < istep-1) then
#if REALCASE == 1
call elpa_reduce_add_vectors_real_PRECISION (u_row, ubound(u_row,dim=1), mpi_comm_rows, &
u_col, ubound(u_col,dim=1), mpi_comm_cols, istep-1, 1, nblk)
#endif
#if COMPLEXCASE == 1
call elpa_reduce_add_vectors_complex_PRECISION (u_row, ubound(u_row,dim=1), mpi_comm_rows, &
u_col, ubound(u_col,dim=1), mpi_comm_cols, istep-1, 1, nblk)
#endif
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
mpi_comm_cols, istep-1, 1, nblk)
endif
! Sum up all the u_col(:) parts, transpose u_col -> u_row
@@ -862,15 +838,13 @@
#endif /* WITH_MPI */
endif
#if REALCASE == 1
call elpa_transpose_vectors_real_PRECISION (u_col, ubound(u_col,dim=1), mpi_comm_cols, &
u_row, ubound(u_row,dim=1), mpi_comm_rows, 1, istep-1, 1, nblk)
#endif
#if COMPLEXCASE == 1
call elpa_transpose_vectors_complex_PRECISION (u_col, ubound(u_col,dim=1), mpi_comm_cols, &
u_row, ubound(u_row,dim=1), mpi_comm_rows, &
1, (istep-1), 1, nblk)
#endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
x = 0
@@ -1278,9 +1252,7 @@
!
! end subroutine
#if REALCASE == 1
end subroutine tridiag_real_PRECISION
#endif
#if COMPLEXCASE == 1
end subroutine tridiag_complex_PRECISION
#endif
end subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION
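
In the bandred hunks below, the templating extends to the argument list: the a_dev, tmat_dev and useQR arguments exist only in the real case, so #if REALCASE guards are placed between the continuation lines of a single subroutine statement and are stripped by cpp before the compiler joins the statement back together. A minimal sketch of that shape (argument names borrowed from the diff; the body is illustrative):

#define MATH_DATATYPE real
#define PRECISION double
#define REALCASE 1

program demo3
  ! compile with: gfortran -cpp demo3.F90
  implicit none
  call bandred_&
       &MATH_DATATYPE&
       &_&
       &PRECISION &
       (100, &
#if REALCASE == 1
       .true., &
#endif
       .false.)
contains
  subroutine bandred_&
             &MATH_DATATYPE&
             &_&
             &PRECISION &
             (na, &
#if REALCASE == 1
             useQR, &
#endif
             wantDebug)
    integer, intent(in) :: na
#if REALCASE == 1
    logical, intent(in) :: useQR    ! present only in the real case
#endif
    logical, intent(in) :: wantDebug
    print *, 'bandred_real_double: na =', na
  end subroutine bandred_&
                 &MATH_DATATYPE&
                 &_&
                 &PRECISION
end program demo3

With COMPLEXCASE set instead, cpp strips the guarded lines and the complex case compiles the same statement without the extra argument.
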
@@ -60,15 +60,26 @@
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
subroutine bandred_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, a, &
#if REALCASE == 1
a_dev, &
#endif
lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
#if REALCASE == 1
tmat_dev, &
#endif
wantDebug, useGPU, success &
#if REALCASE == 1
subroutine bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, useGPU, success, useQR)
, useQR)
#endif
#if COMPLEXCASE == 1
subroutine bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, useGPU, success)
)
#endif
!-------------------------------------------------------------------------------
! bandred_real/complex: Reduces a distributed symmetric matrix to band form
!
@@ -1966,12 +1977,11 @@ subroutine bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, numBlock
call timer%stop("bandred_complex" // PRECISION_SUFFIX)
#endif
end subroutine bandred_&
&MATH_DATATYPE&
&_&
&PRECISION
#if REALCASE == 1
end subroutine bandred_real_PRECISION ! slower for gpu on 10000 10000 ???
! slower for gpu on 10000 10000 ???
#endif
#if COMPLEXCASE == 1
end subroutine bandred_complex_PRECISION
#endif
#if REALCASE == 1
subroutine trans_ev_tridi_to_band_real_PRECISION &
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_tridi_to_band_complex_PRECISION &
#endif
subroutine trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nev, nblk, nbw, q, &
#if REALCASE == 1
q_dev, &
@@ -2816,7 +2814,8 @@
#endif /* COMPLEXCASE */
#if REALCASE == 1
end subroutine trans_ev_tridi_to_band_real_PRECISION
end subroutine trans_ev_tridi_to_band_real_&
&PRECISION
#endif
@@ -50,11 +50,12 @@
#endif
#if REALCASE == 1
subroutine tridiag_band_real_PRECISION &
subroutine tridiag_band_real_&
#endif
#if COMPLEXCASE == 1
subroutine tridiag_band_complex_PRECISION &
subroutine tridiag_band_complex_&
#endif
&PRECISION &
(na, nb, nblk, a, lda, d, e, matrixCols, &
#if REALCASE == 1
hh_trans_real, &
@@ -1446,10 +1447,13 @@
&" // &
&PRECISION_SUFFIX&
)
! Intel compiler bug makes these ifdefs necessary
#if REALCASE == 1
end subroutine tridiag_band_real_PRECISION
end subroutine tridiag_band_real_&
#endif
#if COMPLEXCASE == 1
end subroutine tridiag_band_complex_PRECISION
end subroutine tridiag_band_complex_&
#endif
&PRECISION
@@ -9,29 +9,9 @@
#undef elpa_reduce_add_vectors_NUMBER_PRECISION
#undef elpa_reduce_add_vectors_NUMBER_PRECISION_STR
#undef elpa_reduce_add_vectors_real_PRECISION
#undef bandred_NUMBER_PRECISION
#undef bandred_NUMBER_PRECISION_STR
#undef bandred_real_PRECISION
#undef trans_ev_band_to_full_NUMBER_PRECISION
#undef trans_ev_band_to_full_NUMBER_PRECISION_STR
#undef trans_ev_band_to_full_real_PRECISION
#undef tridiag_band_NUMBER_PRECISION
#undef tridiag_band_NUMBER_PRECISION_STR
#undef tridiag_band_real_PRECISION
#undef trans_ev_tridi_to_band_NUMBER_PRECISION
#undef trans_ev_tridi_to_band_NUMBER_PRECISION_STR
#undef trans_ev_tridi_to_band_real_PRECISION
#undef band_band_NUMBER_PRECISION
#undef band_band_NUMBER_PRECISION_STR
#undef band_band_real_PRECISION
#undef tridiag_NUMBER_PRECISION
#undef tridiag_NUMBER_PRECISION_STR
#undef tridiag_real_PRECISION
#undef trans_ev_NUMBER_PRECISION
#undef trans_ev_NUMBER_PRECISION_STR
#undef trans_ev_real_PRECISION
#undef solve_tridi_PRECISION
#undef solve_tridi_PRECISION_STR
#undef solve_tridi_col_PRECISION
#undef solve_tridi_col_PRECISION_STR
#undef solve_tridi_single_problem_PRECISION
@@ -129,21 +109,6 @@
#undef launch_extract_hh_tau_c_kernel_NUMBER_PRECISION
#undef launch_extract_hh_tau_c_kernel_NUMBER_PRECISION_STR
#undef launch_extract_hh_tau_c_kernel_real_PRECISION
#undef AVAILABLE_UPCASENUMBER_ELPA_KERNELS
#undef AVAILABLE_UPCASENUMBER_ELPA_KERNELS_STR
#undef AVAILABLE_UPCASEreal_ELPA_KERNELS
#undef UPCASENUMBER_ELPA_KERNEL_GENERIC
#undef UPCASENUMBER_ELPA_KERNEL_GENERIC_STR
#undef UPCASEreal_ELPA_KERNEL_GENERIC
#undef DEFAULT_UPCASENUMBER_ELPA_KERNEL
#undef DEFAULT_UPCASENUMBER_ELPA_KERNEL_STR
#undef DEFAULT_UPCASEreal_ELPA_KERNEL
#undef UPCASENUMBER_ELPA_KERNEL_NAMES
#undef UPCASENUMBER_ELPA_KERNEL_NAMES_STR
#undef UPCASEreal_ELPA_KERNEL_NAMES
#undef UPCASENUMBER_ELPA_KERNEL_GPU
#undef UPCASENUMBER_ELPA_KERNEL_GPU_STR
#undef UPCASEreal_ELPA_KERNEL_GPU
#undef PRECISION_GEMV
#undef PRECISION_TRMV
#undef PRECISION_GEMM
@@ -185,29 +150,9 @@
#define elpa_reduce_add_vectors_NUMBER_PRECISION elpa_reduce_add_vectors_real_double
#define elpa_reduce_add_vectors_NUMBER_PRECISION_STR 'elpa_reduce_add_vectors_real_double'
#define elpa_reduce_add_vectors_real_PRECISION elpa_reduce_add_vectors_real_double
#define bandred_NUMBER_PRECISION bandred_real_double
#define bandred_NUMBER_PRECISION_STR 'bandred_real_double'
#define bandred_real_PRECISION bandred_real_double
#define trans_ev_band_to_full_NUMBER_PRECISION trans_ev_band_to_full_real_double
#define trans_ev_band_to_full_NUMBER_PRECISION_STR 'trans_ev_band_to_full_real_double'
#define trans_ev_band_to_full_real_PRECISION trans_ev_band_to_full_real_double
#define tridiag_band_NUMBER_PRECISION tridiag_band_real_double
#define tridiag_band_NUMBER_PRECISION_STR 'tridiag_band_real_double'
#define tridiag_band_real_PRECISION tridiag_band_real_double
#define trans_ev_tridi_to_band_NUMBER_PRECISION trans_ev_tridi_to_band_real_double
#define trans_ev_tridi_to_band_NUMBER_PRECISION_STR 'trans_ev_tridi_to_band_real_double'
#define trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define band_band_NUMBER_PRECISION band_band_real_double
#define band_band_NUMBER_PRECISION_STR 'band_band_real_double'
#define band_band_real_PRECISION band_band_real_double
#define tridiag_NUMBER_PRECISION tridiag_real_double
#define tridiag_NUMBER_PRECISION_STR 'tridiag_real_double'
#define tridiag_real_PRECISION tridiag_real_double
#define trans_ev_NUMBER_PRECISION trans_ev_real_double
#define trans_ev_NUMBER_PRECISION_STR 'trans_ev_real_double'
#define trans_ev_real_PRECISION trans_ev_real_double
#define solve_tridi_PRECISION solve_tridi_double
#define solve_tridi_PRECISION_STR 'solve_tridi_double'
#define solve_tridi_col_PRECISION solve_tridi_col_double
#define solve_tridi_col_PRECISION_STR 'solve_tridi_col_double'
#define solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
@@ -305,21 +250,6 @@
#define launch_extract_hh_tau_c_kernel_NUMBER_PRECISION launch_extract_hh_tau_c_kernel_real_double
#define launch_extract_hh_tau_c_kernel_NUMBER_PRECISION_STR 'launch_extract_hh_tau_c_kernel_real_double'
#define launch_extract_hh_tau_c_kernel_real_PRECISION launch_extract_hh_tau_c_kernel_real_double
#define AVAILABLE_UPCASENUMBER_ELPA_KERNELS AVAILABLE_REAL_ELPA_KERNELS
#define AVAILABLE_UPCASENUMBER_ELPA_KERNELS_STR 'AVAILABLE_REAL_ELPA_KERNELS'
#define AVAILABLE_UPCASEreal_ELPA_KERNELS AVAILABLE_UPCASEreal_ELPA_KERNELS
#define UPCASENUMBER_ELPA_KERNEL_GENERIC REAL_ELPA_KERNEL_GENERIC
#define UPCASENUMBER_ELPA_KERNEL_GENERIC_STR 'REAL_ELPA_KERNEL_GENERIC'
#define UPCASEreal_ELPA_KERNEL_GENERIC UPCASEreal_ELPA_KERNEL_GENERIC
#define DEFAULT_UPCASENUMBER_ELPA_KERNEL DEFAULT_REAL_ELPA_KERNEL
#define DEFAULT_UPCASENUMBER_ELPA_KERNEL_STR 'DEFAULT_REAL_ELPA_KERNEL'
#define DEFAULT_UPCASEreal_ELPA_KERNEL DEFAULT_UPCASEreal_ELPA_KERNEL