Commit 50b3ab16 authored by Pavel Kus's avatar Pavel Kus
Browse files

single/double macros introduced in...

single/double macros introduced in elpa2_trans_ev_band_to_full_complex_template.X90, some macro names unified
parent ab0dfe98
......@@ -67,9 +67,9 @@ blas_tokens = [
"PRECISION_LAPY2",
"PRECISION_LAED4",
"PRECISION_LAED5",
"cublas_PRECISION_gemm",
"cublas_PRECISION_trmm",
"cublas_PRECISION_gemv",
"cublas_PRECISION_GEMM",
"cublas_PRECISION_TRMM",
"cublas_PRECISION_GEMV",
]
explicit_tokens_complex = [
......@@ -82,6 +82,7 @@ explicit_tokens_complex = [
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]
......
......@@ -443,13 +443,13 @@
if (l_rows>0) then
if(useGPU) then
#if REALCASE == 1
call cublas_PRECISION_gemm('T', 'N', nstor, l_cols, l_rows, &
call cublas_PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, &
CONST_1_0, hvm_dev, hvm_ubnd, &
q_dev, ldq, &
CONST_0_0, tmp_dev, nstor)
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_gemm('C', 'N', nstor, l_cols, l_rows, &
call cublas_PRECISION_GEMM('C', 'N', nstor, l_cols, l_rows, &
CONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, &
CZERO, tmp_dev, nstor)
......@@ -528,19 +528,19 @@
if (l_rows>0) then
if (useGPU) then
#if REALCASE == 1
call cublas_PRECISION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONST_1_0, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call cublas_PRECISION_gemm('N', 'N' ,l_rows ,l_cols ,nstor, &
call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
-CONST_1_0, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
CONST_1_0, q_dev, ldq)
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONE, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call cublas_PRECISION_gemm('N', 'N' ,l_rows ,l_cols ,nstor, &
call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
-CONE, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
CONE, q_dev, ldq)
......
......@@ -680,13 +680,13 @@
#endif
#if REALCASE == 1
call cublas_PRECISION_gemv('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
call cublas_PRECISION_GEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONST_1_0,a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_PRECISION_real, 1, &
CONST_1_0, u_col_dev + (l_col_beg - 1) * size_of_PRECISION_real, 1)
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_gemv('C',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
call cublas_PRECISION_GEMV('C',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONE,a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_PRECISION_complex, 1, &
CONE, u_col_dev + (l_col_beg - 1) * size_of_PRECISION_complex, 1)
......@@ -694,13 +694,13 @@
if(i/=j) then
#if REALCASE == 1
call cublas_PRECISION_gemv('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
call cublas_PRECISION_GEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONST_1_0,a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_PRECISION_real, 1, &
CONST_1_0, u_row_dev + (l_row_beg - 1) * size_of_PRECISION_real, 1)
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_gemv('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
call cublas_PRECISION_GEMV('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_PRECISION_complex, 1, &
CONE, u_row_dev + (l_row_beg - 1) * size_of_PRECISION_complex, 1)
......@@ -938,13 +938,13 @@
if (useGPU) then
#if REALCASE == 1
call cublas_PRECISION_gemm('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
call cublas_PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
CONST_1_0, vu_stored_rows_dev + (l_row_beg - 1) * size_of_PRECISION_real, max_local_rows, &
uv_stored_cols_dev + (l_col_beg - 1) * size_of_PRECISION_real, max_local_cols, &
CONST_1_0, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * size_of_PRECISION_real, lda)
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_gemm('N', 'C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
call cublas_PRECISION_GEMM('N', 'C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
CONE, vu_stored_rows_dev + (l_row_beg - 1) * size_of_PRECISION_complex, max_local_rows, &
uv_stored_cols_dev + (l_col_beg - 1) * size_of_PRECISION_complex, max_local_cols, &
CONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex, lda)
......
......@@ -707,12 +707,12 @@
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call cublas_PRECISION_gemm('T', 'N', lce-lcs+1, n_cols, lre, &
call cublas_PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, &
CONST_1_0, (a_dev + ((lcs-1)*lda*size_of_PRECISION_real)), lda, vmr_dev,cur_l_rows, &
CONST_1_0, (umc_dev+ (lcs-1)*size_of_PRECISION_real), cur_l_cols)
if(i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call cublas_PRECISION_gemm('N', 'N', lre,n_cols, lce-lcs+1,&
call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1,&
CONST_1_0, (a_dev+ ((lcs-1)*lda*size_of_PRECISION_real)), lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_real), cur_l_cols, &
CONST_1_0, (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_real), cur_l_rows)
......@@ -875,7 +875,7 @@
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call cublas_PRECISION_trmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols)
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
......@@ -883,11 +883,11 @@
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call cublas_PRECISION_gemm('T', 'N', n_cols, n_cols, l_cols, &
call cublas_PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, &
CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, &
CONST_0_0, vav_dev, nbw)
call cublas_PRECISION_trmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, vav_dev, nbw)
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
......@@ -905,7 +905,7 @@
endif
! U = U - 0.5 * V * VAV
call cublas_PRECISION_gemm('N', 'N', l_cols, n_cols, n_cols,&
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
-CONST_0_5, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, vav_dev,nbw,&
CONST_1_0, umc_dev, cur_l_cols)
......@@ -939,7 +939,7 @@
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
call cublas_PRECISION_gemm('N', 'T', lre, lce-lcs+1, 2*n_cols, -CONST_1_0, &
call cublas_PRECISION_GEMM('N', 'T', lre, lce-lcs+1, 2*n_cols, -CONST_1_0, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*size_of_PRECISION_real), cur_l_cols, &
CONST_1_0, (a_dev+(lcs-1)*lda*size_of_PRECISION_real), lda)
enddo
......
......@@ -192,6 +192,7 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define COMPLEX_DATATYPE ck8
#include "precision_macros_complex.h"
#include "elpa2_compute_complex_template.X90"
#undef DOUBLE_PRECISION_COMPLEX
......@@ -206,6 +207,7 @@ module ELPA2_compute
#define REAL_DATATYPE rk4
#define COMPLEX_DATATYPE ck4
#include "precision_macros_complex.h"
#include "elpa2_compute_complex_template.X90"
#undef DOUBLE_PRECISION_COMPLEX
......
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_band_to_full_complex_double(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#else
subroutine trans_ev_band_to_full_complex_single(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
subroutine trans_ev_band_to_full_complex_PRECISION(na, nqc, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
......@@ -80,11 +74,7 @@
character(200) :: errorMessage
logical :: successCUDA
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("trans_ev_band_to_full_complex_double")
#else
call timer%start("trans_ev_band_to_full_complex_single")
#endif
call timer%start("trans_ev_band_to_full_complex" // PRECISION_SUFFIX)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -133,38 +123,22 @@
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when allocating tmat_temp "//errorMessage
! endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_single_complex_datatype)
#endif
successCUDA = cuda_malloc(hvm_dev, max_local_rows*nbw*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
#endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_single_complex_datatype)
#endif
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_double_complex_datatype)
#else
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_single_complex_datatype)
#endif
successCUDA = cuda_malloc(tmp_dev, max_local_cols*nbw*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
......@@ -177,31 +151,18 @@
! stop
! endif
endif
#ifdef DOUBLE_PRECISION_COMPLEX
hvm = 0._ck8 ! Must be set to 0 !!!
hvb = 0._ck8 ! Safety only
#else
hvm = 0._ck4 ! Must be set to 0 !!!
hvb = 0._ck4 ! Safety only
#endif
hvm = CONST_COMPLEX_0_0 ! Must be set to 0 !!!
hvb = CONST_COMPLEX_0_0 ! Safety only
if (useGPU) then
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyHostToDevice)
#endif
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_double_complex_datatype)
#else
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_single_complex_datatype)
#endif
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemset"
stop
......@@ -233,14 +194,9 @@
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_COMPLEX
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_DOUBLE_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#else
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_COMPLEX, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
#endif
call timer%stop("mpi_communication")
call timer%start("mpi_communication")
call MPI_Bcast(hvb(ns+1), nb-ns, MPI_COMPLEX_PRECISION, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
ns = nb
......@@ -261,11 +217,7 @@
enddo
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(hvm_dev,loc(hvm),(max_local_rows*nbw*size_of_double_complex_datatype),cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(hvm_dev,loc(hvm),(max_local_rows*nbw*size_of_single_complex_datatype),cudaMemcpyHostToDevice)
#endif
successCUDA = cuda_memcpy(hvm_dev,loc(hvm),(max_local_rows*nbw*size_of_PRECISION_complex),cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
......@@ -278,32 +230,18 @@
if (l_rows > 0) then
if (useGPU) then
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
call cublas_PRECISION_GEMM('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
q_dev, ldq, CZERO, tmp_dev, n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_double_complex_datatype, &
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#else
call cublas_cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
q_dev, ldq, CZERO, tmp_dev, n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
else
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
call PRECISION_GEMM('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1, n_cols)
#else
call cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1, n_cols)
#endif
endif
else ! l_rows > 0
if (useGPU) then
......@@ -318,21 +256,12 @@
! stop
! endif
endif
#ifdef DOUBLE_PRECISION_COMPLEX
tmp1(1:l_cols*n_cols) = 0._ck8
#else
tmp1(1:l_cols*n_cols) = 0._ck4
#endif
tmp1(1:l_cols*n_cols) = CONST_COMPLEX_0_0
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
......@@ -343,21 +272,9 @@
if (useGPU) then
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
#endif /* WITH_MPI */
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
......@@ -365,53 +282,25 @@
endif
! tmat_temp(1:nbw,1:nbw) = tmat(1:nbw,1:nbw,istep)
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* &
size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw* &
size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONE, q_dev, ldq)
#else
call cublas_ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
call cublas_PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONE, q_dev, ldq)
#endif
else ! not useGPU
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
call PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#else
call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
call PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#else
call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#endif
#endif /* WITH_MPI */
endif
endif
......@@ -451,11 +340,7 @@
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_double_complex_datatype, cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_single_complex_datatype, cudaMemcpyDeviceToHost)
#endif
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
......@@ -478,16 +363,8 @@
!print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
!endif
endif ! use GPU
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("trans_ev_band_to_full_complex_double")
#else
call timer%stop("trans_ev_band_to_full_complex_single")
#endif
call timer%stop("trans_ev_band_to_full_complex" // PRECISION_SUFFIX)
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine trans_ev_band_to_full_complex_double
#else
end subroutine trans_ev_band_to_full_complex_single
#endif
end subroutine trans_ev_band_to_full_complex_PRECISION
......@@ -224,7 +224,7 @@
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
call cublas_PRECISION_gemm('T', 'N', n_cols, l_cols, l_rows, CONST_1_0, hvm_dev, max_local_rows, &
call cublas_PRECISION_GEMM('T', 'N', n_cols, l_cols, l_rows, CONST_1_0, hvm_dev, max_local_rows, &
q_dev, ldq , CONST_0_0, tmp_dev, n_cols)
#ifdef WITH_MPI
......@@ -302,8 +302,8 @@
endif
!#endif /* WITH_MPI */
call cublas_PRECISION_trmm('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_PRECISION_gemm('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm_dev, max_local_rows, &
call cublas_PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONST_1_0, q_dev, ldq)
! copy to host maybe this can be avoided
......
......@@ -58,9 +58,9 @@
#undef PRECISION_LAPY2
#undef PRECISION_LAED4
#undef PRECISION_LAED5
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_trmm
#undef cublas_PRECISION_gemv
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef CONST_0_0
#undef CONST_0_5
......@@ -128,9 +128,9 @@
#define PRECISION_LAPY2 DLAPY2
#define PRECISION_LAED4 DLAED4
#define PRECISION_LAED5 DLAED5
#define cublas_PRECISION_gemm cublas_Dgemm
#define cublas_PRECISION_trmm cublas_Dtrmm
#define cublas_PRECISION_gemv cublas_Dgemv
#define cublas_PRECISION_GEMM cublas_DGEMM
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
#define PRECISION_SUFFIX "_double"
#define CONST_0_0 0.0_rk8
#define CONST_0_5 0.5_rk8
......@@ -199,9 +199,9 @@
#undef PRECISION_LAPY2
#undef PRECISION_LAED4
#undef PRECISION_LAED5
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_trmm
#undef cublas_PRECISION_gemv
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef CONST_0_0
#undef CONST_0_5
......@@ -269,9 +269,9 @@
#define PRECISION_LAPY2 SLAPY2
#define PRECISION_LAED4 SLAED4
#define PRECISION_LAED5 SLAED5
#define cublas_PRECISION_gemm cublas_Sgemm
#define cublas_PRECISION_trmm cublas_Strmm
#define cublas_PRECISION_gemv cublas_Sgemv
#define cublas_PRECISION_GEMM cublas_SGEMM
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
#define PRECISION_SUFFIX "_single"
#define CONST_0_0 0.0_rk4
#define CONST_0_5 0.5_rk4
......
......@@ -58,9 +58,9 @@
#undef PRECISION_LAPY2
#undef PRECISION_LAED4
#undef PRECISION_LAED5
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_trmm
#undef cublas_PRECISION_gemv
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_REAL_PRECISION
......@@ -70,6 +70,7 @@
#undef PRECISION_REAL
#undef CONST_REAL_0_0
#undef CONST_REAL_1_0
#undef CONST_COMPLEX_0_0
#undef size_of_PRECISION_complex
#define elpa_transpose_vectors_complex_PRECISION elpa_transpose_vectors_complex_double
#define elpa_reduce_add_vectors_complex_PRECISION elpa_reduce_add_vectors_complex_double
......@@ -130,9 +131,9 @@
#define PRECISION_LAPY2 ZLAPY2
#define PRECISION_LAED4 ZLAED4
#define PRECISION_LAED5 ZLAED5
#define cublas_PRECISION_gemm cublas_Zgemm
#define cublas_PRECISION_trmm cublas_Ztrmm
#define cublas_PRECISION_gemv cublas_Zgemv
#define cublas_PRECISION_GEMM cublas_ZGEMM
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define PRECISION_SUFFIX "_double"
#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX
#define MPI_REAL_PRECISION MPI_REAL8
......@@ -142,6 +143,7 @@
#define PRECISION_REAL DREAL
#define CONST_REAL_0_0 0.0_rk8
#define CONST_REAL_1_0 1.0_rk8
#define CONST_COMPLEX_0_0 0.0_ck8
#define size_of_PRECISION_complex size_of_double_complex_datatype
#else
#undef elpa_transpose_vectors_complex_PRECISION
......@@ -203,9 +205,9 @@
#undef PRECISION_LAPY2
#undef PRECISION_LAED4
#undef PRECISION_LAED5
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_trmm
#undef cublas_PRECISION_gemv
#undef cublas_PRECISION_GEMM
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_REAL_PRECISION
......@@ -215,6 +217,7 @@
#undef PRECISION_REAL
#undef CONST_REAL_0_0
#undef CONST_REAL_1_0
#undef CONST_COMPLEX_0_0
#undef size_of_PRECISION_complex
#define elpa_transpose_vectors_complex_PRECISION elpa_transpose_vectors_complex_single
#define elpa_reduce_add_vectors_complex_PRECISION elpa_reduce_add_vectors_complex_single
......@@ -275,9 +278,9 @@
#define PRECISION_LAPY2 CLAPY2
#define PRECISION_LAED4 CLAED4
#define PRECISION_LAED5 CLAED5
#define cublas_PRECISION_gemm cublas_Cgemm
#define cublas_PRECISION_trmm cublas_Ctrmm
#define cublas_PRECISION_gemv cublas_Cgemv
#define cublas_PRECISION_GEMM cublas_CGEMM
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
#define PRECISION_SUFFIX "_single"
#define MPI_COMPLEX_PRECISION MPI_COMPLEX
#define MPI_REAL_PRECISION MPI_REAL4
......@@ -287,5 +290,6 @@
#define PRECISION_REAL REAL
#define CONST_REAL_0_0 0.0_rk4
#define CONST_REAL_1_0 1.0_rk4
#define CONST_COMPLEX_0_0 0.0_ck4
#define size_of_PRECISION_complex size_of_single_complex_datatype
#endif