Commit ab0dfe98 authored by Pavel Kus's avatar Pavel Kus
Browse files

real precision header generated. Removed M_

parent 5cf0ae52
#!/usr/bin/python
import sys
simple_tokens = [
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
"tridiag_band_NUMBER_PRECISION",
"trans_ev_tridi_to_band_NUMBER_PRECISION",
"band_band_NUMBER_PRECISION",
"tridiag_NUMBER_PRECISION",
"trans_ev_NUMBER_PRECISION",
"solve_tridi_PRECISION",
"solve_tridi_col_PRECISION",
"solve_tridi_single_problem_PRECISION",
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION",
"unpack_row_NUMBER_cpu_openmp_PRECISION",
"unpack_and_prepare_row_group_NUMBER_gpu_PRECISION",
"extract_hh_tau_NUMBER_gpu_PRECISION",
"compute_hh_dot_products_NUMBER_gpu_PRECISION",
"compute_hh_trafo_NUMBER_cpu_openmp_PRECISION",
"compute_hh_trafo_NUMBER_cpu_PRECISION",
"pack_row_group_NUMBER_gpu_PRECISION",
"pack_row_NUMBER_cpu_openmp_PRECISION",
"pack_row_NUMBER_cpu_PRECISION",
"wy_gen_PRECISION",
"wy_right_PRECISION",
"wy_left_PRECISION",
"wy_symm_PRECISION",
"merge_recursive_PRECISION",
"merge_systems_PRECISION",
"distribute_global_column_PRECISION",
"check_monotony_PRECISION",
"global_gather_PRECISION",
"resort_ev_PRECISION",
"transform_columns_PRECISION",
"solve_secular_equation_PRECISION",
"global_product_PRECISION",
"add_tmp_PRECISION",
"v_add_s_PRECISION",
]
blas_tokens = [
"PRECISION_GEMV",
"PRECISION_TRMV",
"PRECISION_GEMM",
"PRECISION_TRMM",
"PRECISION_HERK",
"PRECISION_SYRK",
"PRECISION_SYMV",
"PRECISION_SYMM",
"PRECISION_SYR2",
"PRECISION_SYR2K",
"PRECISION_GEQRF",
"PRECISION_STEDC",
"PRECISION_STEQR",
"PRECISION_LAMRG",
"PRECISION_LAMCH",
"PRECISION_LAPY2",
"PRECISION_LAED4",
"PRECISION_LAED5",
"cublas_PRECISION_gemm",
"cublas_PRECISION_trmm",
"cublas_PRECISION_gemv",
]
explicit_tokens_complex = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
("PRECISION_IMAG", "DIMAG", "AIMAG"),
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]
explicit_tokens_real = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("CONST_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_0_5", "0.5_rk8", "0.5_rk4"),
("CONST_1_0", "1.0_rk8", "1.0_rk4"),
("CONST_2_0", "2.0_rk8", "2.0_rk4"),
("CONST_8_0", "8.0_rk8", "8.0_rk4"),
("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
]
explicit_order = {"single":2, "double":1}
blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","single") : "C", ("complex","double") : "Z"}
def print_variant(number, precision, explicit):
for token in simple_tokens:
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
for token in explicit:
print "#define ", token[0], token[explicit_order[precision]]
def print_undefs(number, explicit):
for token in simple_tokens:
print "#undef ", token.replace("NUMBER", number)
for token in blas_tokens:
print "#undef ", token
for token in explicit:
print "#undef ", token[0]
if(sys.argv[1] == "complex"):
print "#ifdef DOUBLE_PRECISION_COMPLEX"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "double", explicit_tokens_complex)
print "#else"
print_undefs("complex", explicit_tokens_complex)
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
elif(sys.argv[1] == "real"):
print "#ifdef DOUBLE_PRECISION_REAL"
print_undefs("real", explicit_tokens_real)
print_variant("real", "double", explicit_tokens_real)
print "#else"
print_undefs("real", explicit_tokens_real)
print_variant("real", "single", explicit_tokens_real)
print "#endif"
else:
assert(False)
\ No newline at end of file
#!/usr/bin/python
simple_tokens = ["tridiag_complex_PRECISION",
"trans_ev_complex_PRECISION",
"solve_complex_PRECISION",
"hh_transform_complex_PRECISION",
"elpa_transpose_vectors_complex_PRECISION",
"elpa_reduce_add_vectors_complex_PRECISION",
]
blas_tokens = ["PRECISION_GEMV",
"PRECISION_TRMV",
"PRECISION_GEMM",
"PRECISION_TRMM",
"PRECISION_HERK",
"cublas_PRECISION_gemm",
"cublas_PRECISION_trmm",
"cublas_PRECISION_gemv",
]
explicit_tokens = [("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
("PRECISION_IMAG", "DIMAG", "AIMAG"),
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]
print "#ifdef DOUBLE_PRECISION_COMPLEX"
for token in simple_tokens:
print "#define ", token, token.replace("PRECISION", "double")
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", "Z")
for token in explicit_tokens:
print "#define ", token[0], token[1]
print "#else"
for token in simple_tokens:
print "#undef ", token
for token in blas_tokens:
print "#undef ", token
for token in explicit_tokens:
print "#undef ", token[0]
for token in simple_tokens:
print "#define ", token, token.replace("PRECISION", "single")
for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", "C")
for token in explicit_tokens:
print "#define ", token[0], token[2]
print "#endif"
...@@ -51,7 +51,7 @@ ...@@ -51,7 +51,7 @@
! with their original authors, but shall adhere to the licensing terms ! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING". ! distributed along with the original code in the file "COPYING".
#endif #endif
subroutine M_merge_systems_PRECISION( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & subroutine merge_systems_PRECISION( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success) l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -77,7 +77,7 @@ ...@@ -77,7 +77,7 @@
integer(kind=ik), parameter :: max_strip=128 integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: M_PRECISION_LAMCH, M_PRECISION_LAPY2 real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, & real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
qtrans(2,2), dmax, zmax, d1new, d2new qtrans(2,2), dmax, zmax, d1new, d2new
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), & real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
...@@ -118,7 +118,7 @@ ...@@ -118,7 +118,7 @@
endif endif
#endif #endif
call timer%start("merge_systems" // M_PRECISION_SUFFIX) call timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true. success = .true.
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -130,7 +130,7 @@ ...@@ -130,7 +130,7 @@
! If my processor column isn't in the requested set, do nothing ! If my processor column isn't in the requested set, do nothing
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
! Determine number of "next" and "prev" column for ring sends ! Determine number of "next" and "prev" column for ring sends
...@@ -146,14 +146,14 @@ ...@@ -146,14 +146,14 @@
else else
np_prev = my_pcol - 1 np_prev = my_pcol - 1
endif endif
call M_check_monotony_PRECISION(nm,d,'Input1',wantDebug, success) call check_monotony_PRECISION(nm,d,'Input1',wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
call M_check_monotony_PRECISION(na-nm,d(nm+1),'Input2',wantDebug, success) call check_monotony_PRECISION(na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
! Get global number of processors and my processor number. ! Get global number of processors and my processor number.
...@@ -188,7 +188,7 @@ ...@@ -188,7 +188,7 @@
! Calculations start here ! Calculations start here
beta = abs(e) beta = abs(e)
sig = sign(M_CONST_1_0,e) sig = sign(CONST_1_0,e)
! Calculate rank-1 modifier z ! Calculate rank-1 modifier z
...@@ -208,20 +208,20 @@ ...@@ -208,20 +208,20 @@
enddo enddo
endif endif
call M_global_gather_PRECISION(z, na) call global_gather_PRECISION(z, na)
! Normalize z so that norm(z) = 1. Since z is the concatenation of ! Normalize z so that norm(z) = 1. Since z is the concatenation of
! two normalized vectors, norm2(z) = sqrt(2). ! two normalized vectors, norm2(z) = sqrt(2).
z = z/sqrt(M_CONST_2_0) z = z/sqrt(CONST_2_0)
rho = M_CONST_2_0*beta rho = CONST_2_0*beta
! Calculate index for merging both systems by ascending eigenvalues ! Calculate index for merging both systems by ascending eigenvalues
call M_PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx ) call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
! Calculate the allowable deflation tolerance ! Calculate the allowable deflation tolerance
zmax = maxval(abs(z)) zmax = maxval(abs(z))
dmax = maxval(abs(d)) dmax = maxval(abs(d))
EPS = M_PRECISION_LAMCH( 'Epsilon' ) EPS = PRECISION_LAMCH( 'Epsilon' )
TOL = M_CONST_8_0*EPS*MAX(dmax,zmax) TOL = CONST_8_0*EPS*MAX(dmax,zmax)
! If the rank-1 modifier is small enough, no more needs to be done ! If the rank-1 modifier is small enough, no more needs to be done
! except to reorganize D and Q ! except to reorganize D and Q
...@@ -236,9 +236,9 @@ ...@@ -236,9 +236,9 @@
enddo enddo
! Rearrange eigenvectors ! Rearrange eigenvectors
call M_resort_ev_PRECISION(idx, na) call resort_ev_PRECISION(idx, na)
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
ENDIF ENDIF
...@@ -277,7 +277,7 @@ ...@@ -277,7 +277,7 @@
! Find sqrt(a**2+b**2) without overflow or ! Find sqrt(a**2+b**2) without overflow or
! destructive underflow. ! destructive underflow.
TAU = M_PRECISION_LAPY2( C, S ) TAU = PRECISION_LAPY2( C, S )
T = D1(na1) - D(idx(i)) T = D1(na1) - D(idx(i))
C = C / TAU C = C / TAU
S = -S / TAU S = -S / TAU
...@@ -328,7 +328,7 @@ ...@@ -328,7 +328,7 @@
qtrans(1,1) = C; qtrans(1,2) =-S qtrans(1,1) = C; qtrans(1,2) =-S
qtrans(2,1) = S; qtrans(2,2) = C qtrans(2,1) = S; qtrans(2,2) = C
call M_transform_columns_PRECISION(idx(i), idx1(na1)) call transform_columns_PRECISION(idx(i), idx1(na1))
if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2 if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
...@@ -348,14 +348,14 @@ ...@@ -348,14 +348,14 @@
endif endif
enddo enddo
call M_check_monotony_PRECISION(na1,d1,'Sorted1', wantDebug, success) call check_monotony_PRECISION(na1,d1,'Sorted1', wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
call M_check_monotony_PRECISION(na2,d2,'Sorted2', wantDebug, success) call check_monotony_PRECISION(na2,d2,'Sorted2', wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
...@@ -365,17 +365,17 @@ ...@@ -365,17 +365,17 @@
if (na1==1) then if (na1==1) then
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2 else ! na1==2
call M_PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1)) call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call M_PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2)) call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call M_transform_columns_PRECISION(idx1(1), idx1(2)) call transform_columns_PRECISION(idx1(1), idx1(2))
endif endif
! Add the deflated eigenvalues ! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2) d(na1+1:na) = d2(1:na2)
! Calculate arrangement of all eigenvalues in output ! Calculate arrangement of all eigenvalues in output
call M_PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
! Rearrange eigenvalues ! Rearrange eigenvalues
tmp = d tmp = d
...@@ -392,7 +392,7 @@ ...@@ -392,7 +392,7 @@
idxq1(i) = idx2(idx(i)-na1) idxq1(i) = idx2(idx(i)-na1)
endif endif
enddo enddo
call M_resort_ev_PRECISION(idxq1, na) call resort_ev_PRECISION(idxq1, na)
else if (na1>2) then else if (na1>2) then
! Solve secular equation ! Solve secular equation
...@@ -407,19 +407,19 @@ ...@@ -407,19 +407,19 @@
info = 0 info = 0
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
call timer%start("OpenMP parallel" // M_PRECISION_SUFFIX) call timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) !$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
my_thread = omp_get_thread_num() my_thread = omp_get_thread_num()
!$OMP DO !$OMP DO
#endif #endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call M_PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
if (info/=0) then if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
! use the more stable bisection algorithm in solve_secular_equation ! use the more stable bisection algorithm in solve_secular_equation
! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
call M_solve_secular_equation_PRECISION(na1, i, d1, z1, delta, rho, s) call solve_secular_equation_PRECISION(na1, i, d1, z1, delta, rho, s)
endif endif
! Compute updated z ! Compute updated z
...@@ -453,26 +453,26 @@ ...@@ -453,26 +453,26 @@
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
!$OMP END PARALLEL !$OMP END PARALLEL
call timer%stop("OpenMP parallel" // M_PRECISION_SUFFIX) call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
do i = 0, max_threads-1 do i = 0, max_threads-1
z(1:na1) = z(1:na1)*z_p(1:na1,i) z(1:na1) = z(1:na1)*z_p(1:na1,i)
enddo enddo
#endif #endif
call M_global_product_PRECISION(z, na1) call global_product_PRECISION(z, na1)
z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) ) z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
call M_global_gather_PRECISION(dbase, na1) call global_gather_PRECISION(dbase, na1)
call M_global_gather_PRECISION(ddiff, na1) call global_gather_PRECISION(ddiff, na1)
d(1:na1) = dbase(1:na1) - ddiff(1:na1) d(1:na1) = dbase(1:na1) - ddiff(1:na1)
! Calculate scale factors for eigenvectors ! Calculate scale factors for eigenvectors
ev_scale(:) = M_CONST_0_0 ev_scale(:) = CONST_0_0
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
call timer%start("OpenMP parallel" // M_PRECISION_SUFFIX) call timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, & !$OMP PARALLEL DO PRIVATE(i) SHARED(na1, my_proc, n_procs, &
!$OMP d1,dbase, ddiff, z, ev_scale) & !$OMP d1,dbase, ddiff, z, ev_scale) &
...@@ -487,32 +487,32 @@ ...@@ -487,32 +487,32 @@
! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
! in exactly this order, but we want to prevent compiler optimization ! in exactly this order, but we want to prevent compiler optimization
! ev_scale_val = ev_scale(i) ! ev_scale_val = ev_scale(i)
call M_add_tmp_PRECISION(d1, dbase, ddiff, z, ev_scale(i), na1,i) call add_tmp_PRECISION(d1, dbase, ddiff, z, ev_scale(i), na1,i)
! ev_scale(i) = ev_scale_val ! ev_scale(i) = ev_scale_val
enddo enddo
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
!$OMP END PARALLEL DO !$OMP END PARALLEL DO
call timer%stop("OpenMP parallel" // M_PRECISION_SUFFIX) call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#endif #endif
call M_global_gather_PRECISION(ev_scale, na1) call global_gather_PRECISION(ev_scale, na1)
! Add the deflated eigenvalues ! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2) d(na1+1:na) = d2(1:na2)
! Calculate arrangement of all eigenvalues in output ! Calculate arrangement of all eigenvalues in output
call M_PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
! Rearrange eigenvalues ! Rearrange eigenvalues
tmp = d tmp = d
do i=1,na do i=1,na
d(i) = tmp(idx(i)) d(i) = tmp(idx(i))
enddo enddo
call M_check_monotony_PRECISION(na,d,'Output', wantDebug, success) call check_monotony_PRECISION(na,d,'Output', wantDebug, success)
if (.not.(success)) then if (.not.(success)) then
call timer%stop("merge_systems" // M_PRECISION_SUFFIX) call timer%stop("merge_systems" // PRECISION_SUFFIX)
return return
endif endif
! Eigenvector calculations ! Eigenvector calculations
...@@ -614,7 +614,7 @@ ...@@ -614,7 +614,7 @@
endif endif
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, M_MPI_REAL_PRECISION, & call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL_PRECISION, &
np_next, 1111, np_prev, 1111, & np_next, 1111, np_prev, 1111, &
mpi_comm_cols, MPI_STATUS_IGNORE, mpierr) mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication") call timer%stop("mpi_communication")
...@@ -677,7 +677,7 @@ ...@@ -677,7 +677,7 @@
! Calculate the j-th eigenvector of the deflated system ! Calculate the j-th eigenvector of the deflated system
! See above why we are doing it this way! ! See above why we are doing it this way!
tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
call M_v_add_s_PRECISION(tmp,nnzu,ddiff(j)) call v_add_s_PRECISION(tmp,nnzu,ddiff(j))
ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
enddo enddo
...@@ -689,8 +689,8 @@ ...@@ -689,8 +689,8 @@
! 1.d0,qtmp2(1,1),ubound(qtmp2,1)) ! 1.d0,qtmp2(1,1),ubound(qtmp2,1))
! else ! else
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
call M_PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, M_CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
M_CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1)) CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1))
! endif ! useGPU ! endif ! useGPU
! Compute eigenvectors of the rank-1 modified matrix. ! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q: ! Parts for multiplying with lower half of Q:
...@@ -700,7 +700,7 @@ ...@@ -700,7 +700,7 @@
! Calculate the j-th eigenvector of the deflated system ! Calculate the j-th eigenvector of the deflated system
! See above why we are doing it this way! ! See above why we are doing it this way!
tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
call M_v_add_s_PRECISION(tmp,nnzl,ddiff(j)) call v_add_s_PRECISION(tmp,nnzl,ddiff(j))
ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
enddo enddo
...@@ -712,8 +712,8 @@ ...@@ -712,8 +712,8 @@
! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1)) ! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1))
! else