...
 
Commits (15)
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -749,21 +749,21 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
if ( instr == "avx2" or instr == "avx512" or instr == "knl" or g == "with-gpu"):
print(" - export REQUESTED_MEMORY="+memory)
print("\n")
if (g == "with-gpu"):
print(" - echo \"The tasks will be submitted to SLURM PARTITION \" $SLURMPARTITION \" on host \" $SLURMHOST \" with constraints \" $CONTSTRAINTS \" with the geometry \" $GEOMETRYRESERVATION" )
else:
print(" - echo \"The tasks will be submitted to SLURM PARTITION \" $SLURMPARTITION \" on host \" $SLURMHOST \"with constraints \" $CONTSTRAINTS ")
# construct srun command-line
if (g == "with-gpu"):
print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
else:
print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\" ")
print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
#print(" - echo \"srun --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\" ")
#if (g == "with-gpu"):
# print(" - echo \"The tasks will be submitted to SLURM PARTITION \" $SLURMPARTITION \" on host \" $SLURMHOST \" with constraints \" $CONTSTRAINTS \" with the geometry \" $GEOMETRYRESERVATION" )
#else:
# print(" - echo \"The tasks will be submitted to SLURM PARTITION \" $SLURMPARTITION \" on host \" $SLURMHOST \"with constraints \" $CONTSTRAINTS ")
## construct srun command-line
#if (g == "with-gpu"):
# print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
# print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
# print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --gres=$GEOMETRYRESERVATION \" ")
#else:
# print(" - export SRUN_COMMANDLINE_CONFIGURE=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$CONFIGURETIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY\" ")
# print(" - export SRUN_COMMANDLINE_BUILD=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$BUILDTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
# print(" - export SRUN_COMMANDLINE_RUN=\"--partition=$SLURMPARTITION --nodelist=$SLURMHOST --time=$RUNTIME --constraint=$CONTSTRAINTS --mem=$REQUESTED_MEMORY \" ")
##print(" - echo \"srun --ntasks=1 --cpus-per-task=1 $SRUN_COMMANDLINE_CONFIGURE\" ")
if (runScalapackTest):
print(" - ./ci_test_scripts/run_ci_tests.sh -c \" CC=\\\""+c_compiler_wrapper+"\\\"" + " CFLAGS=\\\""+CFLAGS+"\\\"" + " FC=\\\""+fortran_compiler_wrapper+"\\\"" + " FCFLAGS=\\\""+FCFLAGS+"\\\"" \
......
......@@ -164,13 +164,28 @@ if test x"${enable_openmp}" = x"yes"; then
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AX_CHECK_COMPILE_FLAG([-std=c11], [
CFLAGS+=" -std=c11"
c11_standard=no
AX_CHECK_COMPILE_FLAG([-std=gnu11], [
c11_standard=yes
], [
echo "C compiler cannot compile C11 code"
exit -1
echo "C compiler cannot compile -std=gnu11 code"
echo "testing -std=c11.."
])
if test x"$c11_standard" =x"yes"; then
CFLAGS+=" -std=gnu11"
fi
if test x"$c11_standard" =x"no"; then
AX_CHECK_COMPILE_FLAG([-std=c11], [
c11_standard=yes
], [
echo "C compiler cannot compile C11 code"
exit -1
])
if test x"$c11_standard" =x"yes"; then
CFLAGS+=" -std=c11"
fi
fi
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......@@ -1323,6 +1338,29 @@ if test x"${enable_kcomputer}" = x"yes"; then
fi
fi
AC_MSG_CHECKING(whether we build for NEC SX-Auroa)
AC_ARG_ENABLE([SX-Aurora],
AS_HELP_STRING([--enable-SX-Aurora],
[enable builds on SX-Aurora, default no.]),
[if test x"$enableval"=x"yes"; then
enable_sxaurora=yes
else
enable_sxaurora=no
fi],
[enable_kcomputer=no])
AC_MSG_RESULT([${enable_sxaurora}])
AM_CONDITIONAL([BUILD_KCOMPUTER],[test x"$enable_sxaurora" = x"yes"])
if test x"${enable_sxaurora}" = x"yes"; then
AC_DEFINE([BUILD_SXAURORA], [1], [build for SX-Aurora])
FC_MODINC="-I"
#if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then
# AC_MSG_ERROR(on K-computer you have to switch off assumed-size arrays!)
#fi
if test x"${enable_fortran2008_features}" = x"yes" ; then
AC_MSG_ERROR(on SX-Aurora you have to switch off Fortran 2008 features!)
fi
fi
if test x"${want_single_precision}" = x"yes" ; then
AC_DEFINE([WANT_SINGLE_PRECISION_REAL],[1],[build also single-precision for real calculation])
AC_DEFINE([WANT_SINGLE_PRECISION_COMPLEX],[1],[build also single-precision for complex calculation])
......@@ -1413,7 +1451,7 @@ AC_CONFIG_FILES([
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=no
need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
......
......@@ -326,13 +326,17 @@ module elpa
integer :: error2
obj => elpa_impl_allocate(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "ELPA not usable with this error"
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
......@@ -341,6 +345,8 @@ module elpa
stop
endif
endif
#endif
end function
......@@ -359,7 +365,9 @@ module elpa
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
......@@ -368,6 +376,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
......@@ -377,15 +386,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
......@@ -404,7 +419,9 @@ module elpa
#endif
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
......@@ -413,6 +430,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
......@@ -422,15 +440,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA autotuning object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
......
......@@ -684,7 +684,7 @@
endif
if (useGPU) then
successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
endif
......@@ -749,13 +749,13 @@
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
......@@ -791,7 +791,7 @@
!TODO either copy only half of the matrix here, and half after the
!second gemm, or copy whole array after the next gemm
! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
......@@ -813,7 +813,7 @@
if(useGPU) then
!TODO the previous loop could be possible to do on device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
......@@ -843,7 +843,7 @@
if(useGPU) then
!TODO either copy only half of the matrix here, and get rid of the
!previous copy or copy whole array here
successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
......
......@@ -104,9 +104,11 @@
MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
......@@ -117,8 +119,10 @@
integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
......@@ -233,7 +237,8 @@
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -337,13 +342,13 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -381,7 +386,7 @@
! In the legacy GPU version, this allreduce was ommited. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -390,7 +395,7 @@
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -447,7 +452,8 @@
if (useGPU) then
!q_mat = q_dev
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
......
......@@ -110,14 +110,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
#endif
real(kind=rk), intent(out) :: d_vec(na), e_vec(na)
real(kind=rk), intent(out) :: d_vec(na)
real(kind=rk), intent(out) :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
......@@ -151,9 +151,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
complex(kind=rck) :: aux3(1)
#endif
MATH_DATATYPE(kind=rck), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed
! - differently distributed among MPI tasks
u_row(:), &
u_col(:)
! the following two matrices store pairs of vectors v and u calculated in each step
......@@ -342,7 +343,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successCUDA)
endif
......@@ -366,9 +368,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! copy l_cols + 1 column of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
......@@ -486,11 +490,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
l_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGU
......@@ -621,10 +627,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
enddo
end if !multiplication as one block / per stripes
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif
......@@ -749,12 +757,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (n_stored_vecs == max_stored_uv .or. istep == 3) then
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
......@@ -817,7 +825,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
......@@ -853,7 +861,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows)) then
! We use last l_cols value of loop above
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
vrl = aux3(1)
......@@ -889,7 +897,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
......@@ -905,7 +913,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
else !useGPU
......@@ -916,7 +924,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! Store d_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
......
......@@ -114,9 +114,11 @@
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
......@@ -138,10 +140,12 @@
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
......@@ -359,7 +363,8 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
(lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -537,7 +542,7 @@
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
......@@ -849,7 +854,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
......@@ -930,7 +935,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
......@@ -1093,7 +1098,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)),&
int(loc(vmrCUDA(1)),kind=c_intptr_t),&
vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1103,7 +1108,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1165,7 +1170,7 @@
if (useGPU) then
successCUDA = cuda_memcpy( &
loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1175,7 +1180,7 @@
endif
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1290,7 +1295,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1298,7 +1303,8 @@
&: error in cudaMemcpy umc_dev 5"
stop 1
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1312,7 +1318,8 @@
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev,int(loc(vav(1,1)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1330,7 +1337,8 @@
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(vav(1,1)),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1368,7 +1376,7 @@
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev, int(loc(vav(1,1)),kind=c_intptr_t), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1396,7 +1404,7 @@
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1416,7 +1424,7 @@
1, istep*nbw, n_cols, nblk, max_threads)
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1426,7 +1434,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1687,7 +1695,8 @@
! (band to tridi). Previously, a has been kept on the device and then
! copied in redist_band (called from tridiag_band). However, it seems to
! be easier to do it here.
successCUDA = cuda_memcpy (loc(a_mat), int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
successCUDA = cuda_memcpy (int(loc(a_mat),kind=c_intptr_t), &
int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......
......@@ -74,14 +74,14 @@ program print_available_elpa2_kernels
integer(kind=c_int) :: i
class(elpa_t), pointer :: e
integer :: option
integer :: option, error
if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
print *, "Unsupported ELPA API Version"
stop 1
endif
e => elpa_allocate()
e => elpa_allocate(error)
print *, "This program will give information on the ELPA2 kernels, "
print *, "which are available with this library and it will give "
......@@ -125,7 +125,7 @@ program print_available_elpa2_kernels
print *
print *
call elpa_deallocate(e)
call elpa_deallocate(e, error)
contains
......@@ -141,7 +141,7 @@ program print_available_elpa2_kernels
if (elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_COMPLEX_GPU" .or. &
elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_REAL_GPU") then
if (e%can_set("use_gpu",1) == ELPA_OK) then
call e%set("use_gpu",1)
call e%set("use_gpu",1, error)
endif
endif
......
......@@ -78,10 +78,10 @@
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
......@@ -734,7 +734,7 @@
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
......@@ -760,7 +760,7 @@
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
......
......@@ -110,9 +110,11 @@
#endif
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), q_mat(ldq,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
......@@ -122,7 +124,8 @@
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvm(:,:)
! hvm_dev is fist used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
! tmp_dev is first used in this routine
......@@ -268,7 +271,7 @@
! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols)
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop 1
......@@ -281,7 +284,7 @@
! stop 1
! endif
!
! successCUDA = cuda_memcpy(q_dev, loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
! stop 1
......@@ -346,7 +349,8 @@
nb = nb+l_rows
enddo
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), &
max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
......@@ -369,7 +373,8 @@
! copy data from device to host for a later MPI_ALLREDUCE
! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
stop 1
......@@ -398,7 +403,8 @@
#ifdef WITH_MPI
! after the mpi_allreduce we have to copy back to the device
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2),kind=c_intptr_t), &
n_cols*l_cols* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......@@ -414,7 +420,8 @@
! IMPORTANT: even though tmat_dev is transfered from the previous rutine, we have to copy from tmat again
! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
! need to upload another slice
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......@@ -434,7 +441,8 @@
! copy to host maybe this can be avoided
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(hvm),kind=c_intptr_t), &
hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
stop 1
......@@ -779,7 +787,8 @@
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q_mat),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......
......@@ -143,11 +143,15 @@
MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#else
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
......@@ -1234,7 +1238,7 @@
#endif /* WITH_MPI */
if (useGPU) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
successCUDA = cuda_memcpy(bcast_buffer_dev, int(loc(bcast_buffer(1,1)),kind=c_intptr_t), &
nbw * current_local_n * &
size_of_datatype, &
cudaMemcpyHostToDevice)
......@@ -1345,7 +1349,8 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , &
int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
stripe_width*nbw* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1430,7 +1435,7 @@
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , int(loc(top_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
stripe_width*top_msg_length* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1523,7 +1528,7 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1634,7 +1639,7 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*bottom_msg_length* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1730,7 +1735,7 @@
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev + dev_offset ,int(loc( top_border_recv_buffer(:,1,i)),kind=c_intptr_t), &
stripe_width * top_msg_length * size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1858,7 +1863,7 @@
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(top_border_send_buffer(:,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*nbw * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -2205,7 +2210,7 @@
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
......
......@@ -81,8 +81,11 @@
#define __forceinline __attribute__((always_inline))
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 */
#if VEC_SET == NEON_ARCH64_128
#include <arm_neon.h>
#endif
#include <complex.h>
......@@ -103,6 +106,10 @@
#define SIMD_SET SSE
#endif
#if VEC_SET == NEON_ARCH64_128
#define SIMD_SET NEON_ARCH64
#endif
#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2
#endif
......@@ -155,6 +162,49 @@
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == NEON_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SIMD_DATATYPE __Float64x2_t
#define _SIMD_LOAD vld1q_f64
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE vst1q_f64
#define _SIMD_STOREU _mm_storeu_pd
#define _SIMD_MUL vmulq_f64
#define _SIMD_ADD vaddq_f64
#define _SIMD_XOR _mm_xor_pd
#define _SIMD_ADDSUB _mm_addsub_pd
#define _SIMD_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m128
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_STOREU _mm_storeu_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
#define _SIMD_ADDSUB _mm_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */
#endif /* VEC_SET == NEON_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
......
......@@ -72,7 +72,7 @@
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
if(mod(loc(q),16) /= 0) STOP 'Q unaligned!'
if(mod(c_loc(q),16) /= 0) STOP 'Q unaligned!'
! Calculate dot product of the two Householder vectors
......
......@@ -63,7 +63,7 @@
real(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND):: rows(:,:)
complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
integer(kind=ik) :: max_idx
logical :: successCUDA
......@@ -84,7 +84,7 @@
! Issue one single transfer call for all rows (device to host)
! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev , row_count * l_nev * size_of_&
successCUDA = cuda_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
......@@ -117,10 +117,10 @@
integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
integer(kind=ik), intent(in) :: n_offset, row_count
#if REALCASE == 1
real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif
integer(kind=ik) :: max_idx
......@@ -133,7 +133,7 @@
! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * &
successCUDA = cuda_memcpy( row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
size_of_&
&PRECISION&
&_&
......
......@@ -866,11 +866,11 @@ module elpa_api
#else
integer, intent(out) :: error
#endif
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_OK
return
endif
if (present(error)) error = ELPA_OK
#else
error = ELPA_OK
#endif
end subroutine
!> \brief helper function for error strings
......
......@@ -31,7 +31,6 @@ module elpa_autotune_impl
#else
integer, intent(out) :: error
#endif
!print *, "Print me"
end subroutine
!> \brief function to destroy an elpa autotune object
......@@ -48,7 +47,11 @@ module elpa_autotune_impl
#endif
! nothing to do atm
#ifdef USE_FORTRAN2008
if (present(error)) error = ELPA_OK
#else
error = ELPA_OK
#endif
end subroutine
#endif
end module
This diff is collapsed.
......@@ -50,15 +50,15 @@
call self%get("solver", solver,error2)
if (error2 .ne. ELPA_OK) then
print *,"Problem setting option. Aborting..."
stop
endif
#ifdef USE_FORTRAN2008
if (present(error)) then
error = error2
endif
if (present(error)) then
error = error2
endif
#else
error = error2
error = error2
#endif
return
endif
if (solver .eq. ELPA_SOLVER_1STAGE) then
call self%autotune_timer%start("accumulator")
#if defined(INCLUDE_ROUTINES)
......@@ -205,15 +205,16 @@
call self%get("solver", solver,error2)
if (error2 .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
#ifdef USE_FORTRAN2008
if (present(error)) then
error = error2
endif
if (present(error)) then
error = error2
endif
#else
error = error2
error = error2
#endif
return
endif
if (solver .eq. ELPA_SOLVER_1STAGE) then
call self%autotune_timer%start("accumulator")
#if defined(INCLUDE_ROUTINES)
......@@ -381,7 +382,7 @@
write(error_unit,'(a)') "ELPA: Error in transform_generalized() and you did not check for errors!"
endif
call self%get("solver", solver)
call self%get("solver", solver,error_l)
if (solver .eq. ELPA_SOLVER_1STAGE) then
#if defined(INCLUDE_ROUTINES)
success_l = elpa_solve_evp_&
......@@ -556,7 +557,7 @@
write(error_unit,'(a)') "ELPA: Error in transform_generalized() and you did not check for errors!"
endif
call self%get("solver", solver)
call self%get("solver", solver,error_l)
if (solver .eq. ELPA_SOLVER_1STAGE) then
#if defined(INCLUDE_ROUTINES)
success_l = elpa_solve_evp_&
......
......@@ -59,8 +59,10 @@ module matrix_plot
! print a_dev
if(useGpu) then
successCUDA = cuda_memcpy(loc(a_dev_helper(1,1)), a_dev, lda * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
#ifdef HAVE_GPU_VERSION
successCUDA = cuda_memcpy(int(loc(a_dev_helper(1,1)),kind=c_intptr_t), &
a_dev, lda * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
#endif
write(filename, "(A,A,I0.4,A,I0.2,A)") trim(directory), "/a_dev-", counter, "-", mpi_rank, ".txt"
write(*,*) trim(filename)
open(unit=out_unit, file=trim(filename), action="write",status="replace")
......
......@@ -201,7 +201,7 @@ program test_interface
call elpa_deallocate(e2, success)
assert_elpa_ok(success)
call elpa_uninit()
call elpa_uninit(success)
status = check_correctness_evp_numeric_residuals(na, nev, as1, z1, ev1, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
......
......@@ -180,7 +180,7 @@ program test
call prepare_matrix_analytic(na, a, nblk, myid, np_rows, np_cols, my_prow, my_pcol, print_times=.false.)
as(:,:) = a(:,:)
e1 => elpa_allocate()
e1 => elpa_allocate(error)
!assert_elpa_ok(error)
call set_basic_params(e1, na, nev, na_rows, na_cols, my_prow, my_pcol)
......@@ -313,7 +313,7 @@ program test
status = check_correctness_analytic(na, nev, ev, z, nblk, myid, np_rows, np_cols, my_prow, my_pcol, &
.true., .true., print_times=.false.)
call elpa_deallocate(e_ptr)
call elpa_deallocate(e_ptr, error)
!assert_elpa_ok(error)
deallocate(a)
......@@ -321,7 +321,7 @@ program test
deallocate(z)
deallocate(ev)
call elpa_uninit()
call elpa_uninit(error)
!assert_elpa_ok(error)
#ifdef WITH_MPI
......
......@@ -231,13 +231,13 @@ program test
call prepare_matrix_random(na, myid_sub, sc_desc, a, z, as)
as(:,:) = a(:,:)
e => elpa_allocate()
e => elpa_allocate(error)
call set_basic_params(e, na, nev, na_rows, na_cols, mpi_sub_comm, my_prow, my_pcol)
call e%set("timings",1, error)
call e%set("debug",1)
call e%set("gpu", 0)
call e%set("debug",1, error)
call e%set("gpu", 0, error)
!call e%set("max_stored_rows", 15, error)
assert_elpa_ok(e%setup())
......@@ -267,14 +267,14 @@ program test
call e%print_times("eigenvectors")
endif
call elpa_deallocate(e)
call elpa_deallocate(e, error)
deallocate(a)
deallocate(as)
deallocate(z)
deallocate(ev)
call elpa_uninit()
call elpa_uninit(error)
call blacs_gridexit(my_blacs_ctxt)
call mpi_finalize(mpierr)
......
......@@ -54,7 +54,11 @@ module test_blacs_infrastructure
implicit none
integer(kind=c_int), intent(in), value :: mpi_comm_parent, np_rows, np_cols
#ifdef SXAURORA
character(len=1), intent(in) :: layout
#else
character(len=1), intent(in), value :: layout
#endif
integer(kind=c_int), intent(out) :: my_blacs_ctxt, my_prow, my_pcol
#ifdef WITH_MPI
......