...
 
......@@ -454,6 +454,12 @@ if WANT_SINGLE_PRECISION_COMPLEX
endif
endif
if STORE_BUILD_CONFIG
libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c
endif
# CUDA files
.cu.lo:
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@
......@@ -669,7 +675,6 @@ wrapper_la_CFLAGS = $(PYTHON_INCLUDE) $(NUMPY_INCLUDE) $(AM_CFLAGS)
python/pyelpa/wrapper.c: python/pyelpa/wrapper.pyx
cython $< -o $@
# test scripts
TASKS ?= 2
if WITH_MPI
......@@ -793,8 +798,6 @@ EXTRA_DIST = \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
......
......@@ -268,7 +268,7 @@ print(" - export BLOCK_SIZE=16")
print(" - if [ \"$MEDIUM_MATRIX\" = \"yes\" ]; then export MATRIX_SIZE=1500 && export NUMBER_OF_EIGENVECTORS=750; fi")
print(" - if [ \"$LARGE_MATRIX\" = \"yes\" ]; then export MATRIX_SIZE=5000 && export NUMBER_OF_EIGENVECTORS=500; fi")
print(" - if [ \"$GPU_BLOCKSIZE\" = \"yes\" ]; then export BLOCK_SIZE=128 ; fi")
print(" - if [ -z \"$PIPELINE_MPI_TASKS\" ]; then export MPI_TASKS=2; else xport MPI_TASKS=$PIPELINE_MPI_TASKS; fi")
print(" - if [ -z \"$PIPELINE_MPI_TASKS\" ]; then export MPI_TASKS=2; else export MPI_TASKS=$PIPELINE_MPI_TASKS; fi")
print(" - echo \"This test will run with matrix size na = $MATRIX_SIZE, nev= $NUMBER_OF_EIGENVECTORS, on a blacs grid with blocksize nblk= $BLOCK_SIZE \" ")
print(" - export SKIP_STEP=0")
......@@ -702,9 +702,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
#if (MasterOnly):
# print(" only:")
# print(" - /.*master.*/")
if (MasterOnly):
print(" only:")
print(" - /.*master.*/")
if (instr == "power8"):
print(" allow_failure: true")
print(" tags:")
......
......@@ -164,13 +164,28 @@ if test x"${enable_openmp}" = x"yes"; then
CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
AX_CHECK_COMPILE_FLAG([-std=c11], [
CFLAGS+=" -std=c11"
c11_standard=no
AX_CHECK_COMPILE_FLAG([-std=gnu11], [
c11_standard=yes
], [
echo "C compiler cannot compile C11 code"
exit -1
echo "C compiler cannot compile -std=gnu11 code"
echo "testing -std=c11.."
])
if test x"$c11_standard" =x"yes"; then
CFLAGS+=" -std=gnu11"
fi
if test x"$c11_standard" =x"no"; then
AX_CHECK_COMPILE_FLAG([-std=c11], [
c11_standard=yes
], [
echo "C compiler cannot compile C11 code"
exit -1
])
if test x"$c11_standard" =x"yes"; then
CFLAGS+=" -std=c11"
fi
fi
AC_MSG_CHECKING(whether C compiler can use _Generic)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
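The body of this compile test is elided in the hunk above. A minimal C11 probe of the kind configure would compile for the _Generic check might look like this (a sketch under that assumption, not the committed test source):

```c
#include <stdio.h>

/* _Generic selects a branch from the static type of its controlling
   expression, so this only compiles with a C11-capable compiler. */
#define TYPE_NAME(x) _Generic((x), int: "int", double: "double", default: "other")

int main(void) {
  double x = 0.0;
  printf("%s\n", TYPE_NAME(x)); /* prints "double" */
  return 0;
}
```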
......@@ -1323,6 +1338,29 @@ if test x"${enable_kcomputer}" = x"yes"; then
fi
fi
AC_MSG_CHECKING(whether we build for NEC SX-Aurora)
AC_ARG_ENABLE([SX-Aurora],
AS_HELP_STRING([--enable-SX-Aurora],
[enable builds on SX-Aurora, default no.]),
[if test x"$enableval" = x"yes"; then
enable_sxaurora=yes
else
enable_sxaurora=no
fi],
[enable_sxaurora=no])
AC_MSG_RESULT([${enable_sxaurora}])
AM_CONDITIONAL([BUILD_SXAURORA],[test x"$enable_sxaurora" = x"yes"])
if test x"${enable_sxaurora}" = x"yes"; then
AC_DEFINE([BUILD_SXAURORA], [1], [build for SX-Aurora])
FC_MODINC="-I"
#if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then
# AC_MSG_ERROR(on K-computer you have to switch off assumed-size arrays!)
#fi
if test x"${enable_fortran2008_features}" = x"yes" ; then
AC_MSG_ERROR(on SX-Aurora you have to switch off Fortran 2008 features!)
fi
fi
if test x"${want_single_precision}" = x"yes" ; then
AC_DEFINE([WANT_SINGLE_PRECISION_REAL],[1],[build also single-precision for real calculation])
AC_DEFINE([WANT_SINGLE_PRECISION_COMPLEX],[1],[build also single-precision for complex calculation])
......@@ -1376,6 +1414,28 @@ else
fi
fi
dnl store-build-config
AC_MSG_CHECKING(whether build config should be compiled into the library)
AC_CHECK_PROG(xxd_CHECK,xxd,yes)
AS_IF([test x"$xxd_CHECK" != x"yes"], [AC_MSG_ERROR([Please install xxd before configuring.])])
AC_ARG_ENABLE([store-build-config],
AS_HELP_STRING([--enable-store-build-config],
[compile build config into the library object, default no]),
[
if test x"$enableval" = x"yes"; then
store_build_config=yes
else
store_build_config=no
fi
],
[store_build_config=no])
AC_MSG_RESULT([${store_build_config}])
AM_CONDITIONAL([STORE_BUILD_CONFIG],[test x"$store_build_config" = x"yes"])
if test x"${store_build_config}" = x"yes"; then
AC_DEFINE([STORE_BUILD_CONFIG], [1], [compile build config into the library object])
fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
......@@ -1385,12 +1445,13 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in
elpa/elpa_constants.h
elpa/elpa_version.h
elpa/elpa_build_config.h
])
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
need_manual_cpp=no
need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
......@@ -1501,8 +1562,6 @@ if test x"${enable_python_tests}" = x"yes"; then
AC_MSG_ERROR([pytest not found.])
fi
fi
AC_OUTPUT
echo ""
......@@ -1552,6 +1611,11 @@ else
else
echo "#undef OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h
fi
if test x"$store_build_config" = x"yes"; then
cat config.log > elpa_build_object
xxd -i elpa_build_object >> elpa/elpa_build_config.h
fi
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
// The stored build config
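Only the first comment line of the new src/helpers/print_build_config.c is shown above. Together with the xxd -i step in configure, the mechanism is: config.log is copied to a file named elpa_build_object, and xxd -i turns it into a C byte array in elpa/elpa_build_config.h. Here is a self-contained sketch of how such a helper could consume that array; the symbol names elpa_build_object/elpa_build_object_len follow xxd's naming convention, while the helper function name is a guess:

```c
#include <stdio.h>

/* xxd -i elpa_build_object emits definitions of exactly this shape;
   the sample bytes here just spell "config\n". */
unsigned char elpa_build_object[] = {0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x0a};
unsigned int elpa_build_object_len = 7;

/* Hypothetical name; the real symbol in print_build_config.c is not shown. */
void elpa_print_build_config(void) {
  fwrite(elpa_build_object, 1, elpa_build_object_len, stdout);
}

int main(void) {
  elpa_print_build_config();
  return 0;
}
```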
......@@ -53,7 +53,7 @@ test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shar
$(call extract_interface,!c>)
generated_headers += src/elpa_generated_fortran_interfaces.h
src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src
src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/helpers/*.c $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src
@rm -f $@
$(call extract_interface,!f>)
$(call extract_interface,#!f>)
......
......@@ -326,13 +326,17 @@ module elpa
integer :: error2
obj => elpa_impl_allocate(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "ELPA not usable with this error"
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot allocate the ELPA object!"
......@@ -341,6 +345,8 @@ module elpa
stop
endif
endif
#endif
end function
......@@ -359,7 +365,9 @@ module elpa
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
......@@ -368,6 +376,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA object!"
......@@ -377,15 +386,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
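The #ifdef USE_FORTRAN2008 guards above exist because present(error) is only legal when error is declared optional, which these interfaces presumably do only when Fortran 2008 features are enabled; otherwise error is a mandatory argument and the guard is compiled out. The same calling convention expressed in C terms is a nullable out-parameter (a sketch of the pattern, not ELPA code):

```c
#include <stdio.h>

#define ELPA_OK 0 /* stand-in value for this sketch */

/* `error` may be NULL, mirroring an absent optional argument. */
static void destroy_something(int *error) {
  int error2 = ELPA_OK; /* internal status, like error2 above */
  /* ... do the actual work, possibly setting error2 ... */
  if (error != NULL) {
    *error = error2;    /* caller asked for the status */
  } else if (error2 != ELPA_OK) {
    fprintf(stderr, "Cannot destroy the object!\n");
  }
}

int main(void) {
  int err;
  destroy_something(&err);  /* like present(error) being .true. */
  destroy_something(NULL);  /* like present(error) being .false. */
  return err;
}
```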
......@@ -404,7 +419,9 @@ module elpa
#endif
integer :: error2
call obj%destroy(error2)
#ifdef USE_FORTRAN2008
if (present(error)) then
#endif
error = error2
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
......@@ -413,6 +430,7 @@ module elpa
error = ELPA_ERROR_CRITICAL
return
endif
#ifdef USE_FORTRAN2008
else
if (error2 .ne. ELPA_OK) then
write(*,*) "Cannot destroy the ELPA autotuning object!"
......@@ -422,15 +440,21 @@ module elpa
return
endif
endif
#endif
deallocate(obj, stat=error2)
if (error2 .ne. 0) then
write(*,*) "Cannot deallocate the ELPA autotuning object!"
write(*,*) "This is a critical error!"
write(*,*) "This might lead to a memory leak in your application!"
#ifdef USE_FORTRAN2008
if (present(error)) then
error = ELPA_ERROR_CRITICAL
return
endif
#else
error = ELPA_ERROR_CRITICAL
return
#endif
endif
end subroutine
......
......@@ -684,7 +684,7 @@
endif
if (useGPU) then
successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
endif
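This hunk starts a long series of identical edits: every host address handed to cuda_memcpy is now wrapped as int(loc(...), kind=c_intptr_t). That is what a Fortran binding whose address arguments are declared integer(kind=c_intptr_t) requires. Below is a minimal sketch of the kind of C wrapper such a binding could sit on; the name and success convention are assumptions, not ELPA's actual wrapper:

```c
#include <stdint.h>
#include <string.h>

/* Both addresses arrive as plain integers (intptr_t), matching the
   c_intptr_t casts on the Fortran side; a real wrapper would call
   cudaMemcpy() with `direction` instead of memcpy(). */
int elpa_cuda_memcpy_stub(intptr_t dst, intptr_t src, size_t n, int direction) {
  (void)direction;
  memcpy((void *)dst, (void *)src, n);
  return 1; /* 1 == success, matching the successCUDA checks */
}
```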
......@@ -749,13 +749,13 @@
if(useGPU) then
!TODO: it should be enough to copy l_rows x ncnt
successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
!TODO the previous loop could possibly be done on the device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
......@@ -791,7 +791,7 @@
!TODO either copy only half of the matrix here, and half after the
!second gemm, or copy whole array after the next gemm
! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
! gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
......@@ -813,7 +813,7 @@
if(useGPU) then
!TODO the previous loop could possibly be done on the device and thus
!copy less
successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
endif
......@@ -843,7 +843,7 @@
if(useGPU) then
!TODO either copy only half of the matrix here, and get rid of the
!previous copy or copy whole array here
successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
endif
......
......@@ -104,9 +104,11 @@
MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
......@@ -117,8 +119,10 @@
integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:)
MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
......@@ -233,7 +237,8 @@
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -337,13 +342,13 @@
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -381,7 +386,7 @@
! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
! todo: does it need to be copied whole? Wouldn't a part be sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -390,7 +395,7 @@
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif ! useGPU
......@@ -447,7 +452,8 @@
if (useGPU) then
!q_mat = q_dev
successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
......
......@@ -110,14 +110,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
#endif
real(kind=rk), intent(out) :: d_vec(na), e_vec(na)
real(kind=rk), intent(out) :: d_vec(na)
real(kind=rk), intent(out) :: e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
......@@ -151,9 +151,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
complex(kind=rck) :: aux3(1)
#endif
MATH_DATATYPE(kind=rck), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector
v_col(:), & ! the same Vector, but transposed
! - differently distributed among MPI tasks
u_row(:), &
u_col(:)
! the following two matrices store pairs of vectors v and u calculated in each step
......@@ -342,7 +343,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev", successCUDA)
endif
......@@ -366,9 +368,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! copy l_cols + 1 column of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), &
a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
......@@ -486,11 +490,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
l_cols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
l_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
......@@ -621,10 +627,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
enddo
end if !multiplication as one block / per stripes
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif
......@@ -749,12 +757,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (n_stored_vecs == max_stored_uv .or. istep == 3) then
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
......@@ -817,7 +825,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
......@@ -853,7 +861,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows)) then
! We use last l_cols value of loop above
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
vrl = aux3(1)
......@@ -889,7 +897,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
......@@ -905,7 +913,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
else !useGPU
......@@ -916,7 +924,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! Store d_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
......
......@@ -114,9 +114,11 @@
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
......@@ -138,10 +140,12 @@
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: vmrCUDA(:), umcCUDA(:)
MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vr(:)
......@@ -359,7 +363,8 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
(lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -537,7 +542,7 @@
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
......@@ -849,7 +854,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
......@@ -930,7 +935,7 @@
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
......@@ -1093,7 +1098,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)),&
int(loc(vmrCUDA(1)),kind=c_intptr_t),&
vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1103,7 +1108,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1165,7 +1170,7 @@
if (useGPU) then
successCUDA = cuda_memcpy( &
loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1175,7 +1180,7 @@
endif
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1290,7 +1295,7 @@
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1298,7 +1303,8 @@
&: error in cudaMemcpy umc_dev 5"
stop 1
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1312,7 +1318,8 @@
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev,int(loc(vav(1,1)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1330,7 +1337,8 @@
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(vav(1,1)),kind=c_intptr_t), &
vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1368,7 +1376,7 @@
(obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(vav_dev, int(loc(vav(1,1)),kind=c_intptr_t), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......@@ -1396,7 +1404,7 @@
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy( &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1416,7 +1424,7 @@
1, istep*nbw, n_cols, nblk, max_threads)
successCUDA = cuda_memcpy(vmr_dev, &
loc(vmrCUDA(1)), &
int(loc(vmrCUDA(1)),kind=c_intptr_t), &
vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1426,7 +1434,7 @@
endif
successCUDA = cuda_memcpy(umc_dev, &
loc(umcCUDA(1)), &
int(loc(umcCUDA(1)),kind=c_intptr_t), &
umc_size*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
......@@ -1687,7 +1695,8 @@
! (band to tridi). Previously, a has been kept on the device and then
! copied in redist_band (called from tridiag_band). However, it seems to
! be easier to do it here.
successCUDA = cuda_memcpy (loc(a_mat), int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
successCUDA = cuda_memcpy (int(loc(a_mat),kind=c_intptr_t), &
int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
......
......@@ -74,14 +74,14 @@ program print_available_elpa2_kernels
integer(kind=c_int) :: i
class(elpa_t), pointer :: e
integer :: option
integer :: option, error
if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
print *, "Unsupported ELPA API Version"
stop 1
endif
e => elpa_allocate()
e => elpa_allocate(error)
print *, "This program will give information on the ELPA2 kernels, "
print *, "which are available with this library and it will give "
......@@ -125,7 +125,7 @@ program print_available_elpa2_kernels
print *
print *
call elpa_deallocate(e)
call elpa_deallocate(e, error)
contains
......@@ -141,7 +141,7 @@ program print_available_elpa2_kernels
if (elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_COMPLEX_GPU" .or. &
elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_REAL_GPU") then
if (e%can_set("use_gpu",1) == ELPA_OK) then
call e%set("use_gpu",1)
call e%set("use_gpu",1, error)
endif
endif
......
......@@ -78,10 +78,10 @@
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
......@@ -734,7 +734,7 @@
! if the second backward step is to be performed, but not on GPU, we have
! to transfer q to the host
if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to host"
stop 1
......@@ -760,7 +760,7 @@
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"elpa2_template, error in copy to device"
stop 1
......
......@@ -110,9 +110,11 @@
#endif
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), q_mat(ldq,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
......@@ -122,7 +124,8 @@
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvm(:,:)
! hvm_dev is first used and set in this routine
! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. This can be adapted
! tmp_dev is first used in this routine
......@@ -268,7 +271,7 @@
! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols)
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop 1
......@@ -281,7 +284,7 @@
! stop 1
! endif
!
! successCUDA = cuda_memcpy(q_dev, loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
! stop 1
......@@ -346,7 +349,8 @@
nb = nb+l_rows
enddo
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), &
max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
......@@ -369,7 +373,8 @@
! copy data from device to host for a later MPI_ALLREDUCE
! copy to host; maybe this can be avoided. This is needed if MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
stop 1
......@@ -398,7 +403,8 @@
#ifdef WITH_MPI
! after the mpi_allreduce we have to copy back to the device
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2),kind=c_intptr_t), &
n_cols*l_cols* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......@@ -414,7 +420,8 @@
! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
! tmat is a 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
! need to upload another slice
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......@@ -434,7 +441,8 @@
! copy to host; maybe this can be avoided
! this is not necessary; hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(hvm),kind=c_intptr_t), &
hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
stop 1
......@@ -779,7 +787,8 @@
! final transfer of q_dev
successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(int(loc(q_mat),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
......
......@@ -143,11 +143,15 @@
MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#else
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
......@@ -1234,7 +1238,7 @@
#endif /* WITH_MPI */
if (useGPU) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
successCUDA = cuda_memcpy(bcast_buffer_dev, int(loc(bcast_buffer(1,1)),kind=c_intptr_t), &
nbw * current_local_n * &
size_of_datatype, &
cudaMemcpyHostToDevice)
......@@ -1345,7 +1349,8 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , &
int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
stripe_width*nbw* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1430,7 +1435,7 @@
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , int(loc(top_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
stripe_width*top_msg_length* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1523,7 +1528,7 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1634,7 +1639,7 @@
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy(int(loc(bottom_border_send_buffer(1,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*bottom_msg_length* size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1730,7 +1735,7 @@
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
successCUDA = cuda_memcpy( aIntern_dev + dev_offset ,int(loc( top_border_recv_buffer(:,1,i)),kind=c_intptr_t), &
stripe_width * top_msg_length * size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
......@@ -1858,7 +1863,7 @@
#endif
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
successCUDA = cuda_memcpy( int(loc(top_border_send_buffer(:,1,i)),kind=c_intptr_t), aIntern_dev + dev_offset, &
stripe_width*nbw * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -2205,7 +2210,7 @@
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
......
......@@ -81,8 +81,11 @@
#define __forceinline __attribute__((always_inline))
#endif
#endif /* VEC_SET == SSE_128 || VEC_SET == AVX_256 || VEC_SET == AVX_512 */
#if VEC_SET == NEON_ARCH64_128
#include <arm_neon.h>
#endif
#include <complex.h>
......@@ -103,6 +106,10 @@
#define SIMD_SET SSE
#endif
#if VEC_SET == NEON_ARCH64_128
#define SIMD_SET NEON_ARCH64
#endif
#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2
#endif
......@@ -155,6 +162,49 @@
#endif /* VEC_SET == SSE_128 */
#if VEC_SET == NEON_128
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SIMD_DATATYPE __Float64x2_t
#define _SIMD_LOAD vld1q_f64
#define _SIMD_LOADU _mm_loadu_pd
#define _SIMD_STORE vst1q_f64
#define _SIMD_STOREU _mm_storeu_pd
#define _SIMD_MUL vmulq_f64
#define _SIMD_ADD vaddq_f64
#define _SIMD_XOR _mm_xor_pd
#define _SIMD_ADDSUB _mm_addsub_pd
#define _SIMD_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_pd
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SIMD_DATATYPE __m128
#define _SIMD_LOAD _mm_load_ps
#define _SIMD_LOADU _mm_loadu_ps
#define _SIMD_STORE _mm_store_ps
#define _SIMD_STOREU _mm_storeu_ps
#define _SIMD_MUL _mm_mul_ps
#define _SIMD_ADD _mm_add_ps
#define _SIMD_XOR _mm_xor_ps
#define _SIMD_ADDSUB _mm_addsub_ps
#define _SIMD_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#ifdef __ELPA_USE_FMA__
#define _SIMD_FMSUBADD _mm_maddsub_ps
#endif
#endif /* SINGLE_PRECISION_COMPLEX */
#endif /* VEC_SET == NEON_128 */
#if VEC_SET == AVX_256
#ifdef DOUBLE_PRECISION_COMPLEX
......
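In the new NEON_128 double-precision block above, several _SIMD_* macros still expand to SSE intrinsics (_mm_loadu_pd, _mm_xor_pd, _mm_addsub_pd, ...). For illustration only, plausible AArch64 NEON equivalents for two of them could look like this (assumptions, not the committed code):

```c
#include <arm_neon.h>

/* AArch64 NEON loads/stores tolerate unaligned addresses, so the
   unaligned variants can reuse the aligned intrinsics. */
#define _SIMD_LOADU  vld1q_f64
#define _SIMD_STOREU vst1q_f64

/* There is no float64x2 XOR intrinsic; reinterpret through uint64x2_t. */
static inline float64x2_t neon_xor_f64(float64x2_t a, float64x2_t b) {
  return vreinterpretq_f64_u64(
      veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
}
```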
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
......
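This and the following kernel files replace numeric VEC_SET literals (256, 512, 1281) with the symbolic names AVX_256, AVX_512, and SPARC64_SSE that the #if VEC_SET == ... chains in the template compare against. Assuming precision_macros.h defines them as integers along the lines of the old literals (the values below are guesses taken from those literals), the dispatch reads like this:

```c
/* Assumed layout of the dispatch constants; the real definitions live in
   src/general/precision_macros.h and may differ. */
#define SSE_128      128
#define AVX_256      256
#define AVX_512      512
#define SPARC64_SSE 1281

#define VEC_SET AVX_256

#if VEC_SET == AVX_256
#define SIMD_SET AVX_AVX2 /* mirrors the #if chains in the template */
#endif
```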
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK4 1
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK4
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK4
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK4
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef REALCASE
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#define VEC_SET 256
#define VEC_SET AVX_256
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef REALCASE
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 512
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 512
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK4 1
#define VEC_SET 512
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK4
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK4 1
#define VEC_SET 512
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef REALCASE
......
......@@ -48,7 +48,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define VEC_SET 512
#define VEC_SET AVX_512
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#define VEC_SET 512
#define VEC_SET AVX_512
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK6
......
......@@ -72,7 +72,7 @@
! Safety only:
if(mod(ldq,4) /= 0) STOP 'double_hh_trafo: ldq not divisible by 4!'
if(mod(loc(q),16) /= 0) STOP 'Q unaligned!'
if(mod(c_loc(q),16) /= 0) STOP 'Q unaligned!'
! Calculate dot product of the two Householder vectors
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 1281
#define VEC_SET SPARC64_SSE
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef REALCASE
......
......@@ -49,7 +49,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 1281
#define VEC_SET SPARC64_SSE
#include "../../general/precision_macros.h"
#include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
......