...
 
@@ -454,6 +454,12 @@ if WANT_SINGLE_PRECISION_COMPLEX
  endif
endif
+if STORE_BUILD_CONFIG
+  libelpa@SUFFIX@_private_la_SOURCES += src/helpers/print_build_config.c
+endif
# Cuda files
.cu.lo:
        NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@
@@ -669,7 +675,6 @@ wrapper_la_CFLAGS = $(PYTHON_INCLUDE) $(NUMPY_INCLUDE) $(AM_CFLAGS)
python/pyelpa/wrapper.c: python/pyelpa/wrapper.pyx
        cython $< -o $@
# test scripts
TASKS ?= 2
if WITH_MPI
@@ -793,8 +798,6 @@ EXTRA_DIST = \
  src/elpa2/elpa2_tridiag_band_template.F90 \
  src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
  src/elpa2/kernels/complex_template.F90 \
-  src/elpa2/kernels/real_vsx_4hv_template.c \
-  src/elpa2/kernels/real_vsx_6hv_template.c \
  src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
  src/elpa2/kernels/real_template.F90 \
  src/elpa2/kernels/simple_template.F90 \
...
@@ -268,7 +268,7 @@ print(" - export BLOCK_SIZE=16")
print(" - if [ \"$MEDIUM_MATRIX\" = \"yes\" ]; then export MATRIX_SIZE=1500 && export NUMBER_OF_EIGENVECTORS=750; fi")
print(" - if [ \"$LARGE_MATRIX\" = \"yes\" ]; then export MATRIX_SIZE=5000 && export NUMBER_OF_EIGENVECTORS=500; fi")
print(" - if [ \"$GPU_BLOCKSIZE\" = \"yes\" ]; then export BLOCK_SIZE=128 ; fi")
-print(" - if [ -z \"$PIPELINE_MPI_TASKS\" ]; then export MPI_TASKS=2; else xport MPI_TASKS=$PIPELINE_MPI_TASKS; fi")
+print(" - if [ -z \"$PIPELINE_MPI_TASKS\" ]; then export MPI_TASKS=2; else export MPI_TASKS=$PIPELINE_MPI_TASKS; fi")
print(" - echo \"This test will run with matrix size na = $MATRIX_SIZE, nev= $NUMBER_OF_EIGENVECTORS, on a blacs grid with blocksize nblk= $BLOCK_SIZE \" ")
print(" - export SKIP_STEP=0")
@@ -702,9 +702,9 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
    print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
    print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
-    #if (MasterOnly):
-    #    print(" only:")
-    #    print(" - /.*master.*/")
+    if (MasterOnly):
+        print(" only:")
+        print(" - /.*master.*/")
    if (instr == "power8"):
        print(" allow_failure: true")
    print(" tags:")
...
@@ -164,13 +164,28 @@ if test x"${enable_openmp}" = x"yes"; then
  CFLAGS="$OPENMP_CFLAGS $CFLAGS"
fi
-AX_CHECK_COMPILE_FLAG([-std=c11], [
-  CFLAGS+=" -std=c11"
-], [
-  echo "C compiler cannot compile C11 code"
-  exit -1
-])
+c11_standard=no
+AX_CHECK_COMPILE_FLAG([-std=gnu11], [
+  c11_standard=yes
+], [
+  echo "C compiler cannot compile -std=gnu11 code"
+  echo "testing -std=c11.."
+])
+if test x"$c11_standard" = x"yes"; then
+  CFLAGS+=" -std=gnu11"
+fi
+if test x"$c11_standard" = x"no"; then
+  AX_CHECK_COMPILE_FLAG([-std=c11], [
+    c11_standard=yes
+  ], [
+    echo "C compiler cannot compile C11 code"
+    exit -1
+  ])
+  if test x"$c11_standard" = x"yes"; then
+    CFLAGS+=" -std=c11"
+  fi
+fi
AC_MSG_CHECKING(whether C compiler can use _Generic )
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
@@ -1323,6 +1338,29 @@ if test x"${enable_kcomputer}" = x"yes"; then
  fi
fi
+AC_MSG_CHECKING(whether we build for NEC SX-Aurora)
+AC_ARG_ENABLE([SX-Aurora],
+  AS_HELP_STRING([--enable-SX-Aurora],
+                 [enable builds on SX-Aurora, default no.]),
+  [if test x"$enableval" = x"yes"; then
+     enable_sxaurora=yes
+   else
+     enable_sxaurora=no
+   fi],
+  [enable_sxaurora=no])
+AC_MSG_RESULT([${enable_sxaurora}])
+AM_CONDITIONAL([BUILD_SXAURORA],[test x"$enable_sxaurora" = x"yes"])
+if test x"${enable_sxaurora}" = x"yes"; then
+  AC_DEFINE([BUILD_SXAURORA], [1], [build for SX-Aurora])
+  FC_MODINC="-I"
+  #if test x"${USE_ASSUMED_SIZE}" = x"yes" ; then
+  #  AC_MSG_ERROR(on SX-Aurora you have to switch off assumed-size arrays!)
+  #fi
+  if test x"${enable_fortran2008_features}" = x"yes" ; then
+    AC_MSG_ERROR(on SX-Aurora you have to switch off Fortran 2008 features!)
+  fi
+fi
if test x"${want_single_precision}" = x"yes" ; then
  AC_DEFINE([WANT_SINGLE_PRECISION_REAL],[1],[build also single-precision for real calculation])
  AC_DEFINE([WANT_SINGLE_PRECISION_COMPLEX],[1],[build also single-precision for complex calculation])
@@ -1376,6 +1414,28 @@ else
  fi
fi
+dnl store-build-config
+AC_MSG_CHECKING(whether build config should be compiled into the library)
+AC_CHECK_PROG(xxd_CHECK,xxd,yes)
+AS_IF([test x"$xxd_CHECK" != x"yes"], [AC_MSG_ERROR([Please install xxd before configuring.])])
+AC_ARG_ENABLE([store-build-config],
+  AS_HELP_STRING([--enable-store-build-config],
+                 [compile build config into the library object, default no]),
+  [
+    if test x"$enableval" = x"yes"; then
+      store_build_config=yes
+    else
+      store_build_config=no
+    fi
+  ],
+  [store_build_config=no])
+AC_MSG_RESULT([${store_build_config}])
+AM_CONDITIONAL([STORE_BUILD_CONFIG],[test x"$store_build_config" = x"yes"])
+if test x"${store_build_config}" = x"yes"; then
+  AC_DEFINE([STORE_BUILD_CONFIG], [1], [compile build config into the library object])
+fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
@@ -1385,12 +1445,13 @@ AC_CONFIG_FILES([
  ${PKG_CONFIG_FILE}:elpa.pc.in
  elpa/elpa_constants.h
  elpa/elpa_version.h
+  elpa/elpa_build_config.h
])
m4_include([m4/ax_fc_check_define.m4])
AC_MSG_CHECKING([if workaround for broken preprocessor is needed])
-need_manual_cpp=no
+need_manual_cpp=yes
AX_FC_CHECK_DEFINE([__INTEL_COMPILER],[is_intel=yes],[])
AX_FC_CHECK_DEFINE([__PGI],[is_pgi=yes],[])
ACTUAL_FC="$FC"
@@ -1501,8 +1562,6 @@ if test x"${enable_python_tests}" = x"yes"; then
    AC_MSG_ERROR([pytest not found.])
  fi
fi
AC_OUTPUT
echo ""
@@ -1552,6 +1611,11 @@ else
  else
    echo "#undef OPTIONAL_C_ERROR_ARGUMENT" > elpa/elpa_generated_c_api.h
  fi
+  if test x"$store_build_config" = x"yes"; then
+    cat config.log > elpa_build_object
+    xxd -i elpa_build_object >> elpa/elpa_build_config.h
+  fi
  make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
// The stored build config
@@ -53,7 +53,7 @@ test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shar
        $(call extract_interface,!c>)
generated_headers += src/elpa_generated_fortran_interfaces.h
-src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src
+src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/helpers/*.c $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src
        @rm -f $@
        $(call extract_interface,!f>)
        $(call extract_interface,#!f>)
...
@@ -326,13 +326,17 @@ module elpa
      integer :: error2
      obj => elpa_impl_allocate(error2)
+#ifdef USE_FORTRAN2008
      if (present(error)) then
+#endif
        error = error2
        if (error .ne. ELPA_OK) then
          write(*,*) "Cannot allocate the ELPA object!"
          write(*,*) "This is a critical error!"
          write(*,*) "ELPA not usable with this error"
        endif
+#ifdef USE_FORTRAN2008
      else
        if (error2 .ne. ELPA_OK) then
          write(*,*) "Cannot allocate the ELPA object!"
@@ -341,6 +345,8 @@ module elpa
          stop
        endif
      endif
+#endif
    end function
@@ -359,7 +365,9 @@ module elpa
      integer :: error2
      call obj%destroy(error2)
+#ifdef USE_FORTRAN2008
      if (present(error)) then
+#endif
        error = error2
        if (error .ne. ELPA_OK) then
          write(*,*) "Cannot destroy the ELPA object!"
@@ -368,6 +376,7 @@
          error = ELPA_ERROR_CRITICAL
          return
        endif
+#ifdef USE_FORTRAN2008
      else
        if (error2 .ne. ELPA_OK) then
          write(*,*) "Cannot destroy the ELPA object!"
@@ -377,15 +386,21 @@
          return
        endif
      endif
+#endif
      deallocate(obj, stat=error2)
      if (error2 .ne. 0) then
        write(*,*) "Cannot deallocate the ELPA object!"
        write(*,*) "This is a critical error!"
        write(*,*) "This might lead to a memory leak in your application!"
+#ifdef USE_FORTRAN2008
        if (present(error)) then
          error = ELPA_ERROR_CRITICAL
          return
        endif
+#else
+        error = ELPA_ERROR_CRITICAL
+        return
+#endif
      endif
    end subroutine
@@ -404,7 +419,9 @@ module elpa
#endif
      integer :: error2
      call obj%destroy(error2)
+#ifdef USE_FORTRAN2008
      if (present(error)) then
+#endif
        error = error2
        if (error2 .ne. ELPA_OK) then
          write(*,*) "Cannot destroy the ELPA autotuning object!"
@@ -413,6 +430,7 @@
          error = ELPA_ERROR_CRITICAL
          return
        endif
+#ifdef USE_FORTRAN2008
      else
        if (error2 .ne. ELPA_OK) then
          write(*,*) "Cannot destroy the ELPA autotuning object!"
@@ -422,15 +440,21 @@
          return
        endif
      endif
+#endif
      deallocate(obj, stat=error2)
      if (error2 .ne. 0) then
        write(*,*) "Cannot deallocate the ELPA autotuning object!"
        write(*,*) "This is a critical error!"
        write(*,*) "This might lead to a memory leak in your application!"
+#ifdef USE_FORTRAN2008
        if (present(error)) then
          error = ELPA_ERROR_CRITICAL
          return
        endif
+#else
+        error = ELPA_ERROR_CRITICAL
+        return
+#endif
      endif
    end subroutine
...
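Note: the #ifdef USE_FORTRAN2008 guards above exist because only the Fortran 2008 path may leave an absent optional argument untouched; the #else branches assign the status unconditionally because that build always passes an error argument. A minimal, hypothetical sketch of the same pattern (demo names, not the ELPA API):

! Sketch only: illustrates how an optional status argument is handled with and
! without USE_FORTRAN2008, mirroring the structure of the hunks above.
module status_demo
  implicit none
  integer, parameter :: DEMO_OK = 0, DEMO_ERROR_CRITICAL = -1
contains
  subroutine do_work(failed, error)
    logical, intent(in)            :: failed
    integer, intent(out), optional :: error
    integer                        :: error2

    if (failed) then
      error2 = DEMO_ERROR_CRITICAL
    else
      error2 = DEMO_OK
    endif
#ifdef USE_FORTRAN2008
    if (present(error)) then
      error = error2                      ! hand the status back to the caller
    else if (error2 .ne. DEMO_OK) then
      write(*,*) "critical error and no error argument was supplied"
      stop
    endif
#else
    error = error2                        ! pre-2008 path: error is always passed
#endif
  end subroutine
end module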
@@ -684,7 +684,7 @@
      endif
      if (useGPU) then
-       successCUDA = cuda_memcpy(qtmp1_dev, loc(qtmp1(1,1)), &
+       successCUDA = cuda_memcpy(qtmp1_dev, int(loc(qtmp1(1,1)),kind=c_intptr_t), &
                      gemm_dim_k * gemm_dim_l * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("merge_systems: qtmp1_dev", successCUDA)
      endif
@@ -749,13 +749,13 @@
      if(useGPU) then
        !TODO: it should be enough to copy l_rows x ncnt
-       successCUDA = cuda_memcpy(qtmp2_dev, loc(qtmp2(1,1)), &
+       successCUDA = cuda_memcpy(qtmp2_dev, int(loc(qtmp2(1,1)),kind=c_intptr_t), &
                      gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
        !TODO the previous loop could be possible to do on device and thus
        !copy less
-       successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
+       successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                      gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
      endif
@@ -791,7 +791,7 @@
        !TODO either copy only half of the matrix here, and half after the
        !second gemm, or copy whole array after the next gemm
-       ! successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
+       ! successCUDA = cuda_memcpy(c_loc(qtmp2(1,1)), qtmp2_dev, &
        !               gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
        ! check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
      endif
@@ -813,7 +813,7 @@
      if(useGPU) then
        !TODO the previous loop could be possible to do on device and thus
        !copy less
-       successCUDA = cuda_memcpy(ev_dev, loc(ev(1,1)), &
+       successCUDA = cuda_memcpy(ev_dev, int(loc(ev(1,1)),kind=c_intptr_t), &
                      gemm_dim_l * gemm_dim_m * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("merge_systems: ev_dev", successCUDA)
      endif
@@ -843,7 +843,7 @@
      if(useGPU) then
        !TODO either copy only half of the matrix here, and get rid of the
        !previous copy or copy whole array here
-       successCUDA = cuda_memcpy(loc(qtmp2(1,1)), qtmp2_dev, &
+       successCUDA = cuda_memcpy(int(loc(qtmp2(1,1)),kind=c_intptr_t), qtmp2_dev, &
                      gemm_dim_k * gemm_dim_m * size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("merge_systems: qtmp2_dev", successCUDA)
      endif
...
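Note: the change repeated throughout these kernels replaces bare loc(...) actual arguments with int(loc(...), kind=c_intptr_t), so the host address and the device handle have the same integer kind on the Fortran side. A hedged sketch of that calling convention, with an assumed ISO-C-binding interface that is similar to, but not taken from, ELPA's cuda_functions module:

! Sketch only: the cudaMemcpy binding below is an assumption for illustration.
! Both dst and src are plain c_intptr_t integers, so host buffers are passed as
! int(loc(buf), kind=c_intptr_t) and device allocations as their integer handle.
module cuda_copy_sketch
  use iso_c_binding, only: c_intptr_t, c_size_t, c_int, c_double
  implicit none
  integer(kind=c_int), parameter :: cudaMemcpyHostToDevice = 1
  interface
    function cuda_memcpy_c(dst, src, nbytes, direction) result(istat) &
             bind(C, name="cudaMemcpy")
      import :: c_intptr_t, c_size_t, c_int
      integer(kind=c_intptr_t), value :: dst, src
      integer(kind=c_size_t), value   :: nbytes
      integer(kind=c_int), value      :: direction
      integer(kind=c_int)             :: istat
    end function
  end interface
contains
  logical function copy_to_device(dev_ptr, host_buf) result(success)
    integer(kind=c_intptr_t), intent(in)             :: dev_ptr
    real(kind=c_double), intent(in), contiguous      :: host_buf(:)
    integer(kind=c_size_t)                           :: nbytes
    ! 8 bytes per double precision element
    nbytes = int(size(host_buf), kind=c_size_t) * 8_c_size_t
    success = (cuda_memcpy_c(dev_ptr, int(loc(host_buf(1)), kind=c_intptr_t), &
                             nbytes, cudaMemcpyHostToDevice) == 0_c_int)
  end function
end module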
@@ -104,9 +104,11 @@
      MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
#ifdef USE_ASSUMED_SIZE
-     MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
+     MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
+     MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,*)
#else
-     MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
+     MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
+     MATH_DATATYPE(kind=rck), intent(inout) :: q_mat(ldq,matrixCols)
#endif
      logical, intent(in) :: useGPU
      integer(kind=ik) :: max_stored_rows, max_stored_rows_fac
@@ -117,8 +119,10 @@
      integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
      integer(kind=ik) :: hvn_ubnd, hvm_ubnd
-     MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
-     MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
+     MATH_DATATYPE(kind=rck), allocatable :: hvb(:), hvm(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:)
+     MATH_DATATYPE(kind=rck), allocatable :: h1(:), h2(:)
+     MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), hvm1(:)
      integer(kind=ik) :: istat
      character(200) :: errorMessage
@@ -233,7 +237,8 @@
        check_alloc_cuda("trans_ev", successCUDA)
        ! q_dev = q_mat
-       successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
+       successCUDA = cuda_memcpy(q_dev, int(loc(q_mat(1,1)),kind=c_intptr_t), &
+                                 ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif ! useGPU
@@ -337,13 +342,13 @@
        hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
        !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
-       successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
+       successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm1(1)),kind=c_intptr_t), &
                                  hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
        !tmat_dev = tmat
-       successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
+       successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1)),kind=c_intptr_t), &
                                  max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif
@@ -381,7 +386,7 @@
      ! In the legacy GPU version, this allreduce was omitted. But probably it has to be done for GPU + MPI
      ! todo: does it need to be copied whole? Wouldn't be a part sufficient?
      if (useGPU) then
-       successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
+       successCUDA = cuda_memcpy(int(loc(tmp1(1)),kind=c_intptr_t), tmp_dev, &
                                  max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif
@@ -390,7 +395,7 @@
      call obj%timer%stop("mpi_communication")
      ! copy back tmp2 - after reduction...
      if (useGPU) then
-       successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
+       successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2(1)),kind=c_intptr_t), &
                                  max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif ! useGPU
@@ -447,7 +452,8 @@
      if (useGPU) then
        !q_mat = q_dev
-       successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
+       successCUDA = cuda_memcpy(int(loc(q_mat(1,1)),kind=c_intptr_t), &
+                                 q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("trans_ev", successCUDA)
        deallocate(hvm1, stat=istat, errmsg=errorMessage)
...
@@ -110,14 +110,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
      logical, intent(in) :: useGPU, wantDebug
      MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
#else
      MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
#endif
-     real(kind=rk), intent(out) :: d_vec(na), e_vec(na)
+     real(kind=rk), intent(out) :: d_vec(na)
+     real(kind=rk), intent(out) :: e_vec(na)
      integer(kind=ik), parameter :: max_stored_uv = 32
      logical, parameter :: mat_vec_as_one_block = .true.
@@ -151,9 +151,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      complex(kind=rck) :: aux3(1)
#endif
-     MATH_DATATYPE(kind=rck), allocatable :: tmp(:), &
-       v_row(:), & ! used to store calculated Householder Vector
-       v_col(:), & ! the same Vector, but transposed - differently distributed among MPI tasks
+     MATH_DATATYPE(kind=rck), allocatable :: tmp(:)
+     MATH_DATATYPE(kind=rck), allocatable :: v_row(:), & ! used to store calculated Householder Vector
+       v_col(:), & ! the same Vector, but transposed
+                   ! - differently distributed among MPI tasks
       u_row(:), &
       u_col(:)
      ! the following two matrices store pairs of vectors v and u calculated in each step
@@ -342,7 +343,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
        successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
        check_alloc_cuda("tridiag: a_dev", successCUDA)
-       successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
+       successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
+                                 lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: a_dev", successCUDA)
      endif
@@ -366,9 +368,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
      ! copy l_cols + 1 column of A to v_row
      if (useGPU) then
        a_offset = l_cols * lda * size_of_datatype
-       ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
-       successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
+       ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset,
+       ! (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
+       successCUDA = cuda_memcpy(int(loc(v_row(1)),kind=c_intptr_t), &
+                                 a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("tridiag a_dev 1", successCUDA)
      else
        v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
@@ -486,11 +490,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
        successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
        check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
-       successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
+       successCUDA = cuda_memcpy(v_col_dev, int(loc(v_col(1)),kind=c_intptr_t), &
+                                 l_cols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
-       successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
+       successCUDA = cuda_memcpy(v_row_dev, int(loc(v_row(1)),kind=c_intptr_t), &
+                                 l_rows * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
      endif ! useGPU
@@ -621,10 +627,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
        enddo
      end if !multiplication as one block / per stripes
-      successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
+      successCUDA = cuda_memcpy(int(loc(u_col(1)),kind=c_intptr_t), &
+                                u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
       check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
-      successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
+      successCUDA = cuda_memcpy(int(loc(u_row(1)),kind=c_intptr_t), &
+                                u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
       check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
     endif
@@ -749,12 +757,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
     if (n_stored_vecs == max_stored_uv .or. istep == 3) then
       if (useGPU) then
-        successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
+        successCUDA = cuda_memcpy(vu_stored_rows_dev, int(loc(vu_stored_rows(1,1)),kind=c_intptr_t), &
                                   max_local_rows * 2 * max_stored_uv * &
                                   size_of_datatype, cudaMemcpyHostToDevice)
         check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
-        successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
+        successCUDA = cuda_memcpy(uv_stored_cols_dev, int(loc(uv_stored_cols(1,1)),kind=c_intptr_t), &
                                   max_local_cols * 2 * max_stored_uv * &
                                   size_of_datatype, cudaMemcpyHostToDevice)
         check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
@@ -817,7 +825,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
       !a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
       a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
-      successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
+      successCUDA = cuda_memcpy(int(loc(a_mat(l_rows, l_cols)),kind=c_intptr_t), a_dev + a_offset, &
                                 1 * size_of_datatype, cudaMemcpyDeviceToHost)
       check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
@@ -853,7 +861,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
     if (my_prow==prow(1, nblk, np_rows)) then
       ! We use last l_cols value of loop above
       if(useGPU) then
-        successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
+        successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                   1 * size_of_datatype, cudaMemcpyDeviceToHost)
         check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
         vrl = aux3(1)
@@ -889,7 +897,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif /* WITH_MPI */
     if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
       if(useGPU) then
-        successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
+        successCUDA = cuda_memcpy(int(loc(aux3(1)),kind=c_intptr_t), a_dev, &
                                   1 * size_of_datatype, cudaMemcpyDeviceToHost)
         check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
         d_vec(1) = PRECISION_REAL(aux3(1))
@@ -905,7 +913,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
     if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
       if(useGPU) then
-        successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
+        successCUDA = cuda_memcpy(int(loc(e_vec(1)),kind=c_intptr_t), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
                                   1 * size_of_datatype, cudaMemcpyDeviceToHost)
         check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
       else !useGPU
@@ -916,7 +924,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
     ! Store d_vec(1)
     if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
       if(useGPU) then
-        successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
+        successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
         check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
       else !useGPU
         d_vec(1) = a_mat(1,1)
...
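Note: the offsets used above (for example a_offset = l_cols * lda * size_of_datatype) are plain byte arithmetic on a device allocation tracked as a c_intptr_t handle. A small, purely illustrative helper (not ELPA code) that computes the device address of a given column under that convention:

! Illustrative helper: start address of column j (1-based) of a device matrix
! with leading dimension lda, where size_of_datatype is the element size in
! bytes (8 for double precision real, 16 for double complex).
function column_device_address(a_dev, j, lda, size_of_datatype) result(addr)
  use iso_c_binding, only: c_intptr_t
  implicit none
  integer(kind=c_intptr_t), intent(in) :: a_dev, size_of_datatype
  integer, intent(in)                  :: j, lda
  integer(kind=c_intptr_t)             :: addr
  addr = a_dev + int(j - 1, kind=c_intptr_t) * int(lda, kind=c_intptr_t) * size_of_datatype
end function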
@@ -114,9 +114,11 @@
      integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
-     MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
+     MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
+     MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,*)
#else
-     MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
+     MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
+     MATH_DATATYPE(kind=rck) :: tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
@@ -138,10 +140,12 @@
      integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
      real(kind=rk) :: vnorm2
-     MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
+     MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau
+     MATH_DATATYPE(kind=rck) :: vav(nbw,nbw)
      ! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
-     MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
+     MATH_DATATYPE(kind=rck), allocatable :: tmpCUDA(:)
+     MATH_DATATYPE(kind=rck), allocatable :: vmrCUDA(:), umcCUDA(:)
      MATH_DATATYPE(kind=rck), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
      MATH_DATATYPE(kind=rck), allocatable :: vr(:)
@@ -359,7 +363,8 @@
      cur_l_rows = 0
      cur_l_cols = 0
-     successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(a_dev, int(loc(a_mat(1,1)),kind=c_intptr_t), &
+                               (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"bandred_&
        &MATH_DATATYPE&
@@ -537,7 +542,7 @@
      cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
      if (my_pcol == cur_pcol) then
-       successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
+       successCUDA = cuda_memcpy2d(int(loc(a_mat(1, lc_start)),kind=c_intptr_t), &
                                    int((lda*size_of_datatype),kind=c_intptr_t), &
                                    (a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
                                    int(lda*size_of_datatype,kind=c_intptr_t), &
@@ -849,7 +854,7 @@
      if (my_pcol == cur_pcol) then
        successCUDA = cuda_memcpy2d((a_dev+ &
                                    int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
-                                   int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
+                                   int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
                                    int(lda*size_of_datatype,kind=c_intptr_t), &
                                    int(lr_end*size_of_datatype,kind=c_intptr_t), &
                                    int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -930,7 +935,7 @@
      if (my_pcol == cur_pcol) then
        successCUDA = cuda_memcpy2d((a_dev+ &
                                    int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
-                                   int(lda*size_of_datatype,kind=c_intptr_t), loc(a_mat(1,lc_start)), &
+                                   int(lda*size_of_datatype,kind=c_intptr_t), int(loc(a_mat(1,lc_start)),kind=c_intptr_t), &
                                    int(lda*size_of_datatype,kind=c_intptr_t), &
                                    int(lr_end*size_of_datatype,kind=c_intptr_t), &
                                    int((lc_end - lc_start+1),kind=c_intptr_t), &
@@ -1093,7 +1098,7 @@
      if (useGPU) then
        successCUDA = cuda_memcpy(vmr_dev, &
-                                 loc(vmrCUDA(1)),&
+                                 int(loc(vmrCUDA(1)),kind=c_intptr_t),&
                                  vmr_size*size_of_datatype,cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1103,7 +1108,7 @@
        endif
        successCUDA = cuda_memcpy(umc_dev, &
-                                 loc(umcCUDA(1)), &
+                                 int(loc(umcCUDA(1)),kind=c_intptr_t), &
                                  umc_size*size_of_datatype,cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1165,7 +1170,7 @@
      if (useGPU) then
        successCUDA = cuda_memcpy( &
-                                 loc(vmrCUDA(1)), &
+                                 int(loc(vmrCUDA(1)),kind=c_intptr_t), &
                                  vmr_dev,vmr_size*size_of_datatype,cudaMemcpyDeviceToHost)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1175,7 +1180,7 @@
        endif
        successCUDA = cuda_memcpy( &
-                                 loc(umcCUDA(1)), &
+                                 int(loc(umcCUDA(1)),kind=c_intptr_t), &
                                  umc_dev, umc_size*size_of_datatype,cudaMemcpyDeviceToHost)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1290,7 +1295,7 @@
      if (useGPU) then
        successCUDA = cuda_memcpy(umc_dev, &
-                                 loc(umcCUDA(1)), &
+                                 int(loc(umcCUDA(1)),kind=c_intptr_t), &
                                  umc_size*size_of_datatype, cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1298,7 +1303,8 @@
        &: error in cudaMemcpy umc_dev 5"
        stop 1
      endif
-     successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(tmat_dev,int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
+                               nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"bandred_&
        &MATH_DATATYPE&
@@ -1312,7 +1318,8 @@
      call obj%timer%stop("cublas")
      ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
-     successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(vav_dev,int(loc(vav(1,1)),kind=c_intptr_t), &
+                               nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"bandred_&
        &MATH_DATATYPE&
@@ -1330,7 +1337,8 @@
                          n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
      call obj%timer%stop("cublas")
-     successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
+     successCUDA = cuda_memcpy(int(loc(vav(1,1)),kind=c_intptr_t), &
+                               vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"bandred_&
        &MATH_DATATYPE&
@@ -1368,7 +1376,7 @@
                          (obj, n_cols,vav, nbw, nbw ,mpi_comm_cols)
      if (useGPU) then
-       successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
+       successCUDA = cuda_memcpy(vav_dev, int(loc(vav(1,1)),kind=c_intptr_t), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
          &MATH_DATATYPE&
@@ -1396,7 +1404,7 @@
        call obj%timer%stop("cublas")
        successCUDA = cuda_memcpy( &
-                                 loc(umcCUDA(1)), &
+                                 int(loc(umcCUDA(1)),kind=c_intptr_t), &
                                  umc_dev, umc_size*size_of_datatype, cudaMemcpyDeviceToHost)
        if (.not.(successCUDA)) then
@@ -1416,7 +1424,7 @@
                          1, istep*nbw, n_cols, nblk, max_threads)
        successCUDA = cuda_memcpy(vmr_dev, &
-                                 loc(vmrCUDA(1)), &
+                                 int(loc(vmrCUDA(1)),kind=c_intptr_t), &
                                  vmr_size*size_of_datatype, cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1426,7 +1434,7 @@
        endif
        successCUDA = cuda_memcpy(umc_dev, &
-                                 loc(umcCUDA(1)), &
+                                 int(loc(umcCUDA(1)),kind=c_intptr_t), &
                                  umc_size*size_of_datatype, cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
          print *,"bandred_&
@@ -1687,7 +1695,8 @@
      ! (band to tridi). Previously, a has been kept on the device and then
      ! copied in redist_band (called from tridiag_band). However, it seems to
      ! be easier to do it here.
-     successCUDA = cuda_memcpy (loc(a_mat), int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
+     successCUDA = cuda_memcpy (int(loc(a_mat),kind=c_intptr_t), &
+                                int(a_dev,kind=c_intptr_t), int(lda*matrixCols* size_of_datatype, kind=c_intptr_t), &
                                 cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"bandred_&
...
@@ -74,14 +74,14 @@ program print_available_elpa2_kernels
   integer(kind=c_int) :: i
   class(elpa_t), pointer :: e
-  integer :: option
+  integer :: option, error
   if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
     print *, "Unsupported ELPA API Version"
     stop 1
   endif
-  e => elpa_allocate()
+  e => elpa_allocate(error)
   print *, "This program will give information on the ELPA2 kernels, "
   print *, "which are available with this library and it will give "
@@ -125,7 +125,7 @@ program print_available_elpa2_kernels
   print *
   print *
-  call elpa_deallocate(e)
+  call elpa_deallocate(e, error)
   contains
@@ -141,7 +141,7 @@ program print_available_elpa2_kernels
   if (elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_COMPLEX_GPU" .or. &
       elpa_int_value_to_string(KERNEL_KEY, i) .eq. "ELPA_2STAGE_REAL_GPU") then
     if (e%can_set("use_gpu",1) == ELPA_OK) then
-      call e%set("use_gpu",1)
+      call e%set("use_gpu",1, error)
    endif
  endif
...
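Note: the hunk above adapts the example program to the interface where elpa_allocate, elpa_deallocate and set report a status argument. A hedged usage sketch of that calling pattern; the literal API version number and the minimal error checks are illustrative assumptions, not copied from the program:

! Sketch of the calling pattern implied by the hunk above.
program elpa_status_sketch
  use elpa
  implicit none
  class(elpa_t), pointer :: e
  integer :: error

  if (elpa_init(20180525) /= ELPA_OK) then   ! assumed API version constant
    print *, "Unsupported ELPA API version"
    stop 1
  endif

  e => elpa_allocate(error)
  if (error /= ELPA_OK) stop 1

  if (e%can_set("use_gpu", 1) == ELPA_OK) then
    call e%set("use_gpu", 1, error)
  endif

  call elpa_deallocate(e, error)
  call elpa_uninit()
end program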
@@ -78,10 +78,10 @@
#ifdef USE_ASSUMED_SIZE
     MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,*)
-    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,*)
+    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,*)
#else
     MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
-    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
+    MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, intent(out), target :: q(obj%local_nrows,obj%local_ncols)
#endif
     real(kind=C_DATATYPE_KIND), intent(inout) :: ev(obj%na)
     MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
@@ -734,7 +734,7 @@
     ! if the second backward step is to be performed, but not on GPU, we have
     ! to transfer q to the host
     if(do_trans_to_full .and. (.not. do_useGPU_trans_ev_band_to_full)) then
-      successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
+      successCUDA = cuda_memcpy(int(loc(q),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"elpa2_template, error in copy to host"
        stop 1
@@ -760,7 +760,7 @@
     ! copy to device if we want to continue on GPU
     successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
-    successCUDA = cuda_memcpy(q_dev, loc(q), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
+    successCUDA = cuda_memcpy(q_dev, int(loc(q),kind=c_intptr_t), ldq*matrixCols* size_of_datatype, cudaMemcpyHostToDevice)
     if (.not.(successCUDA)) then
       print *,"elpa2_template, error in copy to device"
       stop 1
...
@@ -110,9 +110,11 @@
#endif
      integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
-     MATH_DATATYPE(kind=rck) :: a_mat(lda,*), q_mat(ldq,*), tmat(nbw,nbw,*)
+     MATH_DATATYPE(kind=rck) :: a_mat(lda,*)
+     MATH_DATATYPE(kind=rck) :: q_mat(ldq,*), tmat(nbw,nbw,*)
#else
-     MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
+     MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols)
+     MATH_DATATYPE(kind=rck) :: q_mat(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
      integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real at the moment not used since copied in bandred_real
@@ -122,7 +124,8 @@
      integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
      integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
-     MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: hvb(:)
+     MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvm(:,:)
      ! hvm_dev is first used and set in this routine
      ! q_mat is changed in trans_ev_tridi on the host, copied to device and passed here. this can be adapted
      ! tmp_dev is first used in this routine
@@ -268,7 +271,7 @@
      ! q_temp(1:ldq,1:na_cols) = q_mat(1:ldq,1:na_cols)
      ! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
-     ! successCUDA = cuda_memcpy(q_dev, loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
+     ! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
      ! if (.not.(successCUDA)) then
      !   print *,"trans_ev_band_to_full_real: error in cudaMalloc"
      !   stop 1
@@ -281,7 +284,7 @@
      !   stop 1
      ! endif
      !
-     ! successCUDA = cuda_memcpy(q_dev, loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
+     ! successCUDA = cuda_memcpy(q_dev, c_loc(q_mat),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
      ! if (.not.(successCUDA)) then
      !   print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
      !   stop 1
@@ -346,7 +349,8 @@
        nb = nb+l_rows
      enddo
-     successCUDA = cuda_memcpy(hvm_dev, loc(hvm), max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(hvm_dev, int(loc(hvm),kind=c_intptr_t), &
+                               max_local_rows*nbw* size_of_datatype, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_real: error in cudaMemcpy, hvm"
@@ -369,7 +373,8 @@
      ! copy data from device to host for a later MPI_ALLREDUCE
      ! copy to host maybe this can be avoided this is needed if MPI is used (allreduce)
-     successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
+     successCUDA = cuda_memcpy(int(loc(tmp1),kind=c_intptr_t), &
+                               tmp_dev, l_cols*n_cols*size_of_datatype, cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_real: error in cudaMemcpy, tmp1 to host"
        stop 1
@@ -398,7 +403,8 @@
#ifdef WITH_MPI
      ! after the mpi_allreduce we have to copy back to the device
      ! copy back to device
-     successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* size_of_datatype, &
+     successCUDA = cuda_memcpy(tmp_dev, int(loc(tmp2),kind=c_intptr_t), &
+                               n_cols*l_cols* size_of_datatype, &
                               cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_&
@@ -414,7 +420,8 @@
      ! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
      ! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
      ! need to upload another slice
-     successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
+     successCUDA = cuda_memcpy(tmat_dev, int(loc(tmat(1,1,istep)),kind=c_intptr_t), &
+                               nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_&
@@ -434,7 +441,8 @@
      ! copy to host maybe this can be avoided
      ! this is not necessary hvm is not used anymore
-     successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
+     successCUDA = cuda_memcpy(int(loc(hvm),kind=c_intptr_t), &
+                               hvm_dev, ((max_local_rows)*nbw*size_of_datatype),cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_real: error in cudaMemcpy hvm to host"
        stop 1
@@ -779,7 +787,8 @@
      ! final transfer of q_dev
-     successCUDA = cuda_memcpy(loc(q_mat), q_dev, ldq*matrixCols* size_of_datatype, cudaMemcpyDeviceToHost)
+     successCUDA = cuda_memcpy(int(loc(q_mat),kind=c_intptr_t), q_dev, ldq*matrixCols* size_of_datatype, &
+                               cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"trans_ev_band_to_full_&
...
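Note: several hunks above only split the USE_ASSUMED_SIZE declarations onto separate lines; both variants describe the same column-major storage. A hedged, generic fragment illustrating the two forms (not ELPA code):

! Generic sketch: the same dummy matrix declared with an assumed-size last
! dimension (the caller guarantees at least matrixCols columns) or with an
! explicit shape; element (i,j) occupies the same storage location either way.
subroutine touch_first_element(lda, matrixCols, a_mat)
  implicit none
  integer, intent(in) :: lda, matrixCols
#ifdef USE_ASSUMED_SIZE
  double precision, intent(inout) :: a_mat(lda,*)
#else
  double precision, intent(inout) :: a_mat(lda,matrixCols)
#endif
  a_mat(1,1) = 0.0d0
end subroutine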
@@ -143,11 +143,15 @@
      MATH_DATATYPE(kind=rck) , allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
-     MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
-     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:)
#else
-     MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
-     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:,:)
+     MATH_DATATYPE(kind=rck), allocatable :: bottom_border_recv_buffer(:,:,:)
#endif
      integer(kind=c_intptr_t) :: aIntern_dev
@@ -1234,7 +1238,7 @@
#endif /* WITH_MPI */
      if (useGPU) then
-       successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
+       successCUDA = cuda_memcpy(bcast_buffer_dev, int(loc(bcast_buffer(1,1)),kind=c_intptr_t), &
                                  nbw * current_local_n * &
                                  size_of_datatype, &
                                  cudaMemcpyHostToDevice)
@@ -1345,7 +1349,8 @@
      if (useGPU) then
        dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
-       successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
+       successCUDA = cuda_memcpy( aIntern_dev + dev_offset , &
+                                  int(loc(bottom_border_recv_buffer(1,1,i)),kind=c_intptr_t), &
                                  stripe_width*nbw* size_of_datatype, &
                                  cudaMemcpyHostToDevice)
        if (.not.(successCUDA)) then
@@ -1430,7 +1435,7 @@
      if (useGPU) then
        dev_offset = (0 + (a_off * stripe_width) +