Commit d910628e authored by Pavel Kus's avatar Pavel Kus

fixed elpa2 tmat_dev bug affecting larger matrices only

added several larger tests in CI. Still missing for gfortran (no
gfortran+cuda tests at the moment, will be added)
parent 0c766a0f
......@@ -130,6 +130,20 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
# Larger-problem variant of the single-precision, MPI, no-OpenMP CUDA job:
# same build as the regular job, but `make check` runs with
# TEST_FLAGS='1500 500 128' instead of '150 50 128'.
# NOTE(review): TEST_FLAGS presumably is "matrix size, number of eigenvectors,
# block size" - confirm against the test programs.
intel-single-precision-mpi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# The CI runner stops at the first failing script command, so a separate
# `cat test-suite.log` step is never reached when the check fails. Dump the
# log in the failure path as well (same pattern as the CPU jobs) and keep the
# job failing via `exit 1`.
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128' || { cat test-suite.log; exit 1; }
# Still print the log after a successful run, as before.
- cat test-suite.log
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
# - gpu
......@@ -176,6 +190,20 @@ intel-single-precision-nompi-noomp-cuda-jobs:
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
# Larger-problem variant of the single-precision, non-MPI (--with-mpi=0),
# no-OpenMP CUDA job: same build as the regular job, but `make check` runs
# with TEST_FLAGS='1500 500 128' instead of '150 50 128'.
# NOTE(review): TEST_FLAGS presumably is "matrix size, number of eigenvectors,
# block size" - confirm against the test programs.
intel-single-precision-nompi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# The CI runner stops at the first failing script command, so a separate
# `cat test-suite.log` step is never reached when the check fails. Dump the
# log in the failure path as well (same pattern as the CPU jobs) and keep the
# job failing via `exit 1`.
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128' || { cat test-suite.log; exit 1; }
# Still print the log after a successful run, as before.
- cat test-suite.log
#gfortran-single-precision-mpi-noomp-jobs:
# tags:
......@@ -217,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
# Larger-problem variant of intel-double-precision-nompi-noomp-jobs: builds
# with ifort, AVX flags and MPI disabled, then runs `make check` with
# TEST_FLAGS='1000 50 32' (the regular job uses '150 50 16').
# NOTE(review): TEST_FLAGS presumably is "matrix size, number of eigenvectors,
# block size" - confirm against the test programs.
intel-double-precision-nompi-noomp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP" --with-mpi=no FC=ifort
- make -j 8
# Prepend the MKL library directory to the runtime library search path for the test run.
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# On failure, print the test log and fail the job explicitly.
- make check TEST_FLAGS='1000 50 32' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-assumed-size-jobs:
tags:
- cpu
......@@ -380,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
# Larger-problem variant of intel-single-precision-mpi-openmp-jobs: builds
# single precision with OpenMP enabled and AVX flags, then runs `make check`
# with TEST_FLAGS='1500 500 16' (the regular job uses '150 50 16').
# NOTE(review): TEST_FLAGS presumably is "matrix size, number of eigenvectors,
# block size" - confirm against the test programs.
intel-single-precision-mpi-openmp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --enable-single-precision
- make -j 8
# Run the tests with two OpenMP threads.
- export OMP_NUM_THREADS=2
# Prepend the MKL library directory to the runtime library search path for the test run.
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# On failure, print the test log and fail the job explicitly.
- make check TEST_FLAGS='1500 500 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-assumed-size-jobs:
tags:
- cpu
......
......@@ -507,26 +507,19 @@
#endif
#endif /* WITH_MPI */
!#ifdef WITH_MPI
!! it should be possible to keep tmat on the device and not copy it aroud
!! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw* &
!#if REALCASE == 1
! size_of_PRECISION_real, &
!#endif
!#if COMPLEXCASE == 1
! size_of_PRECISION_complex, &
!#endif
! cudaMemcpyHostToDevice)
!
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_&
! &MATH_DATATYPE&
! &: error in cudaMemcpy"
! stop 1
! endif
!#endif /* WITH_MPI */
#ifdef WITH_MPI
! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
! tmat is a 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
! need to upload another slice
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop 1
endif
#endif /* WITH_MPI */
call timer%start("cublas")
#if REALCASE == 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment