Commit d910628e authored by Pavel Kus
Browse files

fixed elpa2 tmat_dev bug affecting larger matrices only

added several larger tests in CI. Still missing for gfortran (no
gfortran+cuda tests at the moment, will be added)
parent 0c766a0f
...@@ -130,6 +130,20 @@ intel-single-precision-mpi-noomp-cuda-jobs: ...@@ -130,6 +130,20 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log - cat test-suite.log
# CI job: single-precision, MPI, no-OpenMP build with GPU (CUDA/cuBLAS) support,
# linked against the Intel MKL ScaLAPACK flags, executed on a runner tagged "gpu".
# This is the "larger" variant: it runs `make check` with TEST_FLAGS='1500 500 128'
# instead of the '150 50 128' used by the regular CUDA job.
# NOTE(review): TEST_FLAGS presumably means <matrix size> <number of eigenvectors>
# <block size> - confirm against the test programs.
intel-single-precision-mpi-noomp-cuda-larger-jobs:
  tags:
    - gpu
  script:
    # Swap the default gcc for gcc/4.9 and load the CUDA module.
    - module unload gcc
    - module load gcc/4.9 cuda
    - module list
    - ./autogen.sh
    - ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
    - make -j 8
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # A failing script line aborts the job immediately, so the log must be dumped
    # from the same line that fails; a separate `cat` line would never be reached.
    # NOTE(review): assumes reserve_timeslot propagates make's exit status - confirm.
    - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128' || { cat test-suite.log; exit 1; }
    # On success the log is still printed, as before.
    - cat test-suite.log
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs: #intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags: # tags:
# - gpu # - gpu
...@@ -176,6 +190,20 @@ intel-single-precision-nompi-noomp-cuda-jobs: ...@@ -176,6 +190,20 @@ intel-single-precision-nompi-noomp-cuda-jobs:
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log - cat test-suite.log
# CI job: single-precision, no-MPI (--with-mpi=0), no-OpenMP build with GPU
# (CUDA/cuBLAS) support, compiled with ifort and linked against the Intel MKL
# flags, executed on a runner tagged "gpu".
# This is the "larger" variant: it runs `make check` with TEST_FLAGS='1500 500 128'
# instead of the '150 50 128' used by the regular CUDA job.
# NOTE(review): TEST_FLAGS presumably means <matrix size> <number of eigenvectors>
# <block size> - confirm against the test programs.
intel-single-precision-nompi-noomp-cuda-larger-jobs:
  tags:
    - gpu
  script:
    # Swap the default gcc for gcc/4.9 and load the CUDA module.
    - module unload gcc
    - module load gcc/4.9 cuda
    - module list
    - ./autogen.sh
    - ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
    - make -j 8
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    # A failing script line aborts the job immediately, so the log must be dumped
    # from the same line that fails; a separate `cat` line would never be reached.
    # NOTE(review): assumes reserve_timeslot propagates make's exit status - confirm.
    - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128' || { cat test-suite.log; exit 1; }
    # On success the log is still printed, as before.
    - cat test-suite.log
#gfortran-single-precision-mpi-noomp-jobs: #gfortran-single-precision-mpi-noomp-jobs:
# tags: # tags:
...@@ -217,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs: ...@@ -217,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; } - make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
# CI job: "larger" variant of the double-precision, no-MPI, no-OpenMP Intel build.
# Runs on a "cpu"-tagged runner and calls `make check` with TEST_FLAGS='1000 50 32';
# on failure the test-suite log is printed and the job exits with status 1.
intel-double-precision-nompi-noomp-larger-jobs:
  tags: [cpu]
  script:
    - ./autogen.sh
    # Folded scalar: the lines below are joined with single spaces into one command.
    - >-
      ./configure
      CFLAGS="-O3 -mavx"
      CXXFLAGS="-O3 -mavx"
      FCFLAGS="-O3 -mavx"
      SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP"
      SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP"
      --with-mpi=no
      FC=ifort
    - make -j 8
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    - make check TEST_FLAGS='1000 50 32' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-assumed-size-jobs: intel-double-precision-nompi-noomp-assumed-size-jobs:
tags: tags:
- cpu - cpu
...@@ -380,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs: ...@@ -380,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; } - make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
# CI job: "larger" variant of the single-precision, MPI + OpenMP Intel build.
# Runs on a "cpu"-tagged runner with OMP_NUM_THREADS=2 and calls `make check`
# with TEST_FLAGS='1500 500 16'; on failure the test-suite log is printed and
# the job exits with status 1.
intel-single-precision-mpi-openmp-larger-jobs:
  tags: [cpu]
  script:
    - ./autogen.sh
    # Folded scalar: the lines below are joined with single spaces into one command.
    - >-
      ./configure
      CFLAGS="-O3 -mavx"
      CXXFLAGS="-O3 -mavx"
      FCFLAGS="-O3 -mavx"
      SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_OMP"
      SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP"
      --enable-openmp
      --enable-single-precision
    - make -j 8
    - export OMP_NUM_THREADS=2
    - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
    - make check TEST_FLAGS='1500 500 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-assumed-size-jobs: intel-single-precision-mpi-openmp-assumed-size-jobs:
tags: tags:
- cpu - cpu
......
...@@ -507,26 +507,19 @@ ...@@ -507,26 +507,19 @@
#endif #endif
#endif /* WITH_MPI */ #endif /* WITH_MPI */
!#ifdef WITH_MPI #ifdef WITH_MPI
!! it should be possible to keep tmat on the device and not copy it around ! IMPORTANT: even though tmat_dev is transferred from the previous routine, we have to copy from tmat again
!! ! copy to device, maybe this can be avoided tmat is input from bandred_real ! tmat is 3-dimensional array, while tmat_dev contains only one 2-dimensional slice of it - and here we
! ! need to upload another slice
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw* & successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_datatype, cudaMemcpyHostToDevice)
!#if REALCASE == 1
! size_of_PRECISION_real, & if (.not.(successCUDA)) then
!#endif print *,"trans_ev_band_to_full_&
!#if COMPLEXCASE == 1 &MATH_DATATYPE&
! size_of_PRECISION_complex, & &: error in cudaMemcpy"
!#endif stop 1
! cudaMemcpyHostToDevice) endif
! #endif /* WITH_MPI */
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_&
! &MATH_DATATYPE&
! &: error in cudaMemcpy"
! stop 1
! endif
!#endif /* WITH_MPI */
call timer%start("cublas") call timer%start("cublas")
#if REALCASE == 1 #if REALCASE == 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment