Commit 8aae13e2 authored by Andreas Marek
Browse files

ELPA 2stage GPU only works with nblk =128

parent 8c0a2ae7
......@@ -127,37 +127,38 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
# CI job: configures ELPA with --enable-gpu-support and --enable-single-precision,
# builds it, exports ELPA_USE_GPU and the REAL/COMPLEX_ELPA_KERNEL variables,
# and then runs `make check`.
intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# presumably restricts this job to runners tagged `gpu` -- confirm against the runner setup
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
# Build against the MKL ScaLAPACK flags (MPI, no OpenMP) plus cuBLAS from $CUDA_HOME.
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# Request GPU usage and the GPU real/complex kernels through the environment for the test run.
- export ELPA_USE_GPU=yes
- export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
- export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# NOTE(review): the third TEST_FLAGS value is presumably the block size nblk (128 here) -- confirm.
# If `make check` fails, print test-suite.log and fail the job.
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
# - gpu
# script:
# - module unload gcc
# - module load gcc/4.9 cuda
# - module list
# - ./autogen.sh
# - ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - export ELPA_USE_GPU=yes
# - export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
# - export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
#intel-single-precision-mpi-noomp-cuda-blocksize-jobs:
# tags:
# - gpu
# script:
# - module unload gcc
# - module load gcc/4.9 cuda
# - module list
# - ./autogen.sh
# - ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
# - make -j 8
# - export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# - export ELPA_USE_GPU=yes
# - export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
# - export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# - /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
# CI job: same GPU-enabled single-precision build and environment as the
# runtime-choice job, but `make check` is run with TEST_FLAGS='150 50 16'.
intel-single-precision-mpi-noomp-cuda-blocksize-jobs:
# presumably restricts this job to runners tagged `gpu` -- confirm against the runner setup
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
# Build against the MKL ScaLAPACK flags (MPI, no OpenMP) plus cuBLAS from $CUDA_HOME.
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
# Request GPU usage and the GPU real/complex kernels through the environment for the test run.
- export ELPA_USE_GPU=yes
- export REAL_ELPA_KERNEL=REAL_ELPA_KERNEL_GPU
- export COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GPU
# NOTE(review): if the third TEST_FLAGS value is the block size nblk, 16 conflicts with the
# ELPA 2stage solvers, which return an error when the GPU is used and nblk is not 128.
# Confirm whether this job should stay enabled.
# If `make check` fails, print test-suite.log and fail the job.
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-single-precision-nompi-noomp-cuda-jobs:
tags:
- gpu
......
......@@ -581,6 +581,14 @@ module ELPA2
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
......@@ -960,6 +968,13 @@ module ELPA2
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
......@@ -1296,6 +1311,13 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
......@@ -1642,6 +1664,14 @@ function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
return
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment