Commit 40e9db4a authored by Luka Stanisic's avatar Luka Stanisic

updating labbook

parent a6e19b48
......@@ -124,7 +124,7 @@ mpiexec -n 2 $BUILD_DIR/bioEM --Inputfile $TUTORIAL_DIR/Param_Input --Modelfile
# Loading necessary modules if needed (check installation gcc/Intel)
# Environment variables
export OMP_NUM_THREADS=20
export OMP_NUM_THREADS=10
export OMP_PLACES=cores
export FFTALGO=1
export GPU=1
......@@ -256,7 +256,7 @@ BIOEM_DEBUG_BREAK=4 BIOEM_DEBUG_OUTPUT=2 mpiexec -n 2 $BUILD_DIR/bioEM --Inputfi
BIOEM_DEBUG_OUTPUT=0 mpiexec -n 2 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_FRH_Sep2016 --Modelfile $INPUT_DIR/Mod_X-ray_PDB --Particlesfile $INPUT_DIR/2000FRH_Part
#+END_SRC
*** [partially working] hydra machine
*** [working] hydra machine
**** Machine description (from http://www.mpcdf.mpg.de/services/computing/hydra/about-the-system)
In October 2013, the main part of the iDataPlex HPC system HYDRA
......@@ -291,14 +291,14 @@ BIOEM_DEBUG_OUTPUT=0 mpiexec -n 2 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_
between the domains is much weaker, so batch jobs are restricted
to a single domain.
**** [error: problems with boost] Installation with gnu compilers
**** [working] Installation with gnu compilers
#+BEGIN_SRC
# Loading necessary modules
module purge
module load git/2.8
module load cmake/3.5
module load gcc/4.9
module load gcc/5.4
module load mpi.ibm/1.4.0
module load fftw/gcc/3.3.4
module load boost/gcc/1.61
......@@ -319,7 +319,7 @@ ln -s /u/system/SLES11/soft/cuda/libcuda/libcuda.so.352.79 libcuda.so.1
ln -s libcuda.so.1 libcuda.so
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON -DCUDA_rt_LIBRARY=/u/system/SLES11/soft/cuda/7.5/lib64/libcudart.so -DCUDA_SDK_ROOT_DIR=/u/system/SLES11/soft/cuda/7.5/samples/common -DCUDA_CUDA_LIBRARY=$PWD/libcuda.so $SRC_DIR/
cmake -DMPI_C_COMPILER=mpigcc -DMPI_CXX_COMPILER=mpicxx -DCMAKE_CXX_COMPILER=g++ -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=OFF -DCUDA_rt_LIBRARY=/u/system/SLES11/soft/cuda/7.5/lib64/libcudart.so -DCUDA_SDK_ROOT_DIR=/u/system/SLES11/soft/cuda/7.5/samples/common -DCUDA_CUDA_LIBRARY=$PWD/libcuda.so $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
......@@ -444,50 +444,17 @@ mpiexec -n 8 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_FRH_Sep2016 --Modelfi
#+END_SRC
- Later just do /llsubmit batch_script/
*** [partially working] draco machine
**** Machine description (from http://www.mpcdf.mpg.de/services/computing/draco/about-the-system)
The extension cluster DRACO of the HPC system HYDRA was installed
in May 2016 at the MPCDF with Intel 'Haswell' Xeon E5-2698
processors (~ 880 nodes with 32 cores @ 2.3 GHz each). 106 of the
nodes are equipped with accelerator cards (2 x PNY GTX980 GPUs
each).
Most of the compute nodes have a main memory of 128 GB, 4 nodes
have 512 GB, 1 has 256 GB, 4 of the GPU nodes have a main memory
of 256 GB.
In January 2017, the DRACO cluster was expanded by 64 Intel
'Broadwell' nodes that were purchased by the Fritz-Haber
Institute. The 'Broadwell' nodes have 40 cores each and a main
memory of 256 GB.
In total there are 30.688 cores with a main memory of 128 TB and
a peak performance of 1.12 PetaFlop/s.
In addition to the compute nodes there are 4 login nodes and 8
I/O nodes that serve the 1.5 PetaByte of disk storage.
The common interconnect is a fast InfiniBand FDR14 network.
The compute nodes and GPU nodes are bundled into 30 domains.
Within one domain, the InfiniBand network topology is a 'fat
tree' topology for highly efficient communication. The InfiniBand
connection between the domains is much weaker, so batch jobs are
restricted to a single domain, that is 32 nodes.
**** [error: problems with boost] Installation with gnu compilers
*** [working] new recipes for draco
- Installation with Intel modules
#+BEGIN_SRC
# Loading necessary modules
module purge
module load git/2.8
module load cmake/3.5
module load gcc/4.9
module load impi/5.1.3
module load fftw/gcc/3.3.4
module load boost/gcc/1.61
module load cuda/7.5
module purge
module load git/2.13
module load cmake/3.7
module load intel/17.0
module load impi/2017.3
module load fftw/3.3.6
module load boost/intel/1.64
module load cuda/8.0
# Paths
SRC_DIR="$afsdir/BioEM_fork"
......@@ -499,24 +466,25 @@ cd $BUILD_DIR
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY and CUDA_SDK_ROOT_DIR)
cmake -DMPI_C_COMPILER=mpiicc -DMPI_CXX_COMPILER=mpiicpc -DCMAKE_CXX_COMPILER=icpc -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
**** [working] Installation with Intel compilers
- Installation with gcc modules. Beforehand, CUDA_HOST_COMPILER needs to be manually set to the gcc 5.4 binary, as CUDA was compiled with that gcc version
#+BEGIN_SRC
# In CMakeLists.txt:
#set (CUDA_HOST_COMPILER /mpcdf/soft/SLES122/common/gcc/5.4.0/bin/gcc)
# needs to be set instead of
#set (CUDA_HOST_COMPILER gcc)
# Loading necessary modules
module purge
module load git/2.8
module load cmake/3.5
module load intel/16.0
module load mkl/11.3
module load impi/5.1.3
module load fftw/3.3.4
module load boost/intel/1.61
module load cuda/7.5
module load git/2.13
module load cmake/3.7
module load gcc/6.3
module load impi/2017.3
module load fftw/gcc/3.3.6
module load boost/gcc/1.64
module load cuda/8.0
# Paths
SRC_DIR="$afsdir/BioEM_fork"
......@@ -528,104 +496,12 @@ cd $BUILD_DIR
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY and CUDA_SDK_ROOT_DIR)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
# Configuration and compilation
cmake -DMPI_C_COMPILER=mpigcc -DMPI_CXX_COMPILER=mpicxx -DCMAKE_CXX_COMPILER=g++ -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
**** [working] Running interactive Tutorial example (no GPUs)
#+BEGIN_SRC
# Loading necessary modules if needed (check installation gcc/Intel)
# Environment variables
export OMP_NUM_THREADS=5
export OMP_PLACES=cores
export FFTALGO=1
export GPU=0
# Paths
TUTORIAL_DIR="$HOME/BioEM_project/tutorial"
BUILD_DIR="$HOME/BioEM_project/build"
# Running tutorial test
srun -n 4 $BUILD_DIR/bioEM --Inputfile $TUTORIAL_DIR/Param_Input --Modelfile $TUTORIAL_DIR/Model_Text --Particlesfile $TUTORIAL_DIR/Text_Image_Form
#+END_SRC
**** [working] Running larger example from the paper
- Sample scripts are available here: https://www.mpcdf.mpg.de/services/computing/draco/sample-batch-script
#+BEGIN_SRC
#!/bin/bash -l
# Standard output and error:
#SBATCH -o ./tjob_hybrid.out.%j
#SBATCH -e ./tjob_hybrid.err.%j
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J bioem_test
# Queue (Partition):
#SBATCH --partition=gpu
# Node feature:
#SBATCH --constraint="gpu"
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=2
# for OpenMP:
#SBATCH --cpus-per-task=16
#
#SBATCH --mail-type=none
#SBATCH --mail-user=<userid>@rzg.mpg.de
# Wall clock limit:
#SBATCH --time=01:00:00
# Loading necessary modules for Intel compilers
module purge
module load git/2.8
module load cmake/3.5
module load intel/16.0
module load mkl/11.3
module load impi/5.1.3
module load fftw/3.3.4
module load boost/intel/1.61
module load cuda/7.5
# Environment variables
export OMP_NUM_THREADS=16
export OMP_PLACES=cores
export FFTALGO=1
export GPU=1
export GPUDEVICE=-1
# For hyperthreading, but it actually degrades performance
#export OMP_NUM_THREADS=32
#export OMP_PLACES=threads
#export SLURM_HINT=multithread
# Environment variable to tune
export GPUWORKLOAD=90
export BIOEM_DEBUG_OUTPUT=0
export BIOEM_AUTOTUNING=0
# Paths
INPUT_DIR="$HOME/BioEM_project/inputs"
BUILD_DIR="$HOME/BioEM_project/build"
datafile="$HOME/BioEM_project/data/draco_$BIOEM_AUTOTUNING.org"
> $datafile
# Environment capture
$HOME/BioEM_project/get_info.sh $datafile
echo "* BIOEM OUTPUT" >> $datafile
# Running full example
cd /ptmp/$USER/
srun -n 8 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_FRH_Sep2016 --Modelfile $INPUT_DIR/Mod_X-ray_PDB --Particlesfile $INPUT_DIR/2000FRH_Part >> $datafile
#+END_SRC
- Later just do /sbatch batch_script_draco/
*** [partially working] phys machine
*** [working] phys machine
**** Machine description
There is no real documentation of the machine, but its
characteristics match one of the following:
......@@ -633,7 +509,7 @@ srun -n 8 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_FRH_Sep2016 --Modelfile
1. phys01 (E5-2680v3 + GTX1080) - suggested by Markus in his email
2. HSW + GTX980 - suggested in the spreadsheet from Markus
**** [error: problems with boost] Installation with gnu compilers
**** [working] Installation with gnu compilers
#+BEGIN_SRC
# Loading necessary modules
......@@ -643,7 +519,7 @@ module load cmake/3.5
module load gcc/4.9
module load impi/5.1.3
module load fftw/gcc/3.3.4
module load boost/gcc/1.60
module load boost/gcc/1.57
module load cuda/7.5
# Paths
......@@ -657,7 +533,7 @@ rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
cmake -DMPI_C_COMPILER=mpigcc -DMPI_CXX_COMPILER=mpicxx -DCMAKE_CXX_COMPILER=g++ -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
......@@ -821,6 +697,186 @@ done
#+END_SRC
** Deprecated
*** [partially working] draco machine
**** Machine description (from http://www.mpcdf.mpg.de/services/computing/draco/about-the-system)
The extension cluster DRACO of the HPC system HYDRA was installed
in May 2016 at the MPCDF with Intel 'Haswell' Xeon E5-2698
processors (~ 880 nodes with 32 cores @ 2.3 GHz each). 106 of the
nodes are equipped with accelerator cards (2 x PNY GTX980 GPUs
each).
Most of the compute nodes have a main memory of 128 GB, 4 nodes
have 512 GB, 1 has 256 GB, 4 of the GPU nodes have a main memory
of 256 GB.
In January 2017, the DRACO cluster was expanded by 64 Intel
'Broadwell' nodes that were purchased by the Fritz-Haber
Institute. The 'Broadwell' nodes have 40 cores each and a main
memory of 256 GB.
In total there are 30.688 cores with a main memory of 128 TB and
a peak performance of 1.12 PetaFlop/s.
In addition to the compute nodes there are 4 login nodes and 8
I/O nodes that serve the 1.5 PetaByte of disk storage.
The common interconnect is a fast InfiniBand FDR14 network.
The compute nodes and GPU nodes are bundled into 30 domains.
Within one domain, the InfiniBand network topology is a 'fat
tree' topology for highly efficient communication. The InfiniBand
connection between the domains is much weaker, so batch jobs are
restricted to a single domain, that is 32 nodes.
**** [error: problems with boost] Installation with gnu compilers
#+BEGIN_SRC
# Loading necessary modules
module purge
module load git/2.8
module load cmake/3.5
module load gcc/4.9
module load impi/5.1.3
module load fftw/gcc/3.3.4
module load boost/gcc/1.61
module load cuda/7.5
# Paths
SRC_DIR="$afsdir/BioEM_fork"
BUILD_DIR="$HOME/BioEM_project/build"
mkdir -p $BUILD_DIR
cd $BUILD_DIR
# Deleting files from previous installations
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
**** [working] Installation with Intel compilers
#+BEGIN_SRC
# Loading necessary modules
module purge
module load git/2.8
module load cmake/3.5
module load intel/16.0
module load mkl/11.3
module load impi/5.1.3
module load fftw/3.3.4
module load boost/intel/1.61
module load cuda/7.5
# Paths
SRC_DIR="$afsdir/BioEM_fork"
BUILD_DIR="$HOME/BioEM_project/build"
mkdir -p $BUILD_DIR
cd $BUILD_DIR
# Deleting files from previous installations
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY and CUDA_SDK_ROOT_DIR)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
**** [working] Running interactive Tutorial example (no GPUs)
#+BEGIN_SRC
# Loading necessary modules if needed (check installation gcc/Intel)
# Environment variables
export OMP_NUM_THREADS=5
export OMP_PLACES=cores
export FFTALGO=1
export GPU=0
# Paths
TUTORIAL_DIR="$HOME/BioEM_project/tutorial"
BUILD_DIR="$HOME/BioEM_project/build"
# Running tutorial test
srun -n 4 $BUILD_DIR/bioEM --Inputfile $TUTORIAL_DIR/Param_Input --Modelfile $TUTORIAL_DIR/Model_Text --Particlesfile $TUTORIAL_DIR/Text_Image_Form
#+END_SRC
**** [working] Running larger example from the paper
- Sample scripts are available here: https://www.mpcdf.mpg.de/services/computing/draco/sample-batch-script
#+BEGIN_SRC
#!/bin/bash -l
# Standard output and error:
#SBATCH -o ./tjob_hybrid.out.%j
#SBATCH -e ./tjob_hybrid.err.%j
# Initial working directory:
#SBATCH -D ./
# Job Name:
#SBATCH -J bioem_test
# Queue (Partition):
#SBATCH --partition=gpu
# Node feature:
#SBATCH --constraint="gpu"
# Number of nodes and MPI tasks per node:
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=2
# for OpenMP:
#SBATCH --cpus-per-task=16
#
#SBATCH --mail-type=none
#SBATCH --mail-user=<userid>@rzg.mpg.de
# Wall clock limit:
#SBATCH --time=01:00:00
# Loading necessary modules for Intel compilers
module purge
module load git/2.8
module load cmake/3.5
module load intel/16.0
module load mkl/11.3
module load impi/5.1.3
module load fftw/3.3.4
module load boost/intel/1.61
module load cuda/7.5
# Environment variables
export OMP_NUM_THREADS=16
export OMP_PLACES=cores
export FFTALGO=1
export GPU=1
export GPUDEVICE=-1
# For hyperthreading, but it actually degrades performance
#export OMP_NUM_THREADS=32
#export OMP_PLACES=threads
#export SLURM_HINT=multithread
# Environment variable to tune
export GPUWORKLOAD=90
export BIOEM_DEBUG_OUTPUT=0
export BIOEM_AUTOTUNING=0
# Paths
INPUT_DIR="$HOME/BioEM_project/inputs"
BUILD_DIR="$HOME/BioEM_project/build"
datafile="$HOME/BioEM_project/data/draco_$BIOEM_AUTOTUNING.org"
> $datafile
# Environment capture
$HOME/BioEM_project/get_info.sh $datafile
echo "* BIOEM OUTPUT" >> $datafile
# Running full example
cd /ptmp/$USER/
srun -n 8 $BUILD_DIR/bioEM --Inputfile $INPUT_DIR/INPUT_FRH_Sep2016 --Modelfile $INPUT_DIR/Mod_X-ray_PDB --Particlesfile $INPUT_DIR/2000FRH_Part >> $datafile
#+END_SRC
- Later just do /sbatch batch_script_draco/
** General configuration remarks
- Choose the number of computing nodes, N
- Check the number of cores on every node, C
......@@ -831,6 +887,14 @@ done
- The number of cores per task and OMP_NUM_THREADS (?/--cpus-per-task) is C/G (see the sketch below)
+ If OMP_NUM_THREADS is larger than the maximum number of cores, the behaviour is the same as with C/G
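- A minimal sketch of these rules as SLURM settings (assumed values for a draco GPU node: C=32 cores and G=2 GPUs per node, i.e. the same layout as the draco batch script above):
#+BEGIN_SRC
#SBATCH --nodes=4                 # N computing nodes
#SBATCH --ntasks-per-node=2       # one MPI task per GPU, i.e. G tasks per node
#SBATCH --cpus-per-task=16        # C/G cores per task
export OMP_NUM_THREADS=16         # matches --cpus-per-task, i.e. C/G
#+END_SRC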
** General compilation remarks
1. When compiling with Intel compilers, the following options are needed for the cmake command:
-DMPI_C_COMPILER=mpiicc -DMPI_CXX_COMPILER=mpiicpc -DCMAKE_CXX_COMPILER=icpc
2. When compiling with GCC compilers, the CUDA_HOST_COMPILER variable needs to be manually set to gcc 5.4 (even though the gcc/6.3 module is loaded). Additionally, the following options are needed for the cmake command (a combined sketch is given below):
-DMPI_C_COMPILER=mpigcc -DMPI_CXX_COMPILER=mpicxx -DCMAKE_CXX_COMPILER=g++
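A combined sketch for the GCC case (the gcc 5.4 path is the one from the SLES12.2 software tree used in the recipes above; adjust it to the local installation):
#+BEGIN_SRC
# In the BioEM CMakeLists.txt (remark 2), pin the CUDA host compiler to gcc 5.4:
#   set (CUDA_HOST_COMPILER /mpcdf/soft/SLES122/common/gcc/5.4.0/bin/gcc)
# instead of
#   set (CUDA_HOST_COMPILER gcc)

# Then configure with the GCC/MPI wrappers (remark 2):
cmake -DMPI_C_COMPILER=mpigcc -DMPI_CXX_COMPILER=mpicxx -DCMAKE_CXX_COMPILER=g++ \
      -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
#+END_SRC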
* Summaries, papers, reports
=put here links to potential external repositories=
......@@ -1629,8 +1693,10 @@ ggplot(df, aes(x=Workload, y=Time, fill=Workload)) + geom_bar(stat="identity") +
#+RESULTS:
[[file:analysis/results1_analysis.pdf]]
** 2017-06-19
*** TODO Autotuning [6/9]:
*** TODO Autotuning [6/8]:
:LOGBOOK:
- State "TODO" from "TODO" [2017-06-30 Fri 10:49]
- State "TODO" from "TODO" [2017-06-30 Fri 10:49]
- State "TODO" from "TODO" [2017-06-23 Fri 15:46]
- State "TODO" from "TODO" [2017-06-23 Fri 12:58]
- State "TODO" from "TODO" [2017-06-23 Fri 09:52]
......@@ -1679,6 +1745,7 @@ ggplot(df, aes(x=Workload, y=Time, fill=Workload)) + geom_bar(stat="identity") +
- [X] Problems with installing BioEM on the hydra machine with the Intel recipe that used to work. The issues seem unrelated to the autotuning changes, rather something regarding bio:configure and reading the parameters. Actually, Luka was previously not using the right Intel compilers; with the correct ones everything compiles fine
- [X] Need to find a proper way of handling errors in the code
- [ ] Need to do a nice cleanup before merging into the main project
+ Check if it is OK to add workload information to the "TimeComparison" line
- [ ] Add nice printf for writing the Optimal Workload
- [ ] Add more profound CUDA profiling, possibly using specialized CUDA tools for that. We will certainly need it in the future when doing further development in BioEM
- [ ] Ensure that pinning is done correctly (in the Intel case there shouldn't be any problem)
......@@ -1730,14 +1797,17 @@ ggplot(df, aes(x=Algorithm, y=Time, fill=Workload)) + geom_bar(stat="identity")
- Running experiments on 4 nodes on machines hydra, draco, phys
- [ ] Strange error on the phys machine where no error is supposed to occur (just after the computation). Temporarily disabled the check to make it work, but this needs to be investigated further
** 2017-06-22
*** TODO BioEM code errors regrouped [0/2]
*** TODO BioEM code errors regrouped [1/2]
:LOGBOOK:
- State "TODO" from "TODO" [2017-06-29 Thu 15:29]
- State "TODO" from "TODO" [2017-06-22 Thu 10:51]
- State "TODO" from "TODO" [2017-06-22 Thu 10:51]
- State "TODO" from [2017-06-22 Thu 09:57]
:END:
- [ ] Strange error on the phys machine where no error is supposed to occur (just after the CUDA computation). It is happening in bioem_cuda.cu:300, although the error code is 0 (which normally means cudaSuccess, hence no error). Temporarily disabled the check to make it work, but this needs to be investigated further
- [ ] It still seems that initializing device 1 and then device 0 causes a problem (initializing 0 then 1 seems to be fine). Need to inspect this problem in more detail
+ The error code was 0 because cudaGetLastError() was resetting it. Hence, using cudaPeekAtLastError() might be better
+ Actually, cudaPeekAtLastError() shows that the error was CUDA_ERROR_INVALID_DEVICE
- [X] It still seems that initializing device 1 and then device 0 causes a problem (initializing 0 then 1 seems to be fine). Need to inspect this problem in more detail
+ Markus thinks that this may be related to the way CUDA is configured on the /dvl/ machine. They enabled the special MPS mode, which for some unknown reason is causing trouble for the BioEM code.
+ We will need to inspect this more and check this hypothesis (a quick check is sketched below)
+ If this is true, additional code is needed in BioEM to make sure it can work on similar machines
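+ A quick, hypothetical check of this hypothesis (nvidia-smi reports the compute mode, and the MPS control daemon is visible as a process):
#+BEGIN_SRC
# Report the compute mode of each GPU (Default / Exclusive_Process / Prohibited)
nvidia-smi --query-gpu=index,name,compute_mode --format=csv
# Check whether the MPS control daemon is running
ps -ef | grep [n]vidia-cuda-mps
#+END_SRC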
......@@ -1881,3 +1951,214 @@ ggplot(df, aes(x=Algorithm, y=Time, fill=Workload)) + geom_bar(stat="identity")
#+RESULTS:
[[file:analysis/draco_workload0.pdf]]
** 2017-06-26
*** TODO Workload 87% issue
:LOGBOOK:
- State "TODO" from [2017-06-29 Thu 15:01]
:END:
- Strange observation on the draco machines: for workload 87% (as well as 86%) there is a huge variability in the Comparison duration
- Not sure why it is happening; maybe it is related to the part executed on the CPU
- Could it be CPU frequency scaling?
+ That would explain the issue and the results
+ However, draco nodes normally run in /performance/ mode, so these things should not occur
- Tried to experimentally reproduce the same problem on the dvl machine, but everything was stable
- [X] Wait until the draco machine is back with the new OS, reproduce the experiments and see if the issue persists
+ If needed, check PAPI counters, look at the frequency, profile in more detail
- Can compute how much work was performed on the GPU and how much on the CPUs, then run separate experiments putting exactly that amount of workload on each resource. This is quite easy to do by commenting out parts of the GPU/CPU code (although the final results will be wrong)
+ Commenting out the OMP code in bioem_cuda.cu:369 doesn't cause any problems for the execution
+ Setting the number of iterations to 0 in the GPU code in bioem_cuda.cu:283 doesn't cause any problems either, and it gives an estimate of the OMP execution duration
- No need for a large execution, as the problem is quite stable. So decrease the number of MPI nodes and the size of the problem.
- Actually, after the update draco seems to be in /powersave/ mode, so performance is quite stable. However, the optimal workload value could now be quite different, as the OMP part is not as performant as it used to be
- [ ] Discuss draco governor modes with Christian (a quick check is sketched below)
- [ ] Explain to Pilar in the BioEM issue
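- A minimal sketch of that check (standard sysfs path; the srun flags are only an assumption for a small interactive draco allocation):
#+BEGIN_SRC
# Print the scaling governor in use on the cores of one compute node
srun -N 1 -n 1 bash -c 'cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor | sort | uniq -c'
#+END_SRC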
*** TODO EXCLUSIVE_MODE on dvl GPUs issue
:LOGBOOK:
- State "TODO" from [2017-06-29 Thu 15:02]
:END:
- Exclusive mode is used to be able to benefit from Nvidia MPS
- Normally it should not compromise standard CUDA code, but for some reason there are problems with BioEM on dvl machine
- [ ] Do we want MPS implementation of BioEM one day?
- [X] The cuCtxCreate function was causing errors, maybe because only one context is allowed with EXCLUSIVE_PROCESS and one already exists. Need to verify this hypothesis (verified: yes)
- Added computeMode information to the output. It returns the value "3", which does not appear in the enum listing below; it corresponds to cudaComputeModeExclusiveProcess (= 3)
+ enum cudaComputeMode {
+ cudaComputeModeDefault = 0,
+ cudaComputeModeExclusive = 1,
+ cudaComputeModeProhibited = 2 }
- A lot of problems; other people have encountered the same issues
- In the end, it seems that for CUDA versions > 6.5 we simply *mustn't* call cudaSetDevice(), but only cudaDeviceSynchronize()
- Explanation from https://github.com/kaldi-asr/kaldi/issues/1487
+ Unfortunately they don't say what cudaSetDevice does if it is
called on a device with computeMode
cudaComputeModeExclusiveProcess that is already occupied by
another process . Now I would expect it to return
cudaErrorDeviceAlreadyInUse indicating the device cannot be
used, but that is not what happens. In fact it returns
cudaSuccess. They again don't say what subsequent non-device
management runtime functions return when an occupied
"Exclusive Process" mode device is chosen with cudaSetDevice
from a different process. This time they do what I expect them
to do and return cudaErrorDevicesUnavailable. The problem is
they continue to return cudaErrorDevicesUnavailable even if
the device is no longer occupied. I tried calling
cudaDeviceReset after every failed context creation attempt
but that did not change anything. I tried calling cudaFree(0)
instead of cudaDeviceSynchronize() but that did not change
anything either. I also tried a bunch of other hacks that I
can not remember now but nothing helped. It seems like there
is nothing we can do at this point other than submitting a bug
report to NVIDIA and wait for them to fix it. If anyone
reading this can think of anything to work around this issue
in the meantime, let me know.
+ BTW, someone here
https://devtalk.nvidia.com/default/topic/415723/choosing-cuda-device-programmatically/
is saying that when using compute exclusive mode, you should
just not call cudaSetDevice-- presumably it will automatically
pick a free device when you call
cudaDeviceSynchronize(). Interestingly, this is how the code
*originally* worked. However, after a certain version of the
CUDA toolkit, the original code stopped working, and that's
when we added the iteration over devices and the calls to
cudaSetDevice. One possibility is to query the version of the
CUDA toolkit, and if it's >= 8.0, go back to the original
strategy of trying to call cudaDeviceSynchronize() and
checking the error status.
- Another interesting discussion: https://serverfault.com/questions/377005/using-cuda-visible-devices-with-sge
- Other people complaining about the CUDA bug on this matter:
+ https://devtalk.nvidia.com/default/topic/869602/cuda-7-0-and-compute-exclusive-mode-on-multi-gpu-machine/
+ https://devtalk.nvidia.com/default/topic/857233/dual-gpu-cuda-6-5-vs-7-0/
- For checking different types of CUDA errors, consult this explanation: https://devblogs.nvidia.com/parallelforall/how-query-device-properties-and-handle-errors-cuda-cc/
+ Basically both cudaGetLastError() and cudaDeviceSynchronize() are needed, for synchronous and asynchronous errors
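+ A minimal sketch of that pattern (generic kernel launch, not the BioEM code itself):
#+BEGIN_SRC
// After a kernel launch, check both error classes:
someKernel<<<gridDim, blockDim>>>(args);
cudaError_t errSync  = cudaGetLastError();       // synchronous (launch/configuration) errors
cudaError_t errAsync = cudaDeviceSynchronize();  // asynchronous (execution) errors
if (errSync != cudaSuccess)  printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess) printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
#+END_SRC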
- Conclusion: when CUDA is set to EXCLUSIVE_PROCESS mode, processes
are directly and automatically mapped to different GPU devices. In
some cases BioEM tries, for CUDA process 0 running on GPU 0, to set
its device to GPU 1. This creates an error, since there is already a
running CUDA process 1 on GPU 1. However, when the compute mode is
DEFAULT, this call is very much needed. Hence, do SetDevice only if
computeMode is DEFAULT
- [X] CUDA processes are actually created with pProb.init. Is it supposed to be like that?
+ Probably not, it is better to move this code
- Also CUDA has bugs regarding SetDevice errors
- BTW: for the BioEM code, GPUs perform 8.5% worse when run in EXCLUSIVE_MODE compared to DEFAULT mode
- Proposed solution (actually, it is better to do even more):
#+BEGIN_SRC
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, bestDevice);
// Only call cudaSetDevice() when the device runs in DEFAULT compute mode;
// in EXCLUSIVE_PROCESS mode the device is assigned automatically.
if (deviceProp.computeMode == cudaComputeModeDefault)
{
    checkCudaErrors(cudaSetDevice(bestDevice));
}
else
{
    printf("CUDA device %d is not set in DEFAULT mode, make sure that processes are correctly pinned!\n", bestDevice);
    checkCudaErrors(cudaDeviceSynchronize());
}
#+END_SRC
- [ ] Test new solution on several machines, with different inputs
- [ ] Get the right patch, send it to the master BioEM project
- [X] Write explanations to Pilar
- [ ] Close the issue on BioEM_fork
- After Application group meeting, Andy said that NVidia clearly states that EXCLUSIVE_MODE should only be used with MPS
** 2017-06-27
*** New recipes for draco (not working)
- Installation with Intel modules
#+BEGIN_SRC
module purge
module load git/2.13
module load cmake/3.7
module load intel/17.0
module load impi/2017.3
module load fftw/3.3.6
module load boost/intel/1.64
module load cuda/8.0
# Paths
SRC_DIR="$afsdir/BioEM_fork"
BUILD_DIR="$HOME/BioEM_project/build"
mkdir -p $BUILD_DIR
cd $BUILD_DIR
# Deleting files from previous installations
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY and CUDA_SDK_ROOT_DIR)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=ON $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC
- Problems compiling with Intel (seems to be related to FFTW-Intel compatibility)
#+BEGIN_SRC
/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/bin/ld: /mpcdf/soft/SLES122/HSW/fftw/3.3.6-pl2/intel-17.0/impi-2017.3/lib/libfftw3f.a(trig.o): undefined reference to symbol '__libm_sse2_sincos'
/mpcdf/soft/SLES122/common/intel/ps2017.4/17.0/linux/compiler/lib/intel64/libimf.so: error adding symbols: DSO missing from command line
#+END_SRC
- When the bioem module was installed on the draco machine (it probably works), it was built with boost/1.54. However, the problem really looks like it is coming from FFTW
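- A possible (untested) workaround for the /__libm_sse2_sincos/ error would be to link the Intel math library explicitly, e.g. via CMAKE_EXE_LINKER_FLAGS (the library path is the one reported in the error message above):
#+BEGIN_SRC
cmake -DMPI_C_COMPILER=mpiicc -DMPI_CXX_COMPILER=mpiicpc -DCMAKE_CXX_COMPILER=icpc \
      -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DCUDA_FORCE_GCC=ON \
      -DCMAKE_EXE_LINKER_FLAGS="-L/mpcdf/soft/SLES122/common/intel/ps2017.4/17.0/linux/compiler/lib/intel64 -limf" \
      $SRC_DIR/
#+END_SRC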
- Installation with gcc modules
#+BEGIN_SRC
module purge
module load git/2.13
module load cmake/3.7
module load gcc/6.3
module load impi/2017.3
module load fftw/gcc/3.3.6
module load boost/gcc/1.64
module load cuda/8.0
# Paths
SRC_DIR="$afsdir/BioEM_fork"
BUILD_DIR="$HOME/BioEM_project/build"
mkdir -p $BUILD_DIR
cd $BUILD_DIR
# Deleting files from previous installations
rm -rf $BUILD_DIR/*
rm -rf $SRC_DIR/CMakeFiles $SRC_DIR/CMakeCache.txt $SRC_DIR/Makefile $SRC_DIR/cmake_install.cmake
# Configuration and compilation (need to manually add CUDA_rt_LIBRARY and CUDA_SDK_ROOT_DIR)
cmake -DUSE_MPI=ON -DUSE_OPENMP=ON -DUSE_CUDA=ON -DPRINT_CMAKE_VARIABLES=ON -DCUDA_FORCE_GCC=OFF $SRC_DIR/
make -j5 VERBOSE=1
#+END_SRC