Skip to content
Snippets Groups Projects
Commit a02901d9 authored by Nastassya Horlava's avatar Nastassya Horlava
Browse files

viper examples

parent 70765b34
No related branches found
No related tags found
1 merge request!4Docs tensorflow
#!/bin/bash -l
#SBATCH -o logs/%j_multigpu.log
#SBATCH -e logs/%j_multigpu.log
#SBATCH -J tf_synth
#SBATCH --nodes=1 # request a full node
#SBATCH --ntasks-per-node=1 # only start 1 task via srun because Python multiprocessing starts more tasks internally
#SBATCH --ntasks-per-socket=1
#SBATCH --cpus-per-task=48 # assign all the cores to that first task to make room for Python's multiprocessing tasks
#SBATCH --constraint="apu"
#SBATCH --gres=gpu:2
#SBATCH --mem=0
#SBATCH --time=02:15:00
#
# Single-node, multi-GPU run of train_synthetic.py inside the tf-2.16.sif
# container. The container image is expected next to this script and the
# Python sources in ../src relative to it.

module purge
module load apptainer/1.4.1

#######################################
# Rewrite the script path reported by scontrol.
# Arguments: $1 - path taken from the job's "Command=" field
#            $2 - cell name (any case); may be empty
# Outputs:   the rewritten path on stdout
#######################################
normalize_script_path() {
  local path=$1
  local cell=${2,,}
  local needle
  # Drop the first "/<cell>" occurrence. Skipped for an empty cell name:
  # an empty pattern would otherwise strip the leading "/" and silently
  # turn the absolute path into a relative one.
  if [[ -n "$cell" ]]; then
    needle="/$cell"
    path=${path/"$needle"/}
  fi
  # Map a leading /u1/ or /u2/ to /u/.
  if [[ "$path" == /u[12]/* ]]; then
    path="/u/${path#/u[12]/}"
  fi
  printf '%s\n' "$path"
}

# Get the command line from the job info
job_command=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*')
# Assign first, export afterwards, so the substitution's status is not masked.
SBATCH_SCRIPT_PATH=$(normalize_script_path "$job_command" "$SLURM_CELL")
export SBATCH_SCRIPT_PATH

sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
code_dir="$(dirname "$(dirname "$SBATCH_SCRIPT_PATH")")/src"

export TF_FORCE_GPU_ALLOW_GROWTH=true
# Flag and value in one string; it is word-split on purpose by the inner shell.
export batch_size_per_device="--batch_size_per_device 256"

# $code_dir and $batch_size_per_device are expanded here, by the batch shell;
# \${SLURM_PROCID} is escaped so it is expanded per task by the inner shell.
# NOTE(review): --nv is Apptainer's NVIDIA flag; if tf-2.16.sif is a ROCm image
# running on AMD APUs, --rocm may be what is intended — confirm.
srun apptainer exec --nv "$sif_file" bash -c "
export RANK=\${SLURM_PROCID}
python \"$code_dir/train_synthetic.py\" train $batch_size_per_device
"
#!/bin/bash -l
#SBATCH -o logs/%j_multinode.log
#SBATCH -e logs/%j_multinode.log
#SBATCH -J tf_synth
#SBATCH --nodes=2 # request multiple nodes
#SBATCH --ntasks-per-node=1 # only start 1 task via srun because Python multiprocessing starts more tasks internally
#SBATCH --ntasks-per-socket=1
#SBATCH --cpus-per-task=48 # assign all the cores to that first task to make room for Python's multiprocessing tasks
#SBATCH --constraint="apu"
#SBATCH --gres=gpu:2
#SBATCH --mem=0
#SBATCH --time=02:15:00
#
# Multi-node run of train_synthetic.py inside the tf-2.16.sif container.
# Each srun task sources set_tf_config_multiple_nodes.sh on the host, prints
# TF_CONFIG, sets RANK, and then starts the training inside the container.

module purge
module load apptainer/1.4.1

#######################################
# Rewrite the script path reported by scontrol.
# Arguments: $1 - path taken from the job's "Command=" field
#            $2 - cell name (any case); may be empty
# Outputs:   the rewritten path on stdout
#######################################
normalize_script_path() {
  local path=$1
  local cell=${2,,}
  local needle
  # Drop the first "/<cell>" occurrence. Skipped for an empty cell name:
  # an empty pattern would otherwise strip the leading "/" and silently
  # turn the absolute path into a relative one.
  if [[ -n "$cell" ]]; then
    needle="/$cell"
    path=${path/"$needle"/}
  fi
  # Map a leading /u1/ or /u2/ to /u/.
  if [[ "$path" == /u[12]/* ]]; then
    path="/u/${path#/u[12]/}"
  fi
  printf '%s\n' "$path"
}

# Get the command line from the job info
job_command=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*')
# Assign first, export afterwards, so the substitution's status is not masked.
SBATCH_SCRIPT_PATH=$(normalize_script_path "$job_command" "$SLURM_CELL")
export SBATCH_SCRIPT_PATH

sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
code_dir="$(dirname "$(dirname "$SBATCH_SCRIPT_PATH")")/src"

export TF_FORCE_GPU_ALLOW_GROWTH=true
# Flag and value in one string; it is word-split on purpose by the inner shell.
export batch_size_per_device="--batch_size_per_device 256"

# Per-task setup executed by the inner shell of every srun task. The escaped
# \$ expansions are evaluated there, not in this batch shell. RANK is exported
# here once; the payload below does not repeat it.
PRE_RUN="source \"${code_dir}/set_tf_config_multiple_nodes.sh\" && echo \"\$TF_CONFIG\" && export RANK=\${SLURM_PROCID}"

# NOTE(review): --nv is Apptainer's NVIDIA flag; if tf-2.16.sif is a ROCm image
# running on AMD APUs, --rocm may be what is intended — confirm.
srun bash -c "
${PRE_RUN} &&
apptainer exec --nv \"$sif_file\" python \"$code_dir/train_synthetic.py\" train $batch_size_per_device
"
......@@ -4,6 +4,7 @@ From: rocm/tensorflow:rocm6.3.3-py3.12-tf2.16-dev
%post
# Build-time package setup: upgrade pip first, then add the extra Python
# packages on top of what the base image already provides.
python -m pip install --upgrade pip
# Interactive / notebook tooling.
pip install ipython ipykernel
pip install pandas
# NOTE(review): click is presumably needed for the "train" sub-command of
# train_synthetic.py — confirm against the Python sources.
pip install click
%environment
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment