From a02901d9423f2bc88199ffd2889aa23989952571 Mon Sep 17 00:00:00 2001
From: Nastassya Horlava <nastassya.horlava@mpcdf.mpg.de>
Date: Mon, 16 Jun 2025 15:41:22 +0200
Subject: [PATCH] viper examples

---
 .../run_distributed_1_node_multi_gpu.slurm     | 32 ++++++++++++++++++
 ...run_distributed_multi_node_multi_gpu.slurm  | 31 +++++++++++++++++
 tensorflow/viper/tf-2.16-recipe.def            |  1 +
 3 files changed, 64 insertions(+)
 create mode 100644 tensorflow/viper/run_distributed_1_node_multi_gpu.slurm
 create mode 100644 tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm

diff --git a/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm
new file mode 100644
index 0000000..b91296f
--- /dev/null
+++ b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm
@@ -0,0 +1,32 @@
+#!/bin/bash -l
+#SBATCH -o logs/%j_multigpu.log
+#SBATCH -e logs/%j_multigpu.log
+#SBATCH -J tf_synth
+#SBATCH --nodes=1             # request a full node
+#SBATCH --ntasks-per-node=1   # start only 1 task via srun; Python multiprocessing spawns further tasks internally
+#SBATCH --ntasks-per-socket=1
+#SBATCH --cpus-per-task=48    # assign all cores to that task to make room for Python's multiprocessing tasks
+#SBATCH --constraint="apu"
+#SBATCH --gres=gpu:2
+#SBATCH --mem=0
+#SBATCH --time=02:15:00
+
+module purge
+module load apptainer/1.4.1
+
+# Recover the path of this batch script from the job info
+export SBATCH_SCRIPT_PATH=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*' | sed -E "s|/$(echo "$SLURM_CELL" | tr '[:upper:]' '[:lower:]')||" | sed -E 's|^/u[12]/|/u/|')
+
+
+sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
+code_dir="$(dirname "$(dirname "$SBATCH_SCRIPT_PATH")")/src"
+
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+
+export batch_size_per_device="--batch_size_per_device 256"
+
+srun apptainer exec --nv "$sif_file" bash -c """
+    export RANK=\${SLURM_PROCID}
+
+    python $code_dir/train_synthetic.py train $batch_size_per_device
+    """
diff --git a/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm
new file mode 100644
index 0000000..070f864
--- /dev/null
+++ b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm
@@ -0,0 +1,31 @@
+#!/bin/bash -l
+#SBATCH -o logs/%j_multinode.log
+#SBATCH -e logs/%j_multinode.log
+#SBATCH -J tf_synth
+#SBATCH --nodes=2             # request multiple nodes
+#SBATCH --ntasks-per-node=1   # start only 1 task via srun; Python multiprocessing spawns further tasks internally
+#SBATCH --ntasks-per-socket=1
+#SBATCH --cpus-per-task=48    # assign all cores to that task to make room for Python's multiprocessing tasks
+#SBATCH --constraint="apu"
+#SBATCH --gres=gpu:2
+#SBATCH --mem=0
+#SBATCH --time=02:15:00
+
+module purge
+module load apptainer/1.4.1
+
+# Recover the path of this batch script from the job info
+export SBATCH_SCRIPT_PATH=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*' | sed -E "s|/$(echo "$SLURM_CELL" | tr '[:upper:]' '[:lower:]')||" | sed -E 's|^/u[12]/|/u/|')
+
+sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
+code_dir="$(dirname "$(dirname "$SBATCH_SCRIPT_PATH")")/src"
+
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+
+export batch_size_per_device="--batch_size_per_device 256"
+PRE_RUN="source ${code_dir}/set_tf_config_multiple_nodes.sh && echo \$TF_CONFIG && export RANK=\${SLURM_PROCID}"
+
+srun bash -c """
+    ${PRE_RUN} &&
+    apptainer exec --nv $sif_file python $code_dir/train_synthetic.py train $batch_size_per_device
+    """
diff --git a/tensorflow/viper/tf-2.16-recipe.def b/tensorflow/viper/tf-2.16-recipe.def
index 924b4e9..d795e94 100644
--- a/tensorflow/viper/tf-2.16-recipe.def
+++ b/tensorflow/viper/tf-2.16-recipe.def
@@ -4,6 +4,7 @@ From: rocm/tensorflow:rocm6.3.3-py3.12-tf2.16-dev
 %post
 python -m pip install --upgrade pip
 pip install ipython ipykernel
+pip install pandas
 pip install click
 
 %environment
--
GitLab
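
The multi-node job script sources set_tf_config_multiple_nodes.sh from the
../src directory, a helper this patch does not include. As a rough guide to
what that helper has to produce, here is a minimal sketch that builds the
TF_CONFIG JSON expected by tf.distribute.MultiWorkerMirroredStrategy. The
hostnames come from SLURM; the port (2222) and the one-worker-per-node layout
are assumptions for illustration, not taken from the patch.

#!/bin/bash
# Sketch of set_tf_config_multiple_nodes.sh -- illustrative only, not the
# helper shipped in ../src. Assumes one worker task per node (matching
# --ntasks-per-node=1 above) and that port 2222 is free on every node.

# Expand the compact SLURM hostlist (e.g. "node[01-02]") into an array of
# hostnames, one entry per node.
nodes=($(scontrol show hostnames "$SLURM_JOB_NODELIST"))

# Render the worker list as "host1:2222","host2:2222",...
workers=$(printf '"%s:2222",' "${nodes[@]}")
workers=${workers%,}   # strip the trailing comma

# SLURM_PROCID is this task's rank; with one task per node it doubles as
# the worker index in the cluster spec.
export TF_CONFIG='{"cluster": {"worker": ['"${workers}"']}, "task": {"type": "worker", "index": '"${SLURM_PROCID}"'}}'

Because the helper is sourced inside the srun task (via PRE_RUN), SLURM_PROCID
is already set per task, so each worker receives its own index in TF_CONFIG.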