diff --git a/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..b91296f65f44980f22d13d9b211e0f59261f55b9
--- /dev/null
+++ b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm
@@ -0,0 +1,32 @@
+#!/bin/bash -l
+#SBATCH -o logs/%j_multigpu.log
+#SBATCH -e logs/%j_multigpu.log
+#SBATCH -J tf_synth
+#SBATCH --nodes=1              # request a full node
+#SBATCH --ntasks-per-node=1    # start only 1 task via srun; Python multiprocessing spawns additional processes internally
+#SBATCH --ntasks-per-socket=1
+#SBATCH --cpus-per-task=48     # assign all cores to that one task to make room for Python's multiprocessing workers
+#SBATCH --constraint="apu"
+#SBATCH --gres=gpu:2
+#SBATCH --mem=0
+#SBATCH --time=02:15:00
+
+module purge
+module load apptainer/1.4.1
+
+# Get the path of this script from the job info (strip the cell name and normalize /u1 or /u2 to /u)
+export SBATCH_SCRIPT_PATH=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*' | sed -E "s|/$(echo "$SLURM_CELL" | tr '[:upper:]' '[:lower:]')||" | sed -E 's|^/u[12]/|/u/|')
+
+
+sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
+code_dir="$(dirname $(dirname "$SBATCH_SCRIPT_PATH"))/src"
+
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+
+export batch_size_per_device="--batch_size_per_device 256"
+
+srun apptainer exec --nv $sif_file bash -c """
+    export RANK=\${SLURM_PROCID}
+
+    python $code_dir/train_synthetic.py train $batch_size_per_device
+    """
diff --git a/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..070f864a79fa624d025c01c995b2dfcf7f0cba17
--- /dev/null
+++ b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm
@@ -0,0 +1,32 @@
+#!/bin/bash -l
+#SBATCH -o logs/%j_multinode.log
+#SBATCH -e logs/%j_multinode.log
+#SBATCH -J tf_synth
+#SBATCH --nodes=2              # request multiple nodes
+#SBATCH --ntasks-per-node=1    # start only 1 task per node via srun; Python multiprocessing spawns additional processes internally
+#SBATCH --ntasks-per-socket=1
+#SBATCH --cpus-per-task=48     # assign all cores to that one task to make room for Python's multiprocessing workers
+#SBATCH --constraint="apu"
+#SBATCH --gres=gpu:2
+#SBATCH --mem=0
+#SBATCH --time=02:15:00
+
+module purge
+module load apptainer/1.4.1
+
+# Get the path of this script from the job info (strip the cell name and normalize /u1 or /u2 to /u)
+export SBATCH_SCRIPT_PATH=$(scontrol show job "$SLURM_JOBID" | grep -oP 'Command=\K.*' | sed -E "s|/$(echo "$SLURM_CELL" | tr '[:upper:]' '[:lower:]')||" | sed -E 's|^/u[12]/|/u/|')
+
+sif_file="$(dirname "$SBATCH_SCRIPT_PATH")/tf-2.16.sif"
+code_dir="$(dirname $(dirname "$SBATCH_SCRIPT_PATH"))/src"
+
+export TF_FORCE_GPU_ALLOW_GROWTH=true
+
+export batch_size_per_device="--batch_size_per_device 256"
+PRE_RUN="source ${code_dir}/set_tf_config_multiple_nodes.sh && echo \$TF_CONFIG && export RANK=\${SLURM_PROCID}"
+
+srun bash -c """
+    ${PRE_RUN} &&
+    export RANK=\${SLURM_PROCID} &&
+    apptainer exec --nv $sif_file python $code_dir/train_synthetic.py train $batch_size_per_device
+    """
diff --git a/tensorflow/viper/tf-2.16-recipe.def b/tensorflow/viper/tf-2.16-recipe.def
index 924b4e9ea1682e09efdcdc4b934b979941ab10ba..d795e9426bb6c0f5faa546a6e1a3cc77c0255948 100644
--- a/tensorflow/viper/tf-2.16-recipe.def
+++ b/tensorflow/viper/tf-2.16-recipe.def
@@ -4,6 +4,7 @@ From: rocm/tensorflow:rocm6.3.3-py3.12-tf2.16-dev
 %post
 python -m pip install --upgrade pip
 pip install ipython ipykernel
+pip install pandas
 pip install click
 
 %environment
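
Note on TF_CONFIG: the multi-node script sources set_tf_config_multiple_nodes.sh, which is not part of this diff. For orientation only, a minimal sketch of what such a helper typically does, assuming TensorFlow's tf.distribute.MultiWorkerMirroredStrategy (which reads the TF_CONFIG JSON); the port number and the workers variable are illustrative, not taken from the repository:

    #!/bin/bash
    # Illustrative sketch only -- not the actual set_tf_config_multiple_nodes.sh.
    # Builds the TF_CONFIG JSON that tf.distribute.MultiWorkerMirroredStrategy
    # expects: a "cluster" listing every worker as host:port, plus this task's index.
    port=29500                                    # assumption: any free, cluster-internal port
    workers=""
    for host in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do
        workers="${workers}\"${host}:${port}\","  # append "host:port",
    done
    workers=${workers%,}                          # drop the trailing comma
    # SLURM_PROCID is set per task by srun and doubles as the worker index.
    export TF_CONFIG="{\"cluster\": {\"worker\": [${workers}]}, \"task\": {\"type\": \"worker\", \"index\": ${SLURM_PROCID}}}"

Because PRE_RUN is evaluated inside srun's bash -c, each of the two tasks sees its own SLURM_PROCID and so gets a distinct task index, while the cluster spec is identical on both nodes. The single-node script exports no TF_CONFIG, which suggests train_synthetic.py falls back to a single-worker strategy there.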