diff --git a/tensorflow/viper/README.md b/tensorflow/viper/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..eeae3a171ac14f3aa312e2e40c8981c64c8bf0a1 100644 --- a/tensorflow/viper/README.md +++ b/tensorflow/viper/README.md @@ -0,0 +1,48 @@ +# TensorFlow on Viper + +Use ROCm containers: https://hub.docker.com/r/rocm/tensorflow + +## BUILD + +1. Load the latest Apptainer module, e.g. + ```bash + module load apptainer/1.4.1 + ``` +2. Build your container: + + ```bash + apptainer build amd-tensorflow.sif amd-tensorflow.def + ``` + +## RUN + +* To run the training script on synthetic data using ResNet50 in an **undistributed** fashion, execute: + + ```bash + sbatch run_undistributed.slurm + ``` + +* To run it in a **distributed** fashion on a **single node with multiple GPUs**, execute: + + ```bash + sbatch run_distributed_1_node_multi_gpu.slurm + ``` + +* To run it in a **distributed** fashion across **multiple nodes**, execute: + + ```bash + sbatch run_distributed_multi_node_multi_gpu.slurm + ``` + +### Results + +Container: rocm/tensorflow:rocm6.3.3-py3.12-tf2.16-dev + +#### (local) batch_size=256 +> `CONF` = 1.96 * std + +|NNODES|NGPUS|IPS|CONF| +|-|-|-|-| +|1|1|1360.7|34.5| +|1|2|1204.8|32.7| +|2|4|2263.4|100.7| diff --git a/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm index bcb5299c15efa4db23ca761bdcaf79c9fbd7b226..4f2dde2496e5afad278de3fce7d11413bb0d1906 100644 --- a/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm +++ b/tensorflow/viper/run_distributed_1_node_multi_gpu.slurm @@ -18,7 +18,7 @@ module load apptainer/1.4.1 CONTAINER="amd-tensorflow.sif" export TF_FORCE_GPU_ALLOW_GROWTH=true -srun apptainer exec --rocm -B ../src/:/workspace/ ${CONTAINER} bash -c """ +srun apptainer exec -B ../src/:/workspace/ ${CONTAINER} bash -c """ export RANK=\${SLURM_PROCID} python /workspace/train_synthetic.py train diff --git a/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm 
b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm index 4d01ae4f6a5192f6ee3e435cf28426a87f1c27a6..00b12295301e26282e4dad10e388fe2bd71dc5f0 100644 --- a/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm +++ b/tensorflow/viper/run_distributed_multi_node_multi_gpu.slurm @@ -18,7 +18,7 @@ module load apptainer/1.4.1 CONTAINER="amd-tensorflow.sif" export TF_FORCE_GPU_ALLOW_GROWTH=true -PRE_RUN="source /workspace/set_tf_config_multiple_nodes.sh && echo \$TF_CONFIG && export RANK=\${SLURM_PROCID}" +PRE_RUN="source ../src/set_tf_config_multiple_nodes.sh && echo \$TF_CONFIG && export RANK=\${SLURM_PROCID}" srun bash -c """ ${PRE_RUN} && diff --git a/tensorflow/viper/run_undistributed.slurm b/tensorflow/viper/run_undistributed.slurm index d7a73b008462ff96ba79d9749988ffe715fee8c6..312f260dcd41dda6edd1245ddc26b718d71346c4 100644 --- a/tensorflow/viper/run_undistributed.slurm +++ b/tensorflow/viper/run_undistributed.slurm @@ -19,7 +19,7 @@ module load apptainer/1.4.1 CONTAINER="amd-tensorflow.sif" export TF_FORCE_GPU_ALLOW_GROWTH=true -srun apptainer exec --rocm -B ../src/:/workspace/ ${CONTAINER} bash -c """ +srun apptainer exec -B ../src/:/workspace/ ${CONTAINER} bash -c """ export RANK=\${SLURM_PROCID} python /workspace/train_synthetic.py train