Skip to content
Snippets Groups Projects
Commit 351e0473 authored by Piero Coronica's avatar Piero Coronica
Browse files

upd APPT, rocm; fix rel imports

parent 71825cf4
Branches
Tags
No related merge requests found
# PyTorch on Raven
# PyTorch on Viper
Use ROCm containers: https://hub.docker.com/r/rocm/pytorch
......
BootStrap: docker
From: rocm/pytorch:rocm6.3.4_ubuntu24.04_py3.12_pytorch_release_2.4.0
From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.5.1
%labels
author Piero Coronica
version 1.0
%help
The converted Docker image from ROCm PyTorch (rocm 6.3.4, pytorch 2.4.0).
\ No newline at end of file
The converted Docker image from ROCm PyTorch (rocm 6.4.1, pytorch 2.5.1).
\ No newline at end of file
#!/bin/bash -l
# Slurm batch script: resource requests (#SBATCH directives), module setup,
# and the rendezvous variables exported for the distributed PyTorch run.
#
# NOTE(review): several directives below appear twice with different values
# (-o/-e, --nodes, --ntasks-per-node, --gres, --mem). This looks like the old
# and new lines of a diff rendered together -- confirm which set is intended
# before submitting.
#SBATCH -D ./
#SBATCH -o %j.out
#SBATCH -e %j.out
#SBATCH -o out/2_%j.out
#SBATCH -e out/2_%j.out
#SBATCH -J AMD-pt-synth
#SBATCH --time=0-00:10:00
#SBATCH -p apudev
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=2
#SBATCH --constraint="apu"
#SBATCH --gres=gpu:1
#SBATCH --mem=120000
#SBATCH --gres=gpu:2
#SBATCH --mem=0
#SBATCH --cpus-per-task=24
#SBATCH --threads-per-core=1
###### Environment
# Start from a clean module set, load Apptainer 1.4.1, and print the loaded
# modules to the job log.
module purge
module load apptainer/1.4.1
module list
###### Variables ######
# Container image passed to `apptainer exec` later in the script.
CONTAINER="amd-pytorch.sif"
echo $CONTAINER
### PT DIST settings
# MASTER_ADDR: first hostname obtained by expanding the job's node list.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# MASTER_PORT: 10000 plus the last four characters of the job ID, giving a
# port number that varies from job to job.
export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
......@@ -28,23 +34,18 @@ echo "MASTER_ADDR="$MASTER_ADDR
# Log the rendezvous settings. WORLD_SIZE is echoed here, but its assignment
# is not visible in this part of the script.
echo "MASTER_PORT="$MASTER_PORT
echo "WORLD_SIZE="$WORLD_SIZE
###### Environment
# Start from a clean module set, load Apptainer 1.3.2, and print the loaded
# modules to the job log.
module purge
module load apptainer/1.3.2
module list
# Log the job geometry: node count, total task count, and the node list.
echo -e "Nodes: ${SLURM_JOB_NUM_NODES} \t NTASK: ${SLURM_NTASKS}"
echo "${SLURM_NODELIST}"
echo
###### Run the program:
# Launch the benchmark through srun inside the container, with --rocm passed
# to `apptainer exec`. The inner script is a double-quoted string handed to
# `bash -c`, so every `\$` is escaped to defer expansion to the shell running
# inside the container rather than this batch shell. The inner script exports
# LOCAL_RANK and RANK from SLURM_LOCALID and SLURM_PROCID, prints the PyTorch
# build configuration and a per-task identification line, then runs
# resnet50-bench.py.
#
# NOTE(review): two `srun` lines appear back to back (the second adds a bind
# of ../scripts/ to /workspace/), and the echo/python lines inside the string
# are likewise paired. This looks like old/new diff lines rendered together;
# as written, the first line's trailing backslash joins both into a single
# command -- confirm which variant is intended.
srun apptainer exec --rocm ${CONTAINER} \
srun apptainer exec --rocm -B ../scripts/:/workspace/ ${CONTAINER}\
bash -c """
export NUM_GPUS=\$(( \$(echo \${HIP_VISIBLE_DEVICES} | tr -cd , | wc -c) + 1 ))
python3 -c 'import torch; print(torch.__config__.show())'
export LOCAL_RANK=\${SLURM_LOCALID}
export RANK=\${SLURM_PROCID}
echo \$SLURM_PROCID : \$(hostname) - \${SLURM_LOCALID} NUM_GPUS: \${NUM_GPUS} HIP_VISIBLE_DEVICES: \${HIP_VISIBLE_DEVICES}
echo \$SLURM_PROCID : \$(hostname) - \${SLURM_LOCALID} HIP_VISIBLE_DEVICES: \${HIP_VISIBLE_DEVICES}
python ../scripts/resnet50-bench.py
python3 /workspace/resnet50-bench.py
"""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment