diff --git a/pytorch/raven/README.md b/pytorch/raven/README.md
index 6817e1ddb676d5bd32e081c3aec787e2ae9a3665..eb455b617b4771a24f1629745368c5f482f715f3 100644
--- a/pytorch/raven/README.md
+++ b/pytorch/raven/README.md
@@ -6,29 +6,19 @@ Use NGC containers: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorc
 
 > HINT: Use a compute node, and export `APPTAINER_TMPDIR` and `APPTAINER_CACHEDIR` in `${JOB_SHMTMPDIR}/`. This will reduce **drastically** the creation of the SIF file and so the overall build time.
 
-```
-apptainer build pytorch-imagenet.sif pytorch-imagenet.def
-```
+```
+apptainer build nv-pytorch.sif nv-pytorch.def
+```
 
-|Node|APPTAINER_CACHEDIR|APPTAINER_TMPDIR|SIF FILE|JOBID|Time(min)|
-|-|-|-|-|-|-|
-|Login|ptmp|ptmp|ptmp|x|100|
-|CPU|ptmp|ptmp|ptmp|15741022|25|
-|CPU|ptmp|ptmp|shmem|15741568|25|
-|CPU|shmem|shmem|shmem|15740928|9|
-|CPU|shmem|shmem|ptmp|15742072|9|
-|CPU|shmem|ptmp|ptmp|15743064|26|
-|CPU|ptmp|shmem|ptmp|15743070|9|
+## RUN
+To submit the script:
 
-> **Internal comment**: In splush one can see the drastic difference in "GPFS /ptmp/ opens per node (per sample)" between the jobs above.
->
-> https://splush.mpcdf.mpg.de/en-GB/app/search/test_hpcmd3?form.timetoken.earliest=-24h%40h&form.timetoken.latest=now&form.machinetoken=raven&form.useridtoken=pierocor&form.yid=Select&form.jobidtoken=15741022
->
-> https://splush.mpcdf.mpg.de/en-GB/app/search/test_hpcmd3?form.timetoken.earliest=-24h%40h&form.timetoken.latest=now&form.machinetoken=raven&form.useridtoken=pierocor&form.yid=Select&form.jobidtoken=15742072
+```
+sbatch run.slurm
+```
 
-## RUN
-### synthetic-bench.py
+### Results
 
 > `CONF` = 1.96 * std
 
 |NGPUS|IPS|CONF|IPS_GPU|SU|
diff --git a/pytorch/raven/nv-pytorch.def b/pytorch/raven/nv-pytorch.def
index 3ee8dfc24849eac39f7b47e2aadbb655ac225958..2d298e2dcd43154408fdb4474e19a68144d1cb39 100644
--- a/pytorch/raven/nv-pytorch.def
+++ b/pytorch/raven/nv-pytorch.def
@@ -1,22 +1,9 @@
 BootStrap: docker
 From: nvcr.io/nvidia/pytorch:25.01-py3
 
-%files
-    ../get-training-script.sh /workspace/tools/
-
-%post
-    cd /workspace
-    source /workspace/tools/get-training-script.sh
-
-%environment
-
-%runscript
-
 %labels
     author Piero Coronica
     version 1.0
 
 %help
-    The converted Docker image from NVIDIAs PyTorch (version 25.01).
-
-%test
+    The converted Docker image from NVIDIA's PyTorch (version 25.01).
\ No newline at end of file
diff --git a/pytorch/raven/run.slurm b/pytorch/raven/run.slurm
index 0ca241c116cee082c1a073d251649db862e22a32..4e2b2fcb6c5eb0abf9f4e4ae5b4c4bf3bbc3e576 100644
--- a/pytorch/raven/run.slurm
+++ b/pytorch/raven/run.slurm
@@ -1,14 +1,13 @@
 #!/bin/bash -l
 #SBATCH -D ./
-#SBATCH -o _synth_4_4_%j.out
-#SBATCH -e _synth_4_4_%j.out
-#SBATCH -J pt-synth
+#SBATCH -o %j.out
+#SBATCH -e %j.out
+#SBATCH -J NV-pt-synth
 
 #SBATCH --time=0-00:5:00
 
 
-# #SBATCH -p gpudev
-#SBATCH --nodes=4
+#SBATCH --nodes=2
 #SBATCH --ntasks-per-node=4
 #SBATCH --constraint="gpu"
 
@@ -19,10 +18,7 @@
 
 
 ###### Variables ######
-CONTAINER="/ptmp/pierocor/containers/pytorch-24.11.sif"
-### Paths
-GPU_LOGS="_synth_${SLURM_JOB_ID}"
-
+CONTAINER="nv-pytorch.sif"
 ### PT DIST settings
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
@@ -38,32 +34,15 @@ module list
 
 echo -e "Nodes: ${SLURM_JOB_NUM_NODES} \t NTASK: ${SLURM_NTASKS}"
 echo "${SLURM_NODELIST}"
-
-# rocm-smi
-nvidia-smi
-
-# TODO:
-# export NCCL_DEBUG=INFO
-# export UCX_TLS=^cma
+echo
 
 ###### Run the program:
 
-printf '=%.0s' {1..100}; echo -e "\n\n${BACKEND}\n"; printf '=%.0s' {1..100}; echo
-set -o xtrace
-
 srun apptainer exec --nv ${CONTAINER} \
   bash -c """
     export NUM_GPUS=\$(( \$(echo \${CUDA_VISIBLE_DEVICES} | tr -cd , | wc -c) + 1 ))
     export LOCAL_RANK=\${SLURM_LOCALID}
     export RANK=\${SLURM_PROCID}
     echo \$SLURM_PROCID : \$(hostname) - \${SLURM_LOCALID} NUM_GPUS: \${NUM_GPUS} CUDA_VISIBLE_DEVICES: \${CUDA_VISIBLE_DEVICES}
-    if [ \${LOCAL_RANK} -ne 0 ]; then GPU_LOGS='/dev/null'; else GPU_LOGS=${GPU_LOGS}_\${RANK}.csv; fi; echo \${GPU_LOGS}
-    nvidia-smi --query-gpu=memory.used,memory.total,utilization.gpu --format=csv -l 1 > \${GPU_LOGS} &
-    NV_PID=\$!
-
-    python /u/pierocor/work/ai_containers/pytorch/example/synthetic-bench.py
-    kill \${NV_PID}
+    python ../scripts/resnet50-bench.py
   """
-
-set +o xtrace
-printf '=%.0s' {1..100}; echo
diff --git a/pytorch/viper/README.md b/pytorch/viper/README.md
index f16b4c7e7928b76a76ded89ee8444885f913b548..1e3041f07dd57676fa43732a0159852f7e62965e 100644
--- a/pytorch/viper/README.md
+++ b/pytorch/viper/README.md
@@ -4,10 +4,19 @@ Use ROCM containers: https://hub.docker.com/r/rocm/pytorch
 
 ## BUILD
 
-> TODO: when clear instructions on how to build are available, add them.
+```
+apptainer build amd-pytorch.sif amd-pytorch.def
+```
 
 ## RUN
-### synthetic-bench.py
+
+To submit the script:
+
+```
+sbatch run.slurm
+```
+
+### Results
 
 > `CONF` = 1.96 * std
 Container: rocm/pytorch:rocm6.3.2_ubuntu22.04_py3.10_pytorch_release_2.4.0
diff --git a/pytorch/viper/run.slurm b/pytorch/viper/run.slurm
index 425d1be0083f14086de4f563d335b1eaf449eace..aedf098e154db06661a6f6b5967fbc0676acde12 100644
--- a/pytorch/viper/run.slurm
+++ b/pytorch/viper/run.slurm
@@ -1,9 +1,9 @@
 #!/bin/bash -l
 #SBATCH -D ./
 
-#SBATCH -o rocm634/_synth_rocm_1_1_%j.out
-#SBATCH -e rocm634/_synth_rocm_1_1_%j.out
-#SBATCH -J pt-synth
+#SBATCH -o %j.out
+#SBATCH -e %j.out
+#SBATCH -J AMD-pt-synth
 
 #SBATCH --time=0-00:10:00
 
@@ -19,13 +19,7 @@
 
 
 ###### Variables ######
-CONTAINER="/ptmp/pierocor/containers/pytorch_rocm6.3.4_ubuntu24.04_py3.12_pytorch_release_2.4.0.sif"
-# CONTAINER="/ptmp/pierocor/containers/pytorch-rocm-25.01.sif"
-# CONTAINER="/ptmp/pierocor/containers/rocm6.1-pytorch2.4.sif"
-# CONTAINER="/ptmp/pierocor/containers/rocm6.2-pytorch2.5.sif"
-### Paths
-GPU_LOGS="rocm634/gpu_logs/_synth_${SLURM_JOB_ID}"
-
+CONTAINER="amd-pytorch.sif"
 ### PT DIST settings
 export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
 export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
@@ -41,36 +35,16 @@ module list
 
 echo -e "Nodes: ${SLURM_JOB_NUM_NODES} \t NTASK: ${SLURM_NTASKS}"
 echo "${SLURM_NODELIST}"
-
-rocm-smi
-
-
-# TODO:
-# export NCCL_DEBUG=INFO
-# export UCX_TLS=^cma
-
-for BS in 256 512 768;
-do
+echo
 
 ###### Run the program:
 
-printf '=%.0s' {1..100}; echo -e "\n\n${BS}\n"; printf '=%.0s' {1..100}; echo
-set -o xtrace
-
 srun apptainer exec --rocm ${CONTAINER} \
   bash -c """
-    export NUM_GPUS=\$(( \$(echo \${ROCR_VISIBLE_DEVICES} | tr -cd , | wc -c) + 1 ))
+    export NUM_GPUS=\$(( \$(echo \${HIP_VISIBLE_DEVICES} | tr -cd , | wc -c) + 1 ))
     export LOCAL_RANK=\${SLURM_LOCALID}
     export RANK=\${SLURM_PROCID}
-    echo \$SLURM_PROCID : \$(hostname) - \${SLURM_LOCALID} NUM_GPUS: \${NUM_GPUS} ROCR_VISIBLE_DEVICES: \${ROCR_VISIBLE_DEVICES}
-    if [ \${LOCAL_RANK} -ne 0 ]; then GPU_LOGS='/dev/null'; else GPU_LOGS=${GPU_LOGS}_\${RANK}.csv; fi; echo \${GPU_LOGS}
-    amd-smi monitor -w 1 -umv --csv --file \${GPU_LOGS} &
-    SMI_PID=\$!
-
-    python3 /u/pierocor/work/ai_containers/pytorch/example/synthetic-bench.py --batch-size ${BS}
+    echo \$SLURM_PROCID : \$(hostname) - \${SLURM_LOCALID} NUM_GPUS: \${NUM_GPUS} HIP_VISIBLE_DEVICES: \${HIP_VISIBLE_DEVICES}
-    kill \${SMI_PID}
+    python ../scripts/resnet50-bench.py
   """
-set +o xtrace
-printf '=%.0s' {1..100}; echo
-done
 
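Note: the benchmark entry point both job scripts now call, `../scripts/resnet50-bench.py`, is not part of this diff. The sketch below is a rough, hypothetical illustration of how such a script could consume the variables the scripts export (`MASTER_ADDR`, `MASTER_PORT`, `RANK`, `LOCAL_RANK`); it assumes `torch.distributed` with the NCCL/RCCL backend and takes the world size from `SLURM_NTASKS`, since the scripts do not export `WORLD_SIZE`. It is not the actual benchmark code.

```python
# Hypothetical sketch only: a minimal ResNet-50 benchmark loop that uses the
# environment prepared by run.slurm. All names below that are not set by the
# job scripts (e.g. the use of SLURM_NTASKS as world size) are assumptions.
import os

import torch
import torch.distributed as dist
import torchvision


def init_distributed():
    rank = int(os.environ["RANK"])               # exported from SLURM_PROCID
    local_rank = int(os.environ["LOCAL_RANK"])   # exported from SLURM_LOCALID
    world_size = int(os.environ["SLURM_NTASKS"]) # assumption: one task per GPU
    torch.cuda.set_device(local_rank)
    # MASTER_ADDR / MASTER_PORT are already in the environment, so the
    # default env:// rendezvous picks them up.
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    return rank, local_rank, world_size


def main():
    rank, local_rank, world_size = init_distributed()
    model = torchvision.models.resnet50().cuda(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    # Synthetic inputs keep the benchmark independent of any dataset on disk.
    images = torch.randn(64, 3, 224, 224, device=local_rank)
    labels = torch.randint(0, 1000, (64,), device=local_rank)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    for _ in range(10):
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

    if rank == 0:
        print(f"world_size={world_size} final loss={loss.item():.4f}")
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

On the ROCm container the same code path applies: PyTorch's ROCm build exposes the HIP devices through the `torch.cuda` API, and the `nccl` backend name maps to RCCL.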