From c7e6f174458d2e06a4b1cb997888c024a3ebbca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Lilleeng=20S=C3=A6tra?= Date: Tue, 26 Apr 2022 15:39:21 +0200 Subject: [PATCH] Adjusting order of substep ops and benchmark scripts --- GPUSimulators/MPISimulator.py | 27 +++++++++++++-------------- dgx-2_strong_scaling_benchmark.job | 12 ++++++++---- dgx-2_strong_scaling_benchmark.sh | 27 +++++++++++++++++++-------- mpiTesting.py | 3 ++- 4 files changed, 42 insertions(+), 27 deletions(-) diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py index e49738e..80cca46 100644 --- a/GPUSimulators/MPISimulator.py +++ b/GPUSimulators/MPISimulator.py @@ -27,7 +27,7 @@ from mpi4py import MPI import time import pycuda.driver as cuda -import nvtx +#import nvtx @@ -319,31 +319,30 @@ class MPISimulator(Simulator.BaseSimulator): self.upload_for_exchange(self.sim.u0) def substep(self, dt, step_number): - nvtx.mark("substep start", color="red") + #nvtx.mark("substep start", color="red") self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() - nvtx.mark("substep internal", color="red") - self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded - self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() - - self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() - nvtx.mark("substep external", color="blue") + #nvtx.mark("substep external", color="blue") self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells" - self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() - # NOTE: Need to download from u1, as u0<->u1 switch is not performed yet - nvtx.mark("download", color="red") + #nvtx.mark("substep internal", color="red") + self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded + + #nvtx.mark("download", color="red") self.sim.swapBuffers() self.download_for_exchange(self.sim.u0) - nvtx.mark("sync", color="red") + 
#nvtx.mark("sync", color="red") self.sim.stream.synchronize() - nvtx.mark("MPI", color="green") + #nvtx.mark("MPI", color="green") + self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() self.exchange() - nvtx.mark("upload", color="red") + self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() + #nvtx.mark("upload", color="red") self.upload_for_exchange(self.sim.u0) self.sim.internal_stream.synchronize() + self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() self.profiling_data_mpi["n_time_steps"] += 1 diff --git a/dgx-2_strong_scaling_benchmark.job b/dgx-2_strong_scaling_benchmark.job index 5366d38..d607feb 100644 --- a/dgx-2_strong_scaling_benchmark.job +++ b/dgx-2_strong_scaling_benchmark.job @@ -15,6 +15,8 @@ module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0 # Check how many gpu's your job got #nvidia-smi +mkdir -p output_dgx-2/$NOW + ## Copy input files to the work directory: mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU @@ -23,14 +25,16 @@ cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU # (Assumes Miniconda is installed in user root dir.) cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU #mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile +#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile cd $HOME/src/ShallowWaterGPU ## Copy files from work directory: # (NOTE: Copying is not performed if job fails!) 
-mkdir -p output/$SLURM_JOB_ID -mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output/$SLURM_JOB_ID -mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output/$SLURM_JOB_ID -mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json . +mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW +mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW rm -rf /work/$USER/$SLURM_JOB_ID diff --git a/dgx-2_strong_scaling_benchmark.sh b/dgx-2_strong_scaling_benchmark.sh index eef6b9a..9c94602 100644 --- a/dgx-2_strong_scaling_benchmark.sh +++ b/dgx-2_strong_scaling_benchmark.sh @@ -1,11 +1,22 @@ #!/bin/bash +TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S") + # one node: 1-8 GPUs -sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096 dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048 dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365 dgx-2_strong_scaling_benchmark.job -sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024 dgx-2_strong_scaling_benchmark.job -#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819 dgx-2_strong_scaling_benchmark.job -#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683 dgx-2_strong_scaling_benchmark.job -#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585 dgx-2_strong_scaling_benchmark.job -#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512 dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP 
dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job + +sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=455,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=410,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=372,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=341,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=315,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=4096,NY=293,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=273,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job +sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 
--export=ALL,NX=4096,NY=256,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job diff --git a/mpiTesting.py b/mpiTesting.py index 8d4e087..42e3bad 100644 --- a/mpiTesting.py +++ b/mpiTesting.py @@ -119,7 +119,8 @@ dt = 0.00001 gamma = 1.4 #save_times = np.linspace(0, 0.000009, 2) #save_times = np.linspace(0, 0.000099, 11) -save_times = np.linspace(0, 0.000099, 2) +#save_times = np.linspace(0, 0.000099, 2) +save_times = np.linspace(0, 0.000999, 2) outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc" save_var_names = ['rho', 'rho_u', 'rho_v', 'E']