Adjusting order of substep ops and benchmark scripts
Commit c7e6f17445 (parent acb7d2ab39)
@@ -27,7 +27,7 @@ from mpi4py import MPI
 import time
 
 import pycuda.driver as cuda
-import nvtx
+#import nvtx
 
 
 
@@ -319,31 +319,30 @@ class MPISimulator(Simulator.BaseSimulator):
         self.upload_for_exchange(self.sim.u0)
 
     def substep(self, dt, step_number):
-        nvtx.mark("substep start", color="red")
+        #nvtx.mark("substep start", color="red")
 
         self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
-        nvtx.mark("substep internal", color="red")
-        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        #nvtx.mark("substep external", color="blue")
 
-        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
-        nvtx.mark("substep external", color="blue")
         self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
 
-        # NOTE: Need to download from u1, as u0<->u1 switch is not performed yet
-        nvtx.mark("download", color="red")
+        #nvtx.mark("substep internal", color="red")
+        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
 
+        #nvtx.mark("download", color="red")
         self.sim.swapBuffers()
         self.download_for_exchange(self.sim.u0)
 
-        nvtx.mark("sync", color="red")
+        #nvtx.mark("sync", color="red")
         self.sim.stream.synchronize()
-        nvtx.mark("MPI", color="green")
+        #nvtx.mark("MPI", color="green")
+        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
         self.exchange()
-        nvtx.mark("upload", color="red")
+        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        #nvtx.mark("upload", color="red")
         self.upload_for_exchange(self.sim.u0)
 
         self.sim.internal_stream.synchronize()
+        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
 
         self.profiling_data_mpi["n_time_steps"] += 1
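For readability, here is the substep() method as it reads after this change, condensed from the new (right-hand) side of the hunk above; the explanatory comments are added here and are not part of the committed code:

def substep(self, dt, step_number):
    self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()

    # New order: first the cells that border neighbouring MPI subdomains
    # (the "internal ghost cells"), then the remaining interior cells.
    self.sim.substep(dt, step_number, external=True, internal=False)
    self.sim.substep(dt, step_number, internal=True, external=False)

    # u0 <-> u1 have not been swapped yet, so swap before downloading halo data.
    self.sim.swapBuffers()
    self.download_for_exchange(self.sim.u0)

    self.sim.stream.synchronize()
    self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()

    # The MPI halo exchange itself is now excluded from t_step_mpi.
    self.exchange()

    self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
    self.upload_for_exchange(self.sim.u0)
    self.sim.internal_stream.synchronize()
    self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()

    self.profiling_data_mpi["n_time_steps"] += 1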
@@ -15,6 +15,8 @@ module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
 # Check how many gpu's your job got
 #nvidia-smi
 
+mkdir -p output_dgx-2/$NOW
+
 ## Copy input files to the work directory:
 mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
 cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
@@ -23,14 +25,16 @@ cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
 # (Assumes Miniconda is installed in user root dir.)
 cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
 #mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
+#nsys profile -t nvtx,cuda mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
 mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
 cd $HOME/src/ShallowWaterGPU
 
 ## Copy files from work directory:
 # (NOTE: Copying is not performed if job fails!)
-mkdir -p output/$SLURM_JOB_ID
-mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output/$SLURM_JOB_ID
-mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output/$SLURM_JOB_ID
-mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json .
+mkdir -p output_dgx-2/$NOW/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output_dgx-2/$NOW/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output_dgx-2/$NOW/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json ./output_dgx-2/$NOW
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.qdrep ./output_dgx-2/$NOW
 
 rm -rf /work/$USER/$SLURM_JOB_ID
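After these moves, one benchmark sweep ends up under a single timestamped directory: per-job logs and NetCDF output under output_dgx-2/$NOW/$SLURM_JOB_ID, and the per-rank profiling JSON plus Nsight .qdrep reports directly under output_dgx-2/$NOW. A small sketch of how such a sweep directory could be listed afterwards; only the directory layout is taken from the mv commands above, the helper itself is illustrative and not part of the repository:

# Illustrative helper (not in the repository): list the artifacts of one
# timestamped sweep, following the layout produced by the job script above.
import glob
import os
import sys

sweep_dir = sys.argv[1]  # e.g. output_dgx-2/2021-06-01T120000 (hypothetical path)

# Profiling JSON and Nsight reports are moved to the top of the sweep directory.
for path in sorted(glob.glob(os.path.join(sweep_dir, "*.json"))):
    print("profile:", path)
for path in sorted(glob.glob(os.path.join(sweep_dir, "*.qdrep"))):
    print("nsight :", path)

# Logs and NetCDF output are grouped per SLURM job id.
for job_dir in sorted(glob.glob(os.path.join(sweep_dir, "*/"))):
    logs = glob.glob(os.path.join(job_dir, "*.log"))
    ncs = glob.glob(os.path.join(job_dir, "*.nc"))
    print(os.path.basename(os.path.dirname(job_dir)),
          len(logs), "log(s),", len(ncs), "NetCDF file(s)")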
@@ -1,11 +1,22 @@
 #!/bin/bash
 
+TIMESTAMP=$(date "+%Y-%m-%dT%H%M%S")
+
 # one node: 1-8 GPUs
-sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096 dgx-2_strong_scaling_benchmark.job
-sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048 dgx-2_strong_scaling_benchmark.job
-sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365 dgx-2_strong_scaling_benchmark.job
-sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024 dgx-2_strong_scaling_benchmark.job
-#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819 dgx-2_strong_scaling_benchmark.job
-#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683 dgx-2_strong_scaling_benchmark.job
-#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585 dgx-2_strong_scaling_benchmark.job
-#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512 dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+
+sbatch --nodes=1 --gpus-per-node=9 --ntasks-per-node=9 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=10 --ntasks-per-node=10 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=11 --ntasks-per-node=11 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=12 --ntasks-per-node=12 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=13 --ntasks-per-node=13 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=14 --ntasks-per-node=14 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=15 --ntasks-per-node=15 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=16 --ntasks-per-node=16 --export=ALL,NX=4096,NY=512,NOW=$TIMESTAMP dgx-2_strong_scaling_benchmark.job
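The NY values in this launcher keep the global grid at about 4096x4096 cells for each strong-scaling run on 1-8 GPUs: NY appears to be 4096 divided by the GPU count, rounded to the nearest integer, while the added 9-16 GPU lines simply reuse NY=512. A quick sanity check of that relationship (the round(4096/n) rule is an inference from the numbers above, not something stated in the script):

# NY per GPU count as written in the launcher above (1-8 GPUs).
ny_in_script = {1: 4096, 2: 2048, 3: 1365, 4: 1024, 5: 819, 6: 683, 7: 585, 8: 512}

for n, ny in ny_in_script.items():
    # Inferred rule: NY = round(4096 / n); holds for every entry above.
    assert ny == round(4096 / n)
    print(f"{n:2d} GPUs: NX=4096, NY={ny:4d} ({4096 * ny} cells per GPU)")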
@@ -119,7 +119,8 @@ dt = 0.00001
 gamma = 1.4
 #save_times = np.linspace(0, 0.000009, 2)
 #save_times = np.linspace(0, 0.000099, 11)
-save_times = np.linspace(0, 0.000099, 2)
+#save_times = np.linspace(0, 0.000099, 2)
+save_times = np.linspace(0, 0.000999, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 