Profiling code for DGX-2. Added fixed dt.

Martin Lilleeng Sætra 2022-04-08 15:09:09 +02:00
parent 80d84e0489
commit d83d620512
5 changed files with 92 additions and 30 deletions

File diff suppressed because one or more lines are too long

View File

@@ -89,7 +89,7 @@ def toJson(in_dict, compressed=True):
out_dict[key] = value
return json.dumps(out_dict)
-def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[]):
+def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
"""
Runs a simulation, and stores output in netcdf file. Stores the times given in
save_times, and saves all of the variables in list save_var_names. Elements in
@@ -176,7 +176,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
#Simulate
if (t_step > 0.0):
-    sim.simulate(t_step)
+    sim.simulate(t_step, dt)
profiling_data_sim_runner["end"]["t_step"] += time.time()
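Note on the change above: the optional dt added to runSimulation is passed straight through to sim.simulate, so a numeric value forces a fixed step while None keeps the simulator's own step selection. A minimal sketch of the pattern, with illustrative names rather than the repository's actual Common.runSimulation:

# Minimal sketch (illustrative, not the repository code): threading an
# optional fixed time step from the runner down to the simulator.
def run_simulation(sim, t_steps, dt=None):
    for t_step in t_steps:
        if t_step > 0.0:
            # dt=None: the simulator picks its own step (e.g. CFL-based);
            # a numeric dt: the simulator advances with that fixed step size.
            sim.simulate(t_step, dt)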

View File

@@ -0,0 +1,36 @@
#!/bin/bash
# See http://wiki.ex3.simula.no before changing the values below
#SBATCH -p dgx2q # partition (GPU queue)
#SBATCH -w g001 # DGX-2 node
##SBATCH --gres=gpu:4 # number of V100's
#SBATCH -t 0-00:10 # time (D-HH:MM)
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR
ulimit -s 10240
module load slurm/20.02.7
module load cuda11.2/toolkit/11.2.2
module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
# Check how many GPUs your job got
#nvidia-smi
## Copy input files to the work directory:
mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
# Run job
# (Assumes Miniconda is installed in user root dir.)
cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
cd $HOME/src/ShallowWaterGPU
## Copy files from work directory:
# (NOTE: Copying is not performed if job fails!)
mkdir -p output/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output/$SLURM_JOB_ID
mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json .
rm -rf /work/$USER/$SLURM_JOB_ID
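The mpirun line above passes -nx, -ny and --profile to mpiTesting.py, which reads them as args.nx, args.ny and args.profile in the diff further down. A hedged argparse sketch of that interface; the actual script may define more options and defaults:

# Hedged sketch of the command-line interface the job script relies on;
# the real mpiTesting.py may add more options.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-nx", type=int)                    # domain size in x
parser.add_argument("-ny", type=int)                    # domain size in y
parser.add_argument("--profile", action="store_true")   # write profiling JSON when set
args = parser.parse_args()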

View File

@@ -0,0 +1,11 @@
#!/bin/bash
# one node: 1-8 GPUs
sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096 dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048 dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365 dgx-2_strong_scaling_benchmark.job
sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024 dgx-2_strong_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819 dgx-2_strong_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683 dgx-2_strong_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585 dgx-2_strong_scaling_benchmark.job
#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512 dgx-2_strong_scaling_benchmark.job
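The NY values above follow from splitting the 4096-row domain across the requested GPUs: each task gets NY ≈ 4096 divided by the task count, rounded to the nearest integer, so the total workload stays fixed as GPUs are added (strong scaling). A quick check:

# Reproduces the NY values in the sbatch lines above.
for n_gpus in range(1, 9):
    print(n_gpus, round(4096 / n_gpus))
# -> 4096, 2048, 1365, 1024, 819, 683, 585, 512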

View File

@@ -110,8 +110,10 @@ logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny
+dt = 0.00001
gamma = 1.4
-save_times = np.linspace(0, 0.5, 2)
+save_times = np.linspace(0, 0.1, 2)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
@@ -139,7 +141,7 @@ def genSim(grid, **kwargs):
outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
-    genSim, arguments, outfile, save_times, save_var_names)
+    genSim, arguments, outfile, save_times, save_var_names, dt)
if(args.profile):
    t_total_end = time.time()
@@ -149,6 +151,7 @@ if(args.profile):
# write profiling to json file
if(args.profile and MPI.COMM_WORLD.rank == 0):
job_id = ""
if "SLURM_JOB_ID" in os.environ:
job_id = int(os.environ["SLURM_JOB_ID"])
allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
@@ -167,8 +170,13 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
profiling_data["nx"] = nx
profiling_data["ny"] = ny
+profiling_data["dt"] = dt
+profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
profiling_data["slurm_job_id"] = job_id
profiling_data["n_cuda_devices"] = str(num_cuda_devices)
profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
with open(profiling_file, "w") as write_file:
    json.dump(profiling_data, write_file)
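For reference, a hedged sketch of inspecting the profiling JSON that rank 0 writes above; the exact profiling_file name is not shown in this diff, so the path below is a placeholder.

# Hedged sketch: loading the profiling JSON written by rank 0. The file name
# pattern is not visible in this diff, so the path is a placeholder.
import json

with open("profiling_data.json") as f:  # placeholder path
    data = json.load(f)

# Fields set in the hunk above:
for key in ["nx", "ny", "dt", "n_time_steps", "slurm_job_id",
            "n_cuda_devices", "n_processes"]:
    print(key, data.get(key))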