Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git
Synced 2025-05-17 22:14:14 +02:00

Commit d83d620512 (parent 80d84e0489)
Profiling code for DGX-2. Added fixed dt.
File diff suppressed because one or more lines are too long
@@ -89,7 +89,7 @@ def toJson(in_dict, compressed=True):
             out_dict[key] = value
     return json.dumps(out_dict)
 
-def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[]):
+def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names=[], dt=None):
     """
     Runs a simulation, and stores output in netcdf file. Stores the times given in
     save_times, and saves all of the variables in list save_var_names. Elements in
@@ -176,7 +176,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
 
         #Simulate
         if (t_step > 0.0):
-            sim.simulate(t_step)
+            sim.simulate(t_step, dt)
 
         profiling_data_sim_runner["end"]["t_step"] += time.time()
 
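The two hunks above change the Common.runSimulation helper: it gains an optional dt argument (default None) and forwards it to sim.simulate, so a caller can force a fixed time step instead of letting the simulator choose one. A minimal sketch of that pattern, with simplified names and without the repository's netCDF output and profiling timers:

# Sketch only: illustrates how the optional fixed dt is threaded through the
# runner. With dt=None the simulator is assumed to fall back to its own
# (e.g. CFL-based) step size; with dt set, every internal step uses that value.
def run_to_save_times(sim, save_times, dt=None):
    t_prev = 0.0
    for t in save_times:
        t_step = t - t_prev          # advance up to the next output time
        if t_step > 0.0:
            sim.simulate(t_step, dt)
        t_prev = t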
dgx-2_strong_scaling_benchmark.job (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/bash
+# See http://wiki.ex3.simula.no before changing the values below
+#SBATCH -p dgx2q            # partition (GPU queue)
+#SBATCH -w g001             # DGX-2 node
+##SBATCH --gres=gpu:4       # number of V100s
+#SBATCH -t 0-00:10          # time (D-HH:MM)
+#SBATCH -o slurm.%N.%j.out  # STDOUT
+#SBATCH -e slurm.%N.%j.err  # STDERR
+
+ulimit -s 10240
+module load slurm/20.02.7
+module load cuda11.2/toolkit/11.2.2
+module load openmpi4-cuda11.2-ofed50-gcc8/4.1.0
+
+# Check how many GPUs your job got
+#nvidia-smi
+
+## Copy input files to the work directory:
+mkdir -p /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
+cp -r . /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
+
+# Run job
+# (Assumes Miniconda is installed in the user's home directory.)
+cd /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU
+#mpirun --mca btl_openib_if_include mlx5_0 --mca btl_openib_warn_no_device_params_found 0 $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
+mpirun -np $SLURM_NTASKS numactl --cpunodebind=0 --localalloc $HOME/miniconda3/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
+cd $HOME/src/ShallowWaterGPU
+
+## Copy files from the work directory:
+# (NOTE: Copying is not performed if the job fails!)
+mkdir -p output/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.log ./output/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.nc ./output/$SLURM_JOB_ID
+mv /work/$USER/$SLURM_JOB_ID/ShallowWaterGPU/*.json .
+
+rm -rf /work/$USER/$SLURM_JOB_ID
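The job script takes NX and NY from the values exported at submission time (see the driver script below) and the task count and job id from Slurm itself. A hedged illustration of reading such values on the Python side; read_job_env is a hypothetical helper, not part of mpiTesting.py, which only reads SLURM_JOB_ID and SLURM_JOB_NUM_NODES from the environment and gets nx/ny as command-line arguments:

import os

# Hypothetical helper (not in the repository): gather the Slurm/job settings
# this benchmark relies on, with fallbacks for interactive runs.
def read_job_env():
    return {
        "job_id":  os.environ.get("SLURM_JOB_ID", "interactive"),
        "n_tasks": int(os.environ.get("SLURM_NTASKS", "1")),
        "nx":      int(os.environ.get("NX", "4096")),
        "ny":      int(os.environ.get("NY", "4096")),
    }

print(read_job_env())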
dgx-2_strong_scaling_benchmark.sh (new file, 11 lines)
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# one node: 1-8 GPUs
+sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=4096,NY=4096 dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=4096,NY=2048 dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=4096,NY=1365 dgx-2_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=4096,NY=1024 dgx-2_strong_scaling_benchmark.job
+#sbatch --nodes=1 --gpus-per-node=5 --ntasks-per-node=5 --export=ALL,NX=4096,NY=819 dgx-2_strong_scaling_benchmark.job
+#sbatch --nodes=1 --gpus-per-node=6 --ntasks-per-node=6 --export=ALL,NX=4096,NY=683 dgx-2_strong_scaling_benchmark.job
+#sbatch --nodes=1 --gpus-per-node=7 --ntasks-per-node=7 --export=ALL,NX=4096,NY=585 dgx-2_strong_scaling_benchmark.job
+#sbatch --nodes=1 --gpus-per-node=8 --ntasks-per-node=8 --export=ALL,NX=4096,NY=512 dgx-2_strong_scaling_benchmark.job
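Each submission divides NY so the total work stays at roughly 4096 × 4096 cells regardless of how many GPUs are used, which is what makes this a strong-scaling benchmark. A small sketch, assuming NY is simply the rounded per-rank share of a 4096-row domain, reproduces the values above:

# Sketch: reproduce the NY values used in the sbatch lines, assuming the
# 4096-row domain is divided evenly among the MPI ranks (one rank per GPU).
NX, TOTAL_NY = 4096, 4096
for n_gpus in range(1, 9):
    ny = round(TOTAL_NY / n_gpus)
    print(f"{n_gpus} GPU(s): NX={NX}, NY={ny}")
# -> 4096, 2048, 1365, 1024, 819, 683, 585, 512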
@@ -110,8 +110,10 @@ logger.info("Generating initial conditions")
 nx = args.nx
 ny = args.ny
 
+dt = 0.00001
+
 gamma = 1.4
-save_times = np.linspace(0, 0.5, 2)
+save_times = np.linspace(0, 0.1, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 
@@ -139,7 +141,7 @@ def genSim(grid, **kwargs):
 
 
 outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
-    genSim, arguments, outfile, save_times, save_var_names)
+    genSim, arguments, outfile, save_times, save_var_names, dt)
 
 if(args.profile):
     t_total_end = time.time()
@@ -149,6 +151,7 @@ if(args.profile):
 
 # write profiling to json file
 if(args.profile and MPI.COMM_WORLD.rank == 0):
     job_id = ""
     if "SLURM_JOB_ID" in os.environ:
         job_id = int(os.environ["SLURM_JOB_ID"])
+        allocated_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
@@ -167,8 +170,13 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
 
     profiling_data["nx"] = nx
     profiling_data["ny"] = ny
+    profiling_data["dt"] = dt
+    profiling_data["n_time_steps"] = sim_profiling_data["n_time_steps"]
 
+    profiling_data["slurm_job_id"] = job_id
+    profiling_data["n_cuda_devices"] = str(num_cuda_devices)
+    profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
 
     with open(profiling_file, "w") as write_file:
         json.dump(profiling_data, write_file)
 
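With these additions, the profiling JSON written by rank 0 records the fixed dt, the number of time steps, and the Slurm/GPU configuration alongside the timing data, so strong-scaling results can be matched to the job that produced them. A sketch of reading one such file back; the field names follow the assignments above, while the file name is an assumption (the job script moves *.json files back to the source directory):

import json

# Sketch: load one profiling file produced by a --profile run.
with open("some_profiling_run.json") as f:
    data = json.load(f)

print("job", data["slurm_job_id"], "used", data["n_cuda_devices"], "GPU(s) /",
      data["n_processes"], "process(es):", data["nx"], "x", data["ny"],
      "cells,", data["n_time_steps"], "steps at dt =", data["dt"])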