diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index 36b82be..d052db0 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -95,10 +95,20 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     save_times, and saves all of the variables in list save_var_names.
     Elements in save_var_names can be set to None if you do not want to save them
     """
+    profiling_data_sim_runner = { 'start': {}, 'end': {} }
+    profiling_data_sim_runner["start"]["t_sim_init"] = 0
+    profiling_data_sim_runner["end"]["t_sim_init"] = 0
+    profiling_data_sim_runner["start"]["t_nc_write"] = 0
+    profiling_data_sim_runner["end"]["t_nc_write"] = 0
+    profiling_data_sim_runner["start"]["t_step"] = 0
+    profiling_data_sim_runner["end"]["t_step"] = 0
+
+    profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
+
     logger = logging.getLogger(__name__)
-    
+
     assert len(save_times) > 0, "Need to specify which times to save"
-    
+
     with Timer("construct") as t:
         sim = simulator(**simulator_args)
     logger.info("Constructed in " + str(t.secs) + " seconds")
@@ -140,12 +150,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
         #Create variables
         for var_name in save_var_names:
             ncvars[var_name] = outdata.ncfile.createVariable(var_name, np.dtype('float32').char, ('time', 'y', 'x'), zlib=True, least_significant_digit=3)
-        
+
         #Create step sizes between each save
         t_steps = np.empty_like(save_times)
         t_steps[0] = save_times[0]
         t_steps[1:] = save_times[1:] - save_times[0:-1]
 
+        profiling_data_sim_runner["end"]["t_sim_init"] = time.time()
+
         #Start simulation loop
         progress_printer = ProgressPrinter(save_times[-1], print_every=10)
         for k in range(len(save_times)):
@@ -160,18 +172,24 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
                 logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
                 return outdata.filename
 
+            profiling_data_sim_runner["start"]["t_step"] += time.time()
+
             #Simulate
             if (t_step > 0.0):
                 sim.simulate(t_step)
 
+            profiling_data_sim_runner["end"]["t_step"] += time.time()
+
+            profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
+
             #Download
-            """
             save_vars = sim.download(download_vars)
 
             #Save to file
             for i, var_name in enumerate(save_var_names):
                 ncvars[var_name][k, :] = save_vars[i]
-            """
+
+            profiling_data_sim_runner["end"]["t_nc_write"] += time.time()
 
             #Write progress to screen
             print_string = progress_printer.getPrintString(t_end)
@@ -180,7 +198,7 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
 
     logger.debug("Simulated to t={:f} in {:d} timesteps (average dt={:f})".format(t_end, sim.simSteps(), sim.simTime() / sim.simSteps()))
 
-    return outdata.filename, sim.profiling_data
+    return outdata.filename, profiling_data_sim_runner, sim.profiling_data_mpi
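The runner-side counters above follow an accumulate-on-entry/exit pattern: time.time() is summed into "start" when a stage begins and into "end" when it finishes, so end minus start collapses to the total wall-clock time spent in that stage across all loop iterations. A minimal sketch of the idea (hypothetical single stage; time.sleep() stands in for sim.simulate(t_step)):

    import time

    profiling = {"start": {"t_step": 0.0}, "end": {"t_step": 0.0}}
    for _ in range(10):
        profiling["start"]["t_step"] += time.time()
        time.sleep(0.01)  # stand-in for sim.simulate(t_step)
        profiling["end"]["t_step"] += time.time()

    # Summing on entry and exit makes end - start equal the total time
    # spent inside the stage over all iterations (~0.1 s here).
    print("t_step total: {:.3f} s".format(
        profiling["end"]["t_step"] - profiling["start"]["t_step"]))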
diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py
index 4b1a98e..823960b 100644
--- a/GPUSimulators/MPISimulator.py
+++ b/GPUSimulators/MPISimulator.py
@@ -201,13 +201,13 @@ class MPISimulator(Simulator.BaseSimulator):
     Class which handles communication between simulators on different MPI nodes
     """
     def __init__(self, sim, grid):
-        self.profiling_data = { 'start': {}, 'end': {} }
-        self.profiling_data["start"]["t_halo_exchange"] = 0
-        self.profiling_data["end"]["t_halo_exchange"] = 0
-        self.profiling_data["start"]["t_step"] = 0
-        self.profiling_data["end"]["t_step"] = 0
-        self.profiling_data["n_time_steps"] = 0
-        self.profiling_data["start"]["t_mpi_sim_init"] = time.time()
+        self.profiling_data_mpi = { 'start': {}, 'end': {} }
+        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["start"]["t_step_mpi"] = 0
+        self.profiling_data_mpi["end"]["t_step_mpi"] = 0
+        self.profiling_data_mpi["n_time_steps"] = 0
+        self.profiling_data_mpi["start"]["t_sim_mpi_init"] = time.time()
 
         self.logger = logging.getLogger(__name__)
 
         autotuner = sim.context.autotuner
@@ -292,18 +292,26 @@ class MPISimulator(Simulator.BaseSimulator):
         self.out_s = np.empty_like(self.in_s)
 
         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
-        self.profiling_data["end"]["t_mpi_sim_init"] = time.time()
+        self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()
 
     def substep(self, dt, step_number):
-        self.profiling_data["start"]["t_halo_exchange"] += time.time()
-        self.exchange()
-        self.profiling_data["end"]["t_halo_exchange"] += time.time()
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] += time.time()
+
+        self.exchange()
+
+        self.sim.stream.synchronize() # only necessary for profiling!
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] += time.time()
+            self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
 
-        self.profiling_data["start"]["t_step"] += time.time()
         self.sim.substep(dt, step_number)
-        self.profiling_data["end"]["t_step"] += time.time()
-        self.profiling_data["n_time_steps"] += 1
-        
+
+        self.sim.stream.synchronize() # only necessary for profiling!
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["n_time_steps"] += 1
+
     def getOutput(self):
         return self.sim.getOutput()
@@ -422,6 +430,4 @@ class MPISimulator(Simulator.BaseSimulator):
         #Wait for sending to complete
         for comm in comm_send:
             comm.wait()
-        
-        
\ No newline at end of file
diff --git a/mpiTesting.py b/mpiTesting.py
index 4d6f859..9750b37 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -111,7 +111,7 @@
 nx = args.nx
 ny = args.ny
 gamma = 1.4
 
-save_times = np.linspace(0, 0.1, 2)
+save_times = np.linspace(0, 0.5, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 
@@ -138,7 +138,7 @@ def genSim(grid, **kwargs):
     return sim
 
 
-outfile, sim_profiling_data = Common.runSimulation(
+outfile, sim_runner_profiling_data, sim_profiling_data = Common.runSimulation(
     genSim, arguments, outfile, save_times, save_var_names)
 
 if(args.profile):
@@ -159,6 +159,9 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
     else:
         profiling_file = "MPI_" + str(MPI.COMM_WORLD.size) + "_procs_and_" + str(num_cuda_devices) + "_GPUs_profiling.json"
 
+    for stage in sim_runner_profiling_data["start"].keys():
+        profiling_data[stage] = sim_runner_profiling_data["end"][stage] - sim_runner_profiling_data["start"][stage]
+
     for stage in sim_profiling_data["start"].keys():
         profiling_data[stage] = sim_profiling_data["end"][stage] - sim_profiling_data["start"][stage]
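Two details in the MPISimulator changes matter for the numbers: the stream.synchronize() calls force the asynchronous CUDA kernels to finish before the clock is read (otherwise time.time() would largely measure launch overhead), and the n_time_steps > 0 guards keep the first substep, with its one-off warm-up costs, out of the totals. mpiTesting.py then flattens both profiling dicts into per-stage totals before writing the JSON file. A sketch, with purely illustrative numbers, of the structure that would result (stage keys taken from the diffs above; any other fields the script records are omitted):

    import json

    # Hypothetical values, only to show the flat layout of profiling_data.
    profiling_data = {
        "t_sim_init": 0.9,                 # from profiling_data_sim_runner
        "t_step": 12.5,
        "t_nc_write": 0.4,
        "t_sim_mpi_init": 0.7,             # from sim.profiling_data_mpi
        "t_step_mpi": 11.8,
        "t_step_mpi_halo_exchange": 2.3,
    }
    print(json.dumps(profiling_data, indent=4))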
diff --git a/saga-dev.job b/saga-dev.job
index a0023ff..7aa6786 100644
--- a/saga-dev.job
+++ b/saga-dev.job
@@ -6,7 +6,7 @@
 #SBATCH --account=nn9882k
 #
 # Wall clock limit:
-#SBATCH --time=00:20:00
+#SBATCH --time=00:02:00
 #
 # NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
 #
@@ -28,7 +28,7 @@
 # source /cluster/bin/jobsetup
 
 module restore system   # instead of 'module purge' rather set module environment to the system default
-module load CUDA/10.2.89
+module load CUDA/11.4.1
 
 # It is also recommended to list loaded modules, for easier debugging:
 module list
@@ -41,7 +41,7 @@
 mkdir $SCRATCH/ShallowWaterGPU
 cp -r . $SCRATCH/ShallowWaterGPU
 
 ## Make sure the results are copied back to the submit directory (see Work Directory below):
-# chkfile MyResultFile
+# chkfile MyResultFile   # chkfile is replaced by 'savefile' on Saga
 savefile "$SCRATCH/ShallowWaterGPU/*.log"
 savefile "$SCRATCH/ShallowWaterGPU/*.nc"
diff --git a/saga_strong_scaling_benchmark.job b/saga_strong_scaling_benchmark.job
index 0d729c3..fc61ffb 100644
--- a/saga_strong_scaling_benchmark.job
+++ b/saga_strong_scaling_benchmark.job
@@ -6,26 +6,26 @@
 #SBATCH --account=nn9882k
 #
 # Wall clock limit:
-#SBATCH --time=10:00:00
+#SBATCH --time=00:10:00
+#
+# NOTE: See https://documentation.sigma2.no/jobs/projects_accounting.html when adjusting the values below
 #
-# Ask for 1 GPU (max is 2)
 # Note: The environment variable CUDA_VISIBLE_DEVICES will show which GPU
 # device(s) to use. It will have values '0', '1' or '0,1' corresponding to
 # /dev/nvidia0, /dev/nvidia1 or both, respectively.
-#SBATCH --partition=accel --gres=gpu:1
+#SBATCH --partition=accel
 #
 # Max memory usage per task (core) - increasing this will cost more core hours:
-#SBATCH --mem-per-cpu=16G
+#SBATCH --mem-per-cpu=3800M
 #
-# Number of tasks:
-#SBATCH --nodes=1 --ntasks-per-node=1
+#SBATCH --qos=devel
 
 ## Set up job environment: (this is done automatically behind the scenes)
 ## (make sure to comment '#' or remove the following line 'source ...')
 # source /cluster/bin/jobsetup
 
 module restore system   # instead of 'module purge' rather set module environment to the system default
-module load CUDA/10.2.89
+module load CUDA/11.4.1
 
 # It is also recommended to list loaded modules, for easier debugging:
 module list
@@ -47,5 +47,5 @@
 savefile "$SCRATCH/ShallowWaterGPU/*.json"
 
 ## Do some work:
 cd $SCRATCH/ShallowWaterGPU
 srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 --version
-srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx 1024 -ny 1024 --profile
+srun $HOME/.conda/envs/ShallowWaterGPU_HPC/bin/python3 mpiTesting.py -nx $NX -ny $NY --profile
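The benchmark job no longer hard-codes the domain size: sbatch's --export flag (used in the submit script below) places NX and NY in the job's environment, and the shell expands them on the srun line. A sketch of the argparse wiring these flags assume on the Python side; the option definitions are inferred from the args.nx / args.ny / args.profile uses above, not copied from mpiTesting.py:

    import argparse

    # Assumed option definitions, inferred from the diffs above.
    parser = argparse.ArgumentParser()
    parser.add_argument("-nx", type=int, required=True)
    parser.add_argument("-ny", type=int, required=True)
    parser.add_argument("--profile", action="store_true")

    # What the srun line expands to for, e.g., NX=1024 NY=512:
    args = parser.parse_args(["-nx", "1024", "-ny", "512", "--profile"])
    print(args.nx, args.ny, args.profile)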
diff --git a/saga_strong_scaling_benchmark.sh b/saga_strong_scaling_benchmark.sh
index 1d2284d..96ba541 100644
--- a/saga_strong_scaling_benchmark.sh
+++ b/saga_strong_scaling_benchmark.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
-# one node: 1-4 tasks/GPUs
-sbatch --partition=accel --gres=gpu:1 --nodes=1 --ntasks-per-node=1 saga_strong_scaling_benchmark.job
-sbatch --partition=accel --gres=gpu:2 --nodes=1 --ntasks-per-node=2 saga_strong_scaling_benchmark.job
-sbatch --partition=accel --gres=gpu:3 --nodes=1 --ntasks-per-node=3 saga_strong_scaling_benchmark.job
-sbatch --partition=accel --gres=gpu:4 --nodes=1 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+# one node: 1-4 GPUs
+sbatch --nodes=1 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=1024 saga_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=2 --ntasks-per-node=2 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=3 --ntasks-per-node=3 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job
+sbatch --nodes=1 --gpus-per-node=4 --ntasks-per-node=4 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job
 
-# 2-4 nodes: 4 tasks/GPUs per node
-sbatch --partition=accel --gres=gpu:4 --nodes=2 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
-sbatch --partition=accel --gres=gpu:4 --nodes=3 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
-sbatch --partition=accel --gres=gpu:4 --nodes=4 --ntasks-per-node=4 saga_strong_scaling_benchmark.job
+# 2-4 nodes: 1 GPU per node
+sbatch --nodes=2 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=512 saga_strong_scaling_benchmark.job
+sbatch --nodes=3 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=1024,NY=341 saga_strong_scaling_benchmark.job
+sbatch --nodes=4 --gpus-per-node=1 --ntasks-per-node=1 --export=ALL,NX=512,NY=512 saga_strong_scaling_benchmark.job
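The exported NX/NY values read as per-rank subdomain sizes chosen so that the global problem stays at roughly 1024x1024 while ranks are added, which is what makes this a strong-scaling sweep. A sketch of the arithmetic behind the chosen values; the 2x2 factorization for four ranks is an assumption about how the MPI grid splits the domain, but it matches NX=512, NY=512 above:

    # Per-rank sizes for a fixed ~1024x1024 global domain.
    global_nx = global_ny = 1024
    tilings = {1: (1, 1), 2: (1, 2), 3: (1, 3), 4: (2, 2)}  # ranks -> (tiles_x, tiles_y)
    for ranks, (tx, ty) in tilings.items():
        # Integer division reproduces NY=341 for the three-rank case.
        print("{:d} ranks: NX={:d}, NY={:d}".format(ranks, global_nx // tx, global_ny // ty))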