From a588948e77908b2985ee143b055aa4ba15b4fbc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Lilleeng=20S=C3=A6tra?=
Date: Wed, 25 May 2022 11:25:10 +0000
Subject: [PATCH] Bugfix for ghost cell exchange and nc atts

---
 GPUSimulators/Common.py             |  17 ++--
 GPUSimulators/EE2D_KP07_dimsplit.py |  64 +++++++++++---
 GPUSimulators/MPISimulator.py       | 127 ++++++++++++----------------
 mpiTesting.py                       |   2 +
 4 files changed, 124 insertions(+), 86 deletions(-)

diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index a965231..76902c5 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -100,8 +100,8 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     profiling_data_sim_runner["end"]["t_sim_init"] = 0
     profiling_data_sim_runner["start"]["t_nc_write"] = 0
     profiling_data_sim_runner["end"]["t_nc_write"] = 0
-    profiling_data_sim_runner["start"]["t_step"] = 0
-    profiling_data_sim_runner["end"]["t_step"] = 0
+    profiling_data_sim_runner["start"]["t_full_step"] = 0
+    profiling_data_sim_runner["end"]["t_full_step"] = 0
 
     profiling_data_sim_runner["start"]["t_sim_init"] = time.time()
 
@@ -121,7 +121,14 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
     outdata.ncfile.git_hash = getGitHash()
     outdata.ncfile.git_status = getGitStatus()
     outdata.ncfile.simulator = str(simulator)
-    outdata.ncfile.sim_args = toJson(simulator_args)
+
+    # do not write fields to attributes (they are too large)
+    simulator_args_for_ncfile = simulator_args.copy()
+    del simulator_args_for_ncfile["rho"]
+    del simulator_args_for_ncfile["rho_u"]
+    del simulator_args_for_ncfile["rho_v"]
+    del simulator_args_for_ncfile["E"]
+    outdata.ncfile.sim_args = toJson(simulator_args_for_ncfile)
 
     #Create dimensions
     outdata.ncfile.createDimension('time', len(save_times))
@@ -172,13 +179,13 @@ def runSimulation(simulator, simulator_args, outfile, save_times, save_var_names
             logger.error("Error after {:d} steps (t={:f}: {:s}".format(sim.simSteps(), sim.simTime(), str(e)))
             return outdata.filename
 
-        profiling_data_sim_runner["start"]["t_step"] += time.time()
+        profiling_data_sim_runner["start"]["t_full_step"] += time.time()
 
         #Simulate
         if (t_step > 0.0):
             sim.simulate(t_step, dt)
 
-        profiling_data_sim_runner["end"]["t_step"] += time.time()
+        profiling_data_sim_runner["end"]["t_full_step"] += time.time()
 
         profiling_data_sim_runner["start"]["t_nc_write"] += time.time()
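The hunk above replaces the blanket toJson(simulator_args) with a copy that drops the four conserved-variable fields (rho, rho_u, rho_v, E), since multi-megabyte arrays do not belong in a netCDF attribute string. A minimal sketch of the same idea, filtering by type instead of by hard-coded name (the helper name is hypothetical, not part of the patch):

    import json
    import numpy as np

    def sim_args_for_ncfile(simulator_args):
        """Return a copy of simulator_args that is safe to store as a
        netCDF attribute: numpy arrays (e.g. rho, rho_u, rho_v, E) are
        far too large to serialize into a JSON attribute string."""
        return {k: v for k, v in simulator_args.items()
                if not isinstance(v, np.ndarray)}

    # usage, assuming simulator_args as in runSimulation:
    # outdata.ncfile.sim_args = json.dumps(sim_args_for_ncfile(simulator_args), default=str)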
diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py
index cc15c9c..2c3f810 100644
--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -138,9 +138,9 @@ class EE2D_KP07_dimsplit (BaseSimulator):
             return
 
         if external and not internal:
-            #############################################################
-            # XXX: Only treating north and south external cells for now #
-            #############################################################
+            ###################################
+            # XXX: Corners are treated twice! #
+            ###################################
 
             ns_grid_size = (self.grid_size[0], 1)
 
@@ -189,14 +189,58 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                                             self.cfl_data.gpudata,
                                             0, 0,
                                             self.nx, int(self.u0[0].y_halo))
+
+            we_grid_size = (1, self.grid_size[1])
+
+            # WEST
+            # (x0, y0) x (x1, y1)
+            # (0, 0) x (x_halo, ny)
+            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            0, 0,
+                                            int(self.u0[0].x_halo), self.ny)
+
+            # EAST
+            # (x0, y0) x (x1, y1)
+            # (nx-x_halo, 0) x (nx, ny)
+            self.kernel.prepared_async_call(we_grid_size, self.block_size, self.stream,
+                                            self.nx, self.ny,
+                                            self.dx, self.dy, dt,
+                                            self.g,
+                                            self.gamma,
+                                            self.theta,
+                                            substep,
+                                            self.boundary_conditions,
+                                            self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                                            self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                                            self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                                            self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                                            self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                                            self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                                            self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                                            self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                                            self.cfl_data.gpudata,
+                                            self.nx - int(self.u0[0].x_halo), 0,
+                                            self.nx, self.ny)
             return
 
         if internal and not external:
-            #############################################################
-            # XXX: Only treating north and south external cells for now #
-            # So we need to include west and east boundary here!        #
-            #############################################################
-
+            # INTERNAL DOMAIN
             # (x0, y0) x (x1, y1)
             # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
 
@@ -217,8 +261,8 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                                             self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                                             self.u1[3].data.gpudata, self.u1[3].data.strides[0],
                                             self.cfl_data.gpudata,
-                                            0, int(self.u0[0].y_halo),
-                                            self.nx, self.ny - int(self.u0[0].y_halo))
+                                            int(self.u0[0].x_halo), int(self.u0[0].y_halo),
+                                            self.nx - int(self.u0[0].x_halo), self.ny - int(self.u0[0].y_halo))
             return
 
     def swapBuffers(self):
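With this change the external pass covers all four boundary strips: north/south span the full width and west/east span the full height, so the four halo-sized corner blocks are computed twice, exactly as the new comment warns; the internal pass is shrunk to match. A small sketch of the extent arithmetic with hypothetical sizes, checking that strips plus interior tile the whole domain:

    nx, ny = 512, 512        # interior size (hypothetical)
    x_halo, y_halo = 2, 2    # ghost-cell widths (hypothetical)

    # (x0, y0, x1, y1) extents mirroring the kernel launches above
    north    = (0, ny - y_halo, nx, ny)
    south    = (0, 0, nx, y_halo)
    west     = (0, 0, x_halo, ny)
    east     = (nx - x_halo, 0, nx, ny)
    internal = (x_halo, y_halo, nx - x_halo, ny - y_halo)

    def area(e):
        return (e[2] - e[0]) * (e[3] - e[1])

    strips = area(north) + area(south) + area(west) + area(east)
    corners = 4 * x_halo * y_halo  # each corner lies in both a NS and a WE strip
    assert strips - corners + area(internal) == nx * ny

The double-computed corners cost a few redundant cells per step but keep the launch logic to five rectangles; correctness is unaffected because both passes compute the same values.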
diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py
index 3e2a7e0..4828a6c 100644
--- a/GPUSimulators/MPISimulator.py
+++ b/GPUSimulators/MPISimulator.py
@@ -208,18 +208,17 @@ class MPISimulator(Simulator.BaseSimulator):
     """
     def __init__(self, sim, grid):
         self.profiling_data_mpi = { 'start': {}, 'end': {} }
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] = 0
-        self.profiling_data_mpi["start"]["t_step_mpi"] = 0
-        self.profiling_data_mpi["end"]["t_step_mpi"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] = 0
+        self.profiling_data_mpi["start"]["t_mpi_step"] = 0
+        self.profiling_data_mpi["end"]["t_mpi_step"] = 0
         self.profiling_data_mpi["n_time_steps"] = 0
-        self.profiling_data_mpi["start"]["t_sim_mpi_init"] = time.time()
         self.logger = logging.getLogger(__name__)
 
         autotuner = sim.context.autotuner
@@ -297,43 +296,43 @@ class MPISimulator(Simulator.BaseSimulator):
         #Note that east and west also transfer ghost cells
         #whilst north/south only transfer internal cells
        #Reuses the width/height defined in the read-extets above
-        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
-        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
-        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
-        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
+        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
+        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
+        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
+        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
 
         #Allocate data for sending
-        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e)
-        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w)
-        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n)
-        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s)
+        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32) #np.empty_like(self.in_e)
+        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32) #np.empty_like(self.in_w)
+        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32) #np.empty_like(self.in_n)
+        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32) #np.empty_like(self.in_s)
 
         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
 
-        self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()
-        self.old_exchange()
+        self.full_exchange()
+        sim.context.synchronize()
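The staging buffers above use pycuda's page-locked (pinned) host memory, which is what allows the asynchronous download/upload calls on the CUDA stream to DMA directly between host and device; the patch drops the PORTABLE host-alloc flag, which only matters when several CUDA contexts share the same buffer. A self-contained sketch of the pattern under a single-GPU assumption (sizes are hypothetical):

    import numpy as np
    import pycuda.autoinit              # creates a context (single-GPU assumption)
    import pycuda.driver as cuda

    nvars, height, width = 4, 2, 512    # hypothetical halo-buffer shape

    # Page-locked (pinned) host buffer: required for truly async memcpy
    host_buf = cuda.pagelocked_empty((nvars, height, width), dtype=np.float32)
    dev_buf = cuda.mem_alloc(host_buf.nbytes)

    stream = cuda.Stream()
    cuda.memcpy_dtoh_async(host_buf, dev_buf, stream)  # overlaps with queued GPU work
    stream.synchronize()                               # wait before touching host_buf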
 
     def substep(self, dt, step_number):
         nvtx.mark("substep start", color="yellow")
-        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_step"] += time.time()
+
+        nvtx.mark("substep external", color="blue")
+        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
 
         nvtx.mark("substep internal", color="red")
         self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
 
-        nvtx.mark("substep external", color="blue")
-        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
-
         #nvtx.mark("substep full", color="blue")
         #self.sim.substep(dt, step_number, external=True, internal=True)
 
         self.sim.swapBuffers()
 
-        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_step"] += time.time()
 
         nvtx.mark("exchange", color="blue")
-        self.old_exchange()
+        self.full_exchange()
 
         #nvtx.mark("download", color="blue")
         #self.download_for_exchange(self.sim.u0)
@@ -383,8 +382,7 @@ class MPISimulator(Simulator.BaseSimulator):
         return [x0, x1, y0, y1]
 
     def download_for_exchange(self, u):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         # North-south
         if self.north is not None:
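Note the reordering in substep(): the external boundary strips now run before the interior. Computing the cells that neighbours depend on first is what makes it possible, in a later optimization, to overlap the halo exchange with the much larger internal sweep. A sketch of that intended overlap (the phase-split helper start_halo_sendrecv is hypothetical; this patch itself still exchanges only after both passes complete):

    def overlapped_substep(sim, dt, step_number):
        # 1. Compute only the boundary cells that neighbours depend on.
        sim.substep(dt, step_number, external=True, internal=False)

        # 2. Stage halos to host and post non-blocking sends/receives
        #    (hypothetical split of full_exchange into phases).
        sim.download_for_exchange(sim.u1)
        requests = sim.start_halo_sendrecv()

        # 3. Meanwhile, compute the interior, which needs no remote data.
        sim.substep(dt, step_number, internal=True, external=False)

        # 4. Finish communication and write the halos back to the GPU.
        for r in requests:
            r.wait()
        sim.upload_for_exchange(sim.u1)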
@@ -406,12 +404,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
                 #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w)
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
 
     def exchange(self):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         #Send/receive to north/south neighbours
         comm_send = []
@@ -441,12 +437,10 @@ class MPISimulator(Simulator.BaseSimulator):
         for comm in comm_send:
             comm.wait()
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
     def upload_for_exchange(self, u):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         # North-south
         if self.north is not None:
@@ -464,15 +458,11 @@ class MPISimulator(Simulator.BaseSimulator):
             for k in range(self.nvars):
                 u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
 
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
 
-
-
-    def old_exchange(self):
+    def full_exchange(self):
         ####
         # FIXME: This function can be optimized using persitent communications.
         # Also by overlapping some of the communications north/south and east/west of GPU and intra-node
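full_exchange() below stages each side through the pinned buffers and pairs non-blocking Isend/Irecv calls by tag (tags 4*self.nt + 0..3 distinguish the four sides). A minimal two-rank sketch of that handshake with mpi4py (hypothetical buffer size; run with mpiexec -n 2):

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    other = 1 - rank                               # assumes exactly two ranks

    out_buf = np.full(16, rank, dtype=np.float32)  # halo data for the neighbour
    in_buf = np.empty(16, dtype=np.float32)        # neighbour's halo lands here

    # Post the receive before the send so neither rank can block.
    recv = comm.Irecv(in_buf, source=other, tag=0)
    send = comm.Isend(out_buf, dest=other, tag=0)

    recv.Wait()   # halo now available in in_buf
    send.Wait()   # out_buf may now be reused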
@@ -484,8 +474,7 @@ class MPISimulator(Simulator.BaseSimulator):
         ####
 
         #Download from the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         if self.north is not None:
             for k in range(self.nvars):
@@ -495,10 +484,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
 
         self.sim.stream.synchronize()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
+
         #Send/receive to north/south neighbours
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         comm_send = []
         comm_recv = []
@@ -513,10 +502,10 @@ class MPISimulator(Simulator.BaseSimulator):
         for comm in comm_recv:
             comm.wait()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
+
         #Upload to the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         if self.north is not None:
             for k in range(self.nvars):
@@ -524,25 +513,23 @@ class MPISimulator(Simulator.BaseSimulator):
         if self.south is not None:
             for k in range(self.nvars):
                 self.sim.u0[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
+
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
 
         #Wait for sending to complete
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         for comm in comm_send:
             comm.wait()
-
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         ####
         # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
         ####
 
         #Download from the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_download"] += time.time()
 
         if self.east is not None:
             for k in range(self.nvars):
@@ -552,10 +539,10 @@ class MPISimulator(Simulator.BaseSimulator):
                 self.sim.u0[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
 
         self.sim.stream.synchronize()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_download"] += time.time()
+
         #Send/receive to east/west neighbours
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
 
         comm_send = []
         comm_recv = []
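The comment above ("including ghost cells that have been filled in by north-south transfer") is what makes explicit corner messages unnecessary: the north-south phase fills the halo rows first, and the east-west phase then sends full columns, halo rows included, so corner data reaches diagonal neighbours in two hops. A toy numpy illustration of that ordering on a single array (1-cell halo; the in-place copies stand in for the actual MPI transfers):

    import numpy as np

    h = 1                                  # halo width (hypothetical)
    u = np.zeros((6, 6))
    u[h:-h, h:-h] = 1.0                    # interior owned by this rank

    # Phase 1: north/south exchange fills interior columns of the halo rows.
    u[0, h:-h] = u[h, h:-h]
    u[-1, h:-h] = u[-2, h:-h]

    # Phase 2: east/west exchange sends FULL columns, halo rows included,
    # so the corners inherit the values written in phase 1.
    u[:, 0] = u[:, h]
    u[:, -1] = u[:, -h - 1]

    assert u[0, 0] == 1.0                  # corner filled, no corner message needed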
@@ -566,15 +553,14 @@ class MPISimulator(Simulator.BaseSimulator):
             comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
             comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
 
-
         #Wait for incoming transfers to complete
         for comm in comm_recv:
             comm.wait()
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
+
         #Upload to the GPU
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_upload"] += time.time()
 
         if self.east is not None:
             for k in range(self.nvars):
@@ -583,13 +569,12 @@ class MPISimulator(Simulator.BaseSimulator):
             for k in range(self.nvars):
                 self.sim.u0[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
 
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_upload"] += time.time()
+
         #Wait for sending to complete
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+        self.profiling_data_mpi["start"]["t_mpi_halo_exchange_sendreceive"] += time.time()
         for comm in comm_send:
             comm.wait()
-
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+
+        self.profiling_data_mpi["end"]["t_mpi_halo_exchange_sendreceive"] += time.time()
diff --git a/mpiTesting.py b/mpiTesting.py
index 42e3bad..11b8189 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -183,6 +183,8 @@ if(args.profile and MPI.COMM_WORLD.rank == 0):
     profiling_data["slurm_job_id"] = job_id
     profiling_data["n_cuda_devices"] = str(num_cuda_devices)
     profiling_data["n_processes"] = str(MPI.COMM_WORLD.size)
+    profiling_data["git_hash"] = Common.getGitHash()
+    profiling_data["git_status"] = Common.getGitStatus()
 
     with open(profiling_file, "w") as write_file:
         json.dump(profiling_data, write_file)
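The two new profiling fields reuse the same Common.getGitHash / Common.getGitStatus helpers that runSimulation already writes into the netCDF attributes, so a profiling JSON can be traced back to the exact code revision. The actual implementations live in GPUSimulators/Common.py and are not shown in this patch; a typical sketch of such helpers would be:

    import subprocess

    def getGitHash():
        # Commit id of HEAD; assumes the script runs inside the git checkout.
        return subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip()

    def getGitStatus():
        # Porcelain status: an empty string means the working tree is clean.
        return subprocess.check_output(["git", "status", "--porcelain"]).decode()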