mirror of
				https://github.com/smyalygames/FiniteVolumeGPU.git
				synced 2025-10-31 12:57:41 +01:00 
			
		
		
		
	Async mem ops
This commit is contained in:
		
							parent
							
								
									dcef56a1b9
								
							
						
					
					
						commit
						6c8bac6f7b
					
				| @ -546,9 +546,9 @@ class CudaArray2D: | ||||
|             #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny) | ||||
|             #Allocate host memory | ||||
|             #The following fails, don't know why (crashes python) | ||||
|             #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)32) | ||||
|             cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) | ||||
|             #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32) | ||||
|             cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32) | ||||
|             #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32) | ||||
|              | ||||
|         assert nx == cpu_data.shape[1] | ||||
|         assert ny == cpu_data.shape[0] | ||||
| @ -759,7 +759,7 @@ class ArakawaA2D: | ||||
|             assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i) | ||||
|             cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)] | ||||
| 
 | ||||
|         stream.synchronize() | ||||
|         #stream.synchronize() | ||||
|         return cpu_variables | ||||
|          | ||||
|     def check(self): | ||||
|  | ||||
| @ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator): | ||||
|                                         },  | ||||
|                                         jit_compile_args={}) | ||||
|         self.kernel = module.get_function("KP07DimsplitKernel") | ||||
|         self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP") | ||||
|         self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii") | ||||
|          | ||||
|          | ||||
|         #Create data by uploading to device | ||||
| @ -109,11 +109,14 @@ class EE2D_KP07_dimsplit (BaseSimulator): | ||||
|         self.cfl_data.fill(self.dt, stream=self.stream) | ||||
|                          | ||||
|      | ||||
|     def substep(self, dt, step_number): | ||||
|         self.substepDimsplit(0.5*dt, step_number) | ||||
|                  | ||||
|     def substepDimsplit(self, dt, substep): | ||||
|         self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,  | ||||
|     def substep(self, dt, step_number, external=True, internal=True): | ||||
|             self.substepDimsplit(0.5*dt, step_number, external, internal) | ||||
|      | ||||
|     def substepDimsplit(self, dt, substep, external, internal): | ||||
|         if external and internal: | ||||
|             #print("COMPLETE DOMAIN (dt=" + str(dt) + ")") | ||||
| 
 | ||||
|             self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,  | ||||
|                 self.nx, self.ny,  | ||||
|                 self.dx, self.dy, dt,  | ||||
|                 self.g,  | ||||
| @ -129,8 +132,99 @@ class EE2D_KP07_dimsplit (BaseSimulator): | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],  | ||||
|                 self.u1[3].data.gpudata, self.u1[3].data.strides[0], | ||||
|                 self.cfl_data.gpudata) | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0,  | ||||
|                 self.nx, self.ny) | ||||
|             self.u0, self.u1 = self.u1, self.u0 | ||||
|             return | ||||
|          | ||||
|         if external and not internal: | ||||
|             ############################################################# | ||||
|             # XXX: Only treating north and south external cells for now # | ||||
|             ############################################################# | ||||
| 
 | ||||
|             ns_grid_size = (self.grid_size[0], 1) | ||||
| 
 | ||||
|             # NORTH | ||||
|             # (x0, y0) x (x1, y1) | ||||
|             #  (0, ny-y_halo) x (nx, ny) | ||||
|             self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,  | ||||
|                 self.nx, self.ny, | ||||
|                 self.dx, self.dy, dt,  | ||||
|                 self.g,  | ||||
|                 self.gamma,  | ||||
|                 self.theta,  | ||||
|                 substep, | ||||
|                 self.boundary_conditions,  | ||||
|                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],  | ||||
|                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],  | ||||
|                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],  | ||||
|                 self.u0[3].data.gpudata, self.u0[3].data.strides[0],  | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],  | ||||
|                 self.u1[3].data.gpudata, self.u1[3].data.strides[0], | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, self.ny - int(self.u0[0].y_halo), | ||||
|                 self.nx, self.ny) | ||||
| 
 | ||||
|             # SOUTH | ||||
|             # (x0, y0) x (x1, y1) | ||||
|             #   (0, 0) x (nx, y_halo) | ||||
|             self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,  | ||||
|                 self.nx, self.ny, | ||||
|                 self.dx, self.dy, dt,  | ||||
|                 self.g,  | ||||
|                 self.gamma,  | ||||
|                 self.theta,  | ||||
|                 substep, | ||||
|                 self.boundary_conditions,  | ||||
|                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],  | ||||
|                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],  | ||||
|                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],  | ||||
|                 self.u0[3].data.gpudata, self.u0[3].data.strides[0],  | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],  | ||||
|                 self.u1[3].data.gpudata, self.u1[3].data.strides[0], | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, 0, | ||||
|                 self.nx, int(self.u0[0].y_halo)) | ||||
|             return | ||||
| 
 | ||||
|         if internal and not external: | ||||
|             ############################################################# | ||||
|             # XXX: Only treating north and south external cells for now # | ||||
|             #      So we need to include west and east boundary here!   # | ||||
|             ############################################################# | ||||
| 
 | ||||
|             # INTERNAL DOMAIN | ||||
|             #         (x0, y0) x (x1, y1) | ||||
|             # (x_halo, y_halo) x (nx - x_halo, ny - y_halo) | ||||
|             self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,  | ||||
|                 self.nx, self.ny,  | ||||
|                 self.dx, self.dy, dt,  | ||||
|                 self.g,  | ||||
|                 self.gamma,  | ||||
|                 self.theta,  | ||||
|                 substep, | ||||
|                 self.boundary_conditions,  | ||||
|                 self.u0[0].data.gpudata, self.u0[0].data.strides[0],  | ||||
|                 self.u0[1].data.gpudata, self.u0[1].data.strides[0],  | ||||
|                 self.u0[2].data.gpudata, self.u0[2].data.strides[0],  | ||||
|                 self.u0[3].data.gpudata, self.u0[3].data.strides[0],  | ||||
|                 self.u1[0].data.gpudata, self.u1[0].data.strides[0],  | ||||
|                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],  | ||||
|                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],  | ||||
|                 self.u1[3].data.gpudata, self.u1[3].data.strides[0], | ||||
|                 self.cfl_data.gpudata, | ||||
|                 0, int(self.u0[0].y_halo), | ||||
|                 self.nx, self.ny - int(self.u0[0].y_halo)) | ||||
|             return | ||||
| 
 | ||||
|     def swapBuffers(self): | ||||
|         self.u0, self.u1 = self.u1, self.u0 | ||||
|         return | ||||
|          | ||||
|     def getOutput(self): | ||||
|         return self.u0 | ||||
| @ -138,6 +232,7 @@ class EE2D_KP07_dimsplit (BaseSimulator): | ||||
|     def check(self): | ||||
|         self.u0.check() | ||||
|         self.u1.check() | ||||
|         return | ||||
|          | ||||
|     def computeDt(self): | ||||
|         max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get(); | ||||
|  | ||||
| @ -26,6 +26,8 @@ import numpy as np | ||||
| from mpi4py import MPI | ||||
| import time | ||||
| 
 | ||||
| import pycuda.driver as cuda | ||||
| import nvtx | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| @ -239,25 +241,36 @@ class MPISimulator(Simulator.BaseSimulator): | ||||
|          | ||||
|         #Get coordinate of this node | ||||
|         #and handle global boundary conditions | ||||
|         #new_boundary_conditions = Simulator.BoundaryCondition({ | ||||
|         #    'north': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|         #    'south': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|         #    'east': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|         #    'west': Simulator.BoundaryCondition.Type.Dirichlet | ||||
|         #}) | ||||
|         new_boundary_conditions = Simulator.BoundaryCondition({ | ||||
|             'north': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|             'south': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|             'east': Simulator.BoundaryCondition.Type.Dirichlet, | ||||
|             'west': Simulator.BoundaryCondition.Type.Dirichlet | ||||
|             'north': Simulator.BoundaryCondition.Type.Reflective, | ||||
|             'south': Simulator.BoundaryCondition.Type.Reflective, | ||||
|             'east': Simulator.BoundaryCondition.Type.Reflective, | ||||
|             'west': Simulator.BoundaryCondition.Type.Reflective | ||||
|         }) | ||||
|         gi, gj = grid.getCoordinate() | ||||
|         print("gi: " + str(gi) + ", gj: " + str(gj)) | ||||
|         if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic): | ||||
|             self.west = None | ||||
|             new_boundary_conditions.west = boundary_conditions.west; | ||||
|             print("Rank: " + str(self.grid.comm.rank) + ": WEST") | ||||
|         if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic): | ||||
|             self.south = None | ||||
|             new_boundary_conditions.south = boundary_conditions.south; | ||||
|             print("Rank: " + str(self.grid.comm.rank) + ": SOUTH") | ||||
|         if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic): | ||||
|             self.east = None | ||||
|             new_boundary_conditions.east = boundary_conditions.east; | ||||
|             print("Rank: " + str(self.grid.comm.rank) + ": EAST") | ||||
|         if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic): | ||||
|             self.north = None | ||||
|             new_boundary_conditions.north = boundary_conditions.north; | ||||
|             print("Rank: " + str(self.grid.comm.rank) + ": NORTH") | ||||
|         sim.setBoundaryConditions(new_boundary_conditions) | ||||
|                  | ||||
|         #Get number of variables | ||||
| @ -286,36 +299,52 @@ class MPISimulator(Simulator.BaseSimulator): | ||||
|         #Note that east and west also transfer ghost cells | ||||
|         #whilst north/south only transfer internal cells | ||||
|         #Reuses the width/height defined in the read-extets above | ||||
|         self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) | ||||
|         self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) | ||||
|         self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) | ||||
|         self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32) | ||||
|          | ||||
|         self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32) | ||||
|         self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32) | ||||
|         self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32) | ||||
|         self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32) | ||||
| 
 | ||||
|         #Allocate data for sending | ||||
|         self.out_e = np.empty_like(self.in_e) | ||||
|         self.out_w = np.empty_like(self.in_w) | ||||
|         self.out_n = np.empty_like(self.in_n) | ||||
|         self.out_s = np.empty_like(self.in_s) | ||||
|         self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e) | ||||
|         self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w) | ||||
|         self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n) | ||||
|         self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s) | ||||
|          | ||||
|         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name())) | ||||
|         self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time() | ||||
|          | ||||
|     def substep(self, dt, step_number): | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] += time.time() | ||||
| 
 | ||||
|         #Init ghost cells (with data from neighboring subdomains) | ||||
|         self.download_for_exchange(self.sim.u0) | ||||
|         self.exchange() | ||||
|         self.upload_for_exchange(self.sim.u0) | ||||
|      | ||||
|     def substep(self, dt, step_number): | ||||
|         nvtx.mark("substep start", color="red") | ||||
| 
 | ||||
|         self.sim.stream.synchronize() # only necessary for profiling! | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] += time.time() | ||||
|             self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() | ||||
|         self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() | ||||
|         nvtx.mark("substep internal", color="red") | ||||
|         self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded | ||||
|         self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() | ||||
| 
 | ||||
|         self.sim.substep(dt, step_number) | ||||
|         self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() | ||||
|         nvtx.mark("substep external", color="blue") | ||||
|         self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells" | ||||
|         self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() | ||||
| 
 | ||||
|         self.sim.stream.synchronize() # only necessary for profiling! | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() | ||||
|         # NOTE: Need to download from u1, as u0<->u1 switch is not performed yet | ||||
|         nvtx.mark("download", color="red") | ||||
|         self.sim.swapBuffers() | ||||
|         self.download_for_exchange(self.sim.u0) | ||||
| 
 | ||||
|         nvtx.mark("sync", color="red") | ||||
|         self.sim.stream.synchronize() | ||||
|         nvtx.mark("MPI", color="green") | ||||
|         self.exchange() | ||||
|         nvtx.mark("upload", color="red") | ||||
|         self.upload_for_exchange(self.sim.u0) | ||||
| 
 | ||||
|         self.sim.internal_stream.synchronize() | ||||
|          | ||||
|         self.profiling_data_mpi["n_time_steps"] += 1 | ||||
| 
 | ||||
|     def getOutput(self): | ||||
| @ -348,8 +377,114 @@ class MPISimulator(Simulator.BaseSimulator): | ||||
|         x1 = x0 + width | ||||
|         y1 = y0 + height | ||||
|         return [x0, x1, y0, y1] | ||||
|          | ||||
| 
 | ||||
|     def download_for_exchange(self, u): | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time() | ||||
| 
 | ||||
|         # North-south | ||||
|         if self.north is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n) | ||||
|                 #self.out_n[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_n) | ||||
|         if self.south is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s) | ||||
|                 #self.out_s[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_s) | ||||
| 
 | ||||
|         # East-west | ||||
|         if self.east is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e) | ||||
|                 #self.out_e[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_e) | ||||
|         if self.west is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w) | ||||
|                 #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w) | ||||
|           | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time() | ||||
| 
 | ||||
|     def exchange(self): | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() | ||||
| 
 | ||||
|         #### | ||||
|         # First transfer internal cells north-south | ||||
|         #### | ||||
| 
 | ||||
|         #Send/receive to north/south neighbours | ||||
|         comm_send = [] | ||||
|         comm_recv = [] | ||||
|         if self.north is not None: | ||||
|             comm_send += [self.grid.comm.Isend(self.out_n, dest=self.north, tag=4*self.nt + 0)] | ||||
|             comm_recv += [self.grid.comm.Irecv(self.in_n, source=self.north, tag=4*self.nt + 1)] | ||||
|         if self.south is not None: | ||||
|             comm_send += [self.grid.comm.Isend(self.out_s, dest=self.south, tag=4*self.nt + 1)] | ||||
|             comm_recv += [self.grid.comm.Irecv(self.in_s, source=self.south, tag=4*self.nt + 0)] | ||||
|          | ||||
|         #Wait for incoming transfers to complete | ||||
|         for comm in comm_recv: | ||||
|             comm.wait() | ||||
|          | ||||
|         #Wait for sending to complete | ||||
|         for comm in comm_send: | ||||
|             comm.wait() | ||||
|          | ||||
|         #### | ||||
|         # Then transfer east-west including ghost cells that have been filled in by north-south transfer above | ||||
|         #### | ||||
|         #Send/receive to east/west neighbours | ||||
|         comm_send = [] | ||||
|         comm_recv = [] | ||||
|         if self.east is not None: | ||||
|             comm_send += [self.grid.comm.Isend(self.out_e, dest=self.east, tag=4*self.nt + 2)] | ||||
|             comm_recv += [self.grid.comm.Irecv(self.in_e, source=self.east, tag=4*self.nt + 3)] | ||||
|         if self.west is not None: | ||||
|             comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)] | ||||
|             comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)] | ||||
|          | ||||
|          | ||||
|         #Wait for incoming transfers to complete | ||||
|         for comm in comm_recv: | ||||
|             comm.wait() | ||||
|          | ||||
|         #Wait for sending to complete | ||||
|         for comm in comm_send: | ||||
|             comm.wait() | ||||
|              | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() | ||||
| 
 | ||||
|     def upload_for_exchange(self, u): | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time() | ||||
| 
 | ||||
|         # North-south | ||||
|         if self.north is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n) | ||||
|         if self.south is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s) | ||||
|          | ||||
|         # East-west | ||||
|         if self.east is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e) | ||||
|         if self.west is not None: | ||||
|             for k in range(self.nvars): | ||||
|                 u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w) | ||||
|          | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time() | ||||
|      | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     def old_exchange(self): | ||||
|         #### | ||||
|         # FIXME: This function can be optimized using persitent communications.  | ||||
|         # Also by overlapping some of the communications north/south and east/west of GPU and intra-node | ||||
| @ -470,4 +605,3 @@ class MPISimulator(Simulator.BaseSimulator): | ||||
|              | ||||
|         if self.profiling_data_mpi["n_time_steps"] > 0: | ||||
|             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() | ||||
|      | ||||
| @ -147,23 +147,34 @@ __global__ void KP07DimsplitKernel( | ||||
|         float* E1_ptr_, int E1_pitch_,  | ||||
|          | ||||
|         //Output CFL | ||||
|         float* cfl_) { | ||||
|         float* cfl_, | ||||
| 
 | ||||
|         //Subarea of internal domain to compute | ||||
|         int x0=0, int y0=0, | ||||
|         int x1=0, int y1=0) { | ||||
| 
 | ||||
|     if(x1 == 0) | ||||
|         x1 = nx_; | ||||
| 
 | ||||
|     if(y1 == 0) | ||||
|         y1 = ny_; | ||||
|      | ||||
|     const unsigned int w = BLOCK_WIDTH; | ||||
|     const unsigned int h = BLOCK_HEIGHT; | ||||
|     const unsigned int gc_x = 2; | ||||
|     const unsigned int gc_y = 2; | ||||
|     const unsigned int vars = 4; | ||||
|          | ||||
|      | ||||
|     //Shared memory variables | ||||
|     __shared__ float  Q[4][h+2*gc_y][w+2*gc_x]; | ||||
|     __shared__ float Qx[4][h+2*gc_y][w+2*gc_x]; | ||||
|     __shared__ float  F[4][h+2*gc_y][w+2*gc_x]; | ||||
|      | ||||
|     //Read into shared memory | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_, boundary_conditions_); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
|     readBlock<w, h, gc_x, gc_y,  1,  1>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1); | ||||
| 
 | ||||
|     //Step 0 => evolve x first, then y | ||||
|     if (step_ == 0) { | ||||
| @ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel( | ||||
| 
 | ||||
|      | ||||
|     // Write to main memory for all internal cells | ||||
|     writeBlock<w, h, gc_x, gc_y>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_, 0, 1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|     writeBlock<w, h, gc_x, gc_y>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1); | ||||
|      | ||||
|     //Compute the CFL for this block | ||||
|     if (cfl_ != NULL) { | ||||
|  | ||||
| @ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y> | ||||
| inline __device__ void readBlock(float* ptr_, int pitch_, | ||||
|                 float Q[h+2*gc_y][w+2*gc_x],  | ||||
|                 const int nx_, const int ny_, | ||||
|                 const int boundary_conditions_) { | ||||
|                 const int boundary_conditions_, | ||||
|                  int x0, int y0, | ||||
|                  int x1, int y1) { | ||||
|     //Index of block within domain
 | ||||
|     const int bx = blockDim.x * blockIdx.x; | ||||
|     const int by = blockDim.y * blockIdx.y; | ||||
| @ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_, | ||||
|     //Loop over all variables
 | ||||
|     for (int j=threadIdx.y; j<h+2*gc_y; j+=h) { | ||||
|         //Handle periodic boundary conditions here
 | ||||
|         int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_); | ||||
|         l = min(l, ny_+2*gc_y-1); | ||||
|         int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_); | ||||
|         l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1)); | ||||
|         float* row = (float*) ((char*) ptr_ + pitch_*l); | ||||
|          | ||||
|         for (int i=threadIdx.x; i<w+2*gc_x; i+=w) { | ||||
|             //Handle periodic boundary conditions here
 | ||||
|             int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_); | ||||
|             k = min(k, nx_+2*gc_x-1); | ||||
|             int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_); | ||||
|             k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1)); | ||||
|              | ||||
|             //Read from global memory
 | ||||
|             Q[j][i] = row[k]; | ||||
| @ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y> | ||||
| inline __device__ void writeBlock(float* ptr_, int pitch_, | ||||
|                  float shmem[h+2*gc_y][w+2*gc_x], | ||||
|                  const int nx_, const int ny_, | ||||
|                  int rk_step_, int rk_order_) { | ||||
|                  int rk_step_, int rk_order_, | ||||
|                  int x0, int y0, | ||||
|                  int x1, int y1) { | ||||
|      | ||||
|     //Index of cell within domain
 | ||||
|     const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x; | ||||
|     const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y; | ||||
|     const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0; | ||||
|     const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0; | ||||
| 
 | ||||
|     //In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
 | ||||
|     const int max_ti = min(nx_+gc_x, x1+gc_x); | ||||
|     const int max_tj = min(ny_+gc_y, y1+gc_y); | ||||
|      | ||||
|     //Only write internal cells
 | ||||
|     if (ti < nx_+gc_x && tj < ny_+gc_y) { | ||||
|     if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) { | ||||
|         //Index of thread within block
 | ||||
|         const int tx = threadIdx.x + gc_x; | ||||
|         const int ty = threadIdx.y + gc_y; | ||||
| @ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_, | ||||
|                 row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx]; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // DEBUG
 | ||||
|         //row[ti] = 99.0;
 | ||||
|     } | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -106,6 +106,10 @@ cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False) | ||||
| #### | ||||
| # Set initial conditions | ||||
| #### | ||||
| 
 | ||||
| # DEBUGGING - setting random seed | ||||
| np.random.seed(42) | ||||
| 
 | ||||
| logger.info("Generating initial conditions") | ||||
| nx = args.nx | ||||
| ny = args.ny | ||||
| @ -113,7 +117,9 @@ ny = args.ny | ||||
| dt = 0.00001 | ||||
| 
 | ||||
| gamma = 1.4 | ||||
| save_times = np.linspace(0, 0.1, 2) | ||||
| #save_times = np.linspace(0, 0.000009, 2) | ||||
| #save_times = np.linspace(0, 0.000099, 11) | ||||
| save_times = np.linspace(0, 0.000099, 2) | ||||
| outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc" | ||||
| save_var_names = ['rho', 'rho_u', 'rho_v', 'E'] | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Martin Lilleeng Sætra
						Martin Lilleeng Sætra