Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git

Commit 6c8bac6f7b (parent dcef56a1b9): Async mem ops
```diff
@@ -546,9 +546,9 @@ class CudaArray2D:
             #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
             #Allocate host memory
             #The following fails, don't know why (crashes python)
-            #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
+            cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
             #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
-            cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
+            #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
             
         assert nx == cpu_data.shape[1]
         assert ny == cpu_data.shape[0]
```
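The switch from the pooled host buffer to `cuda.pagelocked_empty` matters because only page-locked (pinned) host memory can be the target of a truly asynchronous device-to-host copy; with pageable memory the driver falls back to a synchronous staging copy. A minimal sketch of the pattern in a standalone PyCUDA session (buffer names and sizes are illustrative, not the repository's):

```python
import numpy as np
import pycuda.autoinit  # creates a context on the default device
import pycuda.driver as cuda

stream = cuda.Stream()

# Pinned host buffer: required for memcpy_dtoh_async to actually overlap
cpu_data = cuda.pagelocked_empty((64, 64), dtype=np.float32,
                                 mem_flags=cuda.host_alloc_flags.PORTABLE)

# Illustrative device buffer of the same size
gpu_data = cuda.mem_alloc(cpu_data.nbytes)
cuda.memset_d32(gpu_data, 0, cpu_data.size)

# Asynchronous download: returns immediately, copy is queued on the stream
cuda.memcpy_dtoh_async(cpu_data, gpu_data, stream)

# ... other work can be queued or performed here ...

stream.synchronize()  # cpu_data is only valid after the stream has drained
```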
```diff
@@ -759,7 +759,7 @@ class ArakawaA2D:
             assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
             cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
 
-        stream.synchronize()
+        #stream.synchronize()
         return cpu_variables
         
     def check(self):
```
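With the `synchronize()` commented out, `download(..., asynch=True)` now returns while the copies may still be in flight, so the caller owns the synchronization point. A hedged sketch of the resulting contract, where `arrays` stands in for the repository's device-array objects and only the `download(stream, asynch=True)` call is taken from the code above:

```python
def download_all(arrays, stream):
    """Queue async downloads of several device arrays, then sync once.

    Each returned host buffer is NOT safe to read until the stream has
    been synchronized; one sync point covers every queued copy.
    """
    host_buffers = [a.download(stream, asynch=True) for a in arrays]
    stream.synchronize()
    return host_buffers
```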
```diff
@@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                                         }, 
                                         jit_compile_args={})
         self.kernel = module.get_function("KP07DimsplitKernel")
-        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
         
         
         #Create data by uploading to device
```
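The four `i`s appended to the format string correspond one-to-one to the four new `int x0, int y0, int x1, int y1` kernel parameters added below: `prepare` takes one character per argument (`i` for a 32-bit int, `f` for a float, `P` for a pointer). A small sketch of the correspondence with a deliberately tiny, made-up kernel signature (in real code `prepare` would be called once at build time, not per launch):

```python
def launch_subarea(kernel, grid_size, block_size, stream,
                   nx, ny, dt, ptr, pitch, x0, y0, x1, y1):
    # Hypothetical kernel for illustration only:
    #   __global__ void k(int nx, int ny, float dt, float* p, int pitch,
    #                     int x0, int y0, int x1, int y1)
    kernel.prepare("iif" + "Pi" + "iiii")
    # prepared_async_call then takes the arguments in exactly that order
    kernel.prepared_async_call(grid_size, block_size, stream,
                               nx, ny, dt, ptr, pitch, x0, y0, x1, y1)
```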
```diff
@@ -109,11 +109,14 @@ class EE2D_KP07_dimsplit (BaseSimulator):
         self.cfl_data.fill(self.dt, stream=self.stream)
                         
     
-    def substep(self, dt, step_number):
-        self.substepDimsplit(0.5*dt, step_number)
-                
-    def substepDimsplit(self, dt, substep):
-        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
+    def substep(self, dt, step_number, external=True, internal=True):
+        self.substepDimsplit(0.5*dt, step_number, external, internal)
+    
+    def substepDimsplit(self, dt, substep, external, internal):
+        if external and internal:
+            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
+
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, 
                 self.nx, self.ny, 
                 self.dx, self.dy, dt, 
                 self.g, 
```
```diff
@@ -129,8 +132,99 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
                 self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-                self.cfl_data.gpudata)
-        self.u0, self.u1 = self.u1, self.u0
+                self.cfl_data.gpudata,
+                0, 0, 
+                self.nx, self.ny)
+            self.u0, self.u1 = self.u1, self.u0
+            return
+        
+        if external and not internal:
+            #############################################################
+            # XXX: Only treating north and south external cells for now #
+            #############################################################
+
+            ns_grid_size = (self.grid_size[0], 1)
+
+            # NORTH
+            # (x0, y0) x (x1, y1)
+            #  (0, ny-y_halo) x (nx, ny)
+            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, 
+                self.nx, self.ny,
+                self.dx, self.dy, dt, 
+                self.g, 
+                self.gamma, 
+                self.theta, 
+                substep,
+                self.boundary_conditions, 
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, self.ny - int(self.u0[0].y_halo),
+                self.nx, self.ny)
+
+            # SOUTH
+            # (x0, y0) x (x1, y1)
+            #   (0, 0) x (nx, y_halo)
+            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream, 
+                self.nx, self.ny,
+                self.dx, self.dy, dt, 
+                self.g, 
+                self.gamma, 
+                self.theta, 
+                substep,
+                self.boundary_conditions, 
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, int(self.u0[0].y_halo))
+            return
+
+        if internal and not external:
+            #############################################################
+            # XXX: Only treating north and south external cells for now #
+            #      So we need to include west and east boundary here!   #
+            #############################################################
+
+            # INTERNAL DOMAIN
+            #         (x0, y0) x (x1, y1)
+            # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream, 
+                self.nx, self.ny, 
+                self.dx, self.dy, dt, 
+                self.g, 
+                self.gamma, 
+                self.theta, 
+                substep,
+                self.boundary_conditions, 
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0], 
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0], 
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0], 
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0], 
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0], 
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0], 
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0], 
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, int(self.u0[0].y_halo),
+                self.nx, self.ny - int(self.u0[0].y_halo))
+            return
+
+    def swapBuffers(self):
+        self.u0, self.u1 = self.u1, self.u0
+        return
         
     def getOutput(self):
         return self.u0
@@ -138,6 +232,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
     def check(self):
         self.u0.check()
         self.u1.check()
+        return
         
     def computeDt(self):
         max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
```
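The point of the `external`/`internal` split is overlap: the interior update is queued on a separate `internal_stream`, while the thin north/south strips run on the main stream and can be exchanged over MPI while the interior is still computing. A hedged sketch of the intended call pattern, assuming a simulator `sim` built like the class above (the driver loop and `exchange_halo` callable are illustrative, not the repository's API):

```python
def overlapped_step(sim, dt, exchange_halo):
    """One simulator step with interior/boundary overlap (illustrative)."""
    for step in range(2):  # the two dimensional-split substeps
        # Interior: queued on sim.internal_stream, keeps the GPU busy
        sim.substep(dt, step, internal=True, external=False)
        # Boundary strips: queued on sim.stream, finish quickly
        sim.substep(dt, step, external=True, internal=False)

        sim.swapBuffers()           # updated state becomes u0
        sim.stream.synchronize()    # boundary results now ready on the host
        exchange_halo()             # MPI while the interior still computes
        sim.internal_stream.synchronize()  # interior done before next substep
```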
|  | |||||||
| @ -26,6 +26,8 @@ import numpy as np | |||||||
| from mpi4py import MPI | from mpi4py import MPI | ||||||
| import time | import time | ||||||
| 
 | 
 | ||||||
|  | import pycuda.driver as cuda | ||||||
|  | import nvtx | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
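`nvtx` here is the Python NVTX binding, used purely to annotate the timeline so the compute/communication overlap is visible in a profiler such as Nsight Systems; it does not change program behavior. A small example of the marker calls used in `substep()` below (the range form is shown only for context and is not used in this commit):

```python
import nvtx

# Instantaneous timeline markers, as used in substep() below
nvtx.mark("substep internal", color="red")
nvtx.mark("MPI", color="green")

# The library also supports timed ranges, e.g. as a context manager
with nvtx.annotate("halo exchange", color="blue"):
    pass  # ... work to be timed ...
```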
```diff
@@ -239,25 +241,36 @@ class MPISimulator(Simulator.BaseSimulator):
         
         #Get coordinate of this node
         #and handle global boundary conditions
+        #new_boundary_conditions = Simulator.BoundaryCondition({
+        #    'north': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'south': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'east': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'west': Simulator.BoundaryCondition.Type.Dirichlet
+        #})
         new_boundary_conditions = Simulator.BoundaryCondition({
-            'north': Simulator.BoundaryCondition.Type.Dirichlet,
-            'south': Simulator.BoundaryCondition.Type.Dirichlet,
-            'east': Simulator.BoundaryCondition.Type.Dirichlet,
-            'west': Simulator.BoundaryCondition.Type.Dirichlet
+            'north': Simulator.BoundaryCondition.Type.Reflective,
+            'south': Simulator.BoundaryCondition.Type.Reflective,
+            'east': Simulator.BoundaryCondition.Type.Reflective,
+            'west': Simulator.BoundaryCondition.Type.Reflective
         })
         gi, gj = grid.getCoordinate()
+        print("gi: " + str(gi) + ", gj: " + str(gj))
         if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
             self.west = None
             new_boundary_conditions.west = boundary_conditions.west;
+            print("Rank: " + str(self.grid.comm.rank) + ": WEST")
         if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
             self.south = None
             new_boundary_conditions.south = boundary_conditions.south;
+            print("Rank: " + str(self.grid.comm.rank) + ": SOUTH")
         if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
             self.east = None
             new_boundary_conditions.east = boundary_conditions.east;
+            print("Rank: " + str(self.grid.comm.rank) + ": EAST")
         if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
             self.north = None
             new_boundary_conditions.north = boundary_conditions.north;
+            print("Rank: " + str(self.grid.comm.rank) + ": NORTH")
         sim.setBoundaryConditions(new_boundary_conditions)
                 
         #Get number of variables
```
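The four `if` blocks implement a simple rule: a subdomain edge keeps the user's global boundary condition only if it lies on the edge of the process grid (and the condition is not periodic); every other edge has a neighbor and is handled by halo exchange instead. A dependency-free sketch of that rule (pure illustration, not the repository's API):

```python
def global_edges(gi, gj, grid_nx, grid_ny):
    """Return which edges of subdomain (gi, gj) lie on the global boundary."""
    return {
        "west":  gi == 0,
        "south": gj == 0,
        "east":  gi == grid_nx - 1,
        "north": gj == grid_ny - 1,
    }

# For a 2x2 process grid, the rank at (0, 0) owns the global west and south:
print(global_edges(0, 0, 2, 2))
# {'west': True, 'south': True, 'east': False, 'north': False}
```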
```diff
@@ -286,36 +299,52 @@ class MPISimulator(Simulator.BaseSimulator):
         #Note that east and west also transfer ghost cells
         #whilst north/south only transfer internal cells
         #Reuses the width/height defined in the read-extets above
-        self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
-        self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
-        self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
-        self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
+        self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
+        self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
+        self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
+        self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)
         
         #Allocate data for sending
-        self.out_e = np.empty_like(self.in_e)
-        self.out_w = np.empty_like(self.in_w)
-        self.out_n = np.empty_like(self.in_n)
-        self.out_s = np.empty_like(self.in_s)
+        self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e)
+        self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w)
+        self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n)
+        self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s)
         
         self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
         self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()
-        
-    def substep(self, dt, step_number):
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] += time.time()
 
+        #Init ghost cells (with data from neighboring subdomains)
+        self.download_for_exchange(self.sim.u0)
         self.exchange()
+        self.upload_for_exchange(self.sim.u0)
+    
+    def substep(self, dt, step_number):
+        nvtx.mark("substep start", color="red")
 
-        self.sim.stream.synchronize() # only necessary for profiling!
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] += time.time()
-            self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        nvtx.mark("substep internal", color="red")
+        self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
+        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
 
-        self.sim.substep(dt, step_number)
+        self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
+        nvtx.mark("substep external", color="blue")
+        self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
+        self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
 
-        self.sim.stream.synchronize() # only necessary for profiling!
-        if self.profiling_data_mpi["n_time_steps"] > 0:
-            self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
+        # NOTE: Need to download from u1, as u0<->u1 switch is not performed yet
+        nvtx.mark("download", color="red")
+        self.sim.swapBuffers()
+        self.download_for_exchange(self.sim.u0)
+
+        nvtx.mark("sync", color="red")
+        self.sim.stream.synchronize()
+        nvtx.mark("MPI", color="green")
+        self.exchange()
+        nvtx.mark("upload", color="red")
+        self.upload_for_exchange(self.sim.u0)
+
+        self.sim.internal_stream.synchronize()
+        
         self.profiling_data_mpi["n_time_steps"] += 1
 
     def getOutput(self):
```
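The NOTE above is about buffer ordering: the kernels write the updated solution into `u1`, and only `swapBuffers()` makes it visible as `u0`, so the halo download must happen after the swap (or, equivalently, read from `u1` directly). A toy, dependency-light illustration of why the order matters (all names here are placeholders):

```python
import numpy as np

# Double buffering in miniature: the "kernel" writes the new state to u1.
u0 = np.zeros(4, dtype=np.float32)  # current state
u1 = np.empty_like(u0)              # scratch for the updated state

u1[:] = u0 + 1.0   # substep: result lands in u1, u0 is now stale

# Halo data must come from the *updated* state, so swap before reading u0
# (downloading u0 before the swap would export the stale zeros).
u0, u1 = u1, u0
halo = u0[:1].copy()  # stand-in for download_for_exchange(u0)
assert halo[0] == 1.0
```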
```diff
@@ -348,8 +377,114 @@ class MPISimulator(Simulator.BaseSimulator):
         x1 = x0 + width
         y1 = y0 + height
         return [x0, x1, y0, y1]
-        
+
+    def download_for_exchange(self, u):
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()
+
+        # North-south
+        if self.north is not None:
+            for k in range(self.nvars):
+                u[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
+                #self.out_n[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_n)
+        if self.south is not None:
+            for k in range(self.nvars):
+                u[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
+                #self.out_s[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_s)
+
+        # East-west
+        if self.east is not None:
+            for k in range(self.nvars):
+                u[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
+                #self.out_e[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_e)
+        if self.west is not None:
+            for k in range(self.nvars):
+                u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
+                #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w)
+        
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()
+
     def exchange(self):
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+
+        ####
+        # First transfer internal cells north-south
+        ####
+
+        #Send/receive to north/south neighbours
+        comm_send = []
+        comm_recv = []
+        if self.north is not None:
+            comm_send += [self.grid.comm.Isend(self.out_n, dest=self.north, tag=4*self.nt + 0)]
+            comm_recv += [self.grid.comm.Irecv(self.in_n, source=self.north, tag=4*self.nt + 1)]
+        if self.south is not None:
+            comm_send += [self.grid.comm.Isend(self.out_s, dest=self.south, tag=4*self.nt + 1)]
+            comm_recv += [self.grid.comm.Irecv(self.in_s, source=self.south, tag=4*self.nt + 0)]
+        
+        #Wait for incoming transfers to complete
+        for comm in comm_recv:
+            comm.wait()
+        
+        #Wait for sending to complete
+        for comm in comm_send:
+            comm.wait()
+        
+        ####
+        # Then transfer east-west including ghost cells that have been filled in by north-south transfer above
+        ####
+        #Send/receive to east/west neighbours
+        comm_send = []
+        comm_recv = []
+        if self.east is not None:
+            comm_send += [self.grid.comm.Isend(self.out_e, dest=self.east, tag=4*self.nt + 2)]
+            comm_recv += [self.grid.comm.Irecv(self.in_e, source=self.east, tag=4*self.nt + 3)]
+        if self.west is not None:
+            comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
+            comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]
+        
+        #Wait for incoming transfers to complete
+        for comm in comm_recv:
+            comm.wait()
+        
+        #Wait for sending to complete
+        for comm in comm_send:
+            comm.wait()
+            
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
+
+    def upload_for_exchange(self, u):
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()
+
+        # North-south
+        if self.north is not None:
+            for k in range(self.nvars):
+                u[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
+        if self.south is not None:
+            for k in range(self.nvars):
+                u[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)
+        
+        # East-west
+        if self.east is not None:
+            for k in range(self.nvars):
+                u[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
+        if self.west is not None:
+            for k in range(self.nvars):
+                u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)
+        
+        if self.profiling_data_mpi["n_time_steps"] > 0:
+            self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()
+    
+
+
+
+
+
+    def old_exchange(self):
         ####
         # FIXME: This function can be optimized using persitent communications. 
         # Also by overlapping some of the communications north/south and east/west of GPU and intra-node
@@ -470,4 +605,3 @@ class MPISimulator(Simulator.BaseSimulator):
             
         if self.profiling_data_mpi["n_time_steps"] > 0:
             self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
-    
```
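The new `exchange()` is the standard non-blocking pattern: post `Isend`/`Irecv` pairs, then wait on all requests; the tags pair each send with the matching receive on the neighbor (north sends with tag `+0` and receives `+1`, mirrored on the southern side). A self-contained mpi4py sketch of the same pattern between two ranks (run with `mpirun -np 2`; buffer sizes are illustrative):

```python
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
other = 1 - rank  # assumes exactly two ranks

out_buf = np.full(8, rank, dtype=np.float32)
in_buf = np.empty(8, dtype=np.float32)

# Post both transfers before waiting: no deadlock, and they may overlap
reqs = [comm.Isend(out_buf, dest=other, tag=rank),
        comm.Irecv(in_buf, source=other, tag=other)]
MPI.Request.Waitall(reqs)

assert in_buf[0] == other
```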
```diff
@@ -147,23 +147,34 @@ __global__ void KP07DimsplitKernel(
         float* E1_ptr_, int E1_pitch_, 
         
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
+    
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc_x = 2;
     const unsigned int gc_y = 2;
     const unsigned int vars = 4;
-        
+    
     //Shared memory variables
     __shared__ float  Q[4][h+2*gc_y][w+2*gc_x];
     __shared__ float Qx[4][h+2*gc_y][w+2*gc_x];
     __shared__ float  F[4][h+2*gc_y][w+2*gc_x];
     
     //Read into shared memory
-    readBlock<w, h, gc_x, gc_y,  1,  1>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
-    readBlock<w, h, gc_x, gc_y,  1,  1>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_, boundary_conditions_);
+    readBlock<w, h, gc_x, gc_y,  1,  1>(  rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y, -1,  1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock<w, h, gc_x, gc_y,  1,  1>(    E0_ptr_,     E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
 
     //Step 0 => evolve x first, then y
     if (step_ == 0) {
@@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
 
     
     // Write to main memory for all internal cells
-    writeBlock<w, h, gc_x, gc_y>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
-    writeBlock<w, h, gc_x, gc_y>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_, 0, 1);
+    writeBlock<w, h, gc_x, gc_y>(  rho1_ptr_,   rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock<w, h, gc_x, gc_y>(    E1_ptr_,     E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);
     
     //Compute the CFL for this block
     if (cfl_ != NULL) {
```
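The convention is that `(x0, y0)` to `(x1, y1)` bounds the subarea of internal cells to update, with `x1 == 0`/`y1 == 0` meaning "to the end of the domain". A small sketch of the extents the host passes for the cases above (pure Python arithmetic; `y_halo` and the domain sizes are illustrative):

```python
nx, ny, y_halo = 1024, 512, 2

full     = (0, 0,           nx, ny)           # external and internal
north    = (0, ny - y_halo, nx, ny)           # top boundary strip
south    = (0, 0,           nx, y_halo)       # bottom boundary strip
internal = (0, y_halo,      nx, ny - y_halo)  # everything in between

# The two strips plus the interior tile the full domain exactly once:
rows = lambda y0, y1: y1 - y0
assert rows(north[1], north[3]) + rows(south[1], south[3]) \
     + rows(internal[1], internal[3]) == ny
```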
```diff
@@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
 inline __device__ void readBlock(float* ptr_, int pitch_,
                 float Q[h+2*gc_y][w+2*gc_x], 
                 const int nx_, const int ny_,
-                const int boundary_conditions_) {
+                const int boundary_conditions_,
+                int x0, int y0,
+                int x1, int y1) {
     //Index of block within domain
     const int bx = blockDim.x * blockIdx.x;
     const int by = blockDim.y * blockIdx.y;
@@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
     //Loop over all variables
     for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
         //Handle periodic boundary conditions here
-        int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
-        l = min(l, ny_+2*gc_y-1);
+        int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
+        l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
         float* row = (float*) ((char*) ptr_ + pitch_*l);
         
         for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
             //Handle periodic boundary conditions here
-            int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
-            k = min(k, nx_+2*gc_x-1);
+            int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
+            k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));
             
             //Read from global memory
             Q[j][i] = row[k];
@@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
 inline __device__ void writeBlock(float* ptr_, int pitch_,
                  float shmem[h+2*gc_y][w+2*gc_x],
                  const int nx_, const int ny_,
-                 int rk_step_, int rk_order_) {
+                 int rk_step_, int rk_order_,
+                 int x0, int y0,
+                 int x1, int y1) {
     
     //Index of cell within domain
-    const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
-    const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
+    const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
+    const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;
+
+    //In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
+    const int max_ti = min(nx_+gc_x, x1+gc_x);
+    const int max_tj = min(ny_+gc_y, y1+gc_y);
     
     //Only write internal cells
-    if (ti < nx_+gc_x && tj < ny_+gc_y) {
+    if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
         //Index of thread within block
         const int tx = threadIdx.x + gc_x;
         const int ty = threadIdx.y + gc_y;
@@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
                 row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
             }
         }
+
+        // DEBUG
+        //row[ti] = 99.0;
     }
 }
 
```
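In `writeBlock`, the new guard both shifts the thread's cell by `(x0, y0)` and clamps the write window so that a strip launch cannot write outside its subarea. A quick pure-Python check of the predicate, mirroring only the index arithmetic (all values hypothetical):

```python
gc_x = gc_y = 2
nx, ny = 1024, 512

def writes(ti, tj, x0, y0, x1, y1):
    """Mirror of writeBlock's guard: does thread cell (ti, tj) write?"""
    max_ti = min(nx + gc_x, x1 + gc_x)
    max_tj = min(ny + gc_y, y1 + gc_y)
    return (x0 + gc_x <= ti < max_ti) and (y0 + gc_y <= tj < max_tj)

# South strip: y0=0, y1=y_halo=2, so only internal rows tj in [2, 4) write
assert writes(10, 2, 0, 0, nx, 2)
assert writes(10, 3, 0, 0, nx, 2)
assert not writes(10, 4, 0, 0, nx, 2)
```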
```diff
@@ -106,6 +106,10 @@ cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
 ####
 # Set initial conditions
 ####
+
+# DEBUGGING - setting random seed
+np.random.seed(42)
+
 logger.info("Generating initial conditions")
 nx = args.nx
 ny = args.ny
@@ -113,7 +117,9 @@ ny = args.ny
 dt = 0.00001
 
 gamma = 1.4
-save_times = np.linspace(0, 0.1, 2)
+#save_times = np.linspace(0, 0.000009, 2)
+#save_times = np.linspace(0, 0.000099, 11)
+save_times = np.linspace(0, 0.000099, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
 
```
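Seeding NumPy identically on every rank means each process generates the same global initial condition before carving out its own subdomain, which keeps multi-rank runs comparable with single-rank ones. A tiny hedged illustration (the function and sizes are made up for the example):

```python
import numpy as np

def initial_conditions(seed=42, nx=8, ny=8):
    # Same seed on every rank -> bit-identical global field everywhere
    rng = np.random.RandomState(seed)
    return rng.random_sample((ny, nx)).astype(np.float32)

a = initial_conditions()
b = initial_conditions()
assert np.array_equal(a, b)
```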
Author: Martin Lilleeng Sætra