diff --git a/GPUSimulators/Common.py b/GPUSimulators/Common.py
index 73764e8..a965231 100644
--- a/GPUSimulators/Common.py
+++ b/GPUSimulators/Common.py
@@ -546,9 +546,9 @@ class CudaArray2D:
         #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
         #Allocate host memory
         #The following fails, don't know why (crashes python)
-        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)32)
+        cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
         #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
-        cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
+        #cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
 
         assert nx == cpu_data.shape[1]
         assert ny == cpu_data.shape[0]
@@ -759,7 +759,7 @@ class ArakawaA2D:
             assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
             cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]
 
-        stream.synchronize()
+        #stream.synchronize()
         return cpu_variables
 
     def check(self):
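For context on the CudaArray2D.download change above: pycuda's asynchronous copies only overlap with other work when the host buffer is page-locked, while a pageable numpy array (or the memory-pool buffer that is commented out here) makes the transfer effectively synchronous. The sketch below is illustrative only, with made-up sizes and names, and is not part of the patch.

```python
# Minimal sketch of the pinned-memory pattern the change above relies on.
import numpy as np
import pycuda.autoinit          # creates a CUDA context; requires a GPU
import pycuda.driver as cuda

nx, ny = 64, 32                 # illustrative sizes
stream = cuda.Stream()

# Pinned, portable host buffer, same call as in CudaArray2D.download above
cpu_data = cuda.pagelocked_empty((ny, nx), dtype=np.float32,
                                 mem_flags=cuda.host_alloc_flags.PORTABLE)

gpu_data = cuda.mem_alloc(cpu_data.nbytes)
cuda.memcpy_htod_async(gpu_data, cpu_data, stream)   # asynchronous upload
cuda.memcpy_dtoh_async(cpu_data, gpu_data, stream)   # asynchronous download
stream.synchronize()            # block only when the host actually needs the data
```

The PORTABLE flag makes the pinned allocation usable from any CUDA context in the process, which is convenient once several streams or MPI ranks touch the same host buffers.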
diff --git a/GPUSimulators/EE2D_KP07_dimsplit.py b/GPUSimulators/EE2D_KP07_dimsplit.py
index f16af97..5059f9d 100644
--- a/GPUSimulators/EE2D_KP07_dimsplit.py
+++ b/GPUSimulators/EE2D_KP07_dimsplit.py
@@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
         }, jit_compile_args={})
         self.kernel = module.get_function("KP07DimsplitKernel")
-        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
+        self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")
 
 
         #Create data by uploading to device
@@ -109,11 +109,14 @@ class EE2D_KP07_dimsplit (BaseSimulator):
         self.cfl_data.fill(self.dt, stream=self.stream)
 
-    def substep(self, dt, step_number):
-        self.substepDimsplit(0.5*dt, step_number)
-
-    def substepDimsplit(self, dt, substep):
-        self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
+    def substep(self, dt, step_number, external=True, internal=True):
+        self.substepDimsplit(0.5*dt, step_number, external, internal)
+
+    def substepDimsplit(self, dt, substep, external, internal):
+        if external and internal:
+            #print("COMPLETE DOMAIN (dt=" + str(dt) + ")")
+
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
                 self.nx, self.ny,
                 self.dx, self.dy, dt,
                 self.g,
@@ -129,8 +132,99 @@ class EE2D_KP07_dimsplit (BaseSimulator):
                 self.u1[1].data.gpudata, self.u1[1].data.strides[0],
                 self.u1[2].data.gpudata, self.u1[2].data.strides[0],
                 self.u1[3].data.gpudata, self.u1[3].data.strides[0],
-                self.cfl_data.gpudata)
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, self.ny)
+            self.u0, self.u1 = self.u1, self.u0
+            return
+
+        if external and not internal:
+            #############################################################
+            # XXX: Only treating north and south external cells for now #
+            #############################################################
+
+            ns_grid_size = (self.grid_size[0], 1)
+
+            # NORTH
+            # (x0, y0) x (x1, y1)
+            # (0, ny-y_halo) x (nx, ny)
+            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
+                self.nx, self.ny,
+                self.dx, self.dy, dt,
+                self.g,
+                self.gamma,
+                self.theta,
+                substep,
+                self.boundary_conditions,
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, self.ny - int(self.u0[0].y_halo),
+                self.nx, self.ny)
+
+            # SOUTH
+            # (x0, y0) x (x1, y1)
+            # (0, 0) x (nx, y_halo)
+            self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
+                self.nx, self.ny,
+                self.dx, self.dy, dt,
+                self.g,
+                self.gamma,
+                self.theta,
+                substep,
+                self.boundary_conditions,
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, 0,
+                self.nx, int(self.u0[0].y_halo))
+            return
+
+        if internal and not external:
+            #############################################################
+            # XXX: Only treating north and south external cells for now #
+            # So we need to include west and east boundary here!        #
+            #############################################################
+
+            # INTERNAL DOMAIN
+            # (x0, y0) x (x1, y1)
+            # (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
+            self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
+                self.nx, self.ny,
+                self.dx, self.dy, dt,
+                self.g,
+                self.gamma,
+                self.theta,
+                substep,
+                self.boundary_conditions,
+                self.u0[0].data.gpudata, self.u0[0].data.strides[0],
+                self.u0[1].data.gpudata, self.u0[1].data.strides[0],
+                self.u0[2].data.gpudata, self.u0[2].data.strides[0],
+                self.u0[3].data.gpudata, self.u0[3].data.strides[0],
+                self.u1[0].data.gpudata, self.u1[0].data.strides[0],
+                self.u1[1].data.gpudata, self.u1[1].data.strides[0],
+                self.u1[2].data.gpudata, self.u1[2].data.strides[0],
+                self.u1[3].data.gpudata, self.u1[3].data.strides[0],
+                self.cfl_data.gpudata,
+                0, int(self.u0[0].y_halo),
+                self.nx, self.ny - int(self.u0[0].y_halo))
+            return
+
+    def swapBuffers(self):
         self.u0, self.u1 = self.u1, self.u0
+        return
 
     def getOutput(self):
         return self.u0
@@ -138,6 +232,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
     def check(self):
         self.u0.check()
         self.u1.check()
+        return
 
     def computeDt(self):
         max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
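To make the new external/internal split easier to follow, the sketch below lists the (x0, y0) x (x1, y1) extents that substepDimsplit passes to the kernel in each branch, assuming (as the XXX comments note) that only the north and south halo rows are treated as external for now. The helper name is hypothetical and the snippet is not part of the patch.

```python
# Illustrative helper (hypothetical name, not part of the patch): the
# (x0, y0, x1, y1) extents that substepDimsplit passes to the kernel above.
def subdomain_extents(nx, ny, y_halo):
    return {
        "full":     (0, 0,            nx, ny),
        "north":    (0, ny - y_halo,  nx, ny),
        "south":    (0, 0,            nx, y_halo),
        "internal": (0, y_halo,       nx, ny - y_halo),
    }

# e.g. a 1024x1024 subdomain with two ghost rows per side
print(subdomain_extents(nx=1024, ny=1024, y_halo=2))
```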
diff --git a/GPUSimulators/MPISimulator.py b/GPUSimulators/MPISimulator.py
index f2a58d7..e49738e 100644
--- a/GPUSimulators/MPISimulator.py
+++ b/GPUSimulators/MPISimulator.py
@@ -26,6 +26,8 @@
 import numpy as np
 from mpi4py import MPI
 import time
+import pycuda.driver as cuda
+import nvtx
 
 
@@ -239,25 +241,36 @@ class MPISimulator(Simulator.BaseSimulator):
 
         #Get coordinate of this node
         #and handle global boundary conditions
+        #new_boundary_conditions = Simulator.BoundaryCondition({
+        #    'north': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'south': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'east': Simulator.BoundaryCondition.Type.Dirichlet,
+        #    'west': Simulator.BoundaryCondition.Type.Dirichlet
+        #})
         new_boundary_conditions = Simulator.BoundaryCondition({
-            'north': Simulator.BoundaryCondition.Type.Dirichlet,
-            'south': Simulator.BoundaryCondition.Type.Dirichlet,
-            'east': Simulator.BoundaryCondition.Type.Dirichlet,
-            'west': Simulator.BoundaryCondition.Type.Dirichlet
+            'north': Simulator.BoundaryCondition.Type.Reflective,
+            'south': Simulator.BoundaryCondition.Type.Reflective,
+            'east': Simulator.BoundaryCondition.Type.Reflective,
+            'west': Simulator.BoundaryCondition.Type.Reflective
         })
         gi, gj = grid.getCoordinate()
+        print("gi: " + str(gi) + ", gj: " + str(gj))
         if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
             self.west = None
             new_boundary_conditions.west = boundary_conditions.west;
+            print("Rank: " + str(self.grid.comm.rank) + ": WEST")
         if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
             self.south = None
             new_boundary_conditions.south = boundary_conditions.south;
+            print("Rank: " + str(self.grid.comm.rank) + ": SOUTH")
         if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
             self.east = None
             new_boundary_conditions.east = boundary_conditions.east;
+            print("Rank: " + str(self.grid.comm.rank) + ": EAST")
         if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
             self.north = None
             new_boundary_conditions.north = boundary_conditions.north;
+            print("Rank: " + str(self.grid.comm.rank) + ": NORTH")
         sim.setBoundaryConditions(new_boundary_conditions)
 
         #Get number of variables
{:s}".format(self.grid.comm.rank, MPI.Get_processor_name())) self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time() - - def substep(self, dt, step_number): - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] += time.time() + #Init ghost cells (with data from neighboring subdomains) + self.download_for_exchange(self.sim.u0) self.exchange() + self.upload_for_exchange(self.sim.u0) + + def substep(self, dt, step_number): + nvtx.mark("substep start", color="red") - self.sim.stream.synchronize() # only necessary for profiling! - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] += time.time() - self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() + self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() + nvtx.mark("substep internal", color="red") + self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded + self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() - self.sim.substep(dt, step_number) + self.profiling_data_mpi["start"]["t_step_mpi"] += time.time() + nvtx.mark("substep external", color="blue") + self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells" + self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() - self.sim.stream.synchronize() # only necessary for profiling! - if self.profiling_data_mpi["n_time_steps"] > 0: - self.profiling_data_mpi["end"]["t_step_mpi"] += time.time() + # NOTE: Need to download from u1, as u0<->u1 switch is not performed yet + nvtx.mark("download", color="red") + self.sim.swapBuffers() + self.download_for_exchange(self.sim.u0) + + nvtx.mark("sync", color="red") + self.sim.stream.synchronize() + nvtx.mark("MPI", color="green") + self.exchange() + nvtx.mark("upload", color="red") + self.upload_for_exchange(self.sim.u0) + + self.sim.internal_stream.synchronize() + self.profiling_data_mpi["n_time_steps"] += 1 def getOutput(self): @@ -348,8 +377,114 @@ class MPISimulator(Simulator.BaseSimulator): x1 = x0 + width y1 = y0 + height return [x0, x1, y0, y1] - + + def download_for_exchange(self, u): + if self.profiling_data_mpi["n_time_steps"] > 0: + self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time() + + # North-south + if self.north is not None: + for k in range(self.nvars): + u[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n) + #self.out_n[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_n) + if self.south is not None: + for k in range(self.nvars): + u[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s) + #self.out_s[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_s) + + # East-west + if self.east is not None: + for k in range(self.nvars): + u[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e) + #self.out_e[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_e) + if self.west is not None: + for k in range(self.nvars): + u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w) + #self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w) + + if self.profiling_data_mpi["n_time_steps"] > 0: + self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time() + def exchange(self): + if self.profiling_data_mpi["n_time_steps"] > 0: + 
self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + + #### + # First transfer internal cells north-south + #### + + #Send/receive to north/south neighbours + comm_send = [] + comm_recv = [] + if self.north is not None: + comm_send += [self.grid.comm.Isend(self.out_n, dest=self.north, tag=4*self.nt + 0)] + comm_recv += [self.grid.comm.Irecv(self.in_n, source=self.north, tag=4*self.nt + 1)] + if self.south is not None: + comm_send += [self.grid.comm.Isend(self.out_s, dest=self.south, tag=4*self.nt + 1)] + comm_recv += [self.grid.comm.Irecv(self.in_s, source=self.south, tag=4*self.nt + 0)] + + #Wait for incoming transfers to complete + for comm in comm_recv: + comm.wait() + + #Wait for sending to complete + for comm in comm_send: + comm.wait() + + #### + # Then transfer east-west including ghost cells that have been filled in by north-south transfer above + #### + #Send/receive to east/west neighbours + comm_send = [] + comm_recv = [] + if self.east is not None: + comm_send += [self.grid.comm.Isend(self.out_e, dest=self.east, tag=4*self.nt + 2)] + comm_recv += [self.grid.comm.Irecv(self.in_e, source=self.east, tag=4*self.nt + 3)] + if self.west is not None: + comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)] + comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)] + + + #Wait for incoming transfers to complete + for comm in comm_recv: + comm.wait() + + #Wait for sending to complete + for comm in comm_send: + comm.wait() + + if self.profiling_data_mpi["n_time_steps"] > 0: + self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time() + + def upload_for_exchange(self, u): + if self.profiling_data_mpi["n_time_steps"] > 0: + self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time() + + # North-south + if self.north is not None: + for k in range(self.nvars): + u[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n) + if self.south is not None: + for k in range(self.nvars): + u[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s) + + # East-west + if self.east is not None: + for k in range(self.nvars): + u[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e) + if self.west is not None: + for k in range(self.nvars): + u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w) + + if self.profiling_data_mpi["n_time_steps"] > 0: + self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time() + + + + + + + def old_exchange(self): #### # FIXME: This function can be optimized using persitent communications. 
diff --git a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
index 9ccdd40..238718c 100644
--- a/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
+++ b/GPUSimulators/cuda/EE2D_KP07_dimsplit.cu
@@ -147,23 +147,34 @@ __global__ void KP07DimsplitKernel(
         float* E1_ptr_, int E1_pitch_,
 
         //Output CFL
-        float* cfl_) {
+        float* cfl_,
+
+        //Subarea of internal domain to compute
+        int x0=0, int y0=0,
+        int x1=0, int y1=0) {
+
+    if(x1 == 0)
+        x1 = nx_;
+
+    if(y1 == 0)
+        y1 = ny_;
+
     const unsigned int w = BLOCK_WIDTH;
     const unsigned int h = BLOCK_HEIGHT;
     const unsigned int gc_x = 2;
     const unsigned int gc_y = 2;
     const unsigned int vars = 4;
-    
+
     //Shared memory variables
     __shared__ float Q[4][h+2*gc_y][w+2*gc_x];
     __shared__ float Qx[4][h+2*gc_y][w+2*gc_x];
     __shared__ float F[4][h+2*gc_y][w+2*gc_x];
 
     //Read into shared memory
-    readBlock( rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
-    readBlock(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
-    readBlock(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
-    readBlock(  E0_ptr_,     E0_pitch_,  Q[3], nx_, ny_, boundary_conditions_);
+    readBlock( rho0_ptr_,   rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
+    readBlock(  E0_ptr_,     E0_pitch_,  Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
 
     //Step 0 => evolve x first, then y
     if (step_ == 0) {
@@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(
 
     // Write to main memory for all internal cells
-    writeBlock( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1);
-    writeBlock(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
-    writeBlock(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
-    writeBlock( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1);
+    writeBlock( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
+    writeBlock( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);
 
     //Compute the CFL for this block
     if (cfl_ != NULL) {
diff --git a/GPUSimulators/cuda/common.h b/GPUSimulators/cuda/common.h
index eaffd00..5463294 100644
--- a/GPUSimulators/cuda/common.h
+++ b/GPUSimulators/cuda/common.h
@@ -321,7 +321,9 @@ template
 inline __device__ void readBlock(float* ptr_, int pitch_,
                 float Q[h+2*gc_y][w+2*gc_x],
                 const int nx_, const int ny_,
-                const int boundary_conditions_) {
+                const int boundary_conditions_,
+                int x0, int y0,
+                int x1, int y1) {
     //Index of block within domain
     const int bx = blockDim.x * blockIdx.x;
     const int by = blockDim.y * blockIdx.y;
@@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
 
     //Loop over all variables
     for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
-        int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
-        l = min(l, ny_+2*gc_y-1);
+        int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
+        l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
         float* row = (float*) ((char*) ptr_ + pitch_*l);
 
         for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
-            int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
-            k = min(k, nx_+2*gc_x-1);
+            int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
+            k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));
 
             //Read from global memory
             Q[j][i] = row[k];
@@ -358,14 +360,20 @@ template
 inline __device__ void writeBlock(float* ptr_, int pitch_,
                 float shmem[h+2*gc_y][w+2*gc_x],
                 const int nx_, const int ny_,
-                int rk_step_, int rk_order_) {
+                int rk_step_, int rk_order_,
+                int x0, int y0,
+                int x1, int y1) {
     //Index of cell within domain
-    const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
-    const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
+    const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
+    const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;
+
+    //In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
+    const int max_ti = min(nx_+gc_x, x1+gc_x);
+    const int max_tj = min(ny_+gc_y, y1+gc_y);
 
     //Only write internal cells
-    if (ti < nx_+gc_x && tj < ny_+gc_y) {
+    if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
         //Index of thread within block
         const int tx = threadIdx.x + gc_x;
         const int ty = threadIdx.y + gc_y;
@@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
             row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
         }
     }
+
+    // DEBUG
+    //row[ti] = 99.0;
     }
 }
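The writeBlock guard above restricts each launch to the requested subarea. The pure-Python mock below (not part of the patch) reproduces the same bound arithmetic for gc_y = 2 and shows that the south strip, the interior and the north strip used by EE2D_KP07_dimsplit.py partition the internal rows exactly, with no overlap and no gap.

```python
# Pure-Python mock of the writeBlock row guard above (illustrative only).
def writable_rows(ny, y0, y1, gc_y=2):
    max_tj = min(ny + gc_y, y1 + gc_y)
    return [tj for tj in range(ny + 2 * gc_y)
            if y0 + gc_y <= tj < max_tj]

ny, y_halo = 16, 2
print(writable_rows(ny, 0, ny))                  # full domain: rows 2..17
print(writable_rows(ny, 0, y_halo))              # south strip: rows 2..3
print(writable_rows(ny, y_halo, ny - y_halo))    # interior:    rows 4..15
print(writable_rows(ny, ny - y_halo, ny))        # north strip: rows 16..17
```

The same bounds are applied per thread in x as well, with gc_x = gc_y = 2 for this kernel.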
diff --git a/mpiTesting.py b/mpiTesting.py
index a7fbd64..8d4e087 100644
--- a/mpiTesting.py
+++ b/mpiTesting.py
@@ -106,6 +106,10 @@ cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
 ####
 # Set initial conditions
 ####
+
+# DEBUGGING - setting random seed
+np.random.seed(42)
+
 logger.info("Generating initial conditions")
 nx = args.nx
 ny = args.ny
@@ -113,7 +117,9 @@ ny = args.ny
 dt = 0.00001
 gamma = 1.4
 
-save_times = np.linspace(0, 0.1, 2)
+#save_times = np.linspace(0, 0.000009, 2)
+#save_times = np.linspace(0, 0.000099, 11)
+save_times = np.linspace(0, 0.000099, 2)
 outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
 save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
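For reference, with dt = 0.00001 the shortened save_times above amounts to an initial snapshot plus a final one after roughly ten time steps, which keeps the debugging runs short; the snippet below only assumes numpy is available.

```python
import numpy as np

# With dt = 0.00001 this saves the initial state and one snapshot after ~10 steps.
print(np.linspace(0, 0.000099, 2))    # [0.0e+00 9.9e-05]
print(np.linspace(0, 0.000099, 11))   # the commented-out 11-snapshot variant
```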