Mirror of https://github.com/smyalygames/FiniteVolumeGPU.git (synced 2025-05-18 06:24:13 +02:00)

Commit 6c8bac6f7b: Async mem ops
Parent: dcef56a1b9
@@ -546,9 +546,9 @@ class CudaArray2D:
#self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
#Allocate host memory
#The following fails, don't know why (crashes python)
#cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
cpu_data = cuda.pagelocked_empty((int(ny), int(nx)), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE)
#Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)
#cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

assert nx == cpu_data.shape[1]
assert ny == cpu_data.shape[0]
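This hunk moves the host-side download buffer to cuda.pagelocked_empty. Pinned (page-locked) host memory is what lets the device-to-host copy run asynchronously with respect to the host. A minimal sketch of that pattern with PyCUDA, assuming an active context; buffer sizes are illustrative, and the real CudaArray2D download uses a pitched 2D copy rather than this contiguous one:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray

    stream = cuda.Stream()

    # Pinned host buffer: required for a truly asynchronous D2H copy
    ny, nx = 512, 512
    cpu_data = cuda.pagelocked_empty((ny, nx), dtype=np.float32,
                                     mem_flags=cuda.host_alloc_flags.PORTABLE)

    # Illustrative device data to download
    gpu_data = gpuarray.zeros((ny, nx), dtype=np.float32)

    # Returns immediately; cpu_data is only valid after the stream has drained
    cuda.memcpy_dtoh_async(cpu_data, gpu_data.gpudata, stream)
    stream.synchronize()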
@@ -759,7 +759,7 @@ class ArakawaA2D:
assert i < len(self.gpu_variables), "Variable {:d} is out of range".format(i)
cpu_variables += [self.gpu_variables[i].download(stream, asynch=True)]

stream.synchronize()
#stream.synchronize()
return cpu_variables

def check(self):
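With the synchronize commented out, download() now hands back host buffers while the copies may still be in flight, so the caller is responsible for draining the stream before reading them. A hedged usage sketch; sim, the stream and the four-variable unpacking are illustrative:

    # Asynchronous download of the conserved variables ...
    rho, rho_u, rho_v, E = sim.u0.download(sim.stream)

    # ... which must not be read before the copies have completed
    sim.stream.synchronize()
    print(rho.max(), E.min())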
@@ -90,7 +90,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
},
jit_compile_args={})
self.kernel = module.get_function("KP07DimsplitKernel")
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiP")
self.kernel.prepare("iiffffffiiPiPiPiPiPiPiPiPiPiiii")


#Create data by uploading to device
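The four extra "i" characters appended to the prepare() string match the four new int arguments (x0, y0, x1, y1) that bound the sub-area the kernel updates. A small sketch of PyCUDA's prepare / prepared_async_call mechanism; the kernel and its argument list are illustrative, not the project's kernel:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    module = SourceModule("""
    __global__ void fill(int nx, int ny, float value, float *data) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        int j = blockIdx.y*blockDim.y + threadIdx.y;
        if (i < nx && j < ny) {
            data[j*nx + i] = value;
        }
    }
    """)

    kernel = module.get_function("fill")
    kernel.prepare("iifP")   # int, int, float, pointer: one char per kernel argument

    nx, ny = 64, 64
    data = cuda.mem_alloc(nx*ny*4)
    stream = cuda.Stream()
    kernel.prepared_async_call((nx//16, ny//16), (16, 16, 1), stream,
                               nx, ny, np.float32(1.0), data)
    stream.synchronize()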
@@ -109,11 +109,14 @@ class EE2D_KP07_dimsplit (BaseSimulator):
self.cfl_data.fill(self.dt, stream=self.stream)


def substep(self, dt, step_number):
self.substepDimsplit(0.5*dt, step_number)

def substepDimsplit(self, dt, substep):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
def substep(self, dt, step_number, external=True, internal=True):
self.substepDimsplit(0.5*dt, step_number, external, internal)

def substepDimsplit(self, dt, substep, external, internal):
if external and internal:
#print("COMPLETE DOMAIN (dt=" + str(dt) + ")")

self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
@@ -129,8 +132,99 @@ class EE2D_KP07_dimsplit (BaseSimulator):
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata)
self.cfl_data.gpudata,
0, 0,
self.nx, self.ny)
self.u0, self.u1 = self.u1, self.u0
return

if external and not internal:
#############################################################
# XXX: Only treating north and south external cells for now #
#############################################################

ns_grid_size = (self.grid_size[0], 1)

# NORTH
# (x0, y0) x (x1, y1)
# (0, ny-y_halo) x (nx, ny)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, self.ny - int(self.u0[0].y_halo),
self.nx, self.ny)

# SOUTH
# (x0, y0) x (x1, y1)
# (0, 0) x (nx, y_halo)
self.kernel.prepared_async_call(ns_grid_size, self.block_size, self.stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, 0,
self.nx, int(self.u0[0].y_halo))
return

if internal and not external:
#############################################################
# XXX: Only treating north and south external cells for now #
# So we need to include west and east boundary here!        #
#############################################################

# INTERNAL DOMAIN
# (x0, y0) x (x1, y1)
# (x_halo, y_halo) x (nx - x_halo, ny - y_halo)
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.internal_stream,
self.nx, self.ny,
self.dx, self.dy, dt,
self.g,
self.gamma,
self.theta,
substep,
self.boundary_conditions,
self.u0[0].data.gpudata, self.u0[0].data.strides[0],
self.u0[1].data.gpudata, self.u0[1].data.strides[0],
self.u0[2].data.gpudata, self.u0[2].data.strides[0],
self.u0[3].data.gpudata, self.u0[3].data.strides[0],
self.u1[0].data.gpudata, self.u1[0].data.strides[0],
self.u1[1].data.gpudata, self.u1[1].data.strides[0],
self.u1[2].data.gpudata, self.u1[2].data.strides[0],
self.u1[3].data.gpudata, self.u1[3].data.strides[0],
self.cfl_data.gpudata,
0, int(self.u0[0].y_halo),
self.nx, self.ny - int(self.u0[0].y_halo))
return

def swapBuffers(self):
self.u0, self.u1 = self.u1, self.u0
return

def getOutput(self):
return self.u0
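The split substep is what enables overlap: the interior runs on self.internal_stream while the boundary strips and the halo traffic stay on self.stream. A hedged sketch of how a caller is expected to drive it (sim stands for an EE2D_KP07_dimsplit instance; the halo exchange itself is elided):

    # Interior cells on sim.internal_stream: overlaps with the halo work below
    sim.substep(dt, step_number, external=False, internal=True)

    # Boundary strips on sim.stream: these produce the data the halo exchange needs
    sim.substep(dt, step_number, external=True, internal=False)

    # The split branches do not swap u0/u1 themselves, so the caller does it
    sim.swapBuffers()

    # ... download halos, MPI exchange, upload halos on sim.stream ...

    sim.stream.synchronize()
    sim.internal_stream.synchronize()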
@@ -138,6 +232,7 @@ class EE2D_KP07_dimsplit (BaseSimulator):
def check(self):
self.u0.check()
self.u1.check()
return

def computeDt(self):
max_dt = gpuarray.min(self.cfl_data, stream=self.stream).get();
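computeDt() reduces the per-cell CFL buffer directly on the GPU before a single scalar is brought to the host. A minimal sketch of that reduction with pycuda.gpuarray; the array contents and the safety factor are illustrative:

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray

    stream = cuda.Stream()

    # Per-cell maximum admissible time step, as a kernel would have written it
    cfl_data = gpuarray.to_gpu(
        np.random.uniform(1e-5, 1e-3, (512, 512)).astype(np.float32))

    # Global minimum computed on the device; only one scalar crosses the bus
    max_dt = gpuarray.min(cfl_data, stream=stream).get()
    dt = 0.5*float(max_dt)   # illustrative safety factor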
@@ -26,6 +26,8 @@ import numpy as np
from mpi4py import MPI
import time

import pycuda.driver as cuda
import nvtx


@@ -239,25 +241,36 @@ class MPISimulator(Simulator.BaseSimulator):

#Get coordinate of this node
#and handle global boundary conditions
#new_boundary_conditions = Simulator.BoundaryCondition({
# 'north': Simulator.BoundaryCondition.Type.Dirichlet,
# 'south': Simulator.BoundaryCondition.Type.Dirichlet,
# 'east': Simulator.BoundaryCondition.Type.Dirichlet,
# 'west': Simulator.BoundaryCondition.Type.Dirichlet
#})
new_boundary_conditions = Simulator.BoundaryCondition({
'north': Simulator.BoundaryCondition.Type.Dirichlet,
'south': Simulator.BoundaryCondition.Type.Dirichlet,
'east': Simulator.BoundaryCondition.Type.Dirichlet,
'west': Simulator.BoundaryCondition.Type.Dirichlet
'north': Simulator.BoundaryCondition.Type.Reflective,
'south': Simulator.BoundaryCondition.Type.Reflective,
'east': Simulator.BoundaryCondition.Type.Reflective,
'west': Simulator.BoundaryCondition.Type.Reflective
})
gi, gj = grid.getCoordinate()
print("gi: " + str(gi) + ", gj: " + str(gj))
if (gi == 0 and boundary_conditions.west != Simulator.BoundaryCondition.Type.Periodic):
self.west = None
new_boundary_conditions.west = boundary_conditions.west;
print("Rank: " + str(self.grid.comm.rank) + ": WEST")
if (gj == 0 and boundary_conditions.south != Simulator.BoundaryCondition.Type.Periodic):
self.south = None
new_boundary_conditions.south = boundary_conditions.south;
print("Rank: " + str(self.grid.comm.rank) + ": SOUTH")
if (gi == grid.grid[0]-1 and boundary_conditions.east != Simulator.BoundaryCondition.Type.Periodic):
self.east = None
new_boundary_conditions.east = boundary_conditions.east;
print("Rank: " + str(self.grid.comm.rank) + ": EAST")
if (gj == grid.grid[1]-1 and boundary_conditions.north != Simulator.BoundaryCondition.Type.Periodic):
self.north = None
new_boundary_conditions.north = boundary_conditions.north;
print("Rank: " + str(self.grid.comm.rank) + ": NORTH")
sim.setBoundaryConditions(new_boundary_conditions)

#Get number of variables
@@ -286,36 +299,52 @@ class MPISimulator(Simulator.BaseSimulator):
#Note that east and west also transfer ghost cells
#whilst north/south only transfer internal cells
#Reuses the width/height defined in the read-extets above
self.in_e = np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
self.in_w = np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
self.in_n = np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
self.in_s = np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)

self.in_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_e[3], self.read_e[2]), dtype=np.float32)
self.in_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_w[3], self.read_w[2]), dtype=np.float32)
self.in_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_n[3], self.read_n[2]), dtype=np.float32)
self.in_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty((self.nvars, self.read_s[3], self.read_s[2]), dtype=np.float32)

#Allocate data for sending
self.out_e = np.empty_like(self.in_e)
self.out_w = np.empty_like(self.in_w)
self.out_n = np.empty_like(self.in_n)
self.out_s = np.empty_like(self.in_s)
self.out_e = cuda.pagelocked_empty((int(self.nvars), int(self.read_e[3]), int(self.read_e[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_e)
self.out_w = cuda.pagelocked_empty((int(self.nvars), int(self.read_w[3]), int(self.read_w[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_w)
self.out_n = cuda.pagelocked_empty((int(self.nvars), int(self.read_n[3]), int(self.read_n[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_n)
self.out_s = cuda.pagelocked_empty((int(self.nvars), int(self.read_s[3]), int(self.read_s[2])), dtype=np.float32, mem_flags=cuda.host_alloc_flags.PORTABLE) #np.empty_like(self.in_s)

self.logger.debug("Simlator rank {:d} initialized on {:s}".format(self.grid.comm.rank, MPI.Get_processor_name()))
self.profiling_data_mpi["end"]["t_sim_mpi_init"] = time.time()

def substep(self, dt, step_number):
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange"] += time.time()

#Init ghost cells (with data from neighboring subdomains)
self.download_for_exchange(self.sim.u0)
self.exchange()
self.upload_for_exchange(self.sim.u0)

def substep(self, dt, step_number):
nvtx.mark("substep start", color="red")

self.sim.stream.synchronize() # only necessary for profiling!
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange"] += time.time()
self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
nvtx.mark("substep internal", color="red")
self.sim.substep(dt, step_number, internal=True, external=False) # "internal ghost cells" excluded
self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()

self.sim.substep(dt, step_number)
self.profiling_data_mpi["start"]["t_step_mpi"] += time.time()
nvtx.mark("substep external", color="blue")
self.sim.substep(dt, step_number, external=True, internal=False) # only "internal ghost cells"
self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()

self.sim.stream.synchronize() # only necessary for profiling!
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi"] += time.time()
# NOTE: Need to download from u1, as u0<->u1 switch is not performed yet
nvtx.mark("download", color="red")
self.sim.swapBuffers()
self.download_for_exchange(self.sim.u0)

nvtx.mark("sync", color="red")
self.sim.stream.synchronize()
nvtx.mark("MPI", color="green")
self.exchange()
nvtx.mark("upload", color="red")
self.upload_for_exchange(self.sim.u0)

self.sim.internal_stream.synchronize()

self.profiling_data_mpi["n_time_steps"] += 1

def getOutput(self):
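The nvtx.mark() calls drop named markers on the Nsight Systems timeline so the internal/external substeps and the halo phases can be told apart when profiling. A small sketch of the nvtx Python API; the range names and sleeps are stand-ins, not the project's instrumentation:

    import time
    import nvtx

    nvtx.mark("substep start", color="red")               # instantaneous marker

    with nvtx.annotate("halo exchange", color="green"):   # named range (context manager)
        time.sleep(0.001)                                 # stand-in for MPI traffic

    rng = nvtx.start_range("interior compute", color="blue")
    time.sleep(0.001)                                     # stand-in for the kernel
    nvtx.end_range(rng)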
@@ -348,8 +377,114 @@ class MPISimulator(Simulator.BaseSimulator):
x1 = x0 + width
y1 = y0 + height
return [x0, x1, y0, y1]


def download_for_exchange(self, u):
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_download"] += time.time()

# North-south
if self.north is not None:
for k in range(self.nvars):
u[k].download(self.sim.stream, cpu_data=self.out_n[k,:,:], asynch=True, extent=self.read_n)
#self.out_n[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_n)
if self.south is not None:
for k in range(self.nvars):
u[k].download(self.sim.stream, cpu_data=self.out_s[k,:,:], asynch=True, extent=self.read_s)
#self.out_s[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_s)

# East-west
if self.east is not None:
for k in range(self.nvars):
u[k].download(self.sim.stream, cpu_data=self.out_e[k,:,:], asynch=True, extent=self.read_e)
#self.out_e[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_e)
if self.west is not None:
for k in range(self.nvars):
u[k].download(self.sim.stream, cpu_data=self.out_w[k,:,:], asynch=True, extent=self.read_w)
#self.out_w[k,:,:] = u[k].download(self.sim.stream, asynch=True, extent=self.read_w)

if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_download"] += time.time()

def exchange(self):
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()

####
# First transfer internal cells north-south
####

#Send/receive to north/south neighbours
comm_send = []
comm_recv = []
if self.north is not None:
comm_send += [self.grid.comm.Isend(self.out_n, dest=self.north, tag=4*self.nt + 0)]
comm_recv += [self.grid.comm.Irecv(self.in_n, source=self.north, tag=4*self.nt + 1)]
if self.south is not None:
comm_send += [self.grid.comm.Isend(self.out_s, dest=self.south, tag=4*self.nt + 1)]
comm_recv += [self.grid.comm.Irecv(self.in_s, source=self.south, tag=4*self.nt + 0)]

#Wait for incoming transfers to complete
for comm in comm_recv:
comm.wait()

#Wait for sending to complete
for comm in comm_send:
comm.wait()

####
# Then transfer east-west including ghost cells that have been filled in by north-south transfer above
####
#Send/receive to east/west neighbours
comm_send = []
comm_recv = []
if self.east is not None:
comm_send += [self.grid.comm.Isend(self.out_e, dest=self.east, tag=4*self.nt + 2)]
comm_recv += [self.grid.comm.Irecv(self.in_e, source=self.east, tag=4*self.nt + 3)]
if self.west is not None:
comm_send += [self.grid.comm.Isend(self.out_w, dest=self.west, tag=4*self.nt + 3)]
comm_recv += [self.grid.comm.Irecv(self.in_w, source=self.west, tag=4*self.nt + 2)]

#Wait for incoming transfers to complete
for comm in comm_recv:
comm.wait()

#Wait for sending to complete
for comm in comm_send:
comm.wait()

if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()

def upload_for_exchange(self, u):
if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["start"]["t_step_mpi_halo_exchange_upload"] += time.time()

# North-south
if self.north is not None:
for k in range(self.nvars):
u[k].upload(self.sim.stream, self.in_n[k,:,:], extent=self.write_n)
if self.south is not None:
for k in range(self.nvars):
u[k].upload(self.sim.stream, self.in_s[k,:,:], extent=self.write_s)

# East-west
if self.east is not None:
for k in range(self.nvars):
u[k].upload(self.sim.stream, self.in_e[k,:,:], extent=self.write_e)
if self.west is not None:
for k in range(self.nvars):
u[k].upload(self.sim.stream, self.in_w[k,:,:], extent=self.write_w)

if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_upload"] += time.time()


def old_exchange(self):
####
# FIXME: This function can be optimized using persitent communications.
# Also by overlapping some of the communications north/south and east/west of GPU and intra-node
@@ -470,4 +605,3 @@ class MPISimulator(Simulator.BaseSimulator):

if self.profiling_data_mpi["n_time_steps"] > 0:
self.profiling_data_mpi["end"]["t_step_mpi_halo_exchange_sendreceive"] += time.time()
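The exchange itself is plain mpi4py point-to-point traffic over the pinned host buffers, with receives posted before sends and both waited on per direction. A reduced, hedged sketch of the same Isend/Irecv pattern between two ranks; buffer shapes and the tag are illustrative:

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    other = 1 - rank                       # assumes exactly two ranks

    out_halo = np.full((4, 2, 128), float(rank), dtype=np.float32)
    in_halo = np.empty_like(out_halo)

    # Post the receive, then the send, then wait for both to complete
    reqs = [comm.Irecv(in_halo, source=other, tag=0),
            comm.Isend(out_halo, dest=other, tag=0)]
    MPI.Request.Waitall(reqs)

    assert np.all(in_halo == float(other))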
@@ -147,23 +147,34 @@ __global__ void KP07DimsplitKernel(
float* E1_ptr_, int E1_pitch_,

//Output CFL
float* cfl_) {
float* cfl_,

//Subarea of internal domain to compute
int x0=0, int y0=0,
int x1=0, int y1=0) {

if(x1 == 0)
x1 = nx_;

if(y1 == 0)
y1 = ny_;

const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc_x = 2;
const unsigned int gc_y = 2;
const unsigned int vars = 4;


//Shared memory variables
__shared__ float Q[4][h+2*gc_y][w+2*gc_x];
__shared__ float Qx[4][h+2*gc_y][w+2*gc_x];
__shared__ float F[4][h+2*gc_y][w+2*gc_x];

//Read into shared memory
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_);
readBlock<w, h, gc_x, gc_y, 1, 1>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, -1, 1>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, 1, -1>(rho_v0_ptr_, rho_v0_pitch_, Q[2], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);
readBlock<w, h, gc_x, gc_y, 1, 1>( E0_ptr_, E0_pitch_, Q[3], nx_, ny_, boundary_conditions_, x0, y0, x1, y1);

//Step 0 => evolve x first, then y
if (step_ == 0) {
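The x1 == 0 / y1 == 0 defaults let unmodified launches fall back to the full domain, while the new Python-side calls pass explicit strips. A hedged sketch of the extents the three launch modes use, mirroring the values in the hunks above; nx, ny, y_halo and the block sizes are illustrative:

    nx, ny = 1024, 1024
    y_halo = 2
    block_w, block_h = 16, 8

    grid_size = ((nx + block_w - 1)//block_w, (ny + block_h - 1)//block_h)
    ns_grid_size = (grid_size[0], 1)     # one block row covers a north/south strip

    launches = {
        #                   grid           (x0, y0,          x1, y1)
        "complete domain": (grid_size,    (0, 0,            nx, ny)),
        "north strip":     (ns_grid_size, (0, ny - y_halo,  nx, ny)),
        "south strip":     (ns_grid_size, (0, 0,            nx, y_halo)),
        "interior":        (grid_size,    (0, y_halo,       nx, ny - y_halo)),
    }
    for name, (grid, extent) in launches.items():
        print(name, grid, extent)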
@@ -224,10 +235,10 @@ __global__ void KP07DimsplitKernel(


// Write to main memory for all internal cells
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1);
writeBlock<w, h, gc_x, gc_y>( rho1_ptr_, rho1_pitch_, Q[0], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>(rho_u1_ptr_, rho_u1_pitch_, Q[1], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>(rho_v1_ptr_, rho_v1_pitch_, Q[2], nx_, ny_, 0, 1, x0, y0, x1, y1);
writeBlock<w, h, gc_x, gc_y>( E1_ptr_, E1_pitch_, Q[3], nx_, ny_, 0, 1, x0, y0, x1, y1);

//Compute the CFL for this block
if (cfl_ != NULL) {
@@ -321,7 +321,9 @@ template<int w, int h, int gc_x, int gc_y, int sign_x, int sign_y>
inline __device__ void readBlock(float* ptr_, int pitch_,
float Q[h+2*gc_y][w+2*gc_x],
const int nx_, const int ny_,
const int boundary_conditions_) {
const int boundary_conditions_,
int x0, int y0,
int x1, int y1) {
//Index of block within domain
const int bx = blockDim.x * blockIdx.x;
const int by = blockDim.y * blockIdx.y;
@@ -330,14 +332,14 @@ inline __device__ void readBlock(float* ptr_, int pitch_,
//Loop over all variables
for (int j=threadIdx.y; j<h+2*gc_y; j+=h) {
//Handle periodic boundary conditions here
int l = handlePeriodicBoundaryY<gc_y>(by + j, ny_, boundary_conditions_);
l = min(l, ny_+2*gc_y-1);
int l = handlePeriodicBoundaryY<gc_y>(by + j + y0, ny_, boundary_conditions_);
l = min(l, min(ny_+2*gc_y-1, y1+2*gc_y-1));
float* row = (float*) ((char*) ptr_ + pitch_*l);

for (int i=threadIdx.x; i<w+2*gc_x; i+=w) {
//Handle periodic boundary conditions here
int k = handlePeriodicBoundaryX<gc_x>(bx + i, nx_, boundary_conditions_);
k = min(k, nx_+2*gc_x-1);
int k = handlePeriodicBoundaryX<gc_x>(bx + i + x0, nx_, boundary_conditions_);
k = min(k, min(nx_+2*gc_x-1, x1+2*gc_x-1));

//Read from global memory
Q[j][i] = row[k];
@@ -358,14 +360,20 @@ template<int w, int h, int gc_x, int gc_y>
inline __device__ void writeBlock(float* ptr_, int pitch_,
float shmem[h+2*gc_y][w+2*gc_x],
const int nx_, const int ny_,
int rk_step_, int rk_order_) {
int rk_step_, int rk_order_,
int x0, int y0,
int x1, int y1) {

//Index of cell within domain
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + gc_x + x0;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + gc_y + y0;

//In case we are writing only to a subarea given by (x0, y0) x (x1, y1)
const int max_ti = min(nx_+gc_x, x1+gc_x);
const int max_tj = min(ny_+gc_y, y1+gc_y);

//Only write internal cells
if (ti < nx_+gc_x && tj < ny_+gc_y) {
if ((x0+gc_x <= ti) && (ti < max_ti) && (y0+gc_y <= tj) && (tj < max_tj)) {
//Index of thread within block
const int tx = threadIdx.x + gc_x;
const int ty = threadIdx.y + gc_y;
@@ -416,6 +424,9 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
row[ti] = t*row[ti] + (1.0f-t)*shmem[ty][tx];
}
}

// DEBUG
//row[ti] = 99.0;
}
}

@@ -106,6 +106,10 @@ cuda_context = CudaContext.CudaContext(device=cuda_device, autotuning=False)
####
# Set initial conditions
####

# DEBUGGING - setting random seed
np.random.seed(42)

logger.info("Generating initial conditions")
nx = args.nx
ny = args.ny
@@ -113,7 +117,9 @@ ny = args.ny
dt = 0.00001

gamma = 1.4
save_times = np.linspace(0, 0.1, 2)
#save_times = np.linspace(0, 0.000009, 2)
#save_times = np.linspace(0, 0.000099, 11)
save_times = np.linspace(0, 0.000099, 2)
outfile = "mpi_out_" + str(MPI.COMM_WORLD.rank) + ".nc"
save_var_names = ['rho', 'rho_u', 'rho_v', 'E']
