Refactoring

This commit is contained in:
André R. Brodtkorb
2018-11-08 22:05:14 +01:00
parent ae668a40d3
commit fd337e7d53
7 changed files with 305 additions and 146 deletions

View File

@@ -60,7 +60,7 @@ class Autotuner:
return return
# Set arguments to send to the simulators during construction # Set arguments to send to the simulators during construction
context = Common.CudaContext(autotuning=False) context = CudaContext.CudaContext(autotuning=False)
g = 9.81 g = 9.81
h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g) h0, hu0, hv0, dx, dy, dt = Autotuner.gen_test_data(nx=self.nx, ny=self.ny, g=g)
arguments = { arguments = {

View File

@@ -186,7 +186,7 @@ Class that holds 2D data
""" """
class CudaArray2D: class CudaArray2D:
""" """
Uploads initial data to the CL device Uploads initial data to the CUDA device
""" """
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32): def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
@@ -217,10 +217,8 @@ class CudaArray2D:
copy.set_dst_device(self.data.gpudata) copy.set_dst_device(self.data.gpudata)
#Set offsets of upload in destination #Set offsets of upload in destination
x_offset = (nx_halo - cpu_data.shape[1]) // 2 copy.dst_x_in_bytes = x_halo*self.data.strides[1]
y_offset = (ny_halo - cpu_data.shape[0]) // 2 copy.dst_y = y_halo
copy.dst_x_in_bytes = x_offset*self.data.strides[1]
copy.dst_y = y_offset
#Set destination pitch #Set destination pitch
copy.dst_pitch = self.data.strides[0] copy.dst_pitch = self.data.strides[0]

View File

@@ -78,8 +78,8 @@ class CudaContext(object):
self.logger.info("Created context handle <%s>", str(self.cuda_context.handle)) self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
#Create cache dir for cubin files #Create cache dir for cubin files
if (self.use_cache):
self.cache_path = os.path.join(self.module_path, "cuda_cache") self.cache_path = os.path.join(self.module_path, "cuda_cache")
if (self.use_cache):
if not os.path.isdir(self.cache_path): if not os.path.isdir(self.cache_path):
os.mkdir(self.cache_path) os.mkdir(self.cache_path)
self.logger.info("Using CUDA cache dir %s", self.cache_path) self.logger.info("Using CUDA cache dir %s", self.cache_path)

View File

@@ -21,6 +21,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need #Import packages we need
from GPUSimulators import Simulator, Common from GPUSimulators import Simulator, Common
from GPUSimulators.Simulator import BaseSimulator, BoundaryCondition
import numpy as np import numpy as np
@@ -34,7 +35,7 @@ import numpy as np
""" """
Class that solves the SW equations using the Forward-Backward linear scheme Class that solves the SW equations using the Forward-Backward linear scheme
""" """
class EE2D_KP07_dimsplit (Simulator.BaseSimulator): class EE2D_KP07_dimsplit (BaseSimulator):
""" """
Initialization routine Initialization routine
@@ -47,6 +48,7 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
dx: Grid cell spacing along x-axis dx: Grid cell spacing along x-axis
dy: Grid cell spacing along y-axis dy: Grid cell spacing along y-axis
dt: Size of each timestep dt: Size of each timestep
g: Gravitational constant
gamma: Gas constant gamma: Gas constant
p: pressure p: pressure
""" """
@@ -55,8 +57,11 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
rho, rho_u, rho_v, E, \ rho, rho_u, rho_v, E, \
nx, ny, \ nx, ny, \
dx, dy, dt, \ dx, dy, dt, \
g, \
gamma, \ gamma, \
theta=1.3, \ theta=1.3, \
order=2, \
boundaryConditions=BoundaryCondition(), \
block_width=16, block_height=8): block_width=16, block_height=8):
# Call super constructor # Call super constructor
@@ -64,12 +69,15 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
nx, ny, \ nx, ny, \
dx, dy, dt, \ dx, dy, dt, \
block_width, block_height) block_width, block_height)
self.g = np.float32(g)
self.gamma = np.float32(gamma) self.gamma = np.float32(gamma)
self.theta = np.float32(theta) self.theta = np.float32(theta)
self.order = np.int32(order)
self.boundaryConditions = boundaryConditions.asCodedInt()
#Get kernels #Get kernels
self.kernel = context.get_prepared_kernel("cuda/EE2D_KP07_dimsplit.cu", "KP07DimsplitKernel", \ self.kernel = context.get_prepared_kernel("cuda/EE2D_KP07_dimsplit.cu", "KP07DimsplitKernel", \
"iifffffiPiPiPiPiPiPiPiPi", \ "iiffffffiiPiPiPiPiPiPiPiPi", \
defines={ defines={
'BLOCK_WIDTH': self.block_size[0], 'BLOCK_WIDTH': self.block_size[0],
'BLOCK_HEIGHT': self.block_size[1] 'BLOCK_HEIGHT': self.block_size[1]
@@ -100,9 +108,11 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \
self.gamma, \ self.gamma, \
self.theta, \ self.theta, \
np.int32(0), \ Simulator.stepOrderToCodedInt(step=0, order=self.order), \
self.boundaryConditions, \
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
@@ -119,9 +129,11 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \ self.kernel.prepared_async_call(self.grid_size, self.block_size, self.stream, \
self.nx, self.ny, \ self.nx, self.ny, \
self.dx, self.dy, dt, \ self.dx, self.dy, dt, \
self.g, \
self.gamma, \ self.gamma, \
self.theta, \ self.theta, \
np.int32(1), \ Simulator.stepOrderToCodedInt(step=0, order=self.order), \
self.boundaryConditions, \
self.u0[0].data.gpudata, self.u0[0].data.strides[0], \ self.u0[0].data.gpudata, self.u0[0].data.strides[0], \
self.u0[1].data.gpudata, self.u0[1].data.strides[0], \ self.u0[1].data.gpudata, self.u0[1].data.strides[0], \
self.u0[2].data.gpudata, self.u0[2].data.strides[0], \ self.u0[2].data.gpudata, self.u0[2].data.strides[0], \
@@ -140,3 +152,4 @@ class EE2D_KP07_dimsplit (Simulator.BaseSimulator):
def check(self): def check(self):
self.u0.check() self.u0.check()
self.u1.check() self.u1.check()
pass

View File

@@ -23,6 +23,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#Import packages we need #Import packages we need
import numpy as np import numpy as np
import logging import logging
from enum import IntEnum
import pycuda.compiler as cuda_compiler import pycuda.compiler as cuda_compiler
import pycuda.gpuarray import pycuda.gpuarray
@@ -31,7 +32,68 @@ import pycuda.driver as cuda
from GPUSimulators import Common from GPUSimulators import Common
class BaseSimulator:
class BoundaryCondition(object):
"""
Class for holding boundary conditions for global boundaries
"""
class Type(IntEnum):
"""
Enum that describes the different types of boundary conditions
WARNING: MUST MATCH THAT OF common.h IN CUDA
"""
Dirichlet = 0,
Neumann = 1,
Periodic = 2,
Reflective = 3
def __init__(self, types={ \
'north': Type.Reflective, \
'south': Type.Reflective, \
'east': Type.Reflective, \
'west': Type.Reflective \
}):
"""
Constructor
"""
self.north = types['north']
self.south = types['south']
self.east = types['east']
self.west = types['west']
def asCodedInt(self):
"""
Helper function which packs four boundary conditions into one integer
"""
bc = 0
bc = bc | (self.north & 0x000F) << 24
bc = bc | (self.south & 0x000F) << 16
bc = bc | (self.east & 0x000F) << 8
bc = bc | (self.west & 0x000F)
#for t in types:
# print("{0:s}, {1:d}, {1:032b}, {1:08b}".format(t, types[t]))
#print("bc: {0:032b}".format(bc))
return np.int32(bc)
class BaseSimulator(object):
def __init__(self, \
context, \
nx, ny, \
dx, dy, dt, \
block_width, block_height):
""" """
Initialization routine Initialization routine
context: GPU context to use context: GPU context to use
@@ -45,11 +107,6 @@ class BaseSimulator:
dy: Grid cell spacing along y-axis (20 000 m) dy: Grid cell spacing along y-axis (20 000 m)
dt: Size of each timestep (90 s) dt: Size of each timestep (90 s)
""" """
def __init__(self, \
context, \
nx, ny, \
dx, dy, dt, \
block_width, block_height):
#Get logger #Get logger
self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__) self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
@@ -88,17 +145,19 @@ class BaseSimulator:
def __str__(self): def __str__(self):
return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny) return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny)
def simulate(self, t_end):
""" """
Function which simulates forward in time using the default simulation type Function which simulates forward in time using the default simulation type
""" """
def simulate(self, t_end):
raise(exceptions.NotImplementedError("Needs to be implemented in subclass")) raise(exceptions.NotImplementedError("Needs to be implemented in subclass"))
def simulateEuler(self, t_end):
""" """
Function which simulates t_end seconds using forward Euler Function which simulates t_end seconds using forward Euler
Requires that the stepEuler functionality is implemented in the subclasses Requires that the stepEuler functionality is implemented in the subclasses
""" """
def simulateEuler(self, t_end):
# Compute number of timesteps to perform # Compute number of timesteps to perform
n = int(t_end / self.dt + 1) n = int(t_end / self.dt + 1)
@@ -119,17 +178,21 @@ class BaseSimulator:
print_string = printer.getPrintString(i) print_string = printer.getPrintString(i)
if (print_string): if (print_string):
self.logger.info("%s (Euler): %s", self, print_string) self.logger.info("%s (Euler): %s", self, print_string)
try:
self.check() self.check()
except AssertionError as e:
e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()))
raise
#self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n) #self.logger.info("%s simulated %f seconds to %f with %d steps (Euler)", self, t_end, self.t, n)
return self.t, n return self.t, n
def simulateRK(self, t_end, order):
""" """
Function which simulates t_end seconds using Runge-Kutta 2 Function which simulates t_end seconds using Runge-Kutta 2
Requires that the stepRK functionality is implemented in the subclasses Requires that the stepRK functionality is implemented in the subclasses
""" """
def simulateRK(self, t_end, order):
# Compute number of timesteps to perform # Compute number of timesteps to perform
n = int(t_end / self.dt + 1) n = int(t_end / self.dt + 1)
@@ -150,15 +213,20 @@ class BaseSimulator:
print_string = printer.getPrintString(i) print_string = printer.getPrintString(i)
if (print_string): if (print_string):
self.logger.info("%s (RK2): %s", self, print_string) self.logger.info("%s (RK2): %s", self, print_string)
try:
self.check() self.check()
except AssertionError as e:
e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()))
raise
return self.t, n return self.t, n
def simulateDimsplit(self, t_end):
""" """
Function which simulates t_end seconds using second order dimensional splitting (XYYX) Function which simulates t_end seconds using second order dimensional splitting (XYYX)
Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
""" """
def simulateDimsplit(self, t_end):
# Compute number of timesteps to perform # Compute number of timesteps to perform
n = int(t_end / (2.0*self.dt) + 1) n = int(t_end / (2.0*self.dt) + 1)
@@ -180,24 +248,37 @@ class BaseSimulator:
print_string = printer.getPrintString(i) print_string = printer.getPrintString(i)
if (print_string): if (print_string):
self.logger.info("%s (Dimsplit): %s", self, print_string) self.logger.info("%s (Dimsplit): %s", self, print_string)
try:
self.check() self.check()
except AssertionError as e:
e.args += ("Step={:d}, time={:f}".format(self.simSteps(), self.simTime()))
raise
return self.t, 2*n return self.t, 2*n
def stepEuler(self, dt):
""" """
Function which performs one single timestep of size dt using forward euler Function which performs one single timestep of size dt using forward euler
""" """
def stepEuler(self, dt):
raise(NotImplementedError("Needs to be implemented in subclass")) raise(NotImplementedError("Needs to be implemented in subclass"))
def stepRK(self, dt, substep): def stepRK(self, dt, substep):
"""
Function which performs one single timestep of size dt using Runge-Kutta
"""
raise(NotImplementedError("Needs to be implemented in subclass")) raise(NotImplementedError("Needs to be implemented in subclass"))
def stepDimsplitXY(self, dt): def stepDimsplitXY(self, dt):
"""
Function which performs one single timestep of size dt using dimensional splitting
"""
raise(NotImplementedError("Needs to be implemented in subclass")) raise(NotImplementedError("Needs to be implemented in subclass"))
def stepDimsplitYX(self, dt): def stepDimsplitYX(self, dt):
"""
Function which performs one single timestep of size dt using dimensional splitting
"""
raise(NotImplementedError("Needs to be implemented in subclass")) raise(NotImplementedError("Needs to be implemented in subclass"))
def download(self): def download(self):
@@ -215,3 +296,25 @@ class BaseSimulator:
def simSteps(self): def simSteps(self):
return self.nt return self.nt
def stepOrderToCodedInt(step, order):
"""
Helper function which packs the step and order into a single integer
"""
step_order = (step << 16) ^ (order & 0x00ff)
#print("Step: {0:032b}".format(step))
#print("Order: {0:032b}".format(order))
#print("Mix: {0:032b}".format(step_order))
return np.int32(step_order)

View File

@@ -122,14 +122,17 @@ void computeFluxG(float Q[4][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
* This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
*/ */
extern "C" { extern "C" {
__global__ void KP07DimsplitKernel( __global__ void KP07DimsplitKernel(
int nx_, int ny_, int nx_, int ny_,
float dx_, float dy_, float dt_, float dx_, float dy_, float dt_,
float g_,
float gamma_, float gamma_,
float theta_, float theta_,
int step_, int step_order_,
int boundary_conditions_,
//Input h^n //Input h^n
float* rho0_ptr_, int rho0_pitch_, float* rho0_ptr_, int rho0_pitch_,
@@ -142,7 +145,6 @@ __global__ void KP07DimsplitKernel(
float* rho_u1_ptr_, int rho_u1_pitch_, float* rho_u1_ptr_, int rho_u1_pitch_,
float* rho_v1_ptr_, int rho_v1_pitch_, float* rho_v1_ptr_, int rho_v1_pitch_,
float* E1_ptr_, int E1_pitch_) { float* E1_ptr_, int E1_pitch_) {
const unsigned int w = BLOCK_WIDTH; const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT; const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 2; const unsigned int gc = 2;
@@ -153,8 +155,6 @@ __global__ void KP07DimsplitKernel(
__shared__ float Qx[4][h+4][w+4]; __shared__ float Qx[4][h+4][w+4];
__shared__ float F[4][h+4][w+4]; __shared__ float F[4][h+4][w+4];
//Read into shared memory //Read into shared memory
readBlock<w, h, gc>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_); readBlock<w, h, gc>( rho0_ptr_, rho0_pitch_, Q[0], nx_, ny_);
readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_); readBlock<w, h, gc>(rho_u0_ptr_, rho_u0_pitch_, Q[1], nx_, ny_);
@@ -167,13 +167,10 @@ __global__ void KP07DimsplitKernel(
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_); noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_); noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[3], nx_, ny_); noFlowBoundary<w, h, gc, 1, 1>(Q[3], nx_, ny_);
__syncthreads();
const float g = 0.1f;
//Step 0 => evolve x first, then y //Step 0 => evolve x first, then y
if (step_ == 0) { if (getStep(step_order_) == 0) {
//Compute fluxes along the x axis and evolve //Compute fluxes along the x axis and evolve
minmodSlopeX<w, h, gc, vars>(Q, Qx, theta_); minmodSlopeX<w, h, gc, vars>(Q, Qx, theta_);
__syncthreads(); __syncthreads();
@@ -184,17 +181,11 @@ __global__ void KP07DimsplitKernel(
evolveF<w, h, gc, vars>(Q, F, dx_, dt_); evolveF<w, h, gc, vars>(Q, F, dx_, dt_);
__syncthreads(); __syncthreads();
//Set boundary conditions
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[3], nx_, ny_);
__syncthreads();
//Compute fluxes along the y axis and evolve //Compute fluxes along the y axis and evolve
minmodSlopeY<w, h, gc, vars>(Q, Qx, theta_); minmodSlopeY<w, h, gc, vars>(Q, Qx, theta_);
__syncthreads(); __syncthreads();
computeFluxG(Q, Qx, F, gamma_, dy_, dt_); computeFluxG(Q, Qx, F, gamma_, dy_, dt_);
__syncthreads(); __syncthreads();
@@ -202,15 +193,14 @@ __global__ void KP07DimsplitKernel(
__syncthreads(); __syncthreads();
//Gravity source term //Gravity source term
{ if (g_ > 0.0f) {
const int i = threadIdx.x + gc; const int i = threadIdx.x + gc;
const int j = threadIdx.y + gc; const int j = threadIdx.y + gc;
const float rho_v = Q[2][j][i]; const float rho_v = Q[2][j][i];
Q[2][j][i] -= g*Q[0][j][i]*dt_; Q[2][j][i] -= g_*Q[0][j][i]*dt_;
Q[3][j][i] -= g*rho_v*dt_; Q[3][j][i] -= g_*rho_v*dt_;
}
__syncthreads(); __syncthreads();
}
} }
//Step 1 => evolve y first, then x //Step 1 => evolve y first, then x
else { else {
@@ -224,13 +214,6 @@ __global__ void KP07DimsplitKernel(
evolveG<w, h, gc, vars>(Q, F, dy_, dt_); evolveG<w, h, gc, vars>(Q, F, dy_, dt_);
__syncthreads(); __syncthreads();
//Set boundary conditions
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[3], nx_, ny_);
__syncthreads();
//Compute fluxes along the x axis and evolve //Compute fluxes along the x axis and evolve
minmodSlopeX<w, h, gc, vars>(Q, Qx, theta_); minmodSlopeX<w, h, gc, vars>(Q, Qx, theta_);
__syncthreads(); __syncthreads();
@@ -242,16 +225,17 @@ __global__ void KP07DimsplitKernel(
__syncthreads(); __syncthreads();
//Gravity source term //Gravity source term
{ if (g_ > 0.0f) {
const int i = threadIdx.x + gc; const int i = threadIdx.x + gc;
const int j = threadIdx.y + gc; const int j = threadIdx.y + gc;
const float rho_v = Q[2][j][i]; const float rho_v = Q[2][j][i];
Q[2][j][i] -= g*Q[0][j][i]*dt_; Q[2][j][i] -= g_*Q[0][j][i]*dt_;
Q[3][j][i] -= g*rho_v*dt_; Q[3][j][i] -= g_*rho_v*dt_;
}
__syncthreads(); __syncthreads();
}
//This is the RK2-part //This is the RK2-part
if (getOrder(step_order_) == 2) {
const int tx = threadIdx.x + gc; const int tx = threadIdx.x + gc;
const int ty = threadIdx.y + gc; const int ty = threadIdx.y + gc;
const float q1 = Q[0][ty][tx]; const float q1 = Q[0][ty][tx];
@@ -271,6 +255,7 @@ __global__ void KP07DimsplitKernel(
Q[2][ty][tx] = 0.5f*( Q[2][ty][tx] + q3 ); Q[2][ty][tx] = 0.5f*( Q[2][ty][tx] + q3 );
Q[3][ty][tx] = 0.5f*( Q[3][ty][tx] + q4 ); Q[3][ty][tx] = 0.5f*( Q[3][ty][tx] + q4 );
} }
}
// Write to main memory for all internal cells // Write to main memory for all internal cells

View File

@@ -167,90 +167,116 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
template<int block_width, int block_height, int ghost_cells, int scale_east_west=1, int scale_north_south=1> template<int block_width, int block_height, int ghost_cells, int scale_east_west=1, int scale_north_south=1>
__device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) { __device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
bcEastReflective<block_width, block_height, ghost_cells, scale_east_west>(Q, nx_, ny_);
bcWestReflective<block_width, block_height, ghost_cells, scale_east_west>(Q, nx_, ny_);
__syncthreads();
bcNorthReflective<block_width, block_height, ghost_cells, scale_north_south>(Q, nx_, ny_);
bcSouthReflective<block_width, block_height, ghost_cells, scale_north_south>(Q, nx_, ny_);
__syncthreads();
}
// West boundary
template<int block_width, int block_height, int ghost_cells, int sign>
__device__ void bcWestReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+= block_height) { for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+= block_height) {
const int i = threadIdx.x + ghost_cells; const int i = threadIdx.x + ghost_cells;
const int ti = blockDim.x*blockIdx.x + i; const int ti = blockDim.x*blockIdx.x + i;
const int tj = blockDim.y*blockIdx.y + j;
// West boundary
if (ti == ghost_cells) { if (ti == ghost_cells) {
Q[j][i-1] = scale_east_west*Q[j][i]; Q[j][i-1] = sign*Q[j][i];
} }
if (ghost_cells >= 2 && ti == ghost_cells + 1) { if (ghost_cells >= 2 && ti == ghost_cells + 1) {
Q[j][i-3] = scale_east_west*Q[j][i]; Q[j][i-3] = sign*Q[j][i];
} }
if (ghost_cells >= 3 && ti == ghost_cells + 2) { if (ghost_cells >= 3 && ti == ghost_cells + 2) {
Q[j][i-5] = scale_east_west*Q[j][i]; Q[j][i-5] = sign*Q[j][i];
} }
if (ghost_cells >= 4 && ti == ghost_cells + 3) { if (ghost_cells >= 4 && ti == ghost_cells + 3) {
Q[j][i-7] = scale_east_west*Q[j][i]; Q[j][i-7] = sign*Q[j][i];
} }
if (ghost_cells >= 5 && ti == ghost_cells + 4) { if (ghost_cells >= 5 && ti == ghost_cells + 4) {
Q[j][i-9] = scale_east_west*Q[j][i]; Q[j][i-9] = sign*Q[j][i];
}
}
} }
// East boundary // East boundary
template<int block_width, int block_height, int ghost_cells, int sign>
__device__ void bcEastReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+= block_height) {
const int i = threadIdx.x + ghost_cells;
const int ti = blockDim.x*blockIdx.x + i;
if (ti == nx_ + ghost_cells - 1) { if (ti == nx_ + ghost_cells - 1) {
Q[j][i+1] = scale_east_west*Q[j][i]; Q[j][i+1] = sign*Q[j][i];
} }
if (ghost_cells >= 2 && ti == nx_ + ghost_cells - 2) { if (ghost_cells >= 2 && ti == nx_ + ghost_cells - 2) {
Q[j][i+3] = scale_east_west*Q[j][i]; Q[j][i+3] = sign*Q[j][i];
} }
if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 3) { if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 3) {
Q[j][i+5] = scale_east_west*Q[j][i]; Q[j][i+5] = sign*Q[j][i];
} }
if (ghost_cells >= 4 && ti == nx_ + ghost_cells - 4) { if (ghost_cells >= 4 && ti == nx_ + ghost_cells - 4) {
Q[j][i+7] = scale_east_west*Q[j][i]; Q[j][i+7] = sign*Q[j][i];
} }
if (ghost_cells >= 5 && ti == nx_ + ghost_cells - 5) { if (ghost_cells >= 5 && ti == nx_ + ghost_cells - 5) {
Q[j][i+9] = scale_east_west*Q[j][i]; Q[j][i+9] = sign*Q[j][i];
}
} }
} }
for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+= block_width) {
const int j = threadIdx.y + ghost_cells;
const int ti = blockDim.x*blockIdx.x + i;
const int tj = blockDim.y*blockIdx.y + j;
// South boundary // South boundary
template<int block_width, int block_height, int ghost_cells, int sign>
__device__ void bcSouthReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+= block_width) {
const int j = threadIdx.y + ghost_cells;
const int tj = blockDim.y*blockIdx.y + j;
if (tj == ghost_cells) { if (tj == ghost_cells) {
Q[j-1][i] = scale_north_south*Q[j][i]; Q[j-1][i] = sign*Q[j][i];
} }
if (ghost_cells >= 2 && tj == ghost_cells + 1) { if (ghost_cells >= 2 && tj == ghost_cells + 1) {
Q[j-3][i] = scale_north_south*Q[j][i]; Q[j-3][i] = sign*Q[j][i];
} }
if (ghost_cells >= 3 && tj == ghost_cells + 2) { if (ghost_cells >= 3 && tj == ghost_cells + 2) {
Q[j-5][i] = scale_north_south*Q[j][i]; Q[j-5][i] = sign*Q[j][i];
} }
if (ghost_cells >= 4 && tj == ghost_cells + 3) { if (ghost_cells >= 4 && tj == ghost_cells + 3) {
Q[j-7][i] = scale_north_south*Q[j][i]; Q[j-7][i] = sign*Q[j][i];
} }
if (ghost_cells >= 5 && tj == ghost_cells + 4) { if (ghost_cells >= 5 && tj == ghost_cells + 4) {
Q[j-9][i] = scale_north_south*Q[j][i]; Q[j-9][i] = sign*Q[j][i];
} }
}
}
// North boundary // North boundary
template<int block_width, int block_height, int ghost_cells, int sign>
__device__ void bcNorthReflective(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+= block_width) {
const int j = threadIdx.y + ghost_cells;
const int tj = blockDim.y*blockIdx.y + j;
if (tj == ny_ + ghost_cells - 1) { if (tj == ny_ + ghost_cells - 1) {
Q[j+1][i] = scale_north_south*Q[j][i]; Q[j+1][i] = sign*Q[j][i];
} }
if (ghost_cells >= 2 && tj == ny_ + ghost_cells - 2) { if (ghost_cells >= 2 && tj == ny_ + ghost_cells - 2) {
Q[j+3][i] = scale_north_south*Q[j][i]; Q[j+3][i] = sign*Q[j][i];
} }
if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 3) { if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 3) {
Q[j+5][i] = scale_north_south*Q[j][i]; Q[j+5][i] = sign*Q[j][i];
} }
if (ghost_cells >= 4 && tj == ny_ + ghost_cells - 4) { if (ghost_cells >= 4 && tj == ny_ + ghost_cells - 4) {
Q[j+7][i] = scale_north_south*Q[j][i]; Q[j+7][i] = sign*Q[j][i];
} }
if (ghost_cells >= 5 && tj == ny_ + ghost_cells - 5) { if (ghost_cells >= 5 && tj == ny_ + ghost_cells - 5) {
Q[j+9][i] = scale_north_south*Q[j][i]; Q[j+9][i] = sign*Q[j][i];
} }
} }
} }
@@ -282,7 +308,7 @@ __device__ void evolveF(float Q[vars][block_height+2*ghost_cells][block_width+2*
const float dx_, const float dt_) { const float dx_, const float dt_) {
for (int var=0; var < vars; ++var) { for (int var=0; var < vars; ++var) {
for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+=block_height) { for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+=block_height) {
for (int i=threadIdx.x+1; i<block_width+2*ghost_cells; i+=block_width) { for (int i=threadIdx.x+ghost_cells; i<block_width+ghost_cells; i+=block_width) {
Q[var][j][i] = Q[var][j][i] + (F[var][j][i-1] - F[var][j][i]) * dt_ / dx_; Q[var][j][i] = Q[var][j][i] + (F[var][j][i-1] - F[var][j][i]) * dt_ / dx_;
} }
} }
@@ -302,7 +328,7 @@ __device__ void evolveG(float Q[vars][block_height+2*ghost_cells][block_width+2*
float G[vars][block_height+2*ghost_cells][block_width+2*ghost_cells], float G[vars][block_height+2*ghost_cells][block_width+2*ghost_cells],
const float dy_, const float dt_) { const float dy_, const float dt_) {
for (int var=0; var < vars; ++var) { for (int var=0; var < vars; ++var) {
for (int j=threadIdx.y+1; j<block_height+2*ghost_cells; j+=block_height) { for (int j=threadIdx.y+ghost_cells; j<block_height+ghost_cells; j+=block_height) {
for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+=block_width) { for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+=block_width) {
Q[var][j][i] = Q[var][j][i] + (G[var][j-1][i] - G[var][j][i]) * dt_ / dy_; Q[var][j][i] = Q[var][j][i] + (G[var][j-1][i] - G[var][j][i]) * dt_ / dy_;
} }
@@ -329,11 +355,45 @@ __device__ void memset(float Q[vars][shmem_height][shmem_width], float value) {
} }
/**
* Returns the step stored in the leftmost 16 bits
* of the 32 bit step-order integer
*/
inline __device__ int getStep(int step_order_) {
return step_order_ >> 16;
}
/**
* Returns the order stored in the rightmost 16 bits
* of the 32 bit step-order integer
*/
inline __device__ int getOrder(int step_order_) {
return step_order_ & 0x0000FFFF;
}
enum BoundaryCondition {
Dirichlet = 0,
Neumann = 1,
Periodic = 2,
Reflective = 3
};
inline __device__ BoundaryCondition getBCNorth(int bc_) {
return static_cast<BoundaryCondition>(bc_ & 0x000F);
}
inline __device__ BoundaryCondition getBCSouth(int bc_) {
return static_cast<BoundaryCondition>((bc_ >> 8) & 0x000F);
}
inline __device__ BoundaryCondition getBCEast(int bc_) {
return static_cast<BoundaryCondition>((bc_ >> 16) & 0x000F);
}
inline __device__ BoundaryCondition getBCWest(int bc_) {
return static_cast<BoundaryCondition>(bc_ >> 24);
}