Refactoring

This commit is contained in:
André R. Brodtkorb 2018-10-30 15:14:19 +01:00
parent 10d8e26108
commit e434b4e02a
20 changed files with 325 additions and 278 deletions

View File

@ -81,9 +81,6 @@ class FORCE (Simulator.BaseSimulator):
1, 1, \
[None, None, None])
def __str__(self):
return "First order centered"
def simulate(self, t_end):
return super().simulateEuler(t_end)

View File

@ -76,9 +76,6 @@ class HLL (Simulator.BaseSimulator):
1, 1, \
[None, None, None])
def __str__(self):
return "Harten-Lax-van Leer"
def simulate(self, t_end):
return super().simulateEuler(t_end)

View File

@ -82,9 +82,6 @@ class HLL2 (Simulator.BaseSimulator):
2, 2, \
[None, None, None])
def __str__(self):
return "Harten-Lax-van Leer (2nd order)"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)

View File

@ -83,9 +83,6 @@ class HLL2Euler (Simulator.BaseSimulator):
2, 2, \
[None, None, None, None])
def __str__(self):
return "Harten-Lax-van Leer (2nd order)"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)

View File

@ -90,6 +90,17 @@ class MyIPythonMagic(Magics):
self.logger.debug("==================================================================")
atexit.register(exitfunc)
logger_initialized = False
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument(
@ -99,6 +110,12 @@ class MyIPythonMagic(Magics):
@magic_arguments.argument(
'--file_level', '-f', type=int, default=10, help='The level of logging to file [0, 50]')
def setup_logging(self, line):
if self.logger_initialized:
logging.getLogger('').info("Global logger already initialized!")
return
else:
self.logger_initialized = True
args = magic_arguments.parse_argstring(self.setup_logging, line)
import sys
@ -122,6 +139,14 @@ class MyIPythonMagic(Magics):
logger.info("Python version %s", sys.version)
# Register
ip = get_ipython()
ip.register_magics(MyIPythonMagic)

View File

@ -83,9 +83,6 @@ class KP07 (Simulator.BaseSimulator):
2, 2, \
[None, None, None])
def __str__(self):
return "Kurganov-Petrova 2007"
def simulate(self, t_end):
return super().simulateRK(t_end, 2)

View File

@ -83,9 +83,6 @@ class KP07_dimsplit (Simulator.BaseSimulator):
2, 2, \
[None, None, None])
def __str__(self):
return "Kurganov-Petrova 2007 dimensionally split"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)

View File

@ -77,9 +77,6 @@ class LxF (Simulator.BaseSimulator):
1, 1, \
[None, None, None])
def __str__(self):
return "Lax Friedrichs"
def simulate(self, t_end):
return super().simulateEuler(t_end)

View File

@ -86,6 +86,9 @@ class BaseSimulator:
int(np.ceil(self.ny / float(self.local_size[1]))) \
)
def __str__(self):
return "{:s} [{:d}x{:d}]".format(self.__class__.__name__, self.nx, self.ny)
"""
Function which simulates forward in time using the default simulation type
"""
@ -112,7 +115,7 @@ class BaseSimulator:
# Step with forward Euler
self.stepEuler(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (Euler)", self, t_end, self.t, n, t.secs)
return self.t, n
"""
@ -135,7 +138,7 @@ class BaseSimulator:
# Perform all the Runge-Kutta substeps
self.stepRK(local_dt, order)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (RK2)", self, t_end, self.t, n, t.secs)
return self.t, n
"""
@ -159,7 +162,7 @@ class BaseSimulator:
self.stepDimsplitXY(local_dt)
self.stepDimsplitYX(local_dt)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds (dimsplit)", self, t_end, self.t, 2*n, t.secs)
return self.t, 2*n

View File

@ -76,9 +76,6 @@ class WAF (Simulator.BaseSimulator):
2, 2, \
[None, None, None])
def __str__(self):
return "Weighted average flux"
def simulate(self, t_end):
return super().simulateDimsplit(t_end)

View File

@ -1,12 +1,8 @@
/*
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
for the shallow water equations, described in
A. Kurganov & Guergana Petrova
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
Scheme for the Saint-Venant System, Communications in Mathematical
Sciences, 5 (2007), 133-160.
These CUDA functions implement different types of numerical flux
functions for the Euler equations
Copyright (C) 2016 SINTEF ICT
Copyright (C) 2016, 2017, 2018 SINTEF Digital
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -23,13 +19,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "limiters.h"
__device__ float pressure(float4 Q, float gamma) {
inline __device__ float pressure(float4 Q, float gamma) {
const float rho = Q.x;
const float rho_u = Q.y;
const float rho_v = Q.z;
@ -39,14 +36,13 @@ __device__ float pressure(float4 Q, float gamma) {
}
__device__ float4 F_func(const float4 Q, float gamma) {
__device__ float4 F_func(const float4 Q, float P) {
const float rho = Q.x;
const float rho_u = Q.y;
const float rho_v = Q.z;
const float E = Q.w;
const float u = rho_u/rho;
const float P = pressure(Q, gamma);
float4 F;
@ -57,3 +53,28 @@ __device__ float4 F_func(const float4 Q, float gamma) {
return F;
}
/**
* Central upwind flux function
*/
__device__ float4 CentralUpwindFlux(const float4 Qm, const float4 Qp, const float gamma) {
const float Pp = pressure(Qp, gamma);
const float4 Fp = F_func(Qp, Pp);
const float up = Qp.y / Qp.x; // rho*u / rho
const float cp = sqrt(gamma*Pp/Qp.x); // sqrt(gamma*P/rho)
const float Pm = pressure(Qm, gamma);
const float4 Fm = F_func(Qm, Pm);
const float um = Qm.y / Qm.x; // rho*u / rho
const float cm = sqrt(gamma*Pm/Qm.x); // sqrt(gamma*P/rho)
const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
}
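For reference, a worked statement of what this file now computes. Assuming the ideal-gas equation of state that the variables in pressure suggest (its body is elided in this diff), the pressure and sound speed are

p = (\gamma - 1)\,\bigl(E - \tfrac{1}{2}\rho(u^2 + v^2)\bigr), \qquad c = \sqrt{\gamma p/\rho},

and CentralUpwindFlux evaluates the central-upwind flux of Kurganov et al. across a face with left/right states Q^- and Q^+:

F = \frac{a^+ F(Q^-) - a^- F(Q^+)}{a^+ - a^-} + \frac{a^+ a^-}{a^+ - a^-}\,(Q^+ - Q^-),

with wave-speed bounds a^- = \min(u^- - c^-,\ u^+ - c^+,\ 0) and a^+ = \max(u^- + c^-,\ u^+ + c^+,\ 0), matching the am and ap variables above.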

View File

@ -1,12 +1,8 @@
/*
This OpenCL kernel implements the Kurganov-Petrova numerical scheme
for the shallow water equations, described in
A. Kurganov & Guergana Petrova
A Second-Order Well-Balanced Positivity Preserving Central-Upwind
Scheme for the Saint-Venant System, Communications in Mathematical
Sciences, 5 (2007), 133-160.
These CUDA functions implement different types of numerical flux
functions for the shallow water equations
Copyright (C) 2016 SINTEF ICT
Copyright (C) 2016, 2017, 2018 SINTEF Digital
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by

View File

@ -114,18 +114,23 @@ __global__ void FORCEKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 1;
__shared__ float Q[3][h+2][w+2];
__shared__ float F[3][h+1][w+1];
//Read into shared memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
__syncthreads();
//Set boundary conditions
noFlowBoundary1(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute flux along x, and evolve
@ -135,7 +140,9 @@ __global__ void FORCEKernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary1(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute flux along y, and evolve
@ -145,9 +152,9 @@ __global__ void FORCEKernel(
__syncthreads();
//Write to main memory
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -121,19 +121,24 @@ __global__ void HLLKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 1;
//Shared memory variables
__shared__ float Q[3][h+2][w+2];
__shared__ float F[3][h+1][w+1];
//Read into shared memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
__syncthreads();
//Set boundary conditions
noFlowBoundary1(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute F flux
@ -143,7 +148,9 @@ __global__ void HLLKernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary1(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute G flux
@ -153,9 +160,9 @@ __global__ void HLLKernel(
__syncthreads();
// Write to main memory for all internal cells
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -156,22 +156,25 @@ __global__ void HLL2Kernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 2;
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float Q[3][h+4][w+4];
__shared__ float Qx[3][h+2][w+2];
__shared__ float F[3][h+1][w+1];
//Read into shared memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Step 0 => evolve x first, then y
@ -185,7 +188,9 @@ __global__ void HLL2Kernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the y axis and evolve
@ -207,7 +212,9 @@ __global__ void HLL2Kernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the x axis and evolve
@ -223,10 +230,9 @@ __global__ void HLL2Kernel(
// Write to main memory for all internal cells
writeBlock2(h1_ptr_, h1_pitch_,
hu1_ptr_, hu1_pitch_,
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -119,6 +119,10 @@ __global__ void KP07Kernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 2;
//Index of thread within block
const int tx = threadIdx.x;
const int ty = threadIdx.y;
@ -128,26 +132,28 @@ __global__ void KP07Kernel(
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Q[3][h+4][w+4];
//The following slightly wastes memory, but enables us to reuse the
//functions in common.opencl
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float Qx[3][h+2][w+2];
__shared__ float Qy[3][h+2][w+2];
__shared__ float F[3][h+1][w+1];
__shared__ float G[3][h+1][w+1];
//Read into shared memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
__syncthreads();
//Fix boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();

View File

@ -147,23 +147,29 @@ __global__ void KP07DimsplitKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 2;
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float Q[3][h+4][w+4];
__shared__ float Qx[3][h+2][w+2];
__shared__ float F[3][h+1][w+1];
//Read into shared memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
__syncthreads();
//Fix boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
@ -179,7 +185,9 @@ __global__ void KP07DimsplitKernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the y axis and evolve
@ -201,7 +209,9 @@ __global__ void KP07DimsplitKernel(
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the x axis and evolve
@ -215,10 +225,9 @@ __global__ void KP07DimsplitKernel(
// Write to main memory for all internal cells
writeBlock2(h1_ptr_, h1_pitch_,
hu1_ptr_, hu1_pitch_,
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -97,7 +97,6 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
}
extern "C" {
__global__
void LxFKernel(
@ -115,43 +114,49 @@ void LxFKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 1;
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
__shared__ float Q[3][h+2][w+2];
__shared__ float F[3][h ][w+1];
__shared__ float G[3][h+1][w ];
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+2, BLOCK_HEIGHT+2, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+2, ny_+2);
//Read from global memory
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+2, ny_+2);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+2, ny_+2);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+2, ny_+2);
__syncthreads();
//Set boundary conditions
noFlowBoundary1(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the x and y axis
computeFluxF<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, F, g_, dx_, dt_);
computeFluxG<BLOCK_WIDTH, BLOCK_HEIGHT>(Q, G, g_, dy_, dt_);
computeFluxF<w, h>(Q, F, g_, dx_, dt_);
computeFluxG<w, h>(Q, G, g_, dy_, dt_);
__syncthreads();
//Evolve for all cells
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;
Q[0][j][i] += (F[0][ty][tx] - F[0][ty ][tx+1]) * dt_ / dx_
+ (G[0][ty][tx] - G[0][ty+1][tx ]) * dt_ / dy_;
Q[1][j][i] += (F[1][ty][tx] - F[1][ty ][tx+1]) * dt_ / dx_
+ (G[1][ty][tx] - G[1][ty+1][tx ]) * dt_ / dy_;
Q[2][j][i] += (F[2][ty][tx] - F[2][ty ][tx+1]) * dt_ / dx_
+ (G[2][ty][tx] - G[2][ty+1][tx ]) * dt_ / dy_;
__syncthreads();
//Write to main memory
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<BLOCK_WIDTH+2, BLOCK_HEIGHT+2, 1, 1>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -130,21 +130,28 @@ __global__ void WAFKernel(
float* h1_ptr_, int h1_pitch_,
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
const unsigned int w = BLOCK_WIDTH;
const unsigned int h = BLOCK_HEIGHT;
const unsigned int gc = 2;
//Shared memory variables
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float Q[3][h+4][w+4];
__shared__ float F[3][h+1][w+1];
//Read into shared memory Q from global memory
float* Q_ptr[3] = {h0_ptr_, hu0_ptr_, hv0_ptr_};
int Q_pitch[3] = {h0_pitch_, hu0_pitch_, hv0_pitch_};
readBlock<3, BLOCK_WIDTH+4, BLOCK_HEIGHT+4, BLOCK_WIDTH, BLOCK_HEIGHT>(Q_ptr, Q_pitch, Q, nx_+4, ny_+4);
readBlock<w, h, gc>( h0_ptr_, h0_pitch_, Q[0], nx_+4, ny_+4);
readBlock<w, h, gc>(hu0_ptr_, hu0_pitch_, Q[1], nx_+4, ny_+4);
readBlock<w, h, gc>(hv0_ptr_, hv0_pitch_, Q[2], nx_+4, ny_+4);
__syncthreads();
//Set boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
@ -158,7 +165,9 @@ __global__ void WAFKernel(
__syncthreads();
//Fix boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the y axis and evolve
@ -176,7 +185,9 @@ __global__ void WAFKernel(
__syncthreads();
//Fix boundary conditions
noFlowBoundary2(Q, nx_, ny_);
noFlowBoundary<w, h, gc, 1, 1>(Q[0], nx_, ny_);
noFlowBoundary<w, h, gc, -1, 1>(Q[1], nx_, ny_);
noFlowBoundary<w, h, gc, 1, -1>(Q[2], nx_, ny_);
__syncthreads();
//Compute fluxes along the x axis and evolve
@ -189,10 +200,9 @@ __global__ void WAFKernel(
// Write to main memory for all internal cells
writeBlock2(h1_ptr_, h1_pitch_,
hu1_ptr_, hu1_pitch_,
hv1_ptr_, hv1_pitch_,
Q, nx_, ny_);
writeBlock<w, h, gc>( h1_ptr_, h1_pitch_, Q[0], nx_, ny_);
writeBlock<w, h, gc>(hu1_ptr_, hu1_pitch_, Q[1], nx_, ny_);
writeBlock<w, h, gc>(hv1_ptr_, hv1_pitch_, Q[2], nx_, ny_);
}
} // extern "C"

View File

@ -48,6 +48,10 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
return fmaxf(a, fminf(f, b));
}
inline __device__ __host__ int clamp(const int f, const int a, const int b) {
return (f < b) ? ( (f > a) ? f : a) : b;
}
@ -60,37 +64,27 @@ __device__ float desingularize(float x_, float eps_) {
/**
* Reads a block of data with ghost cells
*/
template<int vars, int sm_width, int sm_height, int block_width, int block_height>
inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
float shmem[vars][sm_height][sm_width],
template<int block_width, int block_height, int ghost_cells>
inline __device__ void readBlock(float* ptr_, int pitch_,
float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells],
const int max_x_, const int max_y_) {
//Index of block within domain
const int bx = blockDim.x * blockIdx.x;
const int by = blockDim.y * blockIdx.y;
float* rows[3];
//Read into shared memory
for (int j=threadIdx.y; j<sm_height; j+=block_height) {
const int l = clamp(by + j, 0, max_y_-1); // Clamp out of bounds
//Loop over all variables
for (int j=threadIdx.y; j<block_height+2*ghost_cells; j+=block_height) {
const int l = min(by + j, max_y_-1);
float* row = (float*) ((char*) ptr_ + pitch_*l);
//Compute the pointer to current row in the arrays
for (int m=0; m<vars; ++m) {
rows[m] = (float*) ((char*) ptr_[m] + pitch_[m]*l);
}
for (int i=threadIdx.x; i<block_width+2*ghost_cells; i+=block_width) {
const int k = min(bx + i, max_x_-1);
for (int i=threadIdx.x; i<sm_width; i+=block_width) {
const int k = clamp(bx + i, 0, max_x_-1); // Clamp out of bounds
for (int m=0; m<vars; ++m) {
shmem[m][j][i] = rows[m][k];
}
shmem[j][i] = row[k];
}
}
}
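A minimal usage sketch of the new readBlock template (illustration only; exampleKernel and the 16x16 block size are assumptions, not part of this commit). The contract is that block_width/block_height match the launch configuration, and that max_x_/max_y_ span the full padded domain of nx_+2*ghost_cells by ny_+2*ghost_cells cells:

__global__ void exampleKernel(float* h0_ptr_, int h0_pitch_,
                              const int nx_, const int ny_) {
    const unsigned int w = 16;  //Must equal blockDim.x at launch
    const unsigned int h = 16;  //Must equal blockDim.y at launch
    const unsigned int gc = 2;  //Two ghost cells on each side

    __shared__ float Q[h+2*gc][w+2*gc];

    //The padded domain holds nx_+2*gc by ny_+2*gc cells in total,
    //so the clamped reads must be allowed to reach that far
    readBlock<w, h, gc>(h0_ptr_, h0_pitch_, Q, nx_+2*gc, ny_+2*gc);
    __syncthreads();
}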
@ -98,24 +92,23 @@ inline __device__ void readBlock(float* ptr_[vars], int pitch_[vars],
/**
* Writes a block of data to global memory for the shallow water equations.
*/
template<int sm_width, int sm_height, int offset_x=0, int offset_y=0>
template<int block_width, int block_height, int ghost_cells>
inline __device__ void writeBlock(float* ptr_, int pitch_,
float shmem[sm_height][sm_width],
float shmem[block_height+2*ghost_cells][block_width+2*ghost_cells],
const int width, const int height) {
//Index of cell within domain
const int ti = blockDim.x*blockIdx.x + threadIdx.x + offset_x;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + offset_y;
const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells;
//Only write internal cells
if (ti < width+offset_x && tj < height+offset_y) {
if (ti < width+ghost_cells && tj < height+ghost_cells) {
//Index of thread within block
const int tx = threadIdx.x + offset_x;
const int ty = threadIdx.y + offset_y;
const int tx = threadIdx.x + ghost_cells;
const int ty = threadIdx.y + ghost_cells;
float* const row = (float*) ((char*) ptr_ + pitch_*tj);
row[ti] = shmem[ty][tx];
@ -129,60 +122,93 @@ inline __device__ void writeBlock(float* ptr_, int pitch_,
/**
* Writes a block of data to global memory for the shallow water equations.
*/
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
writeBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4, 2, 2>( h_ptr_, h_pitch_, Q[0], nx_, ny_);
writeBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4, 2, 2>(hu_ptr_, hu_pitch_, Q[1], nx_, ny_);
writeBlock<BLOCK_WIDTH+4, BLOCK_HEIGHT+4, 2, 2>(hv_ptr_, hv_pitch_, Q[2], nx_, ny_);
}
template<int block_width, int block_height, int ghost_cells, int scale_east_west=1, int scale_north_south=1>
__device__ void noFlowBoundary(float Q[block_height+2*ghost_cells][block_width+2*ghost_cells], const int nx_, const int ny_) {
const int ti = blockDim.x*blockIdx.x + threadIdx.x + ghost_cells;
const int tj = blockDim.y*blockIdx.y + threadIdx.y + ghost_cells;
/**
* No flow boundary conditions for the shallow water equations
* with one ghost cell in each direction
*/
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
//Global index
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 1; //Skip global ghost cells, i.e., +1
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 1;
const int i = threadIdx.x + ghost_cells;
const int j = threadIdx.y + ghost_cells;
//Block-local indices
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 1; //Skip local ghost cells, i.e., +1
const int j = ty + 1;
//Fix boundary conditions
if (ti == 1) {
Q[0][j][i-1] = Q[0][j][i];
Q[1][j][i-1] = -Q[1][j][i];
Q[2][j][i-1] = Q[2][j][i];
// West boundary
if (ti == ghost_cells) {
Q[j][i-1] = scale_east_west*Q[j][i];
}
if (ti == nx_) {
Q[0][j][i+1] = Q[0][j][i];
Q[1][j][i+1] = -Q[1][j][i];
Q[2][j][i+1] = Q[2][j][i];
if (ghost_cells >= 2 && ti == ghost_cells + 1) {
Q[j][i-3] = scale_east_west*Q[j][i];
}
if (tj == 1) {
Q[0][j-1][i] = Q[0][j][i];
Q[1][j-1][i] = Q[1][j][i];
Q[2][j-1][i] = -Q[2][j][i];
if (ghost_cells >= 3 && ti == ghost_cells + 2) {
Q[j][i-5] = scale_east_west*Q[j][i];
}
if (tj == ny_) {
Q[0][j+1][i] = Q[0][j][i];
Q[1][j+1][i] = Q[1][j][i];
Q[2][j+1][i] = -Q[2][j][i];
if (ghost_cells >= 4 && ti == ghost_cells + 3) {
Q[j][i-7] = scale_east_west*Q[j][i];
}
if (ghost_cells >= 5 && ti == ghost_cells + 4) {
Q[j][i-9] = scale_east_west*Q[j][i];
}
// East boundary
if (ti == nx_ + ghost_cells - 1) {
Q[j][i+1] = scale_east_west*Q[j][i];
}
if (ghost_cells >= 2 && ti == nx_ + ghost_cells - 2) {
Q[j][i+3] = scale_east_west*Q[j][i];
}
if (ghost_cells >= 3 && ti == nx_ + ghost_cells - 3) {
Q[j][i+5] = scale_east_west*Q[j][i];
}
if (ghost_cells >= 4 && ti == nx_ + ghost_cells - 4) {
Q[j][i+7] = scale_east_west*Q[j][i];
}
if (ghost_cells >= 5 && ti == nx_ + ghost_cells - 5) {
Q[j][i+9] = scale_east_west*Q[j][i];
}
// South boundary
if (tj == ghost_cells) {
Q[j-1][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 2 && tj == ghost_cells + 1) {
Q[j-3][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 3 && tj == ghost_cells + 2) {
Q[j-5][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 4 && tj == ghost_cells + 3) {
Q[j-7][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 5 && tj == ghost_cells + 4) {
Q[j-9][i] = scale_north_south*Q[j][i];
}
// North boundary
if (tj == ny_ + ghost_cells - 1) {
Q[j+1][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 2 && tj == ny_ + ghost_cells - 2) {
Q[j+3][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 3 && tj == ny_ + ghost_cells - 3) {
Q[j+5][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 4 && tj == ny_ + ghost_cells - 4) {
Q[j+7][i] = scale_north_south*Q[j][i];
}
if (ghost_cells >= 5 && tj == ny_ + ghost_cells - 5) {
Q[j+9][i] = scale_north_south*Q[j][i];
}
}
@ -191,59 +217,9 @@ __device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const
/**
* No flow boundary conditions for the shallow water equations
* with two ghost cells in each direction
*/
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
//Global index
const int ti = blockDim.x*blockIdx.x + threadIdx.x + 2; //Skip global ghost cells, i.e., +2
const int tj = blockDim.y*blockIdx.y + threadIdx.y + 2;
//Block-local indices
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int i = tx + 2; //Skip local ghost cells, i.e., +2
const int j = ty + 2;
if (ti == 2) {
Q[0][j][i-1] = Q[0][j][i];
Q[1][j][i-1] = -Q[1][j][i];
Q[2][j][i-1] = Q[2][j][i];
Q[0][j][i-2] = Q[0][j][i+1];
Q[1][j][i-2] = -Q[1][j][i+1];
Q[2][j][i-2] = Q[2][j][i+1];
}
if (ti == nx_+1) {
Q[0][j][i+1] = Q[0][j][i];
Q[1][j][i+1] = -Q[1][j][i];
Q[2][j][i+1] = Q[2][j][i];
Q[0][j][i+2] = Q[0][j][i-1];
Q[1][j][i+2] = -Q[1][j][i-1];
Q[2][j][i+2] = Q[2][j][i-1];
}
if (tj == 2) {
Q[0][j-1][i] = Q[0][j][i];
Q[1][j-1][i] = Q[1][j][i];
Q[2][j-1][i] = -Q[2][j][i];
Q[0][j-2][i] = Q[0][j+1][i];
Q[1][j-2][i] = Q[1][j+1][i];
Q[2][j-2][i] = -Q[2][j+1][i];
}
if (tj == ny_+1) {
Q[0][j+1][i] = Q[0][j][i];
Q[1][j+1][i] = Q[1][j][i];
Q[2][j+1][i] = -Q[2][j][i];
Q[0][j+2][i] = Q[0][j-1][i];
Q[1][j+2][i] = Q[1][j-1][i];
Q[2][j+2][i] = -Q[2][j-1][i];
}
}
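A minimal sketch (illustration only, not part of this commit) of how the removed helpers map onto the new template: the two scale parameters flip the sign of hu across the east/west boundaries and of hv across the north/south boundaries, reproducing the hand-written mirroring above. A one-ghost-cell wrapper equivalent to the old noFlowBoundary1 would be

template<int w, int h>
__device__ void noFlowBoundary1(float Q[3][h+2][w+2], const int nx_, const int ny_) {
    noFlowBoundary<w, h, 1,  1,  1>(Q[0], nx_, ny_);  //h:  mirrored everywhere
    noFlowBoundary<w, h, 1, -1,  1>(Q[1], nx_, ny_);  //hu: negated across east/west
    noFlowBoundary<w, h, 1,  1, -1>(Q[2], nx_, ny_);  //hv: negated across north/south
}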