Renamed macro and added iPython magic

This commit is contained in:
André R. Brodtkorb 2018-08-10 13:59:52 +02:00
parent 426b8dba5c
commit 3e401e3fe1
11 changed files with 178 additions and 99 deletions

View File

@ -37,6 +37,7 @@ class Timer(object):
Class which keeps track of the CUDA context and some helper functions
"""
class CudaContext(object):
def __init__(self, verbose=True, blocking=False):
self.verbose = verbose
self.blocking = blocking
@ -93,7 +94,10 @@ class CudaContext(object):
if (self.verbose):
print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
self.cuda_context.detach()
def __str__(self):
return "CudaContext id " + str(self.cuda_context.handle)
def hash_kernel(kernel_filename, include_dirs, verbose=False):
# Generate a kernel ID for our cache
@ -172,8 +176,8 @@ class CudaContext(object):
print("`-> Kernel changed or not in hash => compiling " + kernel_filename)
#Create define string
define_string = "#define block_width " + str(block_width) + "\n"
define_string += "#define block_height " + str(block_height) + "\n\n"
define_string = "#define BLOCK_WIDTH " + str(block_width) + "\n"
define_string += "#define BLOCK_HEIGHT " + str(block_height) + "\n\n"
kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'

View File

@ -27,8 +27,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
@ -39,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
{
int j=ty;
const int l = j + 1; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i;
// Q at interface from the right and left
@ -64,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Compute fluxes along the y axis
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
{
int i=tx;
@ -113,8 +113,8 @@ __global__ void FORCEKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
//Read into shared memory

View File

@ -32,9 +32,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
const int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Reconstruct point values of Q at the left and right hand side
// of the cell for both the left (i) and right (i+1) cell
@ -84,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -157,9 +157,9 @@ __global__ void HLL2Kernel(
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];

View File

@ -30,8 +30,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -40,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
{
const int j=ty;
const int l = j + 1; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i;
const float3 Q_l = make_float3(Q[0][l][k ], Q[1][l][k ], Q[2][l][k ]);
@ -64,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
{
const int i=tx;
@ -120,8 +120,8 @@ __global__ void HLLKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
//Read into shared memory

View File

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
"""
This python module implements helpers for IPython / Jupyter and CUDA
Copyright (C) 2018 SINTEF ICT
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from IPython.core.magic import line_magic, Magics, magics_class
import pycuda.driver as cuda
@magics_class
class CudaContextHandler(Magics):
@line_magic
def cuda_context_handler(self, context_name):
print("Registering " + context_name + " as a global context")
if context_name in self.shell.user_ns.keys():
print("`-> Context already registered! Ignoring")
return
else:
print("`-> Creating context")
self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
# this function will be called on exceptions in any cell
def custom_exc(shell, etype, evalue, tb, tb_offset=None):
print("Exception caught: Resetting to CUDA context " + context_name)
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
print("`-> popping " + str(context.handle))
cuda.Context.pop()
if context_name in self.shell.user_ns.keys():
print("`-> pushing " + str(self.shell.user_ns[context_name].cuda_context.handle))
self.shell.ex(context_name + ".cuda_context.push()")
else:
print("No CUDA context called " + context_name + " found (something is wrong)!")
print("CUDA will not work now")
# still show the error within the notebook, don't just swallow it
shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
# this registers a custom exception handler for the whole current notebook
get_ipython().set_custom_exc((Exception,), custom_exc)
# Handle CUDA context when exiting python
import atexit
def exitfunc():
print("Exitfunc: Resetting CUDA context stack")
while (cuda.Context.get_current() != None):
context = cuda.Context.get_current()
print("`-> popping " + str(context.handle))
cuda.Context.pop()
atexit.register(exitfunc)
print("Registering automatic CUDA context handling")
print("(use %cuda_context_handler my_context to create a context called my_context")
ip = get_ipython()
ip.register_magics(CudaContextHandler)

View File

@ -30,9 +30,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Reconstruct point values of Q at the left and right hand side
// of the cell for both the left (i) and right (i+1) cell
@ -75,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
}
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -148,9 +148,9 @@ __global__ void KP07DimsplitKernel(
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];

View File

@ -30,9 +30,9 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Q at interface from the right and left
const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
@ -61,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
}
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -129,14 +129,14 @@ __global__ void KP07Kernel(
const int tj = get_global_id(1) + 2;
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
//The following slightly wastes memory, but enables us to reuse the
//funcitons in common.opencl
__shared__ float Qx[3][block_height+2][block_width+2];
__shared__ float Qy[3][block_height+2][block_width+2];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float G[3][block_height+1][block_width+1];
__shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];

View File

@ -27,8 +27,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+2][block_width+2],
float F[3][block_height][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -37,7 +37,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
{
const int j=ty;
const int l = j + 1; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i;
// Q at interface from the right and left
@ -62,14 +62,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width],
void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j;
{
const int i=tx;
@ -114,9 +114,9 @@ __global__ void LxFKernel(
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
__shared__ float Q[3][block_height+2][block_width+2];
__shared__ float F[3][block_height][block_width+1];
__shared__ float G[3][block_height+1][block_width];
__shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
__shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
__shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
//Read into shared memory
readBlock1(h0_ptr_, h0_pitch_,

View File

@ -33,8 +33,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
* Computes the flux along the x axis for all faces
*/
__device__
void computeFluxF(float Q[3][block_height+4][block_width+4],
float F[3][block_height+1][block_width+1],
void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dx_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
{
int j=ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+1; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
const int k = i + 1;
// Q at interface from the right and left
@ -72,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
* Computes the flux along the y axis for all faces
*/
__device__
void computeFluxG(float Q[3][block_height+4][block_width+4],
float G[3][block_height+1][block_width+1],
void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const float g_, const float dy_, const float dt_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Compute fluxes along the y axis
for (int j=ty; j<block_height+1; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
int i=tx;
@ -130,8 +130,8 @@ __global__ void WAFKernel(
float* hu1_ptr_, int hu1_pitch_,
float* hv1_ptr_, int hv1_pitch_) {
//Shared memory variables
__shared__ float Q[3][block_height+4][block_width+4];
__shared__ float F[3][block_height+1][block_width+1];
__shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
__shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];

View File

@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
__device__ void readBlock1(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+2][block_width+2],
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Index of block within domain
const int bx = block_width * get_group_id(0);
const int by = block_height * get_group_id(1);
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
//Read into shared memory
for (int j=ty; j<block_height+2; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
const int l = clamp(by + j, 0, ny_+1); // Out of bounds
//Compute the pointer to current row in the arrays
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
for (int i=tx; i<block_width+2; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
Q[0][j][i] = h_row[k];
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
__device__ void readBlock2(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+4][block_width+4],
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
//Index of block within domain
const int bx = block_width * get_group_id(0);
const int by = block_height * get_group_id(1);
const int bx = BLOCK_WIDTH * get_group_id(0);
const int by = BLOCK_HEIGHT * get_group_id(1);
//Read into shared memory
for (int j=ty; j<block_height+4; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
const int l = clamp(by + j, 0, ny_+3); // Out of bounds
//Compute the pointer to current row in the arrays
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
for (int i=tx; i<block_width+4; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
Q[0][j][i] = h_row[k];
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
__device__ void writeBlock1(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+2][block_width+2],
float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
__device__ void writeBlock2(float* h_ptr_, int h_pitch_,
float* hu_ptr_, int hu_pitch_,
float* hv_ptr_, int hv_pitch_,
float Q[3][block_height+4][block_width+4],
float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
const int nx_, const int ny_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
* No flow boundary conditions for the shallow water equations
* with one ghost cell in each direction
*/
__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
const int tj = get_global_id(1) + 1;
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
* No flow boundary conditions for the shallow water equations
* with two ghost cells in each direction
*/
__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
//Global index
const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
const int tj = get_global_id(1) + 2;
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
/**
* Evolves the solution in time along the x axis (dimensional splitting)
*/
__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
float F[3][block_height+1][block_width+1],
__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
/**
* Evolves the solution in time along the x axis (dimensional splitting)
*/
__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
float F[3][block_height+1][block_width+1],
__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dx_, const float dt_) {
//Index of thread within block
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
/**
* Evolves the solution in time along the y axis (dimensional splitting)
*/
__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
float G[3][block_height+1][block_width+1],
__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
/**
* Evolves the solution in time along the y axis (dimensional splitting)
*/
__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
float G[3][block_height+1][block_width+1],
__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
const int nx_, const int ny_,
const float dy_, const float dt_) {
//Index of thread within block

View File

@ -46,8 +46,8 @@ __device__ __inline__ float minmodSlope(float left, float center, float right, f
/**
* Reconstructs a minmod slope for a whole block along the abscissa
*/
__device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
float Qx[3][block_height+2][block_width+2],
__device__ void minmodSlopeX(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
@ -57,7 +57,7 @@ __device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
{
const int j = ty;
const int l = j + 2; //Skip ghost cells
for (int i=tx; i<block_width+2; i+=block_width) {
for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
const int k = i + 1;
for (int p=0; p<3; ++p) {
Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
@ -70,14 +70,14 @@ __device__ void minmodSlopeX(float Q[3][block_height+4][block_width+4],
/**
* Reconstructs a minmod slope for a whole block along the ordinate
*/
__device__ void minmodSlopeY(float Q[3][block_height+4][block_width+4],
float Qy[3][block_height+2][block_width+2],
__device__ void minmodSlopeY(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
const float theta_) {
//Index of thread within block
const int tx = get_local_id(0);
const int ty = get_local_id(1);
for (int j=ty; j<block_height+2; j+=block_height) {
for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
const int l = j + 1;
{
const int i = tx;