Renamed macro and added iPython magic

2025-05-18 14:34:13 +02:00 · 2018-08-10 13:59:52 +02:00 · 2018-08-10 13:59:52 +02:00 · 3e401e3fe1
commit 3e401e3fe1
parent 426b8dba5c
11 changed files with 178 additions and 99 deletions
--- a/SWESimulators/Common.py
+++ b/SWESimulators/Common.py
@ -37,6 +37,7 @@ class Timer(object):
 Class which keeps track of the CUDA context and some helper functions
 """
 class CudaContext(object):
    def __init__(self, verbose=True, blocking=False):
        self.verbose = verbose
        self.blocking = blocking
@ -95,6 +96,9 @@ class CudaContext(object):
        self.cuda_context.detach()
    def __str__(self):
        return "CudaContext id " + str(self.cuda_context.handle)
    def hash_kernel(kernel_filename, include_dirs, verbose=False):        
        # Generate a kernel ID for our cache
@ -172,8 +176,8 @@ class CudaContext(object):
                print("`-> Kernel changed or not in hash => compiling " + kernel_filename)
            #Create define string
-            define_string = "#define block_width " + str(block_width) + "\n"
+            define_string = "#define BLOCK_WIDTH " + str(block_width) + "\n"
-            define_string += "#define block_height " + str(block_height) + "\n\n"
+            define_string += "#define BLOCK_HEIGHT " + str(block_height) + "\n\n"
            kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
--- a/SWESimulators/FORCE_kernel.cu
+++ b/SWESimulators/FORCE_kernel.cu
@ -27,8 +27,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__ 
-void computeFluxF(float Q[3][block_height+2][block_width+2],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
@ -39,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i;
            // Q at interface from the right and left
@ -64,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__ 
-void computeFluxG(float Q[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            int i=tx;
@ -113,8 +113,8 @@ __global__ void FORCEKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
-    __shared__ float Q[3][block_height+2][block_width+2];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    //Read into shared memory
--- a/SWESimulators/HLL2_kernel.cu
+++ b/SWESimulators/HLL2_kernel.cu
@ -32,9 +32,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        const int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@ -84,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qy[3][block_height+2][block_width+2],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        { 
            int i=tx;
@ -157,9 +157,9 @@ __global__ void HLL2Kernel(
        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
--- a/SWESimulators/HLL_kernel.cu
+++ b/SWESimulators/HLL_kernel.cu
@ -30,8 +30,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+2][block_width+2],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -40,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells     
-        for (int i=tx; i<block_width+1; i+=block_width) { 
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) { 
            const int k = i;
            const float3 Q_l  = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
@ -64,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            const int i=tx;
@ -120,8 +120,8 @@ __global__ void HLLKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __shared__ float Q[3][block_height+2][block_width+2];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    //Read into shared memory
--- a/SWESimulators/IPythonMagic.py
+++ b/SWESimulators/IPythonMagic.py
@ -0,0 +1,75 @@
 # -*- coding: utf-8 -*-
 """
 This python module implements helpers for IPython / Jupyter and CUDA
 Copyright (C) 2018  SINTEF ICT
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
 from IPython.core.magic import line_magic, Magics, magics_class
 import pycuda.driver as cuda
@magics_class
 class CudaContextHandler(Magics): 
    @line_magic
    def cuda_context_handler(self, context_name):
        print("Registering " + context_name + " as a global context")
        if context_name in self.shell.user_ns.keys():
            print("`-> Context already registered! Ignoring")
            return
        else:
            print("`-> Creating context")
            self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
        # this function will be called on exceptions in any cell
        def custom_exc(shell, etype, evalue, tb, tb_offset=None):
            print("Exception caught: Resetting to CUDA context " + context_name)
            while (cuda.Context.get_current() != None):
                context = cuda.Context.get_current()
                print("`-> popping " + str(context.handle))
                cuda.Context.pop()
            if context_name in self.shell.user_ns.keys():
                print("`-> pushing " + str(self.shell.user_ns[context_name].cuda_context.handle))
                self.shell.ex(context_name + ".cuda_context.push()")
            else:
                print("No CUDA context called " + context_name + " found (something is wrong)!")
                print("CUDA will not work now")
            # still show the error within the notebook, don't just swallow it
            shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
        # this registers a custom exception handler for the whole current notebook
        get_ipython().set_custom_exc((Exception,), custom_exc)
        # Handle CUDA context when exiting python
        import atexit
        def exitfunc():
            print("Exitfunc: Resetting CUDA context stack")
            while (cuda.Context.get_current() != None):
                context = cuda.Context.get_current()
                print("`-> popping " + str(context.handle))
                cuda.Context.pop()
        atexit.register(exitfunc)
 print("Registering automatic CUDA context handling")
 print("(use %cuda_context_handler my_context to create a context called my_context")
 ip = get_ipython()
 ip.register_magics(CudaContextHandler)
--- a/SWESimulators/KP07_dimsplit_kernel.cu
+++ b/SWESimulators/KP07_dimsplit_kernel.cu
@ -30,9 +30,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@ -75,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qy[3][block_height+2][block_width+2],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -148,9 +148,9 @@ __global__ void KP07DimsplitKernel(
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
--- a/SWESimulators/KP07_kernel.cu
+++ b/SWESimulators/KP07_kernel.cu
@ -30,9 +30,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Q at interface from the right and left
            const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
@ -61,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qy[3][block_height+2][block_width+2],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -129,14 +129,14 @@ __global__ void KP07Kernel(
    const int tj = get_global_id(1) + 2;
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
    //The following slightly wastes memory, but enables us to reuse the 
    //funcitons in common.opencl
-    __shared__ float Qx[3][block_height+2][block_width+2];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float Qy[3][block_height+2][block_width+2];
+    __shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
-    __shared__ float G[3][block_height+1][block_width+1];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
--- a/SWESimulators/LxF_kernel.cu
+++ b/SWESimulators/LxF_kernel.cu
@ -27,8 +27,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__ 
-void computeFluxF(float Q[3][block_height+2][block_width+2],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float F[3][block_height][block_width+1],
+                  float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -37,7 +37,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i;
            // Q at interface from the right and left
@ -62,14 +62,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__ 
-void computeFluxG(float Q[3][block_height+2][block_width+2],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-                  float G[3][block_height+1][block_width],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            const int i=tx;
@ -114,9 +114,9 @@ __global__ void LxFKernel(
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
-    __shared__ float Q[3][block_height+2][block_width+2];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
-    __shared__ float F[3][block_height][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
-    __shared__ float G[3][block_height+1][block_width];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
    //Read into shared memory
    readBlock1(h0_ptr_, h0_pitch_,
--- a/SWESimulators/WAF_kernel.cu
+++ b/SWESimulators/WAF_kernel.cu
@ -33,8 +33,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float F[3][block_height+1][block_width+1],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty; 
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Q at interface from the right and left
@ -72,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float G[3][block_height+1][block_width+1],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -130,8 +130,8 @@ __global__ void WAFKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {    
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
--- a/SWESimulators/common.cu
+++ b/SWESimulators/common.cu
@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 __device__ void readBlock1(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+2][block_width+2], 
+                float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    //Read into shared memory
-    for (int j=ty; j<block_height+2; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+1); // Out of bounds
        //Compute the pointer to current row in the arrays
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
-        for (int i=tx; i<block_width+2; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
            Q[0][j][i] = h_row[k];
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
 __device__ void readBlock2(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+4][block_width+4], 
+                float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    //Read into shared memory
-    for (int j=ty; j<block_height+4; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+3); // Out of bounds
        //Compute the pointer to current row in the arrays
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
-        for (int i=tx; i<block_width+4; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
            Q[0][j][i] = h_row[k];
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+2][block_width+2],
+                 float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+4][block_width+4], 
+                 float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
  * No flow boundary conditions for the shallow water equations
  * with one ghost cell in each direction
  */
-__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
+__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
  * No flow boundary conditions for the shallow water equations
  * with two ghost cells in each direction
  */
-__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
+__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
    const int tj = get_global_id(1) + 2;
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
+__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-              float F[3][block_height+1][block_width+1],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
+__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-              float F[3][block_height+1][block_width+1],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
+__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
-              float G[3][block_height+1][block_width+1],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
+__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-              float G[3][block_height+1][block_width+1],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
--- a/SWESimulators/limiters.cu
+++ b/SWESimulators/limiters.cu
@ -46,8 +46,8 @@ __device__ __inline__ float minmodSlope(float left, float center, float right, f
 /**
  * Reconstructs a minmod slope for a whole block along the abscissa
  */
-__device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
+__device__ void minmodSlopeX(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qx[3][block_height+2][block_width+2],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -57,7 +57,7 @@ __device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
    {
        const int j = ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+2; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            for (int p=0; p<3; ++p) {
                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
@ -70,14 +70,14 @@ __device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
 /**
  * Reconstructs a minmod slope for a whole block along the ordinate
  */
-__device__ void minmodSlopeY(float  Q[3][block_height+4][block_width+4],
+__device__ void minmodSlopeY(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
-                  float Qy[3][block_height+2][block_width+2],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
-    for (int j=ty; j<block_height+2; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            const int i = tx;