Renamed macro and added iPython magic

2026-01-01 01:08:45 +01:00 · 2018-08-10 13:59:52 +02:00
parent 426b8dba5c
commit 3e401e3fe1
11 changed files with 178 additions and 99 deletions
--- a/SWESimulators/Common.py
+++ b/SWESimulators/Common.py
@@ -37,6 +37,7 @@ class Timer(object):
 Class which keeps track of the CUDA context and some helper functions
 """
 class CudaContext(object):
+    
    def __init__(self, verbose=True, blocking=False):
        self.verbose = verbose
        self.blocking = blocking
@@ -93,7 +94,10 @@ class CudaContext(object):
        if (self.verbose):
            print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
        self.cuda_context.detach()
-                    
+        
+        
+    def __str__(self):
+        return "CudaContext id " + str(self.cuda_context.handle)
    
    def hash_kernel(kernel_filename, include_dirs, verbose=False):        
        # Generate a kernel ID for our cache
@@ -172,8 +176,8 @@ class CudaContext(object):
                print("`-> Kernel changed or not in hash => compiling " + kernel_filename)
                
            #Create define string
-            define_string = "#define block_width " + str(block_width) + "\n"
-            define_string += "#define block_height " + str(block_height) + "\n\n"
+            define_string = "#define BLOCK_WIDTH " + str(block_width) + "\n"
+            define_string += "#define BLOCK_HEIGHT " + str(block_height) + "\n\n"
            
            kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
            
--- a/SWESimulators/FORCE_kernel.cu
+++ b/SWESimulators/FORCE_kernel.cu
@@ -27,8 +27,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__ 
-void computeFluxF(float Q[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
                      
    //Index of thread within block
@@ -39,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i;
            
            // Q at interface from the right and left
@@ -64,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__ 
-void computeFluxG(float Q[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            int i=tx;
@@ -113,8 +113,8 @@ __global__ void FORCEKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
    
-    __shared__ float Q[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    //Read into shared memory
--- a/SWESimulators/HLL2_kernel.cu
+++ b/SWESimulators/HLL2_kernel.cu
@@ -32,9 +32,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        const int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@@ -84,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        { 
            int i=tx;
@@ -157,9 +157,9 @@ __global__ void HLL2Kernel(
        float* hv1_ptr_, int hv1_pitch_) {
            
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
--- a/SWESimulators/HLL_kernel.cu
+++ b/SWESimulators/HLL_kernel.cu
@@ -30,8 +30,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -40,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells     
-        for (int i=tx; i<block_width+1; i+=block_width) { 
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) { 
            const int k = i;
            
            const float3 Q_l  = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
@@ -64,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            const int i=tx;
@@ -120,8 +120,8 @@ __global__ void HLLKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
    //Shared memory variables
-    __shared__ float Q[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    //Read into shared memory
--- a/SWESimulators/IPythonMagic.py
+++ b/SWESimulators/IPythonMagic.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements helpers for IPython / Jupyter and CUDA
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from IPython.core.magic import line_magic, Magics, magics_class
+import pycuda.driver as cuda
+
+
+
+@magics_class
+class CudaContextHandler(Magics): 
+    @line_magic
+    def cuda_context_handler(self, context_name):
+        print("Registering " + context_name + " as a global context")
+        
+        if context_name in self.shell.user_ns.keys():
+            print("`-> Context already registered! Ignoring")
+            return
+        else:
+            print("`-> Creating context")
+            self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
+        
+        # this function will be called on exceptions in any cell
+        def custom_exc(shell, etype, evalue, tb, tb_offset=None):
+            print("Exception caught: Resetting to CUDA context " + context_name)
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                print("`-> popping " + str(context.handle))
+                cuda.Context.pop()
+
+            if context_name in self.shell.user_ns.keys():
+                print("`-> pushing " + str(self.shell.user_ns[context_name].cuda_context.handle))
+                self.shell.ex(context_name + ".cuda_context.push()")
+            else:
+                print("No CUDA context called " + context_name + " found (something is wrong)!")
+                print("CUDA will not work now")
+
+            # still show the error within the notebook, don't just swallow it
+            shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
+
+        # this registers a custom exception handler for the whole current notebook
+        get_ipython().set_custom_exc((Exception,), custom_exc)
+        
+        
+        # Handle CUDA context when exiting python
+        import atexit
+        def exitfunc():
+            print("Exitfunc: Resetting CUDA context stack")
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                print("`-> popping " + str(context.handle))
+                cuda.Context.pop()
+        atexit.register(exitfunc)
+
+print("Registering automatic CUDA context handling")
+print("(use %cuda_context_handler my_context to create a context called my_context")
+ip = get_ipython()
+ip.register_magics(CudaContextHandler)
--- a/SWESimulators/KP07_dimsplit_kernel.cu
+++ b/SWESimulators/KP07_dimsplit_kernel.cu
@@ -30,9 +30,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@@ -75,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }

 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@@ -148,9 +148,9 @@ __global__ void KP07DimsplitKernel(
        
        
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
--- a/SWESimulators/KP07_kernel.cu
+++ b/SWESimulators/KP07_kernel.cu
@@ -30,9 +30,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -41,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Q at interface from the right and left
            const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
@@ -61,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }

 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@@ -129,14 +129,14 @@ __global__ void KP07Kernel(
    const int tj = get_global_id(1) + 2;
    
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
    
    //The following slightly wastes memory, but enables us to reuse the 
    //funcitons in common.opencl
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float Qy[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
-    __shared__ float G[3][block_height+1][block_width+1];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
--- a/SWESimulators/LxF_kernel.cu
+++ b/SWESimulators/LxF_kernel.cu
@@ -27,8 +27,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__ 
-void computeFluxF(float Q[3][block_height+2][block_width+2],
-                  float F[3][block_height][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -37,7 +37,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i;
            
            // Q at interface from the right and left
@@ -62,14 +62,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__ 
-void computeFluxG(float Q[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            const int i=tx;
@@ -114,9 +114,9 @@ __global__ void LxFKernel(
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
    
-    __shared__ float Q[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height][block_width+1];
-    __shared__ float G[3][block_height+1][block_width];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT][BLOCK_WIDTH+1];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH];
    
    //Read into shared memory
    readBlock1(h0_ptr_, h0_pitch_,
--- a/SWESimulators/WAF_kernel.cu
+++ b/SWESimulators/WAF_kernel.cu
@@ -33,8 +33,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -43,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty; 
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            
            // Q at interface from the right and left
@@ -72,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@@ -130,8 +130,8 @@ __global__ void WAFKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {    
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
--- a/SWESimulators/common.cu
+++ b/SWESimulators/common.cu
@@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 __device__ void readBlock1(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+2][block_width+2], 
+                float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    
    //Read into shared memory
-    for (int j=ty; j<block_height+2; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+1); // Out of bounds
        
        //Compute the pointer to current row in the arrays
@@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
        
-        for (int i=tx; i<block_width+2; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
            
            Q[0][j][i] = h_row[k];
@@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
 __device__ void readBlock2(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+4][block_width+4], 
+                float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    
    //Read into shared memory
-    for (int j=ty; j<block_height+4; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+3); // Out of bounds
        
        //Compute the pointer to current row in the arrays
@@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
        
-        for (int i=tx; i<block_width+4; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
            
            Q[0][j][i] = h_row[k];
@@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+2][block_width+2],
+                 float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+4][block_width+4], 
+                 float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
  * No flow boundary conditions for the shallow water equations
  * with one ghost cell in each direction
  */
-__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
+__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
@@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
  * No flow boundary conditions for the shallow water equations
  * with two ghost cells in each direction
  */
-__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
+__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
    const int tj = get_global_id(1) + 2;
@@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
-              float F[3][block_height+1][block_width+1],
+__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
-              float F[3][block_height+1][block_width+1],
+__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
-              float G[3][block_height+1][block_width+1],
+__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
-              float G[3][block_height+1][block_width+1],
+__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
--- a/SWESimulators/limiters.cu
+++ b/SWESimulators/limiters.cu
@@ -46,8 +46,8 @@ __device__ __inline__ float minmodSlope(float left, float center, float right, f
 /**
  * Reconstructs a minmod slope for a whole block along the abscissa
  */
-__device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
+__device__ void minmodSlopeX(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@@ -57,7 +57,7 @@ __device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
    {
        const int j = ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+2; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            for (int p=0; p<3; ++p) {
                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
@@ -70,14 +70,14 @@ __device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
 /**
  * Reconstructs a minmod slope for a whole block along the ordinate
  */
-__device__ void minmodSlopeY(float  Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
+__device__ void minmodSlopeY(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                  const float theta_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+2; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            const int i = tx;