Merge pull request #4 from babrodtk/cleanup

Cleanup
2025-07-15 22:51:00 +02:00 · 2018-08-13 16:14:07 +02:00 · 2018-08-13 16:14:07 +02:00 · 90df79fa9b
commit 90df79fa9b
parent a193e582c9 e48a408a7c
32 changed files with 2137 additions and 5163 deletions
--- a/ConvergenceShock1D.ipynb
+++ b/ConvergenceShock1D.ipynb
--- a/ConvergenceSmooth1D.ipynb
+++ b/ConvergenceSmooth1D.ipynb
--- a/ConvergenceSmooth2D.ipynb
+++ b/ConvergenceSmooth2D.ipynb
--- a/Desingularization.ipynb
+++ b/Desingularization.ipynb
--- a/SWESimulators/Common.py
+++ b/SWESimulators/Common.py
@ -1,108 +1,261 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the different helper functions and 
+classes
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
 import os

 import numpy as np
 import time
 import re
+import io
+import hashlib
+import logging

 import pycuda.compiler as cuda_compiler
 import pycuda.gpuarray
 import pycuda.driver as cuda

+
+
+"""
+Class which keeps track of time spent for a section of code
+"""
+class Timer(object):
+    def __init__(self, tag, log_level=logging.DEBUG):
+        self.tag = tag
+        self.log_level = log_level
+        self.logger = logging.getLogger(__name__)
+        
+    def __enter__(self):
+        self.start = time.time()
+        return self
+    
+    def __exit__(self, *args):
+        self.end = time.time()
+        self.secs = self.end - self.start
+        self.msecs = self.secs * 1000 # millisecs
+        self.logger.log(self.log_level, "%s: %f ms", self.tag, self.msecs)
+            
+            
+            
+            
+
 """
 Class which keeps track of the CUDA context and some helper functions
 """
 class CudaContext(object):
-    def __init__(self, verbose=True, blocking=False):
+    
+    def __init__(self, verbose=True, blocking=False, use_cache=True):
        self.verbose = verbose
        self.blocking = blocking
+        self.use_cache = use_cache
+        self.logger =  logging.getLogger(__name__)
        self.kernels = {}
        
+        self.module_path = os.path.dirname(os.path.realpath(__file__))
+        
+        #Initialize cuda (must be first call to PyCUDA)
        cuda.init(flags=0)
        
-        if (self.verbose):
-            print("CUDA version " + str(cuda.get_version()))
-            print("Driver version " + str(cuda.get_driver_version()))
+        #Print some info about CUDA
+        self.logger.info("CUDA version %s", str(cuda.get_version()))
+        self.logger.info("Driver version %s",  str(cuda.get_driver_version()))

        self.cuda_device = cuda.Device(0)
        if (self.verbose):
-            print("Using " + self.cuda_device.name())
-            print(" => compute capability: " + str(self.cuda_device.compute_capability()))
-            print(" => memory: " + str(self.cuda_device.total_memory() / (1024*1024)) + " MB")
+            self.logger.info("Using '%s' GPU", self.cuda_device.name())
+            self.logger.debug(" => compute capability: %s", str(self.cuda_device.compute_capability()))
+            self.logger.debug(" => memory: %d MB", self.cuda_device.total_memory() / (1024*1024))

+        # Create the CUDA context
        if (self.blocking):
            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_BLOCKING_SYNC)
-            if (self.verbose):
-                print("=== WARNING ===")
-                print("Using blocking context")
-                print("=== WARNING ===")
+            self.logger.warning("Using blocking context")
        else:
            self.cuda_context = self.cuda_device.make_context(flags=cuda.ctx_flags.SCHED_AUTO)
        
-        if (self.verbose):
-            print("Created context <" + str(self.cuda_context.handle) + ">")
+        self.logger.info("Created context handle <%s>", str(self.cuda_context.handle))
+        
+        #Create cache dir for cubin files
+        if (self.use_cache):
+            self.cache_path = os.path.join(self.module_path, "cuda_cache") 
+            if not os.path.isdir(self.cache_path):
+                os.mkdir(self.cache_path)
+            self.logger.debug("Using CUDA cache dir %s", self.cache_path)
            
    
    def __del__(self, *args):
-        if self.verbose:
-            print("Cleaning up CUDA context <" + str(self.cuda_context.handle) + ">")
+        self.logger.info("Cleaning up CUDA context handle <%s>", str(self.cuda_context.handle))
            
        # Loop over all contexts in stack, and remove "this"
        other_contexts = []
        while (cuda.Context.get_current() != None):
            context = cuda.Context.get_current()
-            if (self.verbose):
-                if (context.handle != self.cuda_context.handle):
-                    print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> which we do not own")
-                    other_contexts = [context] + other_contexts
-                    cuda.Context.pop()
-                else:
-                    print(" `-> <" + str(self.cuda_context.handle) + "> Popping context <" + str(context.handle) + "> (ourselves)")
-                    cuda.Context.pop()
+            if (context.handle != self.cuda_context.handle):
+                self.logger.debug("<%s> Popping <%s> (*not* ours)", str(self.cuda_context.handle), str(context.handle))
+                other_contexts = [context] + other_contexts
+                cuda.Context.pop()
+            else:
+                self.logger.debug("<%s> Popping <%s> (ours)", str(self.cuda_context.handle), str(context.handle))
+                cuda.Context.pop()

        # Add all the contexts we popped that were not our own
        for context in other_contexts:
-            if (self.verbose):
-                print(" `-> <" + str(self.cuda_context.handle) + "> Pushing <" + str(context.handle) + ">")
+            self.logger.debug("<%s> Pushing <%s>", str(self.cuda_context.handle), str(context.handle))
            cuda.Context.push(context)
            
-        if (self.verbose):
-            print(" `-> <" + str(self.cuda_context.handle) + "> Detaching context")
+        self.logger.debug("<%s> Detaching", str(self.cuda_context.handle))
        self.cuda_context.detach()
-
+        
+        
+    def __str__(self):
+        return "CudaContext id " + str(self.cuda_context.handle)
        
    
-    """
-    Reads a text file and creates an OpenCL kernel from that
-    """
-    def get_kernel(self, kernel_filename, block_width, block_height):
-        # Generate a kernel ID for our cache
-        module_path = os.path.dirname(os.path.realpath(__file__))
-        
-        kernel_hash = ""
+    def hash_kernel(kernel_filename, include_dirs, verbose=False):        
+        # Generate a kernel ID for our caches
+        num_includes = 0
+        max_includes = 100
+        kernel_hasher = hashlib.md5()
+        logger = logging.getLogger(__name__)
        
        # Loop over file and includes, and check if something has changed
        files = [kernel_filename]
        while len(files):
-            filename = os.path.join(module_path, files.pop())
-            modified = os.path.getmtime(filename)
-            with open(filename, "r") as file:
-                file_str = file.read()
-                file_hash = filename + "_" + str(hash(file_str)) + ":" + str(modified) + "--"
-                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
-                files = files + includes #WARNING FIXME This will not work with circular includes
+        
+            if (num_includes > max_includes):
+                raise("Maximum number of includes reached - circular include in {:}?".format(kernel_filename))
+        
+            filename = files.pop()
+            
+            logger.debug("Hashing %s", filename)
                
-            kernel_hash = kernel_hash + file_hash
+            modified = os.path.getmtime(filename)
+                
+            # Open the file
+            with io.open(filename, "r") as file:
+            
+                # Search for #inclue <something> and also hash the file
+                file_str = file.read()
+                kernel_hasher.update(file_str.encode('utf-8'))
+                kernel_hasher.update(str(modified).encode('utf-8'))
+                
+                #Find all includes
+                includes = re.findall('^\W*#include\W+(.+?)\W*$', file_str, re.M)
+                
+            # Loop over everything that looks like an include
+            for include_file in includes:
+                
+                #Search through include directories for the file
+                file_path = os.path.dirname(filename)
+                for include_path in [file_path] + include_dirs:
+                
+                    # If we find it, add it to list of files to check
+                    temp_path = os.path.join(include_path, include_file)
+                    if (os.path.isfile(temp_path)):
+                        files = files + [temp_path]
+                        num_includes = num_includes + 1 #For circular includes...
+                        break
+            
+        return kernel_hasher.hexdigest()
    
-        # Recompile kernel if file or includes have changed
-        if (kernel_hash not in self.kernels.keys()):
-            #Create define string
-            define_string = "#define block_width " + str(block_width) + "\n"
-            define_string += "#define block_height " + str(block_height) + "\n\n"
+    """
+    Reads a text file and creates an OpenCL kernel from that
+    """
+    def get_prepared_kernel(self, kernel_filename, kernel_function_name, \
+                    prepared_call_args, \
+                    include_dirs=[], verbose=False, no_extern_c=True, 
+                    **kwargs):
+        """
+        Helper function to print compilation output
+        """
+        def cuda_compile_message_handler(compile_success_bool, info_str, error_str):
+            self.logger.debug("Compilation returned %s", str(compile_success_bool))
+            if info_str:
+                self.logger.debug("Info: %s", info_str)
+            if error_str:
+                self.logger.debug("Error: %s", error_str)
+        
+        self.logger.debug("Getting %s", kernel_filename)
            
-            kernel_string = define_string + '#include "' + os.path.join(module_path, kernel_filename) + '"'
-            self.kernels[kernel_hash] = cuda_compiler.SourceModule(kernel_string, include_dirs=[module_path])
+        # Create a hash of the kernel (and its includes)
+        kwargs_hasher = hashlib.md5()
+        kwargs_hasher.update(str(kwargs).encode('utf-8'));
+        kwargs_hash = kwargs_hasher.hexdigest()
+        kwargs_hasher = None
+        root, ext = os.path.splitext(kernel_filename)
+        kernel_hash = root \
+                + "_" + CudaContext.hash_kernel( \
+                    os.path.join(self.module_path, kernel_filename), \
+                    include_dirs=[self.module_path] + include_dirs, \
+                    verbose=verbose) \
+                + "_" + kwargs_hash \
+                + ext
+        cached_kernel_filename = os.path.join(self.cache_path, kernel_hash)
+        
+        # If we have the kernel in our hashmap, return it
+        if (kernel_hash in self.kernels.keys()):
+            self.logger.debug("Found kernel %s cached in hashmap (%s)", kernel_filename, kernel_hash)
+            return self.kernels[kernel_hash]
+        
+        # If we have it on disk, return it
+        elif (self.use_cache and os.path.isfile(cached_kernel_filename)):
+            self.logger.debug("Found kernel %s cached on disk (%s)", kernel_filename, kernel_hash)
+                
+            with io.open(cached_kernel_filename, "rb") as file:
+                file_str = file.read()
+                module = cuda.module_from_buffer(file_str, message_handler=cuda_compile_message_handler)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            return kernel
            
-        return self.kernels[kernel_hash]
+        # Otherwise, compile it from source
+        else:
+            self.logger.debug("Compiling %s (%s)", kernel_filename, kernel_hash)
+                
+            #Create kernel string
+            kernel_string = ""
+            for key, value in kwargs.items():
+                kernel_string += "#define {:s} {:s}\n".format(str(key), str(value))
+            kernel_string += '#include "{:s}"'.format(os.path.join(self.module_path, kernel_filename))
+            if (self.use_cache):
+                with io.open(cached_kernel_filename + ".txt", "w") as file:
+                    file.write(kernel_string)
+                
+            
+            with Timer("compiler") as timer:
+                cubin = cuda_compiler.compile(kernel_string, include_dirs=include_dirs, no_extern_c=no_extern_c, cache_dir=False)
+                module = cuda.module_from_buffer(cubin, message_handler=cuda_compile_message_handler)
+                if (self.use_cache):
+                    with io.open(cached_kernel_filename, "wb") as file:
+                        file.write(cubin)
+                
+            kernel = module.get_function(kernel_function_name)
+            kernel.prepare(prepared_call_args)
+            self.kernels[kernel_hash] = kernel
+            
+            
+            return kernel
    
    """
    Clears the kernel cache (useful for debugging & development)
@ -113,21 +266,6 @@ class CudaContext(object):
        
        
        
-class Timer(object):
-    def __init__(self, tag, verbose=True):
-        self.verbose = verbose
-        self.tag = tag
-        
-    def __enter__(self):
-        self.start = time.time()
-        return self
-    
-    def __exit__(self, *args):
-        self.end = time.time()
-        self.secs = self.end - self.start
-        self.msecs = self.secs * 1000 # millisecs
-        if self.verbose:
-            print("=> " + self.tag + ' %f ms' % self.msecs)
        
        

@ -139,12 +277,14 @@ class CUDAArray2D:
    Uploads initial data to the CL device
    """
    def __init__(self, stream, nx, ny, halo_x, halo_y, data):
-        
+        self.logger =  logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.nx_halo = nx + 2*halo_x
        self.ny_halo = ny + 2*halo_y
        
+        self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
+        
        #Make sure data is in proper format
        assert np.issubdtype(data.dtype, np.float32), "Wrong datatype: %s" % str(data.dtype)
        assert not np.isfortran(data), "Wrong datatype (Fortran, expected C)"
@ -156,17 +296,26 @@ class CUDAArray2D:
        self.bytes_per_float = data.itemsize
        assert(self.bytes_per_float == 4)
        self.pitch = np.int32((self.nx_halo)*self.bytes_per_float)
+        self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)
        
        
+    def __del__(self, *args):
+        self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
+        self.data.gpudata.free()
+        self.data = None
+        
    """
    Enables downloading data from CL device to Python
    """
    def download(self, stream, async=False):
+        
        #Copy data from device to host
        if (async):
+            self.logger.debug("Buffer <%s> [%dx%d]: Downloading async ", int(self.data.gpudata), self.nx, self.ny)
            host_data = self.data.get_async(stream=stream)
            return host_data
        else:
+            self.logger.debug("Buffer <%s> [%dx%d]: Downloading synchronously", int(self.data.gpudata), self.nx, self.ny)
            host_data = self.data.get(stream=stream)#, pagelocked=True) # pagelocked causes crash on windows at least
            return host_data

--- a/SWESimulators/FORCE.py
+++ b/SWESimulators/FORCE.py
@ -21,13 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 #Import packages we need
-import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -42,7 +36,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations 
 """
-class FORCE:
+class FORCE (Simulator.BaseSimulator):

    """
    Initialization routine
@ -63,80 +57,40 @@ class FORCE:
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);

        #Get kernels
-        self.force_module = context.get_kernel("FORCE_kernel.cu", block_width, block_height)
-        self.force_kernel = self.force_module.get_function("FORCEKernel")
-        self.force_kernel.prepare("iiffffPiPiPiPiPiPi")
-        
-        #Create data by uploading to device
-        ghost_cells_x = 1
-        ghost_cells_y = 1
-        self.data = Common.SWEDataArakawaA(self.stream, \
-                            nx, ny, \
-                            ghost_cells_x, ghost_cells_y, \
-                            h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
+        self.kernel = context.get_prepared_kernel("FORCE_kernel.cu", "FORCEKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
    
    def __str__(self):
        return "First order centered"
-    
-    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / self.dt + 1)
        
-        for i in range(0, n):        
-            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
-            
-            if (local_dt <= 0.0):
-                break
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
        
-            self.force_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                
-            self.t += local_dt
-            
-            self.data.swap()
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata, self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata, self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
        
-        return self.t, n
-        
-        
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)
-
+        
--- a/SWESimulators/FORCE_kernel.cu
+++ b/SWESimulators/FORCE_kernel.cu
@ -20,14 +20,15 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "fluxes/FirstOrderCentered.cu"


 /**
  * Computes the flux along the x axis for all faces
  */
 __device__ 
-void computeFluxF(float Q[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
                      
    //Index of thread within block
@ -38,7 +39,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        int j=ty;
        const int l = j + 1; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i;
            
            // Q at interface from the right and left
@ -63,15 +64,15 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__ 
-void computeFluxG(float Q[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            int i=tx;
@ -97,6 +98,7 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
 }


+extern "C" {
 __global__ void FORCEKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
@ -112,8 +114,8 @@ __global__ void FORCEKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
    
-    __shared__ float Q[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    //Read into shared memory
@ -149,4 +151,6 @@ __global__ void FORCEKernel(
                hu1_ptr_, hu1_pitch_,
                hv1_ptr_, hv1_pitch_,
                Q, nx_, ny_);
-}
+}
+
+} // extern "C"
--- a/SWESimulators/HLL.py
+++ b/SWESimulators/HLL.py
@ -20,13 +20,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 #Import packages we need
-import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -37,7 +31,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the Harten-Lax -van Leer approximate Riemann solver
 """
-class HLL:
+class HLL (Simulator.BaseSimulator):

    """
    Initialization routine
@ -58,80 +52,40 @@ class HLL:
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);

        #Get kernels
-        self.hll_module = context.get_kernel("HLL_kernel.cu", block_width, block_height)
-        self.hll_kernel = self.hll_module.get_function("HLLKernel")
-        self.hll_kernel.prepare("iiffffPiPiPiPiPiPi")
-        
-        #Create data by uploading to device
-        ghost_cells_x = 1
-        ghost_cells_y = 1
-        self.data = Common.SWEDataArakawaA(self.stream, \
-                            nx, ny, \
-                            ghost_cells_x, ghost_cells_y, \
-                            h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
+        self.kernel = context.get_prepared_kernel("HLL_kernel.cu", "HLLKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
    
    def __str__(self):
        return "Harten-Lax-van Leer"
    
-    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / self.dt + 1)
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
        
-        for i in range(0, n):        
-            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
-            
-            if (local_dt <= 0.0):
-                break
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata,  self.data.h0.pitch,  \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata,  self.data.h1.pitch,  \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)        
+        self.data.swap()
+        self.t += dt
        
-            self.hll_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                
-            self.t += local_dt
-            
-            self.data.swap()
-        
-        return self.t, n
-    
-    
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)

--- a/SWESimulators/HLL2.py
+++ b/SWESimulators/HLL2.py
@ -21,12 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 #Import packages we need
 import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator


        
@ -39,7 +34,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the Forward-Backward linear scheme
 """
-class HLL2:
+class HLL2 (Simulator.BaseSimulator):

    """
    Initialization routine
@ -61,99 +56,62 @@ class HLL2:
                 g, \
                 theta=1.8, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)

        #Get kernels
-        self.hll2_module = context.get_kernel("HLL2_kernel.cu", block_width, block_height)
-        self.hll2_kernel = self.hll2_module.get_function("HLL2Kernel")
-        self.hll2_kernel.prepare("iifffffiPiPiPiPiPiPi")
+        self.kernel = context.get_prepared_kernel("HLL2_kernel.cu", "HLL2Kernel", \
+                                        "iifffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
        
-        #Create data by uploading to device
-        ghost_cells_x = 2
-        ghost_cells_y = 2
-        self.data = Common.SWEDataArakawaA(self.stream, \
-                            nx, ny, \
-                            ghost_cells_x, ghost_cells_y, \
-                            h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        self.theta = np.float32(theta)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1)
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      )
-    
-    
    def __str__(self):
        return "Harten-Lax-van Leer (2nd order)"
    
-    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / (2.0*self.dt) + 1)
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
        
-        for i in range(0, n): 
-            #Dimensional splitting: second order accurate for every other timestep,
-            #thus run two timesteps in a go
-            
-            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
-            if (local_dt <= 0.0):
-                break
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
                
-            #Along X, then Y
-            self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.theta, \
-                    np.int32(0), \
-                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.data.swap()
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.data.h0.data.gpudata, self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata, self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
            
-            #Along Y, then X
-            self.hll2_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.theta, \
-                    np.int32(1), \
-                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.data.swap()
-            
-            self.t += local_dt
-            
-        
-        return self.t, 2*n
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)
-
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.data.h0.data.gpudata, self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata, self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
+        
--- a/SWESimulators/HLL2_kernel.cu
+++ b/SWESimulators/HLL2_kernel.cu
@ -19,6 +19,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "limiters.cu"
+#include "fluxes/HartenLaxVanLeer.cu"



@ -30,9 +32,9 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -41,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        const int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@ -64,7 +66,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
            const float3 Q_l_bar = Q_lr + dt_/(2.0f*dx_) * (F_func(Q_ll, g_) - F_func(Q_lr, g_));

            // Compute flux based on prediction
-            const float3 flux = HLLC_flux(Q_l_bar, Q_r_bar, g_);
+            const float3 flux = HLL_flux(Q_l_bar, Q_r_bar, g_);
            
            //Write to shared memory
            F[0][j][i] = flux.x;
@ -82,15 +84,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        { 
            int i=tx;
@ -134,7 +136,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],



-
+extern "C" {
 __global__ void HLL2Kernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
@ -155,9 +157,9 @@ __global__ void HLL2Kernel(
        float* hv1_ptr_, int hv1_pitch_) {
            
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
@ -226,4 +228,6 @@ __global__ void HLL2Kernel(
                hu1_ptr_, hu1_pitch_,
                hv1_ptr_, hv1_pitch_,
                Q, nx_, ny_);
-}
+}
+
+} // extern "C"
--- a/SWESimulators/HLL_kernel.cu
+++ b/SWESimulators/HLL_kernel.cu
@ -1,5 +1,5 @@
 /*
-This OpenCL kernel implements the HLL flux
+This GPU kernel implements the HLL flux

 Copyright (C) 2016  SINTEF ICT

@ -20,6 +20,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "fluxes/HartenLaxVanLeer.cu"



@ -29,8 +30,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -39,7 +40,7 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
    {
        const int j=ty;
        const int l = j + 1; //Skip ghost cells     
-        for (int i=tx; i<block_width+1; i+=block_width) { 
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) { 
            const int k = i;
            
            const float3 Q_l  = make_float3(Q[0][l][k  ], Q[1][l][k  ], Q[2][l][k  ]);
@ -63,14 +64,14 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j;
        {
            const int i=tx;
@ -103,7 +104,8 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],



-
+extern "C" {
+    
 __global__ void HLLKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
@ -118,6 +120,10 @@ __global__ void HLLKernel(
        float* h1_ptr_, int h1_pitch_,
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
+            
+    const int block_width = BLOCK_WIDTH;
+    const int block_height = BLOCK_HEIGHT;
+    
    //Shared memory variables
    __shared__ float Q[3][block_height+2][block_width+2];
    __shared__ float F[3][block_height+1][block_width+1];
@ -159,4 +165,6 @@ __global__ void HLLKernel(
                hu1_ptr_, hu1_pitch_,
                hv1_ptr_, hv1_pitch_,
                Q, nx_, ny_);
-}
+}
+
+} // extern "C"
--- a/SWESimulators/IPythonMagic.py
+++ b/SWESimulators/IPythonMagic.py
@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements helpers for IPython / Jupyter and CUDA
+
+Copyright (C) 2018  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import logging
+
+from IPython.core.magic import line_magic, Magics, magics_class
+import pycuda.driver as cuda
+
+
+
+@magics_class
+class CudaContextHandler(Magics): 
+    @line_magic
+    def cuda_context_handler(self, context_name):
+        self.logger =  logging.getLogger(__name__)
+        
+        self.logger.debug("Registering %s as a global context", context_name)
+        
+        if context_name in self.shell.user_ns.keys():
+            self.logger.debug("Context already registered! Ignoring")
+            return
+        else:
+            self.logger.debug("Creating context")
+            self.shell.ex(context_name + " = Common.CudaContext(verbose=True, blocking=False)")
+        
+        # this function will be called on exceptions in any cell
+        def custom_exc(shell, etype, evalue, tb, tb_offset=None):
+            self.logger.exception("Exception caught: Resetting to CUDA context %s", context_name)
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                self.logger.info("Popping <%s>", str(context.handle))
+                cuda.Context.pop()
+
+            if context_name in self.shell.user_ns.keys():
+                self.logger.info("Pushing <%s>", str(self.shell.user_ns[context_name].cuda_context.handle))
+                self.shell.ex(context_name + ".cuda_context.push()")
+            else:
+                self.logger.error("No CUDA context called %s found (something is wrong)", context_name)
+                self.logger.error("CUDA will not work now")
+
+            # still show the error within the notebook, don't just swallow it
+            shell.showtraceback((etype, evalue, tb), tb_offset=tb_offset)
+
+        # this registers a custom exception handler for the whole current notebook
+        get_ipython().set_custom_exc((Exception,), custom_exc)
+        
+        
+        # Handle CUDA context when exiting python
+        import atexit
+        def exitfunc():
+            self.logger.info("Exitfunc: Resetting CUDA context stack")
+            while (cuda.Context.get_current() != None):
+                context = cuda.Context.get_current()
+                self.logger.info("`-> Popping <%s>", str(context.handle))
+                cuda.Context.pop()
+        atexit.register(exitfunc)
+
+logger = logging.getLogger(__name__)
+logger.info("Registering automatic CUDA context handling")
+logger.debug("(use %cuda_context_handler my_context to create a context called my_context")
+ip = get_ipython()
+ip.register_magics(CudaContextHandler)
--- a/SWESimulators/KP07.py
+++ b/SWESimulators/KP07.py
@ -26,12 +26,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 #Import packages we need
 import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -40,7 +35,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the Forward-Backward linear scheme
 """
-class KP07:
+class KP07 (Simulator.BaseSimulator):

    """
    Initialization routine
@ -53,131 +48,64 @@ class KP07:
    dy: Grid cell spacing along y-axis (20 000 m)
    dt: Size of each timestep (90 s)
    g: Gravitational accelleration (9.81 m/s^2)
-    f: Coriolis parameter (1.2e-4 s^1)
-    r: Bottom friction coefficient (2.4e-3 m/s)
-    wind_type: Type of wind stress, 0=Uniform along shore, 1=bell shaped along shore, 2=moving cyclone
-    wind_tau0: Amplitude of wind stress (Pa)
-    wind_rho: Density of sea water (1025.0 kg / m^3)
-    wind_alpha: Offshore e-folding length (1/(10*dx) = 5e-6 m^-1)
-    wind_xm: Maximum wind stress for bell shaped wind stress
-    wind_Rc: Distance to max wind stress from center of cyclone (10dx = 200 000 m)
-    wind_x0: Initial x position of moving cyclone (dx*(nx/2) - u0*3600.0*48.0)
-    wind_y0: Initial y position of moving cyclone (dy*(ny/2) - v0*3600.0*48.0)
-    wind_u0: Translation speed along x for moving cyclone (30.0/sqrt(5.0))
-    wind_v0: Translation speed along y for moving cyclone (-0.5*u0)
    """
    def __init__(self, \
                 context, \
                 h0, hu0, hv0, \
                 nx, ny, \
                 dx, dy, dt, \
-                 g, theta=1.3, \
-                 r=0.0, use_rk2=True,
+                 g, \
+                 theta=1.3, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)

        #Get kernels
-        self.kp07_module = context.get_kernel("KP07_kernel.cu", block_width, block_height)
-        self.kp07_kernel = self.kp07_module.get_function("KP07Kernel")
-        self.kp07_kernel.prepare("iiffffffiPiPiPiPiPiPi")
+        self.kernel = context.get_prepared_kernel("KP07_kernel.cu", "KP07Kernel", \
+                                        "iifffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
        
-        #Create data by uploading to device
-        ghost_cells_x = 2
-        ghost_cells_y = 2
-        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        self.theta = np.float32(theta)
-        self.r = np.float32(r)
-        self.use_rk2 = use_rk2
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
-    
-    
    def __str__(self):
        return "Kurganov-Petrova 2007"
    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / self.dt + 1)
+    def simulate(self, t_end):
+        return super().simulateRK(t_end, 2)
        
-        for i in range(0, n):        
-            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
-            
-            if (local_dt <= 0.0):
-                break
+    def substepRK(self, dt, substep):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(substep), \
+                self.data.h0.data.gpudata,  self.data.h0.pitch,  \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata,  self.data.h1.pitch,  \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
        
-            if (self.use_rk2):
-                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                        self.nx, self.ny, \
-                        self.dx, self.dy, local_dt, \
-                        self.g, \
-                        self.theta, \
-                        self.r, \
-                        np.int32(0), \
-                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                        
-                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                        self.nx, self.ny, \
-                        self.dx, self.dy, local_dt, \
-                        self.g, \
-                        self.theta, \
-                        self.r, \
-                        np.int32(1), \
-                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.data.hv1.data.gpudata, self.data.hv1.pitch, \
-                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.data.hv0.data.gpudata, self.data.hv0.pitch)
-            else:
-                self.kp07_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                        self.nx, self.ny, \
-                        self.dx, self.dy, local_dt, \
-                        self.g, \
-                        self.theta, \
-                        self.r, \
-                        np.int32(0), \
-                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                self.cl_data.swap()
-                
-            self.t += local_dt
-            
+    def stepEuler(self, dt):
+        self.substepRK(dt, 0)
+        self.t += dt
        
-        return self.t, n
-    
-    
-    
+    def stepRK(self, dt, order):
+        if (order != 2):
+            raise NotImplementedError("Only second order implemented")
+        self.substepRK(dt, 0)
+        self.substepRK(dt, 1)
+        self.t += dt
    
    def download(self):
        return self.data.download(self.stream)
--- a/SWESimulators/KP07_dimsplit.py
+++ b/SWESimulators/KP07_dimsplit.py
@ -26,12 +26,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 #Import packages we need
 import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -40,7 +35,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the dimentionally split KP07 scheme
 """
-class KP07_dimsplit:
+class KP07_dimsplit (Simulator.BaseSimulator):

    """
    Initialization routine
@ -62,98 +57,63 @@ class KP07_dimsplit:
                 g, \
                 theta=1.3, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);
+            
+        self.theta = np.float32(theta)

        #Get kernels
-        self.kp07_dimsplit_module = context.get_kernel("KP07_dimsplit_kernel.cu", block_width, block_height)
-        self.kp07_dimsplit_kernel = self.kp07_dimsplit_module.get_function("KP07DimsplitKernel")
-        self.kp07_dimsplit_kernel.prepare("iifffffiPiPiPiPiPiPi")
-        
-        #Create data by uploading to device
-        ghost_cells_x = 2
-        ghost_cells_y = 2
-        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        self.theta = np.float32(theta)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
+        self.kernel = context.get_prepared_kernel("KP07_dimsplit_kernel.cu", "KP07DimsplitKernel", \
+                                            "iifffffiPiPiPiPiPiPi", \
+                                            BLOCK_WIDTH=block_width, \
+                                            BLOCK_HEIGHT=block_height)
    
    def __str__(self):
        return "Kurganov-Petrova 2007 dimensionally split"
    
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / (2.0*self.dt) + 1)
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
+    
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(0), \
+                self.data.h0.data.gpudata,  self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata,  self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
+    
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.theta, \
+                np.int32(1), \
+                self.data.h0.data.gpudata,  self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata,  self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
        
-        for i in range(0, n): 
-            #Dimensional splitting: second order accurate for every other timestep,
-            #thus run two timesteps in a go
-            
-            #Compute timestep
-            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
-            if (local_dt <= 0.0):
-                break
-                
-            #Along X, then Y
-            self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.theta, \
-                    np.int32(0), \
-                    self.data.h0.data.gpudata,  self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata,  self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.data.swap()
-            
-            #Along Y, then X
-            self.kp07_dimsplit_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.theta, \
-                    np.int32(1), \
-                    self.data.h0.data.gpudata,  self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata,  self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            self.data.swap()
-            
-            self.t += 2.0*local_dt
-            
-        
-        return self.t, 2*n
-    
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)
-
+        
--- a/SWESimulators/KP07_dimsplit_kernel.cu
+++ b/SWESimulators/KP07_dimsplit_kernel.cu
@ -25,12 +25,14 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "limiters.cu"
+#include "fluxes/CentralUpwind.cu"


 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Reconstruct point values of Q at the left and right hand side 
            // of the cell for both the left (i) and right (i+1) cell 
@ -73,15 +75,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }

 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -125,6 +127,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],
 /**
  * This unsplit kernel computes the 2D numerical scheme with a TVD RK2 time integration scheme
  */
+extern "C" {
 __global__ void KP07DimsplitKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
@ -146,9 +149,9 @@ __global__ void KP07DimsplitKernel(
        
        
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
@ -217,4 +220,6 @@ __global__ void KP07DimsplitKernel(
                hu1_ptr_, hu1_pitch_,
                hv1_ptr_, hv1_pitch_,
                Q, nx_, ny_);
-}
+}
+
+} // extern "C"
--- a/SWESimulators/KP07_kernel.cu
+++ b/SWESimulators/KP07_kernel.cu
@ -25,12 +25,14 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "limiters.cu"
+#include "fluxes/CentralUpwind.cu"


 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -39,7 +41,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty;
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            // Q at interface from the right and left
            const float3 Qp = make_float3(Q[0][l][k+1] - 0.5f*Qx[0][j][i+1],
@ -59,15 +61,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
 }

 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -104,8 +106,6 @@ __global__ void KP07Kernel(
        
        float theta_,
        
-        float r_, //< Bottom friction coefficient
-        
        int step_,
        
        //Input h^n
@ -127,14 +127,14 @@ __global__ void KP07Kernel(
    const int tj = get_global_id(1) + 2;
    
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
    
    //The following slightly wastes memory, but enables us to reuse the 
    //funcitons in common.opencl
-    __shared__ float Qx[3][block_height+2][block_width+2];
-    __shared__ float Qy[3][block_height+2][block_width+2];
-    __shared__ float F[3][block_height+1][block_width+1];
-    __shared__ float G[3][block_height+1][block_width+1];
+    __shared__ float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
+    __shared__ float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
@ -178,15 +178,13 @@ __global__ void KP07Kernel(
        float* const h_row  = (float*) ((char*) h1_ptr_ + h1_pitch_*tj);
        float* const hu_row = (float*) ((char*) hu1_ptr_ + hu1_pitch_*tj);
        float* const hv_row = (float*) ((char*) hv1_ptr_ + hv1_pitch_*tj);
-        
-        const float C = 2.0f*r_*dt_/Q[0][j][i];
                    
        if  (step_ == 0) {
            //First step of RK2 ODE integrator
            
            h_row[ti] = h1;
-            hu_row[ti] = hu1 / (1.0f + C);
-            hv_row[ti] = hv1 / (1.0f + C);
+            hu_row[ti] = hu1;
+            hv_row[ti] = hv1;
        }
        else if (step_ == 1) {
            //Second step of RK2 ODE integrator
@ -203,8 +201,8 @@ __global__ void KP07Kernel(
            
            //Write to main memory
            h_row[ti] = h_b;
-            hu_row[ti] = hu_b / (1.0f + 0.5f*C);
-            hv_row[ti] = hv_b / (1.0f + 0.5f*C);
+            hu_row[ti] = hu_b;
+            hv_row[ti] = hv_b;
        }
    }
 }
--- a/SWESimulators/LxF.py
+++ b/SWESimulators/LxF.py
@ -21,13 +21,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """

 #Import packages we need
-import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -38,7 +32,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the Lax Friedrichs scheme
 """
-class LxF:
+class LxF (Simulator.BaseSimulator):

    """
    Initialization routine
@ -59,80 +53,40 @@ class LxF:
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            1, 1, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);

-        #Get kernels
-        self.lxf_module = context.get_kernel("LxF_kernel.cu", block_width, block_height)
-        self.lxf_kernel = self.lxf_module.get_function("LxFKernel")
-        self.lxf_kernel.prepare("iiffffPiPiPiPiPiPi")
+        # Get kernels
+        self.kernel = context.get_prepared_kernel("LxF_kernel.cu", "LxFKernel", \
+                                        "iiffffPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
        
-        #Create data by uploading to device
-        ghost_cells_x = 1
-        ghost_cells_y = 1
-        self.data = Common.SWEDataArakawaA(self.stream, \
-                            nx, ny, \
-                            ghost_cells_x, ghost_cells_y, \
-                            h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
-    
-    
    def __str__(self):
        return "Lax Friedrichs"
-    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / self.dt + 1)
        
-        for i in range(0, n):        
-            local_dt = np.float32(min(self.dt, t_end-i*self.dt))
-            
-            if (local_dt <= 0.0):
-                break
+    def simulate(self, t_end):
+        return super().simulateEuler(t_end)
        
-            self.lxf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    self.data.h0.data.gpudata, self.data.h0.pitch, \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata, self.data.h1.pitch, \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-                
-            self.t += local_dt
-            
-            self.data.swap()
-        
-        return self.t, n
-        
-        
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)
+    def stepEuler(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                self.nx, self.ny, \
+                self.dx, self.dy, dt, \
+                self.g, \
+                self.data.h0.data.gpudata, self.data.h0.pitch, \
+                self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                self.data.h1.data.gpudata, self.data.h1.pitch, \
+                self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
+  

--- a/SWESimulators/LxF_kernel.cu
+++ b/SWESimulators/LxF_kernel.cu
@ -20,11 +20,13 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "fluxes/LaxFriedrichs.cu"


 /**
  * Computes the flux along the x axis for all faces
  */
+template <int block_width, int block_height>
 __device__ 
 void computeFluxF(float Q[3][block_height+2][block_width+2],
                  float F[3][block_height][block_width+1],
@ -59,8 +61,9 @@ void computeFluxF(float Q[3][block_height+2][block_width+2],

 /**
  * Computes the flux along the y axis for all faces
-  */
-__device__ 
+  */ 
+template <int block_width, int block_height>
+__device__
 void computeFluxG(float Q[3][block_height+2][block_width+2],
                  float G[3][block_height+1][block_width],
                  const float g_, const float dy_, const float dt_) {
@ -94,7 +97,9 @@ void computeFluxG(float Q[3][block_height+2][block_width+2],
 }


-__global__ void LxFKernel(
+extern "C" {
+__global__ 
+void LxFKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
        float g_,
@ -109,6 +114,9 @@ __global__ void LxFKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {
            
+    const int block_width = BLOCK_WIDTH;
+    const int block_height = BLOCK_HEIGHT;
+            
    //Index of cell within domain
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
@ -130,8 +138,8 @@ __global__ void LxFKernel(
    
    
    //Compute fluxes along the x and y axis
-    computeFluxF(Q, F, g_, dx_, dt_);
-    computeFluxG(Q, G, g_, dy_, dt_);
+    computeFluxF<block_width, block_height>(Q, F, g_, dx_, dt_);
+    computeFluxG<block_width, block_height>(Q, G, g_, dy_, dt_);
    __syncthreads();
    
    
@ -159,4 +167,7 @@ __global__ void LxFKernel(
        hu_row[ti] = hu1;
        hv_row[ti] = hv1;
    }
-}
+}
+
+} // extern "C"
+
--- a/SWESimulators/PlotHelper.py
+++ b/SWESimulators/PlotHelper.py
@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-This python class aids in plotting results from the numerical 
-simulations
-
-Copyright (C) 2016  SINTEF ICT
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program.  If not, see <http://www.gnu.org/licenses/>.
-"""
-
-
-from matplotlib import pyplot as plt
-import matplotlib.gridspec as gridspec
-import numpy as np
-import time
-
-"""
-Class that makes plotting faster by caching the plots instead of recreating them
-"""
-class PlotHelper:
-
-    def __init__(self, fig, x_coords, y_coords, radius, eta1, u1, v1, eta2=None, u2=None, v2=None, interpolation_type='spline36'):
-        self.ny, self.nx = eta1.shape
-        self.fig = fig;
-        
-        fig.set_figheight(15)
-        fig.set_figwidth(15)
-        
-        min_x = np.min(x_coords[:,0]);
-        min_y = np.min(y_coords[0,:]);
-        
-        max_x = np.max(x_coords[0,:]);
-        max_y = np.max(y_coords[:,0]);
-        
-        domain_extent = [ x_coords[0, 0], x_coords[0, -1], y_coords[0, 0], y_coords[-1, 0] ]
-        
-        if (eta2 is not None):
-            assert(u2 is not None)
-            assert(v2 is not None)
-            self.gs = gridspec.GridSpec(3, 3)
-        else:
-            self.gs = gridspec.GridSpec(2, 3)
-        
-        ax = self.fig.add_subplot(self.gs[0, 0])
-        self.sp_eta = plt.imshow(eta1, interpolation=interpolation_type, origin='bottom', vmin=-0.05, vmax=0.05, extent=domain_extent)
-        plt.axis('tight')
-        ax.set_aspect('equal')
-        plt.title('Eta')
-        plt.colorbar()
-        
-        ax = self.fig.add_subplot(self.gs[0, 1])
-        self.sp_u = plt.imshow(u1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
-        plt.axis('tight')
-        ax.set_aspect('equal')
-        plt.title('U')
-        plt.colorbar()
-        
-        ax = self.fig.add_subplot(self.gs[0, 2])
-        self.sp_v = plt.imshow(v1, interpolation=interpolation_type, origin='bottom', vmin=-1.5, vmax=1.5, extent=domain_extent)
-        plt.axis('tight')
-        ax.set_aspect('equal')
-        plt.title('V')
-        plt.colorbar()
-            
-        ax = self.fig.add_subplot(self.gs[1, 0])
-        self.sp_radial1, = plt.plot(radius.ravel(), eta1.ravel(), '.')
-        plt.axis([0, min(max_x, max_y), -1.5, 1])
-        plt.title('Eta Radial plot')
-
-        ax = self.fig.add_subplot(self.gs[1, 1])
-        self.sp_x_axis1, = plt.plot(x_coords[self.ny/2,:], eta1[self.ny/2,:], 'k+--', label='x-axis')
-        self.sp_y_axis1, = plt.plot(y_coords[:,self.nx/2], eta1[:,self.nx/2], 'kx:', label='y-axis')
-        plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
-        plt.title('Eta along axis')
-        plt.legend()
-
-        ax = self.fig.add_subplot(self.gs[1, 2])
-        self.sp_x_diag1, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
-                                   np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2), \
-                                   'k+--', label='x = -y')
-        self.sp_y_diag1, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
-                                   np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2), \
-                                   'kx:', label='x = y')
-        plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
-        plt.title('Eta along diagonal')
-        plt.legend()
-        
-        
-        if (eta2 is not None):
-            ax = self.fig.add_subplot(self.gs[2, 0])
-            self.sp_radial2, = plt.plot(radius.ravel(), eta2.ravel(), '.')
-            plt.axis([0, min(max_x, max_y), -1.5, 1])
-            plt.title('Eta2 Radial plot')
-
-            ax = self.fig.add_subplot(self.gs[2, 1])
-            self.sp_x_axis2, = plt.plot(x_coords[self.ny/2,:], eta2[self.ny/2,:], 'k+--', label='x-axis')
-            self.sp_y_axis2, = plt.plot(y_coords[:,self.nx/2], eta2[:,self.nx/2], 'kx:', label='y-axis')
-            plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
-            plt.title('Eta2 along axis')
-            plt.legend()
-
-            ax = self.fig.add_subplot(self.gs[2, 2])
-            self.sp_x_diag2, = plt.plot(1.41*np.diagonal(x_coords, offset=-abs(self.nx-self.ny)/2), \
-                                       np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2), \
-                                       'k+--', label='x = -y')
-            self.sp_y_diag2, = plt.plot(1.41*np.diagonal(y_coords.T, offset=abs(self.nx-self.ny)/2), \
-                                       np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2), \
-                                       'kx:', label='x = y')
-            plt.axis([max(min_x, min_y), min(max_x, max_y), -1.5, 1])
-            plt.title('Eta2 along diagonal')
-            plt.legend()
-        
-        
-        
-        
-        
-    def plot(self, eta1, u1, v1, eta2=None, u2=None, v2=None):
-        self.fig.add_subplot(self.gs[0, 0])
-        self.sp_eta.set_data(eta1)
-
-        self.fig.add_subplot(self.gs[0, 1])
-        self.sp_u.set_data(u1)
-
-        self.fig.add_subplot(self.gs[0, 2])
-        self.sp_v.set_data(v1)
-            
-        self.fig.add_subplot(self.gs[1, 0])
-        self.sp_radial1.set_ydata(eta1.ravel());
-
-        self.fig.add_subplot(self.gs[1, 1])
-        self.sp_x_axis1.set_ydata(eta1[(self.ny+2)/2,:])
-        self.sp_y_axis1.set_ydata(eta1[:,(self.nx+2)/2])
-
-        self.fig.add_subplot(self.gs[1, 2])
-        self.sp_x_diag1.set_ydata(np.diagonal(eta1, offset=-abs(self.nx-self.ny)/2))
-        self.sp_y_diag1.set_ydata(np.diagonal(eta1.T, offset=abs(self.nx-self.ny)/2))
-        
-        if (eta2 is not None):
-            self.fig.add_subplot(self.gs[2, 0])
-            self.sp_radial2.set_ydata(eta2.ravel());
-
-            self.fig.add_subplot(self.gs[2, 1])
-            self.sp_x_axis2.set_ydata(eta2[(self.ny+2)/2,:])
-            self.sp_y_axis2.set_ydata(eta2[:,(self.nx+2)/2])
-
-            self.fig.add_subplot(self.gs[2, 2])
-            self.sp_x_diag2.set_ydata(np.diagonal(eta2, offset=-abs(self.nx-self.ny)/2))
-            self.sp_y_diag2.set_ydata(np.diagonal(eta2.T, offset=abs(self.nx-self.ny)/2))
-        
-        plt.draw()
-        time.sleep(0.001)
-        
-        
--- a/SWESimulators/Simulator.py
+++ b/SWESimulators/Simulator.py
@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+
+"""
+This python module implements the classical Lax-Friedrichs numerical
+scheme for the shallow water equations
+
+Copyright (C) 2016  SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+#Import packages we need
+import numpy as np
+import logging
+
+import pycuda.compiler as cuda_compiler
+import pycuda.gpuarray
+import pycuda.driver as cuda
+
+from SWESimulators import Common
+
+
+class BaseSimulator:
+    """
+    Initialization routine
+    context: GPU context to use
+    kernel_wrapper: wrapper function of GPU kernel
+    h0: Water depth incl ghost cells, (nx+1)*(ny+1) cells
+    hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+1) cells
+    hv0: Initial momentum along y-axis incl ghost cells, (nx+1)*(ny+1) cells
+    nx: Number of cells along x-axis
+    ny: Number of cells along y-axis
+    dx: Grid cell spacing along x-axis (20 000 m)
+    dy: Grid cell spacing along y-axis (20 000 m)
+    dt: Size of each timestep (90 s)
+    g: Gravitational accelleration (9.81 m/s^2)
+    """
+    def __init__(self, \
+                 context, \
+                 h0, hu0, hv0, \
+                 nx, ny, \
+                 ghost_cells_x, ghost_cells_y, \
+                 dx, dy, dt, \
+                 g, \
+                 block_width, block_height):
+        #Get logger
+        self.logger = logging.getLogger(__name__ + "." + self.__class__.__name__)
+        
+        #Create a CUDA stream
+        self.stream = cuda.Stream()
+        
+        #Create data by uploading to device
+        self.data = Common.SWEDataArakawaA(self.stream, \
+                            nx, ny, \
+                            ghost_cells_x, ghost_cells_y, \
+                            h0, hu0, hv0)
+                           
+        #Save input parameters
+        #Notice that we need to specify them in the correct dataformat for the
+        #GPU kernel
+        self.nx = np.int32(nx)
+        self.ny = np.int32(ny)
+        self.dx = np.float32(dx)
+        self.dy = np.float32(dy)
+        self.dt = np.float32(dt)
+        self.g = np.float32(g) 
+        
+        #Keep track of simulation time
+        self.t = 0.0;
+                            
+        #Compute kernel launch parameters
+        self.local_size = (block_width, block_height, 1) 
+        self.global_size = ( \
+                       int(np.ceil(self.nx / float(self.local_size[0]))), \
+                       int(np.ceil(self.ny / float(self.local_size[1]))) \
+                      ) 
+
+    """
+    Function which simulates forward in time using the default simulation type
+    """
+    def simulate(self, t_end):
+        raise(exceptions.NotImplementedError("Needs to be implemented in subclass"))
+                      
+    """ 
+    Function which simulates t_end seconds using forward Euler
+    Requires that the stepEuler functionality is implemented in the subclasses
+    """
+    def simulateEuler(self, t_end):
+        with Common.Timer(self.__class__.__name__ + ".simulateEuler") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / self.dt + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+            
+                # Step with forward Euler 
+                self.stepEuler(local_dt)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
+        return self.t, n
+        
+    """
+    Function which simulates t_end seconds using Runge-Kutta 2
+    Requires that the stepRK functionality is implemented in the subclasses
+    """
+    def simulateRK(self, t_end, order):
+        with Common.Timer(self.__class__.__name__ + ".simulateRK") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / self.dt + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(min(self.dt, t_end-i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+            
+                # Perform all the Runge-Kutta substeps
+                self.stepRK(local_dt, order)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, n, t.secs)
+        return self.t, n
+        
+    """
+    Function which simulates t_end seconds using second order dimensional splitting (XYYX)
+    Requires that the stepDimsplitX and stepDimsplitY functionality is implemented in the subclasses
+    """
+    def simulateDimsplit(self, t_end):
+        with Common.Timer(self.__class__.__name__ + ".simulateDimsplit") as t:
+            # Compute number of timesteps to perform
+            n = int(t_end / (2.0*self.dt) + 1)
+            
+            for i in range(0, n):
+                # Compute timestep for "this" iteration
+                local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
+                
+                # Stop if end reached (should not happen)
+                if (local_dt <= 0.0):
+                    break
+                
+                # Perform the dimensional split substeps
+                self.stepDimsplitXY(local_dt)
+                self.stepDimsplitYX(local_dt)
+            
+        self.logger.info("%s simulated %f seconds to %f with %d steps in %f seconds", self.__class__.__name__, t_end, self.t, 2*n, t.secs)
+        return self.t, 2*n
+        
+    
+    """
+    Function which performs one single timestep of size dt using forward euler
+    """
+    def stepEuler(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def stepRK(self, dt, substep):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+    
+    def stepDimsplitXY(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def stepDimsplitYX(self, dt):
+        raise(NotImplementedError("Needs to be implemented in subclass"))
+        
+    def sim_time(self):
+        return self.t
+
+    def download(self):
+        return self.data.download(self.stream)
+
--- a/SWESimulators/WAF.py
+++ b/SWESimulators/WAF.py
@ -22,12 +22,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.

 #Import packages we need
 import numpy as np
-
-import pycuda.compiler as cuda_compiler
-import pycuda.gpuarray
-import pycuda.driver as cuda
-
-from SWESimulators import Common
+from SWESimulators import Simulator



@ -36,7 +31,7 @@ from SWESimulators import Common
 """
 Class that solves the SW equations using the Forward-Backward linear scheme
 """
-class WAF:
+class WAF (Simulator.BaseSimulator):

    """
    Initialization routine
@ -57,92 +52,57 @@ class WAF:
                 dx, dy, dt, \
                 g, \
                 block_width=16, block_height=16):
-        #Create a CUDA stream
-        self.stream = cuda.Stream()
+                 
+        # Call super constructor
+        super().__init__(context, \
+            h0, hu0, hv0, \
+            nx, ny, \
+            2, 2, \
+            dx, dy, dt, \
+            g, \
+            block_width, block_height);

        #Get kernels
-        self.waf_module = context.get_kernel("WAF_kernel.cu", block_width, block_height)
-        self.waf_kernel = self.waf_module.get_function("WAFKernel")
-        self.waf_kernel.prepare("iiffffiPiPiPiPiPiPi")
-        
-        #Create data by uploading to device
-        ghost_cells_x = 2
-        ghost_cells_y = 2
-        self.data = Common.SWEDataArakawaA(self.stream, nx, ny, ghost_cells_x, ghost_cells_y, h0, hu0, hv0)
-        
-        #Save input parameters
-        #Notice that we need to specify them in the correct dataformat for the
-        #OpenCL kernel
-        self.nx = np.int32(nx)
-        self.ny = np.int32(ny)
-        self.dx = np.float32(dx)
-        self.dy = np.float32(dy)
-        self.dt = np.float32(dt)
-        self.g = np.float32(g)
-        
-        #Initialize time
-        self.t = np.float32(0.0)
-        
-        #Compute kernel launch parameters
-        self.local_size = (block_width, block_height, 1) 
-        self.global_size = ( \
-                       int(np.ceil(self.nx / float(self.local_size[0]))), \
-                       int(np.ceil(self.ny / float(self.local_size[1]))) \
-                      ) 
-    
-    
+        self.kernel = context.get_prepared_kernel("WAF_kernel.cu", "WAFKernel", \
+                                        "iiffffiPiPiPiPiPiPi", \
+                                        BLOCK_WIDTH=block_width, \
+                                        BLOCK_HEIGHT=block_height)
    
    def __str__(self):
        return "Weighted average flux"
    
-    """
-    Function which steps n timesteps
-    """
-    def step(self, t_end=0.0):
-        n = int(t_end / (2.0*self.dt) + 1)
+    def simulate(self, t_end):
+        return super().simulateDimsplit(t_end)
        
-        for i in range(0, n):
-            #Dimensional splitting: second order accurate for every other timestep,
-            #thus run two timesteps in a go
-            
-            local_dt = np.float32(0.5*min(2*self.dt, t_end-2*i*self.dt))
-            if (local_dt <= 0.0):
-                break
-                
-            #Along X, then Y
-            self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    np.int32(0), \
-                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch, \
-                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch)
-            
-            #Along Y, then X
-            self.waf_kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
-                    self.nx, self.ny, \
-                    self.dx, self.dy, local_dt, \
-                    self.g, \
-                    np.int32(1), \
-                    self.data.h1.data.gpudata,  self.data.h1.pitch,  \
-                    self.data.hu1.data.gpudata, self.data.hu1.pitch, \
-                    self.data.hv1.data.gpudata, self.data.hv1.pitch, \
-                    self.data.h0.data.gpudata,  self.data.h0.pitch,  \
-                    self.data.hu0.data.gpudata, self.data.hu0.pitch, \
-                    self.data.hv0.data.gpudata, self.data.hv0.pitch)
-                
-            self.t += local_dt
-            
+    def stepEuler(self, dt):
+        return self.stepDimsplitXY(dt)
        
-        return self.t, 2*n
-    
-    
-    
-    
-    def download(self):
-        return self.data.download(self.stream)
-
+    def stepDimsplitXY(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, dt, \
+                        self.g, \
+                        np.int32(0), \
+                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
+                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
+                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
+        
+    def stepDimsplitYX(self, dt):
+        self.kernel.prepared_async_call(self.global_size, self.local_size, self.stream, \
+                        self.nx, self.ny, \
+                        self.dx, self.dy, dt, \
+                        self.g, \
+                        np.int32(1), \
+                        self.data.h0.data.gpudata,  self.data.h0.pitch,  \
+                        self.data.hu0.data.gpudata, self.data.hu0.pitch, \
+                        self.data.hv0.data.gpudata, self.data.hv0.pitch, \
+                        self.data.h1.data.gpudata,  self.data.h1.pitch,  \
+                        self.data.hu1.data.gpudata, self.data.hu1.pitch, \
+                        self.data.hv1.data.gpudata, self.data.hv1.pitch)
+        self.data.swap()
+        self.t += dt
--- a/SWESimulators/WAF_kernel.cu
+++ b/SWESimulators/WAF_kernel.cu
@ -25,6 +25,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.


 #include "common.cu"
+#include "fluxes/WeightedAverageFlux.cu"



@ -32,8 +33,8 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * Computes the flux along the x axis for all faces
  */
 __device__
-void computeFluxF(float Q[3][block_height+4][block_width+4],
-                  float F[3][block_height+1][block_width+1],
+void computeFluxF(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dx_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -42,7 +43,7 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
    {
        int j=ty; 
        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+1; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+1; i+=BLOCK_WIDTH) {
            const int k = i + 1;
            
            // Q at interface from the right and left
@ -71,15 +72,15 @@ void computeFluxF(float Q[3][block_height+4][block_width+4],
  * Computes the flux along the y axis for all faces
  */
 __device__
-void computeFluxG(float Q[3][block_height+4][block_width+4],
-                  float G[3][block_height+1][block_width+1],
+void computeFluxG(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
                  const float g_, const float dy_, const float dt_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Compute fluxes along the y axis
-    for (int j=ty; j<block_height+1; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+1; j+=BLOCK_HEIGHT) {
        const int l = j + 1;
        {
            int i=tx;
@ -114,6 +115,7 @@ void computeFluxG(float Q[3][block_height+4][block_width+4],



+extern "C" {
 __global__ void WAFKernel(
        int nx_, int ny_,
        float dx_, float dy_, float dt_,
@ -129,8 +131,8 @@ __global__ void WAFKernel(
        float* hu1_ptr_, int hu1_pitch_,
        float* hv1_ptr_, int hv1_pitch_) {    
    //Shared memory variables
-    __shared__ float Q[3][block_height+4][block_width+4];
-    __shared__ float F[3][block_height+1][block_width+1];
+    __shared__ float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4];
+    __shared__ float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1];
    
    
    
@ -192,4 +194,6 @@ __global__ void WAFKernel(
                hu1_ptr_, hu1_pitch_,
                hv1_ptr_, hv1_pitch_,
                Q, nx_, ny_);
-}
+}
+
+} // extern "C"
--- a/SWESimulators/common.cu
+++ b/SWESimulators/common.cu
@ -94,18 +94,18 @@ inline __device__ __host__ float clamp(const float f, const float a, const float
 __device__ void readBlock1(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+2][block_width+2], 
+                float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    
    //Read into shared memory
-    for (int j=ty; j<block_height+2; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+1); // Out of bounds
        
        //Compute the pointer to current row in the arrays
@ -113,7 +113,7 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
        
-        for (int i=tx; i<block_width+2; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+1); // Out of bounds
            
            Q[0][j][i] = h_row[k];
@ -133,18 +133,18 @@ __device__ void readBlock1(float* h_ptr_, int h_pitch_,
 __device__ void readBlock2(float* h_ptr_, int h_pitch_,
                float* hu_ptr_, int hu_pitch_,
                float* hv_ptr_, int hv_pitch_,
-                float Q[3][block_height+4][block_width+4], 
+                float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
    const int ty = get_local_id(1);
    
    //Index of block within domain
-    const int bx = block_width * get_group_id(0);
-    const int by = block_height * get_group_id(1);
+    const int bx = BLOCK_WIDTH * get_group_id(0);
+    const int by = BLOCK_HEIGHT * get_group_id(1);
    
    //Read into shared memory
-    for (int j=ty; j<block_height+4; j+=block_height) {
+    for (int j=ty; j<BLOCK_HEIGHT+4; j+=BLOCK_HEIGHT) {
        const int l = clamp(by + j, 0, ny_+3); // Out of bounds
        
        //Compute the pointer to current row in the arrays
@ -152,7 +152,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
        float* const hu_row = (float*) ((char*) hu_ptr_ + hu_pitch_*l);
        float* const hv_row = (float*) ((char*) hv_ptr_ + hv_pitch_*l);
        
-        for (int i=tx; i<block_width+4; i+=block_width) {
+        for (int i=tx; i<BLOCK_WIDTH+4; i+=BLOCK_WIDTH) {
            const int k = clamp(bx + i, 0, nx_+3); // Out of bounds
            
            Q[0][j][i] = h_row[k];
@ -171,7 +171,7 @@ __device__ void readBlock2(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+2][block_width+2],
+                 float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -206,7 +206,7 @@ __device__ void writeBlock1(float* h_ptr_, int h_pitch_,
 __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
                 float* hu_ptr_, int hu_pitch_,
                 float* hv_ptr_, int hv_pitch_,
-                 float Q[3][block_height+4][block_width+4], 
+                 float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], 
                 const int nx_, const int ny_) {
    //Index of thread within block
    const int tx = get_local_id(0);
@ -240,7 +240,7 @@ __device__ void writeBlock2(float* h_ptr_, int h_pitch_,
  * No flow boundary conditions for the shallow water equations
  * with one ghost cell in each direction
  */
-__device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const int nx_, const int ny_) {
+__device__ void noFlowBoundary1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 1; //Skip global ghost cells, i.e., +1
    const int tj = get_global_id(1) + 1;
@ -284,7 +284,7 @@ __device__ void noFlowBoundary1(float Q[3][block_height+2][block_width+2], const
  * No flow boundary conditions for the shallow water equations
  * with two ghost cells in each direction
  */
-__device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const int nx_, const int ny_) {
+__device__ void noFlowBoundary2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4], const int nx_, const int ny_) {
    //Global index
    const int ti = get_global_id(0) + 2; //Skip global ghost cells, i.e., +2
    const int tj = get_global_id(1) + 2;
@ -342,8 +342,8 @@ __device__ void noFlowBoundary2(float Q[3][block_height+4][block_width+4], const
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF1(float Q[3][block_height+2][block_width+2],
-              float F[3][block_height+1][block_width+1],
+__device__ void evolveF1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -372,8 +372,8 @@ __device__ void evolveF1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the x axis (dimensional splitting)
  */
-__device__ void evolveF2(float Q[3][block_height+4][block_width+4],
-              float F[3][block_height+1][block_width+1],
+__device__ void evolveF2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float F[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dx_, const float dt_) {
    //Index of thread within block
@ -402,8 +402,8 @@ __device__ void evolveF2(float Q[3][block_height+4][block_width+4],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG1(float Q[3][block_height+2][block_width+2],
-              float G[3][block_height+1][block_width+1],
+__device__ void evolveG1(float Q[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@ -433,8 +433,8 @@ __device__ void evolveG1(float Q[3][block_height+2][block_width+2],
 /**
  * Evolves the solution in time along the y axis (dimensional splitting)
  */
-__device__ void evolveG2(float Q[3][block_height+4][block_width+4],
-              float G[3][block_height+1][block_width+1],
+__device__ void evolveG2(float Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+              float G[3][BLOCK_HEIGHT+1][BLOCK_WIDTH+1],
              const int nx_, const int ny_,
              const float dy_, const float dt_) {
    //Index of thread within block
@ -464,79 +464,6 @@ __device__ void evolveG2(float Q[3][block_height+4][block_width+4],



-/**
-  * Reconstructs a slope using the minmod limiter based on three 
-  * consecutive values
-  */
-__device__ float minmodSlope(float left, float center, float right, float theta) {
-    const float backward = (center - left) * theta;
-    const float central = (right - left) * 0.5f;
-    const float forward = (right - center) * theta;
-    
-	return 0.25f
-		*copysign(1.0f, backward)
-		*(copysign(1.0f, backward) + copysign(1.0f, central))
-		*(copysign(1.0f, central) + copysign(1.0f, forward))
-		*min( min(fabs(backward), fabs(central)), fabs(forward) );
-}
-
-
-
-
-/**
-  * Reconstructs a minmod slope for a whole block along x
-  */
-__device__ void minmodSlopeX(float  Q[3][block_height+4][block_width+4],
-                  float Qx[3][block_height+2][block_width+2],
-                  const float theta_) {
-    //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
-    
-    //Reconstruct slopes along x axis
-    {
-        const int j = ty;
-        const int l = j + 2; //Skip ghost cells
-        for (int i=tx; i<block_width+2; i+=block_width) {
-            const int k = i + 1;
-            for (int p=0; p<3; ++p) {
-                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
-            }
-        }
-    }
-}
-
-
-/**
-  * Reconstructs a minmod slope for a whole block along y
-  */
-__device__ void minmodSlopeY(float  Q[3][block_height+4][block_width+4],
-                  float Qy[3][block_height+2][block_width+2],
-                  const float theta_) {
-    //Index of thread within block
-    const int tx = get_local_id(0);
-    const int ty = get_local_id(1);
-    
-    for (int j=ty; j<block_height+2; j+=block_height) {
-        const int l = j + 1;
-        {
-            const int i = tx;
-            const int k = i + 2; //Skip ghost cells
-            for (int p=0; p<3; ++p) {
-                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
-            }
-        }
-    }
-}
-
-
-
-
-
-
-
-
-
 __device__ float3 F_func(const float3 Q, const float g) {
    float3 F;

@ -551,23 +478,7 @@ __device__ float3 F_func(const float3 Q, const float g) {



-/**
-  * Central upwind flux function
-  */
-__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
-    const float3 Fp = F_func(Qp, g);
-    const float up = Qp.y / Qp.x;   // hu / h
-    const float cp = sqrt(g*Qp.x); // sqrt(g*h)

-    const float3 Fm = F_func(Qm, g);
-    const float um = Qm.y / Qm.x;   // hu / h
-    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
-    
-    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
-    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
-    
-    return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
-}



@ -578,49 +489,6 @@ __device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {



-/**
-  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
-  */
-__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
-    const float h_l = Q_l.x;
-    const float h_r = Q_r.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l.y / h_l;
-    const float u_r = Q_r.y / h_r;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l;
-    const float S_r = u_r + c_r*q_r;
-    
-    //Upwind selection
-    if (S_l >= 0.0f) {
-        return F_func(Q_l, g_);
-    }
-    else if (S_r <= 0.0f) {
-        return F_func(Q_r, g_);
-    }
-    //Or estimate flux in the star region
-    else {
-        const float3 F_l = F_func(Q_l, g_);
-        const float3 F_r = F_func(Q_r, g_);
-        const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
-        return flux;
-    }
-}



@ -630,347 +498,3 @@ __device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {



-/**
-  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
-  */
-__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
-    const float h_l = Q_l.x;
-    const float h_r = Q_r.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l.y / h_l;
-    const float u_r = Q_r.y / h_r;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l;
-    const float S_r = u_r + c_r*q_r;
-    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
-    
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    //Upwind selection
-    if (S_l >= 0.0f) {
-        return F_l;
-    }
-    else if (S_r <= 0.0f) {
-        return F_r;
-    }
-    //Or estimate flux in the "left star" region
-    else if (S_l <= 0.0f && 0.0f <=S_star) {
-        const float v_l = Q_l.z / h_l;
-        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
-        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
-        return flux;
-    }
-    //Or estimate flux in the "righ star" region
-    else if (S_star <= 0.0f && 0.0f <=S_r) {
-        const float v_r = Q_r.z / h_r;
-        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
-        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
-        return flux;
-    }
-    else {
-        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
-    }
-}
-
-
-
-/**
-  * Superbee flux limiter for WAF.
-  * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
-  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
-  * @param c_ the courant number for wave k, dt*S_k/dx
-  */
-__device__ float WAF_superbee(float r_, float c_) {
-    // r <= 0.0
-    if (r_ <= 0.0f) { 
-        return 1.0f;
-    }
-    // 0.0 <= r <= 1/2
-    else if (r_ <= 0.5f) { 
-        return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
-    }
-    // 1/2 <= r <= 1
-    else if (r_ <= 1.0f) {
-        return fabs(c_);
-    }
-    // 1 <= r <= 2
-    else  if (r_ <= 2.0f) {
-        return 1.0f - (1.0f - fabsf(c_))*r_;
-    }
-    // r >= 2
-    else {
-        return 2.0f*fabsf(c_) - 1.0f;
-    }
-}
-
-
-
-
-__device__ float WAF_albada(float r_, float c_) {
-    if (r_ <= 0.0f) {
-        return 1.0f;
-    }
-    else {
-        return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
-    }
-}
-
-__device__ float WAF_minbee(float r_, float c_) {
-    r_ = fmaxf(-1.0f, fminf(2.0f, r_));
-    if (r_ <= 0.0f) {
-        return 1.0f;
-    }
-    if (r_ >= 0.0f && r_ <= 1.0f) {
-        return 1.0f - (1.0f - fabsf(c_)) * r_;
-    }
-    else {
-        return fabsf(c_);
-    }
-}
-
-__device__ float WAF_minmod(float r_, float c_) {
-    return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
-}
-
-
-__device__ float minmod(float r_) {
-    return fmaxf(0.0f, fminf(1.0f, r_));
-}
-
-__device__ float generalized_minmod(float r_, float theta_) {
-    return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
-}
-
-__device__ float superbee(float r_) {
-    return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
-}
-
-__device__ float vanAlbada1(float r_) {
-    return (r_*r_ + r_) / (r_*r_ + 1.0f);
-}
-
-__device__ float vanLeer(float r_) {
-    return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
-}
-
-__device__ float limiterToWAFLimiter(float r_, float c_) {
-    return 1.0f - (1.0f - fabsf(c_))*r_;
-}
-
-__device__ float desingularize(float x_, float eps_) {
-    return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
-}
-
-// Compute h in the "star region", h^dagger
-__device__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
-    //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
-    
-    const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
-    return h_tmp*h_tmp / g_;
-}
-
-/**
-  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
-  * @param r_ The flux limiter parameter (see Toro 2001, p. 203)
-  * @param Q_l2 Q_{i-1}
-  * @param Q_l1 Q_{i}
-  * @param Q_r1 Q_{i+1}
-  * @param Q_r2 Q_{i+2}
-  */
-__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
-    const float h_l = Q_l1.x;
-    const float h_r = Q_r1.x;
-    
-    const float h_l2 = Q_l2.x;
-    const float h_r2 = Q_r2.x;
-    
-    // Calculate velocities
-    const float u_l = Q_l1.y / h_l;
-    const float u_r = Q_r1.y / h_r;
-    
-    const float u_l2 = Q_l2.y / h_l2;
-    const float u_r2 = Q_r2.y / h_r2;
-    
-    const float v_l = Q_l1.z / h_l;
-    const float v_r = Q_r1.z / h_r;
-    
-    const float v_l2 = Q_l2.z / h_l2;
-    const float v_r2 = Q_r2.z / h_r2;
-    
-    // Estimate the potential wave speeds
-    const float c_l = sqrt(g_*h_l);
-    const float c_r = sqrt(g_*h_r);
-    
-    const float c_l2 = sqrt(g_*h_l2);
-    const float c_r2 = sqrt(g_*h_r2);
-    
-    // Compute h in the "star region", h^dagger
-    const float h_dag_l = computeHStar(h_l2,  h_l, u_l2,  u_l, c_l2,  c_l, g_);
-    const float h_dag   = computeHStar( h_l,  h_r,  u_l,  u_r,  c_l,  c_r, g_);
-    const float h_dag_r = computeHStar( h_r, h_r2,  u_r, u_r2,  c_r, c_r2, g_);
-    
-    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
-    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
-    
-    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
-    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
-    
-    // Compute wave speed estimates
-    const float S_l = u_l - c_l*q_l; 
-    const float S_r = u_r + c_r*q_r;
-    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
-    
-    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
-    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
-    
-    // Estimate the fluxes in the four regions
-    const float3 F_1 = F_func(Q_l1, g_);
-    const float3 F_4 = F_func(Q_r1, g_);
-    
-    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
-    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
-    //const float3 F_2 = F_func(Q_star_l, g_);
-    //const float3 F_3 = F_func(Q_star_r, g_);
-    
-    // Compute the courant numbers for the waves
-    const float c_1 = S_l * dt_ / dx_;
-    const float c_2 = S_star * dt_ / dx_;
-    const float c_3 = S_r * dt_ / dx_;
-    
-    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
-    const float eps = 1.0e-6f;
-    const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
-    const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
-    const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
-    
-    /*
-    const float eps1 = 1.0e-6f;
-    const float eps2 = eps1;//*1.0e-1f;
-    const float rh_m = desingularize((h_l - h_l2), eps1) / desingularize((h_r - h_l), eps2);
-    const float rh_p = desingularize((h_r2 - h_r), eps1) / desingularize((h_r - h_l), eps2);
-    const float rv_m = desingularize((v_l - v_l2), eps1) / desingularize((v_r - v_l), eps2);
-    const float rv_p = desingularize((v_r2 - v_r), eps1) / desingularize((v_r - v_l), eps2);
-    
-    // Compute the r parameters for the flux limiter
-    const float rh_1 = (c_1 > 0.0f) ? rh_m : rh_p; 
-    const float rv_1 = (c_1 > 0.0f) ? rv_m : rv_p; 
-    
-    const float rh_2 = (c_2 > 0.0f) ? rh_m : rh_p; 
-    const float rv_2 = (c_2 > 0.0f) ? rv_m : rv_p;
-    
-    const float rh_3 = (c_3 > 0.0f) ? rh_m : rh_p;
-    const float rv_3 = (c_3 > 0.0f) ? rv_m : rv_p;
-    
-    const float r_1 = rh_1;
-    const float r_2 = rv_2;
-    const float r_3 = rh_3;
-    */
-    
-    // Compute the limiter
-    // We use h for the nonlinear waves, and v for the middle shear wave 
-    const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
-    const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); //Middle shear wave 
-    const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
-    
-    //Average the fluxes
-    const float3 flux = 0.5f*( F_1 + F_4 )
-                      - 0.5f*( A_1 * (F_2 - F_1)
-                             + A_2 * (F_3 - F_2)
-                             + A_3 * (F_4 - F_3) );
-
-    return flux;
-}
-
-
-
-
-
-
-
-/**
-  * Lax-Friedrichs flux (Toro 2001, p 163)
-  */
-__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
-}
-
-
-
-/**
-  * Lax-Friedrichs extended to 2D
-  */
-__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    //Note numerical diffusion for 2D here (0.25)
-    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
-}
-
-
-
-
-/**
-  * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
-  */
-__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
-    
-    return F_func(Q_lw2, g_);
-}
-
-
-
-
-
-
-/**
-  * Godunovs centered scheme (Toro 2001, p 165)
-  */
-__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_l = F_func(Q_l, g_);
-    const float3 F_r = F_func(Q_r, g_);
-    
-    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
-    
-    return F_func(Q_godc, g_);
-}
-    
-
-    
-    
-/**
-  * First Ordered Centered (Toro 2001, p.163)
-  */
-__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
-    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
-    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
-    return 0.5f*(F_lf + F_lw2);
-}
-
-
-
-
-
--- a/SWESimulators/fluxes/CentralUpwind.cu
+++ b/SWESimulators/fluxes/CentralUpwind.cu
@ -0,0 +1,39 @@
+/*
+This file implements the Central upwind flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/**
+  * Central upwind flux function
+  */
+__device__ float3 CentralUpwindFlux(const float3 Qm, float3 Qp, const float g) {
+    const float3 Fp = F_func(Qp, g);
+    const float up = Qp.y / Qp.x;   // hu / h
+    const float cp = sqrt(g*Qp.x); // sqrt(g*h)
+
+    const float3 Fm = F_func(Qm, g);
+    const float um = Qm.y / Qm.x;   // hu / h
+    const float cm = sqrt(g*Qm.x); // sqrt(g*h)
+    
+    const float am = min(min(um-cm, up-cp), 0.0f); // largest negative wave speed
+    const float ap = max(max(um+cm, up+cp), 0.0f); // largest positive wave speed
+    
+    return ((ap*Fm - am*Fp) + ap*am*(Qp-Qm))/(ap-am);
+}
+
--- a/SWESimulators/fluxes/FirstOrderCentered.cu
+++ b/SWESimulators/fluxes/FirstOrderCentered.cu
@ -0,0 +1,33 @@
+/*
+This file implements the First ORder CEntered flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "./LaxFriedrichs.cu"
+#include "./LaxWendroff.cu"
+
+
+    
+    
+/**
+  * First Ordered Centered (Toro 2001, p.163)
+  */
+__device__ float3 FORCE_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_lf = LxF_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    const float3 F_lw2 = LxW2_1D_flux(Q_l, Q_r, g_, dx_, dt_);
+    return 0.5f*(F_lf + F_lw2);
+}
--- a/SWESimulators/fluxes/Godunov.cu
+++ b/SWESimulators/fluxes/Godunov.cu
@ -0,0 +1,36 @@
+/*
+This file implements the Godunov flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Godunovs centered scheme (Toro 2001, p 165)
+  */
+__device__ float3 GodC_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_godc = 0.5f*(Q_l + Q_r) + (dt_/dx_)*(F_l - F_r);
+    
+    return F_func(Q_godc, g_);
+}
+    
+
+
--- a/SWESimulators/fluxes/HartenLaxVanLeer.cu
+++ b/SWESimulators/fluxes/HartenLaxVanLeer.cu
@ -0,0 +1,63 @@
+/*
+This file implements the Harten-Lax-van Leer flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 180)
+  */
+__device__ float3 HLL_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_func(Q_l, g_);
+    }
+    else if (S_r <= 0.0f) {
+        return F_func(Q_r, g_);
+    }
+    //Or estimate flux in the star region
+    else {
+        const float3 F_l = F_func(Q_l, g_);
+        const float3 F_r = F_func(Q_r, g_);
+        const float3 flux = (S_r*F_l - S_l*F_r + S_r*S_l*(Q_r - Q_l)) / (S_r-S_l);
+        return flux;
+    }
+}
--- a/SWESimulators/fluxes/HartenLaxVanLeerContact.cu
+++ b/SWESimulators/fluxes/HartenLaxVanLeerContact.cu
@ -0,0 +1,80 @@
+/*
+This file implements the Harten-Lax-van Leer flux with contact discontinuity
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+/**
+  * Harten-Lax-van Leer with contact discontinuity (Toro 2001, p 181)
+  */
+__device__ float3 HLLC_flux(const float3 Q_l, const float3 Q_r, const float g_) {    
+    const float h_l = Q_l.x;
+    const float h_r = Q_r.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l.y / h_l;
+    const float u_r = Q_r.y / h_r;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag = 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag / (h_l*h_l) ) );
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag / (h_r*h_r) ) );
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l;
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Upwind selection
+    if (S_l >= 0.0f) {
+        return F_l;
+    }
+    else if (S_r <= 0.0f) {
+        return F_r;
+    }
+    //Or estimate flux in the "left star" region
+    else if (S_l <= 0.0f && 0.0f <=S_star) {
+        const float v_l = Q_l.z / h_l;
+        const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1, S_star, v_l);
+        const float3 flux = F_l + S_l*(Q_star_l - Q_l);
+        return flux;
+    }
+    //Or estimate flux in the "righ star" region
+    else if (S_star <= 0.0f && 0.0f <=S_r) {
+        const float v_r = Q_r.z / h_r;
+        const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1, S_star, v_r);
+        const float3 flux = F_r + S_r*(Q_star_r - Q_r);
+        return flux;
+    }
+    else {
+        return make_float3(-99999.9f, -99999.9f, -99999.9f); //Something wrong here
+    }
+}
+
--- a/SWESimulators/fluxes/LaxFriedrichs.cu
+++ b/SWESimulators/fluxes/LaxFriedrichs.cu
@ -0,0 +1,24 @@
+
+
+/**
+  * Lax-Friedrichs flux (Toro 2001, p 163)
+  */
+__device__ float3 LxF_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    return 0.5f*(F_l + F_r) + (dx_/(2.0f*dt_))*(Q_l - Q_r);
+}
+
+
+
+/**
+  * Lax-Friedrichs extended to 2D
+  */
+__device__ float3 LxF_2D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    //Note numerical diffusion for 2D here (0.25)
+    return 0.5f*(F_l + F_r) + (dx_/(4.0f*dt_))*(Q_l - Q_r);
+}
--- a/SWESimulators/fluxes/LaxWendroff.cu
+++ b/SWESimulators/fluxes/LaxWendroff.cu
@ -0,0 +1,33 @@
+/*
+This file implements the Lax-Wendroff flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+/**
+  * Richtmeyer / Two-step Lax-Wendroff flux (Toro 2001, p 164)
+  */
+__device__ float3 LxW2_1D_flux(const float3 Q_l, const float3 Q_r, const float g_, const float dx_, const float dt_) {
+    const float3 F_l = F_func(Q_l, g_);
+    const float3 F_r = F_func(Q_r, g_);
+    
+    const float3 Q_lw2 = 0.5f*(Q_l + Q_r) + (dt_/(2.0f*dx_))*(F_l - F_r);
+    
+    return F_func(Q_lw2, g_);
+}
+
--- a/SWESimulators/fluxes/WeightedAverageFlux.cu
+++ b/SWESimulators/fluxes/WeightedAverageFlux.cu
@ -0,0 +1,187 @@
+/*
+This file implements the Weighted Average Flux
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "limiters.cu"
+
+/**
+  * Superbee flux limiter for WAF.
+  * Related to superbee limiter so that WAF_superbee(r, c) = 1 - (1-|c|)*superbee(r)
+  * @param r_ the ratio of upwind change (see Toro 2001, p. 203/204)
+  * @param c_ the courant number for wave k, dt*S_k/dx
+  */
+__device__ float WAF_superbee(float r_, float c_) {
+    // r <= 0.0
+    if (r_ <= 0.0f) { 
+        return 1.0f;
+    }
+    // 0.0 <= r <= 1/2
+    else if (r_ <= 0.5f) { 
+        return 1.0f - 2.0f*(1.0f - fabsf(c_))*r_;
+    }
+    // 1/2 <= r <= 1
+    else if (r_ <= 1.0f) {
+        return fabs(c_);
+    }
+    // 1 <= r <= 2
+    else  if (r_ <= 2.0f) {
+        return 1.0f - (1.0f - fabsf(c_))*r_;
+    }
+    // r >= 2
+    else {
+        return 2.0f*fabsf(c_) - 1.0f;
+    }
+}
+
+
+
+
+__device__ float WAF_albada(float r_, float c_) {
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    else {
+        return 1.0f - (1.0f - fabsf(c_)) * r_ * (1.0f + r_) / (1.0f + r_*r_);
+    }
+}
+
+__device__ float WAF_minbee(float r_, float c_) {
+    r_ = fmaxf(-1.0f, fminf(2.0f, r_));
+    if (r_ <= 0.0f) {
+        return 1.0f;
+    }
+    if (r_ >= 0.0f && r_ <= 1.0f) {
+        return 1.0f - (1.0f - fabsf(c_)) * r_;
+    }
+    else {
+        return fabsf(c_);
+    }
+}
+
+__device__ float WAF_minmod(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_)) * fmaxf(0.0f, fminf(1.0f, r_));
+}
+
+
+
+__device__ float limiterToWAFLimiter(float r_, float c_) {
+    return 1.0f - (1.0f - fabsf(c_))*r_;
+}
+
+__device__ float desingularize(float x_, float eps_) {
+    return copysign(1.0f, x_)*fmaxf(fabsf(x_), fminf(x_*x_/(2.0f*eps_)+0.5f*eps_, eps_));
+}
+
+// Compute h in the "star region", h^dagger
+__device__ __inline__ float computeHStar(float h_l, float h_r, float u_l, float u_r, float c_l, float c_r, float g_) {
+    
+    //This estimate for the h* gives rise to spurious oscillations. 
+    //return 0.5f * (h_l+h_r) - 0.25f * (u_r-u_l)*(h_l+h_r)/(c_l+c_r);
+    
+    const float h_tmp = 0.5f * (c_l + c_r) + 0.25f * (u_l - u_r);
+    return h_tmp*h_tmp / g_;
+}
+
+/**
+  * Weighted average flux (Toro 2001, p 200) for interface {i+1/2}
+  * @param r_ The flux limiter parameter (see Toro 2001, p. 203)
+  * @param Q_l2 Q_{i-1}
+  * @param Q_l1 Q_{i}
+  * @param Q_r1 Q_{i+1}
+  * @param Q_r2 Q_{i+2}
+  */
+__device__ float3 WAF_1D_flux(const float3 Q_l2, const float3 Q_l1, const float3 Q_r1, const float3 Q_r2, const float g_, const float dx_, const float dt_) {     
+    const float h_l = Q_l1.x;
+    const float h_r = Q_r1.x;
+    
+    const float h_l2 = Q_l2.x;
+    const float h_r2 = Q_r2.x;
+    
+    // Calculate velocities
+    const float u_l = Q_l1.y / h_l;
+    const float u_r = Q_r1.y / h_r;
+    
+    const float u_l2 = Q_l2.y / h_l2;
+    const float u_r2 = Q_r2.y / h_r2;
+    
+    const float v_l = Q_l1.z / h_l;
+    const float v_r = Q_r1.z / h_r;
+    
+    const float v_l2 = Q_l2.z / h_l2;
+    const float v_r2 = Q_r2.z / h_r2;
+    
+    // Estimate the potential wave speeds
+    const float c_l = sqrt(g_*h_l);
+    const float c_r = sqrt(g_*h_r);
+    
+    const float c_l2 = sqrt(g_*h_l2);
+    const float c_r2 = sqrt(g_*h_r2);
+    
+    // Compute h in the "star region", h^dagger
+    const float h_dag_l = computeHStar(h_l2,  h_l, u_l2,  u_l, c_l2,  c_l, g_);
+    const float h_dag   = computeHStar( h_l,  h_r,  u_l,  u_r,  c_l,  c_r, g_);
+    const float h_dag_r = computeHStar( h_r, h_r2,  u_r, u_r2,  c_r, c_r2, g_);
+    
+    const float q_l_tmp = sqrt(0.5f * ( (h_dag+h_l)*h_dag ) ) / h_l;
+    const float q_r_tmp = sqrt(0.5f * ( (h_dag+h_r)*h_dag ) ) / h_r;
+    
+    const float q_l = (h_dag > h_l) ? q_l_tmp : 1.0f;
+    const float q_r = (h_dag > h_r) ? q_r_tmp : 1.0f;
+    
+    // Compute wave speed estimates
+    const float S_l = u_l - c_l*q_l; 
+    const float S_r = u_r + c_r*q_r;
+    const float S_star = ( S_l*h_r*(u_r - S_r) - S_r*h_l*(u_l - S_l) ) / ( h_r*(u_r - S_r) - h_l*(u_l - S_l) );
+    
+    const float3 Q_star_l = h_l * (S_l - u_l) / (S_l - S_star) * make_float3(1.0, S_star, v_l);
+    const float3 Q_star_r = h_r * (S_r - u_r) / (S_r - S_star) * make_float3(1.0, S_star, v_r);
+    
+    // Estimate the fluxes in the four regions
+    const float3 F_1 = F_func(Q_l1, g_);
+    const float3 F_4 = F_func(Q_r1, g_);
+    
+    const float3 F_2 = F_1 + S_l*(Q_star_l - Q_l1);
+    const float3 F_3 = F_4 + S_r*(Q_star_r - Q_r1);
+    //const float3 F_2 = F_func(Q_star_l, g_);
+    //const float3 F_3 = F_func(Q_star_r, g_);
+    
+    // Compute the courant numbers for the waves
+    const float c_1 = S_l * dt_ / dx_;
+    const float c_2 = S_star * dt_ / dx_;
+    const float c_3 = S_r * dt_ / dx_;
+    
+    // Compute the "upwind change" vectors for the i-3/2 and i+3/2 interfaces
+    const float eps = 1.0e-6f;
+    const float r_1 = desingularize( (c_1 > 0.0f) ? (h_dag_l - h_l2) : (h_dag_r - h_r), eps) / desingularize((h_dag - h_l), eps);
+    const float r_2 = desingularize( (c_2 > 0.0f) ? (v_l - v_l2) : (v_r2 - v_r), eps ) / desingularize((v_r - v_l), eps);
+    const float r_3 = desingularize( (c_3 > 0.0f) ? (h_l - h_dag_l) : (h_r2 - h_dag_r), eps ) / desingularize((h_r - h_dag), eps);
+        
+    // Compute the limiter
+    // We use h for the nonlinear waves, and v for the middle shear wave 
+    const float A_1 = copysign(1.0f, c_1) * limiterToWAFLimiter(generalized_minmod(r_1, 1.9f), c_1);
+    const float A_2 = copysign(1.0f, c_2) * limiterToWAFLimiter(generalized_minmod(r_2, 1.9f), c_2); 
+    const float A_3 = copysign(1.0f, c_3) * limiterToWAFLimiter(generalized_minmod(r_3, 1.9f), c_3);
+    
+    //Average the fluxes
+    const float3 flux = 0.5f*( F_1 + F_4 )
+                      - 0.5f*( A_1 * (F_2 - F_1)
+                             + A_2 * (F_3 - F_2)
+                             + A_3 * (F_4 - F_3) );
+
+    return flux;
+}
--- a/SWESimulators/limiters.cu
+++ b/SWESimulators/limiters.cu
@ -0,0 +1,129 @@
+/*
+This file implements different flux and slope limiters
+
+Copyright (C) 2016, 2017, 2018 SINTEF ICT
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+
+
+
+
+
+
+/**
+  * Reconstructs a slope using the generalized minmod limiter based on three 
+  * consecutive values
+  */
+__device__ __inline__ float minmodSlope(float left, float center, float right, float theta) {
+    const float backward = (center - left) * theta;
+    const float central = (right - left) * 0.5f;
+    const float forward = (right - center) * theta;
+    
+	return 0.25f
+		*copysign(1.0f, backward)
+		*(copysign(1.0f, backward) + copysign(1.0f, central))
+		*(copysign(1.0f, central) + copysign(1.0f, forward))
+		*min( min(fabs(backward), fabs(central)), fabs(forward) );
+}
+
+
+
+
+/**
+  * Reconstructs a minmod slope for a whole block along the abscissa
+  */
+__device__ void minmodSlopeX(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qx[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  const float theta_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    //Reconstruct slopes along x axis
+    {
+        const int j = ty;
+        const int l = j + 2; //Skip ghost cells
+        for (int i=tx; i<BLOCK_WIDTH+2; i+=BLOCK_WIDTH) {
+            const int k = i + 1;
+            for (int p=0; p<3; ++p) {
+                Qx[p][j][i] = minmodSlope(Q[p][l][k-1], Q[p][l][k], Q[p][l][k+1], theta_);
+            }
+        }
+    }
+}
+
+
+/**
+  * Reconstructs a minmod slope for a whole block along the ordinate
+  */
+__device__ void minmodSlopeY(float  Q[3][BLOCK_HEIGHT+4][BLOCK_WIDTH+4],
+                  float Qy[3][BLOCK_HEIGHT+2][BLOCK_WIDTH+2],
+                  const float theta_) {
+    //Index of thread within block
+    const int tx = get_local_id(0);
+    const int ty = get_local_id(1);
+    
+    for (int j=ty; j<BLOCK_HEIGHT+2; j+=BLOCK_HEIGHT) {
+        const int l = j + 1;
+        {
+            const int i = tx;
+            const int k = i + 2; //Skip ghost cells
+            for (int p=0; p<3; ++p) {
+                Qy[p][j][i] = minmodSlope(Q[p][l-1][k], Q[p][l][k], Q[p][l+1][k], theta_);
+            }
+        }
+    }
+}
+
+
+
+
+__device__ float monotonized_central(float r_) {
+    return fmaxf(0.0f, fminf(2.0f, fminf(2.0f*r_, 0.5f*(1.0f+r_))));
+}
+
+__device__ float osher(float r_, float beta_) {
+    return fmaxf(0.0f, fminf(beta_, r_));
+}
+
+__device__ float sweby(float r_, float beta_) {
+    return fmaxf(0.0f, fmaxf(fminf(r_, beta_), fminf(beta_*r_, 1.0f)));
+}
+
+__device__ float minmod(float r_) {
+    return fmaxf(0.0f, fminf(1.0f, r_));
+}
+
+__device__ float generalized_minmod(float r_, float theta_) {
+    return fmaxf(0.0f, fminf(theta_*r_, fminf( (1.0f + r_) / 2.0f, theta_)));
+}
+
+__device__ float superbee(float r_) {
+    return fmaxf(0.0f, fmaxf(fminf(2.0f*r_, 1.0f), fminf(r_, 2.0f)));
+}
+
+__device__ float vanAlbada1(float r_) {
+    return (r_*r_ + r_) / (r_*r_ + 1.0f);
+}
+
+__device__ float vanAlbada2(float r_) {
+    return 2.0f*r_ / (r_*r_* + 1.0f);
+}
+
+__device__ float vanLeer(float r_) {
+    return (r_ + fabsf(r_)) / (1.0f + fabsf(r_));
+}